Коммит
c203ea29aa
@ -11,7 +11,7 @@
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2006-2015 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2006-2011 Los Alamos National Security, LLC. All rights
|
||||
# Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
|
||||
# Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved.
|
||||
@ -140,7 +140,7 @@ AC_DEFUN([OPAL_CHECK_OPENFABRICS],[
|
||||
|
||||
# If we have the openib stuff available, find out what we've got
|
||||
AS_IF([test "$ompi_check_openib_happy" = "yes"],
|
||||
[AC_CHECK_DECLS([IBV_EVENT_CLIENT_REREGISTER, IBV_ACCESS_SO], [], [],
|
||||
[AC_CHECK_DECLS([IBV_EVENT_CLIENT_REREGISTER, IBV_ACCESS_SO, IBV_ATOMIC_HCA], [], [],
|
||||
[#include <infiniband/verbs.h>])
|
||||
AC_CHECK_FUNCS([ibv_get_device_list ibv_resize_cq])
|
||||
|
||||
|
@ -91,7 +91,7 @@ static void mca_bml_base_completion(
|
||||
{
|
||||
mca_bml_base_context_t* ctx = (mca_bml_base_context_t*) des->des_cbdata;
|
||||
/* restore original state */
|
||||
((unsigned char*)des->des_local[0].seg_addr.pval)[ctx->index] ^= ~0;
|
||||
((unsigned char*)des->des_segments[0].seg_addr.pval)[ctx->index] ^= ~0;
|
||||
des->des_cbdata = ctx->cbdata;
|
||||
des->des_cbfunc = ctx->cbfunc;
|
||||
free(ctx);
|
||||
@ -121,11 +121,11 @@ int mca_bml_base_send( mca_bml_base_btl_t* bml_btl,
|
||||
malloc(sizeof(mca_bml_base_context_t));
|
||||
if(NULL != ctx) {
|
||||
opal_output(0, "%s:%d: corrupting data\n", __FILE__, __LINE__);
|
||||
ctx->index = (size_t) ((des->des_local[0].seg_len *
|
||||
ctx->index = (size_t) ((des->des_segments[0].seg_len *
|
||||
opal_rand(&mca_bml_base_rand_buff) * 1.0) / (UINT32_MAX + 1.0));
|
||||
ctx->cbfunc = des->des_cbfunc;
|
||||
ctx->cbdata = des->des_cbdata;
|
||||
((unsigned char*)des->des_local[0].seg_addr.pval)[ctx->index] ^= ~0;
|
||||
((unsigned char*)des->des_segments[0].seg_addr.pval)[ctx->index] ^= ~0;
|
||||
des->des_cbdata = ctx;
|
||||
des->des_cbfunc = mca_bml_base_completion;
|
||||
}
|
||||
|
@ -1,3 +1,4 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -10,7 +11,7 @@
|
||||
* Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -307,27 +308,30 @@ static inline int mca_bml_base_sendi( mca_bml_base_btl_t* bml_btl,
|
||||
payload_size, order, flags, tag, descriptor);
|
||||
}
|
||||
|
||||
static inline int mca_bml_base_put( mca_bml_base_btl_t* bml_btl,
|
||||
mca_btl_base_descriptor_t* des)
|
||||
static inline int mca_bml_base_put( mca_bml_base_btl_t* bml_btl, void *local_address, uint64_t remote_address,
|
||||
struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, size_t size,
|
||||
int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbdata)
|
||||
{
|
||||
mca_btl_base_module_t* btl = bml_btl->btl;
|
||||
|
||||
des->des_context = (void*) bml_btl;
|
||||
return btl->btl_put( btl, bml_btl->btl_endpoint, des );
|
||||
return btl->btl_put( btl, bml_btl->btl_endpoint, local_address, remote_address, local_handle,
|
||||
remote_handle, size, flags, order, cbfunc, (void *) bml_btl, cbdata);
|
||||
}
|
||||
|
||||
static inline int mca_bml_base_get( mca_bml_base_btl_t* bml_btl,
|
||||
mca_btl_base_descriptor_t* des)
|
||||
static inline int mca_bml_base_get( mca_bml_base_btl_t* bml_btl, void *local_address, uint64_t remote_address,
|
||||
struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, size_t size,
|
||||
int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbdata)
|
||||
{
|
||||
mca_btl_base_module_t* btl = bml_btl->btl;
|
||||
|
||||
des->des_context = (void*) bml_btl;
|
||||
return btl->btl_get( btl, bml_btl->btl_endpoint, des );
|
||||
return btl->btl_get( btl, bml_btl->btl_endpoint, local_address, remote_address, local_handle,
|
||||
remote_handle, size, flags, order, cbfunc, (void *) bml_btl, cbdata);
|
||||
}
|
||||
|
||||
|
||||
static inline void mca_bml_base_prepare_src(mca_bml_base_btl_t* bml_btl,
|
||||
mca_mpool_base_registration_t* reg,
|
||||
struct opal_convertor_t* conv,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
@ -337,29 +341,27 @@ static inline void mca_bml_base_prepare_src(mca_bml_base_btl_t* bml_btl,
|
||||
{
|
||||
mca_btl_base_module_t* btl = bml_btl->btl;
|
||||
|
||||
*des = btl->btl_prepare_src( btl, bml_btl->btl_endpoint, reg, conv,
|
||||
*des = btl->btl_prepare_src( btl, bml_btl->btl_endpoint, conv,
|
||||
order, reserve, size, flags );
|
||||
if( OPAL_LIKELY((*des) != NULL) ) {
|
||||
(*des)->des_context = (void*) bml_btl;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void mca_bml_base_prepare_dst(mca_bml_base_btl_t* bml_btl,
|
||||
mca_mpool_base_registration_t* reg,
|
||||
struct opal_convertor_t* conv,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t *size,
|
||||
uint32_t flags,
|
||||
mca_btl_base_descriptor_t** des)
|
||||
{
|
||||
static inline void mca_bml_base_register_mem (mca_bml_base_btl_t* bml_btl, void *base,
|
||||
size_t size, uint32_t flags,
|
||||
mca_btl_base_registration_handle_t **handle)
|
||||
{
|
||||
mca_btl_base_module_t* btl = bml_btl->btl;
|
||||
|
||||
*des = btl->btl_prepare_dst( btl, bml_btl->btl_endpoint, reg, conv,
|
||||
order, reserve, size, flags );
|
||||
if( OPAL_LIKELY((*des) != NULL) ) {
|
||||
(*des)->des_context = (void*) bml_btl;
|
||||
}
|
||||
*handle = btl->btl_register_mem (btl, bml_btl->btl_endpoint, base, size, flags);
|
||||
}
|
||||
|
||||
/**
 * Hand a registration handle back to the BTL module referenced by a BML BTL.
 *
 * @param bml_btl  BML BTL wrapper; its btl member supplies the
 *                 btl_deregister_mem entry point that gets invoked
 * @param handle   registration handle, forwarded unchanged to the BTL
 *
 * The result of the BTL call (if it produces one) is not inspected here.
 */
static inline void mca_bml_base_deregister_mem (mca_bml_base_btl_t* bml_btl, mca_btl_base_registration_handle_t *handle)
{
    bml_btl->btl->btl_deregister_mem (bml_btl->btl, handle);
}
|
||||
|
||||
/*
|
||||
|
@ -17,6 +17,8 @@
|
||||
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -86,9 +88,7 @@ static int mca_bml_r2_add_btls( void )
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
for(selected_btl = (mca_btl_base_selected_module_t*)opal_list_get_first(btls);
|
||||
selected_btl != (mca_btl_base_selected_module_t*)opal_list_get_end(btls);
|
||||
selected_btl = (mca_btl_base_selected_module_t*)opal_list_get_next(selected_btl)) {
|
||||
OPAL_LIST_FOREACH(selected_btl, btls, mca_btl_base_selected_module_t) {
|
||||
mca_btl_base_module_t *btl = selected_btl->btl_module;
|
||||
mca_bml_r2.btl_modules[mca_bml_r2.num_btl_modules++] = btl;
|
||||
for (i = 0; NULL != btl_names_argv && NULL != btl_names_argv[i]; ++i) {
|
||||
@ -127,6 +127,23 @@ static int btl_bandwidth_compare(const void *v1, const void *v2)
|
||||
return b2->btl->btl_bandwidth - b1->btl->btl_bandwidth;
|
||||
}
|
||||
|
||||
/**
 * Summarize a BTL array: total bandwidth and lowest latency.
 *
 * @param btl_array        array of BML BTLs to scan
 * @param total_bandwidth  (out) sum of btl_bandwidth over every entry;
 *                         0 when the array is empty
 * @param latency          (out) smallest btl_latency over every entry;
 *                         UINT_MAX when the array is empty
 */
static void mca_bml_r2_calculate_bandwidth_latency (mca_bml_base_btl_array_t *btl_array, double *total_bandwidth, uint32_t *latency)
{
    const size_t btl_count = mca_bml_base_btl_array_get_size (btl_array);
    double bandwidth_sum = 0.;
    uint32_t lowest_latency = UINT_MAX;

    for (size_t idx = 0 ; idx < btl_count ; ++idx) {
        mca_bml_base_btl_t *entry = mca_bml_base_btl_array_get_index (btl_array, idx);
        mca_btl_base_module_t *module = entry->btl;

        bandwidth_sum += module->btl_bandwidth;
        /* keep the minimum; ties leave the current value untouched */
        lowest_latency = (module->btl_latency < lowest_latency) ? module->btl_latency
                                                                : lowest_latency;
    }

    /* publish both results once the scan is complete */
    *latency = lowest_latency;
    *total_bandwidth = bandwidth_sum;
}
|
||||
|
||||
/*
|
||||
* For each proc setup a datastructure that indicates the BTLs
|
||||
* that can be used to reach the destination.
|
||||
@ -189,6 +206,7 @@ static int mca_bml_r2_add_procs( size_t nprocs,
|
||||
for(p_index = 0; p_index < mca_bml_r2.num_btl_modules; p_index++) {
|
||||
mca_btl_base_module_t* btl = mca_bml_r2.btl_modules[p_index];
|
||||
int btl_inuse = 0;
|
||||
int btl_flags;
|
||||
|
||||
/* if the r2 can reach the destination proc it sets the
|
||||
* corresponding bit (proc index) in the reachable bitmap
|
||||
@ -212,7 +230,7 @@ static int mca_bml_r2_add_procs( size_t nprocs,
|
||||
ompi_proc_t *proc = new_procs[p];
|
||||
mca_bml_base_endpoint_t * bml_endpoint =
|
||||
(mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
|
||||
mca_bml_base_btl_t* bml_btl;
|
||||
mca_bml_base_btl_t* bml_btl = NULL;
|
||||
size_t size;
|
||||
|
||||
if(NULL == bml_endpoint) {
|
||||
@ -236,12 +254,35 @@ static int mca_bml_r2_add_procs( size_t nprocs,
|
||||
bml_endpoint->btl_flags_or = 0;
|
||||
}
|
||||
|
||||
btl_flags = btl->btl_flags;
|
||||
if( (btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put) ) {
|
||||
opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for"
|
||||
" the %s BTL without any PUT function attached. Discard the flag !",
|
||||
bml_btl->btl->btl_component->btl_version.mca_component_name);
|
||||
btl_flags ^= MCA_BTL_FLAGS_PUT;
|
||||
}
|
||||
if( (btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get) ) {
|
||||
opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for"
|
||||
" the %s BTL without any GET function attached. Discard the flag !",
|
||||
bml_btl->btl->btl_component->btl_version.mca_component_name);
|
||||
btl_flags ^= MCA_BTL_FLAGS_GET;
|
||||
}
|
||||
|
||||
if( (btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0 ) {
|
||||
/**
|
||||
* If no protocol specified, we have 2 choices: we ignore the BTL
|
||||
* as we don't know which protocol to use, or we suppose that all
|
||||
* BTLs support the send protocol.
|
||||
*/
|
||||
btl_flags |= MCA_BTL_FLAGS_SEND;
|
||||
}
|
||||
|
||||
/* don't allow an additional BTL with a lower exclusivity ranking */
|
||||
size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
|
||||
if(size > 0) {
|
||||
bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, size-1);
|
||||
/* skip this btl if the exclusivity is less than the previous */
|
||||
if(bml_btl->btl->btl_exclusivity > btl->btl_exclusivity) {
|
||||
/* skip this btl if the exclusivity is less than the previous only if the btl does not provide full rdma (for one-sided) */
|
||||
if(bml_btl->btl->btl_exclusivity > btl->btl_exclusivity && ((btl_flags & MCA_BTL_FLAGS_RDMA) != MCA_BTL_FLAGS_RDMA)) {
|
||||
btl->btl_del_procs(btl, 1, (opal_proc_t**)&proc, &btl_endpoints[p]);
|
||||
opal_output_verbose(20, opal_btl_base_framework.framework_output,
|
||||
"mca: bml: Not using %s btl to %s on node %s "
|
||||
@ -261,39 +302,44 @@ static int mca_bml_r2_add_procs( size_t nprocs,
|
||||
proc->super.proc_hostname);
|
||||
|
||||
/* cache the endpoint on the proc */
|
||||
bml_btl = mca_bml_base_btl_array_insert(&bml_endpoint->btl_send);
|
||||
bml_btl->btl = btl;
|
||||
bml_btl->btl_endpoint = btl_endpoints[p];
|
||||
bml_btl->btl_weight = 0;
|
||||
bml_btl->btl_flags = btl->btl_flags;
|
||||
if( (bml_btl->btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put) ) {
|
||||
opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for"
|
||||
" the %s BTL without any PUT function attached. Discard the flag !",
|
||||
bml_btl->btl->btl_component->btl_version.mca_component_name);
|
||||
bml_btl->btl_flags ^= MCA_BTL_FLAGS_PUT;
|
||||
}
|
||||
if( (bml_btl->btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get) ) {
|
||||
opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for"
|
||||
" the %s BTL without any GET function attached. Discard the flag !",
|
||||
bml_btl->btl->btl_component->btl_version.mca_component_name);
|
||||
bml_btl->btl_flags ^= MCA_BTL_FLAGS_GET;
|
||||
}
|
||||
if( (bml_btl->btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0 ) {
|
||||
if (NULL == bml_btl || (bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity)) {
|
||||
bml_btl = mca_bml_base_btl_array_insert(&bml_endpoint->btl_send);
|
||||
bml_btl->btl = btl;
|
||||
bml_btl->btl_endpoint = btl_endpoints[p];
|
||||
bml_btl->btl_weight = 0;
|
||||
bml_btl->btl_flags = btl_flags;
|
||||
|
||||
/**
|
||||
* If no protocol specified, we have 2 choices: we ignore the BTL
|
||||
* as we don't know which protocol to use, or we suppose that all
|
||||
* BTLs support the send protocol.
|
||||
* calculate the bitwise OR of the btl flags
|
||||
*/
|
||||
bml_btl->btl_flags |= MCA_BTL_FLAGS_SEND;
|
||||
bml_endpoint->btl_flags_or |= bml_btl->btl_flags;
|
||||
}
|
||||
/**
|
||||
* calculate the bitwise OR of the btl flags
|
||||
*/
|
||||
bml_endpoint->btl_flags_or |= bml_btl->btl_flags;
|
||||
|
||||
/* always add rdma endpoints */
|
||||
if ((btl_flags & MCA_BTL_FLAGS_RDMA) &&
|
||||
!((proc->super.proc_arch != ompi_proc_local_proc->super.proc_arch) &&
|
||||
(0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) {
|
||||
mca_bml_base_btl_t *bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma);
|
||||
|
||||
bml_btl_rdma->btl = btl;
|
||||
bml_btl_rdma->btl_endpoint = btl_endpoints[p];
|
||||
bml_btl_rdma->btl_weight = 0;
|
||||
bml_btl_rdma->btl_flags = btl_flags;
|
||||
|
||||
if (bml_endpoint->btl_pipeline_send_length < btl->btl_rdma_pipeline_send_length) {
|
||||
bml_endpoint->btl_pipeline_send_length = btl->btl_rdma_pipeline_send_length;
|
||||
}
|
||||
|
||||
if (bml_endpoint->btl_send_limit < btl->btl_min_rdma_pipeline_size) {
|
||||
bml_endpoint->btl_send_limit = btl->btl_min_rdma_pipeline_size;
|
||||
}
|
||||
}
|
||||
|
||||
/* This BTL is in use, allow the progress registration */
|
||||
btl_inuse++;
|
||||
}
|
||||
}
|
||||
|
||||
if(btl_inuse > 0 && NULL != btl->btl_component->btl_progress) {
|
||||
size_t p;
|
||||
bool found = false;
|
||||
@ -319,9 +365,8 @@ static int mca_bml_r2_add_procs( size_t nprocs,
|
||||
mca_bml_base_endpoint_t* bml_endpoint =
|
||||
(mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
|
||||
double total_bandwidth = 0;
|
||||
uint32_t latency = 0xffffffff;
|
||||
size_t n_index;
|
||||
size_t n_size;
|
||||
uint32_t latency;
|
||||
size_t n_send, n_rdma;
|
||||
|
||||
/* skip over procs w/ no btl's registered */
|
||||
if(NULL == bml_endpoint) {
|
||||
@ -335,28 +380,22 @@ static int mca_bml_r2_add_procs( size_t nprocs,
|
||||
* weighting. Once the left over is smaller than this number we will
|
||||
* start using the weight to compute the correct amount.
|
||||
*/
|
||||
n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
|
||||
|
||||
n_send = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
|
||||
n_rdma = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
|
||||
|
||||
/* sort BTLs in descending order according to bandwidth value */
|
||||
qsort(bml_endpoint->btl_send.bml_btls, n_size,
|
||||
qsort(bml_endpoint->btl_send.bml_btls, n_send,
|
||||
sizeof(mca_bml_base_btl_t), btl_bandwidth_compare);
|
||||
|
||||
bml_endpoint->btl_rdma_index = 0;
|
||||
for(n_index = 0; n_index < n_size; n_index++) {
|
||||
mca_bml_base_btl_t* bml_btl =
|
||||
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index);
|
||||
mca_btl_base_module_t* btl = bml_btl->btl;
|
||||
total_bandwidth += bml_btl->btl->btl_bandwidth;
|
||||
if(btl->btl_latency < latency) {
|
||||
latency = btl->btl_latency;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_send, &total_bandwidth, &latency);
|
||||
|
||||
/* (1) set the weight of each btl as a percentage of overall bandwidth
|
||||
* (2) copy all btl instances at the highest priority ranking into the
|
||||
* list of btls used for first fragments
|
||||
*/
|
||||
for(n_index = 0; n_index < n_size; n_index++) {
|
||||
for (size_t n_index = 0 ; n_index < n_send ; ++n_index) {
|
||||
mca_bml_base_btl_t* bml_btl =
|
||||
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index);
|
||||
mca_btl_base_module_t *btl = bml_btl->btl;
|
||||
@ -365,7 +404,7 @@ static int mca_bml_r2_add_procs( size_t nprocs,
|
||||
if(btl->btl_bandwidth > 0) {
|
||||
bml_btl->btl_weight = (float)(btl->btl_bandwidth / total_bandwidth);
|
||||
} else {
|
||||
bml_btl->btl_weight = (float)(1.0 / n_size);
|
||||
bml_btl->btl_weight = (float)(1.0 / n_send);
|
||||
}
|
||||
|
||||
/* check to see if this r2 is already in the array of r2s
|
||||
@ -380,21 +419,24 @@ static int mca_bml_r2_add_procs( size_t nprocs,
|
||||
/* set endpoint max send size as min of available btls */
|
||||
if(bml_endpoint->btl_max_send_size > btl->btl_max_send_size)
|
||||
bml_endpoint->btl_max_send_size = btl->btl_max_send_size;
|
||||
}
|
||||
|
||||
/* check flags - is rdma preferred */
|
||||
if ((btl->btl_flags & (MCA_BTL_FLAGS_PUT|MCA_BTL_FLAGS_GET)) &&
|
||||
!((proc->super.proc_arch != ompi_proc_local_proc->super.proc_arch) &&
|
||||
(0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) {
|
||||
mca_bml_base_btl_t* bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma);
|
||||
mca_btl_base_module_t* btl_rdma = bml_btl->btl;
|
||||
/* sort BTLs in descending order according to bandwidth value */
|
||||
qsort(bml_endpoint->btl_rdma.bml_btls, n_rdma,
|
||||
sizeof(mca_bml_base_btl_t), btl_bandwidth_compare);
|
||||
|
||||
*bml_btl_rdma = *bml_btl;
|
||||
if(bml_endpoint->btl_pipeline_send_length < btl_rdma->btl_rdma_pipeline_send_length) {
|
||||
bml_endpoint->btl_pipeline_send_length = btl_rdma->btl_rdma_pipeline_send_length;
|
||||
}
|
||||
if(bml_endpoint->btl_send_limit < btl_rdma->btl_min_rdma_pipeline_size) {
|
||||
bml_endpoint->btl_send_limit = btl_rdma->btl_min_rdma_pipeline_size;
|
||||
}
|
||||
mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_rdma, &total_bandwidth, &latency);
|
||||
|
||||
/* set rdma btl weights */
|
||||
for (size_t n_index = 0 ; n_index < n_rdma ; ++n_index) {
|
||||
mca_bml_base_btl_t *bml_btl =
|
||||
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma, n_index);
|
||||
|
||||
/* compute weighting factor for this r2 */
|
||||
if (bml_btl->btl->btl_bandwidth > 0.0) {
|
||||
bml_btl->btl_weight = (float)(bml_btl->btl->btl_bandwidth / total_bandwidth);
|
||||
} else {
|
||||
bml_btl->btl_weight = (float)(1.0 / n_rdma);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
0
ompi/mca/pml/bfo/.opal_ignore
Обычный файл
0
ompi/mca/pml/bfo/.opal_ignore
Обычный файл
@ -14,7 +14,7 @@
|
||||
* Copyright (c) 2006-2008 University of Houston. All rights reserved.
|
||||
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved
|
||||
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2012 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -500,17 +500,17 @@ static void mca_pml_ob1_dump_hdr(mca_pml_ob1_hdr_t* hdr)
|
||||
case MCA_PML_OB1_HDR_TYPE_RGET:
|
||||
type = "RGET";
|
||||
snprintf( header, 128, "ctx %5d src %d tag %d seq %d msg_length %" PRIu64
|
||||
"seg_cnt %d hdr_des %" PRIu64,
|
||||
"frag %" PRIu64 " src_ptr %" PRIu64,
|
||||
hdr->hdr_rndv.hdr_match.hdr_ctx, hdr->hdr_rndv.hdr_match.hdr_src,
|
||||
hdr->hdr_rndv.hdr_match.hdr_tag, hdr->hdr_rndv.hdr_match.hdr_seq,
|
||||
hdr->hdr_rndv.hdr_msg_length,
|
||||
hdr->hdr_rget.hdr_seg_cnt, hdr->hdr_rget.hdr_des.lval);
|
||||
hdr->hdr_rndv.hdr_msg_length, hdr->hdr_rget.hdr_frag.lval,
|
||||
hdr->hdr_rget.hdr_src_ptr);
|
||||
break;
|
||||
case MCA_PML_OB1_HDR_TYPE_ACK:
|
||||
type = "ACK";
|
||||
snprintf( header, 128, "src_req %p dst_req %p offset %" PRIu64,
|
||||
snprintf( header, 128, "src_req %p dst_req %p offset %" PRIu64 " size %" PRIu64,
|
||||
hdr->hdr_ack.hdr_src_req.pval, hdr->hdr_ack.hdr_dst_req.pval,
|
||||
hdr->hdr_ack.hdr_send_offset);
|
||||
hdr->hdr_ack.hdr_send_offset, hdr->hdr_ack.hdr_send_size);
|
||||
break;
|
||||
case MCA_PML_OB1_HDR_TYPE_FRAG:
|
||||
type = "FRAG";
|
||||
@ -520,10 +520,11 @@ static void mca_pml_ob1_dump_hdr(mca_pml_ob1_hdr_t* hdr)
|
||||
break;
|
||||
case MCA_PML_OB1_HDR_TYPE_PUT:
|
||||
type = "PUT";
|
||||
snprintf( header, 128, "seg_cnt %d dst_req %p src_des %p recv_req %p offset %" PRIu64 " [%p %" PRIu64 "]",
|
||||
hdr->hdr_rdma.hdr_seg_cnt, hdr->hdr_rdma.hdr_req.pval, hdr->hdr_rdma.hdr_des.pval,
|
||||
snprintf( header, 128, "dst_req %p src_frag %p recv_req %p offset %" PRIu64
|
||||
" dst_ptr %" PRIu64 " dst_size %" PRIu64,
|
||||
hdr->hdr_rdma.hdr_req.pval, hdr->hdr_rdma.hdr_frag.pval,
|
||||
hdr->hdr_rdma.hdr_recv_req.pval, hdr->hdr_rdma.hdr_rdma_offset,
|
||||
hdr->hdr_rdma.hdr_segs[0].seg_addr.pval, hdr->hdr_rdma.hdr_segs[0].seg_len);
|
||||
hdr->hdr_rdma.hdr_dst_ptr, hdr->hdr_rdma.hdr_dst_size);
|
||||
break;
|
||||
case MCA_PML_OB1_HDR_TYPE_FIN:
|
||||
type = "FIN";
|
||||
@ -638,37 +639,32 @@ static void mca_pml_ob1_fin_completion( mca_btl_base_module_t* btl,
|
||||
*/
|
||||
int mca_pml_ob1_send_fin( ompi_proc_t* proc,
|
||||
mca_bml_base_btl_t* bml_btl,
|
||||
opal_ptr_t hdr_des,
|
||||
opal_ptr_t hdr_frag,
|
||||
uint64_t rdma_size,
|
||||
uint8_t order,
|
||||
uint32_t status )
|
||||
int status )
|
||||
{
|
||||
mca_btl_base_descriptor_t* fin;
|
||||
mca_pml_ob1_fin_hdr_t* hdr;
|
||||
int rc;
|
||||
|
||||
mca_bml_base_alloc(bml_btl, &fin, order, sizeof(mca_pml_ob1_fin_hdr_t),
|
||||
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_FLAGS_SIGNAL);
|
||||
|
||||
if(NULL == fin) {
|
||||
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
|
||||
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
fin->des_cbfunc = mca_pml_ob1_fin_completion;
|
||||
fin->des_cbdata = NULL;
|
||||
|
||||
/* fill in header */
|
||||
hdr = (mca_pml_ob1_fin_hdr_t*)fin->des_local->seg_addr.pval;
|
||||
hdr->hdr_common.hdr_flags = 0;
|
||||
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN;
|
||||
hdr->hdr_des = hdr_des;
|
||||
hdr->hdr_fail = status;
|
||||
mca_pml_ob1_fin_hdr_prepare ((mca_pml_ob1_fin_hdr_t *) fin->des_segments->seg_addr.pval,
|
||||
0, hdr_frag.lval, status ? status : (int64_t) rdma_size);
|
||||
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FIN, proc);
|
||||
|
||||
/* queue request */
|
||||
rc = mca_bml_base_send( bml_btl,
|
||||
fin,
|
||||
MCA_PML_OB1_HDR_TYPE_FIN );
|
||||
rc = mca_bml_base_send( bml_btl, fin, MCA_PML_OB1_HDR_TYPE_FIN );
|
||||
if( OPAL_LIKELY( rc >= 0 ) ) {
|
||||
if( OPAL_LIKELY( 1 == rc ) ) {
|
||||
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
|
||||
@ -676,7 +672,7 @@ int mca_pml_ob1_send_fin( ompi_proc_t* proc,
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
mca_bml_base_free(bml_btl, fin);
|
||||
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
|
||||
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
@ -717,6 +713,7 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
|
||||
pckt->hdr.hdr_ack.hdr_src_req.lval,
|
||||
pckt->hdr.hdr_ack.hdr_dst_req.pval,
|
||||
pckt->hdr.hdr_ack.hdr_send_offset,
|
||||
pckt->hdr.hdr_ack.hdr_send_size,
|
||||
pckt->hdr.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA);
|
||||
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
|
||||
@ -728,9 +725,10 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
|
||||
break;
|
||||
case MCA_PML_OB1_HDR_TYPE_FIN:
|
||||
rc = mca_pml_ob1_send_fin(pckt->proc, send_dst,
|
||||
pckt->hdr.hdr_fin.hdr_des,
|
||||
pckt->hdr.hdr_fin.hdr_frag,
|
||||
pckt->hdr.hdr_fin.hdr_size,
|
||||
pckt->order,
|
||||
pckt->hdr.hdr_fin.hdr_fail);
|
||||
pckt->status);
|
||||
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
|
||||
return;
|
||||
}
|
||||
|
@ -12,7 +12,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved
|
||||
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -216,6 +216,7 @@ struct mca_pml_ob1_pckt_pending_t {
|
||||
mca_pml_ob1_hdr_t hdr;
|
||||
struct mca_bml_base_btl_t *bml_btl;
|
||||
uint8_t order;
|
||||
int status;
|
||||
};
|
||||
typedef struct mca_pml_ob1_pckt_pending_t mca_pml_ob1_pckt_pending_t;
|
||||
OBJ_CLASS_DECLARATION(mca_pml_ob1_pckt_pending_t);
|
||||
@ -234,17 +235,17 @@ do { \
|
||||
(ompi_free_list_item_t*)pckt); \
|
||||
} while(0)
|
||||
|
||||
#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D, B, O, S) \
|
||||
#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D, Sz, B, O, S) \
|
||||
do { \
|
||||
mca_pml_ob1_pckt_pending_t *_pckt; \
|
||||
\
|
||||
MCA_PML_OB1_PCKT_PENDING_ALLOC(_pckt); \
|
||||
_pckt->hdr.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN; \
|
||||
_pckt->hdr.hdr_fin.hdr_des = (D); \
|
||||
_pckt->hdr.hdr_fin.hdr_fail = (S); \
|
||||
mca_pml_ob1_fin_hdr_prepare (&_pckt->hdr.hdr_fin, 0, \
|
||||
(D).lval, (Sz)); \
|
||||
_pckt->proc = (P); \
|
||||
_pckt->bml_btl = (B); \
|
||||
_pckt->order = (O); \
|
||||
_pckt->status = (S); \
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock); \
|
||||
opal_list_append(&mca_pml_ob1.pckt_pending, \
|
||||
(opal_list_item_t*)_pckt); \
|
||||
@ -253,7 +254,7 @@ do { \
|
||||
|
||||
|
||||
int mca_pml_ob1_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
|
||||
opal_ptr_t hdr_des, uint8_t order, uint32_t status);
|
||||
opal_ptr_t hdr_frag, uint64_t size, uint8_t order, int status);
|
||||
|
||||
/* This function tries to resend FIN/ACK packets from pckt_pending queue.
|
||||
* Packets are added to the queue when sending of a FIN or ACK fails due to
|
||||
@ -283,20 +284,6 @@ void mca_pml_ob1_process_pending_rdma(void);
|
||||
/*
|
||||
* Compute the total number of bytes on supplied descriptor
|
||||
*/
|
||||
static inline size_t
|
||||
mca_pml_ob1_compute_segment_length(size_t seg_size, void *segments,
|
||||
size_t count, size_t hdrlen)
|
||||
{
|
||||
size_t i, length = 0;
|
||||
mca_btl_base_segment_t *segment = (mca_btl_base_segment_t*)segments;
|
||||
|
||||
for (i = 0; i < count ; ++i) {
|
||||
length += segment->seg_len;
|
||||
segment = (mca_btl_base_segment_t *)((char *)segment + seg_size);
|
||||
}
|
||||
return (length - hdrlen);
|
||||
}
|
||||
|
||||
static inline size_t
|
||||
mca_pml_ob1_compute_segment_length_base(mca_btl_base_segment_t *segments,
|
||||
size_t count, size_t hdrlen)
|
||||
@ -338,7 +325,7 @@ mca_pml_ob1_compute_segment_length_remote (size_t seg_size, void *segments,
|
||||
/* represent BTL chosen for sending request */
|
||||
struct mca_pml_ob1_com_btl_t {
|
||||
mca_bml_base_btl_t *bml_btl;
|
||||
struct mca_mpool_base_registration_t* btl_reg;
|
||||
struct mca_btl_base_registration_handle_t *btl_reg;
|
||||
size_t length;
|
||||
};
|
||||
typedef struct mca_pml_ob1_com_btl_t mca_pml_ob1_com_btl_t;
|
||||
|
@ -1,3 +1,4 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -12,6 +13,8 @@
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -123,19 +126,20 @@ size_t mca_pml_ob1_rdma_cuda_btls(
|
||||
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n);
|
||||
|
||||
if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) {
|
||||
mca_mpool_base_registration_t* reg = NULL;
|
||||
mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;
|
||||
mca_btl_base_registration_handle_t *handle = NULL;
|
||||
|
||||
if( NULL != btl_mpool ) {
|
||||
if( NULL != bml_btl->btl->btl_register_mem ) {
|
||||
/* register the memory */
|
||||
btl_mpool->mpool_register(btl_mpool, base, size, MCA_MPOOL_FLAGS_CUDA_GPU_MEM, &reg);
|
||||
handle = bml_btl->btl->btl_register_mem (bml_btl->btl, bml_btl->btl_endpoint,
|
||||
base, size, MCA_BTL_REG_FLAG_CUDA_GPU_MEM |
|
||||
MCA_BTL_REG_FLAG_REMOTE_READ);
|
||||
}
|
||||
|
||||
if(NULL == reg)
|
||||
if(NULL == handle)
|
||||
continue;
|
||||
|
||||
rdma_btls[num_btls_used].bml_btl = bml_btl;
|
||||
rdma_btls[num_btls_used].btl_reg = reg;
|
||||
rdma_btls[num_btls_used].btl_reg = handle;
|
||||
weight_total += bml_btl->btl_weight;
|
||||
num_btls_used++;
|
||||
}
|
||||
|
@ -11,7 +11,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -64,6 +64,13 @@ struct mca_pml_ob1_common_hdr_t {
|
||||
};
|
||||
typedef struct mca_pml_ob1_common_hdr_t mca_pml_ob1_common_hdr_t;
|
||||
|
||||
/**
 * Fill in the type and flags of a common ob1 header.
 *
 * @param hdr        header to write; only hdr_type and hdr_flags are stored
 * @param hdr_type   value stored in hdr->hdr_type
 * @param hdr_flags  value stored in hdr->hdr_flags
 */
static inline void mca_pml_ob1_common_hdr_prepare (mca_pml_ob1_common_hdr_t *hdr, uint8_t hdr_type,
                                                   uint8_t hdr_flags)
{
    hdr->hdr_flags = hdr_flags;
    hdr->hdr_type  = hdr_type;
}
|
||||
|
||||
#define MCA_PML_OB1_COMMON_HDR_NTOH(h)
|
||||
#define MCA_PML_OB1_COMMON_HDR_HTON(h)
|
||||
|
||||
@ -89,15 +96,19 @@ struct mca_pml_ob1_match_hdr_t {
|
||||
|
||||
typedef struct mca_pml_ob1_match_hdr_t mca_pml_ob1_match_hdr_t;
|
||||
|
||||
static inline void mca_pml_ob1_match_hdr_prepare (mca_pml_ob1_match_hdr_t *hdr, uint8_t hdr_type, uint8_t hdr_flags,
|
||||
uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq)
|
||||
{
|
||||
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, hdr_type, hdr_flags);
|
||||
hdr->hdr_ctx = hdr_ctx;
|
||||
hdr->hdr_src = hdr_src;
|
||||
hdr->hdr_tag = hdr_tag;
|
||||
hdr->hdr_seq = hdr_seq;
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
|
||||
#define MCA_PML_OB1_MATCH_HDR_FILL(h) \
|
||||
do { \
|
||||
(h).hdr_padding[0] = 0; \
|
||||
(h).hdr_padding[1] = 0; \
|
||||
} while(0)
|
||||
#else
|
||||
#define MCA_PML_OB1_MATCH_HDR_FILL(h)
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
|
||||
hdr->hdr_padding[0] = 0;
|
||||
hdr->hdr_padding[1] = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
#define MCA_PML_OB1_MATCH_HDR_NTOH(h) \
|
||||
do { \
|
||||
@ -111,7 +122,6 @@ do { \
|
||||
#define MCA_PML_OB1_MATCH_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
|
||||
MCA_PML_OB1_MATCH_HDR_FILL(h); \
|
||||
(h).hdr_ctx = htons((h).hdr_ctx); \
|
||||
(h).hdr_src = htonl((h).hdr_src); \
|
||||
(h).hdr_tag = htonl((h).hdr_tag); \
|
||||
@ -130,12 +140,14 @@ struct mca_pml_ob1_rendezvous_hdr_t {
|
||||
};
|
||||
typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t;
|
||||
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
|
||||
#define MCA_PML_OB1_RNDV_HDR_FILL(h) \
|
||||
MCA_PML_OB1_MATCH_HDR_FILL((h).hdr_match)
|
||||
#else
|
||||
#define MCA_PML_OB1_RNDV_HDR_FILL(h)
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
|
||||
static inline void mca_pml_ob1_rendezvous_hdr_prepare (mca_pml_ob1_rendezvous_hdr_t *hdr, uint8_t hdr_type, uint8_t hdr_flags,
|
||||
uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq,
|
||||
uint64_t hdr_msg_length, void *hdr_src_req)
|
||||
{
|
||||
mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, hdr_type, hdr_flags, hdr_ctx, hdr_src, hdr_tag, hdr_seq);
|
||||
hdr->hdr_msg_length = hdr_msg_length;
|
||||
hdr->hdr_src_req.pval = hdr_src_req;
|
||||
}
|
||||
|
||||
/* Note that hdr_src_req is not put in network byte order because it
|
||||
is never processed by the receiver, other than being copied into
|
||||
@ -149,7 +161,6 @@ typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t;
|
||||
#define MCA_PML_OB1_RNDV_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_MATCH_HDR_HTON((h).hdr_match); \
|
||||
MCA_PML_OB1_RNDV_HDR_FILL(h); \
|
||||
(h).hdr_msg_length = hton64((h).hdr_msg_length); \
|
||||
} while (0)
|
||||
|
||||
@ -158,38 +169,47 @@ typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t;
|
||||
*/
|
||||
struct mca_pml_ob1_rget_hdr_t {
|
||||
mca_pml_ob1_rendezvous_hdr_t hdr_rndv;
|
||||
uint32_t hdr_seg_cnt; /**< number of segments for rdma */
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
uint8_t hdr_padding[4];
|
||||
#endif
|
||||
opal_ptr_t hdr_des; /**< source descriptor */
|
||||
opal_ptr_t hdr_frag; /**< source fragment (for fin) */
|
||||
uint64_t hdr_src_ptr; /**< source pointer */
|
||||
/* btl registration handle data follows */
|
||||
};
|
||||
typedef struct mca_pml_ob1_rget_hdr_t mca_pml_ob1_rget_hdr_t;
|
||||
|
||||
static inline void mca_pml_ob1_rget_hdr_prepare (mca_pml_ob1_rget_hdr_t *hdr, uint8_t hdr_flags,
|
||||
uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq,
|
||||
uint64_t hdr_msg_length, void *hdr_src_req, void *hdr_frag,
|
||||
void *hdr_src_ptr, void *local_handle, size_t local_handle_size)
|
||||
{
|
||||
mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RGET, hdr_flags,
|
||||
hdr_ctx, hdr_src, hdr_tag, hdr_seq, hdr_msg_length, hdr_src_req);
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
|
||||
#define MCA_PML_OB1_RGET_HDR_FILL(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_RNDV_HDR_FILL((h).hdr_rndv); \
|
||||
(h).hdr_padding[0] = 0; \
|
||||
(h).hdr_padding[1] = 0; \
|
||||
(h).hdr_padding[2] = 0; \
|
||||
(h).hdr_padding[3] = 0; \
|
||||
} while(0)
|
||||
#else
|
||||
#define MCA_PML_OB1_RGET_HDR_FILL(h)
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
|
||||
hdr->hdr_padding[0] = 0;
|
||||
hdr->hdr_padding[1] = 0;
|
||||
hdr->hdr_padding[2] = 0;
|
||||
hdr->hdr_padding[3] = 0;
|
||||
#endif
|
||||
hdr->hdr_frag.pval = hdr_frag;
|
||||
hdr->hdr_src_ptr = (uint64_t)(intptr_t) hdr_src_ptr;
|
||||
|
||||
#define MCA_PML_OB1_RGET_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_RNDV_HDR_NTOH((h).hdr_rndv); \
|
||||
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
|
||||
/* copy registration handle */
|
||||
memcpy (hdr + 1, local_handle, local_handle_size);
|
||||
}
|
||||
|
||||
#define MCA_PML_OB1_RGET_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_RNDV_HDR_NTOH((h).hdr_rndv); \
|
||||
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
|
||||
(h).hdr_src_ptr = ntoh64((h).hdr_src_ptr); \
|
||||
} while (0)
|
||||
|
||||
#define MCA_PML_OB1_RGET_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_RNDV_HDR_HTON((h).hdr_rndv); \
|
||||
MCA_PML_OB1_RGET_HDR_FILL(h); \
|
||||
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
|
||||
#define MCA_PML_OB1_RGET_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_RNDV_HDR_HTON((h).hdr_rndv); \
|
||||
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
|
||||
(h).hdr_src_ptr = hton64((h).hdr_src_ptr); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
@ -206,19 +226,23 @@ struct mca_pml_ob1_frag_hdr_t {
|
||||
};
|
||||
typedef struct mca_pml_ob1_frag_hdr_t mca_pml_ob1_frag_hdr_t;
|
||||
|
||||
static inline void mca_pml_ob1_frag_hdr_prepare (mca_pml_ob1_frag_hdr_t *hdr, uint8_t hdr_flags,
|
||||
uint64_t hdr_frag_offset, void *hdr_src_req,
|
||||
uint64_t hdr_dst_req)
|
||||
{
|
||||
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_FRAG, hdr_flags);
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
|
||||
#define MCA_PML_OB1_FRAG_HDR_FILL(h) \
|
||||
do { \
|
||||
(h).hdr_padding[0] = 0; \
|
||||
(h).hdr_padding[1] = 0; \
|
||||
(h).hdr_padding[2] = 0; \
|
||||
(h).hdr_padding[3] = 0; \
|
||||
(h).hdr_padding[4] = 0; \
|
||||
(h).hdr_padding[5] = 0; \
|
||||
} while(0)
|
||||
#else
|
||||
#define MCA_PML_OB1_FRAG_HDR_FILL(h)
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
|
||||
hdr->hdr_padding[0] = 0;
|
||||
hdr->hdr_padding[1] = 0;
|
||||
hdr->hdr_padding[2] = 0;
|
||||
hdr->hdr_padding[3] = 0;
|
||||
hdr->hdr_padding[4] = 0;
|
||||
hdr->hdr_padding[5] = 0;
|
||||
#endif
|
||||
hdr->hdr_frag_offset = hdr_frag_offset;
|
||||
hdr->hdr_src_req.pval = hdr_src_req;
|
||||
hdr->hdr_dst_req.lval = hdr_dst_req;
|
||||
}
|
||||
|
||||
#define MCA_PML_OB1_FRAG_HDR_NTOH(h) \
|
||||
do { \
|
||||
@ -229,7 +253,6 @@ do { \
|
||||
#define MCA_PML_OB1_FRAG_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
|
||||
MCA_PML_OB1_FRAG_HDR_FILL(h); \
|
||||
(h).hdr_frag_offset = hton64((h).hdr_frag_offset); \
|
||||
} while (0)
|
||||
|
||||
@ -245,38 +268,45 @@ struct mca_pml_ob1_ack_hdr_t {
|
||||
opal_ptr_t hdr_src_req; /**< source request */
|
||||
opal_ptr_t hdr_dst_req; /**< matched receive request */
|
||||
uint64_t hdr_send_offset; /**< starting point of copy in/out */
|
||||
uint64_t hdr_send_size; /**< number of bytes requested (0: all remaining) */
|
||||
};
|
||||
typedef struct mca_pml_ob1_ack_hdr_t mca_pml_ob1_ack_hdr_t;
|
||||
|
||||
static inline void mca_pml_ob1_ack_hdr_prepare (mca_pml_ob1_ack_hdr_t *hdr, uint8_t hdr_flags,
|
||||
uint64_t hdr_src_req, void *hdr_dst_req,
|
||||
uint64_t hdr_send_offset, uint64_t hdr_send_size)
|
||||
{
|
||||
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_ACK, hdr_flags);
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
|
||||
#define MCA_PML_OB1_ACK_HDR_FILL(h) \
|
||||
do { \
|
||||
(h).hdr_padding[0] = 0; \
|
||||
(h).hdr_padding[1] = 0; \
|
||||
(h).hdr_padding[2] = 0; \
|
||||
(h).hdr_padding[3] = 0; \
|
||||
(h).hdr_padding[4] = 0; \
|
||||
(h).hdr_padding[5] = 0; \
|
||||
} while (0)
|
||||
#else
|
||||
#define MCA_PML_OB1_ACK_HDR_FILL(h)
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
|
||||
hdr->hdr_padding[0] = 0;
|
||||
hdr->hdr_padding[1] = 0;
|
||||
hdr->hdr_padding[2] = 0;
|
||||
hdr->hdr_padding[3] = 0;
|
||||
hdr->hdr_padding[4] = 0;
|
||||
hdr->hdr_padding[5] = 0;
|
||||
#endif
|
||||
hdr->hdr_src_req.lval = hdr_src_req;
|
||||
hdr->hdr_dst_req.pval = hdr_dst_req;
|
||||
hdr->hdr_send_offset = hdr_send_offset;
|
||||
hdr->hdr_send_size = hdr_send_size;
|
||||
}
|
||||
|
||||
/* Note that the request headers are not put in NBO because the
|
||||
src_req is already in receiver's byte order and the dst_req is not
|
||||
used by the receiver for anything other than backpointers in return
|
||||
headers */
|
||||
#define MCA_PML_OB1_ACK_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
|
||||
#define MCA_PML_OB1_ACK_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
|
||||
(h).hdr_send_offset = ntoh64((h).hdr_send_offset); \
|
||||
(h).hdr_send_size = ntoh64((h).hdr_send_size); \
|
||||
} while (0)
|
||||
|
||||
#define MCA_PML_OB1_ACK_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
|
||||
MCA_PML_OB1_ACK_HDR_FILL(h); \
|
||||
#define MCA_PML_OB1_ACK_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
|
||||
(h).hdr_send_offset = hton64((h).hdr_send_offset); \
|
||||
(h).hdr_send_size = hton64((h).hdr_send_size); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
@ -288,38 +318,55 @@ struct mca_pml_ob1_rdma_hdr_t {
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
uint8_t hdr_padding[2]; /** two to pad out the hdr to a 4 byte alignment. hdr_req will then be 8 byte aligned after 4 for hdr_seg_cnt */
|
||||
#endif
|
||||
uint32_t hdr_seg_cnt; /**< number of segments for rdma */
|
||||
/* TODO: add real support for multiple destination segments */
|
||||
opal_ptr_t hdr_req; /**< destination request */
|
||||
opal_ptr_t hdr_des; /**< source descriptor */
|
||||
opal_ptr_t hdr_frag; /**< receiver fragment */
|
||||
opal_ptr_t hdr_recv_req; /**< receive request (NTH: needed for put fallback on send) */
|
||||
uint64_t hdr_rdma_offset; /**< current offset into user buffer */
|
||||
mca_btl_base_segment_t hdr_segs[1]; /**< list of segments for rdma */
|
||||
uint64_t hdr_rdma_offset; /**< current offset into user buffer */
|
||||
uint64_t hdr_dst_ptr; /**< destination address */
|
||||
uint64_t hdr_dst_size; /**< destination size */
|
||||
/* registration data follows */
|
||||
};
|
||||
typedef struct mca_pml_ob1_rdma_hdr_t mca_pml_ob1_rdma_hdr_t;
|
||||
|
||||
static inline void mca_pml_ob1_rdma_hdr_prepare (mca_pml_ob1_rdma_hdr_t *hdr, uint8_t hdr_flags,
|
||||
uint64_t hdr_req, void *hdr_frag, void *hdr_recv_req,
|
||||
uint64_t hdr_rdma_offset, void *hdr_dst_ptr,
|
||||
uint64_t hdr_dst_size, void *local_handle,
|
||||
size_t local_handle_size)
|
||||
{
|
||||
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_PUT, hdr_flags);
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
|
||||
#define MCA_PML_OB1_RDMA_HDR_FILL(h) \
|
||||
do { \
|
||||
(h).hdr_padding[0] = 0; \
|
||||
(h).hdr_padding[1] = 0; \
|
||||
} while(0)
|
||||
#else
|
||||
#define MCA_PML_OB1_RDMA_HDR_FILL(h)
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
|
||||
hdr->hdr_padding[0] = 0;
|
||||
hdr->hdr_padding[1] = 0;
|
||||
#endif
|
||||
hdr->hdr_req.lval = hdr_req;
|
||||
hdr->hdr_frag.pval = hdr_frag;
|
||||
hdr->hdr_recv_req.pval = hdr_recv_req;
|
||||
hdr->hdr_rdma_offset = hdr_rdma_offset;
|
||||
hdr->hdr_dst_ptr = (uint64_t)(intptr_t) hdr_dst_ptr;
|
||||
hdr->hdr_dst_size = hdr_dst_size;
|
||||
|
||||
#define MCA_PML_OB1_RDMA_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
|
||||
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
|
||||
/* copy segments */
|
||||
memcpy (hdr + 1, local_handle, local_handle_size);
|
||||
}
|
||||
|
||||
#define MCA_PML_OB1_RDMA_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
|
||||
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
|
||||
(h).hdr_rdma_offset = ntoh64((h).hdr_rdma_offset); \
|
||||
(h).hdr_dst_ptr = ntoh64((h).hdr_dst_ptr); \
|
||||
(h).hdr_dst_size = ntoh64((h).hdr_dst_size); \
|
||||
} while (0)
|
||||
|
||||
#define MCA_PML_OB1_RDMA_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
|
||||
MCA_PML_OB1_RDMA_HDR_FILL(h); \
|
||||
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
|
||||
#define MCA_PML_OB1_RDMA_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
|
||||
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
|
||||
(h).hdr_rdma_offset = hton64((h).hdr_rdma_offset); \
|
||||
(h).hdr_dst_ptr = hton64((h).hdr_dst_ptr); \
|
||||
(h).hdr_dst_size = hton64((h).hdr_dst_size); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
@ -331,31 +378,34 @@ struct mca_pml_ob1_fin_hdr_t {
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
uint8_t hdr_padding[2];
|
||||
#endif
|
||||
uint32_t hdr_fail; /**< RDMA operation failed */
|
||||
opal_ptr_t hdr_des; /**< completed descriptor */
|
||||
int64_t hdr_size; /**< number of bytes completed (positive), error code (negative) */
|
||||
opal_ptr_t hdr_frag; /**< completed RDMA fragment */
|
||||
};
|
||||
typedef struct mca_pml_ob1_fin_hdr_t mca_pml_ob1_fin_hdr_t;
|
||||
|
||||
static inline void mca_pml_ob1_fin_hdr_prepare (mca_pml_ob1_fin_hdr_t *hdr, uint8_t hdr_flags,
|
||||
uint64_t hdr_frag, int64_t hdr_size)
|
||||
{
|
||||
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_FIN, hdr_flags);
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
|
||||
#define MCA_PML_OB1_FIN_HDR_FILL(h) \
|
||||
do { \
|
||||
(h).hdr_padding[0] = 0; \
|
||||
(h).hdr_padding[1] = 0; \
|
||||
} while (0)
|
||||
#else
|
||||
#define MCA_PML_OB1_FIN_HDR_FILL(h)
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
|
||||
hdr->hdr_padding[0] = 0;
|
||||
hdr->hdr_padding[1] = 0;
|
||||
#endif
|
||||
hdr->hdr_frag.lval = hdr_frag;
|
||||
hdr->hdr_size = hdr_size;
|
||||
}
|
||||
|
||||
#define MCA_PML_OB1_FIN_HDR_NTOH(h) \
|
||||
do { \
|
||||
#define MCA_PML_OB1_FIN_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
|
||||
(h).hdr_size = ntoh64((h).hdr_size); \
|
||||
} while (0)
|
||||
|
||||
#define MCA_PML_OB1_FIN_HDR_HTON(h) \
|
||||
do { \
|
||||
#define MCA_PML_OB1_FIN_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
|
||||
MCA_PML_OB1_FIN_HDR_FILL(h); \
|
||||
} while (0)
|
||||
(h).hdr_size = hton64((h).hdr_size); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
* Union of defined hdr types.
|
||||
|
@ -10,7 +10,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
@ -68,7 +68,6 @@ static inline int mca_pml_ob1_send_inline (void *buf, size_t count,
|
||||
ompi_proc_t *dst_proc, mca_bml_base_endpoint_t* endpoint,
|
||||
ompi_communicator_t * comm)
|
||||
{
|
||||
mca_btl_base_descriptor_t *des = NULL;
|
||||
mca_pml_ob1_match_hdr_t match;
|
||||
mca_bml_base_btl_t *bml_btl;
|
||||
opal_convertor_t convertor;
|
||||
@ -98,28 +97,21 @@ static inline int mca_pml_ob1_send_inline (void *buf, size_t count,
|
||||
size = 0;
|
||||
}
|
||||
|
||||
match.hdr_common.hdr_flags = 0;
|
||||
match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
|
||||
match.hdr_ctx = comm->c_contextid;
|
||||
match.hdr_src = comm->c_my_rank;
|
||||
match.hdr_tag = tag;
|
||||
match.hdr_seq = seqn;
|
||||
mca_pml_ob1_match_hdr_prepare (&match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
|
||||
comm->c_contextid, comm->c_my_rank,
|
||||
tag, seqn);
|
||||
|
||||
ob1_hdr_hton(&match, MCA_PML_OB1_HDR_TYPE_MATCH, dst_proc);
|
||||
|
||||
/* try to send immediately */
|
||||
rc = mca_bml_base_sendi (bml_btl, &convertor, &match, OMPI_PML_OB1_MATCH_HDR_LEN,
|
||||
size, MCA_BTL_NO_ORDER, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
|
||||
MCA_PML_OB1_HDR_TYPE_MATCH, &des);
|
||||
MCA_PML_OB1_HDR_TYPE_MATCH, NULL);
|
||||
if (count > 0) {
|
||||
opal_convertor_cleanup (&convertor);
|
||||
}
|
||||
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
|
||||
if (des) {
|
||||
mca_bml_base_free (bml_btl, des);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -224,7 +216,7 @@ int mca_pml_ob1_send(void *buf,
|
||||
|
||||
OBJ_CONSTRUCT(sendreq, mca_pml_ob1_send_request_t);
|
||||
sendreq->req_send.req_base.req_proc = dst_proc;
|
||||
sendreq->src_des = NULL;
|
||||
sendreq->rdma_frag = NULL;
|
||||
|
||||
MCA_PML_OB1_SEND_REQUEST_INIT(sendreq,
|
||||
buf,
|
||||
|
@ -1,3 +1,4 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -9,6 +10,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -27,11 +30,6 @@
|
||||
#include "pml_ob1.h"
|
||||
#include "pml_ob1_rdma.h"
|
||||
|
||||
/* Use this registration if no registration needed for a BTL instead of NULL.
|
||||
* This will help other code to distinguish case when memory is not registered
|
||||
* from case when registration is not needed */
|
||||
static mca_mpool_base_registration_t pml_ob1_dummy_reg;
|
||||
|
||||
/*
|
||||
* Check to see if memory is registered or can be registered. Build a
|
||||
* set of registrations on the request.
|
||||
@ -45,7 +43,7 @@ size_t mca_pml_ob1_rdma_btls(
|
||||
{
|
||||
int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
|
||||
double weight_total = 0;
|
||||
int num_btls_used = 0, n;
|
||||
int num_btls_used = 0;
|
||||
|
||||
/* shortcut when there are no rdma capable btls */
|
||||
if(num_btls == 0) {
|
||||
@ -53,29 +51,33 @@ size_t mca_pml_ob1_rdma_btls(
|
||||
}
|
||||
|
||||
/* check to see if memory is registered */
|
||||
for(n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request;
|
||||
n++) {
|
||||
for (int n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request; n++) {
|
||||
mca_bml_base_btl_t* bml_btl =
|
||||
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma,
|
||||
(bml_endpoint->btl_rdma_index + n) % num_btls);
|
||||
mca_mpool_base_registration_t* reg = &pml_ob1_dummy_reg;
|
||||
mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;
|
||||
(bml_endpoint->btl_rdma_index + n) % num_btls);
|
||||
mca_btl_base_registration_handle_t *reg_handle = NULL;
|
||||
mca_btl_base_module_t *btl = bml_btl->btl;
|
||||
|
||||
if( NULL != btl_mpool ) {
|
||||
if(!mca_pml_ob1.leave_pinned) {
|
||||
/* look through existing registrations */
|
||||
btl_mpool->mpool_find(btl_mpool, base, size, &reg);
|
||||
} else {
|
||||
/* register the memory */
|
||||
btl_mpool->mpool_register(btl_mpool, base, size, 0, &reg);
|
||||
if (btl->btl_register_mem) {
|
||||
/* do not use the RDMA protocol with this btl if 1) leave pinned is disabled,
|
||||
* 2) the btl supports put, and 3) the fragment is larger than the minimum
|
||||
* pipeline size specified by the BTL */
|
||||
if (!mca_pml_ob1.leave_pinned && (btl->btl_flags & MCA_BTL_FLAGS_PUT) &&
|
||||
size > btl->btl_min_rdma_pipeline_size) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if(NULL == reg)
|
||||
/* try to register the memory region with the btl */
|
||||
reg_handle = btl->btl_register_mem (btl, bml_btl->btl_endpoint, base,
|
||||
size, MCA_BTL_REG_FLAG_REMOTE_READ);
|
||||
if (NULL == reg_handle) {
|
||||
/* btl requires registration but the registration failed */
|
||||
continue;
|
||||
}
|
||||
}
|
||||
} /* else no registration is needed with this btl */
|
||||
|
||||
rdma_btls[num_btls_used].bml_btl = bml_btl;
|
||||
rdma_btls[num_btls_used].btl_reg = reg;
|
||||
rdma_btls[num_btls_used].btl_reg = reg_handle;
|
||||
weight_total += bml_btl->btl_weight;
|
||||
num_btls_used++;
|
||||
}
|
||||
@ -83,7 +85,7 @@ size_t mca_pml_ob1_rdma_btls(
|
||||
/* if we don't use leave_pinned and all BTLs that already have this memory
|
||||
* registered amount to less then half of available bandwidth - fall back to
|
||||
* pipeline protocol */
|
||||
if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5))
|
||||
if (0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5))
|
||||
return 0;
|
||||
|
||||
mca_pml_ob1_calc_weighted_length(rdma_btls, num_btls_used, size,
|
||||
@ -103,10 +105,7 @@ size_t mca_pml_ob1_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint,
|
||||
for(i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; i++) {
|
||||
rdma_btls[i].bml_btl =
|
||||
mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
|
||||
if(NULL != rdma_btls[i].bml_btl->btl->btl_mpool)
|
||||
rdma_btls[i].btl_reg = NULL;
|
||||
else
|
||||
rdma_btls[i].btl_reg = &pml_ob1_dummy_reg;
|
||||
rdma_btls[i].btl_reg = NULL;
|
||||
|
||||
weight_total += rdma_btls[i].bml_btl->btl_weight;
|
||||
}
|
||||
|
@ -1,3 +1,4 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -9,6 +10,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -21,9 +24,13 @@
|
||||
#include "pml_ob1.h"
|
||||
#include "pml_ob1_rdmafrag.h"
|
||||
|
||||
static void mca_pml_ob1_rdma_frag_constructor (mca_pml_ob1_rdma_frag_t *frag)
|
||||
{
|
||||
frag->local_handle = NULL;
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(
|
||||
mca_pml_ob1_rdma_frag_t,
|
||||
ompi_free_list_item_t,
|
||||
NULL,
|
||||
mca_pml_ob1_rdma_frag_constructor,
|
||||
NULL);
|
||||
|
@ -10,6 +10,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -32,38 +34,52 @@ typedef enum {
|
||||
MCA_PML_OB1_RDMA_GET
|
||||
} mca_pml_ob1_rdma_state_t;
|
||||
|
||||
struct mca_pml_ob1_rdma_frag_t;
|
||||
|
||||
typedef void (*mca_pml_ob1_rdma_frag_callback_t)(struct mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_length);
|
||||
|
||||
/**
|
||||
* Used to keep track of local and remote RDMA operations.
|
||||
*/
|
||||
struct mca_pml_ob1_rdma_frag_t {
|
||||
ompi_free_list_item_t super;
|
||||
mca_bml_base_btl_t* rdma_bml;
|
||||
mca_bml_base_btl_t *rdma_bml;
|
||||
mca_pml_ob1_hdr_t rdma_hdr;
|
||||
mca_pml_ob1_rdma_state_t rdma_state;
|
||||
size_t rdma_length;
|
||||
uint8_t rdma_segs[MCA_BTL_SEG_MAX_SIZE * MCA_BTL_DES_MAX_SEGMENTS];
|
||||
void *rdma_req;
|
||||
struct mca_bml_base_endpoint_t* rdma_ep;
|
||||
opal_convertor_t convertor;
|
||||
mca_mpool_base_registration_t* reg;
|
||||
uint32_t retries;
|
||||
mca_pml_ob1_rdma_frag_callback_t cbfunc;
|
||||
|
||||
uint64_t rdma_offset;
|
||||
void *local_address;
|
||||
mca_btl_base_registration_handle_t *local_handle;
|
||||
|
||||
uint64_t remote_address;
|
||||
uint8_t remote_handle[MCA_BTL_REG_HANDLE_MAX_SIZE];
|
||||
};
|
||||
typedef struct mca_pml_ob1_rdma_frag_t mca_pml_ob1_rdma_frag_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_pml_ob1_rdma_frag_t);
|
||||
|
||||
|
||||
#define MCA_PML_OB1_RDMA_FRAG_ALLOC(frag) \
|
||||
do { \
|
||||
ompi_free_list_item_t* item; \
|
||||
#define MCA_PML_OB1_RDMA_FRAG_ALLOC(frag) \
|
||||
do { \
|
||||
ompi_free_list_item_t* item; \
|
||||
OMPI_FREE_LIST_WAIT_MT(&mca_pml_ob1.rdma_frags, item); \
|
||||
frag = (mca_pml_ob1_rdma_frag_t*)item; \
|
||||
} while(0)
|
||||
|
||||
#define MCA_PML_OB1_RDMA_FRAG_RETURN(frag) \
|
||||
do { \
|
||||
/* return fragment */ \
|
||||
OMPI_FREE_LIST_RETURN_MT(&mca_pml_ob1.rdma_frags, \
|
||||
(ompi_free_list_item_t*)frag); \
|
||||
frag = (mca_pml_ob1_rdma_frag_t*)item; \
|
||||
} while(0)
|
||||
|
||||
#define MCA_PML_OB1_RDMA_FRAG_RETURN(frag) \
|
||||
do { \
|
||||
/* return fragment */ \
|
||||
if (frag->local_handle) { \
|
||||
mca_bml_base_deregister_mem (frag->rdma_bml, frag->local_handle); \
|
||||
frag->local_handle = NULL; \
|
||||
} \
|
||||
OMPI_FREE_LIST_RETURN_MT(&mca_pml_ob1.rdma_frags, \
|
||||
(ompi_free_list_item_t*)frag); \
|
||||
} while (0)
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
|
@ -13,7 +13,7 @@
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2006-2008 University of Houston. All rights reserved.
|
||||
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -108,13 +108,13 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata )
|
||||
{
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_pml_ob1_match_hdr_t* hdr = (mca_pml_ob1_match_hdr_t*)segments->seg_addr.pval;
|
||||
ompi_communicator_t *comm_ptr;
|
||||
mca_pml_ob1_recv_request_t *match = NULL;
|
||||
mca_pml_ob1_comm_t *comm;
|
||||
mca_pml_ob1_comm_proc_t *proc;
|
||||
size_t num_segments = des->des_local_count;
|
||||
size_t num_segments = des->des_segment_count;
|
||||
size_t bytes_received = 0;
|
||||
|
||||
assert(num_segments <= MCA_BTL_DES_MAX_SEGMENTS);
|
||||
@ -256,7 +256,7 @@ void mca_pml_ob1_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata )
|
||||
{
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
|
||||
|
||||
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
|
||||
@ -264,7 +264,7 @@ void mca_pml_ob1_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
|
||||
}
|
||||
ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_RNDV);
|
||||
mca_pml_ob1_recv_frag_match(btl, &hdr->hdr_match, segments,
|
||||
des->des_local_count, MCA_PML_OB1_HDR_TYPE_RNDV);
|
||||
des->des_segment_count, MCA_PML_OB1_HDR_TYPE_RNDV);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -273,7 +273,7 @@ void mca_pml_ob1_recv_frag_callback_rget(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata )
|
||||
{
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
|
||||
|
||||
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
|
||||
@ -281,7 +281,7 @@ void mca_pml_ob1_recv_frag_callback_rget(mca_btl_base_module_t* btl,
|
||||
}
|
||||
ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_RGET);
|
||||
mca_pml_ob1_recv_frag_match(btl, &hdr->hdr_match, segments,
|
||||
des->des_local_count, MCA_PML_OB1_HDR_TYPE_RGET);
|
||||
des->des_segment_count, MCA_PML_OB1_HDR_TYPE_RGET);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -292,9 +292,10 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata )
|
||||
{
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
|
||||
mca_pml_ob1_send_request_t* sendreq;
|
||||
size_t size;
|
||||
|
||||
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
|
||||
return;
|
||||
@ -307,19 +308,25 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
|
||||
/* if the request should be delivered entirely by copy in/out
|
||||
* then throttle sends */
|
||||
if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA) {
|
||||
if (NULL != sendreq->src_des) {
|
||||
/* release registered memory */
|
||||
mca_bml_base_free (sendreq->req_rdma[0].bml_btl, sendreq->src_des);
|
||||
sendreq->src_des = NULL;
|
||||
if (NULL != sendreq->rdma_frag) {
|
||||
if (NULL != sendreq->rdma_frag->local_handle) {
|
||||
mca_bml_base_deregister_mem (sendreq->req_rdma[0].bml_btl, sendreq->rdma_frag->local_handle);
|
||||
sendreq->rdma_frag->local_handle = NULL;
|
||||
}
|
||||
MCA_PML_OB1_RDMA_FRAG_RETURN(sendreq->rdma_frag);
|
||||
sendreq->rdma_frag = NULL;
|
||||
}
|
||||
|
||||
sendreq->req_throttle_sends = true;
|
||||
}
|
||||
|
||||
mca_pml_ob1_send_request_copy_in_out(sendreq,
|
||||
hdr->hdr_ack.hdr_send_offset,
|
||||
sendreq->req_send.req_bytes_packed -
|
||||
hdr->hdr_ack.hdr_send_offset);
|
||||
|
||||
if (hdr->hdr_ack.hdr_send_size) {
|
||||
size = hdr->hdr_ack.hdr_send_size;
|
||||
} else {
|
||||
size = sendreq->req_send.req_bytes_packed - hdr->hdr_ack.hdr_send_offset;
|
||||
}
|
||||
|
||||
mca_pml_ob1_send_request_copy_in_out(sendreq, hdr->hdr_ack.hdr_send_offset, size);
|
||||
|
||||
if (sendreq->req_state != 0) {
|
||||
/* Typical receipt of an ACK message causes req_state to be
|
||||
@ -355,13 +362,14 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata ) {
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
|
||||
mca_pml_ob1_recv_request_t* recvreq;
|
||||
|
||||
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
|
||||
return;
|
||||
}
|
||||
|
||||
ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FRAG);
|
||||
recvreq = (mca_pml_ob1_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
|
||||
#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
|
||||
@ -372,7 +380,7 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
|
||||
assert(btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV);
|
||||
|
||||
/* This will trigger the opal_convertor_pack to start asynchronous copy. */
|
||||
mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,des->des_local_count,des);
|
||||
mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,des->des_segment_count,des);
|
||||
|
||||
/* Let BTL know that it CANNOT free the frag */
|
||||
des->des_flags |= MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC;
|
||||
@ -380,7 +388,8 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
|
||||
return;
|
||||
}
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_local_count);
|
||||
|
||||
mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_segment_count);
|
||||
|
||||
return;
|
||||
}
|
||||
@ -390,7 +399,7 @@ void mca_pml_ob1_recv_frag_callback_put(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata ) {
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
|
||||
mca_pml_ob1_send_request_t* sendreq;
|
||||
|
||||
@ -410,20 +419,17 @@ void mca_pml_ob1_recv_frag_callback_fin(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata ) {
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
|
||||
mca_btl_base_descriptor_t* rdma;
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_pml_ob1_fin_hdr_t* hdr = (mca_pml_ob1_fin_hdr_t *) segments->seg_addr.pval;
|
||||
mca_pml_ob1_rdma_frag_t *frag;
|
||||
|
||||
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
|
||||
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_fin_hdr_t)) ) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FIN);
|
||||
rdma = (mca_btl_base_descriptor_t*)hdr->hdr_fin.hdr_des.pval;
|
||||
rdma->des_cbfunc(btl, NULL, rdma,
|
||||
hdr->hdr_fin.hdr_fail ? OMPI_ERROR : OMPI_SUCCESS);
|
||||
|
||||
return;
|
||||
frag = (mca_pml_ob1_rdma_frag_t *) hdr->hdr_frag.pval;
|
||||
frag->cbfunc (frag, hdr->hdr_size);
|
||||
}
|
||||
|
||||
|
||||
@ -699,7 +705,7 @@ out_of_order_match:
|
||||
OPAL_THREAD_UNLOCK(&comm->matching_lock);
|
||||
|
||||
if(OPAL_LIKELY(match)) {
|
||||
switch(type) {
|
||||
switch(type) {
|
||||
case MCA_PML_OB1_HDR_TYPE_MATCH:
|
||||
mca_pml_ob1_recv_request_progress_match(match, btl, segments, num_segments);
|
||||
break;
|
||||
|
@ -13,7 +13,7 @@
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
|
||||
* Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2012 FUJITSU LIMITED. All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
@ -150,12 +150,17 @@ static void mca_pml_ob1_recv_request_construct(mca_pml_ob1_recv_request_t* reque
|
||||
request->req_recv.req_base.req_ompi.req_free = mca_pml_ob1_recv_request_free;
|
||||
request->req_recv.req_base.req_ompi.req_cancel = mca_pml_ob1_recv_request_cancel;
|
||||
request->req_rdma_cnt = 0;
|
||||
request->local_handle = NULL;
|
||||
OBJ_CONSTRUCT(&request->lock, opal_mutex_t);
|
||||
}
|
||||
|
||||
/**
 * Destructor for an ob1 receive request.
 *
 * Destructs the request's lock object and, if the request still holds a
 * local registration handle, deregisters it through the request's rdma_bml
 * and clears the handle.
 *
 * @param request  receive request being destructed
 */
static void mca_pml_ob1_recv_request_destruct(mca_pml_ob1_recv_request_t* request)
{
    OBJ_DESTRUCT(&request->lock);

    /* common case: no registration is left on the request */
    if (OPAL_LIKELY(NULL == request->local_handle)) {
        return;
    }

    mca_bml_base_deregister_mem (request->rdma_bml, request->local_handle);
    request->local_handle = NULL;
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(
|
||||
@ -183,31 +188,27 @@ static void mca_pml_ob1_recv_ctl_completion( mca_btl_base_module_t* btl,
|
||||
* Put operation has completed remotely - update request status
|
||||
*/
|
||||
|
||||
static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* ep,
|
||||
struct mca_btl_base_descriptor_t* des,
|
||||
int status )
|
||||
static void mca_pml_ob1_put_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_size)
|
||||
{
|
||||
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
|
||||
mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)des->des_cbdata;
|
||||
size_t bytes_received = 0;
|
||||
mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
|
||||
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
|
||||
|
||||
if( OPAL_LIKELY(status == OMPI_SUCCESS) ) {
|
||||
bytes_received = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
|
||||
(void *) des->des_local,
|
||||
des->des_local_count, 0);
|
||||
}
|
||||
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,-1);
|
||||
|
||||
mca_bml_base_free(bml_btl, des);
|
||||
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
|
||||
|
||||
/* check completion status */
|
||||
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received);
|
||||
if(recv_request_pml_complete_check(recvreq) == false &&
|
||||
if (OPAL_LIKELY(0 < rdma_size)) {
|
||||
assert ((uint64_t) rdma_size == frag->rdma_length);
|
||||
|
||||
/* check completion status */
|
||||
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, (size_t) rdma_size);
|
||||
if (recv_request_pml_complete_check(recvreq) == false &&
|
||||
recvreq->req_rdma_offset < recvreq->req_send_offset) {
|
||||
/* schedule additional rdma operations */
|
||||
mca_pml_ob1_recv_request_schedule(recvreq, bml_btl);
|
||||
/* schedule additional rdma operations */
|
||||
mca_pml_ob1_recv_request_schedule(recvreq, bml_btl);
|
||||
}
|
||||
}
|
||||
|
||||
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
|
||||
}
|
||||
|
||||
@ -218,7 +219,7 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
|
||||
int mca_pml_ob1_recv_request_ack_send_btl(
|
||||
ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
|
||||
uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset,
|
||||
bool nordma)
|
||||
uint64_t size, bool nordma)
|
||||
{
|
||||
mca_btl_base_descriptor_t* des;
|
||||
mca_pml_ob1_ack_hdr_t* ack;
|
||||
@ -234,12 +235,9 @@ int mca_pml_ob1_recv_request_ack_send_btl(
|
||||
}
|
||||
|
||||
/* fill out header */
|
||||
ack = (mca_pml_ob1_ack_hdr_t*)des->des_local->seg_addr.pval;
|
||||
ack->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_ACK;
|
||||
ack->hdr_common.hdr_flags = nordma ? MCA_PML_OB1_HDR_FLAGS_NORDMA : 0;
|
||||
ack->hdr_src_req.lval = hdr_src_req;
|
||||
ack->hdr_dst_req.pval = hdr_dst_req;
|
||||
ack->hdr_send_offset = hdr_send_offset;
|
||||
ack = (mca_pml_ob1_ack_hdr_t*)des->des_segments->seg_addr.pval;
|
||||
mca_pml_ob1_ack_hdr_prepare (ack, nordma ? MCA_PML_OB1_HDR_FLAGS_NORDMA : 0,
|
||||
hdr_src_req, hdr_dst_req, hdr_send_offset, size);
|
||||
|
||||
ob1_hdr_hton(ack, MCA_PML_OB1_HDR_TYPE_ACK, proc);
|
||||
|
||||
@ -313,63 +311,99 @@ static int mca_pml_ob1_recv_request_ack(
|
||||
if(recvreq->req_send_offset == hdr->hdr_msg_length)
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* let the schedule function know there is no need to set the ACK flag */
|
||||
recvreq->req_ack_sent = true;
|
||||
return mca_pml_ob1_recv_request_ack_send(proc, hdr->hdr_src_req.lval,
|
||||
recvreq, recvreq->req_send_offset,
|
||||
recvreq, recvreq->req_send_offset, 0,
|
||||
recvreq->req_send_offset == bytes_received);
|
||||
}
|
||||
|
||||
static int mca_pml_ob1_recv_request_put_frag (mca_pml_ob1_rdma_frag_t *frag);
|
||||
|
||||
/**
 * Recover from a failed RDMA get for one fragment of a receive request.
 *
 * Recovery is attempted in this order:
 *  1. If get is not available for this transfer (OMPI_ERR_NOT_AVAILABLE),
 *     ask the peer to put the region instead. On success the fragment now
 *     belongs to the put protocol and nothing else is done. If the put
 *     request ran out of resources, the fragment is appended to
 *     mca_pml_ob1.rdma_pending.
 *  2. If the failure is OMPI_ERR_OUT_OF_RESOURCE and the fragment's retry
 *     count is still below mca_pml_ob1.rdma_retries_limit, the fragment is
 *     appended to mca_pml_ob1.rdma_pending.
 *  3. Otherwise the peer is told (via an ACK carrying the fragment's offset
 *     and length) to fall back on send for this region, and the fragment is
 *     returned.
 *
 * @param frag  RDMA fragment whose get failed
 * @param rc    error code reported for the failed get
 *
 * @return OMPI_SUCCESS when the fragment was handed to the put protocol or
 *         appended to the pending list; otherwise the result of sending the
 *         ACK that requests the send fallback.
 */
static int mca_pml_ob1_recv_request_get_frag_failed (mca_pml_ob1_rdma_frag_t *frag, int rc)
{
    mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
    ompi_proc_t *proc = (ompi_proc_t *) recvreq->req_recv.req_base.req_proc;

    if (OMPI_ERR_NOT_AVAILABLE == rc) {
        /* get isn't supported for this transfer. tell peer to fallback on put */
        rc = mca_pml_ob1_recv_request_put_frag (frag);
        if (OMPI_SUCCESS == rc) {
            /* The put request that was just sent references frag. Falling
             * through would additionally ask the peer to send the same
             * region and would return frag while it is still in use. */
            return OMPI_SUCCESS;
        }

        if (OMPI_ERR_OUT_OF_RESOURCE == rc) {
            OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
            opal_list_append (&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
            OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);

            return OMPI_SUCCESS;
        }
    }

    /* the retry counter is bumped on every pass through here, even when the
     * error is not a resource shortage */
    if (++frag->retries < mca_pml_ob1.rdma_retries_limit &&
        OMPI_ERR_OUT_OF_RESOURCE == rc) {
        OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
        opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
        OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);

        return OMPI_SUCCESS;
    }

    /* tell peer to fall back on send for this region */
    rc = mca_pml_ob1_recv_request_ack_send(proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval,
                                           recvreq, frag->rdma_offset, frag->rdma_length, false);
    MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
    return rc;
}
|
||||
|
||||
/**
|
||||
* Return resources used by the RDMA
|
||||
*/
|
||||
|
||||
static void mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* ep,
|
||||
struct mca_btl_base_descriptor_t* des,
|
||||
int status )
|
||||
static void mca_pml_ob1_rget_completion (mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep,
|
||||
void *local_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
void *context, void *cbdata, int status)
|
||||
{
|
||||
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
|
||||
mca_pml_ob1_rdma_frag_t* frag = (mca_pml_ob1_rdma_frag_t*)des->des_cbdata;
|
||||
mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)frag->rdma_req;
|
||||
mca_bml_base_btl_t *bml_btl = (mca_bml_base_btl_t *) context;
|
||||
mca_pml_ob1_rdma_frag_t *frag = (mca_pml_ob1_rdma_frag_t *) cbdata;
|
||||
mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
|
||||
|
||||
/* check completion status */
|
||||
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
|
||||
/* TSW - FIX */
|
||||
OMPI_ERROR_LOG(status);
|
||||
ompi_rte_abort(-1, NULL);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
|
||||
status = mca_pml_ob1_recv_request_get_frag_failed (frag, status);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
|
||||
/* TSW - FIX */
|
||||
OMPI_ERROR_LOG(status);
|
||||
ompi_rte_abort(-1, NULL);
|
||||
}
|
||||
} else {
|
||||
/* is receive request complete */
|
||||
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length);
|
||||
/* TODO: re-add order */
|
||||
mca_pml_ob1_send_fin (recvreq->req_recv.req_base.req_proc,
|
||||
bml_btl, frag->rdma_hdr.hdr_rget.hdr_frag,
|
||||
frag->rdma_length, 0, 0);
|
||||
|
||||
recv_request_pml_complete_check(recvreq);
|
||||
|
||||
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
|
||||
}
|
||||
|
||||
/* is receive request complete */
|
||||
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length);
|
||||
if (recvreq->req_recv.req_bytes_packed <= recvreq->req_bytes_received) {
|
||||
mca_pml_ob1_send_fin(recvreq->req_recv.req_base.req_proc,
|
||||
bml_btl,
|
||||
frag->rdma_hdr.hdr_rget.hdr_des,
|
||||
des->order, 0);
|
||||
}
|
||||
|
||||
recv_request_pml_complete_check(recvreq);
|
||||
|
||||
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
|
||||
|
||||
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
|
||||
}
|
||||
|
||||
|
||||
static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag,
|
||||
mca_btl_base_descriptor_t *dst) {
|
||||
static int mca_pml_ob1_recv_request_put_frag (mca_pml_ob1_rdma_frag_t *frag)
|
||||
{
|
||||
mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
|
||||
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
|
||||
mca_btl_base_descriptor_t *ctl;
|
||||
mca_pml_ob1_rdma_hdr_t *hdr;
|
||||
size_t seg_size;
|
||||
size_t reg_size;
|
||||
int rc;
|
||||
|
||||
seg_size = bml_btl->btl->btl_seg_size * dst->des_local_count;
|
||||
reg_size = bml_btl->btl->btl_registration_handle_size;
|
||||
|
||||
/* prepare a descriptor for rdma control message */
|
||||
mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof (mca_pml_ob1_rdma_hdr_t) + seg_size,
|
||||
mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof (mca_pml_ob1_rdma_hdr_t) + reg_size,
|
||||
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
|
||||
MCA_BTL_DES_SEND_ALWAYS_CALLBACK | MCA_BTL_DES_FLAGS_SIGNAL);
|
||||
if (OPAL_UNLIKELY(NULL == ctl)) {
|
||||
@ -378,26 +412,19 @@ static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag,
|
||||
ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion;
|
||||
|
||||
/* fill in rdma header */
|
||||
hdr = (mca_pml_ob1_rdma_hdr_t *) ctl->des_local->seg_addr.pval;
|
||||
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT;
|
||||
hdr->hdr_common.hdr_flags =
|
||||
(!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0;
|
||||
hdr = (mca_pml_ob1_rdma_hdr_t *) ctl->des_segments->seg_addr.pval;
|
||||
mca_pml_ob1_rdma_hdr_prepare (hdr, (!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0,
|
||||
recvreq->remote_req_send.lval, frag, recvreq, frag->rdma_offset,
|
||||
frag->local_address, frag->rdma_length, frag->local_handle,
|
||||
reg_size);
|
||||
|
||||
hdr->hdr_req = frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req;
|
||||
hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
|
||||
hdr->hdr_des.pval = dst;
|
||||
hdr->hdr_recv_req.pval = recvreq;
|
||||
frag->cbfunc = mca_pml_ob1_put_completion;
|
||||
|
||||
hdr->hdr_seg_cnt = dst->des_local_count;
|
||||
recvreq->req_ack_sent = true;
|
||||
|
||||
/* copy segments */
|
||||
memcpy (hdr + 1, dst->des_local, seg_size);
|
||||
|
||||
dst->des_cbfunc = mca_pml_ob1_put_completion;
|
||||
dst->des_cbdata = recvreq;
|
||||
|
||||
if (!recvreq->req_ack_sent)
|
||||
recvreq->req_ack_sent = true;
|
||||
PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
|
||||
&(recvreq->req_recv.req_base), size,
|
||||
PERUSE_RECV);
|
||||
|
||||
/* send rdma request to peer */
|
||||
rc = mca_bml_base_send (bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT);
|
||||
@ -412,71 +439,38 @@ static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag,
|
||||
/*
|
||||
*
|
||||
*/
|
||||
int mca_pml_ob1_recv_request_get_frag( mca_pml_ob1_rdma_frag_t* frag )
|
||||
int mca_pml_ob1_recv_request_get_frag (mca_pml_ob1_rdma_frag_t *frag)
|
||||
{
|
||||
mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)frag->rdma_req;
|
||||
mca_bml_base_btl_t* bml_btl = frag->rdma_bml;
|
||||
mca_btl_base_descriptor_t* descriptor;
|
||||
size_t save_size = frag->rdma_length;
|
||||
mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
|
||||
mca_btl_base_registration_handle_t *local_handle = NULL;
|
||||
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
|
||||
int rc;
|
||||
|
||||
/* prepare descriptor */
|
||||
mca_bml_base_prepare_dst( bml_btl,
|
||||
NULL,
|
||||
&recvreq->req_recv.req_base.req_convertor,
|
||||
MCA_BTL_NO_ORDER,
|
||||
0,
|
||||
&frag->rdma_length,
|
||||
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK |
|
||||
MCA_BTL_DES_FLAGS_GET,
|
||||
&descriptor );
|
||||
if( OPAL_UNLIKELY(NULL == descriptor) ) {
|
||||
if (frag->retries < mca_pml_ob1.rdma_retries_limit) {
|
||||
frag->rdma_length = save_size;
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
|
||||
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
} else {
|
||||
ompi_proc_t *proc = (ompi_proc_t *) recvreq->req_recv.req_base.req_proc;
|
||||
|
||||
/* tell peer to fall back on send */
|
||||
recvreq->req_send_offset = 0;
|
||||
rc = mca_pml_ob1_recv_request_ack_send(proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval,
|
||||
recvreq, recvreq->req_send_offset, true);
|
||||
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
|
||||
return rc;
|
||||
if (bml_btl->btl->btl_register_mem && !frag->local_handle && !recvreq->local_handle) {
|
||||
mca_bml_base_register_mem (bml_btl, frag->local_address, frag->rdma_length, MCA_BTL_REG_FLAG_LOCAL_WRITE |
|
||||
MCA_BTL_REG_FLAG_REMOTE_WRITE, &frag->local_handle);
|
||||
if (OPAL_UNLIKELY(NULL == frag->local_handle)) {
|
||||
return mca_pml_ob1_recv_request_get_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE);
|
||||
}
|
||||
}
|
||||
|
||||
descriptor->des_remote = (mca_btl_base_segment_t *) frag->rdma_segs;
|
||||
descriptor->des_remote_count = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt;
|
||||
descriptor->des_cbfunc = mca_pml_ob1_rget_completion;
|
||||
descriptor->des_cbdata = frag;
|
||||
if (frag->local_handle) {
|
||||
local_handle = frag->local_handle;
|
||||
} else if (recvreq->local_handle) {
|
||||
local_handle = recvreq->local_handle;
|
||||
}
|
||||
|
||||
PERUSE_TRACE_COMM_OMPI_EVENT(PERUSE_COMM_REQ_XFER_CONTINUE,
|
||||
&(recvreq->req_recv.req_base),
|
||||
&(((mca_pml_ob1_recv_request_t *) frag->rdma_req)->req_recv.req_base),
|
||||
frag->rdma_length, PERUSE_RECV);
|
||||
|
||||
/* queue up get request */
|
||||
rc = mca_bml_base_get(bml_btl,descriptor);
|
||||
rc = mca_bml_base_get (bml_btl, frag->local_address, frag->remote_address, local_handle,
|
||||
(mca_btl_base_registration_handle_t *) frag->remote_handle, frag->rdma_length,
|
||||
0, MCA_BTL_NO_ORDER, mca_pml_ob1_rget_completion, frag);
|
||||
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
|
||||
if (OPAL_UNLIKELY(OMPI_ERR_NOT_AVAILABLE == rc)) {
|
||||
/* get isn't supported for this transfer. tell peer to fallback on put */
|
||||
rc = mca_pml_ob1_init_get_fallback (frag, descriptor);
|
||||
}
|
||||
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||||
mca_bml_base_free(bml_btl, descriptor);
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
|
||||
opal_list_append(&mca_pml_ob1.rdma_pending,
|
||||
(opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
} else if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
|
||||
OMPI_ERROR_LOG(rc);
|
||||
ompi_rte_abort(-1, NULL);
|
||||
}
|
||||
return mca_pml_ob1_recv_request_get_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
@ -502,6 +496,7 @@ void mca_pml_ob1_recv_request_progress_frag( mca_pml_ob1_recv_request_t* recvreq
|
||||
bytes_received = mca_pml_ob1_compute_segment_length_base (segments, num_segments,
|
||||
sizeof(mca_pml_ob1_frag_hdr_t));
|
||||
data_offset = hdr->hdr_frag.hdr_frag_offset;
|
||||
|
||||
/*
|
||||
* Make user buffer accessible(defined) before unpacking.
|
||||
*/
|
||||
@ -573,7 +568,7 @@ void mca_pml_ob1_recv_request_frag_copy_start( mca_pml_ob1_recv_request_t* recvr
|
||||
/* Store the receive request in unused context pointer. */
|
||||
des->des_context = (void *)recvreq;
|
||||
/* Store the amount of bytes in unused remote count value */
|
||||
des->des_remote_count = bytes_delivered;
|
||||
des->des_segment_count = bytes_delivered;
|
||||
/* Then record an event that will get triggered by a PML progress call which
|
||||
* checks the stream events. If we get an error, abort. Should get message
|
||||
* from CUDA code about what went wrong. */
|
||||
@ -598,7 +593,7 @@ void mca_pml_ob1_recv_request_frag_copy_finished( mca_btl_base_module_t* btl,
|
||||
int status )
|
||||
{
|
||||
mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)des->des_context;
|
||||
size_t bytes_received = des->des_remote_count;
|
||||
size_t bytes_received = des->des_segment_count;
|
||||
|
||||
OPAL_OUTPUT((-1, "frag_copy_finished (delivered=%d), frag=%p", (int)bytes_received, (void *)des));
|
||||
/* Call into the BTL so it can free the descriptor. At this point, it is
|
||||
@ -629,7 +624,6 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
|
||||
mca_pml_ob1_rget_hdr_t* hdr = (mca_pml_ob1_rget_hdr_t*)segments->seg_addr.pval;
|
||||
mca_bml_base_endpoint_t* bml_endpoint = NULL;
|
||||
size_t bytes_remaining, prev_sent, offset;
|
||||
mca_btl_base_segment_t *r_segments;
|
||||
mca_pml_ob1_rdma_frag_t *frag;
|
||||
mca_bml_base_btl_t *rdma_bml;
|
||||
int rc;
|
||||
@ -637,6 +631,7 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
|
||||
prev_sent = offset = 0;
|
||||
bytes_remaining = hdr->hdr_rndv.hdr_msg_length;
|
||||
recvreq->req_recv.req_bytes_packed = hdr->hdr_rndv.hdr_msg_length;
|
||||
recvreq->req_send_offset = 0;
|
||||
|
||||
MCA_PML_OB1_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_rndv.hdr_match);
|
||||
|
||||
@ -680,8 +675,28 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
|
||||
ompi_rte_abort(-1, NULL);
|
||||
}
|
||||
|
||||
bytes_remaining = mca_pml_ob1_compute_segment_length_remote (btl->btl_seg_size, (void *)(hdr + 1),
|
||||
hdr->hdr_seg_cnt, recvreq->req_recv.req_base.req_proc);
|
||||
bytes_remaining = hdr->hdr_rndv.hdr_msg_length;
|
||||
|
||||
/* save the request for put fallback */
|
||||
recvreq->remote_req_send = hdr->hdr_rndv.hdr_src_req;
|
||||
recvreq->rdma_bml = rdma_bml;
|
||||
|
||||
/* try to register the entire buffer */
|
||||
if (rdma_bml->btl->btl_register_mem) {
|
||||
void *data_ptr;
|
||||
|
||||
offset = 0;
|
||||
|
||||
OPAL_THREAD_LOCK(&recvreq->lock);
|
||||
opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, &offset);
|
||||
opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &data_ptr);
|
||||
OPAL_THREAD_UNLOCK(&recvreq->lock);
|
||||
|
||||
mca_bml_base_register_mem (rdma_bml, data_ptr, bytes_remaining, MCA_BTL_REG_FLAG_LOCAL_WRITE |
|
||||
MCA_BTL_REG_FLAG_REMOTE_WRITE, &recvreq->local_handle);
|
||||
/* It is not an error if the memory region can not be registered here. The registration will
|
||||
* be attempted again for each get fragment. */
|
||||
}
|
||||
|
||||
/* The while loop adds a fragmentation mechanism. The variable bytes_remaining holds the num
|
||||
* of bytes left to be sent. In each iteration we send the max possible bytes supported
|
||||
@ -690,7 +705,12 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
|
||||
* the next iteration with the updated size.
|
||||
* Also - In each iteration we update the location in the buffer to be used for writing
|
||||
* the message, and the location to read from. This is done using the offset variable that
|
||||
* accumulates the number of bytes that were sent so far. */
|
||||
* accumulates the number of bytes that were sent so far.
|
||||
*
|
||||
* NTH: This fragmentation may go away if we change the btls to require them to handle
|
||||
* get fragmentation internally. This is a reasonable solution since some btls do not
|
||||
* need any fragmentation (sm, vader, self, etc). Remove this loop if this ends up
|
||||
* being the case. */
|
||||
while (bytes_remaining > 0) {
|
||||
/* allocate/initialize a fragment */
|
||||
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
|
||||
@ -700,29 +720,31 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
|
||||
ompi_rte_abort(-1, NULL);
|
||||
}
|
||||
|
||||
assert (btl->btl_seg_size * hdr->hdr_seg_cnt <= sizeof (frag->rdma_segs));
|
||||
memcpy (frag->remote_handle, hdr + 1, btl->btl_registration_handle_size);
|
||||
|
||||
memcpy (frag->rdma_segs, hdr + 1, btl->btl_seg_size * hdr->hdr_seg_cnt);
|
||||
|
||||
/* update the read location -- NTH: note this will only work if there is exactly one
|
||||
segment. TODO -- make this work with multiple segments */
|
||||
r_segments = (mca_btl_base_segment_t *) frag->rdma_segs;
|
||||
r_segments->seg_addr.lval += offset;
|
||||
/* update the read location */
|
||||
frag->remote_address = hdr->hdr_src_ptr + offset;
|
||||
|
||||
/* updating the write location */
|
||||
OPAL_THREAD_LOCK(&recvreq->lock);
|
||||
opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, &offset);
|
||||
opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &frag->local_address);
|
||||
OPAL_THREAD_UNLOCK(&recvreq->lock);
|
||||
|
||||
frag->rdma_bml = rdma_bml;
|
||||
|
||||
frag->rdma_hdr.hdr_rget = *hdr;
|
||||
frag->retries = 0;
|
||||
frag->rdma_req = recvreq;
|
||||
frag->rdma_ep = bml_endpoint;
|
||||
frag->rdma_state = MCA_PML_OB1_RDMA_GET;
|
||||
frag->reg = NULL;
|
||||
frag->rdma_length = bytes_remaining;
|
||||
frag->retries = 0;
|
||||
frag->rdma_req = recvreq;
|
||||
frag->rdma_state = MCA_PML_OB1_RDMA_GET;
|
||||
frag->local_handle = NULL;
|
||||
frag->rdma_offset = offset;
|
||||
|
||||
if (bytes_remaining > rdma_bml->btl->btl_get_limit) {
|
||||
frag->rdma_length = rdma_bml->btl->btl_get_limit;
|
||||
} else {
|
||||
frag->rdma_length = bytes_remaining;
|
||||
}
|
||||
|
||||
/* NTH: TODO -- handle error conditions gracefully */
|
||||
rc = mca_pml_ob1_recv_request_get_frag(frag);
|
||||
@ -921,13 +943,11 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
|
||||
|
||||
while(bytes_remaining > 0 &&
|
||||
recvreq->req_pipeline_depth < mca_pml_ob1.recv_pipeline_depth) {
|
||||
size_t size, seg_size;
|
||||
mca_pml_ob1_rdma_hdr_t* hdr;
|
||||
mca_btl_base_descriptor_t* dst;
|
||||
mca_btl_base_descriptor_t* ctl;
|
||||
mca_mpool_base_registration_t * reg = NULL;
|
||||
mca_btl_base_module_t* btl;
|
||||
mca_pml_ob1_rdma_frag_t *frag = NULL;
|
||||
mca_btl_base_module_t *btl;
|
||||
int rc, rdma_idx;
|
||||
void *data_ptr;
|
||||
size_t size;
|
||||
|
||||
if(prev_bytes_remaining == bytes_remaining) {
|
||||
if(++num_fail == num_tries) {
|
||||
@ -948,86 +968,62 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
|
||||
do {
|
||||
rdma_idx = recvreq->req_rdma_idx;
|
||||
bml_btl = recvreq->req_rdma[rdma_idx].bml_btl;
|
||||
reg = recvreq->req_rdma[rdma_idx].btl_reg;
|
||||
size = recvreq->req_rdma[rdma_idx].length;
|
||||
if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt)
|
||||
recvreq->req_rdma_idx = 0;
|
||||
} while(!size);
|
||||
btl = bml_btl->btl;
|
||||
|
||||
/* makes sure that we don't exceed BTL max rdma size
|
||||
* if memory is not pinned already */
|
||||
if( (NULL == reg) && (btl->btl_rdma_pipeline_frag_size != 0) &&
|
||||
(size > btl->btl_rdma_pipeline_frag_size)) {
|
||||
/* NTH: This conditional used to check if there was a registration in
|
||||
* recvreq->req_rdma[rdma_idx].btl_reg. If one existed it was due to
|
||||
* the btl not needing registration (equivalent to btl->btl_register_mem
|
||||
* != NULL). This new check is equivalent. Note: I feel this protocol
|
||||
* needs work to better improve resource usage when running with a
|
||||
* leave pinned protocol. */
|
||||
if (btl->btl_register_mem && (btl->btl_rdma_pipeline_frag_size != 0) &&
|
||||
(size > btl->btl_rdma_pipeline_frag_size)) {
|
||||
size = btl->btl_rdma_pipeline_frag_size;
|
||||
}
|
||||
|
||||
/* take lock to protect converter against concurrent access
|
||||
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
|
||||
if (OPAL_UNLIKELY(NULL == frag)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* take lock to protect convertor against concurrent access
|
||||
* from unpack */
|
||||
OPAL_THREAD_LOCK(&recvreq->lock);
|
||||
opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor,
|
||||
&recvreq->req_rdma_offset );
|
||||
|
||||
/* prepare a descriptor for RDMA */
|
||||
mca_bml_base_prepare_dst(bml_btl, reg,
|
||||
&recvreq->req_recv.req_base.req_convertor,
|
||||
MCA_BTL_NO_ORDER, 0, &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
|
||||
MCA_BTL_DES_FLAGS_PUT, &dst);
|
||||
opal_convertor_set_position (&recvreq->req_recv.req_base.req_convertor,
|
||||
&recvreq->req_rdma_offset);
|
||||
opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &data_ptr);
|
||||
OPAL_THREAD_UNLOCK(&recvreq->lock);
|
||||
|
||||
if(OPAL_UNLIKELY(dst == NULL)) {
|
||||
continue;
|
||||
if (btl->btl_register_mem) {
|
||||
mca_bml_base_register_mem (bml_btl, data_ptr, size, MCA_BTL_REG_FLAG_REMOTE_WRITE,
|
||||
&frag->local_handle);
|
||||
if (OPAL_UNLIKELY(NULL == frag->local_handle)) {
|
||||
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
dst->des_cbfunc = mca_pml_ob1_put_completion;
|
||||
dst->des_cbdata = recvreq;
|
||||
/* fill in the minimum information needed to handle the fin message */
|
||||
frag->cbfunc = mca_pml_ob1_put_completion;
|
||||
frag->rdma_length = size;
|
||||
frag->rdma_req = recvreq;
|
||||
frag->rdma_bml = bml_btl;
|
||||
frag->local_address = data_ptr;
|
||||
frag->rdma_offset = recvreq->req_rdma_offset;
|
||||
|
||||
seg_size = btl->btl_seg_size * dst->des_local_count;
|
||||
|
||||
/* prepare a descriptor for rdma control message */
|
||||
mca_bml_base_alloc(bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_rdma_hdr_t) + seg_size,
|
||||
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
|
||||
MCA_BTL_DES_SEND_ALWAYS_CALLBACK | MCA_BTL_DES_FLAGS_SIGNAL);
|
||||
|
||||
if( OPAL_UNLIKELY(NULL == ctl) ) {
|
||||
mca_bml_base_free(bml_btl,dst);
|
||||
continue;
|
||||
}
|
||||
ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion;
|
||||
|
||||
/* fill in rdma header */
|
||||
hdr = (mca_pml_ob1_rdma_hdr_t*)ctl->des_local->seg_addr.pval;
|
||||
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT;
|
||||
hdr->hdr_common.hdr_flags =
|
||||
(!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0;
|
||||
hdr->hdr_req = recvreq->remote_req_send;
|
||||
hdr->hdr_des.pval = dst;
|
||||
hdr->hdr_recv_req.pval = recvreq;
|
||||
hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
|
||||
hdr->hdr_seg_cnt = dst->des_local_count;
|
||||
|
||||
/* copy segments */
|
||||
memmove (hdr + 1, dst->des_local, seg_size);
|
||||
|
||||
if(!recvreq->req_ack_sent)
|
||||
recvreq->req_ack_sent = true;
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_PUT, recvreq->req_recv.req_base.req_proc);
|
||||
|
||||
PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
|
||||
&(recvreq->req_recv.req_base), size,
|
||||
PERUSE_RECV);
|
||||
|
||||
/* send rdma request to peer */
|
||||
rc = mca_bml_base_send(bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT);
|
||||
if( OPAL_LIKELY( rc >= 0 ) ) {
|
||||
rc = mca_pml_ob1_recv_request_put_frag (frag);
|
||||
if (OPAL_LIKELY(OMPI_SUCCESS == rc)) {
|
||||
/* update request state */
|
||||
recvreq->req_rdma_offset += size;
|
||||
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth, 1);
|
||||
recvreq->req_rdma[rdma_idx].length -= size;
|
||||
bytes_remaining -= size;
|
||||
} else {
|
||||
mca_bml_base_free(bml_btl,ctl);
|
||||
mca_bml_base_free(bml_btl,dst);
|
||||
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,3 +1,4 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -10,7 +11,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
@ -52,6 +53,8 @@ struct mca_pml_ob1_recv_request_t {
|
||||
bool req_ack_sent; /**< whether ack was sent to the sender */
|
||||
bool req_match_received; /**< Prevent request to be completed prematurely */
|
||||
opal_mutex_t lock;
|
||||
mca_bml_base_btl_t *rdma_bml;
|
||||
mca_btl_base_registration_handle_t *local_handle;
|
||||
mca_pml_ob1_com_btl_t req_rdma[1];
|
||||
};
|
||||
typedef struct mca_pml_ob1_recv_request_t mca_pml_ob1_recv_request_t;
|
||||
@ -131,8 +134,12 @@ do { \
|
||||
#define MCA_PML_OB1_RECV_REQUEST_RETURN(recvreq) \
|
||||
{ \
|
||||
MCA_PML_BASE_RECV_REQUEST_FINI(&(recvreq)->req_recv); \
|
||||
OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_recv_requests, \
|
||||
(ompi_free_list_item_t*)(recvreq)); \
|
||||
if ((recvreq)->local_handle) { \
|
||||
mca_bml_base_deregister_mem ((recvreq)->rdma_bml, (recvreq)->local_handle); \
|
||||
(recvreq)->local_handle = NULL; \
|
||||
} \
|
||||
OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_recv_requests, \
|
||||
(ompi_free_list_item_t*)(recvreq)); \
|
||||
}
|
||||
|
||||
/**
|
||||
@ -154,9 +161,11 @@ recv_request_pml_complete(mca_pml_ob1_recv_request_t *recvreq)
|
||||
}
|
||||
|
||||
for(i = 0; i < recvreq->req_rdma_cnt; i++) {
|
||||
mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[i].btl_reg;
|
||||
if( NULL != btl_reg && btl_reg->mpool != NULL) {
|
||||
btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg );
|
||||
struct mca_btl_base_registration_handle_t *handle = recvreq->req_rdma[i].btl_reg;
|
||||
mca_bml_base_btl_t *bml_btl = recvreq->req_rdma[i].bml_btl;
|
||||
|
||||
if (NULL != handle) {
|
||||
mca_bml_base_deregister_mem (bml_btl, handle);
|
||||
}
|
||||
}
|
||||
recvreq->req_rdma_cnt = 0;
|
||||
@ -178,6 +187,10 @@ recv_request_pml_complete(mca_pml_ob1_recv_request_t *recvreq)
|
||||
recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR =
|
||||
MPI_ERR_TRUNCATE;
|
||||
}
|
||||
if (OPAL_UNLIKELY(recvreq->local_handle)) {
|
||||
mca_bml_base_deregister_mem (recvreq->rdma_bml, recvreq->local_handle);
|
||||
recvreq->local_handle = NULL;
|
||||
}
|
||||
MCA_PML_OB1_RECV_REQUEST_MPI_COMPLETE(recvreq);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&ompi_request_lock);
|
||||
@ -387,7 +400,7 @@ static inline void mca_pml_ob1_recv_request_schedule(
|
||||
(void)mca_pml_ob1_recv_request_schedule_exclusive(req, start_bml_btl);
|
||||
}
|
||||
|
||||
#define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O) \
|
||||
#define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O, Sz) \
|
||||
do { \
|
||||
mca_pml_ob1_pckt_pending_t *_pckt; \
|
||||
\
|
||||
@ -396,6 +409,7 @@ static inline void mca_pml_ob1_recv_request_schedule(
|
||||
_pckt->hdr.hdr_ack.hdr_src_req.lval = (S); \
|
||||
_pckt->hdr.hdr_ack.hdr_dst_req.pval = (D); \
|
||||
_pckt->hdr.hdr_ack.hdr_send_offset = (O); \
|
||||
_pckt->hdr.hdr_ack.hdr_send_size = (Sz); \
|
||||
_pckt->proc = (P); \
|
||||
_pckt->bml_btl = NULL; \
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock); \
|
||||
@ -406,11 +420,11 @@ static inline void mca_pml_ob1_recv_request_schedule(
|
||||
|
||||
int mca_pml_ob1_recv_request_ack_send_btl(ompi_proc_t* proc,
|
||||
mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req,
|
||||
uint64_t hdr_rdma_offset, bool nordma);
|
||||
uint64_t hdr_rdma_offset, uint64_t size, bool nordma);
|
||||
|
||||
static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc,
|
||||
uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset,
|
||||
bool nordma)
|
||||
uint64_t size, bool nordma)
|
||||
{
|
||||
size_t i;
|
||||
mca_bml_base_btl_t* bml_btl;
|
||||
@ -420,12 +434,12 @@ static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc,
|
||||
for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
|
||||
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
|
||||
if(mca_pml_ob1_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req,
|
||||
hdr_dst_req, hdr_send_offset, nordma) == OMPI_SUCCESS)
|
||||
hdr_dst_req, hdr_send_offset, size, nordma) == OMPI_SUCCESS)
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
MCA_PML_OB1_ADD_ACK_TO_PENDING(proc, hdr_src_req, hdr_dst_req,
|
||||
hdr_send_offset);
|
||||
hdr_send_offset, size);
|
||||
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
@ -13,7 +13,7 @@
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -137,6 +137,7 @@ static void mca_pml_ob1_send_request_construct(mca_pml_ob1_send_request_t* req)
|
||||
req->req_send.req_base.req_ompi.req_cancel = mca_pml_ob1_send_request_cancel;
|
||||
req->req_rdma_cnt = 0;
|
||||
req->req_throttle_sends = false;
|
||||
req->rdma_frag = NULL;
|
||||
OBJ_CONSTRUCT(&req->req_send_ranges, opal_list_t);
|
||||
OBJ_CONSTRUCT(&req->req_send_range_lock, opal_mutex_t);
|
||||
}
|
||||
@ -145,6 +146,10 @@ static void mca_pml_ob1_send_request_destruct(mca_pml_ob1_send_request_t* req)
|
||||
{
|
||||
OBJ_DESTRUCT(&req->req_send_ranges);
|
||||
OBJ_DESTRUCT(&req->req_send_range_lock);
|
||||
if (req->rdma_frag) {
|
||||
MCA_PML_OB1_RDMA_FRAG_RETURN(req->rdma_frag);
|
||||
req->rdma_frag = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE( mca_pml_ob1_send_request_t,
|
||||
@ -236,10 +241,9 @@ mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl,
|
||||
* happens in one thread, the increase of the req_bytes_delivered does not
|
||||
* have to be atomic.
|
||||
*/
|
||||
req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
|
||||
(void *) des->des_local,
|
||||
des->des_local_count,
|
||||
sizeof(mca_pml_ob1_rendezvous_hdr_t));
|
||||
req_bytes_delivered = mca_pml_ob1_compute_segment_length_base ((void *) des->des_segments,
|
||||
des->des_segment_count,
|
||||
sizeof(mca_pml_ob1_rendezvous_hdr_t));
|
||||
|
||||
mca_pml_ob1_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered );
|
||||
}
|
||||
@ -250,27 +254,18 @@ mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl,
|
||||
*/
|
||||
|
||||
static void
|
||||
mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* ep,
|
||||
struct mca_btl_base_descriptor_t* des,
|
||||
int status )
|
||||
mca_pml_ob1_rget_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_length)
|
||||
{
|
||||
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)des->des_cbdata;
|
||||
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
|
||||
size_t req_bytes_delivered;
|
||||
mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
|
||||
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
|
||||
|
||||
/* count bytes of user data actually delivered and check for request completion */
|
||||
if (OPAL_LIKELY(OMPI_SUCCESS == status)) {
|
||||
req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
|
||||
(void *) des->des_local,
|
||||
des->des_local_count, 0);
|
||||
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
|
||||
if (OPAL_LIKELY(0 < rdma_length)) {
|
||||
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, (size_t) rdma_length);
|
||||
}
|
||||
sendreq->src_des = NULL;
|
||||
|
||||
send_request_pml_complete_check(sendreq);
|
||||
/* free the descriptor */
|
||||
mca_bml_base_free(bml_btl, des);
|
||||
|
||||
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
|
||||
}
|
||||
|
||||
@ -314,10 +309,9 @@ mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl,
|
||||
}
|
||||
|
||||
/* count bytes of user data actually delivered */
|
||||
req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
|
||||
(void *) des->des_local,
|
||||
des->des_local_count,
|
||||
sizeof(mca_pml_ob1_frag_hdr_t));
|
||||
req_bytes_delivered = mca_pml_ob1_compute_segment_length_base ((void *) des->des_segments,
|
||||
des->des_segment_count,
|
||||
sizeof(mca_pml_ob1_frag_hdr_t));
|
||||
|
||||
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, -1);
|
||||
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
|
||||
@ -389,7 +383,7 @@ int mca_pml_ob1_send_request_start_buffered(
|
||||
if( OPAL_UNLIKELY(NULL == des) ) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
segment = des->des_local;
|
||||
segment = des->des_segments;
|
||||
|
||||
/* pack the data into the BTL supplied buffer */
|
||||
iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval +
|
||||
@ -408,17 +402,14 @@ int mca_pml_ob1_send_request_start_buffered(
|
||||
|
||||
/* build rendezvous header */
|
||||
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
|
||||
hdr->hdr_common.hdr_flags = 0;
|
||||
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV;
|
||||
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
|
||||
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
|
||||
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
|
||||
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
|
||||
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
|
||||
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
|
||||
mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RNDV, 0,
|
||||
sendreq->req_send.req_base.req_comm->c_contextid,
|
||||
sendreq->req_send.req_base.req_comm->c_my_rank,
|
||||
sendreq->req_send.req_base.req_tag,
|
||||
(uint16_t)sendreq->req_send.req_base.req_sequence,
|
||||
sendreq->req_send.req_bytes_packed, sendreq);
|
||||
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV,
|
||||
sendreq->req_send.req_base.req_proc);
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc);
|
||||
|
||||
/* update lengths */
|
||||
segment->seg_len = sizeof(mca_pml_ob1_rendezvous_hdr_t) + max_data;
|
||||
@ -491,15 +482,13 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
|
||||
|
||||
if(NULL != bml_btl->btl->btl_sendi) {
|
||||
mca_pml_ob1_match_hdr_t match;
|
||||
match.hdr_common.hdr_flags = 0;
|
||||
match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
|
||||
match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
|
||||
match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
|
||||
match.hdr_tag = sendreq->req_send.req_base.req_tag;
|
||||
match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
|
||||
mca_pml_ob1_match_hdr_prepare (&match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
|
||||
sendreq->req_send.req_base.req_comm->c_contextid,
|
||||
sendreq->req_send.req_base.req_comm->c_my_rank,
|
||||
sendreq->req_send.req_base.req_tag,
|
||||
(uint16_t)sendreq->req_send.req_base.req_sequence);
|
||||
|
||||
ob1_hdr_hton(&match, MCA_PML_OB1_HDR_TYPE_MATCH,
|
||||
sendreq->req_send.req_base.req_proc);
|
||||
ob1_hdr_hton (&match, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);
|
||||
|
||||
/* try to send immediately */
|
||||
rc = mca_bml_base_sendi( bml_btl, &sendreq->req_send.req_base.req_convertor,
|
||||
@ -532,7 +521,7 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
segment = des->des_local;
|
||||
segment = des->des_segments;
|
||||
|
||||
if(size > 0) {
|
||||
/* pack the data into the supplied buffer */
|
||||
@ -566,15 +555,13 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
|
||||
|
||||
/* build match header */
|
||||
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
|
||||
hdr->hdr_common.hdr_flags = 0;
|
||||
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
|
||||
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
|
||||
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
|
||||
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
|
||||
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
|
||||
mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
|
||||
sendreq->req_send.req_base.req_comm->c_contextid,
|
||||
sendreq->req_send.req_base.req_comm->c_my_rank,
|
||||
sendreq->req_send.req_base.req_tag,
|
||||
(uint16_t)sendreq->req_send.req_base.req_sequence);
|
||||
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH,
|
||||
sendreq->req_send.req_base.req_proc);
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);
|
||||
|
||||
/* update lengths */
|
||||
segment->seg_len = OMPI_PML_OB1_MATCH_HDR_LEN + max_data;
|
||||
@ -618,7 +605,6 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq,
|
||||
|
||||
/* prepare descriptor */
|
||||
mca_bml_base_prepare_src( bml_btl,
|
||||
NULL,
|
||||
&sendreq->req_send.req_base.req_convertor,
|
||||
MCA_BTL_NO_ORDER,
|
||||
OMPI_PML_OB1_MATCH_HDR_LEN,
|
||||
@ -628,19 +614,17 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq,
|
||||
if( OPAL_UNLIKELY(NULL == des) ) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
segment = des->des_local;
|
||||
segment = des->des_segments;
|
||||
|
||||
/* build match header */
|
||||
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
|
||||
hdr->hdr_common.hdr_flags = 0;
|
||||
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
|
||||
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
|
||||
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
|
||||
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
|
||||
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
|
||||
mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
|
||||
sendreq->req_send.req_base.req_comm->c_contextid,
|
||||
sendreq->req_send.req_base.req_comm->c_my_rank,
|
||||
sendreq->req_send.req_base.req_tag,
|
||||
(uint16_t)sendreq->req_send.req_base.req_sequence);
|
||||
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH,
|
||||
sendreq->req_send.req_base.req_proc);
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);
|
||||
|
||||
/* short message */
|
||||
des->des_cbfunc = mca_pml_ob1_match_completion_free;
|
||||
@ -674,80 +658,68 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,
|
||||
* one RDMA capable BTLs). This way round robin distribution of RDMA
|
||||
* operation is achieved.
|
||||
*/
|
||||
|
||||
mca_btl_base_descriptor_t *des, *src = NULL;
|
||||
mca_btl_base_registration_handle_t *local_handle;
|
||||
mca_btl_base_descriptor_t *des;
|
||||
mca_pml_ob1_rdma_frag_t *frag;
|
||||
mca_pml_ob1_rget_hdr_t *hdr;
|
||||
size_t seg_size;
|
||||
size_t reg_size;
|
||||
void *data_ptr;
|
||||
int rc;
|
||||
|
||||
sendreq->src_des = NULL;
|
||||
|
||||
bml_btl = sendreq->req_rdma[0].bml_btl;
|
||||
if (!(bml_btl->btl_flags & (MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_CUDA_GET))) {
|
||||
sendreq->rdma_frag = NULL;
|
||||
/* This BTL does not support get. Use rendezvous to start the RDMA operation using put instead. */
|
||||
return mca_pml_ob1_send_request_start_rndv (sendreq, bml_btl, 0, MCA_PML_OB1_HDR_FLAGS_CONTIG |
|
||||
MCA_PML_OB1_HDR_FLAGS_PIN);
|
||||
}
|
||||
|
||||
MEMCHECKER(
|
||||
memchecker_call(&opal_memchecker_base_mem_defined,
|
||||
sendreq->req_send.req_base.req_addr,
|
||||
sendreq->req_send.req_base.req_count,
|
||||
sendreq->req_send.req_base.req_datatype);
|
||||
);
|
||||
/* prepare source descriptor/segment(s) */
|
||||
/* PML owns this descriptor and will free it in */
|
||||
/* mca_pml_ob1_rget_completion */
|
||||
mca_bml_base_prepare_src( bml_btl, sendreq->req_rdma[0].btl_reg,
|
||||
&sendreq->req_send.req_base.req_convertor,
|
||||
MCA_BTL_NO_ORDER, 0, &size, MCA_BTL_DES_FLAGS_GET |
|
||||
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, &src );
|
||||
MEMCHECKER(
|
||||
memchecker_call(&opal_memchecker_base_mem_noaccess,
|
||||
sendreq->req_send.req_base.req_addr,
|
||||
sendreq->req_send.req_base.req_count,
|
||||
sendreq->req_send.req_base.req_datatype);
|
||||
);
|
||||
if( OPAL_UNLIKELY(NULL == src) ) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
/* at this time ob1 does not support non-contiguous gets. the convertor represents a
|
||||
* contiguous block of memory */
|
||||
opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr);
|
||||
|
||||
local_handle = sendreq->req_rdma[0].btl_reg;
|
||||
|
||||
/* allocate an rdma fragment to keep track of the request size for use in the fin message */
|
||||
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
|
||||
if (OPAL_UNLIKELY(NULL == frag)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
src->des_cbfunc = mca_pml_ob1_rget_completion;
|
||||
src->des_cbdata = sendreq;
|
||||
|
||||
sendreq->src_des = src;
|
||||
/* fill in necessary fragment data */
|
||||
frag->rdma_req = sendreq;
|
||||
frag->rdma_bml = bml_btl;
|
||||
frag->rdma_length = size;
|
||||
frag->cbfunc = mca_pml_ob1_rget_completion;
|
||||
/* do not store the local handle in the fragment. it will be released by mca_pml_ob1_free_rdma_resources */
|
||||
|
||||
seg_size = bml_btl->btl->btl_seg_size * src->des_local_count;
|
||||
/* save the fragment for get->put fallback */
|
||||
sendreq->rdma_frag = frag;
|
||||
|
||||
reg_size = bml_btl->btl->btl_registration_handle_size;
|
||||
|
||||
/* allocate space for get hdr + segment list */
|
||||
mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof (*hdr) + seg_size,
|
||||
mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof (*hdr) + reg_size,
|
||||
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
|
||||
MCA_BTL_DES_FLAGS_SIGNAL);
|
||||
if( OPAL_UNLIKELY(NULL == des) ) {
|
||||
/* NTH: no need to reset the converter here. it will be reset before it is retried */
|
||||
mca_bml_base_free(bml_btl, src);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* build match header */
|
||||
hdr = (mca_pml_ob1_rget_hdr_t *) des->des_local->seg_addr.pval;
|
||||
|
||||
hdr->hdr_rndv.hdr_match.hdr_common.hdr_flags = MCA_PML_OB1_HDR_FLAGS_CONTIG|MCA_PML_OB1_HDR_FLAGS_PIN;
|
||||
hdr->hdr_rndv.hdr_match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RGET;
|
||||
hdr->hdr_rndv.hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
|
||||
hdr->hdr_rndv.hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
|
||||
hdr->hdr_rndv.hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
|
||||
hdr->hdr_rndv.hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
|
||||
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
|
||||
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
|
||||
hdr->hdr_des.pval = src;
|
||||
hdr->hdr_seg_cnt = src->des_local_count;
|
||||
hdr = (mca_pml_ob1_rget_hdr_t *) des->des_segments->seg_addr.pval;
|
||||
/* TODO -- Add support for multiple segments for get */
|
||||
mca_pml_ob1_rget_hdr_prepare (hdr, MCA_PML_OB1_HDR_FLAGS_CONTIG | MCA_PML_OB1_HDR_FLAGS_PIN,
|
||||
sendreq->req_send.req_base.req_comm->c_contextid,
|
||||
sendreq->req_send.req_base.req_comm->c_my_rank,
|
||||
sendreq->req_send.req_base.req_tag,
|
||||
(uint16_t)sendreq->req_send.req_base.req_sequence,
|
||||
sendreq->req_send.req_bytes_packed, sendreq,
|
||||
frag, data_ptr, local_handle, reg_size);
|
||||
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RGET, sendreq->req_send.req_base.req_proc);
|
||||
|
||||
/* copy segment data */
|
||||
memcpy (hdr + 1, src->des_local, seg_size);
|
||||
|
||||
des->des_cbfunc = mca_pml_ob1_send_ctl_completion;
|
||||
des->des_cbdata = sendreq;
|
||||
|
||||
@ -765,12 +737,6 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,
|
||||
rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RGET);
|
||||
if (OPAL_UNLIKELY(rc < 0)) {
|
||||
mca_bml_base_free(bml_btl, des);
|
||||
|
||||
if (sendreq->src_des) {
|
||||
mca_bml_base_free (bml_btl, sendreq->src_des);
|
||||
sendreq->src_des = NULL;
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -808,7 +774,6 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq,
|
||||
sendreq->req_send.req_base.req_datatype);
|
||||
);
|
||||
mca_bml_base_prepare_src( bml_btl,
|
||||
NULL,
|
||||
&sendreq->req_send.req_base.req_convertor,
|
||||
MCA_BTL_NO_ORDER,
|
||||
sizeof(mca_pml_ob1_rendezvous_hdr_t),
|
||||
@ -827,21 +792,19 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq,
|
||||
if( OPAL_UNLIKELY(NULL == des) ) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
segment = des->des_local;
|
||||
segment = des->des_segments;
|
||||
|
||||
/* build hdr */
|
||||
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
|
||||
hdr->hdr_common.hdr_flags = flags | MCA_PML_OB1_HDR_FLAGS_SIGNAL;
|
||||
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV;
|
||||
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
|
||||
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
|
||||
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
|
||||
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
|
||||
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
|
||||
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
|
||||
mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RNDV, flags |
|
||||
MCA_PML_OB1_HDR_FLAGS_SIGNAL,
|
||||
sendreq->req_send.req_base.req_comm->c_contextid,
|
||||
sendreq->req_send.req_base.req_comm->c_my_rank,
|
||||
sendreq->req_send.req_base.req_tag,
|
||||
(uint16_t)sendreq->req_send.req_base.req_sequence,
|
||||
sendreq->req_send.req_bytes_packed, sendreq);
|
||||
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV,
|
||||
sendreq->req_send.req_base.req_proc);
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc);
|
||||
|
||||
/* first fragment of a long message */
|
||||
des->des_cbdata = sendreq;
|
||||
@ -1022,13 +985,10 @@ cannot_pack:
|
||||
sendreq->req_send.req_base.req_count,
|
||||
sendreq->req_send.req_base.req_datatype);
|
||||
);
|
||||
mca_bml_base_prepare_src(bml_btl, NULL,
|
||||
&sendreq->req_send.req_base.req_convertor,
|
||||
MCA_BTL_NO_ORDER,
|
||||
sizeof(mca_pml_ob1_frag_hdr_t),
|
||||
mca_bml_base_prepare_src(bml_btl, &sendreq->req_send.req_base.req_convertor,
|
||||
MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_frag_hdr_t),
|
||||
&size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK |
|
||||
MCA_BTL_DES_FLAGS_SIGNAL,
|
||||
&des);
|
||||
MCA_BTL_DES_FLAGS_SIGNAL, &des);
|
||||
MEMCHECKER(
|
||||
memchecker_call(&opal_memchecker_base_mem_noaccess,
|
||||
sendreq->req_send.req_base.req_addr,
|
||||
@ -1051,12 +1011,9 @@ cannot_pack:
|
||||
des->des_cbdata = sendreq;
|
||||
|
||||
/* setup header */
|
||||
hdr = (mca_pml_ob1_frag_hdr_t*)des->des_local->seg_addr.pval;
|
||||
hdr->hdr_common.hdr_flags = 0;
|
||||
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FRAG;
|
||||
hdr->hdr_frag_offset = range->range_send_offset;
|
||||
hdr->hdr_src_req.pval = sendreq;
|
||||
hdr->hdr_dst_req = sendreq->req_recv;
|
||||
hdr = (mca_pml_ob1_frag_hdr_t*)des->des_segments->seg_addr.pval;
|
||||
mca_pml_ob1_frag_hdr_prepare (hdr, 0, range->range_send_offset, sendreq,
|
||||
sendreq->req_recv.lval);
|
||||
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FRAG,
|
||||
sendreq->req_send.req_base.req_proc);
|
||||
@ -1113,38 +1070,66 @@ cannot_pack:
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* A put fragment could not be started. Queue the fragment to be retried later or
|
||||
* fall back on send/recv.
|
||||
*/
|
||||
static void mca_pml_ob1_send_request_put_frag_failed (mca_pml_ob1_rdma_frag_t *frag, int rc)
|
||||
{
|
||||
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
|
||||
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
|
||||
|
||||
if (++frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||||
/* queue the frag for later if there was a resource error */
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
|
||||
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
|
||||
} else {
|
||||
/* tell receiver to deregister memory */
|
||||
mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
|
||||
frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER,
|
||||
OPAL_ERR_TEMP_OUT_OF_RESOURCE);
|
||||
|
||||
/* send fragment by copy in/out */
|
||||
mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset,
|
||||
frag->rdma_length);
|
||||
/* if a pointer to a receive request is not set it means that
|
||||
* ACK was not yet received. Don't schedule sends before ACK */
|
||||
if (NULL != sendreq->req_recv.pval)
|
||||
mca_pml_ob1_send_request_schedule (sendreq);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* An RDMA put operation has completed:
|
||||
* (1) Update request status and if required set completed
|
||||
* (2) Send FIN control message to the destination
|
||||
* (2) Send FIN control message to the destination
|
||||
*/
|
||||
|
||||
static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* ep,
|
||||
struct mca_btl_base_descriptor_t* des,
|
||||
int status )
|
||||
static void mca_pml_ob1_put_completion (mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep,
|
||||
void *local_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
void *context, void *cbdata, int status)
|
||||
{
|
||||
mca_pml_ob1_rdma_frag_t* frag = (mca_pml_ob1_rdma_frag_t*)des->des_cbdata;
|
||||
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)frag->rdma_req;
|
||||
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
|
||||
mca_pml_ob1_rdma_frag_t *frag = (mca_pml_ob1_rdma_frag_t *) cbdata;
|
||||
mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
|
||||
mca_bml_base_btl_t *bml_btl = (mca_bml_base_btl_t *) context;
|
||||
|
||||
/* check completion status */
|
||||
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
|
||||
/* TSW - FIX */
|
||||
OMPI_ERROR_LOG(status);
|
||||
ompi_rte_abort(-1, NULL);
|
||||
if( OPAL_UNLIKELY(OMPI_SUCCESS == status) ) {
|
||||
/* TODO -- readd ordering */
|
||||
mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
|
||||
frag->rdma_hdr.hdr_rdma.hdr_frag, frag->rdma_length,
|
||||
0, 0);
|
||||
|
||||
/* check for request completion */
|
||||
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length);
|
||||
|
||||
send_request_pml_complete_check(sendreq);
|
||||
} else {
|
||||
/* try to fall back on send/recv */
|
||||
mca_pml_ob1_send_request_put_frag_failed (frag, status);
|
||||
}
|
||||
|
||||
mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
|
||||
bml_btl,
|
||||
frag->rdma_hdr.hdr_rdma.hdr_des,
|
||||
des->order, 0);
|
||||
|
||||
/* check for request completion */
|
||||
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length);
|
||||
|
||||
send_request_pml_complete_check(sendreq);
|
||||
|
||||
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
|
||||
|
||||
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
|
||||
@ -1152,81 +1137,45 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
|
||||
|
||||
int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag )
|
||||
{
|
||||
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)frag->rdma_req;
|
||||
mca_mpool_base_registration_t *reg = NULL;
|
||||
mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
|
||||
mca_btl_base_registration_handle_t *local_handle = NULL;
|
||||
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
|
||||
mca_btl_base_descriptor_t *des;
|
||||
size_t save_size = frag->rdma_length;
|
||||
int rc;
|
||||
|
||||
if (OPAL_LIKELY(NULL == sendreq->src_des)) {
|
||||
/* setup descriptor */
|
||||
mca_bml_base_prepare_src( bml_btl,
|
||||
reg,
|
||||
&frag->convertor,
|
||||
MCA_BTL_NO_ORDER,
|
||||
0,
|
||||
&frag->rdma_length,
|
||||
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
|
||||
MCA_BTL_DES_FLAGS_PUT,
|
||||
&des );
|
||||
|
||||
if( OPAL_UNLIKELY(NULL == des) ) {
|
||||
if(frag->retries < mca_pml_ob1.rdma_retries_limit) {
|
||||
size_t offset = (size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset;
|
||||
frag->rdma_length = save_size;
|
||||
opal_convertor_set_position(&frag->convertor, &offset);
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
|
||||
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
|
||||
} else {
|
||||
mca_pml_ob1_send_request_t *sendreq =
|
||||
(mca_pml_ob1_send_request_t*)frag->rdma_req;
|
||||
if (bml_btl->btl->btl_register_mem && NULL == frag->local_handle) {
|
||||
/* Check if the segment is already registered */
|
||||
for (size_t i = 0 ; i < sendreq->req_rdma_cnt ; ++i) {
|
||||
if (sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
|
||||
/* do not copy the handle to the fragment to avoid deregistring it twice */
|
||||
local_handle = sendreq->req_rdma[i].btl_reg;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* tell receiver to unregister memory */
|
||||
mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
|
||||
bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des,
|
||||
MCA_BTL_NO_ORDER, 1);
|
||||
if (NULL == frag->local_handle) {
|
||||
/* Not already registered. Register the region with the BTL. */
|
||||
mca_bml_base_register_mem (bml_btl, frag->local_address, frag->rdma_length, 0,
|
||||
&frag->local_handle);
|
||||
|
||||
/* send fragment by copy in/out */
|
||||
mca_pml_ob1_send_request_copy_in_out(sendreq,
|
||||
frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, frag->rdma_length);
|
||||
/* if a pointer to a receive request is not set it means that
|
||||
* ACK was not yet received. Don't schedule sends before ACK */
|
||||
if(NULL != sendreq->req_recv.pval)
|
||||
mca_pml_ob1_send_request_schedule(sendreq);
|
||||
if (OPAL_UNLIKELY(NULL == frag->local_handle)) {
|
||||
mca_pml_ob1_send_request_put_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE);
|
||||
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
local_handle = frag->local_handle;
|
||||
}
|
||||
} else {
|
||||
/* already have a source descriptor */
|
||||
des = sendreq->src_des;
|
||||
sendreq->src_des = NULL;
|
||||
}
|
||||
|
||||
des->des_remote = (mca_btl_base_segment_t *) frag->rdma_segs;
|
||||
des->des_remote_count = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt;
|
||||
des->des_cbfunc = mca_pml_ob1_put_completion;
|
||||
des->des_cbdata = frag;
|
||||
|
||||
PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
|
||||
&(((mca_pml_ob1_send_request_t*)frag->rdma_req)->req_send.req_base), save_size, PERUSE_SEND );
|
||||
|
||||
rc = mca_bml_base_put(bml_btl, des);
|
||||
rc = mca_bml_base_put (bml_btl, frag->local_address, frag->remote_address, local_handle,
|
||||
(mca_btl_base_registration_handle_t *) frag->remote_handle, frag->rdma_length,
|
||||
0, MCA_BTL_NO_ORDER, mca_pml_ob1_put_completion, frag);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
|
||||
mca_bml_base_free(bml_btl, des);
|
||||
frag->rdma_length = save_size;
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
|
||||
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
} else {
|
||||
/* TSW - FIX */
|
||||
OMPI_ERROR_LOG(rc);
|
||||
ompi_rte_abort(-1, NULL);
|
||||
}
|
||||
mca_pml_ob1_send_request_put_frag_failed (frag, rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
@ -1240,12 +1189,11 @@ int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag )
|
||||
*/
|
||||
|
||||
void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq,
|
||||
mca_btl_base_module_t* btl,
|
||||
mca_btl_base_module_t* btl,
|
||||
mca_pml_ob1_rdma_hdr_t* hdr )
|
||||
{
|
||||
mca_bml_base_endpoint_t *bml_endpoint = sendreq->req_endpoint;
|
||||
mca_pml_ob1_rdma_frag_t* frag;
|
||||
size_t i, size = 0;
|
||||
|
||||
if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_TYPE_ACK) {
|
||||
OPAL_THREAD_ADD32(&sendreq->req_state, -1);
|
||||
@ -1253,61 +1201,36 @@ void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq,
|
||||
|
||||
sendreq->req_recv.pval = hdr->hdr_recv_req.pval;
|
||||
|
||||
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
|
||||
if (NULL == sendreq->rdma_frag) {
|
||||
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
|
||||
|
||||
if( OPAL_UNLIKELY(NULL == frag) ) {
|
||||
/* TSW - FIX */
|
||||
OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
|
||||
ompi_rte_abort(-1, NULL);
|
||||
}
|
||||
|
||||
assert (btl->btl_seg_size * hdr->hdr_seg_cnt <= sizeof (frag->rdma_segs));
|
||||
|
||||
/* setup fragment */
|
||||
memcpy (frag->rdma_segs, hdr + 1, btl->btl_seg_size * hdr->hdr_seg_cnt);
|
||||
|
||||
for( i = 0; i < hdr->hdr_seg_cnt; i++ ) {
|
||||
mca_btl_base_segment_t *seg = (mca_btl_base_segment_t *) ((uintptr_t)(frag->rdma_segs) + i * btl->btl_seg_size);
|
||||
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
if ((sendreq->req_send.req_base.req_proc->super.proc_arch & OPAL_ARCH_ISBIGENDIAN) !=
|
||||
(ompi_proc_local()->super.proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
|
||||
size += opal_swap_bytes4(seg->seg_len);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
size += seg->seg_len;
|
||||
if( OPAL_UNLIKELY(NULL == frag) ) {
|
||||
/* TSW - FIX */
|
||||
OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
|
||||
ompi_rte_abort(-1, NULL);
|
||||
}
|
||||
} else {
|
||||
/* rget fallback on put */
|
||||
frag = sendreq->rdma_frag;
|
||||
sendreq->rdma_frag = NULL;
|
||||
sendreq->req_state = 0;
|
||||
}
|
||||
|
||||
/* copy registration data */
|
||||
memcpy (frag->remote_handle, hdr + 1, btl->btl_registration_handle_size);
|
||||
|
||||
frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
|
||||
frag->rdma_hdr.hdr_rdma = *hdr;
|
||||
frag->rdma_req = sendreq;
|
||||
frag->rdma_ep = bml_endpoint;
|
||||
frag->rdma_length = size;
|
||||
frag->rdma_length = hdr->hdr_dst_size;
|
||||
frag->rdma_state = MCA_PML_OB1_RDMA_PUT;
|
||||
frag->reg = NULL;
|
||||
frag->remote_address = hdr->hdr_dst_ptr;
|
||||
frag->retries = 0;
|
||||
|
||||
if (OPAL_UNLIKELY(NULL != sendreq->src_des)) {
|
||||
/* get fallback path */
|
||||
sendreq->req_state = 0;
|
||||
}
|
||||
|
||||
/* lookup the corresponding registration */
|
||||
for(i=0; i<sendreq->req_rdma_cnt; i++) {
|
||||
if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
|
||||
frag->reg = sendreq->req_rdma[i].btl_reg;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* RDMA writes may proceed in parallel to send and to each other, so
|
||||
* create clone of the convertor for each RDMA fragment
|
||||
*/
|
||||
size = hdr->hdr_rdma_offset;
|
||||
opal_convertor_clone_with_position(&sendreq->req_send.req_base.req_convertor,
|
||||
&frag->convertor, 0, &size);
|
||||
/* Get the address of the current offset. Note: at this time ob1 CAN NOT handle
|
||||
* non-contiguous RDMA. If that changes this code will be wrong. */
|
||||
opal_convertor_get_offset_pointer (&sendreq->req_send.req_base.req_convertor,
|
||||
hdr->hdr_rdma_offset, &frag->local_address);
|
||||
|
||||
mca_pml_ob1_send_request_put_frag(frag);
|
||||
}
|
||||
|
@ -12,7 +12,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -54,7 +54,7 @@ struct mca_pml_ob1_send_request_t {
|
||||
mca_pml_ob1_send_pending_t req_pending;
|
||||
opal_mutex_t req_send_range_lock;
|
||||
opal_list_t req_send_ranges;
|
||||
mca_btl_base_descriptor_t *src_des;
|
||||
mca_pml_ob1_rdma_frag_t *rdma_frag;
|
||||
mca_pml_ob1_com_btl_t req_rdma[1];
|
||||
};
|
||||
typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t;
|
||||
@ -124,10 +124,9 @@ get_request_from_send_pending(mca_pml_ob1_send_pending_t *type)
|
||||
ompi_free_list_item_t* item; \
|
||||
\
|
||||
if( OPAL_LIKELY(NULL != proc) ) { \
|
||||
OMPI_FREE_LIST_WAIT_MT(&mca_pml_base_send_requests, item); \
|
||||
OMPI_FREE_LIST_WAIT_MT(&mca_pml_base_send_requests, item); \
|
||||
sendreq = (mca_pml_ob1_send_request_t*)item; \
|
||||
sendreq->req_send.req_base.req_proc = proc; \
|
||||
sendreq->src_des = NULL; \
|
||||
} \
|
||||
}
|
||||
|
||||
@ -163,15 +162,18 @@ get_request_from_send_pending(mca_pml_ob1_send_pending_t *type)
|
||||
assert( 0 == _position ); \
|
||||
}
|
||||
|
||||
static inline void mca_pml_ob1_free_rdma_resources(mca_pml_ob1_send_request_t* sendreq)
|
||||
static inline void mca_pml_ob1_free_rdma_resources (mca_pml_ob1_send_request_t* sendreq)
|
||||
{
|
||||
size_t r;
|
||||
|
||||
/* return mpool resources */
|
||||
for(r = 0; r < sendreq->req_rdma_cnt; r++) {
|
||||
mca_mpool_base_registration_t* reg = sendreq->req_rdma[r].btl_reg;
|
||||
if( NULL != reg && reg->mpool != NULL ) {
|
||||
reg->mpool->mpool_deregister(reg->mpool, reg);
|
||||
struct mca_btl_base_registration_handle_t *handle = sendreq->req_rdma[r].btl_reg;
|
||||
mca_bml_base_btl_t *bml_btl = sendreq->req_rdma[r].bml_btl;
|
||||
|
||||
if (NULL != handle) {
|
||||
mca_bml_base_deregister_mem (bml_btl, handle);
|
||||
sendreq->req_rdma[r].btl_reg = NULL;
|
||||
}
|
||||
}
|
||||
sendreq->req_rdma_cnt = 0;
|
||||
@ -218,10 +220,14 @@ do {
|
||||
|
||||
#define MCA_PML_OB1_SEND_REQUEST_RETURN(sendreq) \
|
||||
do { \
|
||||
/* Let the base handle the reference counts */ \
|
||||
MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send)); \
|
||||
OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_send_requests, \
|
||||
(ompi_free_list_item_t*)sendreq); \
|
||||
/* Let the base handle the reference counts */ \
|
||||
MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send)); \
|
||||
if (sendreq->rdma_frag) { \
|
||||
MCA_PML_OB1_RDMA_FRAG_RETURN (sendreq->rdma_frag); \
|
||||
sendreq->rdma_frag = NULL; \
|
||||
} \
|
||||
OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_send_requests, \
|
||||
(ompi_free_list_item_t*)sendreq); \
|
||||
} while(0)
|
||||
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -219,6 +219,14 @@ static inline void opal_convertor_get_current_pointer( const opal_convertor_t* p
|
||||
*position = (void*)base;
|
||||
}
|
||||
|
||||
/**
 * Compute the address located `offset` bytes into the convertor's buffer.
 *
 * The result is pConv->pBaseBuf + offset + pConv->pDesc->true_lb. The
 * convertor itself is not modified.
 *
 * @param pConv    (IN)  convertor supplying the base buffer and the datatype
 *                       description; both pConv and pConv->pDesc are
 *                       dereferenced without a NULL check
 * @param offset   (IN)  byte offset added to the base address
 * @param position (OUT) receives the computed address
 *
 * NOTE(review): the offset is applied as plain pointer arithmetic, so the
 * result is presumably only meaningful for contiguous data -- confirm
 * against the callers.
 */
static inline void opal_convertor_get_offset_pointer( const opal_convertor_t* pConv,
|
||||
size_t offset, void** position )
|
||||
{
|
||||
unsigned char* base = pConv->pBaseBuf + offset + pConv->pDesc->true_lb;
|
||||
*position = (void*)base;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
*
|
||||
*/
|
||||
|
@ -36,10 +36,8 @@ mca_btl_active_message_callback_t mca_btl_base_active_message_trigger[MCA_BTL_TA
|
||||
|
||||
static void mca_btl_base_descriptor_constructor(mca_btl_base_descriptor_t* des)
|
||||
{
|
||||
des->des_local = NULL;
|
||||
des->des_local_count = 0;
|
||||
des->des_remote = NULL;
|
||||
des->des_remote_count = 0;
|
||||
des->des_segments = NULL;
|
||||
des->des_segment_count = 0;
|
||||
des->des_cbfunc = NULL;
|
||||
des->des_cbdata = NULL;
|
||||
des->des_flags = 0;
|
||||
|
@ -1,3 +1,4 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -45,13 +46,15 @@ int mca_btl_base_param_register(mca_base_component_t *version,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&module->btl_exclusivity);
|
||||
|
||||
asprintf(&msg, "BTL bit flags (general flags: SEND=%d, PUT=%d, GET=%d, SEND_INPLACE=%d, RDMA_MATCHED=%d, HETEROGENEOUS_RDMA=%d; flags only used by the \"dr\" PML (ignored by others): ACK=%d, CHECKSUM=%d, RDMA_COMPLETION=%d; flags only used by the \"bfo\" PML (ignored by others): FAILOVER_SUPPORT=%d)",
|
||||
asprintf(&msg, "BTL bit flags (general flags: SEND=%d, PUT=%d, GET=%d, SEND_INPLACE=%d, HETEROGENEOUS_RDMA=%d, "
|
||||
"ATOMIC_OPS=%d; flags only used by the \"dr\" PML (ignored by others): ACK=%d, CHECKSUM=%d, "
|
||||
"RDMA_COMPLETION=%d; flags only used by the \"bfo\" PML (ignored by others): FAILOVER_SUPPORT=%d)",
|
||||
MCA_BTL_FLAGS_SEND,
|
||||
MCA_BTL_FLAGS_PUT,
|
||||
MCA_BTL_FLAGS_GET,
|
||||
MCA_BTL_FLAGS_SEND_INPLACE,
|
||||
MCA_BTL_FLAGS_RDMA_MATCHED,
|
||||
MCA_BTL_FLAGS_HETEROGENEOUS_RDMA,
|
||||
MCA_BTL_FLAGS_ATOMIC_OPS,
|
||||
MCA_BTL_FLAGS_NEED_ACK,
|
||||
MCA_BTL_FLAGS_NEED_CSUM,
|
||||
MCA_BTL_FLAGS_RDMA_COMPLETION,
|
||||
@ -63,6 +66,14 @@ int mca_btl_base_param_register(mca_base_component_t *version,
|
||||
&module->btl_flags);
|
||||
free(msg);
|
||||
|
||||
asprintf (&msg, "BTL atomic bit flags (general flags: ADD=%d, AND=%d, OR=%d, XOR=%d",
|
||||
MCA_BTL_ATOMIC_SUPPORTS_ADD, MCA_BTL_ATOMIC_SUPPORTS_AND, MCA_BTL_ATOMIC_SUPPORTS_OR,
|
||||
MCA_BTL_ATOMIC_SUPPORTS_XOR);
|
||||
(void) mca_base_component_var_register(version, "atomic_flags", msg, MCA_BASE_VAR_TYPE_UNSIGNED_INT,
|
||||
NULL, 0, MCA_BASE_VAR_FLAG_DEFAULT_ONLY, OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT, &module->btl_atomic_flags);
|
||||
free(msg);
|
||||
|
||||
(void) mca_base_component_var_register(version, "rndv_eager_limit", "Size (in bytes, including header) of \"phase 1\" fragment sent for all large messages (must be >= 0 and <= eager_limit)",
|
||||
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_4,
|
||||
@ -74,6 +85,39 @@ int mca_btl_base_param_register(mca_base_component_t *version,
|
||||
OPAL_INFO_LVL_4,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&module->btl_eager_limit);
|
||||
|
||||
if ((module->btl_flags & MCA_BTL_FLAGS_GET) && module->btl_get) {
|
||||
if (0 == module->btl_get_limit) {
|
||||
module->btl_get_limit = SIZE_MAX;
|
||||
}
|
||||
|
||||
(void) mca_base_component_var_register(version, "get_limit", "Maximum size (in bytes) for btl get",
|
||||
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_4,
|
||||
MCA_BASE_VAR_SCOPE_READONLY, &module->btl_get_limit);
|
||||
|
||||
/* Allow the user to set the alignment. The BTL should double-check the alignment in its open
|
||||
* function. */
|
||||
(void) mca_base_component_var_register(version, "get_alignment", "Alignment required for btl get",
|
||||
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_6,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT, &module->btl_get_alignment);
|
||||
}
|
||||
|
||||
if ((module->btl_flags & MCA_BTL_FLAGS_PUT) && module->btl_put) {
|
||||
if (0 == module->btl_put_limit) {
|
||||
module->btl_put_limit = SIZE_MAX;
|
||||
}
|
||||
(void) mca_base_component_var_register(version, "put_limit", "Maximum size (in bytes) for btl put",
|
||||
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_4,
|
||||
MCA_BASE_VAR_SCOPE_READONLY, &module->btl_put_limit);
|
||||
|
||||
/* Allow the user to set the alignment. The BTL should double-check the alignment in its open
|
||||
* function. */
|
||||
(void) mca_base_component_var_register(version, "put_alignment", "Alignment required for btl put",
|
||||
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_6,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT, &module->btl_put_alignment);
|
||||
}
|
||||
|
||||
|
||||
#if OPAL_CUDA_GDR_SUPPORT
|
||||
/* If no CUDA RDMA support, zero them out */
|
||||
if (!(MCA_BTL_FLAGS_CUDA_GET & module->btl_flags)) {
|
||||
@ -149,5 +193,17 @@ int mca_btl_base_param_verify(mca_btl_base_module_t *module)
|
||||
module->btl_flags &= ~MCA_BTL_FLAGS_GET;
|
||||
}
|
||||
|
||||
if (0 == module->btl_atomic_flags) {
|
||||
module->btl_flags &= ~MCA_BTL_FLAGS_ATOMIC_OPS;
|
||||
}
|
||||
|
||||
if (0 == module->btl_get_limit) {
|
||||
module->btl_get_limit = SIZE_MAX;
|
||||
}
|
||||
|
||||
if (0 == module->btl_put_limit) {
|
||||
module->btl_put_limit = SIZE_MAX;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
@ -6,18 +6,19 @@
|
||||
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
@ -75,8 +76,8 @@
|
||||
* TCP 0 Selected based on network reachability
|
||||
* TCP 0 Selected based on network reachability
|
||||
*
|
||||
* When mca_btl_base_add_proc_fn_t() is called on a BTL module, the BTL
|
||||
* will populate an OUT variable with mca_btl_base_endpoint_t pointers.
|
||||
* When mca_btl_base_add_proc_fn_t() is called on a BTL module, the BTL
|
||||
* will populate an OUT variable with mca_btl_base_endpoint_t pointers.
|
||||
* Each pointer is treated as an opaque handle by the upper layer and is
|
||||
* returned to the BTL on subsequent data transfer calls to the
|
||||
* corresponding destination process. The actual contents of the
|
||||
@ -132,8 +133,25 @@ struct mca_btl_base_module_t;
|
||||
struct mca_btl_base_endpoint_t;
|
||||
struct mca_btl_base_descriptor_t;
|
||||
struct mca_mpool_base_resources_t;
|
||||
struct opal_proc_t;
|
||||
struct opal_proc_t;
|
||||
|
||||
/**
|
||||
* Opaque registration handle for executing RDMA and atomic
|
||||
* operations on a memory region.
|
||||
*
|
||||
* The data inside this handle is appropriate for passing
|
||||
* to remote peers to execute RDMA and atomic operations. The
|
||||
* size needed to send the registration handle can be
|
||||
* obtained from the btl via the btl_registration_handle_size
|
||||
* member. If this size is 0 then no registration data is
|
||||
* needed to execute RDMA or atomic operations.
|
||||
*/
|
||||
struct mca_btl_base_registration_handle_t;
|
||||
typedef struct mca_btl_base_registration_handle_t mca_btl_base_registration_handle_t;
|
||||
|
||||
|
||||
/* Wildcard endpoint for use in the register_mem function */
|
||||
#define MCA_BTL_ENDPOINT_ANY (struct mca_btl_base_endpoint_t *) -1
|
||||
|
||||
/* send/recv operations require tag matching */
|
||||
typedef uint8_t mca_btl_base_tag_t;
|
||||
@ -173,6 +191,9 @@ typedef uint8_t mca_btl_base_tag_t;
|
||||
#define MCA_BTL_FLAGS_SEND 0x0001
|
||||
#define MCA_BTL_FLAGS_PUT 0x0002
|
||||
#define MCA_BTL_FLAGS_GET 0x0004
|
||||
/* btls that set the MCA_BTL_FLAGS_RDMA will always get added to the BML
|
||||
* rdma_btls list. This allows the updated one-sided component to
|
||||
* use btls that are not otherwise used for send/recv. */
|
||||
#define MCA_BTL_FLAGS_RDMA (MCA_BTL_FLAGS_GET|MCA_BTL_FLAGS_PUT)
|
||||
|
||||
/* btl can send directly from user buffer w/out registration */
|
||||
@ -182,8 +203,7 @@ typedef uint8_t mca_btl_base_tag_t;
|
||||
#define MCA_BTL_FLAGS_NEED_ACK 0x0010
|
||||
#define MCA_BTL_FLAGS_NEED_CSUM 0x0020
|
||||
|
||||
/** RDMA put/get calls must have a matching prepare_{src,dst} call
|
||||
on the target with the same base (and possibly bound). */
|
||||
/** deprecated (BTL 3.0) */
|
||||
#define MCA_BTL_FLAGS_RDMA_MATCHED 0x0040
|
||||
|
||||
/* btl needs local rdma completion */
|
||||
@ -209,6 +229,12 @@ typedef uint8_t mca_btl_base_tag_t;
|
||||
*/
|
||||
#define MCA_BTL_FLAGS_SIGNALED 0x4000
|
||||
|
||||
|
||||
/** The BTL supports network atomic operations */
|
||||
#define MCA_BTL_FLAGS_ATOMIC_OPS 0x08000
|
||||
/** The BTL supports fetching network atomic operations */
|
||||
#define MCA_BTL_FLAGS_ATOMIC_FOPS 0x10000
|
||||
|
||||
/* Default exclusivity levels */
|
||||
#define MCA_BTL_EXCLUSIVITY_HIGH (64*1024) /* internal loopback */
|
||||
#define MCA_BTL_EXCLUSIVITY_DEFAULT 1024 /* GM/IB/etc. */
|
||||
@ -219,11 +245,67 @@ typedef uint8_t mca_btl_base_tag_t;
|
||||
#define MCA_BTL_ERROR_FLAGS_NONFATAL 0x2
|
||||
#define MCA_BTL_ERROR_FLAGS_ADD_CUDA_IPC 0x4
|
||||
|
||||
/**
 * Registration flags.
 *
 * Each of the four access flags (LOCAL_WRITE, REMOTE_READ, REMOTE_WRITE,
 * REMOTE_ATOMIC) occupies its own bit, and MCA_BTL_REG_FLAG_ACCESS_ANY
 * (0xf) is the bitwise OR of all four. MCA_BTL_REG_FLAG_CUDA_GPU_MEM is
 * only defined when OPAL_CUDA_GDR_SUPPORT is non-zero.
 *
 * NOTE(review): presumably these are OR-ed together and passed to the BTL
 * memory registration function -- confirm against its declaration.
 */
|
||||
enum {
|
||||
/** Allow local write on the registered region. If a region is registered
|
||||
* with this flag the registration can be used as the local handle for a
|
||||
* btl_get operation. */
|
||||
MCA_BTL_REG_FLAG_LOCAL_WRITE = 0x00000001,
|
||||
/** Allow remote read on the registered region. If a region is registered
|
||||
* with this flag the registration can be used as the remote handle for a
|
||||
* btl_get operation. */
|
||||
MCA_BTL_REG_FLAG_REMOTE_READ = 0x00000002,
|
||||
/** Allow remote write on the registered region. If a region is registered
|
||||
* with this flag the registration can be used as the remote handle for a
|
||||
* btl_put operation. */
|
||||
MCA_BTL_REG_FLAG_REMOTE_WRITE = 0x00000004,
|
||||
/** Allow remote atomic operations on the registered region. If a region is
|
||||
* registered with this flag the registration can be used as the remote
|
||||
* handle for a btl_atomic_op or btl_atomic_fop operation. */
|
||||
MCA_BTL_REG_FLAG_REMOTE_ATOMIC = 0x00000008,
|
||||
/** Allow any btl operation on the registered region. If a region is registered
|
||||
* with this flag the registration can be used as the local or remote handle for
|
||||
* any btl operation. */
|
||||
MCA_BTL_REG_FLAG_ACCESS_ANY = 0x0000000f,
|
||||
#if OPAL_CUDA_GDR_SUPPORT
|
||||
/** Region is in GPU memory */
|
||||
MCA_BTL_REG_FLAG_CUDA_GPU_MEM = 0x00010000,
|
||||
#endif
|
||||
};
|
||||
|
||||
/**
 * Supported atomic operations.
 *
 * Capability bits: every enumerator below is a distinct single bit, so
 * several capabilities can be combined with bitwise OR and tested with
 * bitwise AND.
 *
 * NOTE(review): presumably a BTL advertises these through its
 * btl_atomic_flags member -- confirm against the module structure.
 */
|
||||
enum {
|
||||
/** The btl supports atomic add */
|
||||
MCA_BTL_ATOMIC_SUPPORTS_ADD = 0x00000001,
|
||||
/** The btl supports atomic bitwise and */
|
||||
MCA_BTL_ATOMIC_SUPPORTS_AND = 0x00000200,
|
||||
/** The btl supports atomic bitwise or */
|
||||
MCA_BTL_ATOMIC_SUPPORTS_OR = 0x00000400,
|
||||
/** The btl supports atomic bitwise exclusive or */
|
||||
MCA_BTL_ATOMIC_SUPPORTS_XOR = 0x00000800,
|
||||
/** The btl supports atomic compare-and-swap */
|
||||
MCA_BTL_ATOMIC_SUPPORTS_CSWAP = 0x10000000,
|
||||
/** The btl guarantees global atomicity (can mix btl atomics with cpu atomics) */
|
||||
MCA_BTL_ATOMIC_SUPPORTS_GLOB = 0x20000000,
|
||||
};
|
||||
|
||||
/**
 * Selector naming a single atomic read-modify-write operation.
 *
 * Each value describes how the word at remote_address is combined with the
 * supplied operand (see the per-value comments). The numeric values are not
 * contiguous (0x01, 0x11, 0x12, 0x14), so they must not be used directly as
 * array indices.
 *
 * NOTE(review): presumably this is the operation argument of the
 * btl_atomic_op / btl_atomic_fop functions -- confirm against their
 * declarations.
 */
enum mca_btl_base_atomic_op_t {
|
||||
/** Atomic add: (*remote_address) = (*remote_address) + operand */
|
||||
MCA_BTL_ATOMIC_ADD = 0x0001,
|
||||
/** Atomic and: (*remote_address) = (*remote_address) & operand */
|
||||
MCA_BTL_ATOMIC_AND = 0x0011,
|
||||
/** Atomic or: (*remote_address) = (*remote_address) | operand */
|
||||
MCA_BTL_ATOMIC_OR = 0x0012,
|
||||
/** Atomic xor: (*remote_address) = (*remote_address) ^ operand */
|
||||
MCA_BTL_ATOMIC_XOR = 0x0014,
|
||||
};
|
||||
typedef enum mca_btl_base_atomic_op_t mca_btl_base_atomic_op_t;
|
||||
|
||||
/**
|
||||
* Asynchronous callback function on completion of an operation.
|
||||
* Completion Semantics: The descriptor can be reused or returned to the
|
||||
* Completion Semantics: The descriptor can be reused or returned to the
|
||||
* BTL via mca_btl_base_module_free_fn_t. The operation has been queued to
|
||||
* the network device or will otherwise make asynchronous progress without
|
||||
* the network device or will otherwise make asynchronous progress without
|
||||
* subsequent calls to btl_progress.
|
||||
*
|
||||
* @param[IN] module the BTL module
|
||||
@ -237,8 +319,34 @@ typedef void (*mca_btl_base_completion_fn_t)(
|
||||
struct mca_btl_base_descriptor_t* descriptor,
|
||||
int status);
|
||||
|
||||
|
||||
/**
|
||||
* Describes a region/segment of memory that is addressable
|
||||
* Asynchronous callback function on completion of an rdma or atomic operation.
|
||||
* Completion Semantics: The rdma or atomic memory operation has completed
|
||||
* remotely (i.e., it is remotely visible) and the caller is free to deregister
|
||||
* the local_handle or modify the memory in local_address.
|
||||
*
|
||||
* @param[IN] module the BTL module
|
||||
* @param[IN] endpoint the BTL endpoint
|
||||
* @param[IN] local_address local address for the operation (if any)
|
||||
* @param[IN] local_handle local handle associated with the local_address
|
||||
* @param[IN] context callback context supplied to the rdma/atomic operation
|
||||
* @param[IN] cbdata callback data supplied to the rdma/atomic operation
|
||||
* @param[IN] status status of the operation
|
||||
*
|
||||
*/
|
||||
typedef void (*mca_btl_base_rdma_completion_fn_t)(
|
||||
struct mca_btl_base_module_t* module,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
void *local_address,
|
||||
struct mca_btl_base_registration_handle_t *local_handle,
|
||||
void *context,
|
||||
void *cbdata,
|
||||
int status);
|
||||
|
||||
|
||||
/**
|
||||
* Describes a region/segment of memory that is addressable
|
||||
* by a BTL.
|
||||
*
|
||||
* Note: In many cases the alloc and prepare methods of BTLs
|
||||
@ -256,38 +364,37 @@ typedef void (*mca_btl_base_completion_fn_t)(
|
||||
|
||||
struct mca_btl_base_segment_t {
|
||||
/** Address of the memory */
|
||||
opal_ptr_t seg_addr;
|
||||
opal_ptr_t seg_addr;
|
||||
/** Length in bytes */
|
||||
uint64_t seg_len;
|
||||
};
|
||||
typedef struct mca_btl_base_segment_t mca_btl_base_segment_t;
|
||||
|
||||
|
||||
/**
|
||||
* A descriptor that holds the parameters to a send/put/get
|
||||
* operation along w/ a callback routine that is called on
|
||||
* completion of the request.
|
||||
* Note: receive callbacks will store the incoming data segments in
|
||||
* des_local
|
||||
* des_segments
|
||||
*/
|
||||
|
||||
struct mca_btl_base_descriptor_t {
|
||||
ompi_free_list_item_t super;
|
||||
mca_btl_base_segment_t *des_local; /**< local segments */
|
||||
size_t des_local_count; /**< number of local segments */
|
||||
mca_btl_base_segment_t *des_remote; /**< remote segments */
|
||||
size_t des_remote_count; /**< number of destination segments */
|
||||
mca_btl_base_completion_fn_t des_cbfunc; /**< local callback function */
|
||||
ompi_free_list_item_t super;
|
||||
mca_btl_base_segment_t *des_segments; /**< local segments */
|
||||
size_t des_segment_count; /**< number of local segments */
|
||||
mca_btl_base_completion_fn_t des_cbfunc; /**< local callback function */
|
||||
void* des_cbdata; /**< opaque callback data */
|
||||
void* des_context; /**< more opaque callback data */
|
||||
uint32_t des_flags; /**< hints to BTL */
|
||||
/** order value, this is only
|
||||
valid in the local completion callback
|
||||
and may be used in subsequent calls to
|
||||
btl_alloc, btl_prepare_src/dst to request
|
||||
a descriptor that will be ordered w.r.t.
|
||||
/** order value, this is only
|
||||
valid in the local completion callback
|
||||
and may be used in subsequent calls to
|
||||
btl_alloc, btl_prepare_src to request
|
||||
a descriptor that will be ordered w.r.t.
|
||||
this descriptor
|
||||
*/
|
||||
uint8_t order;
|
||||
uint8_t order;
|
||||
};
|
||||
typedef struct mca_btl_base_descriptor_t mca_btl_base_descriptor_t;
|
||||
|
||||
@ -329,13 +436,18 @@ OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_base_descriptor_t);
|
||||
*/
|
||||
#define MCA_BTL_SEG_MAX_SIZE 256
|
||||
|
||||
/*
|
||||
* BTL base header, stores the tag at a minimum
|
||||
*/
|
||||
struct mca_btl_base_header_t{
|
||||
mca_btl_base_tag_t tag;
|
||||
};
|
||||
typedef struct mca_btl_base_header_t mca_btl_base_header_t;
|
||||
/**
|
||||
* Maximum size of a BTL registration handle in bytes
|
||||
*/
|
||||
#define MCA_BTL_REG_HANDLE_MAX_SIZE 256
|
||||
|
||||
/*
|
||||
* BTL base header, stores the tag at a minimum
|
||||
*/
|
||||
struct mca_btl_base_header_t{
|
||||
mca_btl_base_tag_t tag;
|
||||
};
|
||||
typedef struct mca_btl_base_header_t mca_btl_base_header_t;
|
||||
|
||||
#define MCA_BTL_BASE_HEADER_HTON(hdr)
|
||||
#define MCA_BTL_BASE_HEADER_NTOH(hdr)
|
||||
@ -359,19 +471,19 @@ typedef struct mca_btl_base_header_t mca_btl_base_header_t;
|
||||
* indicates whether multiple threads may invoke this component
|
||||
* simultaneously or not.
|
||||
*
|
||||
* @return Array of pointers to BTL modules, or NULL if the transport
|
||||
* @return Array of pointers to BTL modules, or NULL if the transport
|
||||
* is not available.
|
||||
*
|
||||
* During component initialization, the BTL component should discover
|
||||
* the physical devices that are available for the given transport,
|
||||
* and create a BTL module to represent each device. Any addressing
|
||||
* information required by peers to reach the device should be published
|
||||
* during this function via the modex_send() interface.
|
||||
* and create a BTL module to represent each device. Any addressing
|
||||
* information required by peers to reach the device should be published
|
||||
* during this function via the modex_send() interface.
|
||||
*
|
||||
*/
|
||||
|
||||
typedef struct mca_btl_base_module_t** (*mca_btl_base_component_init_fn_t)(
|
||||
int *num_btls,
|
||||
int *num_btls,
|
||||
bool enable_progress_threads,
|
||||
bool enable_mpi_threads
|
||||
);
|
||||
@ -380,8 +492,8 @@ typedef struct mca_btl_base_module_t** (*mca_btl_base_component_init_fn_t)(
|
||||
* MCA->BTL Called to progress outstanding requests for
|
||||
* non-threaded polling environments.
|
||||
*
|
||||
* @return Count of "completions", a metric of
|
||||
* how many items were completed in the call
|
||||
* @return Count of "completions", a metric of
|
||||
* how many items were completed in the call
|
||||
* to progress.
|
||||
*/
|
||||
|
||||
@ -390,22 +502,22 @@ typedef int (*mca_btl_base_component_progress_fn_t)(void);
|
||||
|
||||
/**
|
||||
* Callback function that is called asynchronously on receipt
|
||||
* of data by the transport layer.
|
||||
* Note that the mca_btl_base_descriptor_t is only valid within the
|
||||
* completion function, this implies that all data payload in the
|
||||
* mca_btl_base_descriptor_t must be copied out within this callback or
|
||||
* of data by the transport layer.
|
||||
* Note that the mca_btl_base_descriptor_t is only valid within the
|
||||
* completion function, this implies that all data payload in the
|
||||
* mca_btl_base_descriptor_t must be copied out within this callback or
|
||||
* forfeited back to the BTL.
|
||||
* Note also that descriptor segments (des_local) must be base
|
||||
* Note also that descriptor segments (des_segments) must be base
|
||||
* segments for all callbacks.
|
||||
*
|
||||
*
|
||||
* @param[IN] btl BTL module
|
||||
* @param[IN] tag The active message receive callback tag value
|
||||
* @param[IN] descriptor The BTL descriptor (contains the receive payload)
|
||||
* @param[IN] tag The active message receive callback tag value
|
||||
* @param[IN] descriptor The BTL descriptor (contains the receive payload)
|
||||
* @param[IN] cbdata Opaque callback data
|
||||
*/
|
||||
|
||||
typedef void (*mca_btl_base_module_recv_cb_fn_t)(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* descriptor,
|
||||
void* cbdata
|
||||
@ -424,26 +536,22 @@ mca_btl_active_message_callback_t mca_btl_base_active_message_trigger[MCA_BTL_TA
|
||||
* and component open/close/init functions.
|
||||
*/
|
||||
|
||||
struct mca_btl_base_component_2_0_0_t {
|
||||
struct mca_btl_base_component_3_0_0_t {
|
||||
mca_base_component_t btl_version;
|
||||
mca_base_component_data_t btl_data;
|
||||
mca_btl_base_component_init_fn_t btl_init;
|
||||
mca_btl_base_component_progress_fn_t btl_progress;
|
||||
};
|
||||
typedef struct mca_btl_base_component_2_0_0_t mca_btl_base_component_2_0_0_t;
|
||||
typedef struct mca_btl_base_component_2_0_0_t mca_btl_base_component_t;
|
||||
typedef struct mca_btl_base_component_3_0_0_t mca_btl_base_component_3_0_0_t;
|
||||
typedef struct mca_btl_base_component_3_0_0_t mca_btl_base_component_t;
|
||||
|
||||
/* add the 1_0_0_t typedef for source compatibility
|
||||
* we can do this safely because 1_0_0 components are the same as
|
||||
* 1_0_1 components, the difference is in the btl module.
|
||||
* Fortunately the only difference in the module is an additional interface
|
||||
* function added to 1_0_1. We can therefore safely treat an older module just
|
||||
* just like the new one so long as we check the component version
|
||||
* prior to invoking the new interface function.
|
||||
/* add the 2_0_0_t typedef for source compatibility
|
||||
* we can do this safely because 2_0_0 components are the same as
|
||||
* 3_0_0 components, the difference is in the btl module.
|
||||
* Unfortunately 2_0_0 modules are not compatible with BTL 3_0_0 and
|
||||
* can not be used with the new interface.
|
||||
*/
|
||||
typedef struct mca_btl_base_component_2_0_0_t mca_btl_base_component_1_0_1_t;
|
||||
typedef struct mca_btl_base_component_2_0_0_t mca_btl_base_component_1_0_0_t;
|
||||
|
||||
typedef struct mca_btl_base_component_3_0_0_t mca_btl_base_component_2_0_0_t;
|
||||
|
||||
|
||||
/*
|
||||
@ -451,24 +559,24 @@ typedef struct mca_btl_base_component_2_0_0_t mca_btl_base_component_1_0_0_t;
|
||||
*/
|
||||
|
||||
/**
|
||||
* MCA->BTL Clean up any resources held by BTL module
|
||||
* MCA->BTL Clean up any resources held by BTL module
|
||||
* before the module is unloaded.
|
||||
*
|
||||
*
|
||||
* @param btl (IN) BTL module.
|
||||
* @return OPAL_SUCCESS or error status on failure.
|
||||
*
|
||||
* Prior to unloading a BTL module, the MCA framework will call
|
||||
* the BTL finalize method of the module. Any resources held by
|
||||
* Prior to unloading a BTL module, the MCA framework will call
|
||||
* the BTL finalize method of the module. Any resources held by
|
||||
* the BTL should be released and if required the memory corresponding
|
||||
* to the BTL module freed.
|
||||
*
|
||||
*
|
||||
*/
|
||||
typedef int (*mca_btl_base_module_finalize_fn_t)(
|
||||
struct mca_btl_base_module_t* btl
|
||||
);
|
||||
|
||||
|
||||
/**
|
||||
* BML->BTL notification of change in the process list.
|
||||
* BML->BTL notification of change in the process list.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param nprocs (IN) Number of processes
|
||||
@ -477,24 +585,24 @@ typedef int (*mca_btl_base_module_finalize_fn_t)(
|
||||
* @param reachable (OUT) Bitmask indicating set of peer processes that are reachable by this BTL.
|
||||
* @return OPAL_SUCCESS or error status on failure.
|
||||
*
|
||||
* The mca_btl_base_module_add_procs_fn_t() is called by the BML to
|
||||
* The mca_btl_base_module_add_procs_fn_t() is called by the BML to
|
||||
* determine the set of BTLs that should be used to reach each process.
|
||||
* Any addressing information exported by the peer via the modex_send()
|
||||
* function should be available during this call via the corresponding
|
||||
* modex_recv() function. The BTL may utilize this information to
|
||||
* determine reachability of each peer process.
|
||||
* function should be available during this call via the corresponding
|
||||
* modex_recv() function. The BTL may utilize this information to
|
||||
* determine reachability of each peer process.
|
||||
*
|
||||
* For each process that is reachable by the BTL, the bit corresponding to the index
|
||||
* into the proc array (nprocs) should be set in the reachable bitmask. The BTL
|
||||
* For each process that is reachable by the BTL, the bit corresponding to the index
|
||||
* into the proc array (nprocs) should be set in the reachable bitmask. The BTL
|
||||
* will return an array of pointers to a data structure defined
|
||||
* by the BTL that is then returned to the BTL on subsequent calls to the BTL data
|
||||
* transfer functions (e.g btl_send). This may be used by the BTL to cache any addressing
|
||||
* transfer functions (e.g btl_send). This may be used by the BTL to cache any addressing
|
||||
* or connection information (e.g. TCP socket, IB queue pair).
|
||||
*/
|
||||
typedef int (*mca_btl_base_module_add_procs_fn_t)(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_module_t* btl,
|
||||
size_t nprocs,
|
||||
struct opal_proc_t** procs,
|
||||
struct opal_proc_t** procs,
|
||||
struct mca_btl_base_endpoint_t** endpoints,
|
||||
struct opal_bitmap_t* reachable
|
||||
);
|
||||
@ -513,9 +621,9 @@ typedef int (*mca_btl_base_module_add_procs_fn_t)(
|
||||
* resources associated with the peer.
|
||||
*/
|
||||
typedef int (*mca_btl_base_module_del_procs_fn_t)(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_module_t* btl,
|
||||
size_t nprocs,
|
||||
struct opal_proc_t** procs,
|
||||
struct opal_proc_t** procs,
|
||||
struct mca_btl_base_endpoint_t** peer
|
||||
);
|
||||
|
||||
@ -524,17 +632,17 @@ typedef int (*mca_btl_base_module_del_procs_fn_t)(
|
||||
* of a fragment.
|
||||
*
|
||||
* @param[IN] btl BTL module
|
||||
* @param[IN] tag tag value of this callback
|
||||
* @param[IN] tag tag value of this callback
|
||||
* (specified on subsequent send operations)
|
||||
* @param[IN] cbfunc The callback function
|
||||
* @param[IN] cbdata Opaque callback data
|
||||
*
|
||||
* @param[IN] cbdata Opaque callback data
|
||||
*
|
||||
* @return OPAL_SUCCESS The callback was registered successfully
|
||||
* @return OPAL_ERROR The callback was NOT registered successfully
|
||||
*
|
||||
*/
|
||||
typedef int (*mca_btl_base_module_register_fn_t)(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_module_recv_cb_fn_t cbfunc,
|
||||
void* cbdata
|
||||
@ -543,10 +651,10 @@ typedef int (*mca_btl_base_module_register_fn_t)(
|
||||
|
||||
/**
|
||||
* Callback function that is called asynchronously on receipt
|
||||
* of an error from the transport layer
|
||||
* of an error from the transport layer
|
||||
*
|
||||
* @param[IN] btl BTL module
|
||||
* @param[IN] flags type of error
|
||||
* @param[IN] flags type of error
|
||||
* @param[IN] errproc process that had an error
|
||||
* @param[IN] btlinfo descriptive string from the BTL
|
||||
*/
|
||||
@ -571,21 +679,21 @@ typedef void (*mca_btl_base_module_error_cb_fn_t)(
|
||||
*
|
||||
*/
|
||||
typedef int (*mca_btl_base_module_register_error_fn_t)(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_module_error_cb_fn_t cbfunc
|
||||
);
|
||||
|
||||
|
||||
/**
|
||||
* Allocate a descriptor with a segment of the requested size.
|
||||
* Allocate a descriptor with a segment of the requested size.
|
||||
* Note that the BTL layer may choose to return a smaller size
|
||||
* if it cannot support the request. The order tag value ensures that
|
||||
* operations on the descriptor that is allocated will be
|
||||
* ordered w.r.t. a previous operation on a particular descriptor.
|
||||
* Ordering is only guaranteed if the previous descriptor had its
|
||||
* local completion callback function called and the order tag of
|
||||
* operations on the descriptor that is allocated will be
|
||||
* ordered w.r.t. a previous operation on a particular descriptor.
|
||||
* Ordering is only guaranteed if the previous descriptor had its
|
||||
* local completion callback function called and the order tag of
|
||||
* that descriptor is only valid upon the local completion callback function.
|
||||
*
|
||||
*
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param size (IN) Request segment size.
|
||||
@ -602,9 +710,9 @@ typedef mca_btl_base_descriptor_t* (*mca_btl_base_module_alloc_fn_t)(
|
||||
|
||||
/**
|
||||
* Return a descriptor allocated from this BTL via alloc/prepare.
|
||||
* A descriptor can only be deallocated after its local completion
|
||||
* A descriptor can only be deallocated after its local completion
|
||||
* callback function has been called for all send/put/get operations.
|
||||
*
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param segment (IN) Descriptor allocated from the BTL
|
||||
*/
|
||||
@ -615,23 +723,16 @@ typedef int (*mca_btl_base_module_free_fn_t)(
|
||||
|
||||
|
||||
/**
|
||||
* Prepare a descriptor for send/put/get using the supplied
|
||||
* convertor. If the convertor references data that is contiguous,
|
||||
* the descriptor may simply point to the user buffer. Otherwise,
|
||||
* this routine is responsible for allocating buffer space and
|
||||
* packing if required.
|
||||
* Prepare a descriptor for send using the supplied convertor. If the convertor
|
||||
* references data that is contiguous, the descriptor may simply point to the
|
||||
* user buffer. Otherwise, this routine is responsible for allocating buffer
|
||||
* space and packing if required.
|
||||
*
|
||||
* The descriptor returned can be used in multiple concurrent operations
|
||||
* (send/put/get) unless the BTL has the MCA_BTL_FLAGS_RDMA_MATCHED flag set
|
||||
* in which case a corresponding prepare call must accompany the put/get call
|
||||
* in addition, the address and length that is put/get must match the address
|
||||
* and length which is prepared.
|
||||
*
|
||||
* The order tag value ensures that operations on the
|
||||
* The order tag value ensures that operations on the
|
||||
* descriptor that is prepared will be ordered w.r.t. a previous
|
||||
* operation on a particular descriptor. Ordering is only guaranteed if
|
||||
* the previous descriptor had its local completion callback function
|
||||
* called and the order tag of that descriptor is only valid upon the local
|
||||
* operation on a particular descriptor. Ordering is only guaranteed if
|
||||
* the previous descriptor had its local completion callback function
|
||||
* called and the order tag of that descriptor is only valid upon the local
|
||||
* completion callback function.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
@ -647,7 +748,6 @@ typedef int (*mca_btl_base_module_free_fn_t)(
|
||||
typedef struct mca_btl_base_descriptor_t* (*mca_btl_base_module_prepare_fn_t)(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
@ -655,22 +755,67 @@ typedef struct mca_btl_base_descriptor_t* (*mca_btl_base_module_prepare_fn_t)(
|
||||
uint32_t flags
|
||||
);
|
||||
|
||||
/**
|
||||
* @brief Register a memory region for put/get/atomic operations.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint(IN) BTL addressing information (or NULL for all endpoints)
|
||||
* @param base (IN) Pointer to start of region
|
||||
* @param size (IN) Size of region
|
||||
* @param flags (IN) Flags including access permissions
|
||||
*
|
||||
* @returns a memory registration handle valid for both local and remote operations
|
||||
* @returns NULL if the region could not be registered
|
||||
*
|
||||
* This function registers the specified region with the hardware for use with
|
||||
* the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop
|
||||
* functions. Care should be taken to not hold an excessive number of registrations
|
||||
* as they may use limited system/NIC resources.
|
||||
*
|
||||
* Ownership of the memory pointed to by the returned (struct
|
||||
* mca_btl_base_registration_handle_t*) is passed to the caller. The
|
||||
* BTL module cannot free or reuse the handle until it is returned via
|
||||
* the mca_btl_base_module_deregister_mem_fn_t function.
|
||||
*/
|
||||
typedef struct mca_btl_base_registration_handle_t *(*mca_btl_base_module_register_mem_fn_t)(
|
||||
struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base,
|
||||
size_t size, uint32_t flags);
|
||||
|
||||
/**
|
||||
* @brief Deregister a memory region
|
||||
*
|
||||
* @param btl (IN) BTL module region was registered with
|
||||
* @param handle (IN) BTL registration handle to deregister
|
||||
*
|
||||
* This function deregisters the memory region associated with the specified handle. Care
|
||||
* should be taken to not perform any RDMA or atomic operation on this memory region
|
||||
* after it is deregistered. It is erroneous to specify a memory handle associated with
|
||||
* a remote node.
|
||||
*
|
||||
* The handle passed in will be a value previously returned by the
|
||||
* mca_btl_base_module_register_mem_fn_t function. Ownership of the
|
||||
* memory pointed to by handle passes to the BTL module; this function
|
||||
* is now allowed to free the memory, return it to a freelist, etc.
|
||||
*/
|
||||
typedef int (*mca_btl_base_module_deregister_mem_fn_t)(
|
||||
struct mca_btl_base_module_t* btl, struct mca_btl_base_registration_handle_t *handle);
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous send.
|
||||
* Completion Semantics: the descriptor has been queued for a send operation
|
||||
* the BTL now controls the descriptor until local
|
||||
* the BTL now controls the descriptor until local
|
||||
* completion callback is made on the descriptor
|
||||
*
|
||||
*
|
||||
* All BTLs allow multiple concurrent asynchronous send operations on a descriptor
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
* @param tag (IN) The tag value used to notify the peer.
|
||||
*
|
||||
* @retval OPAL_SUCCESS The descriptor was successfully queued for a send
|
||||
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a send
|
||||
* @retval OPAL_ERR_UNREACH The endpoint is not reachable
|
||||
*
|
||||
* @retval OPAL_SUCCESS The descriptor was successfully queued for a send
|
||||
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a send
|
||||
* @retval OPAL_ERR_UNREACH The endpoint is not reachable
|
||||
*/
|
||||
typedef int (*mca_btl_base_module_send_fn_t)(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
@ -680,12 +825,12 @@ typedef int (*mca_btl_base_module_send_fn_t)(
|
||||
);
|
||||
|
||||
/**
|
||||
* Initiate an immediate blocking send.
|
||||
* Completion Semantics: the BTL will make a best effort
|
||||
* to send the header and "size" bytes from the datatype using the convertor.
|
||||
* The header is guaranteed to be delivered entirely in the first segment.
|
||||
* Should the BTL be unable to deliver the data due to resource constraints
|
||||
* the BTL will return a descriptor (via the OUT param)
|
||||
* Initiate an immediate blocking send.
|
||||
* Completion Semantics: the BTL will make a best effort
|
||||
* to send the header and "size" bytes from the datatype using the convertor.
|
||||
* The header is guaranteed to be delivered entirely in the first segment.
|
||||
* Should the BTL be unable to deliver the data due to resource constraints
|
||||
* the BTL will return a descriptor (via the OUT param)
|
||||
* of size "payload_size + header_size".
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
@ -698,13 +843,13 @@ typedef int (*mca_btl_base_module_send_fn_t)(
|
||||
* @param flags (IN) Flags.
|
||||
* @param tag (IN) The tag value used to notify the peer.
|
||||
* @param descriptor (OUT) The descriptor returned if the data could not be sent immediately
|
||||
|
||||
* @retval OPAL_SUCCESS The send was successfully queued
|
||||
* @retval OPAL_ERROR The send failed
|
||||
* @retval OPAL_ERR_UNREACH The endpoint is not reachable
|
||||
* @retval OPAL_ERR_RESOURCE_BUSY The BTL is busy; a descriptor will be returned
|
||||
* (via the OUT param) if descriptors are available
|
||||
|
||||
* (may be NULL).
|
||||
*
|
||||
* @retval OPAL_SUCCESS The send was successfully queued
|
||||
* @retval OPAL_ERROR The send failed
|
||||
* @retval OPAL_ERR_UNREACH The endpoint is not reachable
|
||||
* @retval OPAL_ERR_RESOURCE_BUSY The BTL is busy; a descriptor will be returned
|
||||
* (via the OUT param) if descriptors are available
|
||||
*/
|
||||
|
||||
typedef int (*mca_btl_base_module_sendi_fn_t)(
|
||||
@ -721,59 +866,211 @@ typedef int (*mca_btl_base_module_sendi_fn_t)(
|
||||
);
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous put.
|
||||
* Completion Semantics: the descriptor has been queued for a put operation
|
||||
* the BTL now controls the descriptor until local
|
||||
* completion callback is made on the descriptor
|
||||
* Initiate an asynchronous put.
|
||||
* Completion Semantics: if this function returns a 1 then the operation
|
||||
* is complete. a return of OPAL_SUCCESS indicates
|
||||
* the put operation has been queued with the
|
||||
* network. the local_handle can not be deregistered
|
||||
* until all outstanding operations on that handle
|
||||
* have been completed.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param local_address (IN) Local address to put from (registered)
|
||||
* @param remote_address (IN) Remote address to put to (registered remotely)
|
||||
* @param local_handle (IN) Registration handle for region containing
|
||||
* (local_address, local_address + size)
|
||||
* @param remote_handle (IN) Remote registration handle for region containing
|
||||
* (remote_address, remote_address + size)
|
||||
* @param size (IN) Number of bytes to put
|
||||
* @param flags (IN) Flags for this put operation
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion (if queued)
|
||||
* @param cbcontext (IN) Context for the callback
|
||||
* @param cbdata (IN) Data for callback
|
||||
*
|
||||
* BTLs that do not have the MCA_BTL_FLAGS_RDMA_MATCHED flag set
|
||||
* allow multiple concurrent put operations on the same descriptor.
|
||||
* BTLs that do have the MCA_BTL_FLAGS_RDMA_MATCHED flag set require
|
||||
* a corresponding prepare_src/dst call for each put operation and
|
||||
* therefore prohibit multiple concurrent put operations.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*
|
||||
* @retval OPAL_SUCCESS The descriptor was successfully queued for a put
|
||||
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a put
|
||||
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put
|
||||
* operation. Try again later
|
||||
* @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or
|
||||
* alignment restrictions.
|
||||
*/
|
||||
|
||||
typedef int (*mca_btl_base_module_put_fn_t)(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* descriptor
|
||||
);
|
||||
typedef int (*mca_btl_base_module_put_fn_t) (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous get.
|
||||
* Completion Semantics: if this function returns a 1 then the operation
|
||||
* is complete. a return of OPAL_SUCCESS indicates
|
||||
* the get operation has been queued with the
|
||||
* network. the local_handle can not be deregistered
|
||||
* until all outstanding operations on that handle
|
||||
* have been completed.
|
||||
*
|
||||
* Completion Semantics: the descriptor has been queued for a get operation
|
||||
* the BTL now controls the descriptor until local
|
||||
* completion callback is made on the descriptor
|
||||
*
|
||||
* BTLs that do not have the MCA_BTL_FLAGS_RDMA_MATCHED flag set
|
||||
* allow multiple concurrent get operations on the same descriptor.
|
||||
* BTLs that do have the MCA_BTL_FLAGS_RDMA_MATCHED flag set require
|
||||
* a corresponding prepare_src/dst call for each get operation and
|
||||
* therefore prohibit multiple concurrent get operations.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*
|
||||
* @retval OPAL_SUCCESS The descriptor was successfully queued for a get
|
||||
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a get
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param local_address (IN) Local address to get into (registered)
|
||||
* @param remote_address (IN) Remote address to get from (registered remotely)
|
||||
* @param local_handle (IN) Registration handle for region containing
|
||||
* (local_address, local_address + size)
|
||||
* @param remote_handle (IN) Remote registration handle for region containing
|
||||
* (remote_address, remote_address + size)
|
||||
* @param size (IN) Number of bytes to get
|
||||
* @param flags (IN) Flags for this get operation
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion (if queued)
|
||||
* @param cbcontext (IN) Context for the callback
|
||||
* @param cbdata (IN) Data for callback
|
||||
*
|
||||
* @retval OPAL_SUCCESS The descriptor was successfully queued for a get
|
||||
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a get
|
||||
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the get
|
||||
* operation. Try again later
|
||||
* @retval OPAL_ERR_NOT_AVAILABLE Get can not be performed due to size or
|
||||
* alignment restrictions.
|
||||
*/
|
||||
typedef int (*mca_btl_base_module_get_fn_t) (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
typedef int (*mca_btl_base_module_get_fn_t)(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* descriptor
|
||||
);
|
||||
/**
|
||||
* Initiate an asynchronous atomic operation.
|
||||
* Completion Semantics: if this function returns a 1 then the operation
|
||||
* is complete. a return of OPAL_SUCCESS indicates
|
||||
* the atomic operation has been queued with the
|
||||
* network.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param remote_address (IN) Remote address to perform the operation on (registered remotely)
|
||||
* @param remote_handle (IN) Remote registration handle for region containing
|
||||
* (remote_address, remote_address + 8)
|
||||
* @param op (IN) Operation to perform
|
||||
* @param operand (IN) Operand for the operation
|
||||
* @param flags (IN) Flags for this atomic operation
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion (if queued)
|
||||
* @param cbcontext (IN) Context for the callback
|
||||
* @param cbdata (IN) Data for callback
|
||||
*
|
||||
* @retval OPAL_SUCCESS The operation was successfully queued
|
||||
* @retval 1 The operation is complete
|
||||
* @retval OPAL_ERROR The operation was NOT successfully queued
|
||||
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
|
||||
* operation. Try again later
|
||||
* @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
|
||||
* alignment restrictions or the operation {op} is not supported
|
||||
* by the hardware.
|
||||
*
|
||||
* After the operation is complete the remote address specified by {remote_address} and
|
||||
* {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand.
|
||||
* The btl will guarantee consistency of atomic operations performed via the btl. Note,
|
||||
* however, that not all btls will provide consistency between btl atomic operations and
|
||||
* cpu or other btl atomics.
|
||||
*/
|
||||
typedef int (*mca_btl_base_module_atomic_op64_fn_t) (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint, uint64_t remote_address,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
|
||||
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata);
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous fetching atomic operation.
|
||||
* Completion Semantics: if this function returns a 1 then the operation
|
||||
* is complete. a return of OPAL_SUCCESS indicates
|
||||
* the atomic operation has been queued with the
|
||||
* network.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param local_address (OUT) Local address to store the result in
|
||||
* @param remote_address (IN) Remote address to perform the operation on (registered remotely)
|
||||
* @param local_handle (IN) Local registration handle for region containing
|
||||
* (local_address, local_address + 8)
|
||||
* @param remote_handle (IN) Remote registration handle for region containing
|
||||
* (remote_address, remote_address + 8)
|
||||
* @param op (IN) Operation to perform
|
||||
* @param operand (IN) Operand for the operation
|
||||
* @param flags (IN) Flags for this atomic operation
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion (if queued)
|
||||
* @param cbcontext (IN) Context for the callback
|
||||
* @param cbdata (IN) Data for callback
|
||||
*
|
||||
* @retval OPAL_SUCCESS The operation was successfully queued
|
||||
* @retval 1 The operation is complete
|
||||
* @retval OPAL_ERROR The operation was NOT successfully queued
|
||||
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
|
||||
* operation. Try again later
|
||||
* @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
|
||||
* alignment restrictions or the operation {op} is not supported
|
||||
* by the hardware.
|
||||
*
|
||||
* After the operation is complete the remote address specified by {remote_address} and
|
||||
* {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand.
|
||||
* {local_address} will be updated with the previous value stored in {remote_address}.
|
||||
* The btl will guarantee consistency of atomic operations performed via the btl. Note,
|
||||
* however, that not all btls will provide consistency between btl atomic operations and
|
||||
* cpu or other btl atomics.
|
||||
*/
|
||||
typedef int (*mca_btl_base_module_atomic_fop64_fn_t) (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address,
|
||||
struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
|
||||
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata);
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous compare and swap operation.
|
||||
* Completion Semantics: if this function returns a 1 then the operation
|
||||
* is complete. a return of OPAL_SUCCESS indicates
|
||||
* the atomic operation has been queued with the
|
||||
* network.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param local_address (OUT) Local address to store the result in
|
||||
* @param remote_address (IN) Remote address to perform the operation on (registered remotely)
|
||||
* @param local_handle (IN) Local registration handle for region containing
|
||||
* (local_address, local_address + 8)
|
||||
* @param remote_handle (IN) Remote registration handle for region containing
|
||||
* (remote_address, remote_address + 8)
|
||||
* @param compare (IN) Value to compare against the contents of remote_address
|
||||
* @param value (IN) Value to store on success
|
||||
* @param flags (IN) Flags for this atomic operation
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion (if queued)
|
||||
* @param cbcontext (IN) Context for the callback
|
||||
* @param cbdata (IN) Data for callback
|
||||
*
|
||||
* @retval OPAL_SUCCESS The operation was successfully queued
|
||||
* @retval 1 The operation is complete
|
||||
* @retval OPAL_ERROR The operation was NOT successfully queued
|
||||
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
|
||||
* operation. Try again later
|
||||
* @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
|
||||
* alignment restrictions or the operation {op} is not supported
|
||||
* by the hardware.
|
||||
*
|
||||
* After the operation is complete the remote address specified by {remote_address} and
|
||||
* {remote_handle} will be updated with {value} if *remote_address == compare.
|
||||
* {local_address} will be updated with the previous value stored in {remote_address}.
|
||||
* The btl will guarantee consistency of atomic operations performed via the btl. Note,
|
||||
* however, that not all btls will provide consistency between btl atomic operations and
|
||||
* cpu atomics.
|
||||
*/
|
||||
typedef int (*mca_btl_base_module_atomic_cswap64_fn_t) (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address,
|
||||
struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, uint64_t compare,
|
||||
uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata);
|
||||
|
||||
/**
|
||||
* Diagnostic dump of btl state.
|
||||
@ -813,7 +1110,14 @@ struct mca_btl_base_module_t {
|
||||
uint32_t btl_latency; /**< relative ranking of latency used to prioritize btls */
|
||||
uint32_t btl_bandwidth; /**< bandwidth (Mbytes/sec) supported by each endpoint */
|
||||
uint32_t btl_flags; /**< flags (put/get...) */
|
||||
size_t btl_seg_size; /**< size of a btl segment */
|
||||
uint32_t btl_atomic_flags; /**< atomic operations supported (add, and, xor, etc) */
|
||||
size_t btl_registration_handle_size; /**< size of the BTLs registration handles */
|
||||
|
||||
/* One-sided limitations (0 for no alignment, SIZE_MAX for no limit ) */
|
||||
size_t btl_get_limit; /**< maximum size supported by the btl_get function */
|
||||
size_t btl_get_alignment; /**< minimum alignment/size needed by btl_get (power of 2) */
|
||||
size_t btl_put_limit; /**< maximum size supported by the btl_put function */
|
||||
size_t btl_put_alignment; /**< minimum alignment/size needed by btl_put (power of 2) */
|
||||
|
||||
/* BTL function table */
|
||||
mca_btl_base_module_add_procs_fn_t btl_add_procs;
|
||||
@ -824,16 +1128,24 @@ struct mca_btl_base_module_t {
|
||||
mca_btl_base_module_alloc_fn_t btl_alloc;
|
||||
mca_btl_base_module_free_fn_t btl_free;
|
||||
mca_btl_base_module_prepare_fn_t btl_prepare_src;
|
||||
mca_btl_base_module_prepare_fn_t btl_prepare_dst;
|
||||
mca_btl_base_module_send_fn_t btl_send;
|
||||
mca_btl_base_module_sendi_fn_t btl_sendi;
|
||||
mca_btl_base_module_put_fn_t btl_put;
|
||||
mca_btl_base_module_get_fn_t btl_get;
|
||||
mca_btl_base_module_dump_fn_t btl_dump;
|
||||
|
||||
/** the mpool associated with this btl (optional) */
|
||||
mca_mpool_base_module_t* btl_mpool;
|
||||
/** register a default error handler */
|
||||
mca_btl_base_module_dump_fn_t btl_dump;
|
||||
|
||||
/* atomic operations */
|
||||
mca_btl_base_module_atomic_op64_fn_t btl_atomic_op;
|
||||
mca_btl_base_module_atomic_fop64_fn_t btl_atomic_fop;
|
||||
mca_btl_base_module_atomic_cswap64_fn_t btl_atomic_cswap;
|
||||
|
||||
/* new memory registration functions */
|
||||
mca_btl_base_module_register_mem_fn_t btl_register_mem; /**< memory registration function (NULL if not needed) */
|
||||
mca_btl_base_module_deregister_mem_fn_t btl_deregister_mem; /**< memory deregistration function (NULL if not needed) */
|
||||
|
||||
/** the mpool associated with this btl (optional) */
|
||||
mca_mpool_base_module_t* btl_mpool;
|
||||
/** register a default error handler */
|
||||
mca_btl_base_module_register_error_fn_t btl_register_error;
|
||||
/** fault tolerant event notification */
|
||||
mca_btl_base_module_ft_event_fn_t btl_ft_event;
|
||||
|
@ -59,6 +59,9 @@ sources = \
|
||||
btl_openib_fd.c \
|
||||
btl_openib_ip.h \
|
||||
btl_openib_ip.c \
|
||||
btl_openib_put.c \
|
||||
btl_openib_get.c \
|
||||
btl_openib_atomic.c \
|
||||
connect/base.h \
|
||||
connect/btl_openib_connect_base.c \
|
||||
connect/btl_openib_connect_empty.c \
|
||||
|
@ -92,6 +92,11 @@
|
||||
#define MIN(a,b) ((a)<(b)?(a):(b))
|
||||
#endif
|
||||
|
||||
static mca_btl_base_registration_handle_t *mca_btl_openib_register_mem (mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
void *base, size_t size, uint32_t flags);
|
||||
static int mca_btl_openib_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle);
|
||||
|
||||
mca_btl_openib_module_t mca_btl_openib_module = {
|
||||
.super = {
|
||||
.btl_component = &mca_btl_openib_component.super,
|
||||
@ -102,14 +107,19 @@ mca_btl_openib_module_t mca_btl_openib_module = {
|
||||
.btl_alloc = mca_btl_openib_alloc,
|
||||
.btl_free = mca_btl_openib_free,
|
||||
.btl_prepare_src = mca_btl_openib_prepare_src,
|
||||
.btl_prepare_dst = mca_btl_openib_prepare_dst,
|
||||
.btl_send = mca_btl_openib_send,
|
||||
.btl_sendi = mca_btl_openib_sendi, /* send immediate */
|
||||
.btl_put = mca_btl_openib_put,
|
||||
.btl_get = mca_btl_openib_get,
|
||||
.btl_dump = mca_btl_base_dump,
|
||||
.btl_register_error = mca_btl_openib_register_error_cb, /* error call back registration */
|
||||
.btl_ft_event = mca_btl_openib_ft_event
|
||||
.btl_ft_event = mca_btl_openib_ft_event,
|
||||
.btl_register_mem = mca_btl_openib_register_mem,
|
||||
.btl_deregister_mem = mca_btl_openib_deregister_mem,
|
||||
#if HAVE_DECL_IBV_ATOMIC_HCA
|
||||
.btl_atomic_fop = mca_btl_openib_atomic_fop,
|
||||
.btl_atomic_cswap = mca_btl_openib_atomic_cswap,
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
@ -854,6 +864,12 @@ int mca_btl_openib_add_procs(
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = mca_btl_openib_size_queues(openib_btl, nprocs);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
BTL_ERROR(("error creating cqs"));
|
||||
return rc;
|
||||
}
|
||||
|
||||
for (i = 0, local_procs = 0 ; i < (int) nprocs; i++) {
|
||||
struct opal_proc_t* proc = procs[i];
|
||||
mca_btl_openib_proc_t* ib_proc;
|
||||
@ -865,11 +881,6 @@ int mca_btl_openib_add_procs(
|
||||
local_procs ++;
|
||||
}
|
||||
|
||||
/* OOB, XOOB, and RDMACM do not support SELF communication, so
|
||||
* mark the proc as unreachable by openib btl */
|
||||
if (0 == opal_compare_proc(OPAL_PROC_MY_NAME, proc->proc_name)) {
|
||||
continue;
|
||||
}
|
||||
#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
|
||||
/* Most current iWARP adapters (June 2008) cannot handle
|
||||
talking to other processes on the same host (!) -- so mark
|
||||
@ -1036,7 +1047,7 @@ int mca_btl_openib_add_procs(
|
||||
openib_btl->local_procs += local_procs;
|
||||
openib_btl->device->mem_reg_max /= openib_btl->local_procs;
|
||||
|
||||
return mca_btl_openib_size_queues(openib_btl, nprocs);
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1275,18 +1286,6 @@ int mca_btl_openib_free(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_descriptor_t* des)
|
||||
{
|
||||
/* is this fragment pointing at user memory? */
|
||||
if(MCA_BTL_OPENIB_FRAG_SEND_USER == openib_frag_type(des) ||
|
||||
MCA_BTL_OPENIB_FRAG_RECV_USER == openib_frag_type(des)) {
|
||||
mca_btl_openib_com_frag_t* frag = to_com_frag(des);
|
||||
|
||||
if(frag->registration != NULL) {
|
||||
btl->btl_mpool->mpool_deregister(btl->btl_mpool,
|
||||
(mca_mpool_base_registration_t*)frag->registration);
|
||||
frag->registration = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/* reset those field on free so we will not have to do it on alloc */
|
||||
to_base_frag(des)->base.des_flags = 0;
|
||||
switch(openib_frag_type(des)) {
|
||||
@ -1302,12 +1301,6 @@ int mca_btl_openib_free(
|
||||
to_send_frag(des)->hdr + 1;
|
||||
assert(!opal_list_get_size(&to_send_frag(des)->coalesced_frags));
|
||||
/* fall through */
|
||||
case MCA_BTL_OPENIB_FRAG_RECV:
|
||||
case MCA_BTL_OPENIB_FRAG_RECV_USER:
|
||||
case MCA_BTL_OPENIB_FRAG_SEND_USER:
|
||||
to_base_frag(des)->base.des_remote = NULL;
|
||||
to_base_frag(des)->base.des_remote_count = 0;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@ -1351,15 +1344,12 @@ int mca_btl_openib_free(
|
||||
mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags)
|
||||
{
|
||||
mca_btl_openib_module_t *openib_btl;
|
||||
mca_btl_openib_reg_t *openib_reg;
|
||||
mca_btl_openib_com_frag_t *frag = NULL;
|
||||
struct iovec iov;
|
||||
uint32_t iov_count = 1;
|
||||
@ -1367,85 +1357,20 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
|
||||
void *ptr;
|
||||
int rc;
|
||||
|
||||
openib_btl = (mca_btl_openib_module_t*)btl;
|
||||
|
||||
#if OPAL_CUDA_GDR_SUPPORT
|
||||
if(opal_convertor_cuda_need_buffers(convertor) == false && 0 == reserve) {
|
||||
#else
|
||||
if(opal_convertor_need_buffers(convertor) == false && 0 == reserve) {
|
||||
#endif /* OPAL_CUDA_GDR_SUPPORT */
|
||||
/* GMS bloody HACK! */
|
||||
if(registration != NULL || max_data > btl->btl_max_send_size) {
|
||||
frag = alloc_send_user_frag();
|
||||
if(NULL == frag) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
iov.iov_len = max_data;
|
||||
iov.iov_base = NULL;
|
||||
|
||||
opal_convertor_pack(convertor, &iov, &iov_count, &max_data);
|
||||
|
||||
*size = max_data;
|
||||
|
||||
if(NULL == registration) {
|
||||
rc = btl->btl_mpool->mpool_register(btl->btl_mpool,
|
||||
iov.iov_base, max_data, 0, ®istration);
|
||||
if(OPAL_SUCCESS != rc || NULL == registration) {
|
||||
MCA_BTL_IB_FRAG_RETURN(frag);
|
||||
return NULL;
|
||||
}
|
||||
/* keep track of the registration we did */
|
||||
to_com_frag(frag)->registration =
|
||||
(mca_btl_openib_reg_t*)registration;
|
||||
}
|
||||
openib_reg = (mca_btl_openib_reg_t*)registration;
|
||||
|
||||
frag->sg_entry.length = max_data;
|
||||
frag->sg_entry.lkey = openib_reg->mr->lkey;
|
||||
frag->sg_entry.addr = (uint64_t)(uintptr_t)iov.iov_base;
|
||||
|
||||
to_base_frag(frag)->base.order = order;
|
||||
to_base_frag(frag)->base.des_flags = flags;
|
||||
to_base_frag(frag)->segment.base.seg_len = max_data;
|
||||
to_base_frag(frag)->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) iov.iov_base;
|
||||
to_base_frag(frag)->segment.key = frag->sg_entry.lkey;
|
||||
|
||||
assert(MCA_BTL_NO_ORDER == order);
|
||||
|
||||
BTL_VERBOSE(("frag->sg_entry.lkey = %" PRIu32 " .addr = %" PRIx64,
|
||||
frag->sg_entry.lkey, frag->sg_entry.addr));
|
||||
|
||||
return &to_base_frag(frag)->base;
|
||||
}
|
||||
}
|
||||
|
||||
assert(MCA_BTL_NO_ORDER == order);
|
||||
|
||||
if(max_data + reserve > btl->btl_max_send_size) {
|
||||
if (max_data + reserve > btl->btl_max_send_size) {
|
||||
max_data = btl->btl_max_send_size - reserve;
|
||||
}
|
||||
|
||||
if (OPAL_UNLIKELY(0 == reserve)) {
|
||||
frag = (mca_btl_openib_com_frag_t *) ib_frag_alloc(openib_btl, max_data, order, flags);
|
||||
if(NULL == frag)
|
||||
return NULL;
|
||||
|
||||
/* NTH: this frag will be used for either a get or put so we need to set the lval to be
|
||||
consistent with the usage in get and put. the pval will be restored in mca_btl_openib_free */
|
||||
ptr = to_base_frag(frag)->segment.base.seg_addr.pval;
|
||||
to_base_frag(frag)->segment.base.seg_addr.lval =
|
||||
(uint64_t)(uintptr_t) ptr;
|
||||
} else {
|
||||
frag =
|
||||
(mca_btl_openib_com_frag_t *) mca_btl_openib_alloc(btl, endpoint, order,
|
||||
frag = (mca_btl_openib_com_frag_t *) mca_btl_openib_alloc (btl, endpoint, order,
|
||||
max_data + reserve, flags);
|
||||
if(NULL == frag)
|
||||
return NULL;
|
||||
|
||||
ptr = to_base_frag(frag)->segment.base.seg_addr.pval;
|
||||
if (NULL == frag) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ptr = to_base_frag(frag)->segment.base.seg_addr.pval;
|
||||
|
||||
iov.iov_len = max_data;
|
||||
iov.iov_base = (IOVBASE_TYPE *) ( (unsigned char*) ptr + reserve );
|
||||
rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data);
|
||||
@ -1468,103 +1393,6 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
|
||||
return &to_base_frag(frag)->base;
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare the dst buffer
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param peer (IN) BTL peer addressing
|
||||
* prepare dest's behavior depends on the following:
|
||||
* Has a valid memory registration been passed to prepare_src?
|
||||
* if so we attempt to use the pre-registered user-buffer, if the memory registration
|
||||
* is too small (only a portion of the user buffer) then we must reregister the user buffer
|
||||
* Has the user requested the memory to be left pinned?
|
||||
* if so we insert the memory registration into a memory tree for later lookup, we
|
||||
* may also remove a previous registration if a MRU (most recently used) list of
|
||||
* registrations is full, this prevents resources from being exhausted.
|
||||
*/
|
||||
mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags)
|
||||
{
|
||||
mca_btl_openib_module_t *openib_btl;
|
||||
mca_btl_openib_component_t *openib_component;
|
||||
mca_btl_openib_com_frag_t *frag;
|
||||
mca_btl_openib_reg_t *openib_reg;
|
||||
uint32_t max_msg_sz;
|
||||
int rc;
|
||||
void *buffer;
|
||||
|
||||
openib_btl = (mca_btl_openib_module_t*)btl;
|
||||
openib_component = (mca_btl_openib_component_t*)btl->btl_component;
|
||||
|
||||
frag = alloc_recv_user_frag();
|
||||
if(NULL == frag) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* max_msg_sz is the maximum message size of the HCA (hw limitation)
|
||||
set the minimum between local max_msg_sz and the remote */
|
||||
max_msg_sz = MIN(openib_btl->ib_port_attr.max_msg_sz,
|
||||
endpoint->endpoint_btl->ib_port_attr.max_msg_sz);
|
||||
|
||||
/* check if user has explicitly limited the max message size */
|
||||
if (openib_component->max_hw_msg_size > 0 &&
|
||||
max_msg_sz > (size_t)openib_component->max_hw_msg_size) {
|
||||
max_msg_sz = openib_component->max_hw_msg_size;
|
||||
}
|
||||
|
||||
/* limit the message so to max_msg_sz */
|
||||
if (*size > (size_t)max_msg_sz) {
|
||||
*size = (size_t)max_msg_sz;
|
||||
BTL_VERBOSE(("message size limited to %" PRIsize_t "\n", *size));
|
||||
}
|
||||
|
||||
opal_convertor_get_current_pointer(convertor, &buffer);
|
||||
|
||||
if(NULL == registration){
|
||||
/* we didn't get a memory registration passed in, so we have to
|
||||
* register the region ourselves
|
||||
*/
|
||||
uint32_t mflags = 0;
|
||||
#if OPAL_CUDA_GDR_SUPPORT
|
||||
if (convertor->flags & CONVERTOR_CUDA) {
|
||||
mflags |= MCA_MPOOL_FLAGS_CUDA_GPU_MEM;
|
||||
}
|
||||
#endif /* OPAL_CUDA_GDR_SUPPORT */
|
||||
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, buffer, *size, mflags,
|
||||
®istration);
|
||||
if(OPAL_SUCCESS != rc || NULL == registration) {
|
||||
MCA_BTL_IB_FRAG_RETURN(frag);
|
||||
return NULL;
|
||||
}
|
||||
/* keep track of the registration we did */
|
||||
frag->registration = (mca_btl_openib_reg_t*)registration;
|
||||
}
|
||||
openib_reg = (mca_btl_openib_reg_t*)registration;
|
||||
|
||||
frag->sg_entry.length = *size;
|
||||
frag->sg_entry.lkey = openib_reg->mr->lkey;
|
||||
frag->sg_entry.addr = (uint64_t)(uintptr_t)buffer;
|
||||
|
||||
to_base_frag(frag)->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) buffer;
|
||||
to_base_frag(frag)->segment.base.seg_len = *size;
|
||||
to_base_frag(frag)->segment.key = openib_reg->mr->rkey;
|
||||
to_base_frag(frag)->base.order = order;
|
||||
to_base_frag(frag)->base.des_flags = flags;
|
||||
|
||||
BTL_VERBOSE(("frag->sg_entry.lkey = %" PRIu32 " .addr = %" PRIx64 " "
|
||||
"rkey = %" PRIu32, frag->sg_entry.lkey, frag->sg_entry.addr,
|
||||
openib_reg->mr->rkey));
|
||||
|
||||
return &to_base_frag(frag)->base;
|
||||
}
|
||||
|
||||
static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl) {
|
||||
mca_btl_openib_module_t* openib_btl;
|
||||
mca_btl_openib_endpoint_t* endpoint;
|
||||
@ -1825,7 +1653,10 @@ cant_send_wqe:
|
||||
cant_send:
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
/* We can not send the data directly, so we just return descriptor */
|
||||
*descriptor = mca_btl_openib_alloc(btl, ep, order, size, flags);
|
||||
if (NULL != descriptor) {
|
||||
*descriptor = mca_btl_openib_alloc(btl, ep, order, size, flags);
|
||||
}
|
||||
|
||||
return OPAL_ERR_RESOURCE_BUSY;
|
||||
}
|
||||
/*
|
||||
@ -1855,7 +1686,7 @@ int mca_btl_openib_send(
|
||||
|
||||
to_coalesced_frag(des)->sent = true;
|
||||
to_coalesced_frag(des)->hdr->tag = tag;
|
||||
to_coalesced_frag(des)->hdr->size = des->des_local->seg_len;
|
||||
to_coalesced_frag(des)->hdr->size = des->des_segments->seg_len;
|
||||
if(ep->nbo)
|
||||
BTL_OPENIB_HEADER_COALESCED_HTON(*to_coalesced_frag(des)->hdr);
|
||||
} else {
|
||||
@ -1869,171 +1700,34 @@ int mca_btl_openib_send(
|
||||
return mca_btl_openib_endpoint_send(ep, frag);
|
||||
}
|
||||
|
||||
/*
|
||||
* RDMA WRITE local buffer to remote buffer address.
|
||||
*/
|
||||
|
||||
int mca_btl_openib_put( mca_btl_base_module_t* btl,
|
||||
mca_btl_base_endpoint_t* ep,
|
||||
mca_btl_base_descriptor_t* descriptor)
|
||||
static mca_btl_base_registration_handle_t *mca_btl_openib_register_mem (mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
void *base, size_t size, uint32_t flags)
|
||||
{
|
||||
mca_btl_openib_segment_t *src_seg = (mca_btl_openib_segment_t *) descriptor->des_local;
|
||||
mca_btl_openib_segment_t *dst_seg = (mca_btl_openib_segment_t *) descriptor->des_remote;
|
||||
struct ibv_send_wr* bad_wr;
|
||||
mca_btl_openib_out_frag_t* frag = to_out_frag(descriptor);
|
||||
int qp = descriptor->order;
|
||||
uint64_t rem_addr = dst_seg->base.seg_addr.lval;
|
||||
uint32_t rkey = dst_seg->key;
|
||||
mca_btl_openib_reg_t *reg;
|
||||
uint32_t mflags = 0;
|
||||
int rc;
|
||||
|
||||
assert(openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_SEND_USER ||
|
||||
openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_SEND);
|
||||
#if OPAL_CUDA_GDR_SUPPORT
|
||||
if (flags & MCA_BTL_REG_FLAG_CUDA_GPU_MEM) {
|
||||
mflags |= MCA_MPOOL_FLAGS_CUDA_GPU_MEM;
|
||||
}
|
||||
#endif /* OPAL_CUDA_GDR_SUPPORT */
|
||||
|
||||
descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||
|
||||
if(ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
|
||||
int rc;
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
rc = check_endpoint_state(ep, descriptor, &ep->pending_put_frags);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
if(OPAL_ERR_RESOURCE_BUSY == rc)
|
||||
return OPAL_SUCCESS;
|
||||
if(OPAL_SUCCESS != rc)
|
||||
return rc;
|
||||
rc = btl->btl_mpool->mpool_register (btl->btl_mpool, base, size, mflags,
|
||||
(mca_mpool_base_registration_t **) ®);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc || NULL == reg)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if(MCA_BTL_NO_ORDER == qp)
|
||||
qp = mca_btl_openib_component.rdma_qp;
|
||||
|
||||
/* check for a send wqe */
|
||||
if (qp_get_wqe(ep, qp) < 0) {
|
||||
qp_put_wqe(ep, qp);
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
opal_list_append(&ep->pending_put_frags, (opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
/* post descriptor */
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
if((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
|
||||
!= (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
|
||||
rem_addr = opal_swap_bytes8(rem_addr);
|
||||
rkey = opal_swap_bytes4(rkey);
|
||||
}
|
||||
#endif
|
||||
frag->sr_desc.wr.rdma.remote_addr = rem_addr;
|
||||
frag->sr_desc.wr.rdma.rkey = rkey;
|
||||
|
||||
to_com_frag(frag)->sg_entry.addr = src_seg->base.seg_addr.lval;
|
||||
to_com_frag(frag)->sg_entry.length = src_seg->base.seg_len;
|
||||
to_com_frag(frag)->endpoint = ep;
|
||||
#if HAVE_XRC
|
||||
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp))
|
||||
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
|
||||
frag->sr_desc.qp_type.xrc.remote_srqn=ep->rem_info.rem_srqs[qp].rem_srq_num;
|
||||
#else
|
||||
frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
descriptor->order = qp;
|
||||
/* Setting opcode on a frag constructor isn't enough since prepare_src
|
||||
* may return send_frag instead of put_frag */
|
||||
frag->sr_desc.opcode = IBV_WR_RDMA_WRITE;
|
||||
frag->sr_desc.send_flags = ib_send_flags(descriptor->des_local->seg_len, &(ep->qps[qp]), 1);
|
||||
qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
|
||||
qp_reset_signal_count(ep, qp);
|
||||
|
||||
qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
|
||||
qp_reset_signal_count(ep, qp);
|
||||
|
||||
if(ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr))
|
||||
return OPAL_ERROR;
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
return ®->btl_handle;
|
||||
}
|
||||
|
||||
/*
|
||||
* RDMA READ remote buffer to local buffer address.
|
||||
*/
|
||||
|
||||
int mca_btl_openib_get(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_endpoint_t* ep,
|
||||
mca_btl_base_descriptor_t* descriptor)
|
||||
static int mca_btl_openib_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle)
|
||||
{
|
||||
mca_btl_openib_segment_t *src_seg = (mca_btl_openib_segment_t *) descriptor->des_remote;
|
||||
mca_btl_openib_segment_t *dst_seg = (mca_btl_openib_segment_t *) descriptor->des_local;
|
||||
struct ibv_send_wr* bad_wr;
|
||||
mca_btl_openib_get_frag_t* frag = to_get_frag(descriptor);
|
||||
int qp = descriptor->order;
|
||||
uint64_t rem_addr = src_seg->base.seg_addr.lval;
|
||||
uint32_t rkey = src_seg->key;
|
||||
mca_btl_openib_reg_t *reg = (mca_btl_openib_reg_t *)((intptr_t) handle - offsetof (mca_btl_openib_reg_t, btl_handle));
|
||||
|
||||
assert(openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_RECV_USER);
|
||||
|
||||
descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||
|
||||
if(ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
|
||||
int rc;
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
rc = check_endpoint_state(ep, descriptor, &ep->pending_get_frags);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
if(OPAL_ERR_RESOURCE_BUSY == rc)
|
||||
return OPAL_SUCCESS;
|
||||
if(OPAL_SUCCESS != rc)
|
||||
return rc;
|
||||
}
|
||||
|
||||
if(MCA_BTL_NO_ORDER == qp)
|
||||
qp = mca_btl_openib_component.rdma_qp;
|
||||
|
||||
/* check for a send wqe */
|
||||
if (qp_get_wqe(ep, qp) < 0) {
|
||||
qp_put_wqe(ep, qp);
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/* check for a get token */
|
||||
if(OPAL_THREAD_ADD32(&ep->get_tokens,-1) < 0) {
|
||||
qp_put_wqe(ep, qp);
|
||||
OPAL_THREAD_ADD32(&ep->get_tokens,1);
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
if((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
|
||||
!= (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
|
||||
rem_addr = opal_swap_bytes8(rem_addr);
|
||||
rkey = opal_swap_bytes4(rkey);
|
||||
}
|
||||
#endif
|
||||
frag->sr_desc.wr.rdma.remote_addr = rem_addr;
|
||||
frag->sr_desc.wr.rdma.rkey = rkey;
|
||||
|
||||
to_com_frag(frag)->sg_entry.addr = dst_seg->base.seg_addr.lval;
|
||||
to_com_frag(frag)->sg_entry.length = dst_seg->base.seg_len;
|
||||
to_com_frag(frag)->endpoint = ep;
|
||||
|
||||
#if HAVE_XRC
|
||||
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp))
|
||||
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
|
||||
frag->sr_desc.qp_type.xrc.remote_srqn=ep->rem_info.rem_srqs[qp].rem_srq_num;
|
||||
#else
|
||||
frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num;
|
||||
#endif
|
||||
#endif
|
||||
descriptor->order = qp;
|
||||
|
||||
qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
|
||||
qp_reset_signal_count(ep, qp);
|
||||
|
||||
if(ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr))
|
||||
return OPAL_ERROR;
|
||||
btl->btl_mpool->mpool_deregister (btl->btl_mpool, (mca_mpool_base_registration_t *) reg);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
@ -12,7 +12,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
|
||||
@ -148,7 +148,7 @@ typedef struct mca_btl_openib_srq_manager_t {
|
||||
} mca_btl_openib_srq_manager_t;
|
||||
|
||||
struct mca_btl_openib_component_t {
|
||||
mca_btl_base_component_2_0_0_t super; /**< base BTL component */
|
||||
mca_btl_base_component_3_0_0_t super; /**< base BTL component */
|
||||
|
||||
int ib_max_btls;
|
||||
/**< maximum number of devices available to openib component */
|
||||
@ -496,9 +496,15 @@ typedef struct mca_btl_openib_module_t mca_btl_openib_module_t;
|
||||
|
||||
extern mca_btl_openib_module_t mca_btl_openib_module;
|
||||
|
||||
/* openib definition of the BTL registration handle: the pair of memory-region
 * keys that the RDMA/atomic paths read from a handle. */
struct mca_btl_base_registration_handle_t {
|
||||
uint32_t rkey; /* key placed in the work request's rkey field when this handle describes the remote side */
|
||||
uint32_t lkey; /* key placed in the scatter-gather entry when this handle describes the local buffer */
|
||||
};
|
||||
|
||||
/* openib memory registration: the mpool registration record extended with the
 * verbs memory region and the BTL-level handle handed out to callers. */
struct mca_btl_openib_reg_t {
|
||||
mca_mpool_base_registration_t base; /* generic mpool registration; the struct is cast to/from this type */
|
||||
struct ibv_mr *mr; /* verbs memory region (source of lkey/rkey) */
|
||||
mca_btl_base_registration_handle_t btl_handle; /* embedded handle; deregistration recovers the enclosing reg from its address via offsetof */
|
||||
};
|
||||
typedef struct mca_btl_openib_reg_t mca_btl_openib_reg_t;
|
||||
|
||||
@ -611,32 +617,182 @@ extern int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_descriptor_t** descriptor
|
||||
);
|
||||
|
||||
/**
|
||||
* PML->BTL Initiate a put of the specified size.
|
||||
*
|
||||
* @param btl (IN) BTL instance
|
||||
* @param btl_peer (IN) BTL peer addressing
|
||||
* @param descriptor (IN) Descriptor of data to be transmitted.
|
||||
*/
|
||||
extern int mca_btl_openib_put(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* btl_peer,
|
||||
struct mca_btl_base_descriptor_t* descriptor
|
||||
);
|
||||
/* forward declaration for internal put/get */
|
||||
struct mca_btl_openib_put_frag_t;
|
||||
struct mca_btl_openib_get_frag_t;
|
||||
|
||||
/**
|
||||
* PML->BTL Initiate a get of the specified size.
|
||||
* @brief Schedule a put fragment with the HCA (internal)
|
||||
*
|
||||
* @param btl (IN) BTL instance
|
||||
* @param btl_base_peer (IN) BTL peer addressing
|
||||
* @param descriptor (IN) Descriptor of data to be transmitted.
|
||||
* @param ep (IN) BTL endpoint
|
||||
* @param frag (IN) Fragment prepared by mca_btl_openib_put
|
||||
*
|
||||
* If the fragment can not be scheduled due to resource limitations then
|
||||
* the fragment will be put on the pending put fragment list and retried
|
||||
* when another get/put fragment has completed.
|
||||
*/
|
||||
extern int mca_btl_openib_get(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* btl_peer,
|
||||
struct mca_btl_base_descriptor_t* descriptor
|
||||
);
|
||||
int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
|
||||
struct mca_btl_openib_put_frag_t *frag);
|
||||
|
||||
/**
|
||||
* @brief Schedule an RDMA write with the HCA
|
||||
*
|
||||
* @param btl (IN) BTL instance
|
||||
* @param ep (IN) BTL endpoint
|
||||
* @param local_address (IN) Source address
|
||||
* @param remote_address (IN) Destination address
|
||||
* @param local_handle (IN) Registration handle for region containing the region {local_address, size}
|
||||
* @param remote_handle (IN) Registration handle for region containing the region {remote_address, size}
|
||||
* @param size (IN) Number of bytes to write
|
||||
* @param flags (IN) Transfer flags
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion
|
||||
* @param cbcontext (IN) Context for completion callback
|
||||
* @param cbdata (IN) Data for completion callback
|
||||
*
|
||||
* @return OPAL_ERR_BAD_PARAM if a bad parameter was passed
|
||||
* @return OPAL_SUCCESS if the operation was successfully scheduled
|
||||
*
|
||||
* This function will attempt to schedule a put operation with the HCA.
|
||||
*/
|
||||
int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
/**
|
||||
* @brief Schedule a get fragment with the HCA (internal)
|
||||
*
|
||||
* @param btl (IN) BTL instance
|
||||
* @param ep (IN) BTL endpoint
|
||||
* @param qp (IN) ID of queue pair to schedule the get on
|
||||
* @param frag (IN) Fragment prepared by mca_btl_openib_get
|
||||
*
|
||||
* If the fragment can not be scheduled due to resource limitations then
|
||||
* the fragment will be put on the pending get fragment list and retried
|
||||
* when another get/put fragment has completed.
|
||||
*/
|
||||
int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
|
||||
struct mca_btl_openib_get_frag_t *frag);
|
||||
|
||||
/**
|
||||
* @brief Schedule an RDMA read with the HCA
|
||||
*
|
||||
* @param btl (IN) BTL instance
|
||||
* @param ep (IN) BTL endpoint
|
||||
* @param local_address (IN) Destination address
|
||||
* @param remote_address (IN) Source address
|
||||
* @param local_handle (IN) Registration handle for region containing the region {local_address, size}
|
||||
* @param remote_handle (IN) Registration handle for region containing the region {remote_address, size}
|
||||
* @param size (IN) Number of bytes to read
|
||||
* @param flags (IN) Transfer flags
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion
|
||||
* @param cbcontext (IN) Context for completion callback
|
||||
* @param cbdata (IN) Data for completion callback
|
||||
*
|
||||
* @return OPAL_ERR_BAD_PARAM if a bad parameter was passed
|
||||
* @return OPAL_SUCCESS if the operation was successfully scheduled
|
||||
*
|
||||
* This function will attempt to schedule a get operation with the HCA.
|
||||
*/
|
||||
int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous fetching atomic operation.
|
||||
* Completion Semantics: if this function returns a 1 then the operation
|
||||
* is complete. a return of OPAL_SUCCESS indicates
|
||||
* the atomic operation has been queued with the
|
||||
* network.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param local_address (OUT) Local address to store the result in
|
||||
* @param remote_address (IN) Remote address to perform the operation on (registered remotely)
|
||||
* @param local_handle (IN) Local registration handle for region containing
|
||||
* (local_address, local_address + 8)
|
||||
* @param remote_handle (IN) Remote registration handle for region containing
|
||||
* (remote_address, remote_address + 8)
|
||||
* @param op (IN) Operation to perform
|
||||
* @param operand (IN) Operand for the operation
|
||||
* @param flags (IN) Flags for this atomic operation
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion (if queued)
|
||||
* @param cbcontext (IN) Context for the callback
|
||||
* @param cbdata (IN) Data for callback
|
||||
*
|
||||
* @retval OPAL_SUCCESS The operation was successfully queued
|
||||
* @retval 1 The operation is complete
|
||||
* @retval OPAL_ERROR The operation was NOT successfully queued
|
||||
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
|
||||
* operation. Try again later
|
||||
* @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
|
||||
* alignment restrictions or the operation {op} is not supported
|
||||
* by the hardware.
|
||||
*
|
||||
* After the operation is complete the remote address specified by {remote_address} and
|
||||
* {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand.
|
||||
* {local_address} will be updated with the previous value stored in {remote_address}.
|
||||
* The btl will guarantee consistency of atomic operations performed via the btl. Note,
|
||||
* however, that not all btls will provide consistency between btl atomic operations and
|
||||
* cpu atomics.
|
||||
*/
|
||||
int mca_btl_openib_atomic_fop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address, uint64_t remote_address,
|
||||
struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
|
||||
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata);
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous compare and swap operation.
|
||||
* Completion Semantics: if this function returns a 1 then the operation
|
||||
* is complete. a return of OPAL_SUCCESS indicates
|
||||
* the atomic operation has been queued with the
|
||||
* network.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param local_address (OUT) Local address to store the result in
|
||||
* @param remote_address (IN) Remote address to perform the operation on (registered remotely)
|
||||
* @param local_handle (IN) Local registration handle for region containing
|
||||
* (local_address, local_address + 8)
|
||||
* @param remote_handle (IN) Remote registration handle for region containing
|
||||
* (remote_address, remote_address + 8)
|
||||
* @param compare (IN) Operand for the operation
|
||||
* @param value (IN) Value to store on success
|
||||
* @param flags (IN) Flags for this atomic operation
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion (if queued)
|
||||
* @param cbcontext (IN) Context for the callback
|
||||
* @param cbdata (IN) Data for callback
|
||||
*
|
||||
* @retval OPAL_SUCCESS The operation was successfully queued
|
||||
* @retval 1 The operation is complete
|
||||
* @retval OPAL_ERROR The operation was NOT successfully queued
|
||||
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
|
||||
* operation. Try again later
|
||||
* @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
|
||||
* alignment restrictions or the operation {op} is not supported
|
||||
* by the hardware.
|
||||
*
|
||||
* After the operation is complete the remote address specified by {remote_address} and
|
||||
* {remote_handle} will be updated with {value} if *remote_address == compare.
|
||||
* {local_address} will be updated with the previous value stored in {remote_address}.
|
||||
* The btl will guarantee consistency of atomic operations performed via the btl. Note,
|
||||
* however, that not all btls will provide consistency between btl atomic operations and
|
||||
* cpu atomics.
|
||||
*/
|
||||
int mca_btl_openib_atomic_cswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address, uint64_t remote_address,
|
||||
struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, uint64_t compare,
|
||||
uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata);
|
||||
|
||||
/**
|
||||
* Allocate a descriptor.
|
||||
@ -673,7 +829,6 @@ extern int mca_btl_openib_free(
|
||||
mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* peer,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
@ -681,22 +836,6 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
|
||||
uint32_t flags
|
||||
);
|
||||
|
||||
/**
|
||||
* Allocate a descriptor initialized for RDMA write.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param peer (IN) BTL peer addressing
|
||||
*/
|
||||
extern mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* peer,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags);
|
||||
|
||||
extern void mca_btl_openib_frag_progress_pending_put_get(
|
||||
struct mca_btl_base_endpoint_t*, const int);
|
||||
|
||||
|
135
opal/mca/btl/openib/btl_openib_atomic.c
Обычный файл
135
opal/mca/btl/openib/btl_openib_atomic.c
Обычный файл
@ -0,0 +1,135 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "btl_openib.h"
|
||||
#include "btl_openib_endpoint.h"
|
||||
#include "btl_openib_xrc.h"
|
||||
|
||||
#if HAVE_DECL_IBV_ATOMIC_HCA
|
||||
|
||||
/**
 * Build an 8-byte atomic work request on a recv-user (get) fragment and hand
 * it to the get scheduling path.
 *
 * @param btl            (IN)  BTL module, forwarded to mca_btl_openib_get_internal
 * @param endpoint       (IN)  Endpoint the operation targets
 * @param local_address  (OUT) Local buffer used as the scatter-gather target
 * @param remote_address (IN)  Remote address stored in wr.atomic.remote_addr
 * @param local_handle   (IN)  Handle supplying the local lkey
 * @param remote_handle  (IN)  Handle supplying the remote rkey
 * @param opcode         (IN)  Verbs opcode for the work request
 * @param operand        (IN)  Value stored in wr.atomic.compare_add
 * @param operand2       (IN)  Value stored in wr.atomic.swap. This is a 64-bit
 *                             parameter so that a 64-bit swap value is not
 *                             truncated on its way into the work request.
 * @param flags          (IN)  Not used by this function
 * @param order          (IN)  Requested queue pair; MCA_BTL_NO_ORDER selects
 *                             the component's rdma_qp
 * @param cbfunc         (IN)  Completion callback stored on the fragment
 * @param cbcontext      (IN)  Context stored for the callback
 * @param cbdata         (IN)  Data stored for the callback
 *
 * @return OPAL_ERR_OUT_OF_RESOURCE if no fragment could be allocated
 * @return OPAL_SUCCESS if the fragment was posted or queued for later
 * @return any other error reported by check_endpoint_state or
 *         mca_btl_openib_get_internal (the fragment is returned first)
 */
static int mca_btl_openib_atomic_internal (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                           void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                                           mca_btl_base_registration_handle_t *remote_handle, enum ibv_wr_opcode opcode,
                                           int64_t operand, int64_t operand2, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
                                           void *cbcontext, void *cbdata)
{
    mca_btl_openib_get_frag_t* frag = NULL;
    int qp = order;
    int rc;

    frag = to_get_frag(alloc_recv_user_frag());
    if (OPAL_UNLIKELY(NULL == frag)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    if (MCA_BTL_NO_ORDER == qp) {
        qp = mca_btl_openib_component.rdma_qp;
    }

    /* set base descriptor flags */
    to_base_frag(frag)->base.order = qp;
    /* free this descriptor when the operation is complete */
    to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;

    /* set up scatter-gather entry (atomic operands are 8 bytes wide) */
    to_com_frag(frag)->sg_entry.length = 8;
    to_com_frag(frag)->sg_entry.lkey = local_handle->lkey;
    to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t) local_address;
    to_com_frag(frag)->endpoint = endpoint;

    /* set up rdma callback */
    frag->cb.func = cbfunc;
    frag->cb.context = cbcontext;
    frag->cb.data = cbdata;
    frag->cb.local_handle = local_handle;

    /* set up descriptor */
    frag->sr_desc.wr.atomic.remote_addr = remote_address;
    frag->sr_desc.opcode = opcode;
    frag->sr_desc.wr.atomic.compare_add = operand;
    frag->sr_desc.wr.atomic.swap = operand2;

#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    if((endpoint->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
       != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
        frag->sr_desc.wr.atomic.rkey = opal_swap_bytes4 (remote_handle->rkey);
    } else
#endif
    {
        frag->sr_desc.wr.atomic.rkey = remote_handle->rkey;
    }

#if HAVE_XRC
    if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
        /* pick the SRQ-number field the same way the put/get paths do */
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
        frag->sr_desc.qp_type.xrc.remote_srqn=endpoint->rem_info.rem_srqs[qp].rem_srq_num;
#else
        frag->sr_desc.xrc_remote_srq_num=endpoint->rem_info.rem_srqs[qp].rem_srq_num;
#endif
    }
#endif

    if (endpoint->endpoint_state != MCA_BTL_IB_CONNECTED) {
        OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
        rc = check_endpoint_state(endpoint, &to_base_frag(frag)->base, &endpoint->pending_get_frags);
        OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
        if (OPAL_ERR_RESOURCE_BUSY == rc) {
            /* reported to the caller as success; the fragment is not returned here */
            return OPAL_SUCCESS;
        }

        if (OPAL_SUCCESS != rc) {
            MCA_BTL_IB_FRAG_RETURN (frag);
            return rc;
        }
    }

    rc = mca_btl_openib_get_internal (btl, endpoint, frag);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
            /* out of resources right now: park the fragment on the pending list */
            rc = OPAL_SUCCESS;

            OPAL_THREAD_SCOPED_LOCK(&endpoint->endpoint_lock,
                                    opal_list_append(&endpoint->pending_get_frags, (opal_list_item_t*)frag));
        } else {
            MCA_BTL_IB_FRAG_RETURN (frag);
        }
    }

    return rc;
}
|
||||
|
||||
/**
 * Atomic fetch-and-op entry point of the openib BTL.
 *
 * Only MCA_BTL_ATOMIC_ADD is handled here; it is issued as an
 * IBV_WR_ATOMIC_FETCH_AND_ADD work request by
 * mca_btl_openib_atomic_internal(). Every other operation is rejected
 * before any resource is touched.
 *
 * @param btl            BTL module, forwarded to the internal helper
 * @param endpoint       peer endpoint the work request is posted to
 * @param local_address  local buffer, used by the helper as the 8-byte
 *                       scatter-gather entry of the work request
 * @param remote_address remote address the atomic operates on
 * @param local_handle   registration handle covering local_address
 * @param remote_handle  registration handle covering remote_address
 * @param op             requested atomic operation
 * @param operand        value handed to the helper as the first operand
 * @param flags          forwarded unchanged
 * @param order          forwarded unchanged (queue pair / ordering hint)
 * @param cbfunc         completion callback, forwarded unchanged
 * @param cbcontext      completion callback context, forwarded unchanged
 * @param cbdata         completion callback data, forwarded unchanged
 *
 * @return OPAL_ERR_NOT_SUPPORTED when op is not MCA_BTL_ATOMIC_ADD,
 *         otherwise the result of mca_btl_openib_atomic_internal()
 */
int mca_btl_openib_atomic_fop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                               void *local_address, uint64_t remote_address,
                               struct mca_btl_base_registration_handle_t *local_handle,
                               struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
                               uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
                               void *cbcontext, void *cbdata)
{
    switch (op) {
    case MCA_BTL_ATOMIC_ADD:
        /* the helper's second operand (stored in the work request's swap
         * field) is passed as 0 for fetch-and-add */
        return mca_btl_openib_atomic_internal (btl, endpoint, local_address, remote_address, local_handle,
                                               remote_handle, IBV_WR_ATOMIC_FETCH_AND_ADD, operand, 0,
                                               flags, order, cbfunc, cbcontext, cbdata);
    default:
        return OPAL_ERR_NOT_SUPPORTED;
    }
}
|
||||
|
||||
/**
 * Atomic compare-and-swap entry point of the openib BTL.
 *
 * Forwards every argument to mca_btl_openib_atomic_internal() with the
 * IBV_WR_ATOMIC_CMP_AND_SWP opcode: compare becomes the helper's first
 * operand and value its second operand.
 *
 * NOTE(review): the visible tail of the helper's parameter list declares the
 * second operand as int, so a 64-bit swap value may be narrowed on the way
 * in -- confirm against the helper's full signature.
 *
 * @param btl            BTL module, forwarded to the internal helper
 * @param endpoint       peer endpoint the work request is posted to
 * @param local_address  local buffer, used by the helper as the 8-byte
 *                       scatter-gather entry of the work request
 * @param remote_address remote address the atomic operates on
 * @param local_handle   registration handle covering local_address
 * @param remote_handle  registration handle covering remote_address
 * @param compare        value the remote location is compared against
 * @param value          value to store when the comparison succeeds
 * @param flags          forwarded unchanged
 * @param order          forwarded unchanged (queue pair / ordering hint)
 * @param cbfunc         completion callback, forwarded unchanged
 * @param cbcontext      completion callback context, forwarded unchanged
 * @param cbdata         completion callback data, forwarded unchanged
 *
 * @return the result of mca_btl_openib_atomic_internal()
 */
int mca_btl_openib_atomic_cswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                 void *local_address, uint64_t remote_address,
                                 struct mca_btl_base_registration_handle_t *local_handle,
                                 struct mca_btl_base_registration_handle_t *remote_handle, uint64_t compare,
                                 uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
                                 void *cbcontext, void *cbdata)
{
    int status;

    status = mca_btl_openib_atomic_internal (btl, endpoint, local_address, remote_address,
                                             local_handle, remote_handle,
                                             IBV_WR_ATOMIC_CMP_AND_SWP, compare, value,
                                             flags, order, cbfunc, cbcontext, cbdata);

    return status;
}
|
||||
|
||||
#endif
|
@ -468,7 +468,7 @@ static void btl_openib_control(mca_btl_base_module_t* btl,
|
||||
mca_btl_openib_header_coalesced_t *clsc_hdr =
|
||||
(mca_btl_openib_header_coalesced_t*)(ctl_hdr + 1);
|
||||
mca_btl_active_message_callback_t* reg;
|
||||
size_t len = des->des_local->seg_len - sizeof(*ctl_hdr);
|
||||
size_t len = des->des_segments->seg_len - sizeof(*ctl_hdr);
|
||||
|
||||
switch (ctl_hdr->type) {
|
||||
case MCA_BTL_OPENIB_CONTROL_CREDITS:
|
||||
@ -519,8 +519,8 @@ static void btl_openib_control(mca_btl_base_module_t* btl,
|
||||
|
||||
skip = (sizeof(*clsc_hdr) + clsc_hdr->alloc_size - pad);
|
||||
|
||||
tmp_des.des_local = &tmp_seg;
|
||||
tmp_des.des_local_count = 1;
|
||||
tmp_des.des_segments = &tmp_seg;
|
||||
tmp_des.des_segment_count = 1;
|
||||
tmp_seg.seg_addr.pval = clsc_hdr + 1;
|
||||
tmp_seg.seg_len = clsc_hdr->size;
|
||||
|
||||
@ -580,6 +580,10 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size,
|
||||
enum ibv_access_flags access_flag = (enum ibv_access_flags) (IBV_ACCESS_LOCAL_WRITE |
|
||||
IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
|
||||
|
||||
#if HAVE_DECL_IBV_ATOMIC_HCA
|
||||
access_flag |= IBV_ACCESS_REMOTE_ATOMIC;
|
||||
#endif
|
||||
|
||||
if (device->mem_reg_max &&
|
||||
device->mem_reg_max < (device->mem_reg_active + size)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
@ -602,6 +606,9 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size,
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
openib_reg->btl_handle.lkey = openib_reg->mr->lkey;
|
||||
openib_reg->btl_handle.rkey = openib_reg->mr->rkey;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((30, mca_btl_openib_component.memory_registration_verbose,
|
||||
"openib_reg_mr: base=%p, bound=%p, size=%d, flags=0x%x", reg->base, reg->bound,
|
||||
(int) (reg->bound - reg->base + 1), reg->flags));
|
||||
@ -799,7 +806,30 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
|
||||
mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbfunc = btl_openib_control;
|
||||
mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbdata = NULL;
|
||||
|
||||
openib_btl->super.btl_seg_size = sizeof (mca_btl_openib_segment_t);
|
||||
if (openib_btl->super.btl_get_limit > openib_btl->ib_port_attr.max_msg_sz) {
|
||||
openib_btl->super.btl_get_limit = openib_btl->ib_port_attr.max_msg_sz;
|
||||
}
|
||||
|
||||
openib_btl->super.btl_get_alignment = 0;
|
||||
|
||||
if (openib_btl->super.btl_put_limit > openib_btl->ib_port_attr.max_msg_sz) {
|
||||
openib_btl->super.btl_put_limit = openib_btl->ib_port_attr.max_msg_sz;
|
||||
}
|
||||
|
||||
#if HAVE_DECL_IBV_ATOMIC_HCA
|
||||
if (openib_btl->device->ib_dev_attr.atomic_cap == IBV_ATOMIC_NONE) {
|
||||
openib_btl->super.btl_flags &= ~MCA_BTL_FLAGS_ATOMIC_FOPS;
|
||||
openib_btl->super.btl_atomic_flags = 0;
|
||||
openib_btl->super.btl_atomic_fop = NULL;
|
||||
openib_btl->super.btl_atomic_cswap = NULL;
|
||||
} else if (IBV_ATOMIC_GLOB == openib_btl->device->ib_dev_attr.atomic_cap) {
|
||||
openib_btl->super.btl_flags |= MCA_BTL_ATOMIC_SUPPORTS_GLOB;
|
||||
}
|
||||
#endif
|
||||
|
||||
openib_btl->super.btl_put_alignment = 0;
|
||||
|
||||
openib_btl->super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
|
||||
|
||||
/* Check bandwidth configured for this device */
|
||||
sprintf(param, "bandwidth_%s", ibv_get_device_name(device->ib_dev));
|
||||
@ -2976,17 +3006,20 @@ void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep,
|
||||
size_t i, len = opal_list_get_size(&ep->pending_get_frags);
|
||||
int rc;
|
||||
|
||||
for(i = 0; i < len && ep->qps[qp].qp->sd_wqe > 0 && ep->get_tokens > 0; i++)
|
||||
{
|
||||
for(i = 0; i < len && ep->qps[qp].qp->sd_wqe > 0 && ep->get_tokens > 0; i++) {
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
frag = opal_list_remove_first(&(ep->pending_get_frags));
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
if(NULL == frag)
|
||||
if (NULL == frag)
|
||||
break;
|
||||
rc = mca_btl_openib_get((mca_btl_base_module_t *)openib_btl, ep,
|
||||
&to_base_frag(frag)->base);
|
||||
if(OPAL_ERR_OUT_OF_RESOURCE == rc)
|
||||
rc = mca_btl_openib_get_internal ((mca_btl_base_module_t *)openib_btl, ep,
|
||||
to_get_frag(frag));
|
||||
if (OPAL_ERR_OUT_OF_RESOURCE == rc) {
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
opal_list_prepend (&ep->pending_get_frags, frag);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
len = opal_list_get_size(&ep->pending_put_frags);
|
||||
@ -2994,12 +3027,16 @@ void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep,
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
frag = opal_list_remove_first(&(ep->pending_put_frags));
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
if(NULL == frag)
|
||||
if (NULL == frag)
|
||||
break;
|
||||
rc = mca_btl_openib_put((mca_btl_base_module_t*)openib_btl, ep,
|
||||
&to_base_frag(frag)->base);
|
||||
if(OPAL_ERR_OUT_OF_RESOURCE == rc)
|
||||
rc = mca_btl_openib_put_internal ((mca_btl_base_module_t*)openib_btl, ep,
|
||||
to_put_frag(frag));
|
||||
if (OPAL_ERR_OUT_OF_RESOURCE == rc) {
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
opal_list_prepend (&ep->pending_put_frags, frag);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -3020,7 +3057,7 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
|
||||
|
||||
/* advance the segment address past the header and subtract from the
|
||||
* length.*/
|
||||
des->des_local->seg_len = byte_len - sizeof(mca_btl_openib_header_t);
|
||||
des->des_segments->seg_len = byte_len - sizeof(mca_btl_openib_header_t);
|
||||
|
||||
if(OPAL_LIKELY(!(is_credit_msg = is_credit_message(frag)))) {
|
||||
/* call registered callback */
|
||||
@ -3055,7 +3092,7 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
|
||||
}
|
||||
} else {
|
||||
mca_btl_openib_rdma_credits_header_t *chdr =
|
||||
(mca_btl_openib_rdma_credits_header_t *) des->des_local->seg_addr.pval;
|
||||
(mca_btl_openib_rdma_credits_header_t *) des->des_segments->seg_addr.pval;
|
||||
if(ep->nbo) {
|
||||
BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*chdr);
|
||||
}
|
||||
@ -3361,11 +3398,27 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
|
||||
/* Handle work completions */
|
||||
switch(wc->opcode) {
|
||||
case IBV_WC_RDMA_READ:
|
||||
OPAL_OUTPUT((-1, "Got WC: RDMA_READ"));
|
||||
OPAL_THREAD_ADD32(&endpoint->get_tokens, 1);
|
||||
/* fall through */
|
||||
case IBV_WC_COMP_SWAP:
|
||||
case IBV_WC_FETCH_ADD:
|
||||
OPAL_OUTPUT((-1, "Got WC: RDMA_READ or RDMA_WRITE"));
|
||||
|
||||
OPAL_THREAD_ADD32(&endpoint->get_tokens, 1);
|
||||
|
||||
mca_btl_openib_get_frag_t *get_frag = to_get_frag(des);
|
||||
|
||||
get_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr,
|
||||
get_frag->cb.local_handle, get_frag->cb.context, get_frag->cb.data,
|
||||
OPAL_SUCCESS);
|
||||
case IBV_WC_RDMA_WRITE:
|
||||
if (MCA_BTL_OPENIB_FRAG_SEND_USER == openib_frag_type(des)) {
|
||||
mca_btl_openib_put_frag_t *put_frag = to_put_frag(des);
|
||||
|
||||
put_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr,
|
||||
put_frag->cb.local_handle, put_frag->cb.context, put_frag->cb.data,
|
||||
OPAL_SUCCESS);
|
||||
put_frag->cb.func = NULL;
|
||||
}
|
||||
/* fall through */
|
||||
case IBV_WC_SEND:
|
||||
OPAL_OUTPUT((-1, "Got WC: RDMA_WRITE or SEND"));
|
||||
if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
|
||||
@ -3394,7 +3447,7 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
|
||||
/* Process a completed send/put/get */
|
||||
btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||
if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
|
||||
des->des_cbfunc(&openib_btl->super, endpoint, des,OPAL_SUCCESS);
|
||||
des->des_cbfunc(&openib_btl->super, endpoint, des, OPAL_SUCCESS);
|
||||
}
|
||||
if( btl_ownership ) {
|
||||
mca_btl_openib_free(&openib_btl->super, des);
|
||||
|
@ -89,7 +89,7 @@ int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint,
|
||||
if(acquire_wqe(endpoint, frag) != OPAL_SUCCESS)
|
||||
return OPAL_ERR_RESOURCE_BUSY;
|
||||
|
||||
size = des->des_local->seg_len + frag->coalesced_length;
|
||||
size = des->des_segments->seg_len + frag->coalesced_length;
|
||||
|
||||
rc = mca_btl_openib_endpoint_credit_acquire (endpoint, qp, prio, size,
|
||||
&do_rdma, frag, true);
|
||||
|
@ -1,3 +1,4 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -10,7 +11,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved.
|
||||
|
@ -153,8 +153,8 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
||||
if (NULL != btlname) free(btlname);
|
||||
|
||||
/* Since we believe we have done a send, read or write, then the
|
||||
* des_local fields should have valid data. */
|
||||
assert(des->des_local != NULL);
|
||||
* des_segments fields should have valid data. */
|
||||
assert(des->des_segments != NULL);
|
||||
|
||||
/* If the endpoint is not yet in the MCA_BTL_IB_CLOSED state, then
|
||||
* change the status. Since this connection was mapped out in the
|
||||
|
@ -68,8 +68,8 @@ static void out_constructor(mca_btl_openib_out_frag_t *frag)
|
||||
{
|
||||
mca_btl_openib_frag_t *base_frag = to_base_frag(frag);
|
||||
|
||||
base_frag->base.des_local = &base_frag->segment.base;
|
||||
base_frag->base.des_local_count = 1;
|
||||
base_frag->base.des_segments = &base_frag->segment.base;
|
||||
base_frag->base.des_segment_count = 1;
|
||||
|
||||
frag->sr_desc.wr_id = (uint64_t)(uintptr_t)frag;
|
||||
frag->sr_desc.sg_list = &to_com_frag(frag)->sg_entry;
|
||||
@ -83,8 +83,8 @@ static void in_constructor(mca_btl_openib_in_frag_t *frag)
|
||||
{
|
||||
mca_btl_openib_frag_t *base_frag = to_base_frag(frag);
|
||||
|
||||
base_frag->base.des_local = &base_frag->segment.base;
|
||||
base_frag->base.des_local_count = 1;
|
||||
base_frag->base.des_segments = &base_frag->segment.base;
|
||||
base_frag->base.des_segment_count = 1;
|
||||
}
|
||||
|
||||
static void send_constructor(mca_btl_openib_send_frag_t *frag)
|
||||
@ -134,6 +134,7 @@ static void put_constructor(mca_btl_openib_put_frag_t *frag)
|
||||
{
|
||||
to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_SEND_USER;
|
||||
to_out_frag(frag)->sr_desc.opcode = IBV_WR_RDMA_WRITE;
|
||||
frag->cb.func = NULL;
|
||||
}
|
||||
|
||||
static void get_constructor(mca_btl_openib_get_frag_t *frag)
|
||||
@ -154,8 +155,8 @@ static void coalesced_constructor(mca_btl_openib_coalesced_frag_t *frag)
|
||||
|
||||
base_frag->type = MCA_BTL_OPENIB_FRAG_COALESCED;
|
||||
|
||||
base_frag->base.des_local = &base_frag->segment.base;
|
||||
base_frag->base.des_local_count = 1;
|
||||
base_frag->base.des_segments = &base_frag->segment.base;
|
||||
base_frag->base.des_segment_count = 1;
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(
|
||||
|
@ -349,7 +349,15 @@ OBJ_CLASS_DECLARATION(mca_btl_openib_recv_frag_t);
|
||||
|
||||
#define to_recv_frag(f) ((mca_btl_openib_recv_frag_t*)(f))
|
||||
|
||||
typedef struct mca_btl_openib_out_frag_t mca_btl_openib_put_frag_t;
|
||||
typedef struct mca_btl_openib_put_frag_t {
|
||||
mca_btl_openib_out_frag_t super;
|
||||
struct {
|
||||
mca_btl_base_rdma_completion_fn_t func;
|
||||
mca_btl_base_registration_handle_t *local_handle;
|
||||
void *context;
|
||||
void *data;
|
||||
} cb;
|
||||
} mca_btl_openib_put_frag_t;
|
||||
OBJ_CLASS_DECLARATION(mca_btl_openib_put_frag_t);
|
||||
|
||||
#define to_put_frag(f) ((mca_btl_openib_put_frag_t*)(f))
|
||||
@ -357,6 +365,12 @@ OBJ_CLASS_DECLARATION(mca_btl_openib_put_frag_t);
|
||||
typedef struct mca_btl_openib_get_frag_t {
|
||||
mca_btl_openib_in_frag_t super;
|
||||
struct ibv_send_wr sr_desc;
|
||||
struct {
|
||||
mca_btl_base_rdma_completion_fn_t func;
|
||||
mca_btl_base_registration_handle_t *local_handle;
|
||||
void *context;
|
||||
void *data;
|
||||
} cb;
|
||||
} mca_btl_openib_get_frag_t;
|
||||
OBJ_CLASS_DECLARATION(mca_btl_openib_get_frag_t);
|
||||
|
||||
|
163
opal/mca/btl/openib/btl_openib_get.c
Обычный файл
163
opal/mca/btl/openib/btl_openib_get.c
Обычный файл
@ -0,0 +1,163 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2013 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2009 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "btl_openib.h"
|
||||
#include "btl_openib_frag.h"
|
||||
#include "btl_openib_endpoint.h"
|
||||
#include "btl_openib_xrc.h"
|
||||
|
||||
/*
|
||||
* RDMA READ remote buffer to local buffer address.
|
||||
*/
|
||||
|
||||
/**
 * Start an RDMA read of size bytes from remote_address into local_address.
 *
 * A get fragment is allocated and filled in, then either posted right away
 * through mca_btl_openib_get_internal() or parked on the endpoint's
 * pending_get_frags list (endpoint not connected yet, or no send resources
 * available at the moment).
 *
 * @param btl            BTL module; its btl_get_limit bounds size
 * @param ep             peer endpoint
 * @param local_address  destination buffer of the read
 * @param remote_address source address on the peer
 * @param local_handle   registration handle providing the local key
 * @param remote_handle  registration handle providing the remote key
 * @param size           number of bytes to read
 * @param flags          not used by this function
 * @param order          queue pair to use, or MCA_BTL_NO_ORDER to use the
 *                       component's rdma_qp
 * @param cbfunc         completion callback stored in the fragment
 * @param cbcontext      context stored for the completion callback
 * @param cbdata         data stored for the completion callback
 *
 * @return OPAL_SUCCESS when the read was posted or queued,
 *         OPAL_ERR_BAD_PARAM when size exceeds btl_get_limit,
 *         OPAL_ERR_OUT_OF_RESOURCE when no fragment could be allocated,
 *         or the error reported by check_endpoint_state() /
 *         mca_btl_openib_get_internal()
 */
int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address,
                        uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                        mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
                        int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    if (OPAL_UNLIKELY(size > btl->btl_get_limit)) {
        return OPAL_ERR_BAD_PARAM;
    }

    mca_btl_openib_get_frag_t *get_frag = to_get_frag(alloc_recv_user_frag());
    if (OPAL_UNLIKELY(NULL == get_frag)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* fall back to the default rdma queue pair when no ordering is requested */
    const int qp_index = (MCA_BTL_NO_ORDER == order) ? (int) mca_btl_openib_component.rdma_qp : order;

    mca_btl_openib_frag_t *base_frag = to_base_frag(get_frag);
    struct ibv_send_wr *wr = &get_frag->sr_desc;
    int status;

    /* base descriptor: remember the queue pair and let the BTL release the
     * descriptor once the operation has completed */
    base_frag->base.order = qp_index;
    base_frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;

    /* scatter-gather entry describing the local destination buffer */
    to_com_frag(get_frag)->sg_entry.length = size;
    to_com_frag(get_frag)->sg_entry.lkey = local_handle->lkey;
    to_com_frag(get_frag)->sg_entry.addr = (uint64_t)(uintptr_t) local_address;
    to_com_frag(get_frag)->endpoint = ep;

    /* completion callback state */
    get_frag->cb.func = cbfunc;
    get_frag->cb.context = cbcontext;
    get_frag->cb.data = cbdata;
    get_frag->cb.local_handle = local_handle;

    /* work request; the opcode is always rewritten because an atomic
     * operation may have changed it on a previously used fragment */
    wr->wr.rdma.remote_addr = remote_address;
    wr->opcode = IBV_WR_RDMA_READ;

    wr->wr.rdma.rkey = remote_handle->rkey;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    /* the remote key is byte-swapped when the peer's endianness differs */
    if ((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
        != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
        wr->wr.rdma.rkey = opal_swap_bytes4 (remote_handle->rkey);
    }
#endif

#if HAVE_XRC
    if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp_index)) {
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
        wr->qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp_index].rem_srq_num;
#else
        wr->xrc_remote_srq_num = ep->rem_info.rem_srqs[qp_index].rem_srq_num;
#endif
    }
#endif

    if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
        OPAL_THREAD_LOCK(&ep->endpoint_lock);
        status = check_endpoint_state(ep, &base_frag->base, &ep->pending_get_frags);
        OPAL_THREAD_UNLOCK(&ep->endpoint_lock);

        /* RESOURCE_BUSY is reported to the caller as success: the fragment
         * was handed to check_endpoint_state() with the pending list */
        if (OPAL_ERR_RESOURCE_BUSY == status) {
            return OPAL_SUCCESS;
        }

        if (OPAL_SUCCESS != status) {
            MCA_BTL_IB_FRAG_RETURN (get_frag);
            return status;
        }
    }

    status = mca_btl_openib_get_internal (btl, ep, get_frag);
    if (OPAL_LIKELY(OPAL_SUCCESS == status)) {
        return OPAL_SUCCESS;
    }

    if (OPAL_ERR_OUT_OF_RESOURCE != status) {
        /* hard failure: give the fragment back and report the error */
        MCA_BTL_IB_FRAG_RETURN (get_frag);
        return status;
    }

    /* out of send resources: queue the fragment so it can be retried later */
    OPAL_THREAD_LOCK(&ep->endpoint_lock);
    opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)get_frag);
    OPAL_THREAD_UNLOCK(&ep->endpoint_lock);

    return OPAL_SUCCESS;
}
|
||||
|
||||
int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
|
||||
mca_btl_openib_get_frag_t *frag)
|
||||
{
|
||||
int qp = to_base_frag(frag)->base.order;
|
||||
struct ibv_send_wr *bad_wr;
|
||||
|
||||
/* check for a send wqe */
|
||||
if (qp_get_wqe(ep, qp) < 0) {
|
||||
qp_put_wqe(ep, qp);
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* check for a get token */
|
||||
if (OPAL_THREAD_ADD32(&ep->get_tokens,-1) < 0) {
|
||||
qp_put_wqe(ep, qp);
|
||||
OPAL_THREAD_ADD32(&ep->get_tokens,1);
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
|
||||
qp_reset_signal_count(ep, qp);
|
||||
|
||||
if (ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr)) {
|
||||
qp_put_wqe(ep, qp);
|
||||
OPAL_THREAD_ADD32(&ep->get_tokens,1);
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
@ -1,3 +1,4 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -11,7 +12,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
|
||||
@ -567,10 +568,16 @@ int btl_openib_register_mca_params(void)
|
||||
mca_btl_openib_module.super.btl_rdma_pipeline_frag_size = 1024 * 1024;
|
||||
mca_btl_openib_module.super.btl_min_rdma_pipeline_size = 256 * 1024;
|
||||
mca_btl_openib_module.super.btl_flags = MCA_BTL_FLAGS_RDMA |
|
||||
MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA;
|
||||
MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA;
|
||||
#if BTL_OPENIB_FAILOVER_ENABLED
|
||||
mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_FAILOVER_SUPPORT;
|
||||
#endif
|
||||
|
||||
#if HAVE_DECL_IBV_ATOMIC_HCA
|
||||
mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS;
|
||||
mca_btl_openib_module.super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_CSWAP;
|
||||
#endif
|
||||
|
||||
/* Default to bandwidth auto-detection */
|
||||
mca_btl_openib_module.super.btl_bandwidth = 0;
|
||||
mca_btl_openib_module.super.btl_latency = 4;
|
||||
|
160
opal/mca/btl/openib/btl_openib_put.c
Обычный файл
160
opal/mca/btl/openib/btl_openib_put.c
Обычный файл
@ -0,0 +1,160 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2013 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2009 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "btl_openib.h"
|
||||
#include "btl_openib_frag.h"
|
||||
#include "btl_openib_endpoint.h"
|
||||
#include "btl_openib_xrc.h"
|
||||
|
||||
/*
|
||||
* RDMA WRITE local buffer to remote buffer address.
|
||||
*/
|
||||
|
||||
/**
 * Start an RDMA write of size bytes from local_address to remote_address.
 *
 * A put fragment is allocated and filled in, then either posted right away
 * through mca_btl_openib_put_internal() or parked on the endpoint's
 * pending_put_frags list (endpoint not connected yet, or no send resources
 * available at the moment).
 *
 * @param btl            BTL module; its btl_put_limit bounds size
 * @param ep             peer endpoint
 * @param local_address  source buffer of the write
 * @param remote_address destination address on the peer
 * @param local_handle   registration handle providing the local key
 * @param remote_handle  registration handle providing the remote key
 * @param size           number of bytes to write
 * @param flags          not used by this function
 * @param order          queue pair to use, or MCA_BTL_NO_ORDER to use the
 *                       component's rdma_qp
 * @param cbfunc         completion callback stored in the fragment
 * @param cbcontext      context stored for the completion callback
 * @param cbdata         data stored for the completion callback
 *
 * @return OPAL_SUCCESS when the write was posted or queued,
 *         OPAL_ERR_BAD_PARAM when size exceeds btl_put_limit,
 *         OPAL_ERR_OUT_OF_RESOURCE when no fragment could be allocated,
 *         or the error reported by check_endpoint_state() /
 *         mca_btl_openib_put_internal()
 */
int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address,
                        uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                        mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
                        int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    mca_btl_openib_put_frag_t *frag = NULL;
    int rc, qp = order;

    if (OPAL_UNLIKELY(size > btl->btl_put_limit)) {
        return OPAL_ERR_BAD_PARAM;
    }

    frag = to_put_frag(alloc_send_user_frag ());
    if (OPAL_UNLIKELY(NULL == frag)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    if (MCA_BTL_NO_ORDER == qp) {
        qp = mca_btl_openib_component.rdma_qp;
    }

    /* set base descriptor flags */
    to_base_frag(frag)->base.order = qp;
    /* free this descriptor when the operation is complete */
    to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;

    /* set up scatter-gather entry */
    to_com_frag(frag)->sg_entry.length = size;
    to_com_frag(frag)->sg_entry.lkey = local_handle->lkey;
    /* go through uintptr_t (unsigned) so the address is zero-extended, not
     * sign-extended, when pointers are narrower than 64 bits */
    to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t) local_address;
    to_com_frag(frag)->endpoint = ep;

    /* set up rdma callback */
    frag->cb.func = cbfunc;
    frag->cb.context = cbcontext;
    frag->cb.data = cbdata;
    frag->cb.local_handle = local_handle;

    /* set up descriptor */
    to_out_frag(frag)->sr_desc.opcode = IBV_WR_RDMA_WRITE;
    to_out_frag(frag)->sr_desc.send_flags = ib_send_flags(size, &(ep->qps[qp]), 1);
    to_out_frag(frag)->sr_desc.wr.rdma.remote_addr = remote_address;

    /* NOTE(review): mca_btl_openib_put_internal() performs these same two
     * calls again right before posting; confirm whether doing the inflight
     * accounting here as well is intended */
    qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
    qp_reset_signal_count(ep, qp);

#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    /* the remote key is byte-swapped when the peer's endianness differs */
    if ((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
        != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
        to_out_frag(frag)->sr_desc.wr.rdma.rkey = opal_swap_bytes4(remote_handle->rkey);
    } else
#endif
    {
        to_out_frag(frag)->sr_desc.wr.rdma.rkey = remote_handle->rkey;
    }

#if HAVE_XRC
    if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
        to_out_frag(frag)->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num;
#else
        to_out_frag(frag)->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
#endif
    }
#endif

    if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
        OPAL_THREAD_LOCK(&ep->endpoint_lock);
        rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_put_frags);
        OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
        if (OPAL_ERR_RESOURCE_BUSY == rc) {
            /* descriptor was queued pending connection */
            return OPAL_SUCCESS;
        }

        if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
            MCA_BTL_IB_FRAG_RETURN (frag);
            return rc;
        }
    }

    rc = mca_btl_openib_put_internal (btl, ep, frag);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
            rc = OPAL_SUCCESS;

            /* queue the fragment for when resources are available */
            OPAL_THREAD_LOCK(&ep->endpoint_lock);
            opal_list_append(&ep->pending_put_frags, (opal_list_item_t*)frag);
            OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
        } else {
            /* hard failure: give the fragment back and report the error */
            MCA_BTL_IB_FRAG_RETURN (frag);
        }
    }

    return rc;
}
|
||||
|
||||
int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
|
||||
mca_btl_openib_put_frag_t *frag)
|
||||
{
|
||||
int qp = to_base_frag(frag)->base.order;
|
||||
struct ibv_send_wr *bad_wr;
|
||||
int rc;
|
||||
|
||||
/* check for a send wqe */
|
||||
if (qp_get_wqe(ep, qp) < 0) {
|
||||
qp_put_wqe(ep, qp);
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
|
||||
qp_reset_signal_count(ep, qp);
|
||||
|
||||
if (0 != (rc = ibv_post_send(ep->qps[qp].qp->lcl_qp, &to_out_frag(frag)->sr_desc, &bad_wr))) {
|
||||
qp_put_wqe(ep, qp);
|
||||
return OPAL_ERROR;;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
@ -119,7 +119,6 @@ AC_DEFUN([MCA_opal_btl_openib_CONFIG],[
|
||||
[enable openib BTL failover])
|
||||
AM_CONDITIONAL([MCA_btl_openib_enable_failover], [test "x$btl_openib_failover_enabled" = "x1"])
|
||||
|
||||
|
||||
# Check for __malloc_hook availability
|
||||
AC_ARG_ENABLE(btl-openib-malloc-alignment,
|
||||
AC_HELP_STRING([--enable-btl-openib-malloc-alignment], [Enable support for allocated memory alignment. Default: enabled if supported, disabled otherwise.]))
|
||||
|
@ -322,20 +322,21 @@ static int udcm_send_request (mca_btl_base_endpoint_t *lcl_ep,
|
||||
|
||||
static void udcm_send_timeout (evutil_socket_t fd, short event, void *arg);
|
||||
static int udcm_finish_connection (mca_btl_openib_endpoint_t *lcl_ep);
|
||||
static int udcm_rc_qps_to_rts(mca_btl_openib_endpoint_t *lcl_ep);
|
||||
|
||||
/* XRC support */
|
||||
#if HAVE_XRC
|
||||
static int udcm_xrc_start_connect (opal_btl_openib_connect_base_module_t *cpc,
|
||||
mca_btl_base_endpoint_t *lcl_ep);
|
||||
static int udcm_xrc_restart_connect (mca_btl_base_endpoint_t *lcl_ep);
|
||||
static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr);
|
||||
static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn);
|
||||
static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep);
|
||||
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
|
||||
static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t qp_num);
|
||||
#else
|
||||
static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep);
|
||||
#endif
|
||||
static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr);
|
||||
static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn);
|
||||
static int udcm_xrc_send_request (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_endpoint_t *rem_ep,
|
||||
uint8_t msg_type);
|
||||
static int udcm_xrc_send_xresponse (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_endpoint_t *rem_ep,
|
||||
@ -512,6 +513,96 @@ static int udcm_component_finalize(void)
|
||||
|
||||
/* mark: udcm module */
|
||||
|
||||
#if HAVE_XRC
|
||||
static int udcm_endpoint_init_self_xrc (struct mca_btl_base_endpoint_t *lcl_ep)
|
||||
{
|
||||
udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep);
|
||||
int rc;
|
||||
|
||||
opal_mutex_lock (&udep->udep_lock);
|
||||
do {
|
||||
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
|
||||
rc = udcm_xrc_recv_qp_connect (lcl_ep, lcl_ep->qps[0].qp->lcl_qp->qp_num);
|
||||
#else
|
||||
lcl_ep->xrc_recv_qp_num = lcl_ep->qps[0].qp->lcl_qp->qp_num;
|
||||
rc = udcm_xrc_recv_qp_connect (lcl_ep);
|
||||
#endif
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
BTL_VERBOSE(("error connecting loopback XRC receive queue pair"));
|
||||
break;
|
||||
}
|
||||
|
||||
rc = mca_btl_openib_endpoint_post_recvs (lcl_ep);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
BTL_VERBOSE(("error posting receives for loopback queue pair"));
|
||||
break;
|
||||
}
|
||||
|
||||
rc = udcm_xrc_recv_qp_create (lcl_ep, lcl_ep->qps[0].qp->lcl_qp->qp_num,
|
||||
lcl_ep->qps[0].qp->lcl_psn);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
BTL_VERBOSE(("error creating loopback XRC receive queue pair"));
|
||||
break;
|
||||
}
|
||||
|
||||
rc = udcm_xrc_send_qp_connect (lcl_ep, lcl_ep->qps[0].qp->lcl_qp->qp_num,
|
||||
lcl_ep->qps[0].qp->lcl_psn);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
BTL_VERBOSE(("error creating loopback XRC send queue pair"));
|
||||
break;
|
||||
}
|
||||
|
||||
lcl_ep->endpoint_state = MCA_BTL_IB_CONNECTED;
|
||||
|
||||
rc = udcm_finish_connection (lcl_ep);
|
||||
} while (0);
|
||||
opal_mutex_unlock (&udep->udep_lock);
|
||||
|
||||
return rc;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int udcm_endpoint_init_self (struct mca_btl_base_endpoint_t *lcl_ep)
|
||||
{
|
||||
udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep);
|
||||
int rc;
|
||||
|
||||
opal_mutex_lock (&udep->udep_lock);
|
||||
do {
|
||||
if (OPAL_SUCCESS != (rc = udcm_endpoint_init_data (lcl_ep))) {
|
||||
BTL_VERBOSE(("error initializing loopback endpoint cpc data"));
|
||||
break;
|
||||
}
|
||||
|
||||
if (OPAL_SUCCESS != (rc = udcm_rc_qp_create_all (lcl_ep))) {
|
||||
BTL_VERBOSE(("error initializing loopback endpoint qps"));
|
||||
break;
|
||||
}
|
||||
|
||||
/* save queue pair info */
|
||||
lcl_ep->rem_info.rem_index = lcl_ep->index;
|
||||
|
||||
for (int i = 0 ; i < mca_btl_openib_component.num_qps ; ++i) {
|
||||
lcl_ep->rem_info.rem_qps[i].rem_psn = lcl_ep->qps[i].qp->lcl_psn;
|
||||
lcl_ep->rem_info.rem_qps[i].rem_qp_num = lcl_ep->qps[i].qp->lcl_qp->qp_num;
|
||||
}
|
||||
|
||||
if (OPAL_SUCCESS != (rc = udcm_rc_qps_to_rts (lcl_ep))) {
|
||||
BTL_VERBOSE(("error moving loopback endpoint qps to RTS"));
|
||||
break;
|
||||
}
|
||||
|
||||
lcl_ep->endpoint_state = MCA_BTL_IB_CONNECTED;
|
||||
|
||||
rc = udcm_finish_connection (lcl_ep);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
} while (0);
|
||||
opal_mutex_unlock (&udep->udep_lock);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int udcm_endpoint_init (struct mca_btl_base_endpoint_t *lcl_ep)
|
||||
{
|
||||
udcm_endpoint_t *udep = lcl_ep->endpoint_local_cpc_data =
|
||||
@ -523,6 +614,16 @@ static int udcm_endpoint_init (struct mca_btl_base_endpoint_t *lcl_ep)
|
||||
|
||||
OBJ_CONSTRUCT(&udep->udep_lock, opal_mutex_t);
|
||||
|
||||
if (lcl_ep->endpoint_proc->proc_opal == opal_proc_local_get ()) {
|
||||
/* go ahead and try to create a loopback queue pair */
|
||||
#if HAVE_XRC
|
||||
if (mca_btl_openib_component.num_xrc_qps > 0) {
|
||||
return udcm_endpoint_init_self_xrc (lcl_ep);
|
||||
} else
|
||||
#endif
|
||||
return udcm_endpoint_init_self (lcl_ep);
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
@ -1072,6 +1173,9 @@ static inline int udcm_rc_qp_to_init (struct ibv_qp *qp,
|
||||
attr.pkey_index = btl->pkey_index;
|
||||
attr.port_num = btl->port_num;
|
||||
attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
|
||||
#if HAVE_DECL_IBV_ATOMIC_HCA
|
||||
attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC;
|
||||
#endif
|
||||
attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT |
|
||||
IBV_QP_ACCESS_FLAGS;
|
||||
|
||||
@ -2313,7 +2417,7 @@ static int udcm_xrc_restart_connect (mca_btl_base_endpoint_t *lcl_ep)
|
||||
/* mark: xrc send qp */
|
||||
|
||||
/* Send qp connect */
|
||||
static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr)
|
||||
static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn)
|
||||
{
|
||||
mca_btl_openib_module_t *openib_btl = lcl_ep->endpoint_btl;
|
||||
struct ibv_qp_attr attr;
|
||||
@ -2322,7 +2426,7 @@ static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg
|
||||
int ret;
|
||||
|
||||
BTL_VERBOSE(("Connecting send qp: %p, remote qp: %d", (void *)lcl_ep->qps[0].qp->lcl_qp,
|
||||
msg_hdr->data.xres.rem_qp_num));
|
||||
rem_qp_num));
|
||||
assert(NULL != lcl_ep->qps);
|
||||
qp = lcl_ep->qps[0].qp->lcl_qp;
|
||||
psn = lcl_ep->qps[0].qp->lcl_psn;
|
||||
@ -2332,8 +2436,8 @@ static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg
|
||||
attr.qp_state = IBV_QPS_RTR;
|
||||
attr.path_mtu = (openib_btl->device->mtu < lcl_ep->rem_info.rem_mtu) ?
|
||||
openib_btl->device->mtu : lcl_ep->rem_info.rem_mtu;
|
||||
attr.dest_qp_num = msg_hdr->data.xres.rem_qp_num;
|
||||
attr.rq_psn = msg_hdr->data.xres.rem_psn;
|
||||
attr.dest_qp_num = rem_qp_num;
|
||||
attr.rq_psn = rem_psn;
|
||||
attr.max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops;
|
||||
attr.min_rnr_timer = mca_btl_openib_component.ib_min_rnr_timer;
|
||||
attr.ah_attr.is_global = 0;
|
||||
@ -2481,6 +2585,9 @@ static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep)
|
||||
attr.pkey_index = openib_btl->pkey_index;
|
||||
attr.port_num = openib_btl->port_num;
|
||||
attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
|
||||
#if HAVE_DECL_IBV_ATOMIC_HCA
|
||||
attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC;
|
||||
#endif
|
||||
ret = ibv_modify_qp(*qp, &attr,
|
||||
IBV_QP_STATE |
|
||||
IBV_QP_PKEY_INDEX |
|
||||
@ -2546,7 +2653,7 @@ static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep)
|
||||
}
|
||||
|
||||
/* Recv qp create */
|
||||
static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr)
|
||||
static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn)
|
||||
{
|
||||
mca_btl_openib_module_t* openib_btl = lcl_ep->endpoint_btl;
|
||||
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
|
||||
@ -2588,6 +2695,11 @@ static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_
|
||||
attr.pkey_index = openib_btl->pkey_index;
|
||||
attr.port_num = openib_btl->port_num;
|
||||
attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
|
||||
|
||||
#if HAVE_DECL_IBV_ATOMIC_HCA
|
||||
attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC;
|
||||
#endif
|
||||
|
||||
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
|
||||
ret = ibv_modify_qp(lcl_ep->xrc_recv_qp,
|
||||
&attr,
|
||||
@ -2617,8 +2729,8 @@ static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_
|
||||
attr.qp_state = IBV_QPS_RTR;
|
||||
attr.path_mtu = (openib_btl->device->mtu < lcl_ep->rem_info.rem_mtu) ?
|
||||
openib_btl->device->mtu : lcl_ep->rem_info.rem_mtu;
|
||||
attr.dest_qp_num = msg_hdr->data.xreq.rem_qp_num;
|
||||
attr.rq_psn = msg_hdr->data.xreq.rem_psn;
|
||||
attr.dest_qp_num = rem_qp_num;
|
||||
attr.rq_psn = rem_psn;
|
||||
attr.max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops;
|
||||
attr.min_rnr_timer = mca_btl_openib_component.ib_min_rnr_timer;
|
||||
attr.ah_attr.is_global = 0;
|
||||
@ -2834,7 +2946,7 @@ static int udcm_xrc_handle_xconnect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg
|
||||
|
||||
response_type = UDCM_MESSAGE_XRESPONSE;
|
||||
|
||||
rc = udcm_xrc_recv_qp_create (lcl_ep, msg_hdr);
|
||||
rc = udcm_xrc_recv_qp_create (lcl_ep, msg_hdr->data.xreq.rem_qp_num, msg_hdr->data.xreq.rem_psn);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
break;
|
||||
}
|
||||
@ -2880,7 +2992,7 @@ static int udcm_xrc_handle_xresponse (mca_btl_openib_endpoint_t *lcl_ep, udcm_ms
|
||||
|
||||
udep->recv_resp = true;
|
||||
|
||||
rc = udcm_xrc_send_qp_connect (lcl_ep, msg_hdr);
|
||||
rc = udcm_xrc_send_qp_connect (lcl_ep, msg_hdr->data.xres.rem_qp_num, msg_hdr->data.xres.rem_psn);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
mca_btl_openib_endpoint_invoke_error (lcl_ep);
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
@ -98,7 +98,7 @@ typedef struct mca_btl_scif_module_t {
|
||||
|
||||
typedef struct mca_btl_scif_component_t {
|
||||
/* base BTL component */
|
||||
mca_btl_base_component_2_0_0_t super;
|
||||
mca_btl_base_component_3_0_0_t super;
|
||||
|
||||
/* DMA free list settings */
|
||||
int scif_free_list_num;
|
||||
@ -197,29 +197,21 @@ int mca_btl_scif_sendi (struct mca_btl_base_module_t *btl,
|
||||
* Initiate a get operation.
|
||||
*
|
||||
* location: btl_scif_get.c
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
int
|
||||
mca_btl_scif_get (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *des);
|
||||
int mca_btl_scif_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
/**
|
||||
* Initiate a put operation.
|
||||
*
|
||||
* location: btl_scif_put.c
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
int
|
||||
mca_btl_scif_put (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *des);
|
||||
int mca_btl_scif_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
mca_btl_base_descriptor_t *
|
||||
mca_btl_scif_alloc(struct mca_btl_base_module_t *btl,
|
||||
@ -228,9 +220,25 @@ mca_btl_scif_alloc(struct mca_btl_base_module_t *btl,
|
||||
|
||||
int mca_btl_scif_progress_send_wait_list (struct mca_btl_base_endpoint_t *endpoint);
|
||||
|
||||
struct mca_btl_scif_reg_t;
|
||||
|
||||
/**
 * scif definition of the BTL registration handle.
 *
 * The get path turns an address into a scif offset as
 * scif_offset + (address - scif_base).
 */
struct mca_btl_base_registration_handle_t {
    /** offset returned by scif_register() for this region; (off_t) -1 while
     *  the region is not registered with the endpoint */
    off_t scif_offset;

    /** base address of the registered region */
    uintptr_t scif_base;
};
|
||||
|
||||
struct mca_btl_scif_registration_handle_t {
|
||||
mca_btl_base_registration_handle_t btl_handle;
|
||||
struct mca_btl_scif_reg_t *reg;
|
||||
};
|
||||
typedef struct mca_btl_scif_registration_handle_t mca_btl_scif_registration_handle_t;
|
||||
|
||||
typedef struct mca_btl_scif_reg_t {
|
||||
mca_mpool_base_registration_t base;
|
||||
off_t *registrations;
|
||||
/** per-endpoint btl handles for this registration */
|
||||
mca_btl_scif_registration_handle_t *handles;
|
||||
} mca_btl_scif_reg_t;
|
||||
|
||||
/* Global structures */
|
||||
|
@ -165,14 +165,14 @@ static int scif_dereg_mem (void *reg_data, mca_mpool_base_registration_t *reg)
|
||||
|
||||
/* register the fragment with all connected endpoints */
|
||||
for (i = 0 ; i < (int) mca_btl_scif_module.endpoint_count ; ++i) {
|
||||
if ((off_t)-1 != scif_reg->registrations[i] &&
|
||||
if ((off_t)-1 != scif_reg->handles[i].btl_handle.scif_offset &&
|
||||
MCA_BTL_SCIF_EP_STATE_CONNECTED == mca_btl_scif_module.endpoints[i].state) {
|
||||
(void) scif_unregister(mca_btl_scif_module.endpoints[i].scif_epd,
|
||||
scif_reg->registrations[i], size);
|
||||
scif_reg->handles[i].btl_handle.scif_offset, size);
|
||||
}
|
||||
}
|
||||
|
||||
free (scif_reg->registrations);
|
||||
free (scif_reg->handles);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
@ -184,17 +184,22 @@ static int scif_reg_mem (void *reg_data, void *base, size_t size,
|
||||
int rc = OPAL_SUCCESS;
|
||||
unsigned int i;
|
||||
|
||||
scif_reg->registrations = calloc (mca_btl_scif_module.endpoint_count,
|
||||
sizeof (off_t));
|
||||
memset (scif_reg->registrations, -1, mca_btl_scif_module.endpoint_count * sizeof (off_t));
|
||||
scif_reg->handles = calloc (mca_btl_scif_module.endpoint_count, sizeof (scif_reg->handles[0]));
|
||||
|
||||
/* initialize all scif offsets to -1 and initialize the pointer back to the mpool registration */
|
||||
for (i = 0 ; i < mca_btl_scif_module.endpoint_count ; ++i) {
|
||||
scif_reg->handles[i].btl_handle.scif_offset = -1;
|
||||
scif_reg->handles[i].btl_handle.scif_base = (intptr_t) base;
|
||||
scif_reg->handles[i].reg = scif_reg;
|
||||
}
|
||||
|
||||
/* register the pointer with all connected endpoints */
|
||||
for (i = 0 ; i < mca_btl_scif_module.endpoint_count ; ++i) {
|
||||
if (MCA_BTL_SCIF_EP_STATE_CONNECTED == mca_btl_scif_module.endpoints[i].state) {
|
||||
scif_reg->registrations[i] = scif_register(mca_btl_scif_module.endpoints[i].scif_epd,
|
||||
base, size, 0, SCIF_PROT_READ |
|
||||
SCIF_PROT_WRITE, 0);
|
||||
if (SCIF_REGISTER_FAILED == scif_reg->registrations[i]) {
|
||||
scif_reg->handles[i].btl_handle.scif_offset = scif_register (mca_btl_scif_module.endpoints[i].scif_epd,
|
||||
base, size, 0, SCIF_PROT_READ |
|
||||
SCIF_PROT_WRITE, 0);
|
||||
if (SCIF_REGISTER_FAILED == scif_reg->handles[i].btl_handle.scif_offset) {
|
||||
/* cleanup */
|
||||
scif_dereg_mem (reg_data, reg);
|
||||
rc = OPAL_ERR_OUT_OF_RESOURCE;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
@ -171,7 +171,7 @@ static int btl_scif_component_register(void)
|
||||
mca_btl_scif_module.super.btl_flags = MCA_BTL_FLAGS_SEND |
|
||||
MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE;
|
||||
|
||||
mca_btl_scif_module.super.btl_seg_size = sizeof (mca_btl_scif_segment_t);
|
||||
mca_btl_scif_module.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
|
||||
|
||||
mca_btl_scif_module.super.btl_bandwidth = 50000; /* Mbs */
|
||||
mca_btl_scif_module.super.btl_latency = 2; /* Microsecs */
|
||||
@ -329,11 +329,11 @@ static int mca_btl_scif_progress_recvs (mca_btl_base_endpoint_t *ep)
|
||||
* the fragment without introducing another copy here. this
|
||||
* limitation has not appeared to cause any performance
|
||||
* problems. */
|
||||
frag.base.des_local_count = 1;
|
||||
frag.segments[0].base.seg_len = hdr->size;
|
||||
frag.segments[0].base.seg_addr.pval = (void *) (hdr + 1);
|
||||
frag.base.des_segment_count = 1;
|
||||
frag.segments[0].seg_len = hdr->size;
|
||||
frag.segments[0].seg_addr.pval = (void *) (hdr + 1);
|
||||
|
||||
frag.base.des_local = &frag.segments[0].base;
|
||||
frag.base.des_segments = frag.segments;
|
||||
|
||||
/* call the registered callback function */
|
||||
reg->cbfunc(&mca_btl_scif_module.super, hdr->tag, &frag.base, reg->cbdata);
|
||||
|
@ -15,13 +15,13 @@
|
||||
static inline void mca_btl_scif_base_frag_constructor (mca_btl_scif_base_frag_t *frag)
|
||||
{
|
||||
memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base));
|
||||
frag->segments[0].base.seg_addr.pval = frag->base.super.ptr;
|
||||
frag->segments[0].seg_addr.pval = frag->base.super.ptr;
|
||||
}
|
||||
|
||||
static inline void mca_btl_scif_eager_frag_constructor (mca_btl_scif_base_frag_t *frag)
|
||||
{
|
||||
memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base));
|
||||
frag->segments[0].base.seg_addr.pval = frag->base.super.ptr;
|
||||
frag->segments[0].seg_addr.pval = frag->base.super.ptr;
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_btl_scif_eager_frag_t, mca_btl_base_descriptor_t,
|
||||
|
@ -15,16 +15,6 @@
|
||||
#include "btl_scif.h"
|
||||
#include "btl_scif_endpoint.h"
|
||||
|
||||
typedef struct mca_btl_scif_segment_t {
|
||||
mca_btl_base_segment_t base;
|
||||
|
||||
/* scif offset */
|
||||
off_t scif_offset;
|
||||
|
||||
/* original pointer */
|
||||
uint64_t orig_ptr;
|
||||
} mca_btl_scif_segment_t;
|
||||
|
||||
typedef struct mca_btl_scif_frag_hdr_t {
|
||||
#if defined(SCIF_USE_SEQ)
|
||||
uint32_t seq;
|
||||
@ -41,7 +31,7 @@ typedef void (*frag_cb_t) (struct mca_btl_scif_base_frag_t *, int);
|
||||
typedef struct mca_btl_scif_base_frag_t {
|
||||
mca_btl_base_descriptor_t base;
|
||||
mca_btl_scif_frag_hdr_t hdr;
|
||||
mca_btl_scif_segment_t segments[2];
|
||||
mca_btl_base_segment_t segments[2];
|
||||
mca_btl_base_endpoint_t *endpoint;
|
||||
mca_btl_scif_reg_t *registration;
|
||||
ompi_free_list_t *my_list;
|
||||
@ -78,9 +68,9 @@ static inline int mca_btl_scif_frag_return (mca_btl_scif_base_frag_t *frag)
|
||||
frag->registration = NULL;
|
||||
}
|
||||
|
||||
frag->segments[0].base.seg_addr.pval = frag->base.super.ptr;
|
||||
frag->segments[0].base.seg_len = 0;
|
||||
frag->segments[1].base.seg_len = 0;
|
||||
frag->segments[0].seg_addr.pval = frag->base.super.ptr;
|
||||
frag->segments[0].seg_len = 0;
|
||||
frag->segments[1].seg_len = 0;
|
||||
|
||||
OMPI_FREE_LIST_RETURN_MT(frag->my_list, (ompi_free_list_item_t *) frag);
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
@ -20,18 +20,13 @@
|
||||
|
||||
/**
|
||||
* Initiate a get operation.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
int mca_btl_scif_get (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *des) {
|
||||
mca_btl_scif_segment_t *src = (mca_btl_scif_segment_t *) des->des_remote;
|
||||
mca_btl_scif_segment_t *dst = (mca_btl_scif_segment_t *) des->des_local;
|
||||
size_t len = lmin (src->base.seg_len, dst->base.seg_len);
|
||||
int rc, mark, flags = 0;
|
||||
int mca_btl_scif_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
int rc, mark, scif_flags = 0;
|
||||
off_t roffset, loffset;
|
||||
#if defined(SCIF_TIMING)
|
||||
struct timespec ts;
|
||||
@ -41,30 +36,27 @@ int mca_btl_scif_get (struct mca_btl_base_module_t *btl,
|
||||
mca_btl_scif_component.get_count++;
|
||||
#endif
|
||||
|
||||
BTL_VERBOSE(("Using DMA Get for frag %p from offset %lu", (void *) des,
|
||||
(unsigned long) src->scif_offset));
|
||||
BTL_VERBOSE(("Using DMA Get from remote address %" PRIx64 " to local address %p",
|
||||
remote_address, local_address));
|
||||
|
||||
roffset = src->scif_offset + (off_t)(src->orig_ptr - src->base.seg_addr.lval);
|
||||
loffset = dst->scif_offset + (off_t)(dst->orig_ptr - dst->base.seg_addr.lval);
|
||||
roffset = remote_handle->scif_offset + (off_t)(remote_address - remote_handle->scif_base);
|
||||
loffset = local_handle->scif_offset + (off_t)((intptr_t)local_address - local_handle->scif_base);
|
||||
|
||||
if (mca_btl_scif_component.rma_use_cpu) {
|
||||
flags = SCIF_RMA_USECPU;
|
||||
scif_flags = SCIF_RMA_USECPU;
|
||||
}
|
||||
|
||||
if (mca_btl_scif_component.rma_sync) {
|
||||
flags |= SCIF_RMA_SYNC;
|
||||
scif_flags |= SCIF_RMA_SYNC;
|
||||
}
|
||||
|
||||
/* start the read */
|
||||
rc = scif_readfrom (endpoint->scif_epd, loffset, len, roffset, flags);
|
||||
rc = scif_readfrom (endpoint->scif_epd, loffset, size, roffset, scif_flags);
|
||||
if (OPAL_UNLIKELY(-1 == rc)) {
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
/* always call the callback function */
|
||||
des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||
|
||||
if (!(flags & SCIF_RMA_SYNC)) {
|
||||
if (!(scif_flags & SCIF_RMA_SYNC)) {
|
||||
/* according to the scif documentation it is better to use a fence rather
|
||||
* than using the SCIF_RMA_SYNC flag with scif_readfrom */
|
||||
scif_fence_mark (endpoint->scif_epd, SCIF_FENCE_INIT_SELF, &mark);
|
||||
@ -76,8 +68,8 @@ int mca_btl_scif_get (struct mca_btl_base_module_t *btl,
|
||||
mca_btl_scif_component.get_time_max, ts);
|
||||
#endif
|
||||
|
||||
/* since we completed the fence the RMA operation is complete */
|
||||
mca_btl_scif_frag_complete ((mca_btl_scif_base_frag_t *) des, OPAL_SUCCESS);
|
||||
/* always call the callback function */
|
||||
cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
@ -24,17 +24,14 @@ mca_btl_scif_free (struct mca_btl_base_module_t *btl,
|
||||
static int
|
||||
mca_btl_scif_module_finalize (struct mca_btl_base_module_t* btl);
|
||||
|
||||
static mca_btl_base_descriptor_t *
|
||||
mca_btl_scif_prepare_dst (mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
mca_mpool_base_registration_t *registration,
|
||||
opal_convertor_t *convertor, uint8_t order,
|
||||
size_t reserve, size_t *size, uint32_t flags);
|
||||
static mca_btl_base_registration_handle_t *mca_btl_scif_register_mem (struct mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
void *base, size_t size, uint32_t flags);
|
||||
static int mca_btl_scif_deregister_mem (struct mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle);
|
||||
|
||||
static struct mca_btl_base_descriptor_t *
|
||||
mca_btl_scif_prepare_src (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
mca_mpool_base_registration_t *registration,
|
||||
struct opal_convertor_t *convertor,
|
||||
uint8_t order, size_t reserve, size_t *size,
|
||||
uint32_t flags);
|
||||
@ -48,11 +45,12 @@ mca_btl_scif_module_t mca_btl_scif_module = {
|
||||
.btl_alloc = mca_btl_scif_alloc,
|
||||
.btl_free = mca_btl_scif_free,
|
||||
.btl_prepare_src = mca_btl_scif_prepare_src,
|
||||
.btl_prepare_dst = mca_btl_scif_prepare_dst,
|
||||
.btl_send = mca_btl_scif_send,
|
||||
.btl_sendi = mca_btl_scif_sendi,
|
||||
.btl_put = mca_btl_scif_put,
|
||||
.btl_get = mca_btl_scif_get,
|
||||
.btl_register_mem = mca_btl_scif_register_mem,
|
||||
.btl_deregister_mem = mca_btl_scif_deregister_mem,
|
||||
}
|
||||
};
|
||||
|
||||
@ -163,10 +161,10 @@ mca_btl_scif_alloc(struct mca_btl_base_module_t *btl,
|
||||
|
||||
frag->base.des_flags = flags;
|
||||
frag->base.order = order;
|
||||
frag->base.des_local = &frag->segments[0].base;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_segments = frag->segments;
|
||||
frag->base.des_segment_count = 1;
|
||||
|
||||
frag->segments[0].base.seg_len = size;
|
||||
frag->segments[0].seg_len = size;
|
||||
|
||||
return &frag->base;
|
||||
}
|
||||
@ -178,16 +176,19 @@ mca_btl_scif_free (struct mca_btl_base_module_t *btl,
|
||||
return mca_btl_scif_frag_return ((mca_btl_scif_base_frag_t *) des);
|
||||
}
|
||||
|
||||
static inline mca_btl_base_descriptor_t *mca_btl_scif_prepare_dma (struct mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
void *data_ptr, size_t size,
|
||||
mca_mpool_base_registration_t *registration,
|
||||
uint8_t order, uint32_t flags)
|
||||
static mca_btl_base_registration_handle_t *mca_btl_scif_register_mem (struct mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
void *base, size_t size, uint32_t flags)
|
||||
{
|
||||
mca_btl_scif_base_frag_t *frag;
|
||||
mca_btl_scif_reg_t *scif_reg;
|
||||
int rc;
|
||||
|
||||
if (MCA_BTL_ENDPOINT_ANY == endpoint) {
|
||||
/* it probably isn't possible to support registering memory to use with any endpoint so
|
||||
* return NULL */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (OPAL_LIKELY(MCA_BTL_SCIF_EP_STATE_CONNECTED != endpoint->state)) {
|
||||
/* the endpoint needs to be connected before the fragment can be
|
||||
* registered. */
|
||||
@ -198,67 +199,36 @@ static inline mca_btl_base_descriptor_t *mca_btl_scif_prepare_dma (struct mca_bt
|
||||
}
|
||||
}
|
||||
|
||||
(void) MCA_BTL_SCIF_FRAG_ALLOC_DMA(endpoint, frag);
|
||||
if (OPAL_UNLIKELY(NULL == frag)) {
|
||||
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, base, size, 0,
|
||||
(mca_mpool_base_registration_t **) &scif_reg);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (NULL == registration) {
|
||||
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, data_ptr, size, 0,
|
||||
(mca_mpool_base_registration_t **) ®istration);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
mca_btl_scif_frag_return (frag);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
frag->registration = (mca_btl_scif_reg_t *) registration;
|
||||
}
|
||||
|
||||
scif_reg = (mca_btl_scif_reg_t *) registration;
|
||||
|
||||
/* register the memory location with this peer if it isn't already */
|
||||
if ((off_t) -1 == scif_reg->registrations[endpoint->id]) {
|
||||
size_t seg_size = (size_t)((uintptr_t) registration->bound - (uintptr_t) registration->base) + 1;
|
||||
scif_reg->registrations[endpoint->id] = scif_register (endpoint->scif_epd, registration->base,
|
||||
seg_size, 0, SCIF_PROT_READ |
|
||||
SCIF_PROT_WRITE, 0);
|
||||
if ((off_t) -1 == scif_reg->handles[endpoint->id].btl_handle.scif_offset) {
|
||||
size_t seg_size = (size_t)((uintptr_t) scif_reg->base.bound - (uintptr_t) scif_reg->base.base) + 1;
|
||||
|
||||
/* NTH: until we determine a way to pass permissions to the mpool just make all segments
|
||||
* read/write */
|
||||
scif_reg->handles[endpoint->id].btl_handle.scif_offset =
|
||||
scif_register (endpoint->scif_epd, scif_reg->base.base, seg_size, 0, SCIF_PROT_READ |
|
||||
SCIF_PROT_WRITE, 0);
|
||||
BTL_VERBOSE(("registered fragment for scif DMA transaction. offset = %lu",
|
||||
(unsigned long) scif_reg->registrations[endpoint->id]));
|
||||
(unsigned long) scif_reg->handles[endpoint->id].btl_handle.scif_offset));
|
||||
}
|
||||
|
||||
if (OPAL_UNLIKELY((off_t) -1 == scif_reg->registrations[endpoint->id])) {
|
||||
mca_btl_scif_frag_return (frag);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
frag->segments[0].base.seg_addr.lval = (uint64_t)(uintptr_t) data_ptr;
|
||||
frag->segments[0].base.seg_len = size;
|
||||
frag->segments[0].scif_offset = scif_reg->registrations[endpoint->id] +
|
||||
(off_t) ((ptrdiff_t) data_ptr - (ptrdiff_t) registration->base);
|
||||
/* save the original pointer so the offset can be adjusted if needed (this is
|
||||
* required for osc/rdma) */
|
||||
frag->segments[0].orig_ptr = (uint64_t)(uintptr_t) data_ptr;
|
||||
frag->base.order = order;
|
||||
frag->base.des_flags = flags;
|
||||
|
||||
frag->base.des_local = &frag->segments->base;
|
||||
frag->base.des_local_count = 1;
|
||||
|
||||
return &frag->base;
|
||||
return &scif_reg->handles[endpoint->id].btl_handle;
|
||||
}
|
||||
|
||||
static inline mca_btl_base_descriptor_t *mca_btl_scif_prepare_dma_conv (struct mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
mca_mpool_base_registration_t *registration,
|
||||
struct opal_convertor_t *convertor,
|
||||
uint8_t order, size_t *size,
|
||||
uint32_t flags)
|
||||
static int mca_btl_scif_deregister_mem (struct mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle)
|
||||
{
|
||||
void *data_ptr;
|
||||
mca_btl_scif_registration_handle_t *scif_handle = (mca_btl_scif_registration_handle_t *) handle;
|
||||
mca_btl_scif_reg_t *scif_reg = scif_handle->reg;
|
||||
|
||||
opal_convertor_get_current_pointer (convertor, &data_ptr);
|
||||
btl->btl_mpool->mpool_deregister (btl->btl_mpool, &scif_reg->base);
|
||||
|
||||
return mca_btl_scif_prepare_dma (btl, endpoint, data_ptr, *size, registration, order, flags);
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
static inline struct mca_btl_base_descriptor_t *
|
||||
@ -286,10 +256,10 @@ mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
frag->segments[0].base.seg_len = reserve;
|
||||
frag->segments[1].base.seg_addr.pval = data_ptr;
|
||||
frag->segments[1].base.seg_len = *size;
|
||||
frag->base.des_local_count = 2;
|
||||
frag->segments[0].seg_len = reserve;
|
||||
frag->segments[1].seg_addr.pval = data_ptr;
|
||||
frag->segments[1].seg_len = *size;
|
||||
frag->base.des_segment_count = 2;
|
||||
} else {
|
||||
/* buffered send */
|
||||
(void) MCA_BTL_SCIF_FRAG_ALLOC_EAGER(endpoint, frag);
|
||||
@ -299,7 +269,7 @@ mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl,
|
||||
|
||||
if (*size) {
|
||||
iov.iov_len = *size;
|
||||
iov.iov_base = (IOVBASE_TYPE *) ((uintptr_t) frag->segments[0].base.seg_addr.pval + reserve);
|
||||
iov.iov_base = (IOVBASE_TYPE *) ((uintptr_t) frag->segments[0].seg_addr.pval + reserve);
|
||||
|
||||
rc = opal_convertor_pack (convertor, &iov, &iov_count, &max_size);
|
||||
if (OPAL_UNLIKELY(rc < 0)) {
|
||||
@ -309,37 +279,22 @@ mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl,
|
||||
*size = max_size;
|
||||
}
|
||||
|
||||
frag->segments[0].base.seg_len = reserve + *size;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->segments[0].seg_len = reserve + *size;
|
||||
frag->base.des_segment_count = 1;
|
||||
}
|
||||
|
||||
frag->base.des_local = &frag->segments->base;
|
||||
frag->base.order = order;
|
||||
frag->base.des_flags = flags;
|
||||
frag->base.des_segments = frag->segments;
|
||||
frag->base.order = order;
|
||||
frag->base.des_flags = flags;
|
||||
|
||||
return &frag->base;
|
||||
}
|
||||
|
||||
static mca_btl_base_descriptor_t *mca_btl_scif_prepare_src (struct mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
mca_mpool_base_registration_t *registration,
|
||||
struct opal_convertor_t *convertor,
|
||||
uint8_t order, size_t reserve, size_t *size,
|
||||
uint32_t flags)
|
||||
{
|
||||
if (OPAL_LIKELY(reserve)) {
|
||||
return mca_btl_scif_prepare_src_send (btl, endpoint, convertor,
|
||||
order, reserve, size, flags);
|
||||
} else {
|
||||
return mca_btl_scif_prepare_dma_conv (btl, endpoint, registration, convertor, order, size, flags);
|
||||
}
|
||||
}
|
||||
|
||||
static mca_btl_base_descriptor_t *mca_btl_scif_prepare_dst (mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
mca_mpool_base_registration_t *registration,
|
||||
opal_convertor_t *convertor, uint8_t order,
|
||||
size_t reserve, size_t *size, uint32_t flags)
|
||||
{
|
||||
return mca_btl_scif_prepare_dma_conv (btl, endpoint, registration, convertor, order, size, flags);
|
||||
return mca_btl_scif_prepare_src_send (btl, endpoint, convertor, order, reserve, size, flags);
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -16,63 +16,57 @@
|
||||
|
||||
/**
|
||||
* Initiate a put operation.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
int mca_btl_scif_put (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *des) {
|
||||
mca_btl_scif_segment_t *src = (mca_btl_scif_segment_t *) des->des_local;
|
||||
mca_btl_scif_segment_t *dst = (mca_btl_scif_segment_t *) des->des_remote;
|
||||
size_t len = lmin (src->base.seg_len, dst->base.seg_len);
|
||||
int rc, mark, flags = 0;
|
||||
int mca_btl_scif_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
int rc, mark, scif_flags = 0;
|
||||
off_t roffset, loffset;
|
||||
#if defined(SCIF_TIMING)
|
||||
struct timespec ts;
|
||||
|
||||
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
|
||||
|
||||
mca_btl_scif_component.put_count++;
|
||||
mca_btl_scif_component.get_count++;
|
||||
#endif
|
||||
|
||||
BTL_VERBOSE(("Using DMA Put for frag %p", (void *) des));
|
||||
BTL_VERBOSE(("Using DMA Put from local address %p to remote address %" PRIx64,
|
||||
local_address, remote_address));
|
||||
|
||||
roffset = dst->scif_offset + (off_t)(dst->orig_ptr - dst->base.seg_addr.lval);
|
||||
loffset = src->scif_offset + (off_t)(src->orig_ptr - src->base.seg_addr.lval);
|
||||
roffset = remote_handle->scif_offset + (off_t)(remote_address - remote_handle->scif_base);
|
||||
loffset = local_handle->scif_offset + (off_t)((intptr_t) local_address - local_handle->scif_base);
|
||||
|
||||
if (mca_btl_scif_component.rma_use_cpu) {
|
||||
flags = SCIF_RMA_USECPU;
|
||||
scif_flags = SCIF_RMA_USECPU;
|
||||
}
|
||||
|
||||
if (mca_btl_scif_component.rma_sync) {
|
||||
flags |= SCIF_RMA_SYNC;
|
||||
scif_flags |= SCIF_RMA_SYNC;
|
||||
}
|
||||
|
||||
/* start the write */
|
||||
rc = scif_writeto (endpoint->scif_epd, loffset, len, roffset, flags);
|
||||
rc = scif_writeto (endpoint->scif_epd, loffset, size, roffset, scif_flags);
|
||||
rc = scif_readfrom (endpoint->scif_epd, loffset, size, roffset, scif_flags);
|
||||
if (OPAL_UNLIKELY(-1 == rc)) {
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
/* always call the callback function */
|
||||
des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||
|
||||
/* according to the scif documentation is is better to use a fence rather
|
||||
* than using the SCIF_RMA_SYNC flag with scif_writeto */
|
||||
if (!(flags & SCIF_RMA_SYNC)) {
|
||||
if (!(scif_flags & SCIF_RMA_SYNC)) {
|
||||
/* according to the scif documentation is is better to use a fence rather
|
||||
* than using the SCIF_RMA_SYNC flag with scif_readfrom */
|
||||
scif_fence_mark (endpoint->scif_epd, SCIF_FENCE_INIT_SELF, &mark);
|
||||
scif_fence_wait (endpoint->scif_epd, mark);
|
||||
}
|
||||
|
||||
#if defined(SCIF_TIMING)
|
||||
SCIF_UPDATE_TIMER(mca_btl_scif_component.put_time,
|
||||
mca_btl_scif_component.put_time_max, ts);
|
||||
SCIF_UPDATE_TIMER(mca_btl_scif_component.get_time,
|
||||
mca_btl_scif_component.get_time_max, ts);
|
||||
#endif
|
||||
|
||||
/* since we completed the fence the RMA operation is complete */
|
||||
mca_btl_scif_frag_complete ((mca_btl_scif_base_frag_t *) des, OPAL_SUCCESS);
|
||||
/* always call the callback function */
|
||||
cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
@ -118,22 +118,22 @@ static int mca_btl_scif_send_frag (struct mca_btl_base_endpoint_t *endpoint,
|
||||
unsigned char * restrict dst;
|
||||
|
||||
BTL_VERBOSE(("btl/scif sending descriptor %p from %d -> %d. length = %" PRIu64, (void *) frag,
|
||||
OPAL_PROC_MY_NAME.vpid, endpoint->peer_proc->proc_name.vpid, frag->segments[0].base.seg_len));
|
||||
OPAL_PROC_MY_NAME.vpid, endpoint->peer_proc->proc_name.vpid, frag->segments[0].seg_len));
|
||||
|
||||
if (OPAL_LIKELY(OPAL_SUCCESS == mca_btl_scif_send_get_buffer (endpoint, size, &dst))) {
|
||||
unsigned char * restrict data = (unsigned char * restrict) frag->segments[0].base.seg_addr.pval;
|
||||
unsigned char * restrict data = (unsigned char * restrict) frag->segments[0].seg_addr.pval;
|
||||
#if defined(SCIF_TIMING)
|
||||
struct timespec ts;
|
||||
|
||||
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
|
||||
#endif
|
||||
|
||||
memcpy (dst + sizeof (frag->hdr), data, frag->segments[0].base.seg_len);
|
||||
memcpy (dst + sizeof (frag->hdr), data, frag->segments[0].seg_len);
|
||||
|
||||
if (frag->segments[1].base.seg_len) {
|
||||
memcpy (dst + sizeof (frag->hdr) + frag->segments[0].base.seg_len,
|
||||
frag->segments[1].base.seg_addr.pval,
|
||||
frag->segments[1].base.seg_len);
|
||||
if (frag->segments[1].seg_len) {
|
||||
memcpy (dst + sizeof (frag->hdr) + frag->segments[0].seg_len,
|
||||
frag->segments[1].seg_addr.pval,
|
||||
frag->segments[1].seg_len);
|
||||
}
|
||||
|
||||
#if defined(SCIF_USE_SEQ)
|
||||
@ -165,7 +165,7 @@ int mca_btl_scif_send (struct mca_btl_base_module_t *btl,
|
||||
mca_btl_base_tag_t tag)
|
||||
{
|
||||
mca_btl_scif_base_frag_t *frag = (mca_btl_scif_base_frag_t *) descriptor;
|
||||
size_t size = frag->segments[0].base.seg_len + frag->segments[1].base.seg_len;
|
||||
size_t size = frag->segments[0].seg_len + frag->segments[1].seg_len;
|
||||
int rc;
|
||||
|
||||
frag->hdr.tag = tag;
|
||||
@ -223,7 +223,9 @@ int mca_btl_scif_sendi (struct mca_btl_base_module_t *btl,
|
||||
|
||||
rc = mca_btl_scif_send_get_buffer (endpoint, length, &base);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
*descriptor = NULL;
|
||||
if (NULL != descriptor) {
|
||||
*descriptor = NULL;
|
||||
}
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
|
@ -38,13 +38,15 @@
|
||||
#include "btl_self_frag.h"
|
||||
#include "opal/util/proc.h"
|
||||
|
||||
static int mca_btl_self_put (struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* des);
|
||||
static int mca_btl_self_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
static int mca_btl_self_get (struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* des);
|
||||
static int mca_btl_self_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
mca_btl_base_module_t mca_btl_self = {
|
||||
.btl_component = &mca_btl_self_component.super,
|
||||
@ -54,7 +56,6 @@ mca_btl_base_module_t mca_btl_self = {
|
||||
.btl_alloc = mca_btl_self_alloc,
|
||||
.btl_free = mca_btl_self_free,
|
||||
.btl_prepare_src = mca_btl_self_prepare_src,
|
||||
.btl_prepare_dst = mca_btl_self_prepare_dst,
|
||||
.btl_send = mca_btl_self_send,
|
||||
.btl_put = mca_btl_self_put,
|
||||
.btl_get = mca_btl_self_get,
|
||||
@ -135,8 +136,8 @@ mca_btl_base_descriptor_t* mca_btl_self_alloc(
|
||||
|
||||
frag->segment.seg_len = size;
|
||||
frag->base.des_flags = flags;
|
||||
frag->base.des_local = &(frag->segment);
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_segments = &(frag->segment);
|
||||
frag->base.des_segment_count = 1;
|
||||
return (mca_btl_base_descriptor_t*)frag;
|
||||
}
|
||||
|
||||
@ -151,10 +152,8 @@ int mca_btl_self_free( struct mca_btl_base_module_t* btl,
|
||||
{
|
||||
mca_btl_self_frag_t* frag = (mca_btl_self_frag_t*)des;
|
||||
|
||||
frag->base.des_local = NULL;
|
||||
frag->base.des_local_count = 0;
|
||||
frag->base.des_remote = NULL;
|
||||
frag->base.des_remote_count = 0;
|
||||
frag->base.des_segments = NULL;
|
||||
frag->base.des_segment_count = 0;
|
||||
|
||||
if(frag->size == mca_btl_self.btl_eager_limit) {
|
||||
MCA_BTL_SELF_FRAG_RETURN_EAGER(frag);
|
||||
@ -175,7 +174,6 @@ int mca_btl_self_free( struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_descriptor_t*
|
||||
mca_btl_self_prepare_src( struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
@ -231,44 +229,11 @@ mca_btl_self_prepare_src( struct mca_btl_base_module_t* btl,
|
||||
*size = max_data;
|
||||
}
|
||||
frag->base.des_flags = flags;
|
||||
frag->base.des_local = &frag->segment;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_segments = &frag->segment;
|
||||
frag->base.des_segment_count = 1;
|
||||
|
||||
return &frag->base;
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare data for receive.
|
||||
*/
|
||||
struct mca_btl_base_descriptor_t*
|
||||
mca_btl_self_prepare_dst( struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags )
|
||||
{
|
||||
mca_btl_self_frag_t* frag;
|
||||
size_t max_data = *size;
|
||||
void *ptr;
|
||||
|
||||
MCA_BTL_SELF_FRAG_ALLOC_RDMA(frag);
|
||||
if(OPAL_UNLIKELY(NULL == frag)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* setup descriptor to point directly to user buffer */
|
||||
opal_convertor_get_current_pointer( convertor, &ptr );
|
||||
frag->segment.seg_addr.lval = (uint64_t)(uintptr_t) ptr;
|
||||
|
||||
frag->segment.seg_len = reserve + max_data;
|
||||
frag->base.des_local = &frag->segment;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_flags = flags;
|
||||
return &frag->base;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initiate a send to the peer.
|
||||
@ -285,12 +250,6 @@ int mca_btl_self_send( struct mca_btl_base_module_t* btl,
|
||||
mca_btl_active_message_callback_t* reg;
|
||||
int btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||
|
||||
/**
|
||||
* We have to set the dst before the call to the function and reset them
|
||||
* after.
|
||||
*/
|
||||
des->des_remote = des->des_local;
|
||||
des->des_remote_count = des->des_local_count;
|
||||
/* upcall */
|
||||
reg = mca_btl_base_active_message_trigger + tag;
|
||||
reg->cbfunc( btl, tag, des, reg->cbdata );
|
||||
@ -305,100 +264,29 @@ int mca_btl_self_send( struct mca_btl_base_module_t* btl,
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initiate a put to the peer.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param peer (IN) BTL peer addressing
|
||||
*/
|
||||
|
||||
static int mca_btl_self_rdma( struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* des,
|
||||
mca_btl_base_segment_t* src, size_t src_cnt,
|
||||
mca_btl_base_segment_t* dst, size_t dst_cnt)
|
||||
static int mca_btl_self_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
unsigned char* src_addr = (unsigned char *)(uintptr_t) src->seg_addr.lval;
|
||||
size_t src_len = src->seg_len;
|
||||
unsigned char* dst_addr = (unsigned char *)(uintptr_t) dst->seg_addr.lval;
|
||||
size_t dst_len = dst->seg_len;
|
||||
int btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||
memcpy ((void *)(intptr_t) remote_address, local_address, size);
|
||||
|
||||
while(src_len && dst_len) {
|
||||
cbfunc (btl, endpoint, local_address, NULL, cbcontext, cbdata, OPAL_SUCCESS);
|
||||
|
||||
if(src_len == dst_len) {
|
||||
memcpy(dst_addr, src_addr, src_len);
|
||||
|
||||
/* advance src */
|
||||
if(--src_cnt != 0) {
|
||||
src++;
|
||||
src_addr = (unsigned char*)src->seg_addr.pval;
|
||||
src_len = src->seg_len;
|
||||
} else {
|
||||
src_len = 0;
|
||||
}
|
||||
|
||||
/* advance dst */
|
||||
if(--dst_cnt != 0) {
|
||||
dst++;
|
||||
dst_addr = (unsigned char*)dst->seg_addr.pval;
|
||||
dst_len = dst->seg_len;
|
||||
} else {
|
||||
dst_len = 0;
|
||||
}
|
||||
|
||||
} else {
|
||||
size_t bytes = src_len < dst_len ? src_len : dst_len;
|
||||
memcpy(dst_addr, src_addr, bytes);
|
||||
|
||||
/* advance src */
|
||||
src_len -= bytes;
|
||||
if(src_len == 0) {
|
||||
if(--src_cnt != 0) {
|
||||
src++;
|
||||
src_addr = (unsigned char*)src->seg_addr.pval;
|
||||
src_len = src->seg_len;
|
||||
}
|
||||
} else {
|
||||
src_addr += bytes;
|
||||
}
|
||||
|
||||
/* advance dst */
|
||||
dst_len -= bytes;
|
||||
if(dst_len == 0) {
|
||||
if(--dst_cnt != 0) {
|
||||
dst++;
|
||||
dst_addr = (unsigned char*)src->seg_addr.pval;
|
||||
dst_len = src->seg_len;
|
||||
}
|
||||
} else {
|
||||
dst_addr += bytes;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* rdma completion */
|
||||
des->des_cbfunc( btl, endpoint, des, OPAL_SUCCESS );
|
||||
if( btl_ownership ) {
|
||||
mca_btl_self_free( btl, des );
|
||||
}
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
static int mca_btl_self_put (struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* des)
|
||||
static int mca_btl_self_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
return mca_btl_self_rdma (btl, endpoint, des, des->des_local, des->des_local_count,
|
||||
des->des_remote, des->des_remote_count);
|
||||
}
|
||||
memcpy (local_address, (void *)(intptr_t) remote_address, size);
|
||||
|
||||
static int mca_btl_self_get (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *des)
|
||||
{
|
||||
return mca_btl_self_rdma (btl, endpoint, des, des->des_remote, des->des_remote_count,
|
||||
des->des_local, des->des_local_count);
|
||||
cbfunc (btl, endpoint, local_address, NULL, cbcontext, cbdata, OPAL_SUCCESS);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
int mca_btl_self_ft_event(int state) {
|
||||
|
@ -1,3 +1,4 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -9,6 +10,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -40,7 +43,7 @@ BEGIN_C_DECLS
|
||||
* Shared Memory (SELF) BTL module.
|
||||
*/
|
||||
struct mca_btl_self_component_t {
|
||||
mca_btl_base_component_2_0_0_t super; /**< base BTL component */
|
||||
mca_btl_base_component_3_0_0_t super; /**< base BTL component */
|
||||
int free_list_num; /**< initial size of free lists */
|
||||
int free_list_max; /**< maximum size of free lists */
|
||||
int free_list_inc; /**< number of elements to alloc when growing free lists */
|
||||
@ -165,24 +168,6 @@ int mca_btl_self_free(
|
||||
struct mca_btl_base_descriptor_t* mca_btl_self_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags
|
||||
);
|
||||
|
||||
/**
|
||||
* Prepare data for RDMA
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param peer (IN) BTL peer addressing
|
||||
*/
|
||||
struct mca_btl_base_descriptor_t* mca_btl_self_prepare_dst(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
|
@ -99,7 +99,6 @@ static int mca_btl_self_component_register(void)
|
||||
mca_btl_self.btl_rdma_pipeline_frag_size = INT_MAX;
|
||||
mca_btl_self.btl_min_rdma_pipeline_size = 0;
|
||||
mca_btl_self.btl_flags = MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND_INPLACE;
|
||||
mca_btl_self.btl_seg_size = sizeof (mca_btl_base_segment_t);
|
||||
mca_btl_self.btl_bandwidth = 100;
|
||||
mca_btl_self.btl_latency = 0;
|
||||
mca_btl_base_param_register(&mca_btl_self_component.super.btl_version,
|
||||
|
@ -23,8 +23,8 @@ static inline void mca_btl_self_frag_constructor(mca_btl_self_frag_t* frag)
|
||||
{
|
||||
frag->segment.seg_addr.pval = frag+1;
|
||||
frag->segment.seg_len = (uint32_t)frag->size;
|
||||
frag->base.des_local = &frag->segment;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_segments = &frag->segment;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_flags = 0;
|
||||
}
|
||||
|
||||
|
@ -57,6 +57,9 @@
|
||||
#include "opal/mca/mpool/base/base.h"
|
||||
#include "opal/mca/mpool/sm/mpool_sm.h"
|
||||
|
||||
#include "opal/align.h"
|
||||
#include "opal/util/sys_limits.h"
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
#include "opal/util/basename.h"
|
||||
#include "opal/mca/crs/base/base.h"
|
||||
@ -81,9 +84,6 @@ mca_btl_sm_t mca_btl_sm = {
|
||||
.btl_alloc = mca_btl_sm_alloc,
|
||||
.btl_free = mca_btl_sm_free,
|
||||
.btl_prepare_src = mca_btl_sm_prepare_src,
|
||||
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
|
||||
.btl_prepare_dst = mca_btl_sm_prepare_dst,
|
||||
#endif /* OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA */
|
||||
.btl_send = mca_btl_sm_send,
|
||||
.btl_sendi = mca_btl_sm_sendi,
|
||||
.btl_dump = mca_btl_sm_dump,
|
||||
@ -743,7 +743,6 @@ extern int mca_btl_sm_free(
|
||||
struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
@ -828,11 +827,9 @@ struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src(
|
||||
}
|
||||
#endif /* OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA */
|
||||
|
||||
frag->base.des_local = &(frag->segment.base);
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_segments = &(frag->segment.base);
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.order = MCA_BTL_NO_ORDER;
|
||||
frag->base.des_remote = NULL;
|
||||
frag->base.des_remote_count = 0;
|
||||
frag->base.des_flags = flags;
|
||||
*size = max_data;
|
||||
return &frag->base;
|
||||
@ -950,9 +947,12 @@ int mca_btl_sm_sendi( struct mca_btl_base_module_t* btl,
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/* presumably, this code path will never get executed */
|
||||
*descriptor = mca_btl_sm_alloc( btl, endpoint, order,
|
||||
payload_size + header_size, flags);
|
||||
if (NULL != descriptor) {
|
||||
/* presumably, this code path will never get executed */
|
||||
*descriptor = mca_btl_sm_alloc( btl, endpoint, order,
|
||||
payload_size + header_size, flags);
|
||||
}
|
||||
|
||||
return OPAL_ERR_RESOURCE_BUSY;
|
||||
}
|
||||
|
||||
@ -1001,51 +1001,87 @@ int mca_btl_sm_send( struct mca_btl_base_module_t* btl,
|
||||
}
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
|
||||
struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_dst(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags)
|
||||
mca_btl_base_registration_handle_t *mca_btl_sm_register_mem (struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
void *base, size_t size, uint32_t flags)
|
||||
{
|
||||
void *ptr;
|
||||
mca_btl_sm_frag_t* frag;
|
||||
mca_btl_sm_registration_handle_t *handle;
|
||||
mca_btl_sm_t *sm_btl = (mca_btl_sm_t *) btl;
|
||||
ompi_free_list_item_t *item = NULL;
|
||||
|
||||
MCA_BTL_SM_FRAG_ALLOC_USER(frag);
|
||||
if(OPAL_UNLIKELY(NULL == frag)) {
|
||||
OMPI_FREE_LIST_GET_MT(&mca_btl_sm_component.registration_handles, item);
|
||||
if (OPAL_UNLIKELY(NULL == item)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
frag->segment.base.seg_len = *size;
|
||||
opal_convertor_get_current_pointer( convertor, &ptr );
|
||||
frag->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) ptr;
|
||||
|
||||
frag->base.des_remote = NULL;
|
||||
frag->base.des_remote_count = 0;
|
||||
frag->base.des_local = (mca_btl_base_segment_t*)&frag->segment;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_flags = flags;
|
||||
return &frag->base;
|
||||
handle = (mca_btl_sm_registration_handle_t *) item;
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM
|
||||
if (OPAL_LIKELY(mca_btl_sm_component.use_knem)) {
|
||||
struct knem_cmd_create_region knem_cr;
|
||||
struct knem_cmd_param_iovec knem_iov;
|
||||
|
||||
knem_iov.base = (uintptr_t)base & ~(opal_getpagesize() - 1);
|
||||
knem_iov.len = OPAL_ALIGN(size + ((intptr_t) base - knem_iov.base), opal_getpagesize(), intptr_t);
|
||||
knem_cr.iovec_array = (uintptr_t)&knem_iov;
|
||||
knem_cr.iovec_nr = 1;
|
||||
knem_cr.flags = 0;
|
||||
knem_cr.protection = 0;
|
||||
|
||||
if (flags & MCA_BTL_REG_FLAG_REMOTE_READ) {
|
||||
knem_cr.protection |= PROT_READ;
|
||||
}
|
||||
if (flags & MCA_BTL_REG_FLAG_REMOTE_WRITE) {
|
||||
knem_cr.protection |= PROT_WRITE;
|
||||
}
|
||||
|
||||
if (OPAL_UNLIKELY(ioctl(sm_btl->knem_fd, KNEM_CMD_CREATE_REGION, &knem_cr) < 0)) {
|
||||
OMPI_FREE_LIST_RETURN_MT(&mca_btl_sm_component.registration_handles, item);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
handle->btl_handle.data.knem.cookie = knem_cr.cookie;
|
||||
handle->btl_handle.data.knem.base_addr = knem_iov.base;
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
/* the pid could be included in a modex but this will work until btl/sm is
|
||||
* deleted */
|
||||
handle->btl_handle.data.pid = getpid ();
|
||||
}
|
||||
|
||||
/* return the public part of the handle */
|
||||
return &handle->btl_handle;
|
||||
}
|
||||
|
||||
int mca_btl_sm_deregister_mem (struct mca_btl_base_module_t* btl, mca_btl_base_registration_handle_t *handle)
|
||||
{
|
||||
mca_btl_sm_registration_handle_t *sm_handle =
|
||||
(mca_btl_sm_registration_handle_t *)((intptr_t) handle - offsetof (mca_btl_sm_registration_handle_t, btl_handle));
|
||||
mca_btl_sm_t* sm_btl = (mca_btl_sm_t*) btl;
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM
|
||||
if (OPAL_LIKELY(mca_btl_sm_component.use_knem)) {
|
||||
(void) ioctl(sm_btl->knem_fd, KNEM_CMD_DESTROY_REGION, &handle->data.knem.cookie);
|
||||
}
|
||||
#endif
|
||||
|
||||
OMPI_FREE_LIST_RETURN_MT(&mca_btl_sm_component.registration_handles, &sm_handle->super);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
#endif /* OPAL_BTL_SM_HAVE_KNEM */
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
|
||||
|
||||
/**
|
||||
* Initiate an synchronous get.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* des)
|
||||
int mca_btl_sm_get_sync (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
int btl_ownership;
|
||||
mca_btl_sm_frag_t* frag = (mca_btl_sm_frag_t*)des;
|
||||
mca_btl_sm_segment_t *src = (mca_btl_sm_segment_t*)des->des_remote;
|
||||
mca_btl_sm_segment_t *dst = (mca_btl_sm_segment_t*)des->des_local;
|
||||
#if OPAL_BTL_SM_HAVE_KNEM
|
||||
mca_btl_sm_t* sm_btl = (mca_btl_sm_t*) btl;
|
||||
if (OPAL_LIKELY(mca_btl_sm_component.use_knem)) {
|
||||
@ -1054,12 +1090,12 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl,
|
||||
|
||||
/* Fill in the ioctl data fields. There's no async completion, so
|
||||
we don't need to worry about getting a slot, etc. */
|
||||
recv_iovec.base = (uintptr_t) dst->base.seg_addr.lval;
|
||||
recv_iovec.len = dst->base.seg_len;
|
||||
recv_iovec.base = (uintptr_t) local_address;
|
||||
recv_iovec.len = size;
|
||||
icopy.local_iovec_array = (uintptr_t)&recv_iovec;
|
||||
icopy.local_iovec_nr = 1;
|
||||
icopy.remote_cookie = src->key;
|
||||
icopy.remote_offset = 0;
|
||||
icopy.remote_cookie = remote_handle->data.knem.cookie;
|
||||
icopy.remote_offset = remote_address - remote_handle->data.knem.base_addr;
|
||||
icopy.write = 0;
|
||||
|
||||
/* Use the DMA flag if knem supports it *and* the segment length
|
||||
@ -1067,7 +1103,7 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl,
|
||||
value is 0 (i.e., the MCA param was set to 0), the segment size
|
||||
will never be larger than it, so DMA will never be used. */
|
||||
icopy.flags = 0;
|
||||
if (mca_btl_sm_component.knem_dma_min <= dst->base.seg_len) {
|
||||
if (mca_btl_sm_component.knem_dma_min <= size) {
|
||||
icopy.flags = mca_btl_sm_component.knem_dma_flag;
|
||||
}
|
||||
/* synchronous flags only, no need to specify icopy.async_status_index */
|
||||
@ -1085,27 +1121,19 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl,
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_CMA
|
||||
if (OPAL_LIKELY(mca_btl_sm_component.use_cma)) {
|
||||
char *remote_address, *local_address;
|
||||
int remote_length, local_length;
|
||||
struct iovec local, remote;
|
||||
pid_t remote_pid;
|
||||
int val;
|
||||
|
||||
remote_address = (char *)(uintptr_t) src->base.seg_addr.lval;
|
||||
remote_length = src->base.seg_len;
|
||||
|
||||
local_address = (char *)(uintptr_t) dst->base.seg_addr.lval;
|
||||
local_length = dst->base.seg_len;
|
||||
|
||||
remote_pid = src->key;
|
||||
remote.iov_base = remote_address;
|
||||
remote.iov_len = remote_length;
|
||||
remote_pid = remote_handle->data.pid;
|
||||
remote.iov_base = (void *) (intptr_t) remote_address;
|
||||
remote.iov_len = size;
|
||||
local.iov_base = local_address;
|
||||
local.iov_len = local_length;
|
||||
local.iov_len = size;
|
||||
|
||||
val = process_vm_readv(remote_pid, &local, 1, &remote, 1, 0);
|
||||
|
||||
if (val != local_length) {
|
||||
if (val != size) {
|
||||
if (val<0) {
|
||||
opal_output(0, "mca_btl_sm_get_sync: process_vm_readv failed: %i",
|
||||
errno);
|
||||
@ -1119,15 +1147,7 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl,
|
||||
}
|
||||
#endif /* OPAL_BTL_SM_HAVE_CMA */
|
||||
|
||||
btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||
if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) {
|
||||
frag->base.des_cbfunc(&mca_btl_sm.super,
|
||||
frag->endpoint, &frag->base,
|
||||
OPAL_SUCCESS);
|
||||
}
|
||||
if (btl_ownership) {
|
||||
MCA_BTL_SM_FRAG_RETURN(frag);
|
||||
}
|
||||
cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
@ -1139,34 +1159,42 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl,
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous get.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
int mca_btl_sm_get_async(struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* des)
|
||||
int mca_btl_sm_get_async (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
int btl_ownership;
|
||||
mca_btl_sm_t* sm_btl = (mca_btl_sm_t*) btl;
|
||||
mca_btl_sm_frag_t* frag = (mca_btl_sm_frag_t*)des;
|
||||
mca_btl_sm_segment_t *src = (mca_btl_sm_segment_t*)des->des_remote;
|
||||
mca_btl_sm_segment_t *dst = (mca_btl_sm_segment_t*)des->des_local;
|
||||
mca_btl_sm_frag_t* frag;
|
||||
struct knem_cmd_inline_copy icopy;
|
||||
struct knem_cmd_param_iovec recv_iovec;
|
||||
|
||||
/* If we have no knem slots available, return
|
||||
TEMP_OUT_OF_RESOURCE */
|
||||
/* If we have no knem slots available, fall back to synchronous */
|
||||
if (sm_btl->knem_status_num_used >=
|
||||
mca_btl_sm_component.knem_max_simultaneous) {
|
||||
return OPAL_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
return mca_btl_sm_get_sync (btl, endpoint, local_address, remote_address, local_handle,
|
||||
remote_handle, size, flags, order, cbfunc, cbcontext, cbdata);
|
||||
}
|
||||
|
||||
/* allocate a fragment to keep track of this transaction */
|
||||
MCA_BTL_SM_FRAG_ALLOC_USER(frag);
|
||||
if (OPAL_UNLIKELY(NULL == frag)) {
|
||||
return mca_btl_sm_get_sync (btl, endpoint, local_address, remote_address, local_handle,
|
||||
remote_handle, size, flags, order, cbfunc, cbcontext, cbdata);
|
||||
}
|
||||
|
||||
/* fill in callback data */
|
||||
frag->cb.func = cbfunc;
|
||||
frag->cb.context = cbcontext;
|
||||
frag->cb.data = cbdata;
|
||||
frag->cb.local_address = local_address;
|
||||
frag->cb.local_handle = local_handle;
|
||||
|
||||
/* We have a slot, so fill in the data fields. Bump the
|
||||
first_avail and num_used counters. */
|
||||
recv_iovec.base = (uintptr_t) dst->base.seg_addr.lval;
|
||||
recv_iovec.len = dst->base.seg_len;
|
||||
recv_iovec.base = (uintptr_t) local_address;
|
||||
recv_iovec.len = size;
|
||||
icopy.local_iovec_array = (uintptr_t)&recv_iovec;
|
||||
icopy.local_iovec_nr = 1;
|
||||
icopy.write = 0;
|
||||
@ -1176,13 +1204,13 @@ int mca_btl_sm_get_async(struct mca_btl_base_module_t* btl,
|
||||
sm_btl->knem_status_first_avail = 0;
|
||||
}
|
||||
++sm_btl->knem_status_num_used;
|
||||
icopy.remote_cookie = src->key;
|
||||
icopy.remote_offset = 0;
|
||||
icopy.remote_cookie = remote_handle->data.knem.cookie;
|
||||
icopy.remote_offset = remote_address - remote_handle->data.knem.base_addr;
|
||||
|
||||
/* Use the DMA flag if knem supports it *and* the segment length
|
||||
is greater than the cutoff */
|
||||
icopy.flags = KNEM_FLAG_ASYNCDMACOMPLETE;
|
||||
if (mca_btl_sm_component.knem_dma_min <= dst->base.seg_len) {
|
||||
if (mca_btl_sm_component.knem_dma_min <= size) {
|
||||
icopy.flags = mca_btl_sm_component.knem_dma_flag;
|
||||
}
|
||||
|
||||
@ -1190,19 +1218,11 @@ int mca_btl_sm_get_async(struct mca_btl_base_module_t* btl,
|
||||
if (OPAL_LIKELY(0 == ioctl(sm_btl->knem_fd,
|
||||
KNEM_CMD_INLINE_COPY, &icopy))) {
|
||||
if (icopy.current_status != KNEM_STATUS_PENDING) {
|
||||
MCA_BTL_SM_FRAG_RETURN(frag);
|
||||
/* request completed synchronously */
|
||||
|
||||
/* FIXME: what if icopy.current_status == KNEM_STATUS_FAILED? */
|
||||
|
||||
btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||
if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) {
|
||||
frag->base.des_cbfunc(&mca_btl_sm.super,
|
||||
frag->endpoint, &frag->base,
|
||||
OPAL_SUCCESS);
|
||||
}
|
||||
if (btl_ownership) {
|
||||
MCA_BTL_SM_FRAG_RETURN(frag);
|
||||
}
|
||||
cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);
|
||||
|
||||
--sm_btl->knem_status_num_used;
|
||||
++sm_btl->knem_status_first_used;
|
||||
|
@ -1,3 +1,4 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -11,7 +12,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
|
||||
* Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2013 Los Alamos National Security, LLC.
|
||||
* Copyright (c) 2010-2015 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010-2012 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -126,7 +127,7 @@ typedef struct mca_btl_sm_mem_node_t {
|
||||
* Shared Memory (SM) BTL module.
|
||||
*/
|
||||
struct mca_btl_sm_component_t {
|
||||
mca_btl_base_component_2_0_0_t super; /**< base BTL component */
|
||||
mca_btl_base_component_3_0_0_t super; /**< base BTL component */
|
||||
int sm_free_list_num; /**< initial size of free lists */
|
||||
int sm_free_list_max; /**< maximum size of free lists */
|
||||
int sm_free_list_inc; /**< number of elements to alloc when growing free lists */
|
||||
@ -182,6 +183,10 @@ struct mca_btl_sm_component_t {
|
||||
#if OPAL_BTL_SM_HAVE_KNEM
|
||||
/* Knem capabilities info */
|
||||
struct knem_cmd_info knem_info;
|
||||
#endif
|
||||
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
|
||||
/** registration handles to hold knem cookies */
|
||||
ompi_free_list_t registration_handles;
|
||||
#endif /* OPAL_BTL_SM_HAVE_KNEM */
|
||||
|
||||
/** MCA: should we be using knem or not? neg=try but continue if
|
||||
@ -461,7 +466,6 @@ extern int mca_btl_sm_free(
|
||||
struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
@ -504,30 +508,20 @@ extern int mca_btl_sm_send(
|
||||
/*
|
||||
* Synchronous knem/cma get
|
||||
*/
|
||||
extern int mca_btl_sm_get_sync(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* des );
|
||||
|
||||
extern struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_dst(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags);
|
||||
int mca_btl_sm_get_sync (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
#endif /* OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA */
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM
|
||||
/*
|
||||
* Asynchronous knem get
|
||||
*/
|
||||
extern int mca_btl_sm_get_async(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* des );
|
||||
int mca_btl_sm_get_async (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
#endif /* OPAL_BTL_SM_HAVE_KNEM */
|
||||
|
||||
@ -558,6 +552,31 @@ void mca_btl_sm_component_event_thread(opal_object_t*);
|
||||
#define MCA_BTL_SM_SIGNAL_PEER(peer)
|
||||
#endif
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM | OPAL_BTL_SM_HAVE_CMA
|
||||
struct mca_btl_base_registration_handle_t {
|
||||
union {
|
||||
struct {
|
||||
uint64_t cookie;
|
||||
intptr_t base_addr;
|
||||
} knem;
|
||||
pid_t pid;
|
||||
} data;
|
||||
};
|
||||
|
||||
struct mca_btl_sm_registration_handle_t {
|
||||
ompi_free_list_item_t super;
|
||||
mca_btl_base_registration_handle_t btl_handle;
|
||||
};
|
||||
typedef struct mca_btl_sm_registration_handle_t mca_btl_sm_registration_handle_t;
|
||||
|
||||
mca_btl_base_registration_handle_t *mca_btl_sm_register_mem (struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
void *base, size_t size, uint32_t flags);
|
||||
|
||||
int mca_btl_sm_deregister_mem (struct mca_btl_base_module_t* btl, mca_btl_base_registration_handle_t *handle);
|
||||
|
||||
#endif
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
||||
|
@ -67,6 +67,10 @@
|
||||
#include "opal/mca/common/cuda/common_cuda.h"
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
|
||||
static OBJ_CLASS_INSTANCE(mca_btl_sm_registration_handle_t, ompi_free_list_item_t, NULL, NULL);
|
||||
#endif
|
||||
|
||||
static int mca_btl_sm_component_open(void);
|
||||
static int mca_btl_sm_component_close(void);
|
||||
static int sm_register(void);
|
||||
@ -251,10 +255,13 @@ static int sm_register(void)
|
||||
mca_btl_sm.super.btl_rdma_pipeline_frag_size = 64*1024;
|
||||
mca_btl_sm.super.btl_min_rdma_pipeline_size = 64*1024;
|
||||
mca_btl_sm.super.btl_flags = MCA_BTL_FLAGS_SEND;
|
||||
mca_btl_sm.super.btl_seg_size = sizeof (mca_btl_sm_segment_t);
|
||||
mca_btl_sm.super.btl_bandwidth = 9000; /* Mbs */
|
||||
mca_btl_sm.super.btl_latency = 1; /* Microsecs */
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM
|
||||
mca_btl_sm.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
|
||||
#endif
|
||||
|
||||
/* Call the BTL base to register its MCA params */
|
||||
mca_btl_base_param_register(&mca_btl_sm_component.super.btl_version,
|
||||
&mca_btl_sm.super);
|
||||
@ -295,6 +302,11 @@ static int mca_btl_sm_component_open(void)
|
||||
OBJ_CONSTRUCT(&mca_btl_sm_component.pending_send_fl, opal_free_list_t);
|
||||
|
||||
mca_btl_sm_component.sm_seg = NULL;
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
|
||||
OBJ_CONSTRUCT(&mca_btl_sm_component.registration_handles, ompi_free_list_t);
|
||||
#endif
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM
|
||||
mca_btl_sm.knem_fd = -1;
|
||||
mca_btl_sm.knem_status_array = NULL;
|
||||
@ -332,6 +344,10 @@ static int mca_btl_sm_component_close(void)
|
||||
}
|
||||
#endif /* OPAL_BTL_SM_HAVE_KNEM */
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
|
||||
OBJ_DESTRUCT(&mca_btl_sm_component.registration_handles);
|
||||
#endif
|
||||
|
||||
OBJ_DESTRUCT(&mca_btl_sm_component.sm_lock);
|
||||
/**
|
||||
* We don't have to destroy the fragment lists. They are allocated
|
||||
@ -904,6 +920,9 @@ mca_btl_sm_component_init(int *num_btls,
|
||||
} else {
|
||||
mca_btl_sm.super.btl_get = mca_btl_sm_get_sync;
|
||||
}
|
||||
|
||||
mca_btl_sm.super.btl_register_mem = mca_btl_sm_register_mem;
|
||||
mca_btl_sm.super.btl_deregister_mem = mca_btl_sm_deregister_mem;
|
||||
}
|
||||
#else
|
||||
/* If the user explicitly asked for knem and we can't provide it,
|
||||
@ -918,6 +937,8 @@ mca_btl_sm_component_init(int *num_btls,
|
||||
/* Will only ever have either cma or knem enabled at runtime
|
||||
so no problems with accidentally overwriting this set earlier */
|
||||
mca_btl_sm.super.btl_get = mca_btl_sm_get_sync;
|
||||
mca_btl_sm.super.btl_register_mem = mca_btl_sm_register_mem;
|
||||
mca_btl_sm.super.btl_deregister_mem = mca_btl_sm_deregister_mem;
|
||||
}
|
||||
#else
|
||||
/* If the user explicitly asked for CMA and we can't provide it,
|
||||
@ -931,6 +952,21 @@ mca_btl_sm_component_init(int *num_btls,
|
||||
}
|
||||
#endif /* OPAL_BTL_SM_HAVE_CMA */
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM | OPAL_BTL_SM_HAVE_CMA
|
||||
if (mca_btl_sm_component.use_cma || mca_btl_sm_component.use_knem) {
|
||||
rc = ompi_free_list_init_new (&mca_btl_sm_component.registration_handles,
|
||||
sizeof (mca_btl_sm_registration_handle_t),
|
||||
8, OBJ_CLASS(mca_btl_sm_registration_handle_t),
|
||||
0, 0, mca_btl_sm_component.sm_free_list_num,
|
||||
mca_btl_sm_component.sm_free_list_max,
|
||||
mca_btl_sm_component.sm_free_list_inc, NULL);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
free (btls);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return btls;
|
||||
|
||||
no_knem:
|
||||
@ -963,6 +999,7 @@ mca_btl_sm_component_init(int *num_btls,
|
||||
/* disable get when not using knem or cma */
|
||||
mca_btl_sm.super.btl_get = NULL;
|
||||
mca_btl_sm.super.btl_flags &= ~MCA_BTL_FLAGS_GET;
|
||||
mca_btl_sm_component.use_knem = 0;
|
||||
}
|
||||
|
||||
/* Otherwise, use_knem was 0 (and we didn't get here) or use_knem
|
||||
@ -1090,8 +1127,8 @@ int mca_btl_sm_component_progress(void)
|
||||
reg = mca_btl_base_active_message_trigger + hdr->tag;
|
||||
seg.seg_addr.pval = ((char *)hdr) + sizeof(mca_btl_sm_hdr_t);
|
||||
seg.seg_len = hdr->len;
|
||||
Frag.base.des_local_count = 1;
|
||||
Frag.base.des_local = &seg;
|
||||
Frag.base.des_segment_count = 1;
|
||||
Frag.base.des_segments = &seg;
|
||||
reg->cbfunc(&mca_btl_sm.super, hdr->tag, &(Frag.base),
|
||||
reg->cbdata);
|
||||
/* return the fragment */
|
||||
@ -1176,22 +1213,14 @@ int mca_btl_sm_component_progress(void)
|
||||
mca_btl_sm.knem_status_array[mca_btl_sm.knem_status_first_used]) {
|
||||
if (KNEM_STATUS_SUCCESS ==
|
||||
mca_btl_sm.knem_status_array[mca_btl_sm.knem_status_first_used]) {
|
||||
int btl_ownership;
|
||||
|
||||
/* Handle the completed fragment */
|
||||
frag =
|
||||
mca_btl_sm.knem_frag_array[mca_btl_sm.knem_status_first_used];
|
||||
btl_ownership = (frag->base.des_flags &
|
||||
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||
if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK &
|
||||
frag->base.des_flags)) {
|
||||
frag->base.des_cbfunc(&mca_btl_sm.super,
|
||||
frag->endpoint, &frag->base,
|
||||
OPAL_SUCCESS);
|
||||
}
|
||||
if (btl_ownership) {
|
||||
MCA_BTL_SM_FRAG_RETURN(frag);
|
||||
}
|
||||
frag->cb.func (&mca_btl_sm.super, frag->endpoint,
|
||||
frag->cb.local_address, frag->cb.local_handle,
|
||||
frag->cb.context, frag->cb.data, OPAL_SUCCESS);
|
||||
MCA_BTL_SM_FRAG_RETURN(frag);
|
||||
|
||||
/* Bump counters, loop around the circular buffer if
|
||||
necessary */
|
||||
|
@ -10,6 +10,8 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
|
@ -31,8 +31,8 @@ static inline void mca_btl_sm_frag_common_constructor(mca_btl_sm_frag_t* frag)
|
||||
frag->hdr->my_smp_rank = mca_btl_sm_component.my_smp_rank;
|
||||
}
|
||||
frag->segment.base.seg_len = frag->size;
|
||||
frag->base.des_local = &frag->segment.base;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_segments = &frag->segment.base;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_flags = 0;
|
||||
}
|
||||
|
||||
|
@ -1,3 +1,4 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -11,6 +12,8 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -64,6 +67,16 @@ struct mca_btl_sm_frag_t {
|
||||
/* pointer written to the FIFO, this is the base of the shared memory region */
|
||||
mca_btl_sm_hdr_t *hdr;
|
||||
ompi_free_list_t* my_list;
|
||||
#if OPAL_BTL_SM_HAVE_KNEM
|
||||
/* rdma callback data. required for async get */
|
||||
struct {
|
||||
mca_btl_base_rdma_completion_fn_t func;
|
||||
void *local_address;
|
||||
struct mca_btl_base_registration_handle_t *local_handle;
|
||||
void *context;
|
||||
void *data;
|
||||
} cb;
|
||||
#endif
|
||||
};
|
||||
typedef struct mca_btl_sm_frag_t mca_btl_sm_frag_t;
|
||||
typedef struct mca_btl_sm_frag_t mca_btl_sm_frag1_t;
|
||||
|
@ -12,7 +12,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
|
||||
* Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2014 Los Alamos National Security, LLC.
|
||||
* Copyright (c) 2010-2015 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2012-2014 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved.
|
||||
@ -71,6 +71,15 @@
|
||||
#include "btl_smcuda_frag.h"
|
||||
#include "btl_smcuda_fifo.h"
|
||||
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
static struct mca_btl_base_registration_handle_t *mca_btl_smcuda_register_mem (
|
||||
struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base,
|
||||
size_t size, uint32_t flags);
|
||||
|
||||
static int mca_btl_smcuda_deregister_mem (struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_registration_handle_t *handle);
|
||||
#endif
|
||||
|
||||
mca_btl_smcuda_t mca_btl_smcuda = {
|
||||
.super = {
|
||||
.btl_component = &mca_btl_smcuda_component.super,
|
||||
@ -80,9 +89,10 @@ mca_btl_smcuda_t mca_btl_smcuda = {
|
||||
.btl_alloc = mca_btl_smcuda_alloc,
|
||||
.btl_free = mca_btl_smcuda_free,
|
||||
.btl_prepare_src = mca_btl_smcuda_prepare_src,
|
||||
#if OPAL_CUDA_SUPPORT || OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
|
||||
.btl_prepare_dst = mca_btl_smcuda_prepare_dst,
|
||||
#endif /* OPAL_CUDA_SUPPORT || OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA */
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
.btl_register_mem = mca_btl_smcuda_register_mem,
|
||||
.btl_deregister_mem = mca_btl_smcuda_deregister_mem,
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
.btl_send = mca_btl_smcuda_send,
|
||||
.btl_sendi = mca_btl_smcuda_sendi,
|
||||
.btl_dump = mca_btl_smcuda_dump,
|
||||
@ -741,7 +751,7 @@ extern mca_btl_base_descriptor_t* mca_btl_smcuda_alloc(
|
||||
}
|
||||
|
||||
if (OPAL_LIKELY(frag != NULL)) {
|
||||
frag->segment.base.seg_len = size;
|
||||
frag->segment.seg_len = size;
|
||||
frag->base.des_flags = flags;
|
||||
}
|
||||
return (mca_btl_base_descriptor_t*)frag;
|
||||
@ -772,7 +782,6 @@ extern int mca_btl_smcuda_free(
|
||||
struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
@ -784,68 +793,33 @@ struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src(
|
||||
uint32_t iov_count = 1;
|
||||
size_t max_data = *size;
|
||||
int rc;
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
if (0 != reserve) {
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
if ( reserve + max_data <= mca_btl_smcuda_component.eager_limit ) {
|
||||
MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag);
|
||||
} else {
|
||||
MCA_BTL_SMCUDA_FRAG_ALLOC_MAX(frag);
|
||||
}
|
||||
if( OPAL_UNLIKELY(NULL == frag) ) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if( OPAL_UNLIKELY(reserve + max_data > frag->size) ) {
|
||||
max_data = frag->size - reserve;
|
||||
}
|
||||
iov.iov_len = max_data;
|
||||
iov.iov_base =
|
||||
(IOVBASE_TYPE*)(((unsigned char*)(frag->segment.base.seg_addr.pval)) + reserve);
|
||||
|
||||
rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data );
|
||||
if( OPAL_UNLIKELY(rc < 0) ) {
|
||||
MCA_BTL_SMCUDA_FRAG_RETURN(frag);
|
||||
return NULL;
|
||||
}
|
||||
frag->segment.base.seg_len = reserve + max_data;
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
if ( reserve + max_data <= mca_btl_smcuda_component.eager_limit ) {
|
||||
MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag);
|
||||
} else {
|
||||
/* Normally, we are here because we have a GPU buffer and we are preparing
|
||||
* to send it. However, we can also be there because we have received a
|
||||
* PUT message because we are trying to send a host buffer. Therefore,
|
||||
* we need to again check to make sure buffer is GPU. If not, then return
|
||||
* NULL. We can just check the convertor since we have that. */
|
||||
if (!(convertor->flags & CONVERTOR_CUDA)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
MCA_BTL_SMCUDA_FRAG_ALLOC_USER(frag);
|
||||
if( OPAL_UNLIKELY(NULL == frag) ) {
|
||||
return NULL;
|
||||
}
|
||||
iov.iov_len = max_data;
|
||||
iov.iov_base = NULL;
|
||||
rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data);
|
||||
if( OPAL_UNLIKELY(rc < 0) ) {
|
||||
MCA_BTL_SMCUDA_FRAG_RETURN(frag);
|
||||
return NULL;
|
||||
}
|
||||
frag->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) iov.iov_base;
|
||||
frag->segment.base.seg_len = max_data;
|
||||
memcpy(frag->segment.key, ((mca_mpool_common_cuda_reg_t *)registration)->memHandle,
|
||||
sizeof(((mca_mpool_common_cuda_reg_t *)registration)->memHandle) +
|
||||
sizeof(((mca_mpool_common_cuda_reg_t *)registration)->evtHandle));
|
||||
frag->segment.memh_seg_addr.pval = registration->base;
|
||||
frag->segment.memh_seg_len = registration->bound - registration->base + 1;
|
||||
|
||||
MCA_BTL_SMCUDA_FRAG_ALLOC_MAX(frag);
|
||||
}
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
frag->base.des_local = &(frag->segment.base);
|
||||
frag->base.des_local_count = 1;
|
||||
if( OPAL_UNLIKELY(NULL == frag) ) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if( OPAL_UNLIKELY(reserve + max_data > frag->size) ) {
|
||||
max_data = frag->size - reserve;
|
||||
}
|
||||
iov.iov_len = max_data;
|
||||
iov.iov_base =
|
||||
(IOVBASE_TYPE*)(((unsigned char*)(frag->segment.seg_addr.pval)) + reserve);
|
||||
|
||||
rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data );
|
||||
if( OPAL_UNLIKELY(rc < 0) ) {
|
||||
MCA_BTL_SMCUDA_FRAG_RETURN(frag);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
frag->segment.seg_len = reserve + max_data;
|
||||
frag->base.des_segments = &frag->segment;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.order = MCA_BTL_NO_ORDER;
|
||||
frag->base.des_remote = NULL;
|
||||
frag->base.des_remote_count = 0;
|
||||
frag->base.des_flags = flags;
|
||||
*size = max_data;
|
||||
return &frag->base;
|
||||
@ -854,8 +828,8 @@ struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src(
|
||||
#if 0
|
||||
#define MCA_BTL_SMCUDA_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(sm_frag) \
|
||||
do { \
|
||||
char* _memory = (char*)(sm_frag)->segment.base.seg_addr.pval + \
|
||||
(sm_frag)->segment.base.seg_len; \
|
||||
char* _memory = (char*)(sm_frag)->segment.seg_addr.pval + \
|
||||
(sm_frag)->segment.seg_len; \
|
||||
int* _intmem; \
|
||||
size_t align = (intptr_t)_memory & 0xFUL; \
|
||||
switch( align & 0x3 ) { \
|
||||
@ -926,7 +900,7 @@ int mca_btl_smcuda_sendi( struct mca_btl_base_module_t* btl,
|
||||
}
|
||||
|
||||
/* fill in fragment fields */
|
||||
frag->segment.base.seg_len = length;
|
||||
frag->segment.seg_len = length;
|
||||
frag->hdr->len = length;
|
||||
assert( 0 == (flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) );
|
||||
frag->base.des_flags = flags | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; /* why do any flags matter here other than OWNERSHIP? */
|
||||
@ -934,7 +908,7 @@ int mca_btl_smcuda_sendi( struct mca_btl_base_module_t* btl,
|
||||
frag->endpoint = endpoint;
|
||||
|
||||
/* write the match header (with MPI comm/tag/etc. info) */
|
||||
memcpy( frag->segment.base.seg_addr.pval, header, header_size );
|
||||
memcpy( frag->segment.seg_addr.pval, header, header_size );
|
||||
|
||||
/* write the message data if there is any */
|
||||
/*
|
||||
@ -945,7 +919,7 @@ int mca_btl_smcuda_sendi( struct mca_btl_base_module_t* btl,
|
||||
struct iovec iov;
|
||||
uint32_t iov_count;
|
||||
/* pack the data into the supplied buffer */
|
||||
iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)frag->segment.base.seg_addr.pval + header_size);
|
||||
iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)frag->segment.seg_addr.pval + header_size);
|
||||
iov.iov_len = max_data = payload_size;
|
||||
iov_count = 1;
|
||||
|
||||
@ -1000,7 +974,7 @@ int mca_btl_smcuda_send( struct mca_btl_base_module_t* btl,
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
|
||||
/* available header space */
|
||||
frag->hdr->len = frag->segment.base.seg_len;
|
||||
frag->hdr->len = frag->segment.seg_len;
|
||||
/* type of message, pt-2-pt, one-sided, etc */
|
||||
frag->hdr->tag = tag;
|
||||
|
||||
@ -1024,65 +998,76 @@ int mca_btl_smcuda_send( struct mca_btl_base_module_t* btl,
|
||||
*/
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_dst(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags)
|
||||
static struct mca_btl_base_registration_handle_t *mca_btl_smcuda_register_mem (
|
||||
struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base,
|
||||
size_t size, uint32_t flags)
|
||||
{
|
||||
void *ptr;
|
||||
mca_btl_smcuda_frag_t* frag;
|
||||
mca_mpool_common_cuda_reg_t *reg;
|
||||
int mpool_flags = 0;
|
||||
|
||||
/* Only support GPU buffers */
|
||||
if (!(convertor->flags & CONVERTOR_CUDA)) {
|
||||
if (MCA_BTL_REG_FLAG_CUDA_GPU_MEM & flags) {
|
||||
mpool_flags |= MCA_MPOOL_FLAGS_CUDA_GPU_MEM;
|
||||
}
|
||||
|
||||
btl->btl_mpool->mpool_register (btl->btl_mpool, base, size, mpool_flags,
|
||||
(mca_mpool_base_registration_t **) &reg);
|
||||
if (OPAL_UNLIKELY(NULL == reg)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
MCA_BTL_SMCUDA_FRAG_ALLOC_USER(frag);
|
||||
if(OPAL_UNLIKELY(NULL == frag)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
frag->segment.base.seg_len = *size;
|
||||
opal_convertor_get_current_pointer( convertor, &ptr );
|
||||
frag->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) ptr;
|
||||
|
||||
frag->base.des_remote = NULL;
|
||||
frag->base.des_remote_count = 0;
|
||||
frag->base.des_local = &frag->segment.base;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_flags = flags;
|
||||
return &frag->base;
|
||||
return (mca_btl_base_registration_handle_t *) &reg->data;
|
||||
}
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
|
||||
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* ep,
|
||||
struct mca_btl_base_descriptor_t* descriptor)
|
||||
static int mca_btl_smcuda_deregister_mem (struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_registration_handle_t *handle)
|
||||
{
|
||||
mca_mpool_common_cuda_reg_t *reg = (mca_mpool_common_cuda_reg_t *)
|
||||
((intptr_t) handle - offsetof (mca_mpool_common_cuda_reg_t, data));
|
||||
|
||||
btl->btl_mpool->mpool_deregister (btl->btl_mpool, &reg->base);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *ep, void *local_address,
|
||||
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
mca_btl_smcuda_segment_t *src_seg = (mca_btl_smcuda_segment_t *) descriptor->des_remote;
|
||||
mca_btl_smcuda_segment_t *dst_seg = (mca_btl_smcuda_segment_t *) descriptor->des_local;
|
||||
mca_mpool_common_cuda_reg_t rget_reg;
|
||||
mca_mpool_common_cuda_reg_t *reg_ptr = &rget_reg;
|
||||
int btl_ownership;
|
||||
int rc, done;
|
||||
void *remote_memory_address;
|
||||
size_t offset;
|
||||
mca_btl_smcuda_frag_t* frag = (mca_btl_smcuda_frag_t*)descriptor;
|
||||
mca_btl_smcuda_frag_t *frag;
|
||||
|
||||
/* NTH: copied from old prepare_dst function */
|
||||
MCA_BTL_SMCUDA_FRAG_ALLOC_USER(frag);
|
||||
if(OPAL_UNLIKELY(NULL == frag)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* shove all the info needed for completion callbacks into the fragment */
|
||||
frag->segment.seg_len = size;
|
||||
frag->segment.seg_addr.pval = local_address;
|
||||
frag->base.des_segments = &frag->segment;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_flags = flags;
|
||||
frag->base.des_cbfunc = (mca_btl_base_completion_fn_t) cbfunc;
|
||||
frag->base.des_cbdata = cbdata;
|
||||
frag->base.des_context = cbcontext;
|
||||
frag->local_handle = local_handle;
|
||||
|
||||
/* Set to 0 for debugging since it is a list item but I am not
|
||||
* initializing it properly and it is annoying to see all the
|
||||
* garbage in the debugger. */
|
||||
|
||||
memset(&rget_reg, 0, sizeof(rget_reg));
|
||||
memcpy(&rget_reg.memHandle, src_seg->key, sizeof(src_seg->key));
|
||||
memcpy(&rget_reg.data.memHandle, remote_handle->reg_data.memHandle,
|
||||
sizeof(remote_handle->reg_data.memHandle));
|
||||
|
||||
/* Open the memory handle to the remote memory. If it is cached, then
|
||||
* we just retrieve it from cache and avoid a call to open the handle. That
|
||||
@ -1091,8 +1076,8 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t* btl,
|
||||
* remote memory which may lie somewhere in the middle. This is taken care of
|
||||
* a few lines down. Note that we hand in the peer rank just for debugging
|
||||
* support. */
|
||||
rc = ep->mpool->mpool_register(ep->mpool, src_seg->memh_seg_addr.pval,
|
||||
src_seg->memh_seg_len, ep->peer_smp_rank,
|
||||
rc = ep->mpool->mpool_register(ep->mpool, remote_handle->reg_data.memh_seg_addr.pval,
|
||||
remote_handle->reg_data.memh_seg_len, ep->peer_smp_rank,
|
||||
(mca_mpool_base_registration_t **)&reg_ptr);
|
||||
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
@ -1107,7 +1092,7 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t* btl,
|
||||
* not equal the address that was used to retrieve the block.
|
||||
* Therefore, compute the offset and add it to the address of the
|
||||
* memory handle. */
|
||||
offset = (unsigned char *)src_seg->base.seg_addr.lval - reg_ptr->base.base;
|
||||
offset = (size_t) ((intptr_t) remote_address - (intptr_t) reg_ptr->base.base);
|
||||
remote_memory_address = (unsigned char *)reg_ptr->base.alloc_base + offset;
|
||||
if (0 != offset) {
|
||||
opal_output(-1, "OFFSET=%d", (int)offset);
|
||||
@ -1120,8 +1105,7 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t* btl,
|
||||
* rget_reg, not reg_ptr, as we do not cache the event. */
|
||||
mca_common_wait_stream_synchronize(&rget_reg);
|
||||
|
||||
rc = mca_common_cuda_memcpy((void *)(uintptr_t) dst_seg->base.seg_addr.lval,
|
||||
remote_memory_address, dst_seg->base.seg_len,
|
||||
rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size,
|
||||
"mca_btl_smcuda_get", (mca_btl_base_descriptor_t *)frag,
|
||||
&done);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
@ -1133,17 +1117,8 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t* btl,
|
||||
}
|
||||
|
||||
if (OPAL_UNLIKELY(1 == done)) {
|
||||
/* This should only be true when experimenting with synchronous copies. */
|
||||
btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||
if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) {
|
||||
frag->base.des_cbfunc(&mca_btl_smcuda.super,
|
||||
frag->endpoint, &frag->base,
|
||||
OPAL_SUCCESS);
|
||||
}
|
||||
|
||||
if (btl_ownership) {
|
||||
mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag);
|
||||
}
|
||||
cbfunc (btl, ep, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);
|
||||
mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag);
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
@ -1208,7 +1183,7 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t* b
|
||||
frag->endpoint = endpoint;
|
||||
ctrlhdr.ctag = IPC_REQ;
|
||||
ctrlhdr.cudev = mydevnum;
|
||||
memcpy(frag->segment.base.seg_addr.pval, &ctrlhdr, sizeof(struct ctrlhdr_st));
|
||||
memcpy(frag->segment.seg_addr.pval, &ctrlhdr, sizeof(struct ctrlhdr_st));
|
||||
|
||||
MCA_BTL_SMCUDA_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(frag);
|
||||
/* write the fragment pointer to the FIFO */
|
||||
|
@ -1,3 +1,4 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -11,7 +12,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
|
||||
* Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010 Los Alamos National Security, LLC.
|
||||
* Copyright (c) 2010-2015 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -438,7 +439,6 @@ extern int mca_btl_smcuda_free(
|
||||
struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
@ -481,19 +481,11 @@ extern int mca_btl_smcuda_send(
|
||||
/**
|
||||
* Remote get using device memory.
|
||||
*/
|
||||
extern int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* ep,
|
||||
struct mca_btl_base_descriptor_t* descriptor);
|
||||
|
||||
extern struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_dst(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags);
|
||||
int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *ep, void *local_address,
|
||||
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
/* CUDA IPC control message tags */
|
||||
enum ipcCtrlMsg {
|
||||
|
@ -180,7 +180,7 @@ static int smcuda_register(void)
|
||||
mca_btl_smcuda.super.btl_rdma_pipeline_frag_size = 64*1024;
|
||||
mca_btl_smcuda.super.btl_min_rdma_pipeline_size = 64*1024;
|
||||
mca_btl_smcuda.super.btl_flags = MCA_BTL_FLAGS_SEND;
|
||||
mca_btl_smcuda.super.btl_seg_size = sizeof (mca_btl_smcuda_segment_t);
|
||||
mca_btl_smcuda.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
|
||||
mca_btl_smcuda.super.btl_bandwidth = 9000; /* Mbs */
|
||||
mca_btl_smcuda.super.btl_latency = 1; /* Microsecs */
|
||||
|
||||
@ -655,7 +655,7 @@ static void mca_btl_smcuda_send_cuda_ipc_ack(struct mca_btl_base_module_t* btl,
|
||||
frag->hdr->tag = MCA_BTL_TAG_SMCUDA;
|
||||
frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
|
||||
frag->endpoint = endpoint;
|
||||
memcpy(frag->segment.base.seg_addr.pval, &ctrlhdr, sizeof(struct ctrlhdr_st));
|
||||
memcpy(frag->segment.seg_addr.pval, &ctrlhdr, sizeof(struct ctrlhdr_st));
|
||||
|
||||
/* write the fragment pointer to the FIFO */
|
||||
/*
|
||||
@ -691,7 +691,7 @@ static void btl_smcuda_control(mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint;
|
||||
mca_btl_smcuda_t *smcuda_btl = (mca_btl_smcuda_t *)btl;
|
||||
mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des;
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
|
||||
/* Use the rank of the peer that sent the data to get to the endpoint
|
||||
* structure. This is needed for PML callback. */
|
||||
@ -1065,8 +1065,8 @@ int mca_btl_smcuda_component_progress(void)
|
||||
reg = mca_btl_base_active_message_trigger + hdr->tag;
|
||||
seg.seg_addr.pval = ((char *)hdr) + sizeof(mca_btl_smcuda_hdr_t);
|
||||
seg.seg_len = hdr->len;
|
||||
Frag.base.des_local_count = 1;
|
||||
Frag.base.des_local = &seg;
|
||||
Frag.base.des_segment_count = 1;
|
||||
Frag.base.des_segments = &seg;
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
Frag.hdr = hdr; /* needed for peer rank in control messages */
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
@ -1134,20 +1134,16 @@ int mca_btl_smcuda_component_progress(void)
|
||||
* completed. If so, issue the PML callbacks on the fragments.
|
||||
*/
|
||||
while (1 == progress_one_cuda_ipc_event((mca_btl_base_descriptor_t **)&frag)) {
|
||||
int btl_ownership;
|
||||
btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||
if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) {
|
||||
frag->base.des_cbfunc(&mca_btl_smcuda.super,
|
||||
frag->endpoint, &frag->base,
|
||||
OPAL_SUCCESS);
|
||||
}
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc = (mca_btl_base_rdma_completion_fn_t) frag->base.des_cbfunc;
|
||||
|
||||
if (btl_ownership) {
|
||||
if(frag->registration != NULL) {
|
||||
frag->endpoint->mpool->mpool_deregister(frag->endpoint->mpool,
|
||||
(mca_mpool_base_registration_t*)frag->registration);
|
||||
frag->registration = NULL;
|
||||
}
|
||||
cbfunc (&mca_btl_smcuda.super, frag->endpoint, frag->segment.seg_addr.pval,
|
||||
frag->local_handle, frag->base.des_context, frag->base.des_cbdata,
|
||||
OPAL_SUCCESS);
|
||||
|
||||
if(frag->registration != NULL) {
|
||||
frag->endpoint->mpool->mpool_deregister(frag->endpoint->mpool,
|
||||
(mca_mpool_base_registration_t*)frag->registration);
|
||||
frag->registration = NULL;
|
||||
MCA_BTL_SMCUDA_FRAG_RETURN(frag);
|
||||
}
|
||||
nevents++;
|
||||
|
@ -1,3 +1,4 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -11,6 +12,8 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -27,13 +30,13 @@ static inline void mca_btl_smcuda_frag_common_constructor(mca_btl_smcuda_frag_t*
|
||||
if(frag->hdr != NULL) {
|
||||
frag->hdr->frag = (mca_btl_smcuda_frag_t*)((uintptr_t)frag |
|
||||
MCA_BTL_SMCUDA_FRAG_ACK);
|
||||
frag->segment.base.seg_addr.pval = ((char*)frag->hdr) +
|
||||
frag->segment.seg_addr.pval = ((char*)frag->hdr) +
|
||||
sizeof(mca_btl_smcuda_hdr_t);
|
||||
frag->hdr->my_smp_rank = mca_btl_smcuda_component.my_smp_rank;
|
||||
}
|
||||
frag->segment.base.seg_len = frag->size;
|
||||
frag->base.des_local = &frag->segment.base;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->segment.seg_len = frag->size;
|
||||
frag->base.des_segments = &frag->segment;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_flags = 0;
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
frag->registration = NULL;
|
||||
|
@ -1,3 +1,4 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -12,6 +13,8 @@
|
||||
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -27,6 +30,9 @@
|
||||
#include "opal_config.h"
|
||||
#include "btl_smcuda.h"
|
||||
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
#include "opal/mca/common/cuda/common_cuda.h"
|
||||
#endif
|
||||
|
||||
#define MCA_BTL_SMCUDA_FRAG_TYPE_MASK ((uintptr_t)0x3)
|
||||
#define MCA_BTL_SMCUDA_FRAG_SEND ((uintptr_t)0x0)
|
||||
@ -46,6 +52,12 @@ struct mca_btl_smcuda_hdr_t {
|
||||
};
|
||||
typedef struct mca_btl_smcuda_hdr_t mca_btl_smcuda_hdr_t;
|
||||
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
struct mca_btl_base_registration_handle_t {
|
||||
mca_mpool_common_cuda_reg_data_t reg_data;
|
||||
};
|
||||
#endif
|
||||
|
||||
struct mca_btl_smcuda_segment_t {
|
||||
mca_btl_base_segment_t base;
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
@ -63,10 +75,11 @@ typedef struct mca_btl_smcuda_segment_t mca_btl_smcuda_segment_t;
|
||||
*/
|
||||
struct mca_btl_smcuda_frag_t {
|
||||
mca_btl_base_descriptor_t base;
|
||||
mca_btl_smcuda_segment_t segment;
|
||||
mca_btl_base_segment_t segment;
|
||||
struct mca_btl_base_endpoint_t *endpoint;
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
struct mca_mpool_base_registration_t *registration;
|
||||
struct mca_btl_base_registration_handle_t *local_handle;
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
size_t size;
|
||||
/* pointer written to the FIFO, this is the base of the shared memory region */
|
||||
|
@ -42,7 +42,6 @@ mca_btl_tcp_module_t mca_btl_tcp_module = {
|
||||
.btl_alloc = mca_btl_tcp_alloc,
|
||||
.btl_free = mca_btl_tcp_free,
|
||||
.btl_prepare_src = mca_btl_tcp_prepare_src,
|
||||
.btl_prepare_dst = mca_btl_tcp_prepare_dst,
|
||||
.btl_send = mca_btl_tcp_send,
|
||||
.btl_put = mca_btl_tcp_put,
|
||||
.btl_dump = mca_btl_base_dump,
|
||||
@ -170,8 +169,8 @@ mca_btl_base_descriptor_t* mca_btl_tcp_alloc(
|
||||
frag->segments[0].seg_len = size;
|
||||
frag->segments[0].seg_addr.pval = frag+1;
|
||||
|
||||
frag->base.des_local = frag->segments;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_segments = frag->segments;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_flags = flags;
|
||||
frag->base.order = MCA_BTL_NO_ORDER;
|
||||
frag->btl = (mca_btl_tcp_module_t*)btl;
|
||||
@ -202,7 +201,6 @@ int mca_btl_tcp_free(
|
||||
mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
@ -238,7 +236,7 @@ mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src(
|
||||
frag->segments[0].seg_addr.pval = (frag + 1);
|
||||
frag->segments[0].seg_len = reserve;
|
||||
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_segment_count = 1;
|
||||
if(opal_convertor_need_buffers(convertor)) {
|
||||
|
||||
if (max_data + reserve > frag->size) {
|
||||
@ -268,66 +266,16 @@ mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src(
|
||||
|
||||
frag->segments[1].seg_addr.pval = iov.iov_base;
|
||||
frag->segments[1].seg_len = max_data;
|
||||
frag->base.des_local_count = 2;
|
||||
frag->base.des_segment_count = 2;
|
||||
}
|
||||
|
||||
frag->base.des_local = frag->segments;
|
||||
frag->base.des_remote = NULL;
|
||||
frag->base.des_remote_count = 0;
|
||||
frag->base.des_segments = frag->segments;
|
||||
frag->base.des_flags = flags;
|
||||
frag->base.order = MCA_BTL_NO_ORDER;
|
||||
*size = max_data;
|
||||
return &frag->base;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Prepare a descriptor for send/rdma using the supplied
|
||||
* convertor. If the convertor references data that is contiguous,
|
||||
* the descriptor may simply point to the user buffer. Otherwise,
|
||||
* this routine is responsible for allocating buffer space and
|
||||
* packing if required.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL peer addressing
|
||||
* @param convertor (IN) Data type convertor
|
||||
* @param reserve (IN) Additional bytes requested by upper layer to precede user data
|
||||
* @param size (IN/OUT) Number of bytes to prepare (IN), number of bytes actually prepared (OUT)
|
||||
*/
|
||||
|
||||
mca_btl_base_descriptor_t* mca_btl_tcp_prepare_dst(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags)
|
||||
{
|
||||
mca_btl_tcp_frag_t* frag;
|
||||
|
||||
if( OPAL_UNLIKELY((*size) > UINT32_MAX) ) { /* limit the size to what we support */
|
||||
*size = (size_t)UINT32_MAX;
|
||||
}
|
||||
MCA_BTL_TCP_FRAG_ALLOC_USER(frag);
|
||||
if( OPAL_UNLIKELY(NULL == frag) ) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
frag->segments->seg_len = *size;
|
||||
opal_convertor_get_current_pointer( convertor, (void**)&(frag->segments->seg_addr.pval) );
|
||||
|
||||
frag->base.des_remote = NULL;
|
||||
frag->base.des_remote_count = 0;
|
||||
frag->base.des_local = frag->segments;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_flags = flags;
|
||||
frag->base.order = MCA_BTL_NO_ORDER;
|
||||
return &frag->base;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous send.
|
||||
*
|
||||
@ -355,7 +303,7 @@ int mca_btl_tcp_send( struct mca_btl_base_module_t* btl,
|
||||
frag->iov[0].iov_base = (IOVBASE_TYPE*)&frag->hdr;
|
||||
frag->iov[0].iov_len = sizeof(frag->hdr);
|
||||
frag->hdr.size = 0;
|
||||
for( i = 0; i < (int)frag->base.des_local_count; i++) {
|
||||
for( i = 0; i < (int)frag->base.des_segment_count; i++) {
|
||||
frag->hdr.size += frag->segments[i].seg_len;
|
||||
frag->iov[i+1].iov_len = frag->segments[i].seg_len;
|
||||
frag->iov[i+1].iov_base = (IOVBASE_TYPE*)frag->segments[i].seg_addr.pval;
|
||||
@ -368,23 +316,55 @@ int mca_btl_tcp_send( struct mca_btl_base_module_t* btl,
|
||||
return mca_btl_tcp_endpoint_send(endpoint,frag);
|
||||
}
|
||||
|
||||
/**
 * Descriptor-completion adapter used to emulate RDMA completion over TCP.
 *
 * Reinterprets the completed descriptor as a TCP fragment and forwards the
 * completion to the RDMA-style callback stored in frag->cb. The callback
 * receives the address of the fragment's first segment as the local address,
 * NULL as the local registration handle, the saved context and data, and the
 * completion status.
 *
 * @param btl      (IN) BTL module, passed through to the stored callback
 * @param endpoint (IN) BTL endpoint, passed through to the stored callback
 * @param desc     (IN) Completed descriptor; cast to mca_btl_tcp_frag_t
 * @param rc       (IN) Completion status, passed through to the stored callback
 *
 * NOTE(review): frag->cb.func is invoked unconditionally; presumably every
 * fragment using this adapter has a non-NULL callback -- confirm with callers.
 */
static void fake_rdma_complete (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
|
||||
mca_btl_base_descriptor_t *desc, int rc)
|
||||
{
|
||||
mca_btl_tcp_frag_t *frag = (mca_btl_tcp_frag_t *) desc;
|
||||
|
||||
frag->cb.func (btl, endpoint, frag->segments[0].seg_addr.pval, NULL, frag->cb.context, frag->cb.data,
|
||||
rc);
|
||||
}
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous put.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
|
||||
int mca_btl_tcp_put( mca_btl_base_module_t* btl,
|
||||
mca_btl_base_endpoint_t* endpoint,
|
||||
mca_btl_base_descriptor_t* descriptor )
|
||||
int mca_btl_tcp_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*) btl;
|
||||
mca_btl_tcp_frag_t* frag = (mca_btl_tcp_frag_t*)descriptor;
|
||||
mca_btl_tcp_frag_t *frag = NULL;
|
||||
int i;
|
||||
|
||||
MCA_BTL_TCP_FRAG_ALLOC_USER(frag);
|
||||
if( OPAL_UNLIKELY(NULL == frag) ) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;;
|
||||
}
|
||||
|
||||
frag->endpoint = endpoint;
|
||||
|
||||
frag->segments->seg_len = size;
|
||||
frag->segments->seg_addr.pval = local_address;
|
||||
|
||||
frag->base.des_segments = frag->segments;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.order = MCA_BTL_NO_ORDER;
|
||||
|
||||
frag->segments[0].seg_addr.pval = local_address;
|
||||
frag->segments[0].seg_len = size;
|
||||
|
||||
frag->segments[1].seg_addr.lval = remote_address;
|
||||
frag->segments[1].seg_len = size;
|
||||
|
||||
frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||
frag->base.des_cbfunc = fake_rdma_complete;
|
||||
|
||||
frag->cb.func = cbfunc;
|
||||
frag->cb.data = cbdata;
|
||||
frag->cb.context = cbcontext;
|
||||
|
||||
frag->btl = tcp_btl;
|
||||
frag->endpoint = endpoint;
|
||||
frag->rc = 0;
|
||||
@ -394,9 +374,9 @@ int mca_btl_tcp_put( mca_btl_base_module_t* btl,
|
||||
frag->iov_ptr = frag->iov;
|
||||
frag->iov[0].iov_base = (IOVBASE_TYPE*)&frag->hdr;
|
||||
frag->iov[0].iov_len = sizeof(frag->hdr);
|
||||
frag->iov[1].iov_base = (IOVBASE_TYPE*)frag->base.des_remote;
|
||||
frag->iov[1].iov_len = frag->base.des_remote_count * sizeof(mca_btl_base_segment_t);
|
||||
for( i = 0; i < (int)frag->base.des_local_count; i++ ) {
|
||||
frag->iov[1].iov_base = (IOVBASE_TYPE*) (frag->segments + 1);
|
||||
frag->iov[1].iov_len = sizeof(mca_btl_base_segment_t);
|
||||
for( i = 0; i < (int)frag->base.des_segment_count; i++ ) {
|
||||
frag->hdr.size += frag->segments[i].seg_len;
|
||||
frag->iov[i+2].iov_len = frag->segments[i].seg_len;
|
||||
frag->iov[i+2].iov_base = (IOVBASE_TYPE*)frag->segments[i].seg_addr.pval;
|
||||
@ -404,7 +384,7 @@ int mca_btl_tcp_put( mca_btl_base_module_t* btl,
|
||||
}
|
||||
frag->hdr.base.tag = MCA_BTL_TAG_BTL;
|
||||
frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_PUT;
|
||||
frag->hdr.count = frag->base.des_remote_count;
|
||||
frag->hdr.count = 1;
|
||||
if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr);
|
||||
return ((i = mca_btl_tcp_endpoint_send(endpoint,frag)) >= 0 ? OPAL_SUCCESS : i);
|
||||
}
|
||||
@ -412,22 +392,46 @@ int mca_btl_tcp_put( mca_btl_base_module_t* btl,
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous get.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*
|
||||
*/
|
||||
|
||||
int mca_btl_tcp_get(
|
||||
mca_btl_base_module_t* btl,
|
||||
mca_btl_base_endpoint_t* endpoint,
|
||||
mca_btl_base_descriptor_t* descriptor)
|
||||
int mca_btl_tcp_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*) btl;
|
||||
mca_btl_tcp_frag_t* frag = (mca_btl_tcp_frag_t*)descriptor;
|
||||
mca_btl_tcp_frag_t* frag = NULL;
|
||||
int rc;
|
||||
|
||||
MCA_BTL_TCP_FRAG_ALLOC_USER(frag);
|
||||
if( OPAL_UNLIKELY(NULL == frag) ) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;;
|
||||
}
|
||||
|
||||
frag->endpoint = endpoint;
|
||||
|
||||
frag->segments->seg_len = size;
|
||||
frag->segments->seg_addr.pval = local_address;
|
||||
|
||||
frag->base.des_segments = frag->segments;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.order = MCA_BTL_NO_ORDER;
|
||||
|
||||
frag->segments[0].seg_addr.pval = local_address;
|
||||
frag->segments[0].seg_len = size;
|
||||
|
||||
frag->segments[1].seg_addr.lval = remote_address;
|
||||
frag->segments[1].seg_len = size;
|
||||
|
||||
/* call the rdma callback through the descriptor callback. this is
|
||||
* tcp so the extra latency is not an issue */
|
||||
frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||
frag->base.des_cbfunc = fake_rdma_complete;
|
||||
|
||||
frag->cb.func = cbfunc;
|
||||
frag->cb.data = cbdata;
|
||||
frag->cb.context = cbcontext;
|
||||
|
||||
frag->btl = tcp_btl;
|
||||
frag->endpoint = endpoint;
|
||||
frag->rc = 0;
|
||||
@ -437,11 +441,11 @@ int mca_btl_tcp_get(
|
||||
frag->iov_ptr = frag->iov;
|
||||
frag->iov[0].iov_base = (IOVBASE_TYPE*)&frag->hdr;
|
||||
frag->iov[0].iov_len = sizeof(frag->hdr);
|
||||
frag->iov[1].iov_base = (IOVBASE_TYPE*)frag->base.des_remote;
|
||||
frag->iov[1].iov_len = frag->base.des_remote_count * sizeof(mca_btl_base_segment_t);
|
||||
frag->iov[1].iov_base = (IOVBASE_TYPE*) &frag->segments[1];
|
||||
frag->iov[1].iov_len = sizeof(mca_btl_base_segment_t);
|
||||
frag->hdr.base.tag = MCA_BTL_TAG_BTL;
|
||||
frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_GET;
|
||||
frag->hdr.count = frag->base.des_remote_count;
|
||||
frag->hdr.count = 1;
|
||||
if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr);
|
||||
return ((rc = mca_btl_tcp_endpoint_send(endpoint,frag)) >= 0 ? OPAL_SUCCESS : rc);
|
||||
}
|
||||
|
@ -1,3 +1,4 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -12,6 +13,8 @@
|
||||
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -52,7 +55,7 @@ BEGIN_C_DECLS
|
||||
*/
|
||||
|
||||
struct mca_btl_tcp_component_t {
|
||||
mca_btl_base_component_2_0_0_t super; /**< base BTL component */
|
||||
mca_btl_base_component_3_0_0_t super; /**< base BTL component */
|
||||
uint32_t tcp_addr_count; /**< total number of addresses */
|
||||
uint32_t tcp_num_btls; /**< number of interfaces available to the TCP component */
|
||||
unsigned int tcp_num_links; /**< number of logical links per physical device */
|
||||
@ -217,32 +220,22 @@ extern int mca_btl_tcp_send(
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous put.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
|
||||
extern int mca_btl_tcp_put(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* btl_peer,
|
||||
struct mca_btl_base_descriptor_t* decriptor
|
||||
);
|
||||
int mca_btl_tcp_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous get.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
|
||||
extern int mca_btl_tcp_get(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* btl_peer,
|
||||
struct mca_btl_base_descriptor_t* decriptor
|
||||
);
|
||||
int mca_btl_tcp_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
/**
|
||||
* Allocate a descriptor with a segment of the requested size.
|
||||
@ -290,7 +283,6 @@ extern int mca_btl_tcp_free(
|
||||
mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* peer,
|
||||
struct mca_mpool_base_registration_t*,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
@ -298,16 +290,6 @@ mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src(
|
||||
uint32_t flags
|
||||
);
|
||||
|
||||
extern mca_btl_base_descriptor_t* mca_btl_tcp_prepare_dst(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* peer,
|
||||
struct mca_mpool_base_registration_t*,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags);
|
||||
|
||||
|
||||
/**
|
||||
* Fault Tolerance Event Notification Function
|
||||
|
@ -270,7 +270,7 @@ static int mca_btl_tcp_component_register(void)
|
||||
MCA_BTL_FLAGS_NEED_CSUM |
|
||||
MCA_BTL_FLAGS_NEED_ACK |
|
||||
MCA_BTL_FLAGS_HETEROGENEOUS_RDMA;
|
||||
mca_btl_tcp_module.super.btl_seg_size = sizeof (mca_btl_base_segment_t);
|
||||
|
||||
mca_btl_tcp_module.super.btl_bandwidth = 100;
|
||||
mca_btl_tcp_module.super.btl_latency = 100;
|
||||
|
||||
|
@ -58,6 +58,12 @@ struct mca_btl_tcp_frag_t {
|
||||
size_t size;
|
||||
int rc;
|
||||
ompi_free_list_t* my_list;
|
||||
/* fake rdma completion */
|
||||
struct {
|
||||
mca_btl_base_rdma_completion_fn_t func;
|
||||
void *data;
|
||||
void *context;
|
||||
} cb;
|
||||
};
|
||||
typedef struct mca_btl_tcp_frag_t mca_btl_tcp_frag_t;
|
||||
OBJ_CLASS_DECLARATION(mca_btl_tcp_frag_t);
|
||||
@ -116,10 +122,8 @@ do { \
|
||||
frag->iov_cnt = 1; \
|
||||
frag->iov_idx = 0; \
|
||||
frag->iov_ptr = frag->iov; \
|
||||
frag->base.des_remote = NULL; \
|
||||
frag->base.des_remote_count = 0; \
|
||||
frag->base.des_local = frag->segments; \
|
||||
frag->base.des_local_count = 1; \
|
||||
frag->base.des_segments = frag->segments; \
|
||||
frag->base.des_segment_count = 1; \
|
||||
} while(0)
|
||||
|
||||
|
||||
|
@ -10,7 +10,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -43,9 +43,11 @@ mca_btl_template_module_t mca_btl_template_module = {
|
||||
.btl_alloc = mca_btl_template_alloc,
|
||||
.btl_free = mca_btl_template_free,
|
||||
.btl_prepare_src = mca_btl_template_prepare_src,
|
||||
.btl_prepare_dst = mca_btl_template_prepare_dst,
|
||||
.btl_send = mca_btl_template_send,
|
||||
.btl_put = mca_btl_template_put,
|
||||
.btl_get = mca_btl_template_get,
|
||||
.btl_register_mem = mca_btl_template_register_mem,
|
||||
.btl_deregister_mem = mca_btl_template_deregister_mem,
|
||||
.btl_ft_event = mca_btl_template_ft_event
|
||||
}
|
||||
};
|
||||
@ -206,7 +208,6 @@ int mca_btl_template_free(
|
||||
mca_btl_base_descriptor_t* mca_btl_template_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
@ -270,49 +271,8 @@ mca_btl_base_descriptor_t* mca_btl_template_prepare_src(
|
||||
frag->segment.seg_len = max_data + reserve;
|
||||
}
|
||||
|
||||
frag->base.des_local = &frag->segment;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_flags = 0;
|
||||
return &frag->base;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Prepare a descriptor for send/rdma using the supplied
|
||||
* convertor. If the convertor references data that is contiguous,
|
||||
* the descriptor may simply point to the user buffer. Otherwise,
|
||||
* this routine is responsible for allocating buffer space and
|
||||
* packing if required.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL peer addressing
|
||||
* @param convertor (IN) Data type convertor
|
||||
* @param reserve (IN) Additional bytes requested by upper layer to precede user data
|
||||
* @param size (IN/OUT) Number of bytes to prepare (IN), number of bytes actually prepared (OUT)
|
||||
*/
|
||||
|
||||
mca_btl_base_descriptor_t* mca_btl_template_prepare_dst(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags)
|
||||
{
|
||||
mca_btl_template_frag_t* frag;
|
||||
|
||||
MCA_BTL_TEMPLATE_FRAG_ALLOC_USER(btl, frag);
|
||||
if(OPAL_UNLIKELY(NULL == frag)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
frag->segment.seg_len = *size;
|
||||
opal_convertor_get_current_pointer( convertor, (void**)&(frag->segment.seg_addr.pval) );
|
||||
|
||||
frag->base.des_local = &frag->segment;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_segments = &frag->segment;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_flags = 0;
|
||||
return &frag->base;
|
||||
}
|
||||
@ -350,14 +310,13 @@ int mca_btl_template_send(
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
|
||||
int mca_btl_template_put(
|
||||
mca_btl_base_module_t* btl,
|
||||
mca_btl_base_endpoint_t* endpoint,
|
||||
mca_btl_base_descriptor_t* descriptor)
|
||||
int mca_btl_template_put (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
/* mca_btl_template_module_t* template_btl = (mca_btl_template_module_t*) btl; */
|
||||
mca_btl_template_frag_t* frag = (mca_btl_template_frag_t*) descriptor;
|
||||
frag->endpoint = endpoint;
|
||||
/* TODO */
|
||||
return OPAL_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
@ -372,18 +331,64 @@ int mca_btl_template_put(
|
||||
*
|
||||
*/
|
||||
|
||||
int mca_btl_template_get(
|
||||
mca_btl_base_module_t* btl,
|
||||
mca_btl_base_endpoint_t* endpoint,
|
||||
mca_btl_base_descriptor_t* descriptor)
|
||||
int mca_btl_template_get (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
/* mca_btl_template_module_t* template_btl = (mca_btl_template_module_t*) btl; */
|
||||
mca_btl_template_frag_t* frag = (mca_btl_template_frag_t*) descriptor;
|
||||
frag->endpoint = endpoint;
|
||||
/* TODO */
|
||||
return OPAL_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Register a memory region for put/get/atomic operations.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint(IN) BTL addressing information (or NULL for all endpoints)
|
||||
* @param base (IN) Pointer to start of region
|
||||
* @param size (IN) Size of region
|
||||
* @param flags (IN) Flags indicating what operation will be performed. Valid
|
||||
* values are MCA_BTL_DES_FLAGS_PUT, MCA_BTL_DES_FLAGS_GET,
|
||||
* and MCA_BTL_DES_FLAGS_ATOMIC
|
||||
*
|
||||
* @returns a memory registration handle valid for both local and remote operations
|
||||
* @returns NULL if the region could not be registered
|
||||
*
|
||||
* This function registers the specified region with the hardware for use with
|
||||
* the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop
|
||||
* functions. Care should be taken to not hold an excessive number of registrations
|
||||
* as they may use limited system/NIC resources.
|
||||
*/
|
||||
struct mca_btl_base_registration_handle_t *mca_btl_template_register_mem (
|
||||
struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base,
|
||||
size_t size, uint32_t flags)
|
||||
{
|
||||
/* mca_btl_template_module_t* template_btl = (mca_btl_template_module_t*) btl; */
|
||||
/* TODO: registration is not implemented in this template. All arguments
 * are currently ignored and NULL is always returned. */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Deregister a memory region
|
||||
*
|
||||
* @param btl (IN) BTL module region was registered with
|
||||
* @param handle (IN) BTL registration handle to deregister
|
||||
*
|
||||
* This function deregisters the memory region associated with the specified handle. Care
|
||||
* should be taken to not perform any RDMA or atomic operation on this memory region
|
||||
* after it is deregistered. It is erroneous to specify a memory handle associated with
|
||||
* a remote node.
|
||||
*/
|
||||
int mca_btl_template_deregister_mem (struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_registration_handle_t *handle)
|
||||
{
|
||||
/* mca_btl_template_module_t* template_btl = (mca_btl_template_module_t*) btl; */
|
||||
/* TODO: deregistration is not implemented in this template. Both arguments
 * are currently ignored and OPAL_ERR_NOT_IMPLEMENTED is always returned. */
|
||||
return OPAL_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Cleanup/release module resources.
|
||||
|
@ -1,3 +1,4 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -9,6 +10,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -41,7 +44,7 @@ BEGIN_C_DECLS
|
||||
*/
|
||||
|
||||
struct mca_btl_template_component_t {
|
||||
mca_btl_base_component_2_0_0_t super; /**< base BTL component */
|
||||
mca_btl_base_component_3_0_0_t super; /**< base BTL component */
|
||||
|
||||
uint32_t template_num_btls;
|
||||
/**< number of hcas available to the TEMPLATE component */
|
||||
@ -187,32 +190,114 @@ extern int mca_btl_template_send(
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous put.
|
||||
* Completion Semantics: if this function returns a 1 then the operation
|
||||
* is complete. a return of OPAL_SUCCESS indicates
|
||||
* the put operation has been queued with the
|
||||
* network. the local_handle can not be deregistered
|
||||
* until all outstanding operations on that handle
|
||||
* have been completed.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param local_address (IN) Local address to put from (registered)
|
||||
* @param remote_address (IN) Remote address to put to (registered remotely)
|
||||
* @param local_handle (IN) Registration handle for region containing
|
||||
* (local_address, local_address + size)
|
||||
* @param remote_handle (IN) Remote registration handle for region containing
|
||||
* (remote_address, remote_address + size)
|
||||
* @param size (IN) Number of bytes to put
|
||||
* @param flags (IN) Flags for this put operation
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion (if queued)
|
||||
* @param cbcontext (IN) Context for the callback
|
||||
* @param cbdata (IN) Data for callback
|
||||
*
|
||||
* @retval OPAL_SUCCESS The descriptor was successfully queued for a put
|
||||
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a put
|
||||
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put
|
||||
* operation. Try again later
|
||||
* @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or
|
||||
* alignment restrictions.
|
||||
*/
|
||||
|
||||
extern int mca_btl_template_put(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* btl_peer,
|
||||
struct mca_btl_base_descriptor_t* decriptor
|
||||
);
|
||||
|
||||
int mca_btl_template_put (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous get.
|
||||
* Completion Semantics: if this function returns a 1 then the operation
|
||||
* is complete. a return of OPAL_SUCCESS indicates
|
||||
* the get operation has been queued with the
|
||||
* network. the local_handle can not be deregistered
|
||||
* until all outstanding operations on that handle
|
||||
* have been completed.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param local_address (IN) Local address to get into (registered)
|
||||
* @param remote_address (IN) Remote address to get from (registered remotely)
|
||||
* @param local_handle (IN) Registration handle for region containing
|
||||
* (local_address, local_address + size)
|
||||
* @param remote_handle (IN) Remote registration handle for region containing
|
||||
* (remote_address, remote_address + size)
|
||||
* @param size (IN) Number of bytes to get
|
||||
* @param flags (IN) Flags for this get operation
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion (if queued)
|
||||
* @param cbcontext (IN) Context for the callback
|
||||
* @param cbdata (IN) Data for callback
|
||||
*
|
||||
* @retval OPAL_SUCCESS The descriptor was successfully queued for a get
|
||||
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a get
|
||||
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the get
|
||||
* operation. Try again later
|
||||
* @retval OPAL_ERR_NOT_AVAILABLE Get can not be performed due to size or
|
||||
* alignment restrictions.
|
||||
*/
|
||||
int mca_btl_template_get (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
/**
|
||||
* @brief Register a memory region for put/get/atomic operations.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
* @param endpoint (IN) BTL addressing information (or NULL for all endpoints)
|
||||
* @param base (IN) Pointer to start of region
|
||||
* @param size (IN) Size of region
|
||||
* @param flags (IN) Flags indicating what operation will be performed. Valid
|
||||
* values are MCA_BTL_DES_FLAGS_PUT, MCA_BTL_DES_FLAGS_GET,
|
||||
* and MCA_BTL_DES_FLAGS_ATOMIC
|
||||
*
|
||||
* @returns a memory registration handle valid for both local and remote operations
|
||||
* @returns NULL if the region could not be registered
|
||||
*
|
||||
* This function registers the specified region with the hardware for use with
|
||||
* the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop
|
||||
* functions. Care should be taken to not hold an excessive number of registrations
|
||||
* as they may use limited system/NIC resources.
|
||||
*/
|
||||
|
||||
extern int mca_btl_template_get(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* btl_peer,
|
||||
struct mca_btl_base_descriptor_t* decriptor
|
||||
);
|
||||
struct mca_btl_base_registration_handle_t *mca_btl_template_register_mem (
|
||||
struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base,
|
||||
size_t size, uint32_t flags);
|
||||
|
||||
/**
|
||||
* @brief Deregister a memory region
|
||||
*
|
||||
* @param btl (IN) BTL module region was registered with
|
||||
* @param handle (IN) BTL registration handle to deregister
|
||||
*
|
||||
* This function deregisters the memory region associated with the specified handle. Care
|
||||
* should be taken to not perform any RDMA or atomic operation on this memory region
|
||||
* after it is deregistered. It is erroneous to specify a memory handle associated with
|
||||
* a remote node.
|
||||
*/
|
||||
int mca_btl_template_deregister_mem (struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_registration_handle_t *handle);
|
||||
|
||||
/**
|
||||
* Register a callback function that is called on receipt
|
||||
@ -275,7 +360,6 @@ extern int mca_btl_template_free(
|
||||
mca_btl_base_descriptor_t* mca_btl_template_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* peer,
|
||||
struct mca_mpool_base_registration_t*,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
@ -283,16 +367,6 @@ mca_btl_base_descriptor_t* mca_btl_template_prepare_src(
|
||||
uint32_t flags
|
||||
);
|
||||
|
||||
extern mca_btl_base_descriptor_t* mca_btl_template_prepare_dst(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* peer,
|
||||
struct mca_mpool_base_registration_t*,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags);
|
||||
|
||||
/**
|
||||
* Fault Tolerance Event Notification Function
|
||||
* @param state Checkpoint State
|
||||
|
@ -39,7 +39,8 @@ ugni_SOURCES = \
|
||||
btl_ugni_smsg.h \
|
||||
btl_ugni_smsg.c \
|
||||
btl_ugni_progress_thread.c \
|
||||
btl_ugni_prepare.h
|
||||
btl_ugni_prepare.h \
|
||||
btl_ugni_atomic.c
|
||||
|
||||
mcacomponentdir = $(opallibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
@ -33,6 +33,7 @@
|
||||
#include "opal/mca/btl/base/btl_base_error.h"
|
||||
#include "opal/class/opal_hash_table.h"
|
||||
#include "opal/class/ompi_free_list.h"
|
||||
#include "opal/class/opal_free_list.h"
|
||||
#include "opal/mca/common/ugni/common_ugni.h"
|
||||
|
||||
#include <errno.h>
|
||||
@ -80,11 +81,16 @@ typedef struct mca_btl_ugni_module_t {
|
||||
opal_mutex_t eager_get_pending_lock;
|
||||
opal_list_t eager_get_pending;
|
||||
|
||||
opal_mutex_t pending_descriptors_lock;
|
||||
opal_list_t pending_descriptors;
|
||||
|
||||
ompi_free_list_t post_descriptors;
|
||||
|
||||
mca_mpool_base_module_t *smsg_mpool;
|
||||
ompi_free_list_t smsg_mboxes;
|
||||
|
||||
gni_ep_handle_t wildcard_ep;
|
||||
gni_ep_handle_t local_ep;
|
||||
struct mca_btl_base_endpoint_t *local_ep;
|
||||
|
||||
struct mca_btl_ugni_endpoint_attr_t wc_remote_attr, wc_local_attr;
|
||||
|
||||
@ -126,7 +132,7 @@ typedef struct mca_btl_ugni_module_t {
|
||||
|
||||
typedef struct mca_btl_ugni_component_t {
|
||||
/* base BTL component */
|
||||
mca_btl_base_component_2_0_0_t super;
|
||||
mca_btl_base_component_3_0_0_t super;
|
||||
|
||||
/* maximum supported btls. hardcoded to 1 for now */
|
||||
uint32_t ugni_max_btls;
|
||||
@ -143,8 +149,6 @@ typedef struct mca_btl_ugni_component_t {
|
||||
|
||||
/* After this message size switch to BTE protocols */
|
||||
size_t ugni_fma_limit;
|
||||
/* Switch to put when trying to GET at or above this size */
|
||||
size_t ugni_get_limit;
|
||||
/* Switch to get when sending above this size */
|
||||
size_t ugni_smsg_limit;
|
||||
|
||||
@ -267,33 +271,31 @@ mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl,
|
||||
uint32_t flags, mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t **descriptor);
|
||||
|
||||
/**
|
||||
* Initiate a get operation.
|
||||
*
|
||||
* location: btl_ugni_get.c
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
int
|
||||
mca_btl_ugni_get (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *des);
|
||||
int mca_btl_ugni_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
/**
|
||||
* Initiate a put operation.
|
||||
*
|
||||
* location: btl_ugni_put.c
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
int
|
||||
mca_btl_ugni_put (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *des);
|
||||
int mca_btl_ugni_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
/**
 * Initiate a network atomic operation on a remote 64-bit value.
 *
 * location: btl_ugni_atomic.c
 *
 * The implementation uses the non-fetching GNI_FMA_ATOMIC_* commands and
 * supplies no local buffer.
 *
 * @param btl            (IN) BTL module
 * @param endpoint       (IN) BTL addressing information
 * @param remote_address (IN) Remote address to operate on
 * @param remote_handle  (IN) Remote registration handle covering remote_address
 * @param op             (IN) Operation; the implementation maps ADD, AND, OR and XOR
 * @param operand        (IN) Operand for the operation
 * @param flags          (IN) Flags (not read by the implementation)
 * @param order          (IN) Ordering
 * @param cbfunc         (IN) Completion callback stored in the post descriptor
 * @param cbcontext      (IN) Context for the callback
 * @param cbdata         (IN) Data for the callback
 *
 * @retval OPAL_SUCCESS             The operation was posted
 * @retval OPAL_ERR_OUT_OF_RESOURCE No post descriptor was available or the post failed
 * @retval other                    Error from the endpoint state check
 */
int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
|
||||
mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
/**
 * Initiate a fetching network atomic operation on a remote 64-bit value.
 *
 * location: btl_ugni_atomic.c
 *
 * The implementation uses the GNI_FMA_ATOMIC_F* commands and passes
 * local_address/local_handle as the local side of the post descriptor
 * (presumably where the fetched value is delivered -- confirm against the
 * uGNI documentation).
 *
 * @param btl            (IN) BTL module
 * @param endpoint       (IN) BTL addressing information
 * @param local_address  (IN) Local address for the post descriptor
 * @param remote_address (IN) Remote address to operate on
 * @param local_handle   (IN) Registration handle covering local_address
 * @param remote_handle  (IN) Remote registration handle covering remote_address
 * @param op             (IN) Operation; the implementation maps ADD, AND, OR and XOR
 * @param operand        (IN) Operand for the operation
 * @param flags          (IN) Flags (not read by the implementation)
 * @param order          (IN) Ordering
 * @param cbfunc         (IN) Completion callback stored in the post descriptor
 * @param cbcontext      (IN) Context for the callback
 * @param cbdata         (IN) Data for the callback
 *
 * @retval OPAL_SUCCESS             The operation was posted
 * @retval OPAL_ERR_OUT_OF_RESOURCE No post descriptor was available or the post failed
 * @retval other                    Error from the endpoint state check
 */
int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
|
||||
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata);
|
||||
|
||||
/**
 * Initiate a network atomic compare-and-swap on a remote 64-bit value.
 *
 * location: btl_ugni_atomic.c
 *
 * The implementation posts GNI_FMA_ATOMIC_CSWAP with compare as the first
 * operand and value as the second. local_address/local_handle form the local
 * side of the post descriptor (presumably where the previous remote value is
 * delivered -- confirm against the uGNI documentation).
 *
 * @param btl            (IN) BTL module
 * @param endpoint       (IN) BTL addressing information
 * @param local_address  (IN) Local address for the post descriptor
 * @param remote_address (IN) Remote address to operate on
 * @param local_handle   (IN) Registration handle covering local_address
 * @param remote_handle  (IN) Remote registration handle covering remote_address
 * @param compare        (IN) Value to compare against
 * @param value          (IN) Value to store on a successful compare
 * @param flags          (IN) Flags (not read by the implementation)
 * @param order          (IN) Ordering
 * @param cbfunc         (IN) Completion callback stored in the post descriptor
 * @param cbcontext      (IN) Context for the callback
 * @param cbdata         (IN) Data for the callback
 *
 * @retval OPAL_SUCCESS             The operation was posted
 * @retval OPAL_ERR_OUT_OF_RESOURCE No post descriptor was available or the post failed
 * @retval other                    Error from the endpoint state check
 */
int mca_btl_ugni_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value,
|
||||
int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
int mca_btl_ugni_progress_send_wait_list (struct mca_btl_base_endpoint_t *endpoint);
|
||||
|
||||
@ -302,9 +304,14 @@ mca_btl_ugni_alloc(struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
uint8_t order, size_t size, uint32_t flags);
|
||||
|
||||
/* uGNI definition of the BTL registration handle; its size is what the module
 * reports as btl_registration_handle_size. */
struct mca_btl_base_registration_handle_t {
|
||||
/** uGNI memory handle: filled in by GNI_MemRegister and passed as the
 * local/remote memory handle when RDMA and atomic post descriptors are built */
|
||||
gni_mem_handle_t gni_handle;
|
||||
};
|
||||
|
||||
typedef struct mca_btl_ugni_reg_t {
|
||||
mca_mpool_base_registration_t base;
|
||||
gni_mem_handle_t memory_hdl;
|
||||
mca_btl_base_registration_handle_t handle;
|
||||
} mca_btl_ugni_reg_t;
|
||||
|
||||
/* Global structures */
|
||||
@ -321,5 +328,7 @@ static inline uint64_t mca_btl_ugni_proc_name_to_id (opal_process_name_t name) {
|
||||
int mca_btl_ugni_spawn_progress_thread(struct mca_btl_base_module_t* btl);
|
||||
int mca_btl_ugni_kill_progress_thread(void);
|
||||
|
||||
/** Number of times the progress thread has woken up */
|
||||
extern unsigned int mca_btl_ugni_progress_thread_wakeups;
|
||||
|
||||
#endif
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
@ -34,7 +34,6 @@ int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t **peers,
|
||||
opal_bitmap_t *reachable) {
|
||||
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
|
||||
opal_proc_t *my_proc = opal_proc_local_get();
|
||||
size_t i;
|
||||
int rc;
|
||||
void *mmap_start_addr;
|
||||
@ -61,26 +60,28 @@ int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl,
|
||||
}
|
||||
|
||||
for (i = 0 ; i < nprocs ; ++i) {
|
||||
struct opal_proc_t *ompi_proc = procs[i];
|
||||
uint64_t proc_id = mca_btl_ugni_proc_name_to_id(ompi_proc->proc_name);
|
||||
struct opal_proc_t *opal_proc = procs[i];
|
||||
uint64_t proc_id = mca_btl_ugni_proc_name_to_id(opal_proc->proc_name);
|
||||
|
||||
if (OPAL_PROC_ON_LOCAL_NODE(ompi_proc->proc_flags)) {
|
||||
if (OPAL_PROC_ON_LOCAL_NODE(opal_proc->proc_flags)) {
|
||||
ugni_module->nlocal_procs++;
|
||||
|
||||
/* Do not use uGNI to communicate with local procs unless we are adding more ranks.
|
||||
* Change this when sm and vader are updated to handle additional add procs. */
|
||||
if (!ugni_module->initialized || my_proc == ompi_proc) {
|
||||
continue;
|
||||
}
|
||||
/* ugni is allowed on local processes to provide support for network
|
||||
* atomic operations */
|
||||
}
|
||||
|
||||
/* Create and Init endpoints */
|
||||
rc = mca_btl_ugni_init_ep (ugni_module, peers + i, (mca_btl_ugni_module_t *) btl, ompi_proc);
|
||||
rc = mca_btl_ugni_init_ep (ugni_module, peers + i, (mca_btl_ugni_module_t *) btl, opal_proc);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
BTL_ERROR(("btl/ugni error initializing endpoint"));
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* go ahead and connect the local endpoint for RDMA/CQ write */
|
||||
if (opal_proc == opal_proc_local_get ()) {
|
||||
ugni_module->local_ep = peers[i];
|
||||
}
|
||||
|
||||
/* Add this endpoint to the pointer array. */
|
||||
BTL_VERBOSE(("initialized uGNI endpoint for proc id: 0x%" PRIx64 " ptr: %p", proc_id, (void *) peers[i]));
|
||||
opal_hash_table_set_value_uint64 (&ugni_module->id_to_endpoint, proc_id, peers[i]);
|
||||
@ -138,26 +139,6 @@ int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl,
|
||||
BTL_ERROR(("error creating remote SMSG CQ"));
|
||||
return opal_common_rc_ugni_to_opal (rc);
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
|
||||
rc = GNI_EpCreate (ugni_module->device->dev_handle, ugni_module->rdma_local_cq,
|
||||
&ugni_module->local_ep);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
BTL_ERROR(("error creating local ugni endpoint"));
|
||||
return opal_common_rc_ugni_to_opal (rc);
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
|
||||
rc = GNI_EpBind (ugni_module->local_ep,
|
||||
ugni_module->device->dev_addr,
|
||||
getpid());
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
BTL_ERROR(("error binding local ugni endpoint"));
|
||||
return opal_common_rc_ugni_to_opal (rc);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
rc = mca_btl_ugni_setup_mpools (ugni_module);
|
||||
@ -222,8 +203,8 @@ int mca_btl_ugni_del_procs (struct mca_btl_base_module_t *btl,
|
||||
}
|
||||
|
||||
for (i = 0 ; i < nprocs ; ++i) {
|
||||
struct opal_proc_t *ompi_proc = procs[i];
|
||||
uint64_t proc_id = mca_btl_ugni_proc_name_to_id(ompi_proc->proc_name);
|
||||
struct opal_proc_t *opal_proc = procs[i];
|
||||
uint64_t proc_id = mca_btl_ugni_proc_name_to_id(opal_proc->proc_name);
|
||||
mca_btl_base_endpoint_t *ep = NULL;
|
||||
|
||||
/* lookup this proc in the hash table */
|
||||
@ -257,7 +238,7 @@ static int ugni_reg_rdma_mem (void *reg_data, void *base, size_t size,
|
||||
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
|
||||
rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base,
|
||||
size, NULL, GNI_MEM_READWRITE | GNI_MEM_RELAXED_PI_ORDERING,
|
||||
-1, &(ugni_reg->memory_hdl));
|
||||
-1, &(ugni_reg->handle.gni_handle));
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
|
||||
@ -280,7 +261,7 @@ static int ugni_reg_smsg_mem (void *reg_data, void *base, size_t size,
|
||||
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
|
||||
rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base,
|
||||
size, ugni_module->smsg_remote_cq, GNI_MEM_READWRITE, -1,
|
||||
&(ugni_reg->memory_hdl));
|
||||
&(ugni_reg->handle.gni_handle));
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
return opal_common_rc_ugni_to_opal (rc);
|
||||
}
|
||||
@ -293,7 +274,7 @@ ugni_dereg_mem (void *reg_data, mca_mpool_base_registration_t *reg)
|
||||
gni_return_t rc;
|
||||
|
||||
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
|
||||
rc = GNI_MemDeregister (ugni_module->device->dev_handle, &ugni_reg->memory_hdl);
|
||||
rc = GNI_MemDeregister (ugni_module->device->dev_handle, &ugni_reg->handle.gni_handle);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
if (GNI_RC_SUCCESS != rc) {
|
||||
return OPAL_ERROR;
|
||||
@ -470,6 +451,15 @@ mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module)
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = ompi_free_list_init_new (&ugni_module->post_descriptors,
|
||||
sizeof (mca_btl_ugni_post_descriptor_t),
|
||||
8, OBJ_CLASS(mca_btl_ugni_post_descriptor_t),
|
||||
0, 0, 0, -1, 256, NULL);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
BTL_ERROR(("error creating post descriptor free list"));
|
||||
return rc;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
|
135
opal/mca/btl/ugni/btl_ugni_atomic.c
Обычный файл
135
opal/mca/btl/ugni/btl_ugni_atomic.c
Обычный файл
@ -0,0 +1,135 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "btl_ugni_rdma.h"
|
||||
|
||||
static gni_fma_cmd_type_t famo_cmds[] = {
|
||||
[MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC_FADD,
|
||||
[MCA_BTL_ATOMIC_AND] = GNI_FMA_ATOMIC_FAND,
|
||||
[MCA_BTL_ATOMIC_OR] = GNI_FMA_ATOMIC_FOR,
|
||||
[MCA_BTL_ATOMIC_XOR] = GNI_FMA_ATOMIC_FXOR,
|
||||
};
|
||||
|
||||
static gni_fma_cmd_type_t amo_cmds[] = {
|
||||
[MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC_ADD,
|
||||
[MCA_BTL_ATOMIC_AND] = GNI_FMA_ATOMIC_AND,
|
||||
[MCA_BTL_ATOMIC_OR] = GNI_FMA_ATOMIC_OR,
|
||||
[MCA_BTL_ATOMIC_XOR] = GNI_FMA_ATOMIC_XOR,
|
||||
};
|
||||
|
||||
int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
|
||||
mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
gni_mem_handle_t dummy = {0, 0};
|
||||
mca_btl_ugni_post_descriptor_t *post_desc;
|
||||
int rc;
|
||||
|
||||
rc = mca_btl_ugni_check_endpoint_state_rdma (endpoint);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
mca_btl_ugni_alloc_post_descriptor (endpoint, NULL, cbfunc, cbcontext, cbdata, &post_desc);
|
||||
if (OPAL_UNLIKELY(NULL == post_desc)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
init_gni_post_desc (&post_desc->desc, order, GNI_POST_AMO, 0, dummy, remote_address,
|
||||
remote_handle->gni_handle, 8, 0);
|
||||
post_desc->desc.base.amo_cmd = amo_cmds[op];
|
||||
|
||||
post_desc->desc.base.first_operand = operand;
|
||||
|
||||
OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock);
|
||||
rc = GNI_PostFma (endpoint->rdma_ep_handle, &post_desc->desc.base);
|
||||
OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock);
|
||||
if (GNI_RC_SUCCESS != rc) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Post a fetching network atomic operation on a remote value.
 *
 * A post descriptor bound to local_handle is filled in as a GNI_POST_AMO
 * request, using the command mapped from op by famo_cmds, and posted with
 * GNI_PostFma while the device lock is held.
 *
 * @retval OPAL_SUCCESS             The operation was posted
 * @retval OPAL_ERR_OUT_OF_RESOURCE No post descriptor was available or GNI_PostFma failed
 *                                  (the descriptor is returned in the latter case)
 * @retval other                    Error from mca_btl_ugni_check_endpoint_state_rdma
 */
int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                       void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                       mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
                       uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
                       void *cbcontext, void *cbdata)
{
    mca_btl_ugni_post_descriptor_t *fop_desc;
    int status;

    /* the endpoint must be usable for RDMA before anything is allocated */
    status = mca_btl_ugni_check_endpoint_state_rdma (endpoint);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != status)) {
        return status;
    }

    mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata, &fop_desc);
    if (OPAL_UNLIKELY(NULL == fop_desc)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* describe the atomic: local side, remote side, command and operand */
    init_gni_post_desc (&fop_desc->desc, order, GNI_POST_AMO, (intptr_t) local_address, local_handle->gni_handle,
                        remote_address, remote_handle->gni_handle, 8, 0);
    fop_desc->desc.base.amo_cmd = famo_cmds[op];
    fop_desc->desc.base.first_operand = operand;

    OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock);
    status = GNI_PostFma (endpoint->rdma_ep_handle, &fop_desc->desc.base);
    OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock);

    if (GNI_RC_SUCCESS == status) {
        return OPAL_SUCCESS;
    }

    /* post failed: give the descriptor back before reporting the error */
    mca_btl_ugni_return_post_descriptor (endpoint->btl, fop_desc);
    return OPAL_ERR_OUT_OF_RESOURCE;
}
|
||||
|
||||
/**
 * Post a network atomic compare-and-swap on a remote value.
 *
 * A post descriptor bound to local_handle is filled in as a GNI_POST_AMO
 * request with command GNI_FMA_ATOMIC_CSWAP, compare as the first operand and
 * value as the second, and posted with GNI_PostFma while the device lock is
 * held.
 *
 * @retval OPAL_SUCCESS             The operation was posted
 * @retval OPAL_ERR_OUT_OF_RESOURCE No post descriptor was available or GNI_PostFma failed
 *                                  (the descriptor is returned in the latter case)
 * @retval other                    Error from mca_btl_ugni_check_endpoint_state_rdma
 */
int mca_btl_ugni_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                         void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                         mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags,
                         int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    mca_btl_ugni_post_descriptor_t *cswap_desc;
    int status;

    /* the endpoint must be usable for RDMA before anything is allocated */
    status = mca_btl_ugni_check_endpoint_state_rdma (endpoint);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != status)) {
        return status;
    }

    mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata, &cswap_desc);
    if (OPAL_UNLIKELY(NULL == cswap_desc)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* describe the atomic: local side, remote side, command and both operands */
    init_gni_post_desc (&cswap_desc->desc, order, GNI_POST_AMO, (intptr_t) local_address, local_handle->gni_handle,
                        remote_address, remote_handle->gni_handle, 8, 0);
    cswap_desc->desc.base.amo_cmd = GNI_FMA_ATOMIC_CSWAP;
    cswap_desc->desc.base.first_operand = compare;
    cswap_desc->desc.base.second_operand = value;

    OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock);
    status = GNI_PostFma (endpoint->rdma_ep_handle, &cswap_desc->desc.base);
    OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock);

    if (GNI_RC_SUCCESS == status) {
        return OPAL_SUCCESS;
    }

    /* post failed: give the descriptor back before reporting the error */
    mca_btl_ugni_return_post_descriptor (endpoint->btl, cswap_desc);
    return OPAL_ERR_OUT_OF_RESOURCE;
}
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -18,6 +18,8 @@
|
||||
#include "opal/memoryhooks/memory.h"
|
||||
#include "opal/runtime/opal_params.h"
|
||||
|
||||
#include "opal/mca/base/mca_base_pvar.h"
|
||||
|
||||
static int btl_ugni_component_register(void);
|
||||
static int btl_ugni_component_open(void);
|
||||
static int btl_ugni_component_close(void);
|
||||
@ -52,6 +54,7 @@ static int
|
||||
btl_ugni_component_register(void)
|
||||
{
|
||||
mca_base_var_enum_t *new_enum;
|
||||
gni_nic_device_t device_type;
|
||||
int rc;
|
||||
|
||||
(void) mca_base_var_group_component_register(&mca_btl_ugni_component.super.btl_version,
|
||||
@ -139,15 +142,6 @@ btl_ugni_component_register(void)
|
||||
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_btl_ugni_component.ugni_fma_limit);
|
||||
|
||||
mca_btl_ugni_component.ugni_get_limit = 1 * 1024 * 1024;
|
||||
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
||||
"get_limit", "Maximum size message that "
|
||||
"will be sent using a get protocol "
|
||||
"(default 1M)", MCA_BASE_VAR_TYPE_INT,
|
||||
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
||||
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_btl_ugni_component.ugni_get_limit);
|
||||
|
||||
mca_btl_ugni_component.rdma_max_retries = 16;
|
||||
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
||||
"rdma_max_retries", NULL, MCA_BASE_VAR_TYPE_INT,
|
||||
@ -199,6 +193,15 @@ btl_ugni_component_register(void)
|
||||
MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_btl_ugni_component.progress_thread_requested);
|
||||
|
||||
/* performance variables */
|
||||
mca_btl_ugni_progress_thread_wakeups = 0;
|
||||
(void) mca_base_component_pvar_register(&mca_btl_ugni_component.super.btl_version,
|
||||
"progress_thread_wakeups", "Number of times the progress thread "
|
||||
"has been woken", OPAL_INFO_LVL_9, MCA_BASE_PVAR_CLASS_COUNTER,
|
||||
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, MCA_BASE_VAR_BIND_NO_OBJECT,
|
||||
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS, NULL,
|
||||
NULL, NULL, &mca_btl_ugni_progress_thread_wakeups);
|
||||
|
||||
/* btl/ugni can support only a fixed set of mpools (these mpools have compatible resource
|
||||
* structures) */
|
||||
rc = mca_base_var_enum_create ("btl_ugni_mpool", mpool_values, &new_enum);
|
||||
@ -222,13 +225,28 @@ btl_ugni_component_register(void)
|
||||
mca_btl_ugni_module.super.btl_max_send_size = 8 * 1024;
|
||||
mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = 8 * 1024;
|
||||
|
||||
mca_btl_ugni_module.super.btl_get_limit = 1 * 1024 * 1024;
|
||||
|
||||
/* determine if there are get alignment restrictions */
|
||||
GNI_GetDeviceType (&device_type);
|
||||
|
||||
if (GNI_DEVICE_GEMINI == device_type) {
|
||||
mca_btl_ugni_module.super.btl_get_alignment = 4;
|
||||
} else {
|
||||
mca_btl_ugni_module.super.btl_get_alignment = 0;
|
||||
}
|
||||
|
||||
/* threshold for put */
|
||||
mca_btl_ugni_module.super.btl_min_rdma_pipeline_size = 8 * 1024;
|
||||
|
||||
mca_btl_ugni_module.super.btl_flags = MCA_BTL_FLAGS_SEND |
|
||||
MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE;
|
||||
MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_ATOMIC_OPS |
|
||||
MCA_BTL_FLAGS_ATOMIC_FOPS;
|
||||
mca_btl_ugni_module.super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD |
|
||||
MCA_BTL_ATOMIC_SUPPORTS_AND | MCA_BTL_ATOMIC_SUPPORTS_OR | MCA_BTL_ATOMIC_SUPPORTS_XOR |
|
||||
MCA_BTL_ATOMIC_SUPPORTS_CSWAP;
|
||||
|
||||
mca_btl_ugni_module.super.btl_seg_size = sizeof (mca_btl_ugni_segment_t);
|
||||
mca_btl_ugni_module.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
|
||||
|
||||
mca_btl_ugni_module.super.btl_bandwidth = 40000; /* Mbs */
|
||||
mca_btl_ugni_module.super.btl_latency = 2; /* Microsecs */
|
||||
@ -439,89 +457,110 @@ mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module)
|
||||
return count;
|
||||
}
|
||||
|
||||
static inline int
|
||||
mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, int which_cq)
|
||||
#if OPAL_ENABLE_DEBUG
|
||||
static inline void btl_ugni_dump_post_desc (mca_btl_ugni_post_descriptor_t *desc)
|
||||
{
|
||||
opal_common_ugni_post_desc_t *desc;
|
||||
mca_btl_ugni_base_frag_t *frag;
|
||||
|
||||
fprintf (stderr, "desc->desc.base.post_id = %" PRIx64 "\n", desc->desc.base.post_id);
|
||||
fprintf (stderr, "desc->desc.base.status = %" PRIx64 "\n", desc->desc.base.status);
|
||||
fprintf (stderr, "desc->desc.base.cq_mode_complete = %hu\n", desc->desc.base.cq_mode_complete);
|
||||
fprintf (stderr, "desc->desc.base.type = %d\n", desc->desc.base.type);
|
||||
fprintf (stderr, "desc->desc.base.cq_mode = %hu\n", desc->desc.base.cq_mode);
|
||||
fprintf (stderr, "desc->desc.base.dlvr_mode = %hu\n", desc->desc.base.dlvr_mode);
|
||||
fprintf (stderr, "desc->desc.base.local_addr = %" PRIx64 "\n", desc->desc.base.local_addr);
|
||||
fprintf (stderr, "desc->desc.base.local_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->desc.base.local_mem_hndl.qword1,
|
||||
desc->desc.base.local_mem_hndl.qword2);
|
||||
fprintf (stderr, "desc->desc.base.remote_addr = %" PRIx64 "\n", desc->desc.base.remote_addr);
|
||||
fprintf (stderr, "desc->desc.base.remote_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->desc.base.remote_mem_hndl.qword1,
|
||||
desc->desc.base.remote_mem_hndl.qword2);
|
||||
fprintf (stderr, "desc->desc.base.length = %" PRIu64 "\n", desc->desc.base.length);
|
||||
fprintf (stderr, "desc->desc.base.rdma_mode = %hu\n", desc->desc.base.rdma_mode);
|
||||
fprintf (stderr, "desc->desc.base.amo_cmd = %d\n", desc->desc.base.amo_cmd);
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
 * Poll one of the local RDMA completion queues once and handle the event,
 * if any.
 *
 * @param ugni_module (IN) BTL module whose completion queue is polled
 * @param which_cq    (IN) 0 selects rdma_local_cq, any other value selects
 *                         rdma_local_irq_cq
 *
 * @return 0 if no event was available or a failed transaction was reposted,
 *         1 if a post descriptor was completed, OPAL_ERROR if a failed
 *         transaction was given up on, or the translated uGNI return code
 *         if the queue itself reported an error.
 */
static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, int which_cq)
{
    mca_btl_ugni_post_descriptor_t *post_desc = NULL;
    gni_cq_entry_t event_data = 0;
    gni_post_descriptor_t *desc;
    uint32_t recoverable = 1;
    gni_return_t grc;
    gni_cq_handle_t the_cq;

    the_cq = (which_cq == 0) ? ugni_module->rdma_local_cq : ugni_module->rdma_local_irq_cq;

    /* the device lock is held across both the event read and the descriptor lookup */
    OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
    grc = GNI_CqGetEvent (the_cq, &event_data);
    if (GNI_RC_NOT_DONE == grc) {
        OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
        return 0;
    }

    if (OPAL_UNLIKELY((GNI_RC_SUCCESS != grc && !event_data) || GNI_CQ_OVERRUN(event_data))) {
        /* TODO -- need to handle overrun -- how do we do this without an event?
           will the event eventually come back? Ask Cray */
        BTL_ERROR(("unhandled post error! ugni rc = %d %s", grc, gni_err_str[grc]));
        OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);

        return opal_common_rc_ugni_to_opal (grc);
    }

    grc = GNI_GetCompleted (the_cq, event_data, &desc);
    OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc && GNI_RC_TRANSACTION_ERROR != grc)) {
        BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[grc]));
        return opal_common_rc_ugni_to_opal (grc);
    }

    /* recover the enclosing BTL post descriptor from the embedded uGNI descriptor */
    post_desc = MCA_BTL_UGNI_DESC_TO_PDESC(desc);

    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc || !GNI_CQ_STATUS_OK(event_data))) {
        (void) GNI_CqErrorRecoverable (event_data, &recoverable);

        if (OPAL_UNLIKELY(++post_desc->desc.tries >= mca_btl_ugni_component.rdma_max_retries ||
                          !recoverable)) {
            char char_buffer[1024];
            GNI_CqErrorStr (event_data, char_buffer, sizeof (char_buffer));
            /* give up */
            BTL_ERROR(("giving up on desciptor %p, recoverable %d: %s", (void *) post_desc,
                       recoverable, char_buffer));
#if OPAL_ENABLE_DEBUG
            btl_ugni_dump_post_desc (post_desc);
#endif
            mca_btl_ugni_post_desc_complete (ugni_module, post_desc, OPAL_ERROR);

            return OPAL_ERROR;
        }

        /* repost transaction */
        mca_btl_ugni_repost (ugni_module, post_desc);

        return 0;
    }

    mca_btl_ugni_post_desc_complete (ugni_module, post_desc, opal_common_rc_ugni_to_opal (grc));

    return 1;
}
|
||||
|
||||
static inline int
|
||||
mca_btl_ugni_retry_failed (mca_btl_ugni_module_t *ugni_module)
|
||||
mca_btl_ugni_post_pending (mca_btl_ugni_module_t *ugni_module)
|
||||
{
|
||||
int count = opal_list_get_size (&ugni_module->failed_frags);
|
||||
int count = opal_list_get_size (&ugni_module->pending_descriptors);
|
||||
int i;
|
||||
|
||||
for (i = 0 ; i < count ; ++i) {
|
||||
OPAL_THREAD_LOCK(&ugni_module->failed_frags_lock);
|
||||
mca_btl_ugni_base_frag_t *frag =
|
||||
(mca_btl_ugni_base_frag_t *) opal_list_remove_first (&ugni_module->failed_frags);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->failed_frags_lock);
|
||||
if (NULL == frag) {
|
||||
OPAL_THREAD_LOCK(&ugni_module->pending_descriptors_lock);
|
||||
mca_btl_ugni_post_descriptor_t *post_desc =
|
||||
(mca_btl_ugni_post_descriptor_t *) opal_list_remove_first (&ugni_module->pending_descriptors);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->pending_descriptors_lock);
|
||||
|
||||
if (OPAL_SUCCESS != mca_btl_ugni_repost (ugni_module, post_desc)) {
|
||||
break;
|
||||
}
|
||||
|
||||
mca_btl_ugni_repost (frag);
|
||||
}
|
||||
|
||||
return count;
|
||||
return i;
|
||||
}
|
||||
|
||||
static inline int
|
||||
@ -571,7 +610,6 @@ static int mca_btl_ugni_component_progress (void)
|
||||
for (i = 0 ; i < mca_btl_ugni_component.ugni_num_btls ; ++i) {
|
||||
ugni_module = mca_btl_ugni_component.modules + i;
|
||||
|
||||
mca_btl_ugni_retry_failed (ugni_module);
|
||||
mca_btl_ugni_progress_wait_list (ugni_module);
|
||||
|
||||
count += mca_btl_ugni_progress_datagram (ugni_module);
|
||||
@ -581,6 +619,9 @@ static int mca_btl_ugni_component_progress (void)
|
||||
if (mca_btl_ugni_component.progress_thread_enabled) {
|
||||
count += mca_btl_ugni_progress_rdma (ugni_module, 1);
|
||||
}
|
||||
|
||||
/* post pending after progressing rdma */
|
||||
mca_btl_ugni_post_pending (ugni_module);
|
||||
}
|
||||
|
||||
return count;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011-2013 UT-Battelle, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -10,8 +10,6 @@
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "btl_ugni.h"
|
||||
|
||||
#include "btl_ugni_endpoint.h"
|
||||
#include "btl_ugni_smsg.h"
|
||||
|
||||
@ -90,10 +88,8 @@ int mca_btl_ugni_ep_disconnect (mca_btl_base_endpoint_t *ep, bool send_disconnec
|
||||
static inline int mca_btl_ugni_ep_connect_start (mca_btl_base_endpoint_t *ep) {
|
||||
int rc;
|
||||
|
||||
/* get the modex info for this endpoint and setup a ugni endpoint */
|
||||
rc = opal_common_ugni_endpoint_for_proc (ep->btl->device, ep->peer_proc, &ep->common);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
assert (0);
|
||||
rc = mca_btl_ugni_ep_connect_rdma (ep);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -107,11 +103,6 @@ static inline int mca_btl_ugni_ep_connect_start (mca_btl_base_endpoint_t *ep) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = opal_common_ugni_ep_create (ep->common, ep->btl->rdma_local_cq, &ep->rdma_ep_handle);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* build connection data */
|
||||
rc = mca_btl_ugni_ep_smsg_get_mbox (ep);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
@ -198,7 +189,7 @@ int mca_btl_ugni_ep_connect_progress (mca_btl_base_endpoint_t *ep) {
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
if (MCA_BTL_UGNI_EP_STATE_INIT == ep->state) {
|
||||
if (MCA_BTL_UGNI_EP_STATE_RDMA >= ep->state) {
|
||||
rc = mca_btl_ugni_ep_connect_start (ep);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
return rc;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -17,6 +17,7 @@
|
||||
|
||||
/**
 * Connection state of an endpoint.
 *
 * The numeric order matters: code in this component compares states with
 * relational operators (e.g. "state >= MCA_BTL_UGNI_EP_STATE_RDMA"), so a
 * new state must be inserted at the position matching its progress.
 */
enum mca_btl_ugni_endpoint_state_t {
    MCA_BTL_UGNI_EP_STATE_INIT = 0,
    MCA_BTL_UGNI_EP_STATE_RDMA,
    MCA_BTL_UGNI_EP_STATE_CONNECTING,
    MCA_BTL_UGNI_EP_STATE_CONNECTED
};
|
||||
@ -114,6 +115,7 @@ static inline int mca_btl_ugni_check_endpoint_state (mca_btl_ugni_endpoint_t *ep
|
||||
|
||||
switch (ep->state) {
|
||||
case MCA_BTL_UGNI_EP_STATE_INIT:
|
||||
case MCA_BTL_UGNI_EP_STATE_RDMA:
|
||||
rc = mca_btl_ugni_ep_connect_progress (ep);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
break;
|
||||
@ -130,6 +132,43 @@ static inline int mca_btl_ugni_check_endpoint_state (mca_btl_ugni_endpoint_t *ep
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
 * Bring an endpoint up to the RDMA state.
 *
 * Does nothing if the endpoint is already in MCA_BTL_UGNI_EP_STATE_RDMA or
 * a later state. Otherwise it looks up the common uGNI endpoint for the
 * peer, creates the RDMA endpoint handle on the module's rdma_local_cq and
 * advances ep->state to MCA_BTL_UGNI_EP_STATE_RDMA.
 *
 * This function takes no lock itself; mca_btl_ugni_check_endpoint_state_rdma
 * calls it with ep->lock held.
 *
 * @param ep (IN/OUT) endpoint to bind
 *
 * @return OPAL_SUCCESS, or the error returned by the failing helper
 */
static inline int mca_btl_ugni_ep_connect_rdma (mca_btl_base_endpoint_t *ep) {
    int rc;

    if (ep->state >= MCA_BTL_UGNI_EP_STATE_RDMA) {
        return OPAL_SUCCESS;
    }

    /* get the modex info for this endpoint and setup a ugni endpoint */
    rc = opal_common_ugni_endpoint_for_proc (ep->btl->device, ep->peer_proc, &ep->common);
    if (OPAL_SUCCESS != rc) {
        /* treated as a programming error in debug builds; release builds return rc */
        assert (0);
        return rc;
    }

    /* bind endpoint to remote address */
    rc = opal_common_ugni_ep_create (ep->common, ep->btl->rdma_local_cq, &ep->rdma_ep_handle);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        return rc;
    }

    ep->state = MCA_BTL_UGNI_EP_STATE_RDMA;

    return OPAL_SUCCESS;
}
|
||||
|
||||
/**
 * Make sure an endpoint is usable for RDMA.
 *
 * Fast path: if the endpoint has already left the INIT state, return
 * immediately without locking. Otherwise take ep->lock and call
 * mca_btl_ugni_ep_connect_rdma, which re-checks the state itself.
 *
 * @param ep (IN/OUT) endpoint to check
 *
 * @return OPAL_SUCCESS, or the error from mca_btl_ugni_ep_connect_rdma
 */
static inline int mca_btl_ugni_check_endpoint_state_rdma (mca_btl_base_endpoint_t *ep) {
    int rc;

    if (OPAL_LIKELY(MCA_BTL_UGNI_EP_STATE_INIT < ep->state)) {
        return OPAL_SUCCESS;
    }

    opal_mutex_lock (&ep->lock);
    rc = mca_btl_ugni_ep_connect_rdma (ep);
    opal_mutex_unlock (&ep->lock);

    return rc;
}
|
||||
|
||||
static inline int mca_btl_ugni_wildcard_ep_post (mca_btl_ugni_module_t *ugni_module) {
|
||||
gni_return_t rc;
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -16,7 +16,7 @@
|
||||
/**
 * Constructor shared by the uGNI fragment classes.
 *
 * Zeroes every byte of the fragment that follows the embedded base
 * descriptor (the base itself is left untouched) and points the first
 * segment at the fragment's buffer, frag->base.super.ptr.
 *
 * @param frag (IN/OUT) fragment being constructed
 */
static inline void mca_btl_ugni_base_frag_constructor (mca_btl_ugni_base_frag_t *frag)
{
    memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base));
    frag->segments[0].seg_addr.pval = frag->base.super.ptr;
}
|
||||
|
||||
static inline void mca_btl_ugni_eager_frag_constructor (mca_btl_ugni_base_frag_t *frag)
|
||||
@ -26,7 +26,7 @@ static inline void mca_btl_ugni_eager_frag_constructor (mca_btl_ugni_base_frag_t
|
||||
|
||||
mca_btl_ugni_base_frag_constructor (frag);
|
||||
|
||||
frag->segments[0].memory_handle = reg->memory_hdl;
|
||||
frag->memory_handle = reg->handle;
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_btl_ugni_smsg_frag_t, mca_btl_base_descriptor_t,
|
||||
@ -38,6 +38,9 @@ OBJ_CLASS_INSTANCE(mca_btl_ugni_rdma_frag_t, mca_btl_base_descriptor_t,
|
||||
OBJ_CLASS_INSTANCE(mca_btl_ugni_eager_frag_t, mca_btl_base_descriptor_t,
|
||||
mca_btl_ugni_eager_frag_constructor, NULL);
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_btl_ugni_post_descriptor_t, ompi_free_list_item_t,
|
||||
NULL, NULL);
|
||||
|
||||
void mca_btl_ugni_frag_init (mca_btl_ugni_base_frag_t *frag, mca_btl_ugni_module_t *ugni_module)
|
||||
{
|
||||
frag->msg_id = opal_pointer_array_add (&ugni_module->pending_smsg_frags_bb, (void *) frag);
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2013 The University of Tennessee and The University
|
||||
@ -19,13 +19,6 @@
|
||||
#include "btl_ugni.h"
|
||||
#include "btl_ugni_endpoint.h"
|
||||
|
||||
typedef struct mca_btl_ugni_segment_t {
|
||||
mca_btl_base_segment_t base;
|
||||
gni_mem_handle_t memory_handle;
|
||||
uint8_t extra_bytes[3];
|
||||
uint8_t extra_byte_count;
|
||||
} mca_btl_ugni_segment_t;
|
||||
|
||||
/**
 * Header carried by every send fragment.
 *
 * "lag" packs two values into one word; the eager-get completion callback
 * in this component decodes it as length = lag & 0x00ffffff (low 24 bits)
 * and tag = lag >> 24 (high 8 bits).
 */
typedef struct mca_btl_ugni_send_frag_hdr_t {
    uint32_t lag;
} mca_btl_ugni_send_frag_hdr_t;
|
||||
@ -41,7 +34,9 @@ typedef struct mca_btl_ugni_rdma_frag_hdr_t {
|
||||
|
||||
typedef struct mca_btl_ugni_eager_frag_hdr_t {
|
||||
mca_btl_ugni_send_frag_hdr_t send;
|
||||
mca_btl_ugni_segment_t src_seg;
|
||||
uint32_t size;
|
||||
uint64_t address;
|
||||
mca_btl_base_registration_handle_t memory_handle;
|
||||
void *ctx;
|
||||
} mca_btl_ugni_eager_frag_hdr_t;
|
||||
|
||||
@ -59,29 +54,28 @@ typedef union mca_btl_ugni_frag_hdr_t {
|
||||
} mca_btl_ugni_frag_hdr_t;
|
||||
|
||||
/** Bit flags stored in mca_btl_ugni_base_frag_t::flags; each value is a distinct bit. */
enum {
    MCA_BTL_UGNI_FRAG_BUFFERED      = 1,  /* frag data is buffered */
    MCA_BTL_UGNI_FRAG_COMPLETE      = 2,  /* smsg complete for frag */
    MCA_BTL_UGNI_FRAG_EAGER         = 4,  /* eager get frag */
    MCA_BTL_UGNI_FRAG_IGNORE        = 8,  /* ignore local smsg completion */
    MCA_BTL_UGNI_FRAG_SMSG_COMPLETE = 16, /* SMSG has completed for this message */
    MCA_BTL_UGNI_FRAG_RESPONSE      = 32, /* set on the frag that carries an eager-get response */
};
|
||||
|
||||
struct mca_btl_ugni_base_frag_t;
|
||||
|
||||
typedef void (*frag_cb_t) (struct mca_btl_ugni_base_frag_t *, int);
|
||||
|
||||
typedef struct mca_btl_ugni_base_frag_t {
|
||||
mca_btl_base_descriptor_t base;
|
||||
uint32_t msg_id;
|
||||
uint16_t hdr_size;
|
||||
uint16_t flags;
|
||||
mca_btl_ugni_frag_hdr_t hdr;
|
||||
mca_btl_ugni_segment_t segments[2];
|
||||
mca_btl_base_segment_t segments[2];
|
||||
opal_common_ugni_post_desc_t post_desc;
|
||||
mca_btl_base_endpoint_t *endpoint;
|
||||
mca_btl_ugni_reg_t *registration;
|
||||
ompi_free_list_t *my_list;
|
||||
frag_cb_t cbfunc;
|
||||
mca_btl_base_registration_handle_t memory_handle;
|
||||
} mca_btl_ugni_base_frag_t;
|
||||
|
||||
typedef struct mca_btl_ugni_base_frag_t mca_btl_ugni_smsg_frag_t;
|
||||
@ -91,6 +85,58 @@ typedef struct mca_btl_ugni_base_frag_t mca_btl_ugni_eager_frag_t;
|
||||
#define MCA_BTL_UGNI_DESC_TO_FRAG(desc) \
|
||||
((mca_btl_ugni_base_frag_t *)((uintptr_t) (desc) - offsetof (mca_btl_ugni_base_frag_t, post_desc)))
|
||||
|
||||
typedef struct mca_btl_ugni_post_descriptor_t {
|
||||
ompi_free_list_item_t super;
|
||||
opal_common_ugni_post_desc_t desc;
|
||||
mca_btl_base_endpoint_t *endpoint;
|
||||
mca_btl_base_registration_handle_t *local_handle;
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc;
|
||||
void *cbdata;
|
||||
void *ctx;
|
||||
} mca_btl_ugni_post_descriptor_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ugni_post_descriptor_t);
|
||||
|
||||
#define MCA_BTL_UGNI_DESC_TO_PDESC(desc) \
|
||||
((mca_btl_ugni_post_descriptor_t *)((uintptr_t) (desc) - offsetof (mca_btl_ugni_post_descriptor_t, desc)))
|
||||
|
||||
static inline void mca_btl_ugni_alloc_post_descriptor (mca_btl_base_endpoint_t *endpoint, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata,
|
||||
mca_btl_ugni_post_descriptor_t **desc)
|
||||
{
|
||||
ompi_free_list_item_t *item = NULL;
|
||||
|
||||
OMPI_FREE_LIST_GET_MT(&endpoint->btl->post_descriptors, item);
|
||||
*desc = (mca_btl_ugni_post_descriptor_t *) item;
|
||||
if (NULL != item) {
|
||||
(*desc)->cbfunc = cbfunc;
|
||||
(*desc)->ctx = cbcontext;
|
||||
(*desc)->cbdata = cbdata;
|
||||
(*desc)->local_handle = local_handle;
|
||||
(*desc)->endpoint = endpoint;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void mca_btl_ugni_return_post_descriptor (mca_btl_ugni_module_t *module,
|
||||
mca_btl_ugni_post_descriptor_t *desc)
|
||||
{
|
||||
OMPI_FREE_LIST_RETURN_MT(&module->post_descriptors, &desc->super);
|
||||
}
|
||||
|
||||
/**
 * Finish a posted operation: invoke the user's completion callback (if one
 * was registered) and return the descriptor to the free list.
 *
 * @param module (IN) module the descriptor belongs to
 * @param desc   (IN) completed descriptor; released before this returns
 * @param rc     (IN) completion status forwarded to the callback
 */
static inline void mca_btl_ugni_post_desc_complete (mca_btl_ugni_module_t *module, mca_btl_ugni_post_descriptor_t *desc, int rc)
{
    BTL_VERBOSE(("RDMA/FMA/ATOMIC operation complete for post descriptor %p. rc = %d", (void *) desc, rc));

    if (NULL != desc->cbfunc) {
        /* call the user's callback function */
        desc->cbfunc (&module->super, desc->endpoint, (void *)(intptr_t) desc->desc.base.local_addr,
                      desc->local_handle, desc->ctx, desc->cbdata, rc);
    }

    /* the descriptor is no longer needed */
    mca_btl_ugni_return_post_descriptor (module, desc);
}
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ugni_smsg_frag_t);
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ugni_rdma_frag_t);
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ugni_eager_frag_t);
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -13,44 +13,31 @@
|
||||
#include "btl_ugni_rdma.h"
|
||||
#include "btl_ugni_smsg.h"
|
||||
|
||||
/**
|
||||
* Initiate a get operation.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
int mca_btl_ugni_get (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *des) {
|
||||
mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) des;
|
||||
mca_btl_ugni_segment_t *src_seg = (mca_btl_ugni_segment_t *) des->des_remote;
|
||||
mca_btl_ugni_segment_t *dst_seg = (mca_btl_ugni_segment_t *) des->des_local;
|
||||
size_t size = src_seg->base.seg_len - src_seg->extra_byte_count;
|
||||
int mca_btl_ugni_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
bool check;
|
||||
|
||||
BTL_VERBOSE(("Using RDMA/FMA Get"));
|
||||
|
||||
/* cause endpoint to bind if it isn't already (bind is sufficient for rdma) */
|
||||
(void) mca_btl_ugni_check_endpoint_state(endpoint);
|
||||
|
||||
/* Check if the get is aligned/sized on a multiple of 4 */
|
||||
check = !!((des->des_remote->seg_addr.lval | des->des_local->seg_addr.lval | size) & 3);
|
||||
check = !!((remote_address | (uint64_t)(intptr_t) local_address | size) & (mca_btl_ugni_module.super.btl_get_alignment - 1));
|
||||
|
||||
if (OPAL_UNLIKELY(check || size > mca_btl_ugni_component.ugni_get_limit)) {
|
||||
/* switch to put */
|
||||
if (OPAL_UNLIKELY(check || size > mca_btl_ugni_module.super.btl_get_limit)) {
|
||||
BTL_VERBOSE(("RDMA/FMA Get not available due to size or alignment restrictions"));
|
||||
|
||||
/* notify the caller that get is not available */
|
||||
return OPAL_ERR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
if (src_seg->extra_byte_count) {
|
||||
memmove ((char *) dst_seg->base.seg_addr.pval + size, src_seg->extra_bytes, src_seg->extra_byte_count);
|
||||
src_seg->base.seg_len = size;
|
||||
dst_seg->base.seg_len = size;
|
||||
}
|
||||
BTL_VERBOSE(("Using RDMA/FMA Get from local address %p to remote address %" PRIx64,
|
||||
local_address, remote_address));
|
||||
|
||||
des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||
/* cause endpoint to bind if it isn't already (bind is sufficient for rdma) */
|
||||
(void) mca_btl_ugni_check_endpoint_state_rdma (endpoint);
|
||||
|
||||
return mca_btl_ugni_post (frag, true, dst_seg, src_seg);
|
||||
return mca_btl_ugni_post (endpoint, true, size, local_address, remote_address, local_handle,
|
||||
remote_handle, order, cbfunc, cbcontext, cbdata);
|
||||
}
|
||||
|
||||
/* eager get */
|
||||
@ -60,6 +47,8 @@ static void mca_btl_ugni_callback_eager_get_progress_pending (struct mca_btl_bas
|
||||
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
|
||||
mca_btl_ugni_base_frag_t *pending_frag, *frag = (mca_btl_ugni_base_frag_t *) desc;
|
||||
|
||||
memset (&frag->hdr, 0, sizeof (frag->hdr));
|
||||
|
||||
OPAL_THREAD_LOCK(&ugni_module->eager_get_pending_lock);
|
||||
pending_frag = (mca_btl_ugni_base_frag_t *) opal_list_remove_first (&ugni_module->eager_get_pending);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->eager_get_pending_lock);
|
||||
@ -68,6 +57,8 @@ static void mca_btl_ugni_callback_eager_get_progress_pending (struct mca_btl_bas
|
||||
/* copy the relevant data out of the pending fragment */
|
||||
frag->endpoint = pending_frag->endpoint;
|
||||
|
||||
assert (frag != pending_frag);
|
||||
|
||||
/* start the next eager get using this fragment */
|
||||
(void) mca_btl_ugni_start_eager_get (frag->endpoint, pending_frag->hdr.eager_ex, frag);
|
||||
|
||||
@ -80,39 +71,43 @@ static void mca_btl_ugni_callback_eager_get_progress_pending (struct mca_btl_bas
|
||||
}
|
||||
|
||||
static void mca_btl_ugni_callback_eager_get (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *desc, int rc)
|
||||
void *local_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
void *context, void *cbdata, int status)
|
||||
{
|
||||
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
|
||||
mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) desc;
|
||||
mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) context;
|
||||
uint32_t len = frag->hdr.eager.send.lag & 0x00ffffff;
|
||||
uint8_t tag = frag->hdr.eager.send.lag >> 24;
|
||||
size_t payload_len = frag->hdr.eager.src_seg.base.seg_len;
|
||||
size_t payload_len = frag->hdr.eager.size;
|
||||
size_t hdr_len = len - payload_len;
|
||||
mca_btl_active_message_callback_t *reg;
|
||||
mca_btl_base_segment_t segs[2];
|
||||
mca_btl_ugni_base_frag_t tmp;
|
||||
int rc;
|
||||
|
||||
BTL_VERBOSE(("eager get for rem_ctx %p complete", frag->hdr.eager.ctx));
|
||||
BTL_VERBOSE(("eager get for rem_ctx %p complete", frag->hdr.eager.ctx))
|
||||
|
||||
tmp.base.des_local = segs;
|
||||
tmp.base.des_segments = segs;
|
||||
if (hdr_len) {
|
||||
tmp.base.des_local_count = 2;
|
||||
tmp.base.des_segment_count = 2;
|
||||
|
||||
segs[0].seg_addr.pval = frag->hdr.eager_ex.pml_header;
|
||||
segs[0].seg_len = hdr_len;
|
||||
segs[1].seg_addr.pval = frag->segments[0].base.seg_addr.pval;
|
||||
segs[1].seg_addr.pval = local_address;
|
||||
segs[1].seg_len = payload_len;
|
||||
} else {
|
||||
tmp.base.des_local_count = 1;
|
||||
tmp.base.des_segment_count = 1;
|
||||
|
||||
segs[0].seg_addr.pval = frag->segments[0].base.seg_addr.pval;
|
||||
segs[0].seg_addr.pval = local_address;
|
||||
segs[0].seg_len = payload_len;
|
||||
}
|
||||
|
||||
reg = mca_btl_base_active_message_trigger + tag;
|
||||
reg->cbfunc(&frag->endpoint->btl->super, tag, &(tmp.base), reg->cbdata);
|
||||
|
||||
/* fill in the response header */
|
||||
frag->hdr.rdma.ctx = frag->hdr.eager.ctx;
|
||||
frag->flags = MCA_BTL_UGNI_FRAG_RESPONSE;
|
||||
|
||||
/* once complete use this fragment for a pending eager get if any exist */
|
||||
frag->base.des_cbfunc = mca_btl_ugni_callback_eager_get_progress_pending;
|
||||
@ -122,6 +117,7 @@ static void mca_btl_ugni_callback_eager_get (struct mca_btl_base_module_t *btl,
|
||||
NULL, 0, MCA_BTL_UGNI_TAG_RDMA_COMPLETE);
|
||||
if (OPAL_UNLIKELY(0 > rc)) {
|
||||
/* queue fragment */
|
||||
OPAL_THREAD_LOCK(&endpoint->lock);
|
||||
if (false == endpoint->wait_listed) {
|
||||
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
|
||||
opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
|
||||
@ -129,50 +125,50 @@ static void mca_btl_ugni_callback_eager_get (struct mca_btl_base_module_t *btl,
|
||||
endpoint->wait_listed = true;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&endpoint->lock);
|
||||
opal_list_append (&endpoint->frag_wait_list, (opal_list_item_t *) frag);
|
||||
OPAL_THREAD_UNLOCK(&endpoint->lock);
|
||||
}
|
||||
}
|
||||
|
||||
int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep,
|
||||
int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *endpoint,
|
||||
mca_btl_ugni_eager_ex_frag_hdr_t hdr,
|
||||
mca_btl_ugni_base_frag_t *frag)
|
||||
{
|
||||
mca_btl_ugni_module_t *ugni_module = ep->btl;
|
||||
mca_btl_ugni_module_t *ugni_module = endpoint->btl;
|
||||
size_t size;
|
||||
int rc;
|
||||
|
||||
BTL_VERBOSE(("starting eager get for remote ctx: %p", hdr.eager.ctx));
|
||||
|
||||
do {
|
||||
if (NULL == frag) {
|
||||
rc = MCA_BTL_UGNI_FRAG_ALLOC_EAGER_RECV(ep, frag);
|
||||
/* try to allocate a registered buffer */
|
||||
rc = MCA_BTL_UGNI_FRAG_ALLOC_EAGER_RECV(endpoint, frag);
|
||||
if (OPAL_UNLIKELY(NULL == frag)) {
|
||||
(void) MCA_BTL_UGNI_FRAG_ALLOC_RDMA_INT(ep, frag);
|
||||
/* no registered buffers available. try again later */
|
||||
(void) MCA_BTL_UGNI_FRAG_ALLOC_RDMA_INT(endpoint, frag);
|
||||
|
||||
/* not much can be done if a small fragment can not be allocated. abort! */
|
||||
assert (NULL != frag);
|
||||
frag->hdr.eager_ex = hdr;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
frag->hdr.eager_ex = hdr;
|
||||
frag->flags = 0;
|
||||
|
||||
frag->base.des_flags = 0;
|
||||
frag->hdr.eager_ex = hdr;
|
||||
|
||||
frag->segments[1] = hdr.eager.src_seg;
|
||||
|
||||
/* increase size to a multiple of 4 bytes (required for get) */
|
||||
frag->segments[0].base.seg_len = frag->segments[1].base.seg_len =
|
||||
(hdr.eager.src_seg.base.seg_len + 3) & ~3;
|
||||
|
||||
frag->base.des_local = &frag->segments[1].base;
|
||||
/* increase size to a multiple of 4 bytes (required for get on Gemini) */
|
||||
size = (hdr.eager.size + 3) & ~3;
|
||||
|
||||
/* set up callback for get completion */
|
||||
frag->base.des_flags = MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||
frag->base.des_cbfunc = mca_btl_ugni_callback_eager_get;
|
||||
|
||||
rc = mca_btl_ugni_post (frag, GNI_POST_RDMA_GET, frag->segments, frag->segments + 1);
|
||||
/* start the get */
|
||||
rc = mca_btl_ugni_post (endpoint, true, size, frag->base.super.ptr, hdr.eager.address,
|
||||
&frag->memory_handle, &hdr.eager.memory_handle,
|
||||
MCA_BTL_NO_ORDER, mca_btl_ugni_callback_eager_get, frag, NULL);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS == rc)) {
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
@ -27,35 +27,37 @@ mca_btl_ugni_free (struct mca_btl_base_module_t *btl,
|
||||
static int
|
||||
mca_btl_ugni_module_finalize (struct mca_btl_base_module_t* btl);
|
||||
|
||||
static mca_btl_base_descriptor_t *
|
||||
mca_btl_ugni_prepare_dst (mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
mca_mpool_base_registration_t *registration,
|
||||
opal_convertor_t *convertor, uint8_t order,
|
||||
size_t reserve, size_t *size, uint32_t flags);
|
||||
|
||||
static struct mca_btl_base_descriptor_t *
|
||||
mca_btl_ugni_prepare_src (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
mca_mpool_base_registration_t *registration,
|
||||
struct opal_convertor_t *convertor,
|
||||
uint8_t order, size_t reserve, size_t *size,
|
||||
uint32_t flags);
|
||||
|
||||
static mca_btl_base_registration_handle_t *
|
||||
mca_btl_ugni_register_mem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *base,
|
||||
size_t size, uint32_t flags);
|
||||
|
||||
static int mca_btl_ugni_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle);
|
||||
|
||||
mca_btl_ugni_module_t mca_btl_ugni_module = {
|
||||
.super = {
|
||||
.btl_component = &mca_btl_ugni_component.super,
|
||||
.btl_add_procs = mca_btl_ugni_add_procs,
|
||||
.btl_del_procs = mca_btl_ugni_del_procs,
|
||||
.btl_finalize = mca_btl_ugni_module_finalize,
|
||||
.btl_alloc = mca_btl_ugni_alloc,
|
||||
.btl_free = mca_btl_ugni_free,
|
||||
.btl_prepare_src = mca_btl_ugni_prepare_src,
|
||||
.btl_prepare_dst = mca_btl_ugni_prepare_dst,
|
||||
.btl_send = mca_btl_ugni_send,
|
||||
.btl_sendi = mca_btl_ugni_sendi,
|
||||
.btl_put = mca_btl_ugni_put,
|
||||
.btl_get = mca_btl_ugni_get,
|
||||
.btl_component = &mca_btl_ugni_component.super,
|
||||
.btl_add_procs = mca_btl_ugni_add_procs,
|
||||
.btl_del_procs = mca_btl_ugni_del_procs,
|
||||
.btl_finalize = mca_btl_ugni_module_finalize,
|
||||
.btl_alloc = mca_btl_ugni_alloc,
|
||||
.btl_free = mca_btl_ugni_free,
|
||||
.btl_prepare_src = mca_btl_ugni_prepare_src,
|
||||
.btl_send = mca_btl_ugni_send,
|
||||
.btl_sendi = mca_btl_ugni_sendi,
|
||||
.btl_put = mca_btl_ugni_put,
|
||||
.btl_get = mca_btl_ugni_get,
|
||||
.btl_register_mem = mca_btl_ugni_register_mem,
|
||||
.btl_deregister_mem = mca_btl_ugni_deregister_mem,
|
||||
.btl_atomic_op = mca_btl_ugni_aop,
|
||||
.btl_atomic_fop = mca_btl_ugni_afop,
|
||||
.btl_atomic_cswap = mca_btl_ugni_acswap,
|
||||
}
|
||||
};
|
||||
|
||||
@ -92,6 +94,9 @@ mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module,
|
||||
OBJ_CONSTRUCT(&ugni_module->endpoints, opal_pointer_array_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->id_to_endpoint, opal_hash_table_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->smsg_mboxes, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->pending_descriptors, opal_list_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->eager_get_pending, opal_list_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->post_descriptors, ompi_free_list_t);
|
||||
|
||||
ugni_module->device = dev;
|
||||
dev->btl_ctx = (void *) ugni_module;
|
||||
@ -204,7 +209,6 @@ mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl)
|
||||
OBJ_DESTRUCT(&ugni_module->pending_smsg_frags_bb);
|
||||
OBJ_DESTRUCT(&ugni_module->id_to_endpoint);
|
||||
OBJ_DESTRUCT(&ugni_module->endpoints);
|
||||
OBJ_DESTRUCT(&ugni_module->failed_frags);
|
||||
|
||||
OBJ_DESTRUCT(&ugni_module->eager_get_pending);
|
||||
OBJ_DESTRUCT(&ugni_module->eager_get_pending_lock);
|
||||
@ -250,13 +254,13 @@ mca_btl_ugni_alloc(struct mca_btl_base_module_t *btl,
|
||||
|
||||
frag->base.des_flags = flags;
|
||||
frag->base.order = order;
|
||||
frag->base.des_local = &frag->segments[1].base;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_segments = &frag->segments[1];
|
||||
frag->base.des_segment_count = 1;
|
||||
|
||||
frag->segments[0].base.seg_addr.pval = NULL;
|
||||
frag->segments[0].base.seg_len = 0;
|
||||
frag->segments[1].base.seg_addr.pval = frag->base.super.ptr;
|
||||
frag->segments[1].base.seg_len = size;
|
||||
frag->segments[0].seg_addr.pval = NULL;
|
||||
frag->segments[0].seg_len = 0;
|
||||
frag->segments[1].seg_addr.pval = frag->base.super.ptr;
|
||||
frag->segments[1].seg_len = size;
|
||||
|
||||
frag->flags = MCA_BTL_UGNI_FRAG_BUFFERED;
|
||||
if (size > mca_btl_ugni_component.smsg_max_data) {
|
||||
@ -267,7 +271,7 @@ mca_btl_ugni_alloc(struct mca_btl_base_module_t *btl,
|
||||
|
||||
registration = (mca_btl_ugni_reg_t *) frag->base.super.registration;
|
||||
|
||||
frag->segments[1].memory_handle = registration->memory_hdl;
|
||||
frag->hdr.eager.memory_handle = registration->handle;
|
||||
} else {
|
||||
frag->hdr_size = sizeof (frag->hdr.send);
|
||||
}
|
||||
@ -285,59 +289,36 @@ mca_btl_ugni_free (struct mca_btl_base_module_t *btl,
|
||||
static struct mca_btl_base_descriptor_t *
|
||||
mca_btl_ugni_prepare_src (struct mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
mca_mpool_base_registration_t *registration,
|
||||
struct opal_convertor_t *convertor,
|
||||
uint8_t order, size_t reserve, size_t *size,
|
||||
uint32_t flags)
|
||||
{
|
||||
if (OPAL_LIKELY(reserve)) {
|
||||
return mca_btl_ugni_prepare_src_send (btl, endpoint, convertor,
|
||||
order, reserve, size, flags);
|
||||
} else {
|
||||
return mca_btl_ugni_prepare_src_rdma (btl, endpoint, registration,
|
||||
convertor, order, size, flags);
|
||||
}
|
||||
return mca_btl_ugni_prepare_src_send (btl, endpoint, convertor,
|
||||
order, reserve, size, flags);
|
||||
}
|
||||
|
||||
static mca_btl_base_descriptor_t *
|
||||
mca_btl_ugni_prepare_dst (mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
mca_mpool_base_registration_t *registration,
|
||||
opal_convertor_t *convertor, uint8_t order,
|
||||
size_t reserve, size_t *size, uint32_t flags)
|
||||
static mca_btl_base_registration_handle_t *
|
||||
mca_btl_ugni_register_mem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *base,
|
||||
size_t size, uint32_t flags)
|
||||
{
|
||||
mca_btl_ugni_base_frag_t *frag;
|
||||
void *data_ptr;
|
||||
mca_btl_ugni_reg_t *reg;
|
||||
int rc;
|
||||
|
||||
opal_convertor_get_current_pointer (convertor, &data_ptr);
|
||||
|
||||
(void) MCA_BTL_UGNI_FRAG_ALLOC_RDMA(endpoint, frag);
|
||||
if (OPAL_UNLIKELY(NULL == frag)) {
|
||||
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, base, size, 0,
|
||||
(mca_mpool_base_registration_t **) &reg);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* always need to register the buffer for put/get (even for fma) */
|
||||
if (NULL == registration) {
|
||||
rc = btl->btl_mpool->mpool_register(btl->btl_mpool,
|
||||
data_ptr, *size, 0,
|
||||
&registration);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
mca_btl_ugni_frag_return (frag);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
frag->registration = (mca_btl_ugni_reg_t*) registration;
|
||||
}
|
||||
|
||||
frag->segments[0].memory_handle = ((mca_btl_ugni_reg_t *)registration)->memory_hdl;
|
||||
frag->segments[0].base.seg_len = *size;
|
||||
frag->segments[0].base.seg_addr.lval = (uint64_t)(uintptr_t) data_ptr;
|
||||
|
||||
frag->base.des_local = &frag->segments->base;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.order = order;
|
||||
frag->base.des_flags = flags;
|
||||
|
||||
return (struct mca_btl_base_descriptor_t *) frag;
|
||||
return &reg->handle;
|
||||
}
|
||||
|
||||
/**
 * Release the memory registration behind a BTL registration handle.
 *
 * @param btl     BTL module; its mpool (btl->btl_mpool) performs the deregistration
 * @param handle  pointer to the `handle` member of a mca_btl_ugni_reg_t
 *
 * @returns OPAL_SUCCESS unconditionally; the result of mpool_deregister is
 *          explicitly discarded.
 */
static int mca_btl_ugni_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle)
{
    /* step back from the embedded handle member to the enclosing registration */
    mca_btl_ugni_reg_t *reg =
        (mca_btl_ugni_reg_t *)((intptr_t) handle - offsetof (mca_btl_ugni_reg_t, handle));

    (void) btl->btl_mpool->mpool_deregister (btl->btl_mpool, &reg->base);

    return OPAL_SUCCESS;
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -35,14 +35,14 @@ mca_btl_ugni_prepare_src_send_nodata (struct mca_btl_base_module_t *btl,
|
||||
|
||||
frag->hdr_size = reserve + sizeof (frag->hdr.send);
|
||||
|
||||
frag->segments[0].base.seg_addr.pval = frag->hdr.send_ex.pml_header;
|
||||
frag->segments[0].base.seg_len = reserve;
|
||||
frag->segments[0].seg_addr.pval = frag->hdr.send_ex.pml_header;
|
||||
frag->segments[0].seg_len = reserve;
|
||||
|
||||
frag->segments[1].base.seg_addr.pval = NULL;
|
||||
frag->segments[1].base.seg_len = 0;
|
||||
frag->segments[1].seg_addr.pval = NULL;
|
||||
frag->segments[1].seg_len = 0;
|
||||
|
||||
frag->base.des_local = &frag->segments->base;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_segments = frag->segments;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.order = order;
|
||||
frag->base.des_flags = flags;
|
||||
|
||||
@ -84,22 +84,22 @@ mca_btl_ugni_prepare_src_send_inplace (struct mca_btl_base_module_t *btl,
|
||||
frag->flags = MCA_BTL_UGNI_FRAG_EAGER | MCA_BTL_UGNI_FRAG_IGNORE;
|
||||
|
||||
frag->registration = registration;
|
||||
frag->segments[1].memory_handle = registration->memory_hdl;
|
||||
frag->hdr.eager.memory_handle = registration->handle;;
|
||||
|
||||
frag->hdr_size = reserve + sizeof (frag->hdr.eager);
|
||||
frag->segments[0].base.seg_addr.pval = frag->hdr.eager_ex.pml_header;
|
||||
frag->segments[0].seg_addr.pval = frag->hdr.eager_ex.pml_header;
|
||||
} else {
|
||||
frag->hdr_size = reserve + sizeof (frag->hdr.send);
|
||||
frag->segments[0].base.seg_addr.pval = frag->hdr.send_ex.pml_header;
|
||||
frag->segments[0].seg_addr.pval = frag->hdr.send_ex.pml_header;
|
||||
}
|
||||
|
||||
frag->segments[0].base.seg_len = reserve;
|
||||
frag->segments[0].seg_len = reserve;
|
||||
|
||||
frag->segments[1].base.seg_addr.pval = data_ptr;
|
||||
frag->segments[1].base.seg_len = *size;
|
||||
frag->segments[1].seg_addr.pval = data_ptr;
|
||||
frag->segments[1].seg_len = *size;
|
||||
|
||||
frag->base.des_local = &frag->segments->base;
|
||||
frag->base.des_local_count = 2;
|
||||
frag->base.des_segments = frag->segments;
|
||||
frag->base.des_segment_count = 2;
|
||||
frag->base.order = order;
|
||||
frag->base.des_flags = flags;
|
||||
|
||||
@ -130,10 +130,9 @@ mca_btl_ugni_prepare_src_send_buffered (struct mca_btl_base_module_t *btl,
|
||||
|
||||
registration = (mca_btl_ugni_reg_t *) frag->base.super.registration;
|
||||
|
||||
frag->segments[1].memory_handle = registration->memory_hdl;
|
||||
|
||||
frag->hdr.eager.memory_handle = registration->handle;
|
||||
frag->hdr_size = reserve + sizeof (frag->hdr.eager);
|
||||
frag->segments[0].base.seg_addr.pval = frag->hdr.eager_ex.pml_header;
|
||||
frag->segments[0].seg_addr.pval = frag->hdr.eager_ex.pml_header;
|
||||
} else {
|
||||
(void) MCA_BTL_UGNI_FRAG_ALLOC_SMSG(endpoint, frag);
|
||||
if (OPAL_UNLIKELY(NULL == frag)) {
|
||||
@ -141,7 +140,7 @@ mca_btl_ugni_prepare_src_send_buffered (struct mca_btl_base_module_t *btl,
|
||||
}
|
||||
|
||||
frag->hdr_size = reserve + sizeof (frag->hdr.send);
|
||||
frag->segments[0].base.seg_addr.pval = frag->hdr.send_ex.pml_header;
|
||||
frag->segments[0].seg_addr.pval = frag->hdr.send_ex.pml_header;
|
||||
}
|
||||
|
||||
frag->flags |= MCA_BTL_UGNI_FRAG_BUFFERED;
|
||||
@ -155,13 +154,13 @@ mca_btl_ugni_prepare_src_send_buffered (struct mca_btl_base_module_t *btl,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
frag->segments[0].base.seg_len = reserve;
|
||||
frag->segments[0].seg_len = reserve;
|
||||
|
||||
frag->segments[1].base.seg_addr.pval = frag->base.super.ptr;
|
||||
frag->segments[1].base.seg_len = *size;
|
||||
frag->segments[1].seg_addr.pval = frag->base.super.ptr;
|
||||
frag->segments[1].seg_len = *size;
|
||||
|
||||
frag->base.des_local = &frag->segments->base;
|
||||
frag->base.des_local_count = 2;
|
||||
frag->base.des_segments = frag->segments;
|
||||
frag->base.des_segment_count = 2;
|
||||
frag->base.order = order;
|
||||
frag->base.des_flags = flags;
|
||||
|
||||
@ -197,66 +196,4 @@ mca_btl_ugni_prepare_src_send (struct mca_btl_base_module_t *btl,
|
||||
}
|
||||
}
|
||||
|
||||
static inline struct mca_btl_base_descriptor_t *
|
||||
mca_btl_ugni_prepare_src_rdma (struct mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
mca_mpool_base_registration_t *registration,
|
||||
struct opal_convertor_t *convertor,
|
||||
uint8_t order, size_t *size,
|
||||
uint32_t flags)
|
||||
{
|
||||
mca_btl_ugni_base_frag_t *frag;
|
||||
void *data_ptr;
|
||||
int rc;
|
||||
|
||||
opal_convertor_get_current_pointer (convertor, &data_ptr);
|
||||
|
||||
(void) MCA_BTL_UGNI_FRAG_ALLOC_RDMA(endpoint, frag);
|
||||
if (OPAL_UNLIKELY(NULL == frag)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* For medium message use FMA protocols and for large message
|
||||
* use BTE protocols
|
||||
*/
|
||||
/* No need to register while using FMA Put (registration is
|
||||
* non-null in get-- is this always true?) */
|
||||
if (*size >= mca_btl_ugni_component.ugni_fma_limit || (flags & MCA_BTL_DES_FLAGS_GET)) {
|
||||
if (NULL == registration) {
|
||||
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, data_ptr, *size, 0,
|
||||
(mca_mpool_base_registration_t **) &registration);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
mca_btl_ugni_frag_return (frag);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
frag->registration = (mca_btl_ugni_reg_t *) registration;
|
||||
}
|
||||
|
||||
frag->segments[0].memory_handle = ((mca_btl_ugni_reg_t *)registration)->memory_hdl;
|
||||
} else {
|
||||
memset ((void *) &frag->segments[0].memory_handle, 0,
|
||||
sizeof (frag->segments[0].memory_handle));
|
||||
}
|
||||
|
||||
if ((flags & MCA_BTL_DES_FLAGS_GET) && (*size & 0x3)) {
|
||||
memmove (frag->segments[0].extra_bytes, (char *) data_ptr + (*size & ~0x3),
|
||||
*size & 0x3);
|
||||
frag->segments[0].extra_byte_count = *size & 0x3;
|
||||
} else {
|
||||
frag->segments[0].extra_byte_count = 0;
|
||||
}
|
||||
|
||||
frag->segments[0].base.seg_addr.lval = (uint64_t)(uintptr_t) data_ptr;
|
||||
frag->segments[0].base.seg_len = *size;
|
||||
|
||||
frag->base.des_local = &frag->segments->base;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.order = order;
|
||||
frag->base.des_flags = flags;
|
||||
|
||||
return &frag->base;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -20,17 +20,13 @@
|
||||
|
||||
|
||||
static pthread_t mca_btl_ugni_progress_thread_id;
|
||||
static pthread_mutex_t progress_mutex = PTHREAD_MUTEX_INITIALIZER;
|
||||
static pthread_cond_t progress_cond = PTHREAD_COND_INITIALIZER;
|
||||
|
||||
static volatile int stop_progress_thread = 0;
|
||||
static volatile int progress_thread_done = 0;
|
||||
|
||||
static int thread_wakeups = 0;
|
||||
unsigned int mca_btl_ugni_progress_thread_wakeups;
|
||||
|
||||
static void *mca_btl_ugni_prog_thread_fn(void * data)
|
||||
{
|
||||
int rc,ret = OPAL_SUCCESS;
|
||||
uint32_t which;
|
||||
gni_return_t status;
|
||||
gni_cq_handle_t cq_vec[2];
|
||||
@ -59,36 +55,12 @@ static void *mca_btl_ugni_prog_thread_fn(void * data)
|
||||
if (status == GNI_RC_NOT_DONE) continue;
|
||||
|
||||
if ((status == GNI_RC_SUCCESS) && (stop_progress_thread == 0)) {
|
||||
thread_wakeups++;
|
||||
mca_btl_ugni_progress_thread_wakeups++;
|
||||
opal_progress();
|
||||
}
|
||||
}
|
||||
|
||||
/* Send a signal to the main thread saying we are done */
|
||||
rc = pthread_mutex_lock(&progress_mutex);
|
||||
if (0 != rc) {
|
||||
BTL_ERROR(("btl/ugni pthread_mutex_lock returned %s ",strerror(rc)));
|
||||
ret = OPAL_ERROR;
|
||||
goto fn_exit;
|
||||
}
|
||||
|
||||
progress_thread_done = 1;
|
||||
|
||||
rc = pthread_mutex_unlock(&progress_mutex);
|
||||
if (0 != rc) {
|
||||
BTL_ERROR(("btl/ugni pthread_mutex_unlock returned %s ",strerror(rc)));
|
||||
ret = OPAL_ERROR;
|
||||
goto fn_exit;
|
||||
}
|
||||
|
||||
rc = pthread_cond_signal(&progress_cond);
|
||||
if (0 != rc) {
|
||||
BTL_ERROR(("btl/ugni pthread_cond_signal returned %s ",strerror(rc)));
|
||||
ret = OPAL_ERROR;
|
||||
}
|
||||
|
||||
fn_exit:
|
||||
return ret;
|
||||
return (void *) (intptr_t) OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
int mca_btl_ugni_spawn_progress_thread(struct mca_btl_base_module_t *btl)
|
||||
@ -124,9 +96,8 @@ int mca_btl_ugni_spawn_progress_thread(struct mca_btl_base_module_t *btl)
|
||||
|
||||
int mca_btl_ugni_kill_progress_thread(void)
|
||||
{
|
||||
int rc, ret=OPAL_SUCCESS;
|
||||
gni_return_t status;
|
||||
static mca_btl_ugni_base_frag_t cq_write_frag;
|
||||
int ret=OPAL_SUCCESS;
|
||||
void *thread_rc;
|
||||
|
||||
stop_progress_thread = 1;
|
||||
|
||||
@ -134,61 +105,23 @@ int mca_btl_ugni_kill_progress_thread(void)
|
||||
* post a CQ to myself to wake my thread up
|
||||
*/
|
||||
|
||||
cq_write_frag.post_desc.base.type = GNI_POST_CQWRITE;
|
||||
cq_write_frag.post_desc.base.cqwrite_value = 0xdead; /* up to 48 bytes here, not used for now */
|
||||
cq_write_frag.post_desc.base.cq_mode = GNI_CQMODE_GLOBAL_EVENT;
|
||||
cq_write_frag.post_desc.base.dlvr_mode = GNI_DLVMODE_IN_ORDER;
|
||||
cq_write_frag.post_desc.base.src_cq_hndl = mca_btl_ugni_component.modules[0].rdma_local_cq;
|
||||
cq_write_frag.post_desc.base.remote_mem_hndl = mca_btl_ugni_component.modules[0].device->smsg_irq_mhndl;
|
||||
cq_write_frag.post_desc.tries = 0;
|
||||
cq_write_frag.cbfunc = NULL;
|
||||
OPAL_THREAD_LOCK(&mca_btl_ugni_component.modules[0].device->dev_lock);
|
||||
status = GNI_PostCqWrite(mca_btl_ugni_component.modules[0].local_ep,
|
||||
&cq_write_frag.post_desc.base);
|
||||
OPAL_THREAD_UNLOCK(&mca_btl_ugni_component.modules[0].device->dev_lock);
|
||||
ret = mca_btl_ugni_post_cqwrite (mca_btl_ugni_component.modules[0].local_ep,
|
||||
mca_btl_ugni_component.modules[0].rdma_local_cq,
|
||||
mca_btl_ugni_component.modules[0].device->smsg_irq_mhndl,
|
||||
0xdead, NULL, NULL, NULL);
|
||||
/*
|
||||
* TODO: if error returned, need to kill off thread manually
|
||||
*/
|
||||
if (GNI_RC_SUCCESS != status) {
|
||||
BTL_ERROR(("GNI_PostCqWrite returned error - %s",gni_err_str[status]));
|
||||
ret = opal_common_rc_ugni_to_opal(status);
|
||||
if (OPAL_SUCCESS != ret) {
|
||||
/* force the thread to exit */
|
||||
pthread_cancel (mca_btl_ugni_progress_thread_id);
|
||||
goto fn_exit;
|
||||
}
|
||||
|
||||
rc = pthread_mutex_lock(&progress_mutex);
|
||||
if (0 != rc) {
|
||||
BTL_ERROR(("btl/ugni pthread_mutex_lock returned %s ",strerror(rc)));
|
||||
ret = OPAL_ERROR;
|
||||
goto fn_exit;
|
||||
}
|
||||
|
||||
while (!progress_thread_done) {
|
||||
pthread_cond_wait(&progress_cond, &progress_mutex);
|
||||
if (0 != rc) {
|
||||
BTL_ERROR(("btl/ugni pthread_cond_wait returned %s ",strerror(rc)));
|
||||
ret = OPAL_ERROR;
|
||||
goto fn_exit;
|
||||
}
|
||||
}
|
||||
|
||||
rc = pthread_mutex_unlock(&progress_mutex);
|
||||
if (0 != rc) {
|
||||
BTL_ERROR(("btl/ugni pthread_mutex_unlock returned %s ",strerror(rc)));
|
||||
ret = OPAL_ERROR;
|
||||
goto fn_exit;
|
||||
}
|
||||
|
||||
/*
|
||||
* destroy the local_ep
|
||||
*/
|
||||
|
||||
OPAL_THREAD_LOCK(&mca_btl_ugni_component.modules[0].device->dev_lock);
|
||||
status = GNI_EpDestroy (mca_btl_ugni_component.modules[0].local_ep);
|
||||
OPAL_THREAD_UNLOCK(&mca_btl_ugni_component.modules[0].device->dev_lock);
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != status)) {
|
||||
BTL_ERROR(("GNI_EpDestroy returned error - %s", gni_err_str[status]));
|
||||
ret = opal_common_rc_ugni_to_opal(status);
|
||||
goto fn_exit;
|
||||
pthread_join (mca_btl_ugni_progress_thread_id, &thread_rc);
|
||||
if (0 != (intptr_t) thread_rc) {
|
||||
BTL_ERROR(("btl/ugni error returned from progress thread: %d", (int) (intptr_t) thread_rc));
|
||||
ret = (int)(intptr_t) thread_rc;
|
||||
}
|
||||
|
||||
fn_exit:
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -14,25 +14,17 @@
|
||||
|
||||
#include "btl_ugni_rdma.h"
|
||||
|
||||
/**
|
||||
* Initiate a put operation.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
int mca_btl_ugni_put (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *des) {
|
||||
mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) des;
|
||||
|
||||
BTL_VERBOSE(("Using RDMA/FMA Put for frag %p", (void *) des));
|
||||
int mca_btl_ugni_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
BTL_VERBOSE(("Using RDMA/FMA Put from local address %p to remote address %" PRIx64,
|
||||
local_address, remote_address));
|
||||
|
||||
/* cause endpoint to bind if it isn't already (bind is sufficient for rdma) */
|
||||
(void) mca_btl_ugni_check_endpoint_state(endpoint);
|
||||
(void) mca_btl_ugni_check_endpoint_state_rdma (endpoint);
|
||||
|
||||
des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||
|
||||
return mca_btl_ugni_post (frag, false, (mca_btl_ugni_segment_t *) des->des_local,
|
||||
(mca_btl_ugni_segment_t *) des->des_remote);
|
||||
return mca_btl_ugni_post (endpoint, false, size, local_address, remote_address, local_handle,
|
||||
remote_handle, order, cbfunc, cbcontext, cbdata);
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -20,107 +20,185 @@ int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep,
|
||||
mca_btl_ugni_eager_ex_frag_hdr_t hdr,
|
||||
mca_btl_ugni_base_frag_t *frag);
|
||||
|
||||
static inline void init_gni_post_desc (mca_btl_ugni_base_frag_t *frag,
|
||||
gni_post_type_t op_type,
|
||||
uint64_t lcl_addr,
|
||||
gni_mem_handle_t lcl_mdh,
|
||||
uint64_t rem_addr,
|
||||
gni_mem_handle_t rem_mdh,
|
||||
uint64_t bufsize,
|
||||
gni_cq_handle_t cq_hndl) {
|
||||
frag->post_desc.base.type = op_type;
|
||||
frag->post_desc.base.cq_mode = GNI_CQMODE_GLOBAL_EVENT;
|
||||
frag->post_desc.base.dlvr_mode = GNI_DLVMODE_PERFORMANCE;
|
||||
frag->post_desc.base.local_addr = (uint64_t) lcl_addr;
|
||||
frag->post_desc.base.local_mem_hndl = lcl_mdh;
|
||||
frag->post_desc.base.remote_addr = (uint64_t) rem_addr;
|
||||
frag->post_desc.base.remote_mem_hndl = rem_mdh;
|
||||
frag->post_desc.base.length = bufsize;
|
||||
frag->post_desc.base.rdma_mode = 0;
|
||||
frag->post_desc.base.rdma_mode = 0;
|
||||
frag->post_desc.base.src_cq_hndl = cq_hndl;
|
||||
frag->post_desc.tries = 0;
|
||||
static inline void init_gni_post_desc (opal_common_ugni_post_desc_t *post_desc,
|
||||
int order, gni_post_type_t op_type,
|
||||
uint64_t lcl_addr,
|
||||
gni_mem_handle_t lcl_mdh,
|
||||
uint64_t rem_addr,
|
||||
gni_mem_handle_t rem_mdh,
|
||||
uint64_t bufsize,
|
||||
gni_cq_handle_t cq_hndl) {
|
||||
post_desc->base.type = op_type;
|
||||
post_desc->base.cq_mode = GNI_CQMODE_GLOBAL_EVENT;
|
||||
if (MCA_BTL_NO_ORDER == order) {
|
||||
post_desc->base.dlvr_mode = GNI_DLVMODE_PERFORMANCE;
|
||||
} else {
|
||||
post_desc->base.dlvr_mode = GNI_DLVMODE_NO_ADAPT;
|
||||
}
|
||||
post_desc->base.local_addr = (uint64_t) lcl_addr;
|
||||
post_desc->base.local_mem_hndl = lcl_mdh;
|
||||
post_desc->base.remote_addr = (uint64_t) rem_addr;
|
||||
post_desc->base.remote_mem_hndl = rem_mdh;
|
||||
post_desc->base.length = bufsize;
|
||||
post_desc->base.rdma_mode = 0;
|
||||
post_desc->base.src_cq_hndl = cq_hndl;
|
||||
post_desc->tries = 0;
|
||||
}
|
||||
|
||||
static inline int mca_btl_ugni_post_fma (mca_btl_ugni_base_frag_t *frag, gni_post_type_t op_type,
|
||||
mca_btl_ugni_segment_t *lcl_seg, mca_btl_ugni_segment_t *rem_seg)
|
||||
static inline int mca_btl_ugni_post_fma (struct mca_btl_base_endpoint_t *endpoint, gni_post_type_t op_type,
|
||||
size_t size, void *local_address, uint64_t remote_address,
|
||||
mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata)
|
||||
{
|
||||
gni_return_t rc;
|
||||
mca_btl_ugni_post_descriptor_t *post_desc;
|
||||
gni_return_t grc;
|
||||
|
||||
/* Post descriptor (CQ is ignored for FMA transactions) */
|
||||
init_gni_post_desc (frag, op_type, lcl_seg->base.seg_addr.lval, lcl_seg->memory_handle,
|
||||
rem_seg->base.seg_addr.lval, rem_seg->memory_handle, lcl_seg->base.seg_len, 0);
|
||||
mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata, &post_desc);
|
||||
if (OPAL_UNLIKELY(NULL == post_desc)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock);
|
||||
rc = GNI_PostFma (frag->endpoint->rdma_ep_handle, &frag->post_desc.base);
|
||||
OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock);
|
||||
if (GNI_RC_SUCCESS != rc) {
|
||||
BTL_VERBOSE(("GNI_PostFma failed with gni rc: %d", rc));
|
||||
/* Post descriptor (CQ is ignored for FMA transactions) -- The CQ associated with the endpoint
|
||||
* is used. */
|
||||
init_gni_post_desc (&post_desc->desc, order, op_type, (intptr_t) local_address, local_handle->gni_handle,
|
||||
remote_address, remote_handle->gni_handle, size, 0);
|
||||
|
||||
OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock);
|
||||
grc = GNI_PostFma (endpoint->rdma_ep_handle, &post_desc->desc.base);
|
||||
OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock);
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) {
|
||||
mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc);
|
||||
|
||||
if (GNI_RC_ALIGNMENT_ERROR == grc) {
|
||||
BTL_VERBOSE(("GNI_PostFma failed with an alignment error"));
|
||||
return OPAL_ERR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
BTL_VERBOSE(("GNI_PostFma failed with gni rc: %d", grc));
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
static inline int mca_btl_ugni_post_bte (mca_btl_ugni_base_frag_t *frag, gni_post_type_t op_type,
|
||||
mca_btl_ugni_segment_t *lcl_seg, mca_btl_ugni_segment_t *rem_seg)
|
||||
static inline int mca_btl_ugni_post_bte (mca_btl_base_endpoint_t *endpoint, gni_post_type_t op_type,
|
||||
size_t size, void *local_address, uint64_t remote_address,
|
||||
mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata)
|
||||
{
|
||||
mca_btl_ugni_post_descriptor_t *post_desc;
|
||||
gni_cq_handle_t cq_handle = endpoint->btl->rdma_local_cq;
|
||||
gni_return_t status;
|
||||
|
||||
/* Post descriptor */
|
||||
if (mca_btl_ugni_component.progress_thread_enabled) {
|
||||
init_gni_post_desc (frag, op_type, lcl_seg->base.seg_addr.lval, lcl_seg->memory_handle,
|
||||
rem_seg->base.seg_addr.lval, rem_seg->memory_handle, lcl_seg->base.seg_len,
|
||||
frag->endpoint->btl->rdma_local_irq_cq);
|
||||
} else {
|
||||
init_gni_post_desc (frag, op_type, lcl_seg->base.seg_addr.lval, lcl_seg->memory_handle,
|
||||
rem_seg->base.seg_addr.lval, rem_seg->memory_handle, lcl_seg->base.seg_len,
|
||||
frag->endpoint->btl->rdma_local_cq);
|
||||
mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata, &post_desc);
|
||||
if (OPAL_UNLIKELY(NULL == post_desc)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock);
|
||||
status = GNI_PostRdma (frag->endpoint->rdma_ep_handle, &frag->post_desc.base);
|
||||
OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock);
|
||||
if (GNI_RC_SUCCESS != status) {
|
||||
if (mca_btl_ugni_component.progress_thread_enabled) {
|
||||
cq_handle = endpoint->btl->rdma_local_irq_cq;
|
||||
}
|
||||
|
||||
/* Post descriptor */
|
||||
init_gni_post_desc (&post_desc->desc, order, op_type, (intptr_t) local_address, local_handle->gni_handle,
|
||||
remote_address, remote_handle->gni_handle, size, cq_handle);
|
||||
|
||||
OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock);
|
||||
status = GNI_PostRdma (endpoint->rdma_ep_handle, &post_desc->desc.base);
|
||||
OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock);
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != status)) {
|
||||
mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc);
|
||||
|
||||
if (GNI_RC_ALIGNMENT_ERROR == status) {
|
||||
BTL_VERBOSE(("GNI_PostRdma failed with an alignment error"));
|
||||
return OPAL_ERR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
BTL_VERBOSE(("GNI_PostRdma failed with gni rc: %d", status));
|
||||
return opal_common_rc_ugni_to_opal(status);
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
static inline int mca_btl_ugni_post (mca_btl_ugni_base_frag_t *frag, bool get, mca_btl_ugni_segment_t *lcl_seg,
|
||||
mca_btl_ugni_segment_t *rem_seg) {
|
||||
static inline int mca_btl_ugni_post_cqwrite (mca_btl_base_endpoint_t *endpoint, gni_cq_handle_t cq_handle,
|
||||
gni_mem_handle_t irq_mhndl, uint64_t value,
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata)
|
||||
{
|
||||
mca_btl_ugni_post_descriptor_t *post_desc;
|
||||
gni_return_t grc;
|
||||
|
||||
mca_btl_ugni_alloc_post_descriptor (endpoint, NULL, cbfunc, cbcontext, cbdata, &post_desc);
|
||||
if (OPAL_UNLIKELY(NULL == post_desc)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
post_desc->desc.base.type = GNI_POST_CQWRITE;
|
||||
post_desc->desc.base.cqwrite_value = value; /* up to 48 bytes here, not used for now */
|
||||
post_desc->desc.base.cq_mode = GNI_CQMODE_GLOBAL_EVENT;
|
||||
post_desc->desc.base.dlvr_mode = GNI_DLVMODE_IN_ORDER;
|
||||
post_desc->desc.base.src_cq_hndl = cq_handle;
|
||||
post_desc->desc.base.remote_mem_hndl = irq_mhndl;
|
||||
post_desc->desc.tries = 0;
|
||||
|
||||
OPAL_THREAD_LOCK(&endpoint->common->dev->dev_lock);
|
||||
grc = GNI_PostCqWrite(endpoint->rdma_ep_handle, &post_desc->desc.base);
|
||||
OPAL_THREAD_UNLOCK(&endpoint->common->dev->dev_lock);
|
||||
if (GNI_RC_SUCCESS != grc) { /* errors for PostCqWrite treated as non-fatal */
|
||||
BTL_VERBOSE(("GNI_PostCqWrite returned error - %s", gni_err_str[grc]));
|
||||
mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc);
|
||||
}
|
||||
|
||||
return opal_common_rc_ugni_to_opal (grc);
|
||||
}
|
||||
|
||||
static inline int mca_btl_ugni_post (mca_btl_base_endpoint_t *endpoint, int get, size_t size,
|
||||
void *local_address, uint64_t remote_address,
|
||||
mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata)
|
||||
{
|
||||
const gni_post_type_t fma_ops[2] = {GNI_POST_FMA_PUT, GNI_POST_FMA_GET};
|
||||
const gni_post_type_t rdma_ops[2] = {GNI_POST_RDMA_PUT, GNI_POST_RDMA_GET};
|
||||
|
||||
if (frag->base.des_local->seg_len <= mca_btl_ugni_component.ugni_fma_limit) {
|
||||
return mca_btl_ugni_post_fma (frag, fma_ops[get], lcl_seg, rem_seg);
|
||||
if (size <= mca_btl_ugni_component.ugni_fma_limit) {
|
||||
return mca_btl_ugni_post_fma (endpoint, fma_ops[get], size, local_address, remote_address,
|
||||
local_handle, remote_handle, order, cbfunc, cbcontext, cbdata);
|
||||
}
|
||||
|
||||
return mca_btl_ugni_post_bte (frag, rdma_ops[get], lcl_seg, rem_seg);
|
||||
return mca_btl_ugni_post_bte (endpoint, rdma_ops[get], size, local_address, remote_address,
|
||||
local_handle, remote_handle, order, cbfunc, cbcontext, cbdata);
|
||||
}
|
||||
|
||||
static inline void mca_btl_ugni_repost (mca_btl_ugni_base_frag_t *frag) {
|
||||
static inline int mca_btl_ugni_repost (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_post_descriptor_t *post_desc)
|
||||
{
|
||||
gni_return_t grc;
|
||||
|
||||
OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock);
|
||||
if (GNI_POST_RDMA_PUT == frag->post_desc.base.type ||
|
||||
GNI_POST_RDMA_GET == frag->post_desc.base.type) {
|
||||
grc = GNI_PostRdma (frag->endpoint->rdma_ep_handle, &frag->post_desc.base);
|
||||
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
|
||||
if (GNI_POST_RDMA_PUT == post_desc->desc.base.type ||
|
||||
GNI_POST_RDMA_GET == post_desc->desc.base.type) {
|
||||
grc = GNI_PostRdma (post_desc->endpoint->rdma_ep_handle, &post_desc->desc.base);
|
||||
} else {
|
||||
grc = GNI_PostFma (frag->endpoint->rdma_ep_handle, &frag->post_desc.base);
|
||||
grc = GNI_PostFma (post_desc->endpoint->rdma_ep_handle, &post_desc->desc.base);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) {
|
||||
/* NTH: Should we even retry these? When this code was written there was no indication
|
||||
* whether an error in post is recoverable. Clobber this code and the associated data
|
||||
* structures if post errors are not recoverable. */
|
||||
OPAL_THREAD_LOCK(&frag->endpoint->btl->failed_frags_lock);
|
||||
opal_list_append (&frag->endpoint->btl->failed_frags, (opal_list_item_t *) frag);
|
||||
OPAL_THREAD_UNLOCK(&frag->endpoint->btl->failed_frags_lock);
|
||||
OPAL_THREAD_LOCK(&ugni_module->pending_descriptors_lock);
|
||||
opal_list_append (&ugni_module->pending_descriptors, (opal_list_item_t *) post_desc);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->pending_descriptors_lock);
|
||||
}
|
||||
|
||||
return opal_common_rc_ugni_to_opal (grc);
|
||||
}
|
||||
|
||||
#endif /* MCA_BTL_UGNI_RDMA_H */
|
||||
|
@ -23,7 +23,7 @@ int mca_btl_ugni_send (struct mca_btl_base_module_t *btl,
|
||||
mca_btl_base_tag_t tag)
|
||||
{
|
||||
mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) descriptor;
|
||||
size_t size = frag->segments[0].base.seg_len + frag->segments[1].base.seg_len;
|
||||
size_t size = frag->segments[0].seg_len + frag->segments[1].seg_len;
|
||||
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
|
||||
int flags_save = frag->base.des_flags;
|
||||
int rc;
|
||||
@ -41,7 +41,7 @@ int mca_btl_ugni_send (struct mca_btl_base_module_t *btl,
|
||||
}
|
||||
|
||||
BTL_VERBOSE(("btl/ugni sending descriptor %p from %d -> %d. length = %" PRIu64, (void *)descriptor,
|
||||
OPAL_PROC_MY_NAME.vpid, endpoint->common->ep_rem_id, frag->segments[0].base.seg_len));
|
||||
OPAL_PROC_MY_NAME.vpid, endpoint->common->ep_rem_id, size));
|
||||
|
||||
/* temporarily disable ownership and callback flags so we can reliably check the complete flag */
|
||||
frag->base.des_flags &= ~(MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
|
||||
@ -90,14 +90,13 @@ int mca_btl_ugni_send (struct mca_btl_base_module_t *btl,
|
||||
return rc;
|
||||
}
|
||||
|
||||
int
|
||||
mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct opal_convertor_t *convertor,
|
||||
void *header, size_t header_size,
|
||||
size_t payload_size, uint8_t order,
|
||||
uint32_t flags, mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t **descriptor)
|
||||
int mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct opal_convertor_t *convertor,
|
||||
void *header, size_t header_size,
|
||||
size_t payload_size, uint8_t order,
|
||||
uint32_t flags, mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t **descriptor)
|
||||
{
|
||||
size_t total_size = header_size + payload_size;
|
||||
mca_btl_ugni_base_frag_t *frag = NULL;
|
||||
@ -118,13 +117,14 @@ mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl,
|
||||
frag = (mca_btl_ugni_base_frag_t *) mca_btl_ugni_prepare_src_send_buffered (btl, endpoint, convertor, order,
|
||||
header_size, &packed_size, flags);
|
||||
}
|
||||
|
||||
assert (packed_size == payload_size);
|
||||
if (OPAL_UNLIKELY(NULL == frag)) {
|
||||
break;
|
||||
}
|
||||
|
||||
frag->hdr.send.lag = (tag << 24) | total_size;
|
||||
memcpy (frag->segments[0].base.seg_addr.pval, header, header_size);
|
||||
memcpy (frag->segments[0].seg_addr.pval, header, header_size);
|
||||
|
||||
rc = mca_btl_ugni_send_frag (endpoint, frag);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
@ -135,7 +135,9 @@ mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl,
|
||||
return OPAL_SUCCESS;
|
||||
} while (0);
|
||||
|
||||
*descriptor = NULL;
|
||||
if (NULL != descriptor) {
|
||||
*descriptor = NULL;
|
||||
}
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
@ -151,7 +153,13 @@ int mca_btl_ugni_progress_send_wait_list (mca_btl_base_endpoint_t *endpoint)
|
||||
if (NULL == frag) {
|
||||
break;
|
||||
}
|
||||
rc = mca_btl_ugni_send_frag (endpoint, frag);
|
||||
if (OPAL_LIKELY(!(frag->flags & MCA_BTL_UGNI_FRAG_RESPONSE))) {
|
||||
rc = mca_btl_ugni_send_frag (endpoint, frag);
|
||||
} else {
|
||||
rc = opal_mca_btl_ugni_smsg_send (frag, &frag->hdr.rdma, sizeof (frag->hdr.rdma),
|
||||
NULL, 0, MCA_BTL_UGNI_TAG_RDMA_COMPLETE);
|
||||
}
|
||||
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS > rc)) {
|
||||
if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
|
||||
OPAL_THREAD_LOCK(&endpoint->lock);
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -26,7 +26,7 @@ static void mca_btl_ugni_smsg_mbox_construct (mca_btl_ugni_smsg_mbox_t *mbox) {
|
||||
mbox->attr.smsg_attr.mbox_offset = (uintptr_t) mbox->super.ptr - (uintptr_t) base_reg->base;
|
||||
mbox->attr.smsg_attr.msg_buffer = base_reg->base;
|
||||
mbox->attr.smsg_attr.buff_size = mca_btl_ugni_component.smsg_mbox_size;
|
||||
mbox->attr.smsg_attr.mem_hndl = ugni_reg->memory_hdl;
|
||||
mbox->attr.smsg_attr.mem_hndl = ugni_reg->handle.gni_handle;
|
||||
mbox->attr.proc_id = mca_btl_ugni_proc_name_to_id (OPAL_PROC_MY_NAME);
|
||||
mbox->attr.rmt_irq_mem_hndl = mca_btl_ugni_component.modules[0].device->smsg_irq_mhndl;
|
||||
}
|
||||
@ -106,8 +106,8 @@ int mca_btl_ugni_smsg_process (mca_btl_base_endpoint_t *ep)
|
||||
BTL_VERBOSE(("received smsg fragment. hdr = {len = %u, tag = %d}", len, tag));
|
||||
|
||||
reg = mca_btl_base_active_message_trigger + tag;
|
||||
frag.base.des_local = &seg;
|
||||
frag.base.des_local_count = 1;
|
||||
frag.base.des_segments = &seg;
|
||||
frag.base.des_segment_count = 1;
|
||||
|
||||
seg.seg_addr.pval = (void *)((uintptr_t)data_ptr + sizeof (mca_btl_ugni_send_frag_hdr_t));
|
||||
seg.seg_len = len;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -82,22 +82,12 @@ static inline int mca_btl_ugni_progress_local_smsg (mca_btl_ugni_module_t *ugni_
|
||||
return 1;
|
||||
}
|
||||
|
||||
static void mca_btl_ugni_cqwrite_complete (struct mca_btl_ugni_base_frag_t *frag, int rc)
|
||||
{
|
||||
frag->flags |= MCA_BTL_UGNI_FRAG_COMPLETE;
|
||||
|
||||
BTL_VERBOSE(("cqwrite frag complete"));
|
||||
mca_btl_ugni_frag_return (frag);
|
||||
}
|
||||
|
||||
static inline int opal_mca_btl_ugni_smsg_send (mca_btl_ugni_base_frag_t *frag,
|
||||
void *hdr, size_t hdr_len,
|
||||
void *payload, size_t payload_len,
|
||||
mca_btl_ugni_smsg_tag_t tag)
|
||||
{
|
||||
int rc;
|
||||
gni_return_t grc;
|
||||
mca_btl_ugni_base_frag_t *cq_write_frag = NULL;
|
||||
|
||||
OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock);
|
||||
grc = GNI_SmsgSendWTag (frag->endpoint->smsg_ep_handle, hdr, hdr_len,
|
||||
@ -110,28 +100,9 @@ static inline int opal_mca_btl_ugni_smsg_send (mca_btl_ugni_base_frag_t *frag,
|
||||
|
||||
if (mca_btl_ugni_component.progress_thread_enabled) {
|
||||
if (frag->base.des_flags & MCA_BTL_DES_FLAGS_SIGNAL) {
|
||||
rc = mca_btl_ugni_frag_alloc(frag->endpoint,
|
||||
&frag->endpoint->btl->rdma_frags,
|
||||
&cq_write_frag);
|
||||
if (rc == OPAL_SUCCESS) {
|
||||
cq_write_frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
|
||||
cq_write_frag->registration = NULL;
|
||||
cq_write_frag->endpoint = frag->endpoint;
|
||||
cq_write_frag->post_desc.base.type = GNI_POST_CQWRITE;
|
||||
cq_write_frag->post_desc.base.cqwrite_value = 0xdead; /* up to 48 bytes here, not used for now */
|
||||
cq_write_frag->post_desc.base.cq_mode = GNI_CQMODE_GLOBAL_EVENT;
|
||||
cq_write_frag->post_desc.base.dlvr_mode = GNI_DLVMODE_IN_ORDER;
|
||||
cq_write_frag->post_desc.base.src_cq_hndl = frag->endpoint->btl->rdma_local_cq;
|
||||
cq_write_frag->post_desc.base.remote_mem_hndl = frag->endpoint->rmt_irq_mem_hndl;
|
||||
cq_write_frag->post_desc.tries = 0;
|
||||
cq_write_frag->cbfunc = mca_btl_ugni_cqwrite_complete;
|
||||
OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock);
|
||||
grc = GNI_PostCqWrite(frag->endpoint->rdma_ep_handle, &cq_write_frag->post_desc.base);
|
||||
OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock);
|
||||
if (grc == GNI_RC_ERROR_RESOURCE) { /* errors for PostCqWrite treated as non-fatal */
|
||||
mca_btl_ugni_frag_return (cq_write_frag);
|
||||
}
|
||||
}
|
||||
/* errors for PostCqWrite treated as non-fatal */
|
||||
(void) mca_btl_ugni_post_cqwrite (frag->endpoint, frag->endpoint->btl->rdma_local_cq,
|
||||
frag->endpoint->rmt_irq_mem_hndl, 0xdead, NULL, NULL, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
@ -155,12 +126,13 @@ static inline int mca_btl_ugni_send_frag (struct mca_btl_base_endpoint_t *btl_pe
|
||||
mca_btl_ugni_base_frag_t *frag) {
|
||||
if (OPAL_LIKELY(!(frag->flags & MCA_BTL_UGNI_FRAG_EAGER))) {
|
||||
return opal_mca_btl_ugni_smsg_send (frag, &frag->hdr.send, frag->hdr_size,
|
||||
frag->segments[1].base.seg_addr.pval,
|
||||
frag->segments[1].base.seg_len,
|
||||
frag->segments[1].seg_addr.pval,
|
||||
frag->segments[1].seg_len,
|
||||
MCA_BTL_UGNI_TAG_SEND);
|
||||
}
|
||||
|
||||
frag->hdr.eager.src_seg = frag->segments[1];
|
||||
frag->hdr.eager.size = frag->segments[1].seg_len;
|
||||
frag->hdr.eager.address = frag->segments[1].seg_addr.lval;
|
||||
frag->hdr.eager.ctx = (void *) frag;
|
||||
|
||||
return opal_mca_btl_ugni_smsg_send (frag, &frag->hdr.eager, frag->hdr_size,
|
||||
|
@ -141,9 +141,12 @@ After the checks above are done, the fragment is enqueued to be sent
|
||||
via opal_btl_usnic_endpoint_enqueue_frag()
|
||||
|
||||
usnic_put()
|
||||
PML will have filled in destination address in descriptor. This is saved
|
||||
and the fragment is enqueued for processing.
|
||||
|
||||
Do a fast version of what happens in prepare_src() (can take shortcuts
|
||||
because we know it will always be a contiguous buffer / no convertor
|
||||
needed). PML gives us the destination address, which we save on the
|
||||
fragment (which is the sentinel value that the underlying engine uses
|
||||
to know that this is a PUT and not a SEND), and the fragment is
|
||||
enqueued for processing.
|
||||
|
||||
opal_btl_usnic_endpoint_enqueue_frag()
|
||||
This appends the fragment to the "to be sent" list of the endpoint and
|
||||
@ -200,8 +203,6 @@ opal_btl_usnic_recv_fast() called fastpath_ok which is set to false every time
|
||||
the fastpath is taken. A call into the regular progress routine will set this
|
||||
flag back to true.
|
||||
|
||||
|
||||
|
||||
======================================
|
||||
reliability:
|
||||
|
||||
@ -233,7 +234,6 @@ rcvr:
|
||||
sender:
|
||||
duplicate ACK triggers immediate retrans if one is not pending for that segment
|
||||
|
||||
|
||||
======================================
|
||||
Reordering induced by two queues and piggy-backing:
|
||||
|
||||
@ -248,6 +248,42 @@ keep command queue empty enough and also beat out the large sends.
|
||||
send credits limit how many larges can be queued on the sender, but there
|
||||
could be many on the receiver
|
||||
|
||||
|
||||
======================================
|
||||
RDMA emulation
|
||||
|
||||
We emulate the RDMA PUT because it's more efficient than regular send:
|
||||
it allows the receiver to copy directly to the target buffer
|
||||
(vs. making an intermediate copy out of the bounce buffer).
|
||||
|
||||
It would actually be better to morph this PUT into a GET -- GET would
|
||||
be slightly more efficient. In short, when the target requests the
|
||||
actual RDMA data, with PUT, the request has to go up to the PML, which
|
||||
will then invoke PUT on the source's BTL module. With GET, the target
|
||||
issues the GET, and the source BTL module can reply without needing to
|
||||
go up the stack to the PML.
|
||||
|
||||
Once we start supporting RDMA in hardware:
|
||||
|
||||
- we need to provide module.btl_register_mem and
|
||||
module.btl_deregister_mem functions (see openib for an example)
|
||||
- we need to put something meaningful in
|
||||
btl_usnic_frag.h:mca_btl_base_registration_handle_t.
|
||||
- we need to set module.btl_registration_handle_size to sizeof(struct
|
||||
mca_btl_base_registration_handle_t).
|
||||
- module.btl_put / module.btl_get will receive the
|
||||
mca_btl_base_registration_handle_t from the peer as a cookie.
|
||||
|
||||
Also, module.btl_put / module.btl_get do not need to make descriptors
|
||||
(this was an optimization added in BTL 3.0). They are now called with
|
||||
enough information to do whatever they need to do. module.btl_put
|
||||
still makes a descriptor and submits it to the usnic sending engine so
|
||||
as to utilize a common infrastructure for send and put.
|
||||
|
||||
But it doesn't necessarily have to be that way -- we could optimize
|
||||
out the use of the descriptors. Have not investigated how easy/hard
|
||||
that would be.
|
||||
|
||||
======================================
|
||||
|
||||
November 2014 / SC 2014
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -147,11 +147,21 @@ opal_btl_usnic_handle_ack(
|
||||
* fragment really needs to be freed, we'll take care of it in a few
|
||||
* lines below.
|
||||
*/
|
||||
if (frag->sf_ack_bytes_left == bytes_acked &&
|
||||
((frag->sf_base.uf_remote_seg[0].seg_addr.pval != NULL) ||
|
||||
(frag->sf_base.uf_base.des_flags &
|
||||
MCA_BTL_DES_SEND_ALWAYS_CALLBACK))) {
|
||||
OPAL_BTL_USNIC_DO_SEND_FRAG_CB(module, frag, "send completion");
|
||||
if (frag->sf_ack_bytes_left == bytes_acked) {
|
||||
#if BTL_VERSION == 30
|
||||
if (frag->sf_base.uf_remote_seg[0].seg_addr.pval != NULL) {
|
||||
OPAL_BTL_USNIC_DO_PUT_FRAG_CB(module, frag, "put completion");
|
||||
} else if (frag->sf_base.uf_base.des_flags &
|
||||
MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
|
||||
OPAL_BTL_USNIC_DO_SEND_FRAG_CB(module, frag, "send completion");
|
||||
}
|
||||
#else
|
||||
if ((frag->sf_base.uf_remote_seg[0].seg_addr.pval != NULL) ||
|
||||
(frag->sf_base.uf_base.des_flags &
|
||||
MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) {
|
||||
OPAL_BTL_USNIC_DO_SEND_FRAG_CB(module, frag, "send completion");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/* free this segment */
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -17,12 +17,13 @@
|
||||
#include "btl_usnic.h"
|
||||
#include "btl_usnic_frag.h"
|
||||
#include "btl_usnic_endpoint.h"
|
||||
#include "btl_usnic_compat.h"
|
||||
|
||||
/* Invoke the descriptor callback for the frag, updating stats and clearing the
|
||||
* _CALLBACK flag in the process. */
|
||||
/* Invoke the descriptor callback for a (non-PUT) send frag, updating
|
||||
* stats and clearing the _CALLBACK flag in the process. */
|
||||
#define OPAL_BTL_USNIC_DO_SEND_FRAG_CB(module, send_frag, comment) \
|
||||
do { \
|
||||
MSGDEBUG1_OUT("%s:%d: %s send callback for module=%p frag=%p\n", \
|
||||
MSGDEBUG1_OUT("%s:%d: %s SEND callback for module=%p frag=%p\n", \
|
||||
__func__, __LINE__, \
|
||||
(comment), (void *)(module), (void *)(send_frag)); \
|
||||
(send_frag)->sf_base.uf_base.des_cbfunc( \
|
||||
@ -34,6 +35,28 @@
|
||||
++((module)->stats.pml_send_callbacks); \
|
||||
} while (0)
|
||||
|
||||
#if BTL_VERSION == 30
/* Run the completion callback stored on a send frag that carried a
 * PUT.  The descriptor's des_cbfunc is called through the
 * mca_btl_base_rdma_completion_fn_t signature with: the module's base
 * ("super") structure, the frag's endpoint, the address of the frag's
 * first local segment, NULL as the fourth argument, the descriptor's
 * des_context and des_cbdata, and OPAL_SUCCESS as the status.
 * Afterwards the module's pml_send_callbacks statistic is bumped. */
#define OPAL_BTL_USNIC_DO_PUT_FRAG_CB(module, send_frag, comment)       \
    do {                                                                \
        MSGDEBUG1_OUT("%s:%d: %s PUT callback for module=%p frag=%p\n", \
                      __func__, __LINE__,                               \
                      (comment), (void *)(module), (void *)(send_frag)); \
        ((mca_btl_base_rdma_completion_fn_t)                            \
         (send_frag)->sf_base.uf_base.des_cbfunc)(                      \
            &(module)->super,                                           \
            (send_frag)->sf_endpoint,                                   \
            (send_frag)->sf_base.uf_local_seg[0].seg_addr.pval,         \
            NULL,                                                       \
            (send_frag)->sf_base.uf_base.des_context,                   \
            (send_frag)->sf_base.uf_base.des_cbdata,                    \
            OPAL_SUCCESS);                                              \
        ++((module)->stats.pml_send_callbacks);                         \
    } while (0)
#endif
|
||||
|
||||
/*
|
||||
* Reap an ACK send that is complete
|
||||
*/
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -9,15 +9,20 @@
|
||||
|
||||
#if BTL_IN_OPAL
|
||||
#include "opal_config.h"
|
||||
#include "opal/mca/btl/btl.h"
|
||||
#else
|
||||
#include "ompi_config.h"
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
#endif
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal_stdint.h"
|
||||
|
||||
#include "btl_usnic_compat.h"
|
||||
#include "btl_usnic_frag.h"
|
||||
#include "btl_usnic_endpoint.h"
|
||||
#include "btl_usnic_connectivity.h"
|
||||
#include "btl_usnic_send.h"
|
||||
|
||||
/************************************************************************/
|
||||
|
||||
@ -114,4 +119,599 @@ const char *usnic_compat_proc_name_print(opal_process_name_t *pname)
|
||||
return OMPI_NAME_PRINT(pname);
|
||||
}
|
||||
|
||||
#endif /* OMPI version */
|
||||
|
||||
/************************************************************************/
|
||||
|
||||
/* BTL 2.0 and 3.0 compatibility functions */
|
||||
|
||||
/*----------------------------------------------------------------------*/
|
||||
|
||||
/* The following functions are common between BTL 2.0 and 3.0 */
|
||||
|
||||
/* Responsible for sending "small" frags (reserve + *size <= max_frag_payload)
|
||||
* in the same manner as btl_prepare_src. Must return a smaller amount than
|
||||
* requested if the given convertor cannot process the entire (*size).
|
||||
*/
|
||||
static inline opal_btl_usnic_send_frag_t *
|
||||
prepare_src_small(
|
||||
struct opal_btl_usnic_module_t* module,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags)
|
||||
{
|
||||
opal_btl_usnic_send_frag_t *frag;
|
||||
opal_btl_usnic_small_send_frag_t *sfrag;
|
||||
size_t payload_len;
|
||||
|
||||
payload_len = *size + reserve;
|
||||
assert(payload_len <= module->max_frag_payload); /* precondition */
|
||||
|
||||
sfrag = opal_btl_usnic_small_send_frag_alloc(module);
|
||||
if (OPAL_UNLIKELY(NULL == sfrag)) {
|
||||
return NULL;
|
||||
}
|
||||
frag = &sfrag->ssf_base;
|
||||
|
||||
/* In the case of a convertor, we will copy the data in now, since that is
|
||||
* the cheapest way to discover how much we can actually send (since we know
|
||||
* we will pack it anyway later). The alternative is to do all of the
|
||||
* following:
|
||||
* 1) clone_with_position(convertor) and see where the new position ends up
|
||||
* actually being (see opal_btl_usnic_convertor_pack_peek). Otherwise we
|
||||
* aren't fulfilling our contract w.r.t. (*size).
|
||||
* 2) Add a bunch of branches checking for different cases, both here and in
|
||||
* progress_sends
|
||||
* 3) If we choose to defer the packing, we must clone the convertor because
|
||||
* the PML owns it and might reuse it for another prepare_src call.
|
||||
*
|
||||
* Two convertor clones is likely to be at least as slow as just copying the
|
||||
* data and might consume a similar amount of memory. Plus we still have to
|
||||
* pack it later to send it.
|
||||
*
|
||||
* The reason we do not copy non-convertor buffer at this point is because
|
||||
* we might still use INLINE for the send, and in that case we do not want
|
||||
* to copy the data at all.
|
||||
*/
|
||||
if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) {
|
||||
/* put user data just after end of 1st seg (upper layer header) */
|
||||
assert(payload_len <= module->max_frag_payload);
|
||||
usnic_convertor_pack_simple(
|
||||
convertor,
|
||||
(IOVBASE_TYPE*)(intptr_t)(frag->sf_base.uf_local_seg[0].seg_addr.lval + reserve),
|
||||
*size,
|
||||
size);
|
||||
payload_len = reserve + *size;
|
||||
frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT = 1;
|
||||
/* PML will copy header into beginning of segment */
|
||||
frag->sf_base.uf_local_seg[0].seg_len = payload_len;
|
||||
} else {
|
||||
opal_convertor_get_current_pointer(convertor,
|
||||
&sfrag->ssf_base.sf_base.uf_local_seg[1].seg_addr.pval);
|
||||
frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT = 2;
|
||||
frag->sf_base.uf_local_seg[0].seg_len = reserve;
|
||||
frag->sf_base.uf_local_seg[1].seg_len = *size;
|
||||
}
|
||||
|
||||
frag->sf_base.uf_base.des_flags = flags;
|
||||
frag->sf_endpoint = endpoint;
|
||||
|
||||
return frag;
|
||||
}
|
||||
|
||||
static void *
|
||||
pack_chunk_seg_chain_with_reserve(
|
||||
struct opal_btl_usnic_module_t* module,
|
||||
opal_btl_usnic_large_send_frag_t *lfrag,
|
||||
size_t reserve_len,
|
||||
opal_convertor_t *convertor,
|
||||
size_t max_convertor_bytes,
|
||||
size_t *convertor_bytes_packed)
|
||||
{
|
||||
opal_btl_usnic_chunk_segment_t *seg;
|
||||
void *ret_ptr = NULL;
|
||||
int n_segs;
|
||||
uint8_t *copyptr;
|
||||
size_t copylen;
|
||||
size_t seg_space;
|
||||
size_t max_data;
|
||||
bool first_pass;
|
||||
|
||||
assert(NULL != lfrag);
|
||||
assert(NULL != convertor_bytes_packed);
|
||||
|
||||
n_segs = 0;
|
||||
*convertor_bytes_packed = 0;
|
||||
|
||||
first_pass = true;
|
||||
while (*convertor_bytes_packed < max_convertor_bytes ||
|
||||
first_pass) {
|
||||
seg = opal_btl_usnic_chunk_segment_alloc(module);
|
||||
if (OPAL_UNLIKELY(NULL == seg)) {
|
||||
BTL_ERROR(("chunk segment allocation error"));
|
||||
abort(); /* XXX */
|
||||
}
|
||||
++n_segs;
|
||||
|
||||
seg_space = module->max_chunk_payload;
|
||||
copyptr = seg->ss_base.us_payload.raw;
|
||||
|
||||
if (first_pass) {
|
||||
/* logic could accommodate >max, but currently doesn't */
|
||||
assert(reserve_len <= module->max_chunk_payload);
|
||||
ret_ptr = copyptr;
|
||||
seg_space -= reserve_len;
|
||||
copyptr += reserve_len;
|
||||
}
|
||||
|
||||
/* now pack any convertor data */
|
||||
if (*convertor_bytes_packed < max_convertor_bytes && seg_space > 0) {
|
||||
copylen = max_convertor_bytes - *convertor_bytes_packed;
|
||||
if (copylen > seg_space) {
|
||||
copylen = seg_space;
|
||||
}
|
||||
usnic_convertor_pack_simple(convertor, copyptr, copylen, &max_data);
|
||||
seg_space -= max_data;
|
||||
*convertor_bytes_packed += max_data;
|
||||
|
||||
/* If unable to pack any of the remaining bytes, release the
|
||||
* most recently allocated segment and finish processing.
|
||||
*/
|
||||
if (seg_space == module->max_chunk_payload) {
|
||||
assert(max_data == 0); /* only way this can happen */
|
||||
opal_btl_usnic_chunk_segment_return(module, seg);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* bozo checks */
|
||||
assert(seg_space >= 0);
|
||||
assert(seg_space < module->max_chunk_payload);
|
||||
|
||||
/* append segment of data to chain to send */
|
||||
seg->ss_parent_frag = &lfrag->lsf_base;
|
||||
seg->ss_len = module->max_chunk_payload - seg_space;
|
||||
opal_list_append(&lfrag->lsf_seg_chain, &seg->ss_base.us_list.super);
|
||||
|
||||
#if MSGDEBUG1
|
||||
opal_output(0, "%s: appending seg=%p, frag=%p, payload=%zd\n",
|
||||
__func__, (void *)seg, (void *)lfrag,
|
||||
(module->max_chunk_payload - seg_space));
|
||||
#endif
|
||||
|
||||
first_pass = false;
|
||||
}
|
||||
|
||||
return ret_ptr;
|
||||
}
|
||||
|
||||
/* Responsible for handling "large" frags (reserve + *size > max_frag_payload)
|
||||
* in the same manner as btl_prepare_src. Must return a smaller amount than
|
||||
* requested if the given convertor cannot process the entire (*size).
|
||||
*/
|
||||
static opal_btl_usnic_send_frag_t *
|
||||
prepare_src_large(
|
||||
struct opal_btl_usnic_module_t* module,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags)
|
||||
{
|
||||
opal_btl_usnic_send_frag_t *frag;
|
||||
opal_btl_usnic_large_send_frag_t *lfrag;
|
||||
int rc;
|
||||
|
||||
/* Get holder for the msg */
|
||||
lfrag = opal_btl_usnic_large_send_frag_alloc(module);
|
||||
if (OPAL_UNLIKELY(NULL == lfrag)) {
|
||||
return NULL;
|
||||
}
|
||||
frag = &lfrag->lsf_base;
|
||||
|
||||
/* The header location goes in SG[0], payload in SG[1]. If we are using a
|
||||
* convertor then SG[1].seg_len is accurate but seg_addr is NULL. */
|
||||
frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT = 2;
|
||||
|
||||
/* stash header location, PML will write here */
|
||||
frag->sf_base.uf_local_seg[0].seg_addr.pval = &lfrag->lsf_ompi_header;
|
||||
frag->sf_base.uf_local_seg[0].seg_len = reserve;
|
||||
/* make sure upper header small enough */
|
||||
assert(reserve <= sizeof(lfrag->lsf_ompi_header));
|
||||
|
||||
if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) {
|
||||
/* threshold == -1 means always pack eagerly */
|
||||
if (mca_btl_usnic_component.pack_lazy_threshold >= 0 &&
|
||||
*size >= (size_t)mca_btl_usnic_component.pack_lazy_threshold) {
|
||||
MSGDEBUG1_OUT("packing frag %p on the fly", (void *)frag);
|
||||
lfrag->lsf_pack_on_the_fly = true;
|
||||
|
||||
/* tell the PML we will absorb as much as possible while still
|
||||
* respecting indivisible element boundaries in the convertor */
|
||||
*size = opal_btl_usnic_convertor_pack_peek(convertor, *size);
|
||||
|
||||
/* Clone the convertor b/c we (the BTL) don't own it and the PML
|
||||
* might mutate it after we return from this function. */
|
||||
rc = opal_convertor_clone(convertor, &frag->sf_convertor,
|
||||
/*copy_stack=*/true);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
BTL_ERROR(("unexpected convertor clone error"));
|
||||
abort(); /* XXX */
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* pack everything in the convertor into a chain of segments now,
|
||||
* leaving space for the PML header in the first segment */
|
||||
lfrag->lsf_base.sf_base.uf_local_seg[0].seg_addr.pval =
|
||||
pack_chunk_seg_chain_with_reserve(module, lfrag, reserve,
|
||||
convertor, *size, size);
|
||||
}
|
||||
|
||||
/* We set SG[1] to {NULL,bytes_packed} so that various calculations
|
||||
* by both PML and this BTL will be correct. For example, the PML adds
|
||||
* up the bytes in the descriptor segments to determine if an MPI-level
|
||||
* request is complete or not. */
|
||||
frag->sf_base.uf_local_seg[1].seg_addr.pval = NULL;
|
||||
frag->sf_base.uf_local_seg[1].seg_len = *size;
|
||||
} else {
|
||||
/* convertor not needed, just save the payload pointer in SG[1] */
|
||||
lfrag->lsf_pack_on_the_fly = true;
|
||||
opal_convertor_get_current_pointer(convertor,
|
||||
&frag->sf_base.uf_local_seg[1].seg_addr.pval);
|
||||
frag->sf_base.uf_local_seg[1].seg_len = *size;
|
||||
}
|
||||
|
||||
frag->sf_base.uf_base.des_flags = flags;
|
||||
frag->sf_endpoint = endpoint;
|
||||
|
||||
return frag;
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------*/
|
||||
|
||||
#if BTL_VERSION == 20
|
||||
|
||||
/*
|
||||
* BTL 2.0 version of module.btl_prepare_src.
|
||||
*
|
||||
* Note the "user" data the PML wishes to communicate and return a descriptor
|
||||
* that can be used for send or put. We create a frag (which is also a
|
||||
* descriptor by virtue of its base class) and populate it with enough
|
||||
* source information to complete a future send/put.
|
||||
*
|
||||
* We will create either a small send frag if < than an MTU, otherwise a large
|
||||
* send frag. The convertor will be saved for deferred packing if the user
|
||||
* buffer is noncontiguous. Otherwise it will be saved in one of the
|
||||
* descriptor's SGEs.
|
||||
*
|
||||
* NOTE that the *only* reason this routine is allowed to return a size smaller
|
||||
* than was requested is if the convertor cannot process the entire amount.
|
||||
*/
|
||||
mca_btl_base_descriptor_t*
|
||||
opal_btl_usnic_prepare_src(
|
||||
struct mca_btl_base_module_t* base_module,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags)
|
||||
{
|
||||
opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) base_module;
|
||||
opal_btl_usnic_send_frag_t *frag;
|
||||
uint32_t payload_len;
|
||||
#if MSGDEBUG2
|
||||
size_t osize = *size;
|
||||
#endif
|
||||
|
||||
/* Do we need to check the connectivity? If enabled, we'll check
|
||||
the connectivity at either first send to peer X or first ACK to
|
||||
peer X. */
|
||||
opal_btl_usnic_check_connectivity(module, endpoint);
|
||||
|
||||
/*
|
||||
* if total payload len fits in one MTU use small send, else large
|
||||
*/
|
||||
payload_len = *size + reserve;
|
||||
if (payload_len <= module->max_frag_payload) {
|
||||
frag = prepare_src_small(module, endpoint, convertor,
|
||||
order, reserve, size, flags);
|
||||
} else {
|
||||
frag = prepare_src_large(module, endpoint, convertor,
|
||||
order, reserve, size, flags);
|
||||
}
|
||||
|
||||
#if MSGDEBUG2
|
||||
opal_output(0, "prep_src: %s %s frag %p, size=%d+%u (was %u), conv=%p\n",
|
||||
module->fabric_info->fabric_attr->name,
|
||||
(reserve + *size) <= module->max_frag_payload?"small":"large",
|
||||
(void *)frag, (int)reserve, (unsigned)*size, (unsigned)osize,
|
||||
(void *)convertor);
|
||||
#if MSGDEBUG1
|
||||
{
|
||||
unsigned i;
|
||||
mca_btl_base_descriptor_t *desc = &frag->sf_base.uf_base;
|
||||
for (i=0; i<desc->USNIC_SEND_LOCAL_COUNT; ++i) {
|
||||
opal_output(0, " %d: ptr:%p len:%d\n", i,
|
||||
(void *)desc->USNIC_SEND_LOCAL[i].seg_addr.pval,
|
||||
desc->USNIC_SEND_LOCAL[i].seg_len);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
return &frag->sf_base.uf_base;
|
||||
}
|
||||
|
||||
/*
|
||||
* BTL 2.0 prepare_dst function (this function does not exist in BTL
|
||||
* 3.0).
|
||||
*/
|
||||
mca_btl_base_descriptor_t*
|
||||
opal_btl_usnic_prepare_dst(
|
||||
struct mca_btl_base_module_t* base_module,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags)
|
||||
{
|
||||
opal_btl_usnic_put_dest_frag_t *pfrag;
|
||||
opal_btl_usnic_module_t *module;
|
||||
void *data_ptr;
|
||||
|
||||
module = (opal_btl_usnic_module_t *)base_module;
|
||||
|
||||
/* allocate a fragment for this */
|
||||
pfrag = (opal_btl_usnic_put_dest_frag_t *)
|
||||
opal_btl_usnic_put_dest_frag_alloc(module);
|
||||
if (NULL == pfrag) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* find start of the data */
|
||||
opal_convertor_get_current_pointer(convertor, (void **) &data_ptr);
|
||||
|
||||
/* make a seg entry pointing at data_ptr */
|
||||
pfrag->uf_remote_seg[0].seg_addr.pval = data_ptr;
|
||||
pfrag->uf_remote_seg[0].seg_len = *size;
|
||||
|
||||
pfrag->uf_base.order = order;
|
||||
pfrag->uf_base.des_flags = flags;
|
||||
|
||||
#if MSGDEBUG2
|
||||
opal_output(0, "prep_dst size=%d, addr=%p, pfrag=%p\n", (int)*size,
|
||||
data_ptr, (void *)pfrag);
|
||||
#endif
|
||||
|
||||
return &pfrag->uf_base;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* BTL 2.0 version of module.btl_put.
|
||||
*
|
||||
* Emulate an RDMA put. We'll send the remote address
|
||||
* across to the other side so it will know where to put the data
|
||||
*/
|
||||
int opal_btl_usnic_put(
|
||||
struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *desc)
|
||||
{
|
||||
int rc;
|
||||
opal_btl_usnic_send_frag_t *frag;
|
||||
|
||||
frag = (opal_btl_usnic_send_frag_t *)desc;
|
||||
|
||||
opal_btl_usnic_compute_sf_size(frag);
|
||||
frag->sf_ack_bytes_left = frag->sf_size;
|
||||
|
||||
#if MSGDEBUG2
|
||||
opal_output(0, "usnic_put, frag=%p, size=%d\n", (void *)frag,
|
||||
(int)frag->sf_size);
|
||||
#if MSGDEBUG1
|
||||
{ unsigned i;
|
||||
for (i=0; i<desc->USNIC_PUT_LOCAL_COUNT; ++i) {
|
||||
opal_output(0, " %d: ptr:%p len:%d%s\n", i,
|
||||
desc->USNIC_PUT_LOCAL[i].seg_addr.pval,
|
||||
desc->USNIC_PUT_LOCAL[i].seg_len,
|
||||
(i==0)?" (put local)":"");
|
||||
}
|
||||
for (i=0; i<desc->USNIC_PUT_REMOTE_COUNT; ++i) {
|
||||
opal_output(0, " %d: ptr:%p len:%d%s\n", i,
|
||||
desc->USNIC_PUT_REMOTE[i].seg_addr.pval,
|
||||
desc->USNIC_PUT_REMOTE[i].seg_len,
|
||||
(i==0)?" (put remote)":"");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* RFXX copy out address - why does he not use our provided holder? */
|
||||
/* JMS What does this mean? ^^ */
|
||||
frag->sf_base.uf_remote_seg[0].seg_addr.pval =
|
||||
desc->USNIC_PUT_REMOTE->seg_addr.pval;
|
||||
|
||||
rc = opal_btl_usnic_finish_put_or_send((opal_btl_usnic_module_t *)btl,
|
||||
(opal_btl_usnic_endpoint_t *)endpoint,
|
||||
frag,
|
||||
/*tag=*/MCA_BTL_NO_ORDER);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------*/
|
||||
|
||||
#elif BTL_VERSION == 30
|
||||
|
||||
/*
|
||||
* BTL 3.0 prepare_src function.
|
||||
*
|
||||
* This function is only used for sending PML fragments (not putting
|
||||
* or getting fragments).
|
||||
*
|
||||
* Note the "user" data the PML wishes to communicate and return a
|
||||
* descriptor. We create a frag (which is also a descriptor by virtue
|
||||
* of its base class) and populate it with enough source information
|
||||
* to complete a future send.
|
||||
*
|
||||
* Recall that the usnic BTL's max_send_size is almost certainly
|
||||
* larger than the MTU (by default, max_send_size is either 25K or
|
||||
* 150K). Therefore, the PML may give us a fragment up to
|
||||
* max_send_size in this function. Hence, we make the decision here
|
||||
* as to whether it's a "small" fragment (i.e., size <= MTU, meaning
|
||||
* that it fits in a single datagram) or a "large" fragment (i.e.,
|
||||
* size > MTU, meaning that it must be chunked into multiple
|
||||
* datagrams).
|
||||
*
|
||||
* The convertor will be saved for deferred packing if the user buffer
|
||||
* is noncontiguous. Otherwise, it will be saved in one of the
|
||||
* descriptor's SGEs.
|
||||
*
|
||||
* NOTE that the *only* reason this routine is allowed to return a size smaller
|
||||
* than was requested is if the convertor cannot process the entire amount.
|
||||
*/
|
||||
struct mca_btl_base_descriptor_t *
|
||||
opal_btl_usnic_prepare_src(struct mca_btl_base_module_t *base_module,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct opal_convertor_t *convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t *size,
|
||||
uint32_t flags)
|
||||
{
|
||||
opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) base_module;
|
||||
opal_btl_usnic_send_frag_t *frag;
|
||||
uint32_t payload_len;
|
||||
#if MSGDEBUG2
|
||||
size_t osize = *size;
|
||||
#endif
|
||||
|
||||
/* Do we need to check the connectivity? If enabled, we'll check
|
||||
the connectivity at either first send to peer X or first ACK to
|
||||
peer X. */
|
||||
opal_btl_usnic_check_connectivity(module, endpoint);
|
||||
|
||||
/*
|
||||
* if total payload len fits in one MTU use small send, else large
|
||||
*/
|
||||
payload_len = *size + reserve;
|
||||
if (payload_len <= module->max_frag_payload) {
|
||||
frag = prepare_src_small(module, endpoint, convertor,
|
||||
order, reserve, size, flags);
|
||||
} else {
|
||||
frag = prepare_src_large(module, endpoint, convertor,
|
||||
order, reserve, size, flags);
|
||||
}
|
||||
|
||||
#if MSGDEBUG2
|
||||
opal_output(0, "prep_src: %s %s frag %p, size=%d+%u (was %u), conv=%p\n",
|
||||
module->fabric_info->fabric_attr->name,
|
||||
(reserve + *size) <= module->max_frag_payload?"small":"large",
|
||||
(void *)frag, (int)reserve, (unsigned)*size, (unsigned)osize,
|
||||
(void *)convertor);
|
||||
#if MSGDEBUG1
|
||||
{
|
||||
unsigned i;
|
||||
mca_btl_base_descriptor_t *desc = &frag->sf_base.uf_base;
|
||||
for (i=0; i<desc->USNIC_SEND_LOCAL_COUNT; ++i) {
|
||||
opal_output(0, " %d: ptr:%p len:%d\n", i,
|
||||
(void *)desc->USNIC_SEND_LOCAL[i].seg_addr.pval,
|
||||
desc->USNIC_SEND_LOCAL[i].seg_len);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
return &frag->sf_base.uf_base;
|
||||
}
|
||||
|
||||
/*
|
||||
* BTL 3.0 version of module.btl_put.
|
||||
*
|
||||
* Emulate an RDMA put. We'll send the remote address across to the
|
||||
* other side so it will know where to put the data.
|
||||
*
|
||||
* Note that this function is only ever called with contiguous
|
||||
* buffers, so a convertor is not necessary.
|
||||
*/
|
||||
int
|
||||
opal_btl_usnic_put(struct mca_btl_base_module_t *base_module,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address, uint64_t remote_address,
|
||||
struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle,
|
||||
size_t size, int flags, int order,
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata)
|
||||
{
|
||||
opal_btl_usnic_send_frag_t *sfrag;
|
||||
opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) base_module;
|
||||
|
||||
/* At least for the moment, continue to make a descriptor, like we
|
||||
used to in BTL 2.0 */
|
||||
if (size <= module->max_frag_payload) {
|
||||
/* Small send fragment -- the whole thing fits in one MTU
|
||||
(i.e., a single chunk) */
|
||||
opal_btl_usnic_small_send_frag_t *ssfrag;
|
||||
ssfrag = opal_btl_usnic_small_send_frag_alloc(module);
|
||||
if (OPAL_UNLIKELY(NULL == ssfrag)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
sfrag = &ssfrag->ssf_base;
|
||||
} else {
|
||||
/* Large send fragment -- need more than one MTU (i.e.,
|
||||
multiple chunks) */
|
||||
opal_btl_usnic_large_send_frag_t *lsfrag;
|
||||
lsfrag = opal_btl_usnic_large_send_frag_alloc(module);
|
||||
if (OPAL_UNLIKELY(NULL == lsfrag)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
lsfrag->lsf_pack_on_the_fly = true;
|
||||
|
||||
sfrag = &lsfrag->lsf_base;
|
||||
}
|
||||
|
||||
sfrag->sf_endpoint = endpoint;
|
||||
sfrag->sf_size = size;
|
||||
sfrag->sf_ack_bytes_left = size;
|
||||
|
||||
opal_btl_usnic_frag_t *frag;
|
||||
frag = &sfrag->sf_base;
|
||||
frag->uf_local_seg[0].seg_len = size;
|
||||
frag->uf_local_seg[0].seg_addr.pval = local_address;
|
||||
frag->uf_remote_seg[0].seg_len = size;
|
||||
frag->uf_remote_seg[0].seg_addr.pval =
|
||||
(void *)(uintptr_t) remote_address;
|
||||
|
||||
mca_btl_base_descriptor_t *desc;
|
||||
desc = &frag->uf_base;
|
||||
desc->des_segment_count = 1;
|
||||
desc->des_segments = &frag->uf_local_seg[0];
|
||||
/* This is really the wrong cbfunc type, but we'll cast it to
|
||||
the Right type before we use it. So it'll be ok. */
|
||||
desc->des_cbfunc = (mca_btl_base_completion_fn_t) cbfunc;
|
||||
desc->des_cbdata = cbdata;
|
||||
desc->des_context = cbcontext;
|
||||
desc->des_flags = flags;
|
||||
desc->order = order;
|
||||
|
||||
int rc;
|
||||
rc = opal_btl_usnic_finish_put_or_send(module,
|
||||
(opal_btl_usnic_endpoint_t *)endpoint,
|
||||
sfrag,
|
||||
/*tag=*/MCA_BTL_NO_ORDER);
|
||||
return rc;
|
||||
}
|
||||
|
||||
#endif /* BTL_VERSION */
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -40,20 +40,22 @@
|
||||
#endif
|
||||
# define USNIC_BTL_DEFAULT_VERSION(name) MCA_BTL_DEFAULT_VERSION(name)
|
||||
|
||||
# define USNIC_SEND_LOCAL des_local
|
||||
# define USNIC_SEND_LOCAL_COUNT des_local_count
|
||||
# define USNIC_SEND_REMOTE des_remote
|
||||
# define USNIC_SEND_REMOTE_COUNT des_remote_count
|
||||
# define USNIC_SEND_LOCAL des_segments
|
||||
# define USNIC_SEND_LOCAL_COUNT des_segment_count
|
||||
# define USNIC_SEND_REMOTE des_segments
|
||||
# define USNIC_SEND_REMOTE_COUNT des_segment_count
|
||||
|
||||
# define USNIC_RECV_LOCAL des_local
|
||||
# define USNIC_RECV_LOCAL_COUNT des_local_count
|
||||
# define USNIC_RECV_REMOTE des_remote
|
||||
# define USNIC_RECV_REMOTE_COUNT des_remote_count
|
||||
# define USNIC_RECV_LOCAL des_segments
|
||||
# define USNIC_RECV_LOCAL_COUNT des_segment_count
|
||||
# define USNIC_RECV_REMOTE des_segments
|
||||
# define USNIC_RECV_REMOTE_COUNT des_segment_count
|
||||
|
||||
# define USNIC_PUT_LOCAL des_local
|
||||
# define USNIC_PUT_LOCAL_COUNT des_local_count
|
||||
# define USNIC_PUT_REMOTE des_remote
|
||||
# define USNIC_PUT_REMOTE_COUNT des_remote_count
|
||||
# define USNIC_PUT_LOCAL des_segments
|
||||
# define USNIC_PUT_LOCAL_COUNT des_segment_count
|
||||
# define USNIC_PUT_REMOTE des_segments
|
||||
/* The put-remote count aliases the descriptor's des_segment_count
   field, matching the other USNIC_*_COUNT macros in this group. */
# define USNIC_PUT_REMOTE_COUNT des_segment_count
|
||||
|
||||
# define BTL_VERSION 30
|
||||
|
||||
/*
|
||||
* Performance critical; needs to be inline
|
||||
@ -134,6 +136,8 @@ usnic_compat_proc_name_compare(opal_process_name_t a,
|
||||
# define USNIC_PUT_REMOTE des_dst
|
||||
# define USNIC_PUT_REMOTE_COUNT des_dst_cnt
|
||||
|
||||
# define BTL_VERSION 20
|
||||
|
||||
# define USNIC_COMPAT_BASE_VERSION \
|
||||
MCA_BTL_BASE_VERSION_2_0_0, \
|
||||
.mca_type_name = "btl", \
|
||||
@ -207,4 +211,82 @@ void usnic_compat_modex_recv(int *rc,
|
||||
uint64_t usnic_compat_rte_hash_name(opal_process_name_t *pname);
|
||||
const char *usnic_compat_proc_name_print(opal_process_name_t *pname);
|
||||
|
||||
/************************************************************************/
|
||||
|
||||
/* BTL 2.0 vs 3.0 compatibility functions (specifically: some BTL API
|
||||
functions changed signatures between 2.0 and 3.0) */
|
||||
|
||||
struct mca_btl_base_module_t;
|
||||
struct mca_btl_base_endpoint_t;
|
||||
|
||||
/* BTL 2.0 (i.e., v1.7/v1.8, but listed separately because these are
|
||||
really BTL API issues) */
|
||||
|
||||
#if BTL_VERSION == 20
|
||||
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
|
||||
/* This function changed signature in BTL 3.0 */
|
||||
mca_btl_base_descriptor_t*
|
||||
opal_btl_usnic_prepare_src(
|
||||
struct mca_btl_base_module_t* base_module,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags);
|
||||
|
||||
/* This function no longer exists in BTL 3.0 */
|
||||
mca_btl_base_descriptor_t*
|
||||
opal_btl_usnic_prepare_dst(
|
||||
struct mca_btl_base_module_t* base_module,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags);
|
||||
|
||||
/* This function changed signature in BTL 3.0 */
|
||||
int
|
||||
opal_btl_usnic_put(
|
||||
struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *desc);
|
||||
|
||||
/************************************************************************/
|
||||
|
||||
/* BTL 3.0 (i.e., >=v1.9, but listed separately because these are
|
||||
really BTL API issues) */
|
||||
|
||||
#elif BTL_VERSION == 30
|
||||
|
||||
#include "opal/mca/btl/btl.h"
|
||||
|
||||
/* This function changed signature compared to BTL 2.0 */
|
||||
struct mca_btl_base_descriptor_t *
|
||||
opal_btl_usnic_prepare_src(struct mca_btl_base_module_t *base_module,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct opal_convertor_t *convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t *size,
|
||||
uint32_t flags);
|
||||
|
||||
/* This function changed signature compared to BTL 2.0 */
|
||||
int
|
||||
opal_btl_usnic_put(struct mca_btl_base_module_t *base_module,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address, uint64_t remote_address,
|
||||
struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle,
|
||||
size_t size, int flags, int order,
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata);
|
||||
|
||||
#endif /* BTL_VERSION */
|
||||
|
||||
#endif /* BTL_USNIC_COMPAT_H */
|
||||
|
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -122,11 +122,18 @@ recv_seg_constructor(
|
||||
mca_btl_usnic_component.transport_header_len);
|
||||
|
||||
/* initialize descriptor */
|
||||
seg->rs_desc.USNIC_RECV_LOCAL = &seg->rs_segment;
|
||||
seg->rs_desc.USNIC_RECV_LOCAL_COUNT = 1;
|
||||
/* JMS Initializing RECV_REMOTE for receive frags is unnecessary
|
||||
with BTL 3.0. The only reason to keep this here would be for
|
||||
compatibility with the BTL 2.0 usnic-v1.8 git branch (i.e.,
|
||||
it's harmless to do this assignment first, before the
|
||||
RECV_LOCAL assignments -- the compiler will likely compile out
|
||||
this dead code, anyway). */
|
||||
seg->rs_desc.USNIC_RECV_REMOTE = NULL;
|
||||
seg->rs_desc.USNIC_RECV_REMOTE_COUNT = 0;
|
||||
|
||||
seg->rs_desc.USNIC_RECV_LOCAL = &seg->rs_segment;
|
||||
seg->rs_desc.USNIC_RECV_LOCAL_COUNT = 1;
|
||||
|
||||
/*
|
||||
* This pointer is only correct for incoming segments of type
|
||||
* OPAL_BTL_USNIC_PAYLOAD_TYPE_FRAG, but that's the only time
|
||||
@ -144,12 +151,20 @@ send_frag_constructor(opal_btl_usnic_send_frag_t *frag)
|
||||
|
||||
/* Fill in source descriptor */
|
||||
desc = &frag->sf_base.uf_base;
|
||||
|
||||
/* JMS Initializing SEND_REMOTE for send frags is unnecessary
|
||||
with BTL 3.0. The only reason to keep this here would be for
|
||||
compatibility with the BTL 2.0 usnic-v1.8 git branch (i.e.,
|
||||
it's harmless to do this assignment first, before the
|
||||
SEND_LOCAL assignments -- the compiler will likely compile out
|
||||
this dead code, anyway). */
|
||||
desc->USNIC_SEND_REMOTE = frag->sf_base.uf_remote_seg;
|
||||
desc->USNIC_SEND_REMOTE_COUNT = 0;
|
||||
|
||||
desc->USNIC_SEND_LOCAL = frag->sf_base.uf_local_seg;
|
||||
frag->sf_base.uf_local_seg[0].seg_len = 0;
|
||||
frag->sf_base.uf_local_seg[1].seg_len = 0;
|
||||
desc->USNIC_SEND_LOCAL_COUNT = 2;
|
||||
desc->USNIC_SEND_REMOTE = frag->sf_base.uf_remote_seg;
|
||||
desc->USNIC_SEND_REMOTE_COUNT = 0;
|
||||
|
||||
desc->order = MCA_BTL_NO_ORDER;
|
||||
desc->des_flags = 0;
|
||||
|
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -85,9 +85,28 @@ usnic_seg_type_str(opal_btl_usnic_seg_type_t t)
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* usnic registration handle (passed over the network to peers as a
|
||||
* cookie).
|
||||
*
|
||||
* Currently, this struct is meaningless (but it must be defined /
|
||||
* exist) because we are emulating RDMA and do not have
|
||||
* btl_register_mem and btl_deregister_mem functions (and we set
|
||||
* module.btl_registration_handle_size to 0, not sizeof(struct
|
||||
* mca_btl_base_registration_handle_t)).
|
||||
*/
|
||||
struct mca_btl_base_registration_handle_t {
|
||||
/* Maybe we'll need fields like this */
|
||||
uint32_t lkey;
|
||||
uint32_t rkey;
|
||||
};
|
||||
|
||||
/*
|
||||
* usnic local registration
|
||||
*/
|
||||
typedef struct opal_btl_usnic_reg_t {
|
||||
mca_mpool_base_registration_t base;
|
||||
struct fid_mr *mr;
|
||||
struct fid_mr *ur_mr;
|
||||
} opal_btl_usnic_reg_t;
|
||||
|
||||
|
||||
@ -145,7 +164,7 @@ typedef struct {
|
||||
|
||||
/**
|
||||
* Descriptor for a common segment. This is exactly one packet and may
|
||||
* be send or receive
|
||||
* be sent or received.
|
||||
*/
|
||||
typedef struct opal_btl_usnic_segment_t {
|
||||
ompi_free_list_item_t us_list;
|
||||
@ -221,7 +240,7 @@ typedef struct opal_btl_usnic_frag_t {
|
||||
/* fragment descriptor type */
|
||||
opal_btl_usnic_frag_type_t uf_type;
|
||||
|
||||
/* utility segments */
|
||||
/* utility segments (just seg_addr/seg_len) */
|
||||
mca_btl_base_segment_t uf_local_seg[2];
|
||||
mca_btl_base_segment_t uf_remote_seg[1];
|
||||
|
||||
@ -568,6 +587,31 @@ opal_btl_usnic_ack_segment_return(
|
||||
OMPI_FREE_LIST_RETURN_MT(&(module->ack_segs), &(ack->ss_base.us_list));
|
||||
}
|
||||
|
||||
/* Compute and set the proper value for sfrag->sf_size. This must not be used
|
||||
* during usnic_alloc, since the PML might change the segment size after
|
||||
* usnic_alloc returns. */
|
||||
static inline void
|
||||
opal_btl_usnic_compute_sf_size(opal_btl_usnic_send_frag_t *sfrag)
|
||||
{
|
||||
opal_btl_usnic_frag_t *frag;
|
||||
|
||||
frag = &sfrag->sf_base;
|
||||
|
||||
/* JMS This can be a put or a send, and the buffers are different... */
|
||||
#if 0
|
||||
assert(frag->uf_base.USNIC_SEND_LOCAL_COUNT > 0);
|
||||
assert(frag->uf_base.USNIC_SEND_LOCAL_COUNT <= 2);
|
||||
|
||||
/* belt and suspenders: second len should be zero if only one SGE */
|
||||
assert(2 == frag->uf_base.USNIC_SEND_LOCAL_COUNT ||
|
||||
0 == frag->uf_local_seg[1].seg_len);
|
||||
#endif
|
||||
|
||||
sfrag->sf_size = 0;
|
||||
sfrag->sf_size += frag->uf_local_seg[0].seg_len;
|
||||
sfrag->sf_size += frag->uf_local_seg[1].seg_len;
|
||||
}
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
||||
|
@ -68,30 +68,6 @@ static void finalize_one_channel(opal_btl_usnic_module_t *module,
|
||||
struct opal_btl_usnic_channel_t *channel);
|
||||
|
||||
|
||||
/* Compute and set the proper value for sfrag->sf_size. This must not be used
|
||||
* during usnic_alloc, since the PML might change the segment size after
|
||||
* usnic_alloc returns. */
|
||||
static inline void compute_sf_size(opal_btl_usnic_send_frag_t *sfrag)
{
    opal_btl_usnic_frag_t *base = &sfrag->sf_base;
    int s;

    /* JMS This can be a put or a send, and the buffers are different... */
#if 0
    assert(base->uf_base.USNIC_SEND_LOCAL_COUNT > 0);
    assert(base->uf_base.USNIC_SEND_LOCAL_COUNT <= 2);

    /* belt and suspenders: second len should be zero if only one SGE */
    assert(2 == base->uf_base.USNIC_SEND_LOCAL_COUNT ||
           0 == base->uf_local_seg[1].seg_len);
#endif

    /* sf_size is the sum of the lengths of both local segments */
    sfrag->sf_size = 0;
    for (s = 0; s < 2; ++s) {
        sfrag->sf_size += base->uf_local_seg[s].seg_len;
    }
}
|
||||
|
||||
/*
|
||||
* Loop over all procs sent to us in add_procs and see if we want to
|
||||
* add a proc/endpoint for them.
|
||||
@ -644,98 +620,6 @@ static int usnic_free(struct mca_btl_base_module_t* btl,
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Notes from george:
|
||||
*
|
||||
* - BTL ALLOC: allocating control messages or eager frags if BTL
|
||||
does not have INPLACE flag. To be clear: max it will ever alloc
|
||||
is eager_limit. THEREFORE: eager_limit is the max that ALLOC
|
||||
must always be able to alloc.
|
||||
--> Contradiction in the btl.h documentation.
|
||||
*
|
||||
* - BTL PREPARE SRC: max_send_size frags go through here. Can return
|
||||
a smaller size than was asked for.
|
||||
*
|
||||
* - BTL PREPARE DEST: not used if you don't have PUT/GET
|
||||
*
|
||||
* - BTL SEND: will be used after ALLOC / PREPARE
|
||||
*/
|
||||
|
||||
/* Responsible for handling "small" frags (reserve + *size <= max_frag_payload)
|
||||
* in the same manner as btl_prepare_src. Must return a smaller amount than
|
||||
* requested if the given convertor cannot process the entire (*size).
|
||||
*/
|
||||
static inline
|
||||
opal_btl_usnic_send_frag_t *
|
||||
prepare_src_small(
|
||||
struct opal_btl_usnic_module_t* module,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags)
|
||||
{
|
||||
opal_btl_usnic_send_frag_t *frag;
|
||||
opal_btl_usnic_small_send_frag_t *sfrag;
|
||||
size_t payload_len;
|
||||
|
||||
payload_len = *size + reserve;
|
||||
assert(payload_len <= module->max_frag_payload); /* precondition */
|
||||
|
||||
sfrag = opal_btl_usnic_small_send_frag_alloc(module);
|
||||
if (OPAL_UNLIKELY(NULL == sfrag)) {
|
||||
return NULL;
|
||||
}
|
||||
frag = &sfrag->ssf_base;
|
||||
|
||||
/* In the case of a convertor, we will copy the data in now, since that is
|
||||
* the cheapest way to discover how much we can actually send (since we know
|
||||
* we will pack it anyway later). The alternative is to do all of the
|
||||
* following:
|
||||
* 1) clone_with_position(convertor) and see where the new position ends up
|
||||
* actually being (see opal_btl_usnic_convertor_pack_peek). Otherwise we
|
||||
* aren't fulfilling our contract w.r.t. (*size).
|
||||
* 2) Add a bunch of branches checking for different cases, both here and in
|
||||
* progress_sends
|
||||
* 3) If we choose to defer the packing, we must clone the convertor because
|
||||
* the PML owns it and might reuse it for another prepare_src call.
|
||||
*
|
||||
* Two convertor clones is likely to be at least as slow as just copying the
|
||||
* data and might consume a similar amount of memory. Plus we still have to
|
||||
* pack it later to send it.
|
||||
*
|
||||
* The reason we do not copy non-convertor buffer at this point is because
|
||||
* we might still use INLINE for the send, and in that case we do not want
|
||||
* to copy the data at all.
|
||||
*/
|
||||
if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) {
|
||||
/* put user data just after end of 1st seg (upper layer header) */
|
||||
assert(payload_len <= module->max_frag_payload);
|
||||
usnic_convertor_pack_simple(
|
||||
convertor,
|
||||
(IOVBASE_TYPE*)(intptr_t)(frag->sf_base.uf_local_seg[0].seg_addr.lval + reserve),
|
||||
*size,
|
||||
size);
|
||||
payload_len = reserve + *size;
|
||||
frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT = 1;
|
||||
/* PML will copy header into beginning of segment */
|
||||
frag->sf_base.uf_local_seg[0].seg_len = payload_len;
|
||||
} else {
|
||||
opal_convertor_get_current_pointer(convertor,
|
||||
&sfrag->ssf_base.sf_base.uf_local_seg[1].seg_addr.pval);
|
||||
frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT = 2;
|
||||
frag->sf_base.uf_local_seg[0].seg_len = reserve;
|
||||
frag->sf_base.uf_local_seg[1].seg_len = *size;
|
||||
}
|
||||
|
||||
frag->sf_base.uf_base.des_flags = flags;
|
||||
frag->sf_endpoint = endpoint;
|
||||
|
||||
return frag;
|
||||
}
|
||||
|
||||
/* Packs data from the given large send frag into single new segment and
|
||||
* returns a pointer to it. The packed data comes first from SG[0] (PML
|
||||
* header) and then second from either SG[1] (if seg_addr is non-NULL) or from
|
||||
@ -827,345 +711,6 @@ pack_chunk_seg_from_frag(
|
||||
return seg;
|
||||
}
|
||||
|
||||
static
|
||||
void *
|
||||
pack_chunk_seg_chain_with_reserve(
|
||||
struct opal_btl_usnic_module_t* module,
|
||||
opal_btl_usnic_large_send_frag_t *lfrag,
|
||||
size_t reserve_len,
|
||||
opal_convertor_t *convertor,
|
||||
size_t max_convertor_bytes,
|
||||
size_t *convertor_bytes_packed)
|
||||
{
|
||||
opal_btl_usnic_chunk_segment_t *seg;
|
||||
void *ret_ptr = NULL;
|
||||
int n_segs;
|
||||
uint8_t *copyptr;
|
||||
size_t copylen;
|
||||
size_t seg_space;
|
||||
size_t max_data;
|
||||
bool first_pass;
|
||||
|
||||
assert(NULL != lfrag);
|
||||
assert(NULL != convertor_bytes_packed);
|
||||
|
||||
n_segs = 0;
|
||||
*convertor_bytes_packed = 0;
|
||||
|
||||
first_pass = true;
|
||||
while (*convertor_bytes_packed < max_convertor_bytes ||
|
||||
first_pass) {
|
||||
seg = opal_btl_usnic_chunk_segment_alloc(module);
|
||||
if (OPAL_UNLIKELY(NULL == seg)) {
|
||||
BTL_ERROR(("chunk segment allocation error"));
|
||||
abort(); /* XXX */
|
||||
}
|
||||
++n_segs;
|
||||
|
||||
seg_space = module->max_chunk_payload;
|
||||
copyptr = seg->ss_base.us_payload.raw;
|
||||
|
||||
if (first_pass && reserve_len > 0) {
|
||||
/* logic could accommodate >max, but currently doesn't */
|
||||
assert(reserve_len <= module->max_chunk_payload);
|
||||
ret_ptr = copyptr;
|
||||
seg_space -= reserve_len;
|
||||
copyptr += reserve_len;
|
||||
}
|
||||
|
||||
/* now pack any convertor data */
|
||||
if (*convertor_bytes_packed < max_convertor_bytes && seg_space > 0) {
|
||||
copylen = max_convertor_bytes - *convertor_bytes_packed;
|
||||
if (copylen > seg_space) {
|
||||
copylen = seg_space;
|
||||
}
|
||||
usnic_convertor_pack_simple(convertor, copyptr, copylen, &max_data);
|
||||
seg_space -= max_data;
|
||||
*convertor_bytes_packed += max_data;
|
||||
|
||||
/* If unable to pack any of the remaining bytes, release the
|
||||
* most recently allocated segment and finish processing.
|
||||
*/
|
||||
if (seg_space == module->max_chunk_payload) {
|
||||
assert(max_data == 0); /* only way this can happen */
|
||||
opal_btl_usnic_chunk_segment_return(module, seg);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* bozo checks */
|
||||
assert(seg_space >= 0);
|
||||
assert(seg_space < module->max_chunk_payload);
|
||||
|
||||
/* append segment of data to chain to send */
|
||||
seg->ss_parent_frag = &lfrag->lsf_base;
|
||||
seg->ss_len = module->max_chunk_payload - seg_space;
|
||||
opal_list_append(&lfrag->lsf_seg_chain, &seg->ss_base.us_list.super);
|
||||
|
||||
#if MSGDEBUG1
|
||||
opal_output(0, "%s: appending seg=%p, frag=%p, payload=%zd\n",
|
||||
__func__, (void *)seg, (void *)lfrag,
|
||||
(module->max_chunk_payload - seg_space));
|
||||
#endif
|
||||
|
||||
first_pass = false;
|
||||
}
|
||||
|
||||
return ret_ptr;
|
||||
}
|
||||
|
||||
/* Responsible for handling "large" frags (reserve + *size > max_frag_payload)
|
||||
* in the same manner as btl_prepare_src. Must return a smaller amount than
|
||||
* requested if the given convertor cannot process the entire (*size).
|
||||
*/
|
||||
static
|
||||
opal_btl_usnic_send_frag_t *
|
||||
prepare_src_large(
|
||||
struct opal_btl_usnic_module_t* module,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags)
|
||||
{
|
||||
opal_btl_usnic_send_frag_t *frag;
|
||||
opal_btl_usnic_large_send_frag_t *lfrag;
|
||||
int rc;
|
||||
|
||||
/* Get holder for the msg */
|
||||
lfrag = opal_btl_usnic_large_send_frag_alloc(module);
|
||||
if (OPAL_UNLIKELY(NULL == lfrag)) {
|
||||
return NULL;
|
||||
}
|
||||
frag = &lfrag->lsf_base;
|
||||
|
||||
/* The header location goes in SG[0], payload in SG[1]. If we are using a
|
||||
* convertor then SG[1].seg_len is accurate but seg_addr is NULL. */
|
||||
frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT = 2;
|
||||
|
||||
/* stash header location, PML will write here */
|
||||
frag->sf_base.uf_local_seg[0].seg_addr.pval = &lfrag->lsf_ompi_header;
|
||||
frag->sf_base.uf_local_seg[0].seg_len = reserve;
|
||||
/* make sure upper header small enough */
|
||||
assert(reserve <= sizeof(lfrag->lsf_ompi_header));
|
||||
|
||||
if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) {
|
||||
/* threshold == -1 means always pack eagerly */
|
||||
if (mca_btl_usnic_component.pack_lazy_threshold >= 0 &&
|
||||
*size >= (size_t)mca_btl_usnic_component.pack_lazy_threshold) {
|
||||
MSGDEBUG1_OUT("packing frag %p on the fly", (void *)frag);
|
||||
lfrag->lsf_pack_on_the_fly = true;
|
||||
|
||||
/* tell the PML we will absorb as much as possible while still
|
||||
* respecting indivisible element boundaries in the convertor */
|
||||
*size = opal_btl_usnic_convertor_pack_peek(convertor, *size);
|
||||
|
||||
/* Clone the convertor b/c we (the BTL) don't own it and the PML
|
||||
* might mutate it after we return from this function. */
|
||||
rc = opal_convertor_clone(convertor, &frag->sf_convertor,
|
||||
/*copy_stack=*/true);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
BTL_ERROR(("unexpected convertor clone error"));
|
||||
abort(); /* XXX */
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* pack everything in the convertor into a chain of segments now,
|
||||
* leaving space for the PML header in the first segment */
|
||||
lfrag->lsf_base.sf_base.uf_local_seg[0].seg_addr.pval =
|
||||
pack_chunk_seg_chain_with_reserve(module, lfrag, reserve,
|
||||
convertor, *size, size);
|
||||
}
|
||||
|
||||
/* We set SG[1] to {NULL,bytes_packed} so that various calculations
|
||||
* by both PML and this BTL will be correct. For example, the PML adds
|
||||
* up the bytes in the descriptor segments to determine if an MPI-level
|
||||
* request is complete or not. */
|
||||
frag->sf_base.uf_local_seg[1].seg_addr.pval = NULL;
|
||||
frag->sf_base.uf_local_seg[1].seg_len = *size;
|
||||
} else {
|
||||
/* convertor not needed, just save the payload pointer in SG[1] */
|
||||
lfrag->lsf_pack_on_the_fly = true;
|
||||
opal_convertor_get_current_pointer(convertor,
|
||||
&frag->sf_base.uf_local_seg[1].seg_addr.pval);
|
||||
frag->sf_base.uf_local_seg[1].seg_len = *size;
|
||||
}
|
||||
|
||||
frag->sf_base.uf_base.des_flags = flags;
|
||||
frag->sf_endpoint = endpoint;
|
||||
|
||||
return frag;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Note the "user" data the PML wishes to communicate and return a descriptor
|
||||
* that can be used for send or put. We create a frag (which is also a
|
||||
* descriptor by virtue of its base class) and populate it with enough
|
||||
* source information to complete a future send/put.
|
||||
*
|
||||
* We will create a small send frag if the payload fits in one MTU, otherwise a large
|
||||
* send frag. The convertor will be saved for deferred packing if the user
|
||||
* buffer is noncontiguous. Otherwise it will be saved in one of the
|
||||
* descriptor's SGEs.
|
||||
*
|
||||
* NOTE that the *only* reason this routine is allowed to return a size smaller
|
||||
* than was requested is if the convertor cannot process the entire amount.
|
||||
*/
|
||||
static mca_btl_base_descriptor_t*
|
||||
usnic_prepare_src(
|
||||
struct mca_btl_base_module_t* base_module,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags)
|
||||
{
|
||||
opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) base_module;
|
||||
opal_btl_usnic_send_frag_t *frag;
|
||||
uint32_t payload_len;
|
||||
#if MSGDEBUG2
|
||||
size_t osize = *size;
|
||||
#endif
|
||||
|
||||
/* Do we need to check the connectivity? If enabled, we'll check
|
||||
the connectivity at either first send to peer X or first ACK to
|
||||
peer X. */
|
||||
opal_btl_usnic_check_connectivity(module, endpoint);
|
||||
|
||||
/*
|
||||
* if total payload len fits in one MTU use small send, else large
|
||||
*/
|
||||
payload_len = *size + reserve;
|
||||
if (payload_len <= module->max_frag_payload) {
|
||||
frag = prepare_src_small(module, endpoint, registration, convertor,
|
||||
order, reserve, size, flags);
|
||||
} else {
|
||||
frag = prepare_src_large(module, endpoint, registration, convertor,
|
||||
order, reserve, size, flags);
|
||||
}
|
||||
|
||||
#if MSGDEBUG2
|
||||
opal_output(0, "prep_src: %s %s frag %p, size=%d+%u (was %u), conv=%p\n",
|
||||
module->fabric_info->fabric_attr->name,
|
||||
(reserve + *size) <= module->max_frag_payload?"small":"large",
|
||||
(void *)frag, (int)reserve, (unsigned)*size, (unsigned)osize,
|
||||
(void *)convertor);
|
||||
#if MSGDEBUG1
|
||||
{
|
||||
unsigned i;
|
||||
mca_btl_base_descriptor_t *desc = &frag->sf_base.uf_base;
|
||||
for (i=0; i<desc->USNIC_SEND_LOCAL_COUNT; ++i) {
|
||||
opal_output(0, " %d: ptr:%p len:%d\n", i,
|
||||
(void *)desc->USNIC_SEND_LOCAL[i].seg_addr.pval,
|
||||
desc->USNIC_SEND_LOCAL[i].seg_len);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
return &frag->sf_base.uf_base;
|
||||
}
|
||||
|
||||
static mca_btl_base_descriptor_t*
|
||||
usnic_prepare_dst(
|
||||
struct mca_btl_base_module_t* base_module,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags)
|
||||
{
|
||||
opal_btl_usnic_put_dest_frag_t *pfrag;
|
||||
opal_btl_usnic_module_t *module;
|
||||
void *data_ptr;
|
||||
|
||||
module = (opal_btl_usnic_module_t *)base_module;
|
||||
|
||||
/* allocate a fragment for this */
|
||||
pfrag = (opal_btl_usnic_put_dest_frag_t *)
|
||||
opal_btl_usnic_put_dest_frag_alloc(module);
|
||||
if (NULL == pfrag) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* find start of the data */
|
||||
opal_convertor_get_current_pointer(convertor, (void **) &data_ptr);
|
||||
|
||||
/* make a seg entry pointing at data_ptr */
|
||||
pfrag->uf_remote_seg[0].seg_addr.pval = data_ptr;
|
||||
pfrag->uf_remote_seg[0].seg_len = *size;
|
||||
|
||||
pfrag->uf_base.order = order;
|
||||
pfrag->uf_base.des_flags = flags;
|
||||
|
||||
#if MSGDEBUG2
|
||||
opal_output(0, "prep_dst size=%d, addr=%p, pfrag=%p\n", (int)*size,
|
||||
data_ptr, (void *)pfrag);
|
||||
#endif
|
||||
|
||||
return &pfrag->uf_base;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Emulate an RDMA put. We'll send the remote address
|
||||
* across to the other side so it will know where to put the data
|
||||
*/
|
||||
static int
|
||||
usnic_put(
|
||||
struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *desc)
|
||||
{
|
||||
int rc;
|
||||
opal_btl_usnic_send_frag_t *frag;
|
||||
|
||||
frag = (opal_btl_usnic_send_frag_t *)desc;
|
||||
|
||||
compute_sf_size(frag);
|
||||
frag->sf_ack_bytes_left = frag->sf_size;
|
||||
|
||||
#if MSGDEBUG2
|
||||
opal_output(0, "usnic_put, frag=%p, size=%d\n", (void *)frag,
|
||||
(int)frag->sf_size);
|
||||
#if MSGDEBUG1
|
||||
{ unsigned i;
|
||||
for (i=0; i<desc->USNIC_PUT_LOCAL_COUNT; ++i) {
|
||||
opal_output(0, " %d: ptr:%p len:%d%s\n", i,
|
||||
desc->USNIC_PUT_LOCAL[i].seg_addr.pval,
|
||||
desc->USNIC_PUT_LOCAL[i].seg_len,
|
||||
(i==0)?" (put local)":"");
|
||||
}
|
||||
for (i=0; i<desc->USNIC_PUT_REMOTE_COUNT; ++i) {
|
||||
opal_output(0, " %d: ptr:%p len:%d%s\n", i,
|
||||
desc->USNIC_PUT_REMOTE[i].seg_addr.pval,
|
||||
desc->USNIC_PUT_REMOTE[i].seg_len,
|
||||
(i==0)?" (put remote)":"");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* RFXX copy out address - why does he not use our provided holder? */
|
||||
/* JMS What does this mean? ^^ */
|
||||
frag->sf_base.uf_remote_seg[0].seg_addr.pval =
|
||||
desc->USNIC_PUT_REMOTE->seg_addr.pval;
|
||||
|
||||
rc = opal_btl_usnic_finish_put_or_send((opal_btl_usnic_module_t *)btl,
|
||||
(opal_btl_usnic_endpoint_t *)endpoint,
|
||||
frag,
|
||||
/*tag=*/MCA_BTL_NO_ORDER);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int usnic_finalize(struct mca_btl_base_module_t* btl)
|
||||
{
|
||||
opal_btl_usnic_module_t* module = (opal_btl_usnic_module_t*)btl;
|
||||
@ -1569,7 +1114,7 @@ usnic_send(
|
||||
assert(frag->sf_endpoint == endpoint);
|
||||
frag->sf_base.uf_remote_seg[0].seg_addr.pval = NULL; /* not a PUT */
|
||||
|
||||
compute_sf_size(frag);
|
||||
opal_btl_usnic_compute_sf_size(frag);
|
||||
frag->sf_ack_bytes_left = frag->sf_size;
|
||||
|
||||
#if MSGDEBUG2
|
||||
@ -1698,13 +1243,13 @@ static int usnic_sendi(struct mca_btl_base_module_t* btl,
|
||||
* RDMA Memory Pool (de)register callbacks
|
||||
*/
|
||||
static int usnic_reg_mr(void* reg_data, void* base, size_t size,
|
||||
mca_mpool_base_registration_t* reg)
|
||||
mca_mpool_base_registration_t* reg)
|
||||
{
|
||||
opal_btl_usnic_module_t* mod = (opal_btl_usnic_module_t*)reg_data;
|
||||
opal_btl_usnic_reg_t* ud_reg = (opal_btl_usnic_reg_t*)reg;
|
||||
opal_btl_usnic_reg_t* ur = (opal_btl_usnic_reg_t*)reg;
|
||||
int rc;
|
||||
|
||||
rc = fi_mr_reg(mod->domain, base, size, 0, 0, 0, 0, &ud_reg->mr, NULL);
|
||||
rc = fi_mr_reg(mod->domain, base, size, 0, 0, 0, 0, &ur->ur_mr, NULL);
|
||||
if (0 != rc) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
@ -1712,21 +1257,20 @@ static int usnic_reg_mr(void* reg_data, void* base, size_t size,
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int usnic_dereg_mr(void* reg_data,
|
||||
mca_mpool_base_registration_t* reg)
|
||||
mca_mpool_base_registration_t* reg)
|
||||
{
|
||||
opal_btl_usnic_reg_t* ud_reg = (opal_btl_usnic_reg_t*)reg;
|
||||
opal_btl_usnic_reg_t* ur = (opal_btl_usnic_reg_t*)reg;
|
||||
|
||||
if (ud_reg->mr != NULL) {
|
||||
if (0 != fi_close(&ud_reg->mr->fid)) {
|
||||
if (ur->ur_mr != NULL) {
|
||||
if (0 != fi_close(&ur->ur_mr->fid)) {
|
||||
opal_output(0, "%s: error unpinning USD memory mr=%p: %s\n",
|
||||
__func__, (void*) ud_reg->mr, strerror(errno));
|
||||
__func__, (void*) ur->ur_mr, strerror(errno));
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
ud_reg->mr = NULL;
|
||||
ur->ur_mr = NULL;
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
@ -2243,7 +1787,13 @@ static void init_pml_values(opal_btl_usnic_module_t *module)
|
||||
|
||||
/* Since we emulate PUT, max_send_size can be same as
|
||||
eager_limit */
|
||||
module->super.btl_max_send_size = module->super.btl_eager_limit;
|
||||
module->super.btl_max_send_size =
|
||||
module->super.btl_eager_limit;
|
||||
|
||||
#if BTL_VERSION == 30
|
||||
module->super.btl_put_limit =
|
||||
module->super.btl_eager_limit;
|
||||
#endif
|
||||
}
|
||||
|
||||
static void init_senders(opal_btl_usnic_module_t *module)
|
||||
@ -2626,22 +2176,45 @@ static int usnic_ft_event(int state)
|
||||
opal_btl_usnic_module_t opal_btl_usnic_module_template = {
|
||||
.super = {
|
||||
.btl_component = &mca_btl_usnic_component.super,
|
||||
|
||||
#if BTL_VERSION == 20
|
||||
.btl_prepare_dst = opal_btl_usnic_prepare_dst,
|
||||
.btl_seg_size = sizeof(mca_btl_base_segment_t),
|
||||
#elif BTL_VERSION == 30
|
||||
.btl_atomic_flags = 0,
|
||||
.btl_registration_handle_size = 0,
|
||||
|
||||
.btl_get_limit = 0,
|
||||
.btl_get_alignment = 0,
|
||||
.btl_put_limit = 0,
|
||||
.btl_put_alignment = 0,
|
||||
|
||||
.btl_atomic_op = NULL,
|
||||
.btl_atomic_fop = NULL,
|
||||
.btl_atomic_cswap = NULL,
|
||||
#endif
|
||||
|
||||
.btl_exclusivity = MCA_BTL_EXCLUSIVITY_DEFAULT,
|
||||
.btl_flags =
|
||||
MCA_BTL_FLAGS_SEND |
|
||||
MCA_BTL_FLAGS_PUT |
|
||||
MCA_BTL_FLAGS_SEND_INPLACE,
|
||||
.btl_seg_size = sizeof(mca_btl_base_segment_t),
|
||||
|
||||
.btl_add_procs = usnic_add_procs,
|
||||
.btl_del_procs = usnic_del_procs,
|
||||
.btl_register = NULL,
|
||||
.btl_finalize = usnic_finalize,
|
||||
|
||||
.btl_alloc = usnic_alloc,
|
||||
.btl_free = usnic_free,
|
||||
.btl_prepare_src = usnic_prepare_src,
|
||||
.btl_prepare_dst = usnic_prepare_dst,
|
||||
.btl_prepare_src = opal_btl_usnic_prepare_src,
|
||||
.btl_send = usnic_send,
|
||||
.btl_put = usnic_put,
|
||||
.btl_sendi = NULL,
|
||||
.btl_put = opal_btl_usnic_put,
|
||||
.btl_get = NULL,
|
||||
.btl_dump = mca_btl_base_dump,
|
||||
|
||||
.btl_mpool = NULL,
|
||||
.btl_register_error = usnic_register_pml_err_cb,
|
||||
.btl_ft_event = usnic_ft_event
|
||||
}
|
||||
|
@ -277,6 +277,10 @@ opal_btl_usnic_recv_fast(opal_btl_usnic_module_t *module,
|
||||
opal_output(0, "fast recv %d bytes:\n", bseg->us_btl_header->payload_len + sizeof(opal_btl_usnic_btl_header_t));
|
||||
opal_btl_usnic_dump_hex(bseg->us_btl_header, bseg->us_btl_header->payload_len + sizeof(opal_btl_usnic_btl_header_t));
|
||||
#endif
|
||||
/* If this is a short incoming message (i.e., the message is
|
||||
wholly contained in this one message -- it is not chunked
|
||||
across multiple messages), and it's not a PUT from the sender,
|
||||
then just handle it here. */
|
||||
if (endpoint != NULL && !endpoint->endpoint_exiting &&
|
||||
(OPAL_BTL_USNIC_PAYLOAD_TYPE_FRAG ==
|
||||
bseg->us_btl_header->payload_type) &&
|
||||
@ -311,8 +315,10 @@ opal_btl_usnic_dump_hex(bseg->us_btl_header, bseg->us_btl_header->payload_len +
|
||||
|
||||
drop:
|
||||
channel->chan_deferred_recv = seg;
|
||||
}
|
||||
|
||||
} else {
|
||||
/* Otherwise, handle all the other cases the "normal" way */
|
||||
else {
|
||||
opal_btl_usnic_recv_call(module, seg, channel);
|
||||
}
|
||||
}
|
||||
@ -382,6 +388,10 @@ opal_btl_usnic_recv(opal_btl_usnic_module_t *module,
|
||||
endpoint = lookup_sender(module, bseg);
|
||||
seg->rs_endpoint = endpoint;
|
||||
|
||||
/* If this is a short incoming message (i.e., the message is
|
||||
wholly contained in this one message -- it is not chunked
|
||||
across multiple messages), and it's not a PUT from the sender,
|
||||
then just handle it here. */
|
||||
if (endpoint != NULL && !endpoint->endpoint_exiting &&
|
||||
(OPAL_BTL_USNIC_PAYLOAD_TYPE_FRAG ==
|
||||
bseg->us_btl_header->payload_type) &&
|
||||
@ -408,7 +418,10 @@ opal_btl_usnic_recv(opal_btl_usnic_module_t *module,
|
||||
reg->cbfunc(&module->super, bseg->us_btl_header->tag,
|
||||
&seg->rs_desc, reg->cbdata);
|
||||
|
||||
} else {
|
||||
}
|
||||
|
||||
/* Otherwise, handle all the other cases the "normal" way */
|
||||
else {
|
||||
opal_btl_usnic_recv_call(module, seg, channel);
|
||||
}
|
||||
}
|
||||
|
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -106,7 +106,8 @@ opal_btl_usnic_chunk_send_complete(opal_btl_usnic_module_t *module,
|
||||
* This routine lives in this file to help prevent automatic inlining by the
|
||||
* compiler.
|
||||
*
|
||||
* The "tag" only applies to sends. */
|
||||
* The "tag" only applies to sends.
|
||||
*/
|
||||
int
|
||||
opal_btl_usnic_finish_put_or_send(
|
||||
opal_btl_usnic_module_t *module,
|
||||
|
@ -12,8 +12,8 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
|
||||
* Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2014 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -98,7 +98,7 @@ enum {
|
||||
* Shared Memory (VADER) BTL module.
|
||||
*/
|
||||
struct mca_btl_vader_component_t {
|
||||
mca_btl_base_component_2_0_0_t super; /**< base BTL component */
|
||||
mca_btl_base_component_3_0_0_t super; /**< base BTL component */
|
||||
int vader_free_list_num; /**< initial size of free lists */
|
||||
int vader_free_list_max; /**< maximum size of free lists */
|
||||
int vader_free_list_inc; /**< number of elements to alloc when growing free lists */
|
||||
@ -115,7 +115,6 @@ struct mca_btl_vader_component_t {
|
||||
ompi_free_list_t vader_frags_eager; /**< free list of vader send frags */
|
||||
ompi_free_list_t vader_frags_max_send; /**< free list of vader max send frags (large fragments) */
|
||||
ompi_free_list_t vader_frags_user; /**< free list of small inline frags */
|
||||
ompi_free_list_t vader_frags_rdma; /**< free list of vader put/get frags (single-copy) */
|
||||
|
||||
unsigned int fbox_threshold; /**< number of sends required before we setup a send fast box for a peer */
|
||||
unsigned int fbox_max; /**< maximum number of send fast boxes to allocate */
|
||||
@ -208,21 +207,24 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl,
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
#if OPAL_BTL_VADER_HAVE_XPMEM
|
||||
int mca_btl_vader_put_xpmem (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *des);
|
||||
int mca_btl_vader_put_xpmem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
#endif
|
||||
|
||||
#if OPAL_BTL_VADER_HAVE_CMA
|
||||
int mca_btl_vader_put_cma (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *des);
|
||||
int mca_btl_vader_put_cma (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
#endif
|
||||
|
||||
#if OPAL_BTL_VADER_HAVE_KNEM
|
||||
int mca_btl_vader_put_knem (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *des);
|
||||
int mca_btl_vader_put_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
#endif
|
||||
|
||||
/**
|
||||
@ -233,21 +235,24 @@ int mca_btl_vader_put_knem (struct mca_btl_base_module_t *btl,
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
#if OPAL_BTL_VADER_HAVE_XPMEM
|
||||
int mca_btl_vader_get_xpmem (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *des);
|
||||
int mca_btl_vader_get_xpmem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
#endif
|
||||
|
||||
#if OPAL_BTL_VADER_HAVE_CMA
|
||||
int mca_btl_vader_get_cma (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *des);
|
||||
int mca_btl_vader_get_cma (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
#endif
|
||||
|
||||
#if OPAL_BTL_VADER_HAVE_KNEM
|
||||
int mca_btl_vader_get_knem (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *des);
|
||||
int mca_btl_vader_get_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
#endif
|
||||
|
||||
/**
|
||||
@ -260,6 +265,7 @@ mca_btl_base_descriptor_t* mca_btl_vader_alloc (struct mca_btl_base_module_t* bt
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
uint8_t order, size_t size, uint32_t flags);
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
||||
|
@ -12,7 +12,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
|
||||
* Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2014 Los Alamos National Security, LLC.
|
||||
* Copyright (c) 2010-2015 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
@ -227,12 +227,12 @@ static int mca_btl_vader_component_register (void)
|
||||
mca_btl_vader.super.btl_eager_limit = 32 * 1024;
|
||||
mca_btl_vader.super.btl_rndv_eager_limit = mca_btl_vader.super.btl_eager_limit;
|
||||
mca_btl_vader.super.btl_max_send_size = mca_btl_vader.super.btl_eager_limit;
|
||||
mca_btl_vader.super.btl_min_rdma_pipeline_size = mca_btl_vader.super.btl_eager_limit;
|
||||
mca_btl_vader.super.btl_min_rdma_pipeline_size = INT_MAX;
|
||||
} else {
|
||||
mca_btl_vader.super.btl_eager_limit = 4 * 1024;
|
||||
mca_btl_vader.super.btl_rndv_eager_limit = 32 * 1024;
|
||||
mca_btl_vader.super.btl_max_send_size = 32 * 1024;
|
||||
mca_btl_vader.super.btl_min_rdma_pipeline_size = 32 * 1024;
|
||||
mca_btl_vader.super.btl_min_rdma_pipeline_size = INT_MAX;
|
||||
}
|
||||
|
||||
mca_btl_vader.super.btl_rdma_pipeline_send_length = mca_btl_vader.super.btl_eager_limit;
|
||||
@ -251,7 +251,6 @@ static int mca_btl_vader_component_register (void)
|
||||
mca_btl_vader.super.btl_bandwidth = 10000; /* Mbs */
|
||||
}
|
||||
|
||||
mca_btl_vader.super.btl_seg_size = sizeof (mca_btl_vader_segment_t);
|
||||
mca_btl_vader.super.btl_latency = 1; /* Microsecs */
|
||||
|
||||
/* Call the BTL base to register its MCA params */
|
||||
@ -272,7 +271,6 @@ static int mca_btl_vader_component_open(void)
|
||||
OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_eager, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_user, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_max_send, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_rdma, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&mca_btl_vader_component.lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&mca_btl_vader_component.pending_endpoints, opal_list_t);
|
||||
OBJ_CONSTRUCT(&mca_btl_vader_component.pending_fragments, opal_list_t);
|
||||
@ -293,7 +291,6 @@ static int mca_btl_vader_component_close(void)
|
||||
OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_eager);
|
||||
OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_user);
|
||||
OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_max_send);
|
||||
OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_rdma);
|
||||
OBJ_DESTRUCT(&mca_btl_vader_component.lock);
|
||||
OBJ_DESTRUCT(&mca_btl_vader_component.pending_endpoints);
|
||||
OBJ_DESTRUCT(&mca_btl_vader_component.pending_fragments);
|
||||
@ -349,12 +346,11 @@ static void mca_btl_vader_select_next_single_copy_mechanism (void)
|
||||
static void mca_btl_vader_check_single_copy (void)
|
||||
{
|
||||
int initial_mechanism = mca_btl_vader_component.single_copy_mechanism;
|
||||
int rc;
|
||||
|
||||
#if OPAL_BTL_VADER_HAVE_XPMEM
|
||||
if (MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism) {
|
||||
/* try to create an xpmem segment for the entire address space */
|
||||
rc = mca_btl_vader_xpmem_init ();
|
||||
int rc = mca_btl_vader_xpmem_init ();
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
if (MCA_BTL_VADER_XPMEM == initial_mechanism) {
|
||||
opal_show_help("help-btl-vader.txt", "xpmem-make-failed",
|
||||
@ -414,7 +410,7 @@ static void mca_btl_vader_check_single_copy (void)
|
||||
#if OPAL_BTL_VADER_HAVE_KNEM
|
||||
if (MCA_BTL_VADER_KNEM == mca_btl_vader_component.single_copy_mechanism) {
|
||||
/* mca_btl_vader_knem_init will set the appropriate get/put functions */
|
||||
rc = mca_btl_vader_knem_init ();
|
||||
int rc = mca_btl_vader_knem_init ();
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
if (MCA_BTL_VADER_KNEM == initial_mechanism) {
|
||||
opal_show_help("help-btl-vader.txt", "knem requested but not available",
|
||||
@ -559,7 +555,7 @@ failed:
|
||||
void mca_btl_vader_poll_handle_frag (mca_btl_vader_hdr_t *hdr, struct mca_btl_base_endpoint_t *endpoint)
|
||||
{
|
||||
mca_btl_base_segment_t segments[2];
|
||||
mca_btl_base_descriptor_t frag = {.des_local = segments, .des_local_count = 1};
|
||||
mca_btl_base_descriptor_t frag = {.des_segments = segments, .des_segment_count = 1};
|
||||
const mca_btl_active_message_callback_t *reg;
|
||||
|
||||
if (hdr->flags & MCA_BTL_VADER_FLAG_COMPLETE) {
|
||||
@ -579,7 +575,7 @@ void mca_btl_vader_poll_handle_frag (mca_btl_vader_hdr_t *hdr, struct mca_btl_ba
|
||||
&segments[1].seg_addr.pval);
|
||||
|
||||
segments[1].seg_len = hdr->sc_iov.iov_len;
|
||||
frag.des_local_count = 2;
|
||||
frag.des_segment_count = 2;
|
||||
|
||||
/* recv upcall */
|
||||
reg->cbfunc(&mca_btl_vader.super, hdr->tag, &frag, reg->cbdata);
|
||||
|
@ -204,7 +204,7 @@ static inline bool mca_btl_vader_check_fboxes (void)
|
||||
/* the 0xff tag indicates we should skip the rest of the buffer */
|
||||
if (OPAL_LIKELY((0xfe & hdr.data.tag) != 0xfe)) {
|
||||
mca_btl_base_segment_t segment;
|
||||
mca_btl_base_descriptor_t desc = {.des_local = &segment, .des_local_count = 1};
|
||||
mca_btl_base_descriptor_t desc = {.des_segments = &segment, .des_segment_count = 1};
|
||||
const mca_btl_active_message_callback_t *reg =
|
||||
mca_btl_base_active_message_trigger + hdr.data.tag;
|
||||
|
||||
|
@ -31,11 +31,11 @@ static inline void mca_btl_vader_frag_constructor (mca_btl_vader_frag_t *frag)
|
||||
if(frag->hdr != NULL) {
|
||||
frag->hdr->frag = frag;
|
||||
frag->hdr->flags = 0;
|
||||
frag->segments[0].base.seg_addr.pval = (char *)(frag->hdr + 1);
|
||||
frag->segments[0].seg_addr.pval = (char *)(frag->hdr + 1);
|
||||
}
|
||||
|
||||
frag->base.des_local = &frag->segments->base;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_segments = frag->segments;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->fbox = NULL;
|
||||
}
|
||||
|
||||
@ -65,8 +65,6 @@ void mca_btl_vader_frag_init (ompi_free_list_item_t *item, void *ctx)
|
||||
frag->my_list = &mca_btl_vader_component.vader_frags_eager;
|
||||
} else if (mca_btl_vader.super.btl_max_send_size == data_size) {
|
||||
frag->my_list = &mca_btl_vader_component.vader_frags_max_send;
|
||||
} else {
|
||||
frag->my_list = &mca_btl_vader_component.vader_frags_rdma;
|
||||
}
|
||||
|
||||
if (data_size) {
|
||||
|
@ -12,7 +12,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -57,15 +57,6 @@ struct mca_btl_vader_hdr_t {
|
||||
};
|
||||
typedef struct mca_btl_vader_hdr_t mca_btl_vader_hdr_t;
|
||||
|
||||
struct mca_btl_vader_segment_t {
|
||||
mca_btl_base_segment_t base;
|
||||
#if OPAL_BTL_VADER_HAVE_KNEM
|
||||
uint64_t cookie;
|
||||
intptr_t registered_base;
|
||||
#endif
|
||||
};
|
||||
typedef struct mca_btl_vader_segment_t mca_btl_vader_segment_t;
|
||||
|
||||
/**
|
||||
* shared memory send fragment derived type.
|
||||
*/
|
||||
@ -73,7 +64,7 @@ struct mca_btl_vader_frag_t {
|
||||
/** base object */
|
||||
mca_btl_base_descriptor_t base;
|
||||
/** storage for segment data (max 2) */
|
||||
mca_btl_vader_segment_t segments[2];
|
||||
mca_btl_base_segment_t segments[2];
|
||||
/** endpoint this fragment is active on */
|
||||
struct mca_btl_base_endpoint_t *endpoint;
|
||||
/** fast box in use (or NULL) */
|
||||
@ -82,9 +73,6 @@ struct mca_btl_vader_frag_t {
|
||||
mca_btl_vader_hdr_t *hdr;
|
||||
/** free list this fragment was allocated within */
|
||||
ompi_free_list_t *my_list;
|
||||
#if OPAL_BTL_VADER_HAVE_KNEM
|
||||
uint64_t cookie;
|
||||
#endif
|
||||
};
|
||||
|
||||
typedef struct mca_btl_vader_frag_t mca_btl_vader_frag_t;
|
||||
@ -108,37 +96,16 @@ static inline int mca_btl_vader_frag_alloc (mca_btl_vader_frag_t **frag, ompi_fr
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Take one fragment from the given free list and bind it to an endpoint.
 *
 * @param frag     (OUT) receives the fragment, or NULL if the list
 *                       yielded no item
 * @param list     (IN)  free list to allocate from
 * @param endpoint (IN)  endpoint stored in the fragment
 *
 * @return always OPAL_SUCCESS; callers have to test *frag for NULL to
 *         detect that no fragment was available.
 */
static inline int mca_btl_vader_frag_alloc_rdma (mca_btl_vader_frag_t **frag, ompi_free_list_t *list,
                                                 struct mca_btl_base_endpoint_t *endpoint)
{
    ompi_free_list_item_t *entry;
    mca_btl_vader_frag_t *fresh;

    OMPI_FREE_LIST_GET_MT(list, entry);

    fresh = (mca_btl_vader_frag_t *) entry;
    if (OPAL_LIKELY(NULL != entry)) {
        fresh->endpoint = endpoint;
    }
    *frag = fresh;

    return OPAL_SUCCESS;
}
|
||||
|
||||
static inline void mca_btl_vader_frag_return (mca_btl_vader_frag_t *frag)
|
||||
{
|
||||
if (frag->hdr) {
|
||||
frag->hdr->flags = 0;
|
||||
}
|
||||
|
||||
frag->segments[0].base.seg_addr.pval = (char *)(frag->hdr + 1);
|
||||
frag->base.des_local_count = 1;
|
||||
frag->segments[0].seg_addr.pval = (char *)(frag->hdr + 1);
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->fbox = NULL;
|
||||
|
||||
#if OPAL_BTL_VADER_HAVE_KNEM
|
||||
if (frag->cookie) {
|
||||
/* NTH: explicitly ignore the return code. Don't care about this cookie anymore anyway. */
|
||||
(void) ioctl(mca_btl_vader.knem_fd, KNEM_CMD_DESTROY_REGION, &frag->cookie);
|
||||
frag->cookie = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
OMPI_FREE_LIST_RETURN_MT(frag->my_list, (ompi_free_list_item_t *)frag);
|
||||
}
|
||||
|
||||
@ -153,9 +120,6 @@ OBJ_CLASS_DECLARATION(mca_btl_vader_frag_t);
|
||||
#define MCA_BTL_VADER_FRAG_ALLOC_USER(frag, endpoint) \
|
||||
mca_btl_vader_frag_alloc (&(frag), &mca_btl_vader_component.vader_frags_user, endpoint)
|
||||
|
||||
#define MCA_BTL_VADER_FRAG_ALLOC_RDMA(frag, endpoint) \
|
||||
mca_btl_vader_frag_alloc_rdma (&(frag), &mca_btl_vader_component.vader_frags_rdma, endpoint)
|
||||
|
||||
#define MCA_BTL_VADER_FRAG_RETURN(frag) mca_btl_vader_frag_return(frag)
|
||||
|
||||
|
||||
|
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Загрузка…
x
Ссылка в новой задаче
Block a user