1
1

Merge pull request #275 from hjelmn/btlmod

Updated the btl interface. Please update your components.
Этот коммит содержится в:
Nathan Hjelm 2014-11-19 15:01:40 -07:00
родитель 6a19bf85dd 5a0a48c3c4
Коммит ccaecf0fd6
129 изменённых файлов: 6476 добавлений и 5681 удалений

Просмотреть файл

@ -152,7 +152,7 @@ AC_DEFUN([OPAL_CHECK_OPENFABRICS],[
# If we have the openib stuff available, find out what we've got
AS_IF([test "$ompi_check_openib_happy" = "yes"],
[AC_CHECK_DECLS([IBV_EVENT_CLIENT_REREGISTER, IBV_ACCESS_SO, IBV_TRANSPORT_USNIC, IBV_TRANSPORT_USNIC_UDP, IBV_NODE_USNIC], [], [],
[AC_CHECK_DECLS([IBV_EVENT_CLIENT_REREGISTER, IBV_ACCESS_SO, IBV_TRANSPORT_USNIC, IBV_TRANSPORT_USNIC_UDP, IBV_NODE_USNIC, IBV_ATOMIC_HCA], [], [],
[#include <infiniband/verbs.h>])
AC_CHECK_FUNCS([ibv_get_device_list ibv_resize_cq])

Просмотреть файл

@ -47,6 +47,10 @@ static ompi_errcode_intern_t ompi_err_request;
static ompi_errcode_intern_t ompi_err_buffer;
static ompi_errcode_intern_t ompi_err_rma_sync;
static ompi_errcode_intern_t ompi_err_rma_shared;
static ompi_errcode_intern_t ompi_err_rma_attach;
static ompi_errcode_intern_t ompi_err_rma_range;
static ompi_errcode_intern_t ompi_err_rma_conflict;
static ompi_errcode_intern_t ompi_err_win;
static void ompi_errcode_intern_construct(ompi_errcode_intern_t* errcode);
static void ompi_errcode_intern_destruct(ompi_errcode_intern_t* errcode);
@ -210,6 +214,38 @@ int ompi_errcode_intern_init (void)
opal_pointer_array_set_item(&ompi_errcodes_intern, ompi_err_rma_shared.index,
&ompi_err_rma_shared);
OBJ_CONSTRUCT(&ompi_err_rma_attach, ompi_errcode_intern_t);
ompi_err_rma_attach.code = OMPI_ERR_RMA_ATTACH;
ompi_err_rma_attach.mpi_code = MPI_ERR_RMA_ATTACH;
ompi_err_rma_attach.index = pos++;
strncpy(ompi_err_rma_attach.errstring, "OMPI_ERR_RMA_ATTACH", OMPI_MAX_ERROR_STRING);
opal_pointer_array_set_item(&ompi_errcodes_intern, ompi_err_rma_attach.index,
&ompi_err_rma_attach);
OBJ_CONSTRUCT(&ompi_err_rma_range, ompi_errcode_intern_t);
ompi_err_rma_range.code = OMPI_ERR_RMA_RANGE;
ompi_err_rma_range.mpi_code = MPI_ERR_RMA_RANGE;
ompi_err_rma_range.index = pos++;
strncpy(ompi_err_rma_range.errstring, "OMPI_ERR_RMA_RANGE", OMPI_MAX_ERROR_STRING);
opal_pointer_array_set_item(&ompi_errcodes_intern, ompi_err_rma_range.index,
&ompi_err_rma_range);
OBJ_CONSTRUCT(&ompi_err_rma_conflict, ompi_errcode_intern_t);
ompi_err_rma_conflict.code = OMPI_ERR_RMA_CONFLICT;
ompi_err_rma_conflict.mpi_code = MPI_ERR_RMA_CONFLICT;
ompi_err_rma_conflict.index = pos++;
strncpy(ompi_err_rma_conflict.errstring, "OMPI_ERR_RMA_CONFLICT", OMPI_MAX_ERROR_STRING);
opal_pointer_array_set_item(&ompi_errcodes_intern, ompi_err_rma_conflict.index,
&ompi_err_rma_conflict);
OBJ_CONSTRUCT(&ompi_err_win, ompi_errcode_intern_t);
ompi_err_win.code = OMPI_ERR_WIN;
ompi_err_win.mpi_code = MPI_ERR_WIN;
ompi_err_win.index = pos++;
strncpy(ompi_err_win.errstring, "OMPI_ERR_WIN", OMPI_MAX_ERROR_STRING);
opal_pointer_array_set_item(&ompi_errcodes_intern, ompi_err_win.index,
&ompi_err_win);
ompi_errcode_intern_lastused=pos;
return OMPI_SUCCESS;
}
@ -235,6 +271,10 @@ int ompi_errcode_intern_finalize(void)
OBJ_DESTRUCT(&ompi_err_request);
OBJ_DESTRUCT(&ompi_err_rma_sync);
OBJ_DESTRUCT(&ompi_err_rma_shared);
OBJ_DESTRUCT(&ompi_err_rma_attach);
OBJ_DESTRUCT(&ompi_err_rma_range);
OBJ_DESTRUCT(&ompi_err_rma_conflict);
OBJ_DESTRUCT(&ompi_err_win);
OBJ_DESTRUCT(&ompi_errcodes_intern);
return OMPI_SUCCESS;

Просмотреть файл

@ -66,7 +66,11 @@ enum {
OMPI_ERR_REQUEST = OMPI_ERR_BASE - 1,
OMPI_ERR_RMA_SYNC = OMPI_ERR_BASE - 2,
OMPI_ERR_RMA_SHARED = OMPI_ERR_BASE - 3
OMPI_ERR_RMA_SHARED = OMPI_ERR_BASE - 3,
OMPI_ERR_RMA_ATTACH = OMPI_ERR_BASE - 4,
OMPI_ERR_RMA_RANGE = OMPI_ERR_BASE - 5,
OMPI_ERR_RMA_CONFLICT = OMPI_ERR_BASE - 6,
OMPI_ERR_WIN = OMPI_ERR_BASE - 7,
};
#define OMPI_ERR_MAX (OMPI_ERR_BASE - 100)

Просмотреть файл

@ -91,7 +91,7 @@ static void mca_bml_base_completion(
{
mca_bml_base_context_t* ctx = (mca_bml_base_context_t*) des->des_cbdata;
/* restore original state */
((unsigned char*)des->des_local[0].seg_addr.pval)[ctx->index] ^= ~0;
((unsigned char*)des->des_segments[0].seg_addr.pval)[ctx->index] ^= ~0;
des->des_cbdata = ctx->cbdata;
des->des_cbfunc = ctx->cbfunc;
free(ctx);
@ -121,11 +121,11 @@ int mca_bml_base_send( mca_bml_base_btl_t* bml_btl,
malloc(sizeof(mca_bml_base_context_t));
if(NULL != ctx) {
opal_output(0, "%s:%d: corrupting data\n", __FILE__, __LINE__);
ctx->index = (size_t) ((des->des_local[0].seg_len *
ctx->index = (size_t) ((des->des_segments[0].seg_len *
opal_rand(&mca_bml_base_rand_buff) * 1.0) / (UINT32_MAX + 1.0));
ctx->cbfunc = des->des_cbfunc;
ctx->cbdata = des->des_cbdata;
((unsigned char*)des->des_local[0].seg_addr.pval)[ctx->index] ^= ~0;
((unsigned char*)des->des_segments[0].seg_addr.pval)[ctx->index] ^= ~0;
des->des_cbdata = ctx;
des->des_cbfunc = mca_bml_base_completion;
}

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
@ -10,7 +11,7 @@
* Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@ -307,27 +308,30 @@ static inline int mca_bml_base_sendi( mca_bml_base_btl_t* bml_btl,
payload_size, order, flags, tag, descriptor);
}
static inline int mca_bml_base_put( mca_bml_base_btl_t* bml_btl,
mca_btl_base_descriptor_t* des)
static inline int mca_bml_base_put( mca_bml_base_btl_t* bml_btl, void *local_address, uint64_t remote_address,
struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, size_t size,
int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbdata)
{
mca_btl_base_module_t* btl = bml_btl->btl;
des->des_context = (void*) bml_btl;
return btl->btl_put( btl, bml_btl->btl_endpoint, des );
return btl->btl_put( btl, bml_btl->btl_endpoint, local_address, remote_address, local_handle,
remote_handle, size, flags, order, cbfunc, (void *) bml_btl, cbdata);
}
static inline int mca_bml_base_get( mca_bml_base_btl_t* bml_btl,
mca_btl_base_descriptor_t* des)
static inline int mca_bml_base_get( mca_bml_base_btl_t* bml_btl, void *local_address, uint64_t remote_address,
struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, size_t size,
int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbdata)
{
mca_btl_base_module_t* btl = bml_btl->btl;
des->des_context = (void*) bml_btl;
return btl->btl_get( btl, bml_btl->btl_endpoint, des );
return btl->btl_get( btl, bml_btl->btl_endpoint, local_address, remote_address, local_handle,
remote_handle, size, flags, order, cbfunc, (void *) bml_btl, cbdata);
}
static inline void mca_bml_base_prepare_src(mca_bml_base_btl_t* bml_btl,
mca_mpool_base_registration_t* reg,
struct opal_convertor_t* conv,
uint8_t order,
size_t reserve,
@ -337,29 +341,27 @@ static inline void mca_bml_base_prepare_src(mca_bml_base_btl_t* bml_btl,
{
mca_btl_base_module_t* btl = bml_btl->btl;
*des = btl->btl_prepare_src( btl, bml_btl->btl_endpoint, reg, conv,
*des = btl->btl_prepare_src( btl, bml_btl->btl_endpoint, conv,
order, reserve, size, flags );
if( OPAL_LIKELY((*des) != NULL) ) {
(*des)->des_context = (void*) bml_btl;
}
}
static inline void mca_bml_base_prepare_dst(mca_bml_base_btl_t* bml_btl,
mca_mpool_base_registration_t* reg,
struct opal_convertor_t* conv,
uint8_t order,
size_t reserve,
size_t *size,
uint32_t flags,
mca_btl_base_descriptor_t** des)
{
static inline void mca_bml_base_register_mem (mca_bml_base_btl_t* bml_btl, void *base,
size_t size, uint32_t flags,
mca_btl_base_registration_handle_t **handle)
{
mca_btl_base_module_t* btl = bml_btl->btl;
*des = btl->btl_prepare_dst( btl, bml_btl->btl_endpoint, reg, conv,
order, reserve, size, flags );
if( OPAL_LIKELY((*des) != NULL) ) {
(*des)->des_context = (void*) bml_btl;
}
*handle = btl->btl_register_mem (btl, bml_btl->btl_endpoint, base, size, flags);
}
static inline void mca_bml_base_deregister_mem (mca_bml_base_btl_t* bml_btl, mca_btl_base_registration_handle_t *handle)
{
mca_btl_base_module_t* btl = bml_btl->btl;
btl->btl_deregister_mem (btl, handle);
}
/*

Просмотреть файл

@ -86,9 +86,7 @@ static int mca_bml_r2_add_btls( void )
return OMPI_ERR_OUT_OF_RESOURCE;
}
for(selected_btl = (mca_btl_base_selected_module_t*)opal_list_get_first(btls);
selected_btl != (mca_btl_base_selected_module_t*)opal_list_get_end(btls);
selected_btl = (mca_btl_base_selected_module_t*)opal_list_get_next(selected_btl)) {
OPAL_LIST_FOREACH(selected_btl, btls, mca_btl_base_selected_module_t) {
mca_btl_base_module_t *btl = selected_btl->btl_module;
mca_bml_r2.btl_modules[mca_bml_r2.num_btl_modules++] = btl;
for (i = 0; NULL != btl_names_argv && NULL != btl_names_argv[i]; ++i) {
@ -127,6 +125,23 @@ static int btl_bandwidth_compare(const void *v1, const void *v2)
return b2->btl->btl_bandwidth - b1->btl->btl_bandwidth;
}
static void mca_bml_r2_calculate_bandwidth_latency (mca_bml_base_btl_array_t *btl_array, double *total_bandwidth, uint32_t *latency)
{
const size_t array_length = mca_bml_base_btl_array_get_size (btl_array);
*latency = UINT_MAX;
*total_bandwidth = 0.;
for (size_t i = 0 ; i < array_length ; ++i) {
mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_get_index (btl_array, i);
mca_btl_base_module_t *btl = bml_btl->btl;
*total_bandwidth += btl->btl_bandwidth;
if (btl->btl_latency < *latency) {
*latency = btl->btl_latency;
}
}
}
/*
* For each proc setup a datastructure that indicates the BTLs
* that can be used to reach the destination.
@ -189,6 +204,7 @@ static int mca_bml_r2_add_procs( size_t nprocs,
for(p_index = 0; p_index < mca_bml_r2.num_btl_modules; p_index++) {
mca_btl_base_module_t* btl = mca_bml_r2.btl_modules[p_index];
int btl_inuse = 0;
int btl_flags;
/* if the r2 can reach the destination proc it sets the
* corresponding bit (proc index) in the reachable bitmap
@ -212,7 +228,7 @@ static int mca_bml_r2_add_procs( size_t nprocs,
ompi_proc_t *proc = new_procs[p];
mca_bml_base_endpoint_t * bml_endpoint =
(mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
mca_bml_base_btl_t* bml_btl;
mca_bml_base_btl_t* bml_btl = NULL;
size_t size;
if(NULL == bml_endpoint) {
@ -236,12 +252,35 @@ static int mca_bml_r2_add_procs( size_t nprocs,
bml_endpoint->btl_flags_or = 0;
}
btl_flags = btl->btl_flags;
if( (btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put) ) {
opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for"
" the %s BTL without any PUT function attached. Discard the flag !",
bml_btl->btl->btl_component->btl_version.mca_component_name);
btl_flags ^= MCA_BTL_FLAGS_PUT;
}
if( (btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get) ) {
opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for"
" the %s BTL without any GET function attached. Discard the flag !",
bml_btl->btl->btl_component->btl_version.mca_component_name);
btl_flags ^= MCA_BTL_FLAGS_GET;
}
if( (btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0 ) {
/**
* If no protocol specified, we have 2 choices: we ignore the BTL
* as we don't know which protocl to use, or we suppose that all
* BTLs support the send protocol.
*/
btl_flags |= MCA_BTL_FLAGS_SEND;
}
/* dont allow an additional BTL with a lower exclusivity ranking */
size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
if(size > 0) {
bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, size-1);
/* skip this btl if the exclusivity is less than the previous */
if(bml_btl->btl->btl_exclusivity > btl->btl_exclusivity) {
/* skip this btl if the exclusivity is less than the previous only if the btl does not provide full rdma (for one-sided) */
if(bml_btl->btl->btl_exclusivity > btl->btl_exclusivity && ((btl_flags & MCA_BTL_FLAGS_RDMA) != MCA_BTL_FLAGS_RDMA)) {
btl->btl_del_procs(btl, 1, (opal_proc_t**)&proc, &btl_endpoints[p]);
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"mca: bml: Not using %s btl to %s on node %s "
@ -261,39 +300,44 @@ static int mca_bml_r2_add_procs( size_t nprocs,
proc->super.proc_hostname);
/* cache the endpoint on the proc */
bml_btl = mca_bml_base_btl_array_insert(&bml_endpoint->btl_send);
bml_btl->btl = btl;
bml_btl->btl_endpoint = btl_endpoints[p];
bml_btl->btl_weight = 0;
bml_btl->btl_flags = btl->btl_flags;
if( (bml_btl->btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put) ) {
opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for"
" the %s BTL without any PUT function attached. Discard the flag !",
bml_btl->btl->btl_component->btl_version.mca_component_name);
bml_btl->btl_flags ^= MCA_BTL_FLAGS_PUT;
}
if( (bml_btl->btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get) ) {
opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for"
" the %s BTL without any GET function attached. Discard the flag !",
bml_btl->btl->btl_component->btl_version.mca_component_name);
bml_btl->btl_flags ^= MCA_BTL_FLAGS_GET;
}
if( (bml_btl->btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0 ) {
if (NULL == bml_btl || (bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity)) {
bml_btl = mca_bml_base_btl_array_insert(&bml_endpoint->btl_send);
bml_btl->btl = btl;
bml_btl->btl_endpoint = btl_endpoints[p];
bml_btl->btl_weight = 0;
bml_btl->btl_flags = btl_flags;
/**
* If no protocol specified, we have 2 choices: we ignore the BTL
* as we don't know which protocl to use, or we suppose that all
* BTLs support the send protocol.
* calculate the bitwise OR of the btl flags
*/
bml_btl->btl_flags |= MCA_BTL_FLAGS_SEND;
bml_endpoint->btl_flags_or |= bml_btl->btl_flags;
}
/**
* calculate the bitwise OR of the btl flags
*/
bml_endpoint->btl_flags_or |= bml_btl->btl_flags;
/* always add rdma endpoints */
if ((btl_flags & MCA_BTL_FLAGS_RDMA) &&
!((proc->super.proc_arch != ompi_proc_local_proc->super.proc_arch) &&
(0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) {
mca_bml_base_btl_t *bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma);
bml_btl_rdma->btl = btl;
bml_btl_rdma->btl_endpoint = btl_endpoints[p];
bml_btl_rdma->btl_weight = 0;
bml_btl_rdma->btl_flags = btl_flags;
if (bml_endpoint->btl_pipeline_send_length < btl->btl_rdma_pipeline_send_length) {
bml_endpoint->btl_pipeline_send_length = btl->btl_rdma_pipeline_send_length;
}
if (bml_endpoint->btl_send_limit < btl->btl_min_rdma_pipeline_size) {
bml_endpoint->btl_send_limit = btl->btl_min_rdma_pipeline_size;
}
}
/* This BTL is in use, allow the progress registration */
btl_inuse++;
}
}
if(btl_inuse > 0 && NULL != btl->btl_component->btl_progress) {
size_t p;
bool found = false;
@ -319,9 +363,8 @@ static int mca_bml_r2_add_procs( size_t nprocs,
mca_bml_base_endpoint_t* bml_endpoint =
(mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
double total_bandwidth = 0;
uint32_t latency = 0xffffffff;
size_t n_index;
size_t n_size;
uint32_t latency;
size_t n_send, n_rdma;
/* skip over procs w/ no btl's registered */
if(NULL == bml_endpoint) {
@ -335,28 +378,22 @@ static int mca_bml_r2_add_procs( size_t nprocs,
* weighting. Once the left over is smaller than this number we will
* start using the weight to compute the correct amount.
*/
n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
n_send = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
n_rdma = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
/* sort BTLs in descending order according to bandwidth value */
qsort(bml_endpoint->btl_send.bml_btls, n_size,
qsort(bml_endpoint->btl_send.bml_btls, n_send,
sizeof(mca_bml_base_btl_t), btl_bandwidth_compare);
bml_endpoint->btl_rdma_index = 0;
for(n_index = 0; n_index < n_size; n_index++) {
mca_bml_base_btl_t* bml_btl =
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index);
mca_btl_base_module_t* btl = bml_btl->btl;
total_bandwidth += bml_btl->btl->btl_bandwidth;
if(btl->btl_latency < latency) {
latency = btl->btl_latency;
}
}
mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_send, &total_bandwidth, &latency);
/* (1) set the weight of each btl as a percentage of overall bandwidth
* (2) copy all btl instances at the highest priority ranking into the
* list of btls used for first fragments
*/
for(n_index = 0; n_index < n_size; n_index++) {
for (size_t n_index = 0 ; n_index < n_send ; ++n_index) {
mca_bml_base_btl_t* bml_btl =
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index);
mca_btl_base_module_t *btl = bml_btl->btl;
@ -365,7 +402,7 @@ static int mca_bml_r2_add_procs( size_t nprocs,
if(btl->btl_bandwidth > 0) {
bml_btl->btl_weight = (float)(btl->btl_bandwidth / total_bandwidth);
} else {
bml_btl->btl_weight = (float)(1.0 / n_size);
bml_btl->btl_weight = (float)(1.0 / n_send);
}
/* check to see if this r2 is already in the array of r2s
@ -380,21 +417,24 @@ static int mca_bml_r2_add_procs( size_t nprocs,
/* set endpoint max send size as min of available btls */
if(bml_endpoint->btl_max_send_size > btl->btl_max_send_size)
bml_endpoint->btl_max_send_size = btl->btl_max_send_size;
}
/* check flags - is rdma prefered */
if ((btl->btl_flags & (MCA_BTL_FLAGS_PUT|MCA_BTL_FLAGS_GET)) &&
!((proc->super.proc_arch != ompi_proc_local_proc->super.proc_arch) &&
(0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) {
mca_bml_base_btl_t* bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma);
mca_btl_base_module_t* btl_rdma = bml_btl->btl;
/* sort BTLs in descending order according to bandwidth value */
qsort(bml_endpoint->btl_rdma.bml_btls, n_rdma,
sizeof(mca_bml_base_btl_t), btl_bandwidth_compare);
*bml_btl_rdma = *bml_btl;
if(bml_endpoint->btl_pipeline_send_length < btl_rdma->btl_rdma_pipeline_send_length) {
bml_endpoint->btl_pipeline_send_length = btl_rdma->btl_rdma_pipeline_send_length;
}
if(bml_endpoint->btl_send_limit < btl_rdma->btl_min_rdma_pipeline_size) {
bml_endpoint->btl_send_limit = btl_rdma->btl_min_rdma_pipeline_size;
}
mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_rdma, &total_bandwidth, &latency);
/* set rdma btl weights */
for (size_t n_index = 0 ; n_index < n_rdma ; ++n_index) {
mca_bml_base_btl_t *bml_btl =
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma, n_index);
/* compute weighting factor for this r2 */
if (bml_btl->btl->btl_bandwidth > 0.0) {
bml_btl->btl_weight = (float)(bml_btl->btl->btl_bandwidth / total_bandwidth);
} else {
bml_btl->btl_weight = (float)(1.0 / n_rdma);
}
}
}

Просмотреть файл

@ -8,6 +8,8 @@
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2014 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -15,39 +17,39 @@
# $HEADER$
#
rdma_sources = \
osc_rdma.h \
osc_rdma.c \
osc_rdma_comm.c \
osc_rdma_component.c \
osc_rdma_data_move.h \
osc_rdma_data_move.c \
osc_rdma_frag.h \
osc_rdma_frag.c \
osc_rdma_header.h \
osc_rdma_obj_convert.h \
osc_rdma_request.h \
osc_rdma_request.c \
osc_rdma_active_target.c \
osc_rdma_passive_target.c
pt2pt_sources = \
osc_pt2pt.h \
osc_pt2pt_module.c \
osc_pt2pt_comm.c \
osc_pt2pt_component.c \
osc_pt2pt_data_move.h \
osc_pt2pt_data_move.c \
osc_pt2pt_frag.h \
osc_pt2pt_frag.c \
osc_pt2pt_header.h \
osc_pt2pt_obj_convert.h \
osc_pt2pt_request.h \
osc_pt2pt_request.c \
osc_pt2pt_active_target.c \
osc_pt2pt_passive_target.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_ompi_osc_rdma_DSO
if MCA_BUILD_ompi_osc_pt2pt_DSO
component_noinst =
component_install = mca_osc_rdma.la
component_install = mca_osc_pt2pt.la
else
component_noinst = libmca_osc_rdma.la
component_noinst = libmca_osc_pt2pt.la
component_install =
endif
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_osc_rdma_la_SOURCES = $(rdma_sources)
mca_osc_rdma_la_LDFLAGS = -module -avoid-version
mca_osc_pt2pt_la_SOURCES = $(pt2pt_sources)
mca_osc_pt2pt_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_osc_rdma_la_SOURCES = $(rdma_sources)
libmca_osc_rdma_la_LDFLAGS = -module -avoid-version
libmca_osc_pt2pt_la_SOURCES = $(pt2pt_sources)
libmca_osc_pt2pt_la_LDFLAGS = -module -avoid-version

20
ompi/mca/osc/pt2pt/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,20 @@
# -*- shell-script -*-
#
# Copyright (c) 2013 Sandia National Laboratories. All rights reserved.
# Copyright (c) 2014 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_ompi_osc_pt2pt_CONFIG([action-if-can-compile],
# [action-if-cant-compile])
# ------------------------------------------------
# We can always build, unless we were explicitly disabled.
AC_DEFUN([MCA_ompi_osc_pt2pt_CONFIG],[
AC_CONFIG_FILES([ompi/mca/osc/pt2pt/Makefile])
[$1]
])dnl

Просмотреть файл

@ -19,8 +19,8 @@
* $HEADER$
*/
#ifndef OMPI_OSC_RDMA_H
#define OMPI_OSC_RDMA_H
#ifndef OMPI_OSC_PT2PT_H
#define OMPI_OSC_PT2PT_H
#include "ompi_config.h"
#include "opal/class/opal_list.h"
@ -39,13 +39,13 @@
#include "ompi/mca/bml/bml.h"
#include "ompi/memchecker.h"
#include "osc_rdma_header.h"
#include "osc_pt2pt_header.h"
BEGIN_C_DECLS
struct ompi_osc_rdma_frag_t;
struct ompi_osc_pt2pt_frag_t;
struct ompi_osc_rdma_component_t {
struct ompi_osc_pt2pt_component_t {
/** Extend the basic osc component interface */
ompi_osc_base_component_t super;
@ -58,46 +58,45 @@ struct ompi_osc_rdma_component_t {
/** module count */
int module_count;
/** free list of ompi_osc_rdma_frag_t structures */
opal_free_list_t frags;
/** free list of ompi_osc_pt2pt_frag_t structures */
ompi_free_list_t frags;
/** Free list of requests */
ompi_free_list_t requests;
/** RDMA component buffer size */
/** PT2PT component buffer size */
unsigned int buffer_size;
/** Lock for pending_operations */
opal_mutex_t pending_operations_lock;
/** List of operations that need to be processed */
opal_list_t pending_operations;
/** Is the progress function enabled? */
bool progress_enable;
/** List of requests that need to be freed */
opal_list_t request_gc;
/** List of buffers that need to be freed */
opal_list_t buffer_gc;
};
typedef struct ompi_osc_rdma_component_t ompi_osc_rdma_component_t;
typedef struct ompi_osc_pt2pt_component_t ompi_osc_pt2pt_component_t;
struct ompi_osc_rdma_peer_t {
struct ompi_osc_pt2pt_peer_t {
/** Pointer to the current send fragment for each outgoing target */
struct ompi_osc_rdma_frag_t *active_frag;
struct ompi_osc_pt2pt_frag_t *active_frag;
/** Number of acks pending. New requests can not be sent out if there are
* acks pending (to fulfill the ordering constraints of accumulate) */
uint32_t num_acks_pending;
int32_t passive_incoming_frag_count;
bool access_epoch;
bool eager_send_active;
};
typedef struct ompi_osc_rdma_peer_t ompi_osc_rdma_peer_t;
typedef struct ompi_osc_pt2pt_peer_t ompi_osc_pt2pt_peer_t;
#define SEQ_INVALID 0xFFFFFFFFFFFFFFFFULL
/** Module structure. Exactly one of these is associated with each
RDMA window */
struct ompi_osc_rdma_module_t {
PT2PT window */
struct ompi_osc_pt2pt_module_t {
/** Extend the basic osc module interface */
ompi_osc_base_module_t super;
@ -127,12 +126,15 @@ struct ompi_osc_rdma_module_t {
opal_mutex_t acc_lock;
/** peer data */
ompi_osc_rdma_peer_t *peers;
ompi_osc_pt2pt_peer_t *peers;
/** Nmber of communication fragments started for this epoch, by
peer. Not in peer data to make fence more manageable. */
uint32_t *epoch_outgoing_frag_count;
/** Lock for queued_frags */
opal_mutex_t queued_frags_lock;
/** List of full communication buffers queued to be sent. Should
be maintained in order (at least in per-target order). */
opal_list_t queued_frags;
@ -152,9 +154,6 @@ struct ompi_osc_rdma_module_t {
/* Next incoming buffer count at which we want a signal on cond */
uint32_t active_incoming_frag_signal_count;
uint32_t *passive_incoming_frag_count;
uint32_t *passive_incoming_frag_signal_count;
/* Number of flush ack requests send since beginning of time */
uint64_t flush_ack_requested_count;
/* Number of flush ack replies received since beginning of
@ -171,8 +170,6 @@ struct ompi_osc_rdma_module_t {
/** Indicates the window is in an all access epoch (fence, lock_all) */
bool all_access_epoch;
bool *passive_eager_send_active;
/* ********************* PWSC data ************************ */
struct ompi_group_t *pw_group;
struct ompi_group_t *sc_group;
@ -189,9 +186,11 @@ struct ompi_osc_rdma_module_t {
/** Status of the local window lock. One of 0 (unlocked),
MPI_LOCK_EXCLUSIVE, or MPI_LOCK_SHARED. */
int lock_status;
/** number of peers who hold a shared lock on the local window */
int32_t shared_count;
int32_t lock_status;
/** lock for locks_pending list */
opal_mutex_t locks_pending_lock;
/** target side list of lock requests we couldn't satisfy yet */
opal_list_t locks_pending;
@ -210,29 +209,38 @@ struct ompi_osc_rdma_module_t {
/* enforce pscw matching */
/** list of unmatched post messages */
opal_list_t pending_posts;
};
typedef struct ompi_osc_rdma_module_t ompi_osc_rdma_module_t;
OMPI_MODULE_DECLSPEC extern ompi_osc_rdma_component_t mca_osc_rdma_component;
struct ompi_osc_rdma_pending_t {
/** Lock for garbage collection lists */
opal_mutex_t gc_lock;
/** List of requests that need to be freed */
opal_list_t request_gc;
/** List of buffers that need to be freed */
opal_list_t buffer_gc;
};
typedef struct ompi_osc_pt2pt_module_t ompi_osc_pt2pt_module_t;
OMPI_MODULE_DECLSPEC extern ompi_osc_pt2pt_component_t mca_osc_pt2pt_component;
struct ompi_osc_pt2pt_pending_t {
opal_list_item_t super;
ompi_osc_rdma_module_t *module;
ompi_osc_pt2pt_module_t *module;
int source;
ompi_osc_rdma_header_t header;
ompi_osc_pt2pt_header_t header;
};
typedef struct ompi_osc_rdma_pending_t ompi_osc_rdma_pending_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_pending_t);
typedef struct ompi_osc_pt2pt_pending_t ompi_osc_pt2pt_pending_t;
OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_pending_t);
#define GET_MODULE(win) ((ompi_osc_rdma_module_t*) win->w_osc_module)
#define GET_MODULE(win) ((ompi_osc_pt2pt_module_t*) win->w_osc_module)
extern bool ompi_osc_rdma_no_locks;
extern bool ompi_osc_pt2pt_no_locks;
int ompi_osc_rdma_attach(struct ompi_win_t *win, void *base, size_t len);
int ompi_osc_rdma_detach(struct ompi_win_t *win, void *base);
int ompi_osc_pt2pt_attach(struct ompi_win_t *win, void *base, size_t len);
int ompi_osc_pt2pt_detach(struct ompi_win_t *win, void *base);
int ompi_osc_rdma_free(struct ompi_win_t *win);
int ompi_osc_pt2pt_free(struct ompi_win_t *win);
int ompi_osc_rdma_put(void *origin_addr,
int ompi_osc_pt2pt_put(void *origin_addr,
int origin_count,
struct ompi_datatype_t *origin_dt,
int target,
@ -241,7 +249,7 @@ int ompi_osc_rdma_put(void *origin_addr,
struct ompi_datatype_t *target_dt,
struct ompi_win_t *win);
int ompi_osc_rdma_accumulate(void *origin_addr,
int ompi_osc_pt2pt_accumulate(void *origin_addr,
int origin_count,
struct ompi_datatype_t *origin_dt,
int target,
@ -251,7 +259,7 @@ int ompi_osc_rdma_accumulate(void *origin_addr,
struct ompi_op_t *op,
struct ompi_win_t *win);
int ompi_osc_rdma_get(void *origin_addr,
int ompi_osc_pt2pt_get(void *origin_addr,
int origin_count,
struct ompi_datatype_t *origin_dt,
int target,
@ -260,7 +268,7 @@ int ompi_osc_rdma_get(void *origin_addr,
struct ompi_datatype_t *target_dt,
struct ompi_win_t *win);
int ompi_osc_rdma_compare_and_swap(void *origin_addr,
int ompi_osc_pt2pt_compare_and_swap(void *origin_addr,
void *compare_addr,
void *result_addr,
struct ompi_datatype_t *dt,
@ -268,7 +276,7 @@ int ompi_osc_rdma_compare_and_swap(void *origin_addr,
OPAL_PTRDIFF_TYPE target_disp,
struct ompi_win_t *win);
int ompi_osc_rdma_fetch_and_op(void *origin_addr,
int ompi_osc_pt2pt_fetch_and_op(void *origin_addr,
void *result_addr,
struct ompi_datatype_t *dt,
int target,
@ -276,7 +284,7 @@ int ompi_osc_rdma_fetch_and_op(void *origin_addr,
struct ompi_op_t *op,
struct ompi_win_t *win);
int ompi_osc_rdma_get_accumulate(void *origin_addr,
int ompi_osc_pt2pt_get_accumulate(void *origin_addr,
int origin_count,
struct ompi_datatype_t *origin_datatype,
void *result_addr,
@ -289,7 +297,7 @@ int ompi_osc_rdma_get_accumulate(void *origin_addr,
struct ompi_op_t *op,
struct ompi_win_t *win);
int ompi_osc_rdma_rput(void *origin_addr,
int ompi_osc_pt2pt_rput(void *origin_addr,
int origin_count,
struct ompi_datatype_t *origin_dt,
int target,
@ -299,7 +307,7 @@ int ompi_osc_rdma_rput(void *origin_addr,
struct ompi_win_t *win,
struct ompi_request_t **request);
int ompi_osc_rdma_rget(void *origin_addr,
int ompi_osc_pt2pt_rget(void *origin_addr,
int origin_count,
struct ompi_datatype_t *origin_dt,
int target,
@ -309,7 +317,7 @@ int ompi_osc_rdma_rget(void *origin_addr,
struct ompi_win_t *win,
struct ompi_request_t **request);
int ompi_osc_rdma_raccumulate(void *origin_addr,
int ompi_osc_pt2pt_raccumulate(void *origin_addr,
int origin_count,
struct ompi_datatype_t *origin_dt,
int target,
@ -320,7 +328,7 @@ int ompi_osc_rdma_raccumulate(void *origin_addr,
struct ompi_win_t *win,
struct ompi_request_t **request);
int ompi_osc_rdma_rget_accumulate(void *origin_addr,
int ompi_osc_pt2pt_rget_accumulate(void *origin_addr,
int origin_count,
struct ompi_datatype_t *origin_datatype,
void *result_addr,
@ -334,51 +342,51 @@ int ompi_osc_rdma_rget_accumulate(void *origin_addr,
struct ompi_win_t *win,
struct ompi_request_t **request);
int ompi_osc_rdma_fence(int assert, struct ompi_win_t *win);
int ompi_osc_pt2pt_fence(int assert, struct ompi_win_t *win);
/* received a post message */
int osc_rdma_incoming_post (ompi_osc_rdma_module_t *module, int source);
int osc_pt2pt_incoming_post (ompi_osc_pt2pt_module_t *module, int source);
int ompi_osc_rdma_start(struct ompi_group_t *group,
int ompi_osc_pt2pt_start(struct ompi_group_t *group,
int assert,
struct ompi_win_t *win);
int ompi_osc_rdma_complete(struct ompi_win_t *win);
int ompi_osc_pt2pt_complete(struct ompi_win_t *win);
int ompi_osc_rdma_post(struct ompi_group_t *group,
int ompi_osc_pt2pt_post(struct ompi_group_t *group,
int assert,
struct ompi_win_t *win);
int ompi_osc_rdma_wait(struct ompi_win_t *win);
int ompi_osc_pt2pt_wait(struct ompi_win_t *win);
int ompi_osc_rdma_test(struct ompi_win_t *win,
int ompi_osc_pt2pt_test(struct ompi_win_t *win,
int *flag);
int ompi_osc_rdma_lock(int lock_type,
int ompi_osc_pt2pt_lock(int lock_type,
int target,
int assert,
struct ompi_win_t *win);
int ompi_osc_rdma_unlock(int target,
int ompi_osc_pt2pt_unlock(int target,
struct ompi_win_t *win);
int ompi_osc_rdma_lock_all(int assert,
int ompi_osc_pt2pt_lock_all(int assert,
struct ompi_win_t *win);
int ompi_osc_rdma_unlock_all(struct ompi_win_t *win);
int ompi_osc_pt2pt_unlock_all(struct ompi_win_t *win);
int ompi_osc_rdma_sync(struct ompi_win_t *win);
int ompi_osc_pt2pt_sync(struct ompi_win_t *win);
int ompi_osc_rdma_flush(int target,
int ompi_osc_pt2pt_flush(int target,
struct ompi_win_t *win);
int ompi_osc_rdma_flush_all(struct ompi_win_t *win);
int ompi_osc_rdma_flush_local(int target,
int ompi_osc_pt2pt_flush_all(struct ompi_win_t *win);
int ompi_osc_pt2pt_flush_local(int target,
struct ompi_win_t *win);
int ompi_osc_rdma_flush_local_all(struct ompi_win_t *win);
int ompi_osc_pt2pt_flush_local_all(struct ompi_win_t *win);
int ompi_osc_rdma_set_info(struct ompi_win_t *win, struct ompi_info_t *info);
int ompi_osc_rdma_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used);
int ompi_osc_pt2pt_set_info(struct ompi_win_t *win, struct ompi_info_t *info);
int ompi_osc_pt2pt_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used);
int ompi_osc_rdma_component_irecv(ompi_osc_rdma_module_t *module,
int ompi_osc_pt2pt_component_irecv(ompi_osc_pt2pt_module_t *module,
void *buf,
size_t count,
struct ompi_datatype_t *datatype,
@ -386,7 +394,7 @@ int ompi_osc_rdma_component_irecv(ompi_osc_rdma_module_t *module,
int tag,
struct ompi_communicator_t *comm);
int ompi_osc_rdma_component_isend(ompi_osc_rdma_module_t *module,
int ompi_osc_pt2pt_component_isend(ompi_osc_pt2pt_module_t *module,
void *buf,
size_t count,
struct ompi_datatype_t *datatype,
@ -395,16 +403,16 @@ int ompi_osc_rdma_component_isend(ompi_osc_rdma_module_t *module,
struct ompi_communicator_t *comm);
/**
* ompi_osc_rdma_progress_pending_acc:
* ompi_osc_pt2pt_progress_pending_acc:
*
* @short Progress one pending accumulation or compare and swap operation.
*
* @param[in] module - OSC RDMA module
* @param[in] module - OSC PT2PT module
*
* @long If the accumulation lock can be aquired progress one pending
* accumulate or compare and swap operation.
*/
int ompi_osc_rdma_progress_pending_acc (ompi_osc_rdma_module_t *module);
int ompi_osc_pt2pt_progress_pending_acc (ompi_osc_pt2pt_module_t *module);
/**
@ -412,7 +420,7 @@ int ompi_osc_rdma_progress_pending_acc (ompi_osc_rdma_module_t *module);
*
* @short Increment incoming completeion count.
*
* @param[in] module - OSC RDMA module
* @param[in] module - OSC PT2PT module
* @param[in] source - Passive target source or MPI_PROC_NULL (active target)
*
* @long This function incremements either the passive or active incoming counts.
@ -420,7 +428,7 @@ int ompi_osc_rdma_progress_pending_acc (ompi_osc_rdma_module_t *module);
* This function uses atomics if necessary so it is not necessary to hold
* the module lock before calling this function.
*/
static inline void mark_incoming_completion (ompi_osc_rdma_module_t *module, int source)
static inline void mark_incoming_completion (ompi_osc_pt2pt_module_t *module, int source)
{
if (MPI_PROC_NULL == source) {
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
@ -431,11 +439,12 @@ static inline void mark_incoming_completion (ompi_osc_rdma_module_t *module, int
opal_condition_broadcast(&module->cond);
}
} else {
ompi_osc_pt2pt_peer_t *peer = module->peers + source;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"mark_incoming_completion marking passive incoming complete. source = %d, count = %d",
source, (int) module->passive_incoming_frag_count[source] + 1));
OPAL_THREAD_ADD32((int32_t *) (module->passive_incoming_frag_count + source), 1);
if (module->passive_incoming_frag_count[source] >= module->passive_incoming_frag_signal_count[source]) {
source, (int) peer->passive_incoming_frag_count + 1));
OPAL_THREAD_ADD32((int32_t *) &peer->passive_incoming_frag_count, 1);
if (0 == peer->passive_incoming_frag_count) {
opal_condition_broadcast(&module->cond);
}
}
@ -446,7 +455,7 @@ static inline void mark_incoming_completion (ompi_osc_rdma_module_t *module, int
*
* @short Increment outgoing count.
*
* @param[in] module - OSC RDMA module
* @param[in] module - OSC PT2PT module
*
* @long This function is used to signal that an outgoing send is complete. It
* incrememnts only the outgoing fragment count and signals the module
@ -454,7 +463,7 @@ static inline void mark_incoming_completion (ompi_osc_rdma_module_t *module, int
* uses atomics if necessary so it is not necessary to hold the module
* lock before calling this function.
*/
static inline void mark_outgoing_completion (ompi_osc_rdma_module_t *module)
static inline void mark_outgoing_completion (ompi_osc_pt2pt_module_t *module)
{
OPAL_THREAD_ADD32((int32_t *) &module->outgoing_frag_count, 1);
if (module->outgoing_frag_count >= module->outgoing_frag_signal_count) {
@ -467,14 +476,14 @@ static inline void mark_outgoing_completion (ompi_osc_rdma_module_t *module)
*
* @short Increment outgoing signal counters.
*
* @param[in] module - OSC RDMA module
* @param[in] module - OSC PT2PT module
* @param[in] target - Passive target rank or MPI_PROC_NULL (active target)
* @param[in] count - Number of outgoing messages to signal.
*
* @long This function uses atomics if necessary so it is not necessary to hold
* the module lock before calling this function.
*/
static inline void ompi_osc_signal_outgoing (ompi_osc_rdma_module_t *module, int target, int count)
static inline void ompi_osc_signal_outgoing (ompi_osc_pt2pt_module_t *module, int target, int count)
{
OPAL_THREAD_ADD32((int32_t *) &module->outgoing_frag_signal_count, count);
if (MPI_PROC_NULL != target) {
@ -486,7 +495,7 @@ static inline void ompi_osc_signal_outgoing (ompi_osc_rdma_module_t *module, int
}
/**
* osc_rdma_copy_on_recv:
* osc_pt2pt_copy_on_recv:
*
* @short Helper function. Copies data from source to target through the
* convertor.
@ -502,7 +511,7 @@ static inline void ompi_osc_signal_outgoing (ompi_osc_rdma_module_t *module, int
* buffer. The copy is done with a convertor generated from proc,
* datatype, and count.
*/
static inline void osc_rdma_copy_on_recv (void *target, void *source, size_t source_len, ompi_proc_t *proc,
static inline void osc_pt2pt_copy_on_recv (void *target, void *source, size_t source_len, ompi_proc_t *proc,
int count, ompi_datatype_t *datatype)
{
opal_convertor_t convertor;
@ -530,7 +539,7 @@ static inline void osc_rdma_copy_on_recv (void *target, void *source, size_t sou
}
/**
* osc_rdma_copy_for_send:
* osc_pt2pt_copy_for_send:
*
* @short: Helper function. Copies data from source to target through the
* convertor.
@ -546,7 +555,7 @@ static inline void osc_rdma_copy_on_recv (void *target, void *source, size_t sou
* buffer. The copy is done with a convertor generated from proc,
* datatype, and count.
*/
static inline void osc_rdma_copy_for_send (void *target, size_t target_len, void *source, ompi_proc_t *proc,
static inline void osc_pt2pt_copy_for_send (void *target, size_t target_len, void *source, ompi_proc_t *proc,
int count, ompi_datatype_t *datatype)
{
opal_convertor_t convertor;
@ -567,7 +576,7 @@ static inline void osc_rdma_copy_for_send (void *target, size_t target_len, void
}
/**
* osc_rdma_request_gc_clean:
* osc_pt2pt_request_gc_clean:
*
* @short Release finished PML requests and accumulate buffers.
*
@ -576,71 +585,77 @@ static inline void osc_rdma_copy_for_send (void *target, size_t target_len, void
* and buffers on the module's garbage collection lists and release then
* at a later time.
*/
static inline void osc_rdma_gc_clean (void)
static inline void osc_pt2pt_gc_clean (ompi_osc_pt2pt_module_t *module)
{
ompi_request_t *request;
opal_list_item_t *item;
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
OPAL_THREAD_LOCK(&module->gc_lock);
while (NULL != (request = (ompi_request_t *) opal_list_remove_first (&mca_osc_rdma_component.request_gc))) {
while (NULL != (request = (ompi_request_t *) opal_list_remove_first (&module->request_gc))) {
OPAL_THREAD_UNLOCK(&module->gc_lock);
ompi_request_free (&request);
OPAL_THREAD_LOCK(&module->gc_lock);
}
while (NULL != (item = opal_list_remove_first (&mca_osc_rdma_component.buffer_gc))) {
while (NULL != (item = opal_list_remove_first (&module->buffer_gc))) {
OBJ_RELEASE(item);
}
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
OPAL_THREAD_UNLOCK(&module->gc_lock);
}
static inline void osc_rdma_gc_add_request (ompi_request_t *request)
static inline void osc_pt2pt_gc_add_request (ompi_osc_pt2pt_module_t *module, ompi_request_t *request)
{
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
opal_list_append (&mca_osc_rdma_component.request_gc, (opal_list_item_t *) request);
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
OPAL_THREAD_SCOPED_LOCK(&module->gc_lock,
opal_list_append (&module->request_gc, (opal_list_item_t *) request));
}
static inline void osc_rdma_gc_add_buffer (opal_list_item_t *buffer)
static inline void osc_pt2pt_gc_add_buffer (ompi_osc_pt2pt_module_t *module, opal_list_item_t *buffer)
{
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
opal_list_append (&mca_osc_rdma_component.buffer_gc, buffer);
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
OPAL_THREAD_SCOPED_LOCK(&module->gc_lock,
opal_list_append (&module->buffer_gc, buffer));
}
#define OSC_RDMA_FRAG_TAG 0x10000
#define OSC_RDMA_FRAG_MASK 0x0ffff
static inline void osc_pt2pt_add_pending (ompi_osc_pt2pt_pending_t *pending)
{
OPAL_THREAD_SCOPED_LOCK(&mca_osc_pt2pt_component.pending_operations_lock,
opal_list_append (&mca_osc_pt2pt_component.pending_operations, &pending->super));
}
#define OSC_PT2PT_FRAG_TAG 0x10000
#define OSC_PT2PT_FRAG_MASK 0x0ffff
/**
* get_tag:
*
* @short Get a send/recv tag for large memory operations.
*
* @param[in] module - OSC RDMA module
* @param[in] module - OSC PT2PT module
*
* @long This function aquires a 16-bit tag for use with large memory operations. The
* tag will be odd or even depending on if this is in a passive target access
* or not.
*/
static inline int get_tag(ompi_osc_rdma_module_t *module)
static inline int get_tag(ompi_osc_pt2pt_module_t *module)
{
/* the LSB of the tag is used be the receiver to determine if the
message is a passive or active target (ie, where to mark
completion). */
int tmp = module->tag_counter + !!(module->passive_target_access_epoch);
module->tag_counter = (module->tag_counter + 2) & OSC_RDMA_FRAG_MASK;
module->tag_counter = (module->tag_counter + 2) & OSC_PT2PT_FRAG_MASK;
return tmp;
}
/**
* ompi_osc_rdma_accumulate_lock:
* ompi_osc_pt2pt_accumulate_lock:
*
* @short Internal function that spins until the accumulation lock has
* been aquired.
*
* @param[in] module - OSC RDMA module
* @param[in] module - OSC PT2PT module
*
* @returns 0
*
@ -648,9 +663,9 @@ static inline int get_tag(ompi_osc_rdma_module_t *module)
* behavior is only acceptable from a user-level call as blocking in a
* callback may cause deadlock. If a callback needs the accumulate lock and
* it is not available it should be placed on the pending_acc list of the
* module. It will be released by ompi_osc_rdma_accumulate_unlock().
* module. It will be released by ompi_osc_pt2pt_accumulate_unlock().
*/
static inline int ompi_osc_rdma_accumulate_lock (ompi_osc_rdma_module_t *module)
static inline int ompi_osc_pt2pt_accumulate_lock (ompi_osc_pt2pt_module_t *module)
{
while (opal_atomic_trylock (&module->accumulate_lock)) {
opal_progress ();
@ -660,11 +675,11 @@ static inline int ompi_osc_rdma_accumulate_lock (ompi_osc_rdma_module_t *module)
}
/**
* ompi_osc_rdma_accumulate_trylock:
* ompi_osc_pt2pt_accumulate_trylock:
*
* @short Try to aquire the accumulation lock.
*
* @param[in] module - OSC RDMA module
* @param[in] module - OSC PT2PT module
*
* @returns 0 if the accumulation lock was aquired
* @returns 1 if the lock was not available
@ -672,34 +687,34 @@ static inline int ompi_osc_rdma_accumulate_lock (ompi_osc_rdma_module_t *module)
* @long This function will try to aquire the accumulation lock. This function
* is safe to call from a callback.
*/
static inline int ompi_osc_rdma_accumulate_trylock (ompi_osc_rdma_module_t *module)
static inline int ompi_osc_pt2pt_accumulate_trylock (ompi_osc_pt2pt_module_t *module)
{
return opal_atomic_trylock (&module->accumulate_lock);
}
/**
* ompi_osc_rdma_accumulate_unlock:
* ompi_osc_pt2pt_accumulate_unlock:
*
* @short Unlock the accumulation lock and release a pending accumulation operation.
*
* @param[in] module - OSC RDMA module
* @param[in] module - OSC PT2PT module
*
* @long This function unlocks the accumulation lock and release a single pending
* accumulation operation if one exists. This function may be called recursively.
*/
static inline void ompi_osc_rdma_accumulate_unlock (ompi_osc_rdma_module_t *module)
static inline void ompi_osc_pt2pt_accumulate_unlock (ompi_osc_pt2pt_module_t *module)
{
opal_atomic_unlock (&module->accumulate_lock);
if (0 != opal_list_get_size (&module->pending_acc)) {
ompi_osc_rdma_progress_pending_acc (module);
ompi_osc_pt2pt_progress_pending_acc (module);
}
}
static inline bool ompi_osc_rdma_check_access_epoch (ompi_osc_rdma_module_t *module, int rank)
static inline bool ompi_osc_pt2pt_check_access_epoch (ompi_osc_pt2pt_module_t *module, int rank)
{
return module->all_access_epoch || module->peers[rank].access_epoch;
}
END_C_DECLS
#endif /* OMPI_OSC_RDMA_H */
#endif /* OMPI_OSC_PT2PT_H */

Просмотреть файл

@ -21,10 +21,10 @@
#include "ompi_config.h"
#include "osc_rdma.h"
#include "osc_rdma_header.h"
#include "osc_rdma_data_move.h"
#include "osc_rdma_frag.h"
#include "osc_pt2pt.h"
#include "osc_pt2pt_header.h"
#include "osc_pt2pt_data_move.h"
#include "osc_pt2pt_frag.h"
#include "mpi.h"
#include "opal/runtime/opal_progress.h"
@ -33,19 +33,19 @@
#include "ompi/mca/osc/base/base.h"
/**
* ompi_osc_rdma_pending_post_t:
* ompi_osc_pt2pt_pending_post_t:
*
* Describes a post operation that was encountered outside its
* matching start operation.
*/
struct ompi_osc_rdma_pending_post_t {
struct ompi_osc_pt2pt_pending_post_t {
opal_list_item_t super;
int rank;
};
typedef struct ompi_osc_rdma_pending_post_t ompi_osc_rdma_pending_post_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_pending_post_t);
typedef struct ompi_osc_pt2pt_pending_post_t ompi_osc_pt2pt_pending_post_t;
OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_pending_post_t);
OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_post_t, opal_list_item_t, NULL, NULL);
OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_pending_post_t, opal_list_item_t, NULL, NULL);
static bool group_contains_proc (ompi_group_t *group, ompi_proc_t *proc)
{
@ -64,7 +64,7 @@ static bool group_contains_proc (ompi_group_t *group, ompi_proc_t *proc)
}
static int*
get_comm_ranks(ompi_osc_rdma_module_t *module,
get_comm_ranks(ompi_osc_pt2pt_module_t *module,
ompi_group_t *sub_group)
{
int *ranks1 = NULL, *ranks2 = NULL;
@ -100,14 +100,14 @@ get_comm_ranks(ompi_osc_rdma_module_t *module,
}
int
ompi_osc_rdma_fence(int assert, ompi_win_t *win)
ompi_osc_pt2pt_fence(int assert, ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
uint32_t incoming_reqs;
int ret = OMPI_SUCCESS;
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"osc rdma: fence start"));
"osc pt2pt: fence start"));
/* can't enter an active target epoch when in a passive target epoch */
if (module->passive_target_access_epoch) {
@ -122,34 +122,36 @@ ompi_osc_rdma_fence(int assert, ompi_win_t *win)
/* short-circuit the noprecede case */
if (0 != (assert & MPI_MODE_NOPRECEDE)) {
ret = module->comm->c_coll.coll_barrier(module->comm,
module->comm->c_coll.coll_barrier_module);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: fence end (short circuit)"));
"osc pt2pt: fence end (short circuit)"));
return ret;
}
/* try to start all the requests. */
ret = ompi_osc_rdma_frag_flush_all(module);
if (OMPI_SUCCESS != ret) goto cleanup;
/* try to start all requests. */
ret = ompi_osc_pt2pt_frag_flush_all(module);
if (OMPI_SUCCESS != ret) {
return ret;
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: fence done sending"));
"osc pt2pt: fence done sending"));
/* find out how much data everyone is going to send us. */
ret = module->comm->c_coll.coll_reduce_scatter_block (module->epoch_outgoing_frag_count,
&incoming_reqs, 1, MPI_UINT32_T,
MPI_SUM, module->comm,
module->comm->c_coll.coll_reduce_scatter_block_module);
if (OMPI_SUCCESS != ret) goto cleanup;
if (OMPI_SUCCESS != ret) {
OPAL_THREAD_UNLOCK(&module->lock);
return ret;
}
OPAL_THREAD_LOCK(&module->lock);
bzero(module->epoch_outgoing_frag_count,
sizeof(uint32_t) * ompi_comm_size(module->comm));
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: fence expects %d requests",
"osc pt2pt: fence expects %d requests",
incoming_reqs));
/* set our complete condition for incoming requests */
@ -161,32 +163,31 @@ ompi_osc_rdma_fence(int assert, ompi_win_t *win)
opal_condition_wait(&module->cond, &module->lock);
}
ret = OMPI_SUCCESS;
module->active_incoming_frag_signal_count = 0;
if (assert & MPI_MODE_NOSUCCEED) {
/* as specified in MPI-3 p 438 3-5 the fence can end an epoch. it isn't explicitly
* stated that MPI_MODE_NOSUCCEED ends the epoch but it is a safe assumption. */
module->active_eager_send_active = false;
module->all_access_epoch = false;
}
cleanup:
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"osc rdma: fence end: %d", ret));
}
opal_condition_broadcast (&module->cond);
OPAL_THREAD_UNLOCK(&module->lock);
return ret;
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"osc pt2pt: fence end: %d", ret));
return OMPI_SUCCESS;
}
int
ompi_osc_rdma_start(ompi_group_t *group,
ompi_osc_pt2pt_start(ompi_group_t *group,
int assert,
ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_rdma_pending_post_t *pending_post, *next;
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
ompi_osc_pt2pt_pending_post_t *pending_post, *next;
int group_size;
int *ranks;
@ -209,7 +210,7 @@ ompi_osc_rdma_start(ompi_group_t *group,
group_size = ompi_group_size (module->sc_group);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_start entering with group size %d...",
"ompi_osc_pt2pt_start entering with group size %d...",
group_size));
ranks = get_comm_ranks(module, module->sc_group);
@ -222,13 +223,17 @@ ompi_osc_rdma_start(ompi_group_t *group,
free (ranks);
OPAL_LIST_FOREACH_SAFE(pending_post, next, &module->pending_posts, ompi_osc_rdma_pending_post_t) {
OPAL_LIST_FOREACH_SAFE(pending_post, next, &module->pending_posts, ompi_osc_pt2pt_pending_post_t) {
ompi_proc_t *pending_proc = ompi_comm_peer_lookup (module->comm, pending_post->rank);
if (group_contains_proc (module->sc_group, pending_proc)) {
ompi_osc_pt2pt_peer_t *peer = module->peers + pending_post->rank;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "Consumed unexpected post message from %d",
pending_post->rank));
++module->num_post_msgs;
peer->eager_send_active = true;
opal_list_remove_item (&module->pending_posts, &pending_post->super);
OBJ_RELEASE(pending_post);
}
@ -254,7 +259,7 @@ ompi_osc_rdma_start(ompi_group_t *group,
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_start complete"));
"ompi_osc_pt2pt_start complete"));
OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_SUCCESS;
@ -262,11 +267,11 @@ ompi_osc_rdma_start(ompi_group_t *group,
int
ompi_osc_rdma_complete(ompi_win_t *win)
ompi_osc_pt2pt_complete(ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_rdma_header_complete_t complete_req;
ompi_osc_rdma_peer_t *peer;
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
ompi_osc_pt2pt_header_complete_t complete_req;
ompi_osc_pt2pt_peer_t *peer;
int ret = OMPI_SUCCESS;
int i;
int *ranks = NULL;
@ -274,7 +279,7 @@ ompi_osc_rdma_complete(ompi_win_t *win)
int my_rank = ompi_comm_rank (module->comm);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_complete entering..."));
"ompi_osc_pt2pt_complete entering..."));
if (NULL == module->sc_group) {
return OMPI_ERR_RMA_SYNC;
@ -291,9 +296,10 @@ ompi_osc_rdma_complete(ompi_win_t *win)
"waiting for post messages. num_post_msgs = %d", module->num_post_msgs));
opal_condition_wait(&module->cond, &module->lock);
}
OPAL_THREAD_UNLOCK(&module->lock);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_complete sending complete message"));
"ompi_osc_pt2pt_complete sending complete message"));
/* for each process in group, send a control message with number
of updates coming, then start all the requests. Note that the
@ -302,38 +308,39 @@ ompi_osc_rdma_complete(ompi_win_t *win)
At the same time, clean out the outgoing count for the next
round. */
OPAL_THREAD_UNLOCK(&module->lock);
for (i = 0 ; i < ompi_group_size(module->sc_group) ; ++i) {
if (my_rank == ranks[i]) {
/* shortcut for self */
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_rdma_complete self complete"));
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_pt2pt_complete self complete"));
module->num_complete_msgs++;
continue;
}
complete_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_COMPLETE;
complete_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
complete_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_COMPLETE;
complete_req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
complete_req.frag_count = module->epoch_outgoing_frag_count[ranks[i]];
peer = module->peers + ranks[i];
peer->access_epoch = false;
ret = ompi_osc_rdma_control_send(module,
ret = ompi_osc_pt2pt_control_send(module,
ranks[i],
&complete_req,
sizeof(ompi_osc_rdma_header_complete_t));
sizeof(ompi_osc_pt2pt_header_complete_t));
if (OMPI_SUCCESS != ret) goto cleanup;
}
OPAL_THREAD_LOCK(&module->lock);
/* start all requests */
ret = ompi_osc_rdma_frag_flush_all(module);
ret = ompi_osc_pt2pt_frag_flush_all(module);
if (OMPI_SUCCESS != ret) goto cleanup;
OPAL_THREAD_LOCK(&module->lock);
/* zero the fragment counts here to ensure they are zerod */
for (i = 0 ; i < ompi_group_size(module->sc_group) ; ++i) {
peer = module->peers + ranks[i];
module->epoch_outgoing_frag_count[ranks[i]] = 0;
peer->eager_send_active = false;
}
/* wait for outgoing requests to complete. Don't wait for incoming, as
@ -347,14 +354,14 @@ ompi_osc_rdma_complete(ompi_win_t *win)
module->sc_group = NULL;
/* unlock here, as group cleanup can take a while... */
OPAL_THREAD_UNLOCK(&(module->lock));
OPAL_THREAD_UNLOCK(&module->lock);
/* phase 2 cleanup group */
ompi_group_decrement_proc_count(group);
OBJ_RELEASE(group);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_complete complete"));
"ompi_osc_pt2pt_complete complete"));
free (ranks);
return OMPI_SUCCESS;
@ -362,21 +369,19 @@ ompi_osc_rdma_complete(ompi_win_t *win)
cleanup:
if (NULL != ranks) free(ranks);
OPAL_THREAD_UNLOCK(&(module->lock));
return ret;
}
int
ompi_osc_rdma_post(ompi_group_t *group,
ompi_osc_pt2pt_post(ompi_group_t *group,
int assert,
ompi_win_t *win)
{
int *ranks;
int ret = OMPI_SUCCESS;
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_rdma_header_post_t post_req;
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
ompi_osc_pt2pt_header_post_t post_req;
int my_rank = ompi_comm_rank(module->comm);
/* can't check for all access epoch here due to fence */
@ -385,7 +390,7 @@ ompi_osc_rdma_post(ompi_group_t *group,
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_post entering with group size %d...",
"ompi_osc_pt2pt_post entering with group size %d...",
ompi_group_size (group)));
/* save the group */
@ -422,19 +427,19 @@ ompi_osc_rdma_post(ompi_group_t *group,
/* shortcut for self */
if (my_rank == ranks[i]) {
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_rdma_complete self post"));
osc_rdma_incoming_post (module, my_rank);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_pt2pt_complete self post"));
osc_pt2pt_incoming_post (module, my_rank);
continue;
}
post_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_POST;
post_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
post_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_POST;
post_req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
post_req.windx = ompi_comm_get_cid(module->comm);
/* we don't want to send any data, since we're the exposure
epoch only, so use an unbuffered send */
ret = ompi_osc_rdma_control_send_unbuffered(module, ranks[i], &post_req,
sizeof(ompi_osc_rdma_header_post_t));
ret = ompi_osc_pt2pt_control_send_unbuffered(module, ranks[i], &post_req,
sizeof(ompi_osc_pt2pt_header_post_t));
if (OMPI_SUCCESS != ret) {
break;
}
@ -447,9 +452,9 @@ ompi_osc_rdma_post(ompi_group_t *group,
int
ompi_osc_rdma_wait(ompi_win_t *win)
ompi_osc_pt2pt_wait(ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
ompi_group_t *group;
if (NULL == module->pw_group) {
@ -457,7 +462,7 @@ ompi_osc_rdma_wait(ompi_win_t *win)
}
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_wait entering..."));
"ompi_osc_pt2pt_wait entering..."));
OPAL_THREAD_LOCK(&module->lock);
while (0 != module->num_complete_msgs ||
@ -476,17 +481,17 @@ ompi_osc_rdma_wait(ompi_win_t *win)
OBJ_RELEASE(group);
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_wait complete"));
"ompi_osc_pt2pt_wait complete"));
return OMPI_SUCCESS;
}
int
ompi_osc_rdma_test(ompi_win_t *win,
ompi_osc_pt2pt_test(ompi_win_t *win,
int *flag)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
ompi_group_t *group;
int ret = OMPI_SUCCESS;
@ -504,7 +509,6 @@ ompi_osc_rdma_test(ompi_win_t *win,
module->active_incoming_frag_count != module->active_incoming_frag_signal_count) {
*flag = 0;
ret = OMPI_SUCCESS;
goto cleanup;
} else {
*flag = 1;
@ -519,21 +523,21 @@ ompi_osc_rdma_test(ompi_win_t *win,
return OMPI_SUCCESS;
}
cleanup:
OPAL_THREAD_UNLOCK(&(module->lock));
return ret;
}
int osc_rdma_incoming_post (ompi_osc_rdma_module_t *module, int source)
int osc_pt2pt_incoming_post (ompi_osc_pt2pt_module_t *module, int source)
{
ompi_proc_t *source_proc = ompi_comm_peer_lookup (module->comm, source);
ompi_osc_pt2pt_peer_t *peer = module->peers + source;
OPAL_THREAD_LOCK(&module->lock);
/* verify that this proc is part of the current start group */
if (!module->sc_group || !group_contains_proc (module->sc_group, source_proc)) {
ompi_osc_rdma_pending_post_t *pending_post = OBJ_NEW(ompi_osc_rdma_pending_post_t);
ompi_osc_pt2pt_pending_post_t *pending_post = OBJ_NEW(ompi_osc_pt2pt_pending_post_t);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"received unexpected post message from %d. module->sc_group = %p, size = %d",
@ -547,6 +551,9 @@ int osc_rdma_incoming_post (ompi_osc_rdma_module_t *module, int source)
return OMPI_SUCCESS;
}
assert (!peer->eager_send_active);
peer->eager_send_active = true;
module->num_post_msgs++;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"received post message. num_post_msgs = %d", module->num_post_msgs));

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -25,10 +25,10 @@
#include <string.h>
#include "osc_rdma.h"
#include "osc_rdma_data_move.h"
#include "osc_rdma_frag.h"
#include "osc_rdma_request.h"
#include "osc_pt2pt.h"
#include "osc_pt2pt_data_move.h"
#include "osc_pt2pt_frag.h"
#include "osc_pt2pt_request.h"
#include "opal/threads/condition.h"
#include "opal/threads/mutex.h"
@ -55,11 +55,11 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in
struct ompi_communicator_t *comm, struct ompi_info_t *info,
int flavor, int *model);
ompi_osc_rdma_component_t mca_osc_rdma_component = {
ompi_osc_pt2pt_component_t mca_osc_pt2pt_component = {
{ /* ompi_osc_base_component_t */
{ /* ompi_base_component_t */
OMPI_OSC_BASE_VERSION_3_0_0,
"rdma",
"pt2pt",
OMPI_MAJOR_VERSION, /* MCA component major version */
OMPI_MINOR_VERSION, /* MCA component minor version */
OMPI_RELEASE_VERSION, /* MCA component release version */
@ -80,51 +80,51 @@ ompi_osc_rdma_component_t mca_osc_rdma_component = {
};
ompi_osc_rdma_module_t ompi_osc_rdma_module_template = {
ompi_osc_pt2pt_module_t ompi_osc_pt2pt_module_template = {
{
NULL, /* shared_query */
ompi_osc_rdma_attach,
ompi_osc_rdma_detach,
ompi_osc_rdma_free,
ompi_osc_pt2pt_attach,
ompi_osc_pt2pt_detach,
ompi_osc_pt2pt_free,
ompi_osc_rdma_put,
ompi_osc_rdma_get,
ompi_osc_rdma_accumulate,
ompi_osc_rdma_compare_and_swap,
ompi_osc_rdma_fetch_and_op,
ompi_osc_rdma_get_accumulate,
ompi_osc_pt2pt_put,
ompi_osc_pt2pt_get,
ompi_osc_pt2pt_accumulate,
ompi_osc_pt2pt_compare_and_swap,
ompi_osc_pt2pt_fetch_and_op,
ompi_osc_pt2pt_get_accumulate,
ompi_osc_rdma_rput,
ompi_osc_rdma_rget,
ompi_osc_rdma_raccumulate,
ompi_osc_rdma_rget_accumulate,
ompi_osc_pt2pt_rput,
ompi_osc_pt2pt_rget,
ompi_osc_pt2pt_raccumulate,
ompi_osc_pt2pt_rget_accumulate,
ompi_osc_rdma_fence,
ompi_osc_pt2pt_fence,
ompi_osc_rdma_start,
ompi_osc_rdma_complete,
ompi_osc_rdma_post,
ompi_osc_rdma_wait,
ompi_osc_rdma_test,
ompi_osc_pt2pt_start,
ompi_osc_pt2pt_complete,
ompi_osc_pt2pt_post,
ompi_osc_pt2pt_wait,
ompi_osc_pt2pt_test,
ompi_osc_rdma_lock,
ompi_osc_rdma_unlock,
ompi_osc_rdma_lock_all,
ompi_osc_rdma_unlock_all,
ompi_osc_pt2pt_lock,
ompi_osc_pt2pt_unlock,
ompi_osc_pt2pt_lock_all,
ompi_osc_pt2pt_unlock_all,
ompi_osc_rdma_sync,
ompi_osc_rdma_flush,
ompi_osc_rdma_flush_all,
ompi_osc_rdma_flush_local,
ompi_osc_rdma_flush_local_all,
ompi_osc_pt2pt_sync,
ompi_osc_pt2pt_flush,
ompi_osc_pt2pt_flush_all,
ompi_osc_pt2pt_flush_local,
ompi_osc_pt2pt_flush_local_all,
ompi_osc_rdma_set_info,
ompi_osc_rdma_get_info
ompi_osc_pt2pt_set_info,
ompi_osc_pt2pt_get_info
}
};
bool ompi_osc_rdma_no_locks;
bool ompi_osc_pt2pt_no_locks;
/* look up parameters for configuring this window. The code first
looks in the info structure passed by the user, then through mca
@ -157,7 +157,7 @@ check_config_value_bool(char *key, ompi_info_t *info)
return result;
info_not_found:
param = mca_base_var_find("ompi", "osc", "rdma", key);
param = mca_base_var_find("ompi", "osc", "pt2pt", key);
if (0 > param) return false;
ret = mca_base_var_get_value(param, &flag_value, NULL, NULL);
@ -177,8 +177,8 @@ component_open(void)
static int
component_register(void)
{
ompi_osc_rdma_no_locks = false;
(void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version,
ompi_osc_pt2pt_no_locks = false;
(void) mca_base_component_var_register(&mca_osc_pt2pt_component.super.osc_version,
"no_locks",
"Enable optimizations available only if MPI_LOCK is "
"not used. "
@ -186,38 +186,39 @@ component_register(void)
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_osc_rdma_no_locks);
&ompi_osc_pt2pt_no_locks);
mca_osc_rdma_component.buffer_size = 8192;
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "buffer_size",
mca_osc_pt2pt_component.buffer_size = 8192;
(void) mca_base_component_var_register (&mca_osc_pt2pt_component.super.osc_version, "buffer_size",
"Data transfers smaller than this limit may be coalesced before "
"being transferred (default: 8k)", MCA_BASE_VAR_TYPE_UNSIGNED_INT,
NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
&mca_osc_rdma_component.buffer_size);
&mca_osc_pt2pt_component.buffer_size);
return OMPI_SUCCESS;
}
static int component_progress (void)
{
ompi_osc_rdma_pending_t *pending, *next;
int count = opal_list_get_size (&mca_osc_pt2pt_component.pending_operations);
ompi_osc_pt2pt_pending_t *pending, *next;
if (0 == opal_list_get_size (&mca_osc_rdma_component.pending_operations)) {
if (0 == count) {
return 0;
}
/* process one incoming request */
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
OPAL_LIST_FOREACH_SAFE(pending, next, &mca_osc_rdma_component.pending_operations, ompi_osc_rdma_pending_t) {
OPAL_THREAD_LOCK(&mca_osc_pt2pt_component.pending_operations_lock);
OPAL_LIST_FOREACH_SAFE(pending, next, &mca_osc_pt2pt_component.pending_operations, ompi_osc_pt2pt_pending_t) {
int ret;
switch (pending->header.base.type) {
case OMPI_OSC_RDMA_HDR_TYPE_FLUSH_REQ:
ret = ompi_osc_rdma_process_flush (pending->module, pending->source,
case OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_REQ:
ret = ompi_osc_pt2pt_process_flush (pending->module, pending->source,
&pending->header.flush);
break;
case OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_REQ:
ret = ompi_osc_rdma_process_unlock (pending->module, pending->source,
case OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_REQ:
ret = ompi_osc_pt2pt_process_unlock (pending->module, pending->source,
&pending->header.unlock);
break;
default:
@ -227,11 +228,11 @@ static int component_progress (void)
}
if (OMPI_SUCCESS == ret) {
opal_list_remove_item (&mca_osc_rdma_component.pending_operations, &pending->super);
opal_list_remove_item (&mca_osc_pt2pt_component.pending_operations, &pending->super);
OBJ_RELEASE(pending);
}
}
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
OPAL_THREAD_UNLOCK(&mca_osc_pt2pt_component.pending_operations_lock);
return 1;
}
@ -242,23 +243,24 @@ component_init(bool enable_progress_threads,
{
int ret;
OBJ_CONSTRUCT(&mca_osc_rdma_component.lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_osc_rdma_component.pending_operations, opal_list_t);
OBJ_CONSTRUCT(&mca_osc_rdma_component.request_gc, opal_list_t);
OBJ_CONSTRUCT(&mca_osc_rdma_component.buffer_gc, opal_list_t);
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.pending_operations, opal_list_t);
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.pending_operations_lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_osc_rdma_component.modules,
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.modules,
opal_hash_table_t);
opal_hash_table_init(&mca_osc_rdma_component.modules, 2);
opal_hash_table_init(&mca_osc_pt2pt_component.modules, 2);
mca_osc_rdma_component.progress_enable = false;
mca_osc_rdma_component.module_count = 0;
mca_osc_pt2pt_component.progress_enable = false;
mca_osc_pt2pt_component.module_count = 0;
OBJ_CONSTRUCT(&mca_osc_rdma_component.frags, opal_free_list_t);
ret = opal_free_list_init(&mca_osc_rdma_component.frags,
sizeof(ompi_osc_rdma_frag_t),
OBJ_CLASS(ompi_osc_rdma_frag_t),
1, -1, 1);
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.frags, ompi_free_list_t);
ret = ompi_free_list_init_new (&mca_osc_pt2pt_component.frags,
sizeof(ompi_osc_pt2pt_frag_t), 8,
OBJ_CLASS(ompi_osc_pt2pt_frag_t),
mca_osc_pt2pt_component.buffer_size +
sizeof (ompi_osc_pt2pt_frag_header_t),
8, 1, -1, 1, 0);
if (OMPI_SUCCESS != ret) {
opal_output_verbose(1, ompi_osc_base_framework.framework_output,
"%s:%d: ompi_free_list_init failed: %d",
@ -266,10 +268,10 @@ component_init(bool enable_progress_threads,
return ret;
}
OBJ_CONSTRUCT(&mca_osc_rdma_component.requests, ompi_free_list_t);
ret = ompi_free_list_init(&mca_osc_rdma_component.requests,
sizeof(ompi_osc_rdma_request_t),
OBJ_CLASS(ompi_osc_rdma_request_t),
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.requests, ompi_free_list_t);
ret = ompi_free_list_init(&mca_osc_pt2pt_component.requests,
sizeof(ompi_osc_pt2pt_request_t),
OBJ_CLASS(ompi_osc_pt2pt_request_t),
0, -1, 32, NULL);
if (OMPI_SUCCESS != ret) {
opal_output_verbose(1, ompi_osc_base_framework.framework_output,
@ -287,24 +289,23 @@ component_finalize(void)
{
size_t num_modules;
if (mca_osc_rdma_component.progress_enable) {
if (mca_osc_pt2pt_component.progress_enable) {
opal_progress_unregister (component_progress);
}
if (0 !=
(num_modules = opal_hash_table_get_size(&mca_osc_rdma_component.modules))) {
(num_modules = opal_hash_table_get_size(&mca_osc_pt2pt_component.modules))) {
opal_output(ompi_osc_base_framework.framework_output,
"WARNING: There were %d Windows created but not freed.",
(int) num_modules);
}
OBJ_DESTRUCT(&mca_osc_rdma_component.frags);
OBJ_DESTRUCT(&mca_osc_rdma_component.modules);
OBJ_DESTRUCT(&mca_osc_rdma_component.lock);
OBJ_DESTRUCT(&mca_osc_rdma_component.requests);
OBJ_DESTRUCT(&mca_osc_rdma_component.pending_operations);
OBJ_DESTRUCT(&mca_osc_rdma_component.request_gc);
OBJ_DESTRUCT(&mca_osc_rdma_component.buffer_gc);
OBJ_DESTRUCT(&mca_osc_pt2pt_component.frags);
OBJ_DESTRUCT(&mca_osc_pt2pt_component.modules);
OBJ_DESTRUCT(&mca_osc_pt2pt_component.lock);
OBJ_DESTRUCT(&mca_osc_pt2pt_component.requests);
OBJ_DESTRUCT(&mca_osc_pt2pt_component.pending_operations);
OBJ_DESTRUCT(&mca_osc_pt2pt_component.pending_operations_lock);
return OMPI_SUCCESS;
}
@ -326,27 +327,21 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
struct ompi_communicator_t *comm, struct ompi_info_t *info,
int flavor, int *model)
{
ompi_osc_rdma_module_t *module = NULL;
ompi_osc_pt2pt_module_t *module = NULL;
int ret;
char *name;
bool no_locks = false;
/* We don't support shared windows; that's for the sm onesided
component */
if (MPI_WIN_FLAVOR_SHARED == flavor) return OMPI_ERR_NOT_SUPPORTED;
if (check_config_value_bool("no_locks", info)) {
no_locks = true;
ompi_osc_rdma_no_locks = true;
}
/* create module structure with all fields initialized to zero */
module = (ompi_osc_rdma_module_t*)
calloc(1, sizeof(ompi_osc_rdma_module_t));
module = (ompi_osc_pt2pt_module_t*)
calloc(1, sizeof(ompi_osc_pt2pt_module_t));
if (NULL == module) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
/* fill in the function pointer part */
memcpy(module, &ompi_osc_rdma_module_template,
memcpy(module, &ompi_osc_pt2pt_module_template,
sizeof(ompi_osc_base_module_t));
/* initialize the objects, so that always free in cleanup */
@ -354,10 +349,15 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
OBJ_CONSTRUCT(&module->cond, opal_condition_t);
OBJ_CONSTRUCT(&module->acc_lock, opal_mutex_t);
OBJ_CONSTRUCT(&module->queued_frags, opal_list_t);
OBJ_CONSTRUCT(&module->queued_frags_lock, opal_mutex_t);
OBJ_CONSTRUCT(&module->locks_pending, opal_list_t);
OBJ_CONSTRUCT(&module->locks_pending_lock, opal_mutex_t);
OBJ_CONSTRUCT(&module->outstanding_locks, opal_list_t);
OBJ_CONSTRUCT(&module->pending_acc, opal_list_t);
OBJ_CONSTRUCT(&module->pending_posts, opal_list_t);
OBJ_CONSTRUCT(&module->request_gc, opal_list_t);
OBJ_CONSTRUCT(&module->buffer_gc, opal_list_t);
OBJ_CONSTRUCT(&module->gc_lock, opal_mutex_t);
/* options */
/* FIX ME: should actually check this value... */
@ -385,14 +385,14 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
if (OMPI_SUCCESS != ret) goto cleanup;
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output,
"rdma component creating window with id %d",
"pt2pt component creating window with id %d",
ompi_comm_get_cid(module->comm)));
/* record my displacement unit. Always resolved at target */
module->disp_unit = disp_unit;
/* peer data */
module->peers = calloc(ompi_comm_size(comm), sizeof(ompi_osc_rdma_peer_t));
module->peers = calloc(ompi_comm_size(comm), sizeof(ompi_osc_pt2pt_peer_t));
if (NULL == module->peers) {
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
goto cleanup;
@ -405,20 +405,6 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
goto cleanup;
}
if (!no_locks) {
module->passive_incoming_frag_count = calloc(ompi_comm_size(comm), sizeof(uint32_t));
if (NULL == module->passive_incoming_frag_count) {
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
goto cleanup;
}
module->passive_incoming_frag_signal_count = calloc(ompi_comm_size(comm), sizeof(uint32_t));
if (NULL == module->passive_incoming_frag_signal_count) {
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
goto cleanup;
}
}
/* the statement below (from Brian) does not seem correct so disable active target on the
* window. if this end up being incorrect please revert this one change */
module->active_eager_send_active = false;
@ -429,43 +415,35 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
module->active_eager_send_active = true;
#endif
if (!no_locks) {
module->passive_eager_send_active = malloc(sizeof(bool) * ompi_comm_size(comm));
if (NULL == module->passive_eager_send_active) {
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
goto cleanup;
}
}
/* lock data */
if (check_config_value_bool("no_locks", info)) {
win->w_flags |= OMPI_WIN_NO_LOCKS;
}
/* update component data */
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
ret = opal_hash_table_set_value_uint32(&mca_osc_rdma_component.modules,
OPAL_THREAD_LOCK(&mca_osc_pt2pt_component.lock);
ret = opal_hash_table_set_value_uint32(&mca_osc_pt2pt_component.modules,
ompi_comm_get_cid(module->comm),
module);
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
OPAL_THREAD_UNLOCK(&mca_osc_pt2pt_component.lock);
if (OMPI_SUCCESS != ret) goto cleanup;
/* fill in window information */
*model = MPI_WIN_UNIFIED;
win->w_osc_module = (ompi_osc_base_module_t*) module;
asprintf(&name, "rdma window %d", ompi_comm_get_cid(module->comm));
asprintf(&name, "pt2pt window %d", ompi_comm_get_cid(module->comm));
ompi_win_set_name(win, name);
free(name);
/* sync memory - make sure all initialization completed */
opal_atomic_mb();
module->incoming_buffer = malloc (mca_osc_rdma_component.buffer_size + sizeof (ompi_osc_rdma_frag_header_t));
module->incoming_buffer = malloc (mca_osc_pt2pt_component.buffer_size + sizeof (ompi_osc_pt2pt_frag_header_t));
if (OPAL_UNLIKELY(NULL == module->incoming_buffer)) {
goto cleanup;
}
ret = ompi_osc_rdma_frag_start_receive (module);
ret = ompi_osc_pt2pt_frag_start_receive (module);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
goto cleanup;
}
@ -476,30 +454,30 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
module->comm->c_coll.coll_barrier_module);
if (OMPI_SUCCESS != ret) goto cleanup;
if (!mca_osc_rdma_component.progress_enable) {
if (!mca_osc_pt2pt_component.progress_enable) {
opal_progress_register (component_progress);
mca_osc_rdma_component.progress_enable = true;
mca_osc_pt2pt_component.progress_enable = true;
}
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output,
"done creating rdma window %d", ompi_comm_get_cid(module->comm)));
"done creating pt2pt window %d", ompi_comm_get_cid(module->comm)));
return OMPI_SUCCESS;
cleanup:
/* set the module so we properly cleanup */
win->w_osc_module = (ompi_osc_base_module_t*) module;
ompi_osc_rdma_free (win);
ompi_osc_pt2pt_free (win);
return ret;
}
int
ompi_osc_rdma_set_info(struct ompi_win_t *win, struct ompi_info_t *info)
ompi_osc_pt2pt_set_info(struct ompi_win_t *win, struct ompi_info_t *info)
{
ompi_osc_rdma_module_t *module =
(ompi_osc_rdma_module_t*) win->w_osc_module;
ompi_osc_pt2pt_module_t *module =
(ompi_osc_pt2pt_module_t*) win->w_osc_module;
/* enforce collectiveness... */
return module->comm->c_coll.coll_barrier(module->comm,
@ -508,7 +486,7 @@ ompi_osc_rdma_set_info(struct ompi_win_t *win, struct ompi_info_t *info)
int
ompi_osc_rdma_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used)
ompi_osc_pt2pt_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used)
{
ompi_info_t *info = OBJ_NEW(ompi_info_t);
if (NULL == info) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
@ -518,4 +496,4 @@ ompi_osc_rdma_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used)
return OMPI_SUCCESS;
}
OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_t, opal_list_item_t, NULL, NULL);
OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_pending_t, opal_list_item_t, NULL, NULL);

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -18,22 +18,22 @@
* $HEADER$
*/
#ifndef OMPI_MCA_OSC_RDMA_DATA_MOVE_H
#define OMPI_MCA_OSC_RDMA_DATA_MOVE_H
#ifndef OMPI_MCA_OSC_PT2PT_DATA_MOVE_H
#define OMPI_MCA_OSC_PT2PT_DATA_MOVE_H
#include "osc_rdma_header.h"
#include "osc_pt2pt_header.h"
int ompi_osc_rdma_control_send(ompi_osc_rdma_module_t *module,
int ompi_osc_pt2pt_control_send(ompi_osc_pt2pt_module_t *module,
int target,
void *data,
size_t len);
/**
* ompi_osc_rdma_control_send_unbuffered:
* ompi_osc_pt2pt_control_send_unbuffered:
*
* @short Send an unbuffered control message to a peer.
*
* @param[in] module - OSC RDMA module
* @param[in] module - OSC PT2PT module
* @param[in] target - Target rank
* @param[in] data - Data to send
* @param[in] len - Length of data
@ -45,11 +45,11 @@ int ompi_osc_rdma_control_send(ompi_osc_rdma_module_t *module,
* from its peer). The buffer specified by data will be available
* when this call returns.
*/
int ompi_osc_rdma_control_send_unbuffered (ompi_osc_rdma_module_t *module,
int ompi_osc_pt2pt_control_send_unbuffered (ompi_osc_pt2pt_module_t *module,
int target, void *data, size_t len);
/**
* ompi_osc_rdma_isend_w_cb:
* ompi_osc_pt2pt_isend_w_cb:
*
* @short Post a non-blocking send with a specified callback.
*
@ -66,11 +66,11 @@ int ompi_osc_rdma_control_send_unbuffered (ompi_osc_rdma_module_t *module,
* be called with the associated request. The context specified in ctx will be stored in
* the req_completion_cb_data member of the ompi_request_t for use by the callback.
*/
int ompi_osc_rdma_isend_w_cb (void *ptr, int count, ompi_datatype_t *datatype, int target, int tag,
int ompi_osc_pt2pt_isend_w_cb (void *ptr, int count, ompi_datatype_t *datatype, int target, int tag,
ompi_communicator_t *comm, ompi_request_complete_fn_t cb, void *ctx);
/**
* ompi_osc_rdma_irecv_w_cb:
* ompi_osc_pt2pt_irecv_w_cb:
*
* @short Post a non-blocking receive with a specified callback.
*
@ -89,49 +89,49 @@ int ompi_osc_rdma_isend_w_cb (void *ptr, int count, ompi_datatype_t *datatype, i
* request. The context specified in ctx will be stored in the req_completion_cb_data
* member of the ompi_request_t for use by the callback.
*/
int ompi_osc_rdma_irecv_w_cb (void *ptr, int count, ompi_datatype_t *datatype, int source, int tag,
int ompi_osc_pt2pt_irecv_w_cb (void *ptr, int count, ompi_datatype_t *datatype, int source, int tag,
ompi_communicator_t *comm, ompi_request_t **request_out,
ompi_request_complete_fn_t cb, void *ctx);
int ompi_osc_rdma_process_lock(ompi_osc_rdma_module_t* module,
int ompi_osc_pt2pt_process_lock(ompi_osc_pt2pt_module_t* module,
int source,
struct ompi_osc_rdma_header_lock_t* lock_header);
struct ompi_osc_pt2pt_header_lock_t* lock_header);
void ompi_osc_rdma_process_lock_ack(ompi_osc_rdma_module_t* module,
struct ompi_osc_rdma_header_lock_ack_t* lock_header);
void ompi_osc_pt2pt_process_lock_ack(ompi_osc_pt2pt_module_t* module,
struct ompi_osc_pt2pt_header_lock_ack_t* lock_header);
int ompi_osc_rdma_process_unlock(ompi_osc_rdma_module_t* module,
int ompi_osc_pt2pt_process_unlock(ompi_osc_pt2pt_module_t* module,
int source,
struct ompi_osc_rdma_header_unlock_t* lock_header);
int ompi_osc_rdma_process_flush (ompi_osc_rdma_module_t *module, int source,
ompi_osc_rdma_header_flush_t *flush_header);
struct ompi_osc_pt2pt_header_unlock_t* lock_header);
int ompi_osc_pt2pt_process_flush (ompi_osc_pt2pt_module_t *module, int source,
ompi_osc_pt2pt_header_flush_t *flush_header);
/**
* ompi_osc_rdma_process_unlock_ack:
* ompi_osc_pt2pt_process_unlock_ack:
*
* @short Process an incoming unlock acknowledgement.
*
* @param[in] module - OSC RDMA module
* @param[in] module - OSC PT2PT module
* @param[in] source - Source rank
* @param[in] unlock_ack_header - Incoming unlock ack header
*/
void ompi_osc_rdma_process_unlock_ack (ompi_osc_rdma_module_t *module, int source,
ompi_osc_rdma_header_unlock_ack_t *unlock_ack_header);
void ompi_osc_pt2pt_process_unlock_ack (ompi_osc_pt2pt_module_t *module, int source,
ompi_osc_pt2pt_header_unlock_ack_t *unlock_ack_header);
/**
* ompi_osc_rdma_process_flush_ack:
* ompi_osc_pt2pt_process_flush_ack:
*
* @short Process an incoming flush acknowledgement.
*
* @param[in] module - OSC RDMA module
* @param[in] module - OSC PT2PT module
* @param[in] source - Source rank
* @param[in] flush_ack_header - Incoming flush ack header
*/
void ompi_osc_rdma_process_flush_ack (ompi_osc_rdma_module_t *module, int source,
ompi_osc_rdma_header_flush_ack_t *flush_ack_header);
void ompi_osc_pt2pt_process_flush_ack (ompi_osc_pt2pt_module_t *module, int source,
ompi_osc_pt2pt_header_flush_ack_t *flush_ack_header);
/**
* ompi_osc_rdma_frag_start_receive:
* ompi_osc_pt2pt_frag_start_receive:
*
* @short Start receiving fragments on the OSC module.
*
@ -140,6 +140,6 @@ void ompi_osc_rdma_process_flush_ack (ompi_osc_rdma_module_t *module, int source
* @long This function starts receiving eager fragments on the module. The current
* implementation uses the pml to transfer eager fragments.
*/
int ompi_osc_rdma_frag_start_receive (ompi_osc_rdma_module_t *module);
int ompi_osc_pt2pt_frag_start_receive (ompi_osc_pt2pt_module_t *module);
#endif

203
ompi/mca/osc/pt2pt/osc_pt2pt_frag.c Обычный файл
Просмотреть файл

@ -0,0 +1,203 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "opal/class/opal_list.h"
#include "ompi/mca/osc/base/base.h"
#include "ompi/mca/pml/pml.h"
#include "osc_pt2pt.h"
#include "osc_pt2pt_frag.h"
#include "osc_pt2pt_data_move.h"
static void ompi_osc_pt2pt_frag_constructor (ompi_osc_pt2pt_frag_t *frag){
frag->buffer = frag->super.ptr;
}
OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_frag_t, ompi_free_list_item_t,
ompi_osc_pt2pt_frag_constructor, NULL);
static int frag_send_cb (ompi_request_t *request)
{
ompi_osc_pt2pt_frag_t *frag =
(ompi_osc_pt2pt_frag_t*) request->req_complete_cb_data;
ompi_osc_pt2pt_module_t *module = frag->module;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc pt2pt: frag_send complete to %d, frag = %p, request = %p",
frag->target, (void *) frag, (void *) request));
mark_outgoing_completion(module);
OMPI_FREE_LIST_RETURN_MT(&mca_osc_pt2pt_component.frags, &frag->super);
/* put this request on the garbage colletion list */
osc_pt2pt_gc_add_request (module, request);
return OMPI_SUCCESS;
}
static int
frag_send(ompi_osc_pt2pt_module_t *module,
ompi_osc_pt2pt_frag_t *frag)
{
int count;
count = (int)((uintptr_t) frag->top - (uintptr_t) frag->buffer);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc pt2pt: frag_send called to %d, frag = %p, count = %d",
frag->target, (void *) frag, count));
/* we need to signal now that a frag is outgoing to ensure the count sent
* with the unlock message is correct */
ompi_osc_signal_outgoing (module, frag->target, 1);
return ompi_osc_pt2pt_isend_w_cb (frag->buffer, count, MPI_BYTE, frag->target, OSC_PT2PT_FRAG_TAG,
module->comm, frag_send_cb, frag);
}
int
ompi_osc_pt2pt_frag_start(ompi_osc_pt2pt_module_t *module,
ompi_osc_pt2pt_frag_t *frag)
{
ompi_osc_pt2pt_peer_t *peer = module->peers + frag->target;
int ret;
assert(0 == frag->pending && peer->active_frag != frag);
/* we need to signal now that a frag is outgoing to ensure the count sent
* with the unlock message is correct */
ompi_osc_signal_outgoing (module, frag->target, 1);
/* if eager sends are not active, can't send yet, so buffer and
get out... */
if (!(peer->eager_send_active || module->all_access_epoch)) {
OPAL_THREAD_SCOPED_LOCK(&module->queued_frags_lock,
opal_list_append(&module->queued_frags, (opal_list_item_t *) frag));
return OMPI_SUCCESS;
}
ret = frag_send(module, frag);
opal_condition_broadcast(&module->cond);
return ret;
}
int
ompi_osc_pt2pt_frag_flush_target(ompi_osc_pt2pt_module_t *module, int target)
{
ompi_osc_pt2pt_frag_t *next, *frag = module->peers[target].active_frag;
int ret = OMPI_SUCCESS;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc pt2pt: frag flush target begin"));
/* flush the active frag */
if (NULL != frag) {
if (1 != frag->pending) {
/* communication going on while synchronizing; this is an rma usage bug */
return OMPI_ERR_RMA_SYNC;
}
if (opal_atomic_cmpset (&module->peers[target].active_frag, frag, NULL)) {
OPAL_THREAD_ADD32(&frag->pending, -1);
ret = ompi_osc_pt2pt_frag_start(module, frag);
if (OMPI_SUCCESS != ret) {
return ret;
}
}
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc pt2pt: frag flush target finished active frag"));
/* walk through the pending list and send */
OPAL_THREAD_LOCK(&module->queued_frags_lock);
if (opal_list_get_size (&module->queued_frags)) {
OPAL_LIST_FOREACH_SAFE(frag, next, &module->queued_frags, ompi_osc_pt2pt_frag_t) {
if (frag->target == target) {
opal_list_remove_item(&module->queued_frags, (opal_list_item_t *) frag);
ret = frag_send(module, frag);
if (OPAL_UNLIKELY(OMPI_SUCCESS != frag)) {
break;
}
}
}
}
OPAL_THREAD_UNLOCK(&module->queued_frags_lock);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc pt2pt: frag flush target finished"));
return ret;
}
int
ompi_osc_pt2pt_frag_flush_all(ompi_osc_pt2pt_module_t *module)
{
int ret = OMPI_SUCCESS;
int i;
ompi_osc_pt2pt_frag_t *frag, *next;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc pt2pt: frag flush all begin"));
/* flush the active frag */
for (i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
ompi_osc_pt2pt_frag_t *frag = module->peers[i].active_frag;
if (NULL != frag) {
if (1 != frag->pending) {
OPAL_THREAD_UNLOCK(&module->lock);
/* communication going on while synchronizing; this is a bug */
return OMPI_ERR_RMA_SYNC;
}
if (!opal_atomic_cmpset_ptr (&module->peers[i].active_frag, frag, NULL)) {
continue;
}
OPAL_THREAD_ADD32(&frag->pending, -1);
ret = ompi_osc_pt2pt_frag_start(module, frag);
if (OMPI_SUCCESS != ret) {
return ret;
}
}
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc pt2pt: frag flush all finished active frag"));
/* try to start all the queued frags */
OPAL_THREAD_LOCK(&module->queued_frags_lock);
if (opal_list_get_size (&module->queued_frags)) {
OPAL_LIST_FOREACH_SAFE(frag, next, &module->queued_frags, ompi_osc_pt2pt_frag_t) {
opal_list_remove_item(&module->queued_frags, (opal_list_item_t *) frag);
ret = frag_send(module, frag);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
break;
}
}
}
OPAL_THREAD_UNLOCK(&module->queued_frags_lock);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc pt2pt: frag flush all done"));
return ret;
}

143
ompi/mca/osc/pt2pt/osc_pt2pt_frag.h Обычный файл
Просмотреть файл

@ -0,0 +1,143 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OSC_PT2PT_FRAG_H
#define OSC_PT2PT_FRAG_H
#include "ompi/communicator/communicator.h"
#include "osc_pt2pt_header.h"
#include "osc_pt2pt_request.h"
#include "opal/align.h"
/** Communication buffer for packing messages */
struct ompi_osc_pt2pt_frag_t {
ompi_free_list_item_t super;
/* target rank of buffer */
int target;
unsigned char *buffer;
/* space remaining in buffer */
size_t remain_len;
/* start of unused space */
char *top;
/* Number of operations which have started writing into the frag, but not yet completed doing so */
int32_t pending;
ompi_osc_pt2pt_frag_header_t *header;
ompi_osc_pt2pt_module_t *module;
};
typedef struct ompi_osc_pt2pt_frag_t ompi_osc_pt2pt_frag_t;
OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_frag_t);
extern int ompi_osc_pt2pt_frag_start(ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_frag_t *buffer);
extern int ompi_osc_pt2pt_frag_flush_target(ompi_osc_pt2pt_module_t *module, int target);
extern int ompi_osc_pt2pt_frag_flush_all(ompi_osc_pt2pt_module_t *module);
/*
* Note: module lock must be held during this operation
*/
static inline int ompi_osc_pt2pt_frag_alloc(ompi_osc_pt2pt_module_t *module, int target,
size_t request_len, ompi_osc_pt2pt_frag_t **buffer,
char **ptr)
{
ompi_osc_pt2pt_frag_t *curr = module->peers[target].active_frag;
int ret;
/* osc pt2pt headers can have 64-bit values. these will need to be aligned
* on an 8-byte boundary on some architectures so we up align the allocation
* size here. */
request_len = OPAL_ALIGN(request_len, 8, size_t);
if (request_len > mca_osc_pt2pt_component.buffer_size) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
OPAL_THREAD_LOCK(&module->lock);
if (NULL == curr || curr->remain_len < request_len) {
ompi_free_list_item_t *item = NULL;
if (NULL != curr) {
curr->remain_len = 0;
module->peers[target].active_frag = NULL;
opal_atomic_mb ();
/* If there's something pending, the pending finish will
start the buffer. Otherwise, we need to start it now. */
if (0 == OPAL_THREAD_ADD32(&curr->pending, -1)) {
ret = ompi_osc_pt2pt_frag_start(module, curr);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
}
}
OMPI_FREE_LIST_GET_MT(&mca_osc_pt2pt_component.frags, item);
if (OPAL_UNLIKELY(NULL == item)) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
curr = module->peers[target].active_frag =
(ompi_osc_pt2pt_frag_t*) item;
curr->target = target;
curr->header = (ompi_osc_pt2pt_frag_header_t*) curr->buffer;
curr->top = (char*) (curr->header + 1);
curr->remain_len = mca_osc_pt2pt_component.buffer_size;
curr->module = module;
curr->pending = 1;
curr->header->base.type = OMPI_OSC_PT2PT_HDR_TYPE_FRAG;
curr->header->base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
if (module->passive_target_access_epoch) {
curr->header->base.flags |= OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET;
}
curr->header->source = ompi_comm_rank(module->comm);
curr->header->num_ops = 0;
curr->header->windx = ompi_comm_get_cid(module->comm);
if (curr->remain_len < request_len) {
OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
}
*ptr = curr->top;
*buffer = curr;
curr->top += request_len;
curr->remain_len -= request_len;
OPAL_THREAD_UNLOCK(&module->lock);
OPAL_THREAD_ADD32(&curr->pending, 1);
OPAL_THREAD_ADD32(&curr->header->num_ops, 1);
return OMPI_SUCCESS;
}
/*
* Note: module lock must be held for this operation
*/
static inline int ompi_osc_pt2pt_frag_finish(ompi_osc_pt2pt_module_t *module,
ompi_osc_pt2pt_frag_t* buffer)
{
if (0 == OPAL_THREAD_ADD32(&buffer->pending, -1)) {
return ompi_osc_pt2pt_frag_start(module, buffer);
}
return OMPI_SUCCESS;
}
#endif

189
ompi/mca/osc/pt2pt/osc_pt2pt_header.h Обычный файл
Просмотреть файл

@ -0,0 +1,189 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OMPI_MCA_OSC_PT2PT_HDR_H
#define OMPI_MCA_OSC_PT2PT_HDR_H
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#include "opal/types.h"
enum ompi_osc_pt2pt_hdr_type_t {
OMPI_OSC_PT2PT_HDR_TYPE_PUT = 0x01,
OMPI_OSC_PT2PT_HDR_TYPE_PUT_LONG = 0x02,
OMPI_OSC_PT2PT_HDR_TYPE_ACC = 0x03,
OMPI_OSC_PT2PT_HDR_TYPE_ACC_LONG = 0x04,
OMPI_OSC_PT2PT_HDR_TYPE_GET = 0x05,
OMPI_OSC_PT2PT_HDR_TYPE_CSWAP = 0x06,
OMPI_OSC_PT2PT_HDR_TYPE_CSWAP_LONG = 0x07,
OMPI_OSC_PT2PT_HDR_TYPE_GET_ACC = 0x08,
OMPI_OSC_PT2PT_HDR_TYPE_GET_ACC_LONG = 0x09,
OMPI_OSC_PT2PT_HDR_TYPE_COMPLETE = 0x10,
OMPI_OSC_PT2PT_HDR_TYPE_POST = 0x11,
OMPI_OSC_PT2PT_HDR_TYPE_LOCK_REQ = 0x12,
OMPI_OSC_PT2PT_HDR_TYPE_LOCK_ACK = 0x13,
OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_REQ = 0x14,
OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_ACK = 0x15,
OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_REQ = 0x16,
OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_ACK = 0x17,
OMPI_OSC_PT2PT_HDR_TYPE_FRAG = 0x20,
};
typedef enum ompi_osc_pt2pt_hdr_type_t ompi_osc_pt2pt_hdr_type_t;
#define OMPI_OSC_PT2PT_HDR_FLAG_NBO 0x01
#define OMPI_OSC_PT2PT_HDR_FLAG_VALID 0x02
#define OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET 0x04
#define OMPI_OSC_PT2PT_HDR_FLAG_LARGE_DATATYPE 0x08
struct ompi_osc_pt2pt_header_base_t {
/** fragment type. 8 bits */
uint8_t type;
/** fragment flags. 8 bits */
uint8_t flags;
};
typedef struct ompi_osc_pt2pt_header_base_t ompi_osc_pt2pt_header_base_t;
struct ompi_osc_pt2pt_header_put_t {
ompi_osc_pt2pt_header_base_t base;
uint16_t tag;
uint32_t count;
uint64_t len;
uint64_t displacement;
};
typedef struct ompi_osc_pt2pt_header_put_t ompi_osc_pt2pt_header_put_t;
struct ompi_osc_pt2pt_header_acc_t {
ompi_osc_pt2pt_header_base_t base;
uint16_t tag;
uint32_t count;
uint32_t op;
uint64_t len;
uint64_t displacement;
};
typedef struct ompi_osc_pt2pt_header_acc_t ompi_osc_pt2pt_header_acc_t;
struct ompi_osc_pt2pt_header_get_t {
ompi_osc_pt2pt_header_base_t base;
uint16_t tag;
uint32_t count;
uint64_t len;
uint64_t displacement;
};
typedef struct ompi_osc_pt2pt_header_get_t ompi_osc_pt2pt_header_get_t;
struct ompi_osc_pt2pt_header_complete_t {
ompi_osc_pt2pt_header_base_t base;
int frag_count;
};
typedef struct ompi_osc_pt2pt_header_complete_t ompi_osc_pt2pt_header_complete_t;
struct ompi_osc_pt2pt_header_cswap_t {
ompi_osc_pt2pt_header_base_t base;
uint16_t tag;
uint32_t len;
uint64_t displacement;
};
typedef struct ompi_osc_pt2pt_header_cswap_t ompi_osc_pt2pt_header_cswap_t;
struct ompi_osc_pt2pt_header_post_t {
ompi_osc_pt2pt_header_base_t base;
uint16_t windx;
};
typedef struct ompi_osc_pt2pt_header_post_t ompi_osc_pt2pt_header_post_t;
struct ompi_osc_pt2pt_header_lock_t {
ompi_osc_pt2pt_header_base_t base;
int32_t lock_type;
uint64_t lock_ptr;
};
typedef struct ompi_osc_pt2pt_header_lock_t ompi_osc_pt2pt_header_lock_t;
struct ompi_osc_pt2pt_header_lock_ack_t {
ompi_osc_pt2pt_header_base_t base;
uint16_t windx;
uint32_t source;
uint64_t lock_ptr;
};
typedef struct ompi_osc_pt2pt_header_lock_ack_t ompi_osc_pt2pt_header_lock_ack_t;
struct ompi_osc_pt2pt_header_unlock_t {
ompi_osc_pt2pt_header_base_t base;
int32_t lock_type;
uint32_t frag_count;
uint64_t lock_ptr;
};
typedef struct ompi_osc_pt2pt_header_unlock_t ompi_osc_pt2pt_header_unlock_t;
struct ompi_osc_pt2pt_header_unlock_ack_t {
ompi_osc_pt2pt_header_base_t base;
uint64_t lock_ptr;
};
typedef struct ompi_osc_pt2pt_header_unlock_ack_t ompi_osc_pt2pt_header_unlock_ack_t;
struct ompi_osc_pt2pt_header_flush_t {
ompi_osc_pt2pt_header_base_t base;
uint32_t frag_count;
uint64_t serial_number;
};
typedef struct ompi_osc_pt2pt_header_flush_t ompi_osc_pt2pt_header_flush_t;
struct ompi_osc_pt2pt_header_flush_ack_t {
ompi_osc_pt2pt_header_base_t base;
uint64_t serial_number;
};
typedef struct ompi_osc_pt2pt_header_flush_ack_t ompi_osc_pt2pt_header_flush_ack_t;
struct ompi_osc_pt2pt_frag_header_t {
ompi_osc_pt2pt_header_base_t base;
uint16_t windx; /* cid of communicator backing window (our window id) */
uint32_t source; /* rank in window of source process */
int32_t num_ops; /* number of operations in this buffer */
uint32_t pad; /* ensure the fragment header is a multiple of 8 bytes */
};
typedef struct ompi_osc_pt2pt_frag_header_t ompi_osc_pt2pt_frag_header_t;
union ompi_osc_pt2pt_header_t {
ompi_osc_pt2pt_header_base_t base;
ompi_osc_pt2pt_header_put_t put;
ompi_osc_pt2pt_header_acc_t acc;
ompi_osc_pt2pt_header_get_t get;
ompi_osc_pt2pt_header_complete_t complete;
ompi_osc_pt2pt_header_cswap_t cswap;
ompi_osc_pt2pt_header_post_t post;
ompi_osc_pt2pt_header_lock_t lock;
ompi_osc_pt2pt_header_lock_ack_t lock_ack;
ompi_osc_pt2pt_header_unlock_t unlock;
ompi_osc_pt2pt_header_unlock_ack_t unlock_ack;
ompi_osc_pt2pt_header_flush_t flush;
ompi_osc_pt2pt_header_flush_ack_t flush_ack;
ompi_osc_pt2pt_frag_header_t frag;
};
typedef union ompi_osc_pt2pt_header_t ompi_osc_pt2pt_header_t;
#endif /* OMPI_MCA_OSC_PT2PT_HDR_H */

Просмотреть файл

@ -20,7 +20,7 @@
#include "ompi_config.h"
#include "osc_rdma.h"
#include "osc_pt2pt.h"
#include "opal/threads/mutex.h"
#include "opal/mca/btl/btl.h"
@ -31,25 +31,24 @@
int
ompi_osc_rdma_attach(struct ompi_win_t *win, void *base, size_t len)
ompi_osc_pt2pt_attach(struct ompi_win_t *win, void *base, size_t len)
{
return OMPI_SUCCESS;
}
int
ompi_osc_rdma_detach(struct ompi_win_t *win, void *base)
ompi_osc_pt2pt_detach(struct ompi_win_t *win, void *base)
{
return OMPI_SUCCESS;
}
int
ompi_osc_rdma_free(ompi_win_t *win)
ompi_osc_pt2pt_free(ompi_win_t *win)
{
int ret = OMPI_SUCCESS;
ompi_osc_rdma_module_t *module = GET_MODULE(win);
opal_list_item_t *item;
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
if (NULL == module) {
return OMPI_SUCCESS;
@ -57,7 +56,7 @@ ompi_osc_rdma_free(ompi_win_t *win)
if (NULL != module->comm) {
opal_output_verbose(1, ompi_osc_base_framework.framework_output,
"rdma component destroying window with id %d",
"pt2pt component destroying window with id %d",
ompi_comm_get_cid(module->comm));
/* finish with a barrier */
@ -67,43 +66,38 @@ ompi_osc_rdma_free(ompi_win_t *win)
}
/* remove from component information */
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
opal_hash_table_remove_value_uint32(&mca_osc_rdma_component.modules,
ompi_comm_get_cid(module->comm));
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
OPAL_THREAD_SCOPED_LOCK(&mca_osc_pt2pt_component.lock,
opal_hash_table_remove_value_uint32(&mca_osc_pt2pt_component.modules,
ompi_comm_get_cid(module->comm)));
}
win->w_osc_module = NULL;
OBJ_DESTRUCT(&module->outstanding_locks);
OBJ_DESTRUCT(&module->locks_pending);
OBJ_DESTRUCT(&module->locks_pending_lock);
OBJ_DESTRUCT(&module->acc_lock);
OBJ_DESTRUCT(&module->cond);
OBJ_DESTRUCT(&module->lock);
/* it is erroneous to close a window with active operations on it so we should
* probably produce an error here instead of cleaning up */
while (NULL != (item = opal_list_remove_first (&module->pending_acc))) {
OBJ_RELEASE(item);
}
OPAL_LIST_DESTRUCT(&module->pending_acc);
OPAL_LIST_DESTRUCT(&module->pending_posts);
OPAL_LIST_DESTRUCT(&module->queued_frags);
OBJ_DESTRUCT(&module->queued_frags_lock);
OBJ_DESTRUCT(&module->pending_acc);
while (NULL != (item = opal_list_remove_first (&module->pending_posts))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&module->pending_posts);
osc_rdma_gc_clean ();
osc_pt2pt_gc_clean (module);
OPAL_LIST_DESTRUCT(&module->request_gc);
OPAL_LIST_DESTRUCT(&module->buffer_gc);
OBJ_DESTRUCT(&module->gc_lock);
if (NULL != module->peers) {
free(module->peers);
}
if (NULL != module->passive_eager_send_active) free(module->passive_eager_send_active);
if (NULL != module->passive_incoming_frag_count) free(module->passive_incoming_frag_count);
if (NULL != module->passive_incoming_frag_signal_count) free(module->passive_incoming_frag_signal_count);
if (NULL != module->epoch_outgoing_frag_count) free(module->epoch_outgoing_frag_count);
if (NULL != module->frag_request) {
module->frag_request->req_complete_cb = NULL;
ompi_request_cancel (module->frag_request);

925
ompi/mca/osc/pt2pt/osc_pt2pt_passive_target.c Обычный файл
Просмотреть файл

@ -0,0 +1,925 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "osc_pt2pt.h"
#include "osc_pt2pt_header.h"
#include "osc_pt2pt_data_move.h"
#include "osc_pt2pt_frag.h"
#include "mpi.h"
#include "opal/runtime/opal_progress.h"
#include "opal/threads/mutex.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/osc/base/base.h"
#include "opal/include/opal_stdint.h"
static bool ompi_osc_pt2pt_lock_try_acquire (ompi_osc_pt2pt_module_t* module, int source, int lock_type,
uint64_t serial_number);
/* target-side tracking of a lock request */
struct ompi_osc_pt2pt_pending_lock_t {
opal_list_item_t super;
int peer;
int lock_type;
uint64_t lock_ptr;
};
typedef struct ompi_osc_pt2pt_pending_lock_t ompi_osc_pt2pt_pending_lock_t;
OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_pending_lock_t, opal_list_item_t,
NULL, NULL);
/* origin-side tracking of a lock request */
struct ompi_osc_pt2pt_outstanding_lock_t {
opal_list_item_t super;
int target;
int assert;
bool flushing;
int32_t lock_acks_received;
int32_t unlock_acks_received;
int32_t flush_acks_received;
uint64_t serial_number;
int32_t type;
};
typedef struct ompi_osc_pt2pt_outstanding_lock_t ompi_osc_pt2pt_outstanding_lock_t;
OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_outstanding_lock_t, opal_list_item_t,
NULL, NULL);
static int ompi_osc_activate_next_lock (ompi_osc_pt2pt_module_t *module);
static inline int queue_lock (ompi_osc_pt2pt_module_t *module, int requestor, int lock_type, uint64_t lock_ptr);
static int ompi_osc_pt2pt_flush_lock (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_outstanding_lock_t *lock,
int target);
/**
* Find the first outstanding lock to a target.
*
* @param[in] module - OSC PT2PT module
* @param[in] target - Target rank
*
* @returns an outstanding lock on success
*
* This function traverses the outstanding_locks list in the module
* looking for a lock that matches target. The caller must hold the
* module lock.
*/
static inline ompi_osc_pt2pt_outstanding_lock_t *find_outstanding_lock_st (ompi_osc_pt2pt_module_t *module, int target)
{
ompi_osc_pt2pt_outstanding_lock_t *outstanding_lock, *lock = NULL;
OPAL_LIST_FOREACH(outstanding_lock, &module->outstanding_locks, ompi_osc_pt2pt_outstanding_lock_t) {
if (outstanding_lock->target == target) {
lock = outstanding_lock;
break;
}
}
return lock;
}
static inline ompi_osc_pt2pt_outstanding_lock_t *find_outstanding_lock (ompi_osc_pt2pt_module_t *module, int target)
{
ompi_osc_pt2pt_outstanding_lock_t *lock;
OPAL_THREAD_LOCK(&module->lock);
lock = find_outstanding_lock_st (module, target);
OPAL_THREAD_UNLOCK(&module->lock);
return lock;
}
static inline ompi_osc_pt2pt_outstanding_lock_t *find_outstanding_lock_by_serial (ompi_osc_pt2pt_module_t *module, uint64_t serial_number)
{
ompi_osc_pt2pt_outstanding_lock_t *outstanding_lock, *lock = NULL;
OPAL_THREAD_LOCK(&module->lock);
OPAL_LIST_FOREACH(outstanding_lock, &module->outstanding_locks, ompi_osc_pt2pt_outstanding_lock_t) {
if (outstanding_lock->serial_number == serial_number) {
lock = outstanding_lock;
break;
}
}
OPAL_THREAD_UNLOCK(&module->lock);
return lock;
}
static inline int ompi_osc_pt2pt_lock_self (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_outstanding_lock_t *lock)
{
const int my_rank = ompi_comm_rank (module->comm);
bool acquired = false;
acquired = ompi_osc_pt2pt_lock_try_acquire (module, my_rank, lock->type, (uint64_t) (uintptr_t) lock);
if (!acquired) {
/* queue the lock */
queue_lock (module, my_rank, lock->type, (uint64_t) (uintptr_t) lock);
/* If locking local, can't be non-blocking according to the
standard. We need to wait for the ack here. */
OPAL_THREAD_LOCK(&module->lock);
while (0 == lock->lock_acks_received) {
opal_condition_wait(&module->cond, &module->lock);
}
OPAL_THREAD_UNLOCK(&module->lock);
}
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"local lock aquired"));
return OMPI_SUCCESS;
}
static inline void ompi_osc_pt2pt_unlock_self (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_outstanding_lock_t *lock)
{
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_pt2pt_unlock_self: unlocking myself. lock state = %d", module->lock_status));
if (MPI_LOCK_EXCLUSIVE == lock->type) {
OPAL_THREAD_ADD32(&module->lock_status, 1);
ompi_osc_activate_next_lock (module);
} else if (0 == OPAL_THREAD_ADD32(&module->lock_status, -1)) {
ompi_osc_activate_next_lock (module);
}
/* need to ensure we make progress */
opal_progress();
OPAL_THREAD_ADD32(&lock->unlock_acks_received, 1);
}
static inline int ompi_osc_pt2pt_lock_remote (ompi_osc_pt2pt_module_t *module, int target, ompi_osc_pt2pt_outstanding_lock_t *lock)
{
ompi_osc_pt2pt_header_lock_t lock_req;
int ret;
/* generate a lock request */
lock_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_LOCK_REQ;
lock_req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID | OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET;
lock_req.lock_type = lock->type;
lock_req.lock_ptr = (uint64_t) (uintptr_t) lock;
ret = ompi_osc_pt2pt_control_send (module, target, &lock_req, sizeof (lock_req));
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
/* make sure the request gets sent, so we can start eager sending... */
ret = ompi_osc_pt2pt_frag_flush_target (module, target);
return ret;
}
static inline int ompi_osc_pt2pt_unlock_remote (ompi_osc_pt2pt_module_t *module, int target, ompi_osc_pt2pt_outstanding_lock_t *lock)
{
ompi_osc_pt2pt_header_unlock_t unlock_req;
int32_t frag_count = opal_atomic_swap_32 ((int32_t *) module->epoch_outgoing_frag_count + target, -1);
unlock_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_REQ;
unlock_req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID | OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET;
unlock_req.frag_count = frag_count;
unlock_req.lock_type = lock->type;
unlock_req.lock_ptr = (uint64_t) (uintptr_t) lock;
/* send control message with unlock request and count */
return ompi_osc_pt2pt_control_send (module, target, &unlock_req, sizeof (unlock_req));
}
static int ompi_osc_pt2pt_lock_internal_execute (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_outstanding_lock_t *lock)
{
int my_rank = ompi_comm_rank (module->comm);
int target = lock->target;
int assert = lock->assert;
int ret;
if (0 == (assert & MPI_MODE_NOCHECK)) {
if (my_rank != target && target != -1) {
ret = ompi_osc_pt2pt_lock_remote (module, target, lock);
} else {
ret = ompi_osc_pt2pt_lock_self (module, lock);
}
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
/* return */
return ret;
}
if (-1 == target) {
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
if (my_rank == i) {
continue;
}
ret = ompi_osc_pt2pt_lock_remote (module, i, lock);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
}
}
} else {
if (-1 == target) {
lock->lock_acks_received = ompi_comm_size(module->comm);
} else {
lock->lock_acks_received = 1;
}
}
return OMPI_SUCCESS;
}
static int ompi_osc_pt2pt_lock_internal (int lock_type, int target, int assert, ompi_win_t *win)
{
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
ompi_osc_pt2pt_outstanding_lock_t *lock;
ompi_osc_pt2pt_peer_t *peer = NULL;
int ret = OMPI_SUCCESS;
if (-1 != target) {
peer = module->peers + target;
}
/* Check if no_locks is set. TODO: we also need to track whether we are in an
* active target epoch. Fence can make this tricky to track. */
if (module->sc_group) {
return OMPI_ERR_RMA_SYNC;
}
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"osc pt2pt: lock %d %d", target, lock_type));
/* create lock item */
lock = OBJ_NEW(ompi_osc_pt2pt_outstanding_lock_t);
if (OPAL_UNLIKELY(NULL == lock)) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
lock->target = target;
lock->lock_acks_received = 0;
lock->unlock_acks_received = 0;
lock->serial_number = OPAL_THREAD_ADD64((int64_t *) &module->lock_serial_number, 1);
lock->type = lock_type;
lock->assert = assert;
/* delay all eager sends until we've heard back.. */
OPAL_THREAD_LOCK(&module->lock);
/* check for conflicting lock */
if (find_outstanding_lock_st (module, target)) {
OBJ_RELEASE(lock);
OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_ERR_RMA_CONFLICT;
}
/* when the lock ack returns we will be in an access epoch with this peer/all peers (target = -1) */
if (-1 == target) {
module->all_access_epoch = true;
} else {
peer->access_epoch = true;
}
module->passive_target_access_epoch = true;
opal_list_append(&module->outstanding_locks, &lock->super);
OPAL_THREAD_UNLOCK(&module->lock);
ret = ompi_osc_pt2pt_lock_internal_execute (module, lock);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
OPAL_THREAD_SCOPED_LOCK(&module->lock,
opal_list_remove_item(&module->outstanding_locks, &lock->super));
OBJ_RELEASE(lock);
}
return ret;
}
static int ompi_osc_pt2pt_unlock_internal (int target, ompi_win_t *win)
{
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
ompi_osc_pt2pt_outstanding_lock_t *lock = NULL;
int my_rank = ompi_comm_rank (module->comm);
ompi_osc_pt2pt_peer_t *peer = NULL;
int lock_acks_expected;
int ret = OMPI_SUCCESS;
if (-1 != target) {
lock_acks_expected = 1;
peer = module->peers + target;
} else {
lock_acks_expected = ompi_comm_size (module->comm);
}
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_pt2pt_unlock_internal: unlocking target %d", target));
OPAL_THREAD_LOCK(&module->lock);
lock = find_outstanding_lock_st (module, target);
if (OPAL_UNLIKELY(NULL == lock)) {
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_pt2pt_unlock: target %d is not locked in window %s",
target, win->w_name));
OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_ERR_RMA_SYNC;
}
opal_list_remove_item (&module->outstanding_locks, &lock->super);
/* wait until ack has arrived from target */
while (lock->lock_acks_received != lock_acks_expected) {
opal_condition_wait(&module->cond, &module->lock);
}
OPAL_THREAD_UNLOCK(&module->lock);
if (lock->assert & MPI_MODE_NOCHECK) {
/* flush intstead */
ompi_osc_pt2pt_flush_lock (module, lock, target);
} else if (my_rank != target) {
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"osc pt2pt: unlock %d, lock_acks_received = %d", target,
lock->lock_acks_received));
if (-1 == target) {
/* send unlock messages to all of my peers */
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
if (my_rank == i) {
continue;
}
ret = ompi_osc_pt2pt_unlock_remote (module, i, lock);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
}
} else {
ret = ompi_osc_pt2pt_unlock_remote (module, target, lock);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
}
/* start all sendreqs to target */
if (-1 == target) {
ret = ompi_osc_pt2pt_frag_flush_all (module);
} else {
ret = ompi_osc_pt2pt_frag_flush_target(module, target);
}
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
/* wait for unlock acks. this signals remote completion of fragments */
OPAL_THREAD_LOCK(&module->lock);
while (lock->unlock_acks_received != lock_acks_expected) {
opal_condition_wait(&module->cond, &module->lock);
}
OPAL_THREAD_UNLOCK(&module->lock);
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_pt2pt_unlock: unlock of %d complete", target));
}
if ((target == my_rank || target == -1) && !(lock->assert & MPI_MODE_NOCHECK)) {
ompi_osc_pt2pt_unlock_self (module, lock);
}
OPAL_THREAD_LOCK(&module->lock);
if (-1 != target) {
peer->access_epoch = false;
module->passive_target_access_epoch = false;
} else {
module->passive_target_access_epoch = false;
module->all_access_epoch = false;
}
OPAL_THREAD_UNLOCK(&module->lock);
OBJ_RELEASE(lock);
return ret;
}
int ompi_osc_pt2pt_lock(int lock_type, int target, int assert, ompi_win_t *win)
{
assert(target >= 0);
return ompi_osc_pt2pt_lock_internal (lock_type, target, assert, win);
}
int ompi_osc_pt2pt_unlock (int target, struct ompi_win_t *win)
{
return ompi_osc_pt2pt_unlock_internal (target, win);
}
int ompi_osc_pt2pt_lock_all(int assert, struct ompi_win_t *win)
{
return ompi_osc_pt2pt_lock_internal (MPI_LOCK_SHARED, -1, assert, win);
}
int ompi_osc_pt2pt_unlock_all (struct ompi_win_t *win)
{
return ompi_osc_pt2pt_unlock_internal (-1, win);
}
int ompi_osc_pt2pt_sync (struct ompi_win_t *win)
{
opal_progress();
return OMPI_SUCCESS;
}
static int ompi_osc_pt2pt_flush_lock (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_outstanding_lock_t *lock,
int target)
{
ompi_osc_pt2pt_header_flush_t flush_req;
int peer_count, ret, flush_count;
int my_rank = ompi_comm_rank (module->comm);
if (-1 == lock->target) {
peer_count = ompi_comm_size(module->comm);
} else {
peer_count = 1;
}
/* wait until ack has arrived from target, since we need to be
able to eager send before we can transfer all the data... */
OPAL_THREAD_LOCK(&module->lock);
while (peer_count > lock->lock_acks_received && lock->flushing) {
opal_condition_wait(&module->cond, &module->lock);
}
lock->flush_acks_received = 0;
lock->flushing = true;
OPAL_THREAD_UNLOCK(&module->lock);
flush_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_REQ;
flush_req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID | OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET;
flush_req.serial_number = lock->serial_number;
if (-1 == target) {
/* NTH: no local flush */
flush_count = ompi_comm_size(module->comm) - 1;
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
if (i == my_rank) {
continue;
}
flush_req.frag_count = opal_atomic_swap_32 ((int32_t *) module->epoch_outgoing_frag_count + i, -1);
/* send control message with flush request and count */
ret = ompi_osc_pt2pt_control_send (module, i, &flush_req, sizeof (flush_req));
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
/* start all sendreqs to target */
ret = ompi_osc_pt2pt_frag_flush_target (module, i);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
}
} else {
flush_req.frag_count = opal_atomic_swap_32 ((int32_t *) module->epoch_outgoing_frag_count + target, -1);
flush_count = 1;
/* send control message with flush request and count */
ret = ompi_osc_pt2pt_control_send (module, target, &flush_req, sizeof (flush_req));
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
/* start all sendreqs to target */
ret = ompi_osc_pt2pt_frag_flush_target (module, target);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
}
/* wait for all the requests and the flush ack (meaning remote completion) */
OPAL_THREAD_LOCK(&module->lock);
while (flush_count != lock->flush_acks_received) {
opal_condition_wait(&module->cond, &module->lock);
}
lock->flushing = false;
opal_condition_broadcast(&module->cond);
OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_SUCCESS;
}
int ompi_osc_pt2pt_flush (int target, struct ompi_win_t *win)
{
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
ompi_osc_pt2pt_outstanding_lock_t *lock;
int ret;
assert (0 <= target);
/* flush is only allowed from within a passive target epoch */
if (!module->passive_target_access_epoch) {
return OMPI_ERR_RMA_SYNC;
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_pt2pt_flush starting..."));
if (ompi_comm_rank (module->comm) == target) {
/* nothing to flush */
opal_progress ();
return OMPI_SUCCESS;
}
lock = find_outstanding_lock (module, target);
if (NULL == lock) {
lock = find_outstanding_lock (module, -1);
}
if (OPAL_UNLIKELY(NULL == lock)) {
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_pt2pt_flush: target %d is not locked in window %s",
target, win->w_name));
ret = OMPI_ERR_RMA_SYNC;
} else {
ret = ompi_osc_pt2pt_flush_lock (module, lock, target);
}
return ret;
}
int ompi_osc_pt2pt_flush_all (struct ompi_win_t *win)
{
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
ompi_osc_pt2pt_outstanding_lock_t *lock;
int ret = OMPI_SUCCESS;
/* flush is only allowed from within a passive target epoch */
if (OPAL_UNLIKELY(!module->passive_target_access_epoch ||
0 == opal_list_get_size (&module->outstanding_locks))) {
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_pt2pt_flush_all: no targets are locked in window %s",
win->w_name));
return OMPI_ERR_RMA_SYNC;
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_pt2pt_flush_all entering..."));
/* flush all locks */
OPAL_LIST_FOREACH(lock, &module->outstanding_locks, ompi_osc_pt2pt_outstanding_lock_t) {
ret = ompi_osc_pt2pt_flush_lock (module, lock, lock->target);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
break;
}
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_pt2pt_flush_all complete"));
return ret;
}
int ompi_osc_pt2pt_flush_local (int target, struct ompi_win_t *win)
{
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
int ret;
/* flush is only allowed from within a passive target epoch */
if (!module->passive_target_access_epoch) {
return OMPI_ERR_RMA_SYNC;
}
ret = ompi_osc_pt2pt_frag_flush_target(module, target);
if (OMPI_SUCCESS != ret) {
return ret;
}
/* wait for all the requests */
OPAL_THREAD_LOCK(&module->lock);
while (module->outgoing_frag_count != module->outgoing_frag_signal_count) {
opal_condition_wait(&module->cond, &module->lock);
}
OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_SUCCESS;
}
int ompi_osc_pt2pt_flush_local_all (struct ompi_win_t *win)
{
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
int ret = OMPI_SUCCESS;
/* flush is only allowed from within a passive target epoch */
if (!module->passive_target_access_epoch) {
return OMPI_ERR_RMA_SYNC;
}
ret = ompi_osc_pt2pt_frag_flush_all(module);
if (OMPI_SUCCESS != ret) {
return ret;
}
/* wait for all the requests */
OPAL_THREAD_LOCK(&module->lock);
while (module->outgoing_frag_count != module->outgoing_frag_signal_count) {
opal_condition_wait(&module->cond, &module->lock);
}
OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_SUCCESS;
}
/* target side operation to acknowledge to initiator side that the
lock is now held by the initiator */
static inline int activate_lock (ompi_osc_pt2pt_module_t *module, int requestor,
uint64_t lock_ptr)
{
ompi_osc_pt2pt_outstanding_lock_t *lock;
if (ompi_comm_rank (module->comm) != requestor) {
ompi_osc_pt2pt_header_lock_ack_t lock_ack;
lock_ack.base.type = OMPI_OSC_PT2PT_HDR_TYPE_LOCK_ACK;
lock_ack.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
lock_ack.source = ompi_comm_rank(module->comm);
lock_ack.windx = ompi_comm_get_cid(module->comm);
lock_ack.lock_ptr = lock_ptr;
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"osc pt2pt: sending lock to %d", requestor));
/* we don't want to send any data, since we're the exposure
epoch only, so use an unbuffered send */
return ompi_osc_pt2pt_control_send_unbuffered (module, requestor, &lock_ack, sizeof (lock_ack));
}
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"osc pt2pt: releasing local lock"));
lock = (ompi_osc_pt2pt_outstanding_lock_t *) (uintptr_t) lock_ptr;
if (OPAL_UNLIKELY(NULL == lock)) {
OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output,
"lock could not be located"));
}
OPAL_THREAD_ADD32(&lock->lock_acks_received, 1);
opal_condition_broadcast (&module->cond);
return OMPI_SUCCESS;
}
/* target side operation to create a pending lock request for a lock
request that could not be satisfied */
static inline int queue_lock (ompi_osc_pt2pt_module_t *module, int requestor,
int lock_type, uint64_t lock_ptr)
{
ompi_osc_pt2pt_pending_lock_t *pending =
OBJ_NEW(ompi_osc_pt2pt_pending_lock_t);
if (NULL == pending) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
pending->peer = requestor;
pending->lock_type = lock_type;
pending->lock_ptr = lock_ptr;
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"osc pt2pt: queueing lock request from %d", requestor));
OPAL_THREAD_SCOPED_LOCK(&module->locks_pending_lock, opal_list_append(&module->locks_pending, &pending->super));
return OMPI_SUCCESS;
}
static bool ompi_osc_pt2pt_lock_try_acquire (ompi_osc_pt2pt_module_t* module, int source, int lock_type, uint64_t lock_ptr)
{
bool queue = false;
if (MPI_LOCK_SHARED == lock_type) {
int32_t lock_status = module->lock_status;
do {
if (lock_status < 0) {
queue = true;
break;
}
if (opal_atomic_cmpset_32 (&module->lock_status, lock_status, lock_status + 1)) {
break;
}
lock_status = module->lock_status;
} while (1);
} else {
queue = !opal_atomic_cmpset_32 (&module->lock_status, 0, -1);
}
if (queue) {
return false;
}
activate_lock(module, source, lock_ptr);
/* activated the lock */
return true;
}
static int ompi_osc_activate_next_lock (ompi_osc_pt2pt_module_t *module) {
/* release any other pending locks we can */
ompi_osc_pt2pt_pending_lock_t *pending_lock, *next;
int ret = OMPI_SUCCESS;
OPAL_THREAD_LOCK(&module->locks_pending_lock);
OPAL_LIST_FOREACH_SAFE(pending_lock, next, &module->locks_pending,
ompi_osc_pt2pt_pending_lock_t) {
bool acquired = ompi_osc_pt2pt_lock_try_acquire (module, pending_lock->peer, pending_lock->lock_type,
pending_lock->lock_ptr);
if (!acquired) {
break;
}
opal_list_remove_item (&module->locks_pending, &pending_lock->super);
OBJ_RELEASE(pending_lock);
}
OPAL_THREAD_UNLOCK(&module->locks_pending_lock);
return ret;
}
/* target side function called when the initiator sends a lock
request. Lock will either be activated and acknowledged or
queued. */
int ompi_osc_pt2pt_process_lock (ompi_osc_pt2pt_module_t* module, int source,
ompi_osc_pt2pt_header_lock_t* lock_header)
{
bool acquired;
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_pt2pt_process_lock: processing lock request from %d. current lock state = %d",
source, module->lock_status));
acquired = ompi_osc_pt2pt_lock_try_acquire (module, source, lock_header->lock_type, lock_header->lock_ptr);
if (!acquired) {
queue_lock(module, source, lock_header->lock_type, lock_header->lock_ptr);
}
return OMPI_SUCCESS;
}
/* initiator-side function called when the target acks the lock
request. */
void ompi_osc_pt2pt_process_lock_ack (ompi_osc_pt2pt_module_t *module,
ompi_osc_pt2pt_header_lock_ack_t *lock_ack_header)
{
ompi_osc_pt2pt_peer_t *peer = module->peers + lock_ack_header->source;
ompi_osc_pt2pt_outstanding_lock_t *lock;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_pt2pt_process_unlock_ack: processing lock ack from %d for lock %" PRIu64,
lock_ack_header->source, lock_ack_header->lock_ptr));
lock = (ompi_osc_pt2pt_outstanding_lock_t *) (uintptr_t) lock_ack_header->lock_ptr;
assert (NULL != lock);
/* no need to hold the lock to set this */
peer->eager_send_active = true;
OPAL_THREAD_ADD32(&lock->lock_acks_received, 1);
opal_condition_broadcast(&module->cond);
}
void ompi_osc_pt2pt_process_flush_ack (ompi_osc_pt2pt_module_t *module, int source,
ompi_osc_pt2pt_header_flush_ack_t *flush_ack_header) {
ompi_osc_pt2pt_outstanding_lock_t *lock;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_pt2pt_process_unlock_ack: processing flush ack from %d for lock %" PRIu64,
source, flush_ack_header->serial_number));
/* NTH: need to verify that this will work as expected */
lock = find_outstanding_lock_by_serial (module, flush_ack_header->serial_number);
assert (NULL != lock);
OPAL_THREAD_ADD32(&lock->flush_acks_received, 1);
opal_condition_broadcast(&module->cond);
}
void ompi_osc_pt2pt_process_unlock_ack (ompi_osc_pt2pt_module_t *module, int source,
ompi_osc_pt2pt_header_unlock_ack_t *unlock_ack_header)
{
ompi_osc_pt2pt_peer_t *peer = module->peers + source;
ompi_osc_pt2pt_outstanding_lock_t *lock;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_pt2pt_process_unlock_ack: processing unlock ack from %d",
source));
/* NTH: need to verify that this will work as expected */
lock = (ompi_osc_pt2pt_outstanding_lock_t *) (intptr_t) unlock_ack_header->lock_ptr;
assert (NULL != lock);
peer->eager_send_active = false;
if (0 == OPAL_THREAD_ADD32(&lock->unlock_acks_received, 1)) {
opal_condition_broadcast(&module->cond);
}
}
/**
* Process an unlock request.
*
* @param[in] module - OSC PT2PT module
* @param[in] source - Source rank
* @param[in] unlock_header - Incoming unlock header
*
* This functions is the target-side function for handling an unlock
* request. Once all pending operations from the target are complete
* this functions sends an unlock acknowledgement then attempts to
* active a pending lock if the lock becomes free.
*/
int ompi_osc_pt2pt_process_unlock (ompi_osc_pt2pt_module_t *module, int source,
ompi_osc_pt2pt_header_unlock_t *unlock_header)
{
ompi_osc_pt2pt_header_unlock_ack_t unlock_ack;
ompi_osc_pt2pt_peer_t *peer = module->peers + source;
int ret;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_pt2pt_process_unlock entering (passive_incoming_frag_count: %d)...",
peer->passive_incoming_frag_count));
/* we cannot block when processing an incoming request */
if (0 != peer->passive_incoming_frag_count) {
return OMPI_ERR_WOULD_BLOCK;
}
unlock_ack.base.type = OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_ACK;
unlock_ack.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
unlock_ack.lock_ptr = unlock_header->lock_ptr;
ret = ompi_osc_pt2pt_control_send_unbuffered (module, source, &unlock_ack, sizeof (unlock_ack));
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
if (-1 == module->lock_status) {
OPAL_THREAD_ADD32(&module->lock_status, 1);
ompi_osc_activate_next_lock (module);
} else if (0 == OPAL_THREAD_ADD32(&module->lock_status, -1)) {
ompi_osc_activate_next_lock (module);
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc pt2pt: finished processing unlock fragment"));
return ret;
}
int ompi_osc_pt2pt_process_flush (ompi_osc_pt2pt_module_t *module, int source,
ompi_osc_pt2pt_header_flush_t *flush_header)
{
ompi_osc_pt2pt_peer_t *peer = module->peers + source;
ompi_osc_pt2pt_header_flush_ack_t flush_ack;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_pt2pt_process_flush entering (passive_incoming_frag_count: %d)...",
peer->passive_incoming_frag_count));
/* we cannot block when processing an incoming request */
if (0 != peer->passive_incoming_frag_count) {
return OMPI_ERR_WOULD_BLOCK;
}
flush_ack.base.type = OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_ACK;
flush_ack.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
flush_ack.serial_number = flush_header->serial_number;
return ompi_osc_pt2pt_control_send_unbuffered (module, source, &flush_ack, sizeof (flush_ack));
}

Просмотреть файл

@ -1,9 +1,12 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*
* Pending frags are fragments that have been received on the target,
@ -14,35 +17,35 @@
* message.
*/
#ifndef OSC_RDMA_PENDING_FRAG_H
#define OSC_RDMA_PENDING_FRAG_H
#ifndef OSC_PT2PT_PENDING_FRAG_H
#define OSC_PT2PT_PENDING_FRAG_H
/** Incoming fragment that has to be queued */
struct ompi_osc_rdma_pending_frag_t {
struct ompi_osc_pt2pt_pending_frag_t {
opal_list_item_t super;
/* This is a pointer to the top of the fragment (which is always
the header). Save as a header to make the casting a bit less
onerous during sequence number lookups. */
ompi_osc_rdma_frag_header_t *header;
ompi_osc_pt2pt_frag_header_t *header;
};
typedef struct ompi_osc_rdma_pending_frag_t ompi_osc_rdma_pending_frag_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_pending_frag_t);
typedef struct ompi_osc_pt2pt_pending_frag_t ompi_osc_pt2pt_pending_frag_t;
OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_pending_frag_t);
/*
* Note: module lock must be held during this operation
*/
static inline ompi_osc_rdma_pending_frag_t*
ompi_osc_rdma_pending_frag_create(ompi_osc_rdma_module_t *module,
static inline ompi_osc_pt2pt_pending_frag_t*
ompi_osc_pt2pt_pending_frag_create(ompi_osc_pt2pt_module_t *module,
void *ptr,
size_t size)
{
size_t total_size = sizeof(ompi_osc_rdma_pending_frag_t) + size;
ompi_osc_rdma_pending_frag_t *ret =
(ompi_osc_rdma_pending_frag_t*) malloc(total_size);
size_t total_size = sizeof(ompi_osc_pt2pt_pending_frag_t) + size;
ompi_osc_pt2pt_pending_frag_t *ret =
(ompi_osc_pt2pt_pending_frag_t*) malloc(total_size);
if (NULL == ret) return NULL;
OBJ_CONSTRUCT(&ret, ompi_osc_rdma_pending_frag_t);
OBJ_CONSTRUCT(&ret, ompi_osc_pt2pt_pending_frag_t);
memcpy(ret->header, ptr, size);
return ret;
@ -50,11 +53,11 @@ ompi_osc_rdma_pending_frag_create(ompi_osc_rdma_module_t *module,
/*
* Note: module lock must be held for this operation
* Note: module lock must be held for this operation
*/
static inline int
ompi_osc_rdma_pending_frag_destroy(ompi_osc_rdma_module_t *module,
ompi_osc_rdma_pending_frag_t* frag)
ompi_osc_pt2pt_pending_frag_destroy(ompi_osc_pt2pt_module_t *module,
ompi_osc_pt2pt_pending_frag_t* frag)
{
OBJ_DESTRUCT(&frag);
free(frag);

Просмотреть файл

@ -1,9 +1,12 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*/
@ -14,8 +17,8 @@
#include "ompi/mca/osc/base/base.h"
#include "ompi/mca/osc/base/osc_base_obj_convert.h"
#include "osc_rdma.h"
#include "osc_rdma_request.h"
#include "osc_pt2pt.h"
#include "osc_pt2pt_request.h"
static int
request_cancel(struct ompi_request_t *request, int complete)
@ -26,14 +29,14 @@ request_cancel(struct ompi_request_t *request, int complete)
static int
request_free(struct ompi_request_t **ompi_req)
{
ompi_osc_rdma_request_t *request =
(ompi_osc_rdma_request_t*) *ompi_req;
ompi_osc_pt2pt_request_t *request =
(ompi_osc_pt2pt_request_t*) *ompi_req;
if (true != request->super.req_complete) {
return MPI_ERR_REQUEST;
}
OMPI_OSC_RDMA_REQUEST_RETURN(request);
OMPI_OSC_PT2PT_REQUEST_RETURN(request);
*ompi_req = MPI_REQUEST_NULL;
@ -42,7 +45,7 @@ request_free(struct ompi_request_t **ompi_req)
static
void
request_construct(ompi_osc_rdma_request_t *request)
request_construct(ompi_osc_pt2pt_request_t *request)
{
request->super.req_type = OMPI_REQUEST_WIN;
request->super.req_status._cancelled = 0;
@ -50,7 +53,7 @@ request_construct(ompi_osc_rdma_request_t *request)
request->super.req_cancel = request_cancel;
}
OBJ_CLASS_INSTANCE(ompi_osc_rdma_request_t,
OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_request_t,
ompi_request_t,
request_construct,
NULL);

Просмотреть файл

@ -4,46 +4,46 @@
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*/
#ifndef OMPI_OSC_RDMA_REQUEST_H
#define OMPI_OSC_RDMA_REQUEST_H
#ifndef OMPI_OSC_PT2PT_REQUEST_H
#define OMPI_OSC_PT2PT_REQUEST_H
#include "osc_rdma.h"
#include "osc_pt2pt.h"
#include "ompi/request/request.h"
#include "opal/util/output.h"
struct ompi_osc_rdma_request_t {
struct ompi_osc_pt2pt_request_t {
ompi_request_t super;
int type;
void *origin_addr;
int origin_count;
struct ompi_datatype_t *origin_dt;
ompi_osc_rdma_module_t* module;
int outstanding_requests;
ompi_osc_pt2pt_module_t* module;
int32_t outstanding_requests;
bool internal;
};
typedef struct ompi_osc_rdma_request_t ompi_osc_rdma_request_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_request_t);
typedef struct ompi_osc_pt2pt_request_t ompi_osc_pt2pt_request_t;
OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_request_t);
/* REQUEST_ALLOC is only called from "top-level" functions (rdma_rput,
rdma_rget, etc.), so it's ok to spin here... */
#define OMPI_OSC_RDMA_REQUEST_ALLOC(win, req) \
/* REQUEST_ALLOC is only called from "top-level" functions (pt2pt_rput,
pt2pt_rget, etc.), so it's ok to spin here... */
#define OMPI_OSC_PT2PT_REQUEST_ALLOC(win, req) \
do { \
ompi_free_list_item_t *item; \
do { \
OMPI_FREE_LIST_GET_MT(&mca_osc_rdma_component.requests, item); \
OMPI_FREE_LIST_GET_MT(&mca_osc_pt2pt_component.requests, item); \
if (NULL == item) { \
opal_progress(); \
} \
} while (NULL == item); \
req = (ompi_osc_rdma_request_t*) item; \
req = (ompi_osc_pt2pt_request_t*) item; \
OMPI_REQUEST_INIT(&req->super, false); \
req->super.req_mpi_object.win = win; \
req->super.req_complete = false; \
@ -52,14 +52,14 @@ OBJ_CLASS_DECLARATION(ompi_osc_rdma_request_t);
req->internal = false; \
} while (0)
#define OMPI_OSC_RDMA_REQUEST_RETURN(req) \
#define OMPI_OSC_PT2PT_REQUEST_RETURN(req) \
do { \
OMPI_REQUEST_FINI(&(req)->super); \
OMPI_FREE_LIST_RETURN_MT(&mca_osc_rdma_component.requests, \
(ompi_free_list_item_t *) (req)); \
OMPI_REQUEST_FINI(&(req)->super); \
OMPI_FREE_LIST_RETURN_MT(&mca_osc_pt2pt_component.requests, \
(ompi_free_list_item_t *) (req)); \
} while (0)
static inline void ompi_osc_rdma_request_complete (ompi_osc_rdma_request_t *request, int mpi_error)
static inline void ompi_osc_pt2pt_request_complete (ompi_osc_pt2pt_request_t *request, int mpi_error)
{
if (!request->internal) {
request->super.req_status.MPI_ERROR = mpi_error;
@ -67,8 +67,8 @@ static inline void ompi_osc_rdma_request_complete (ompi_osc_rdma_request_t *requ
/* mark the request complete at the mpi level */
ompi_request_complete (&request->super, true);
} else {
OMPI_OSC_RDMA_REQUEST_RETURN (request);
OMPI_OSC_PT2PT_REQUEST_RETURN (request);
}
}
#endif /* OMPI_OSC_RDMA_REQUEST_H */
#endif /* OMPI_OSC_PT2PT_REQUEST_H */

Просмотреть файл

@ -1,26 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2013 Sandia National Laboratories. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_ompi_osc_rdma_POST_CONFIG(will_build)
# ----------------------------------------
# Only require the tag if we're actually going to be built, since bml
# is one of the ones frequently disabled for large installs.
AC_DEFUN([MCA_ompi_osc_rdma_POST_CONFIG], [
AS_IF([test "$1" = "1"], [OMPI_REQUIRE_ENDPOINT_TAG([BML])])
])dnl
# MCA_ompi_osc_rdma_CONFIG(action-if-can-compile,
# [action-if-cant-compile])
# ------------------------------------------------
# We can always build, unless we were explicitly disabled.
AC_DEFUN([MCA_ompi_osc_rdma_CONFIG],[
AC_CONFIG_FILES([ompi/mca/osc/rdma/Makefile])
[$1]
])dnl

Просмотреть файл

@ -1,197 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "opal/class/opal_list.h"
#include "ompi/mca/osc/base/base.h"
#include "ompi/mca/pml/pml.h"
#include "osc_rdma.h"
#include "osc_rdma_frag.h"
#include "osc_rdma_data_move.h"
static void ompi_osc_rdma_frag_constructor (ompi_osc_rdma_frag_t *frag){
frag->buffer = malloc (mca_osc_rdma_component.buffer_size + sizeof (ompi_osc_rdma_frag_header_t));
assert (frag->buffer);
}
static void ompi_osc_rdma_frag_destructor (ompi_osc_rdma_frag_t *frag) {
if (NULL != frag->buffer) {
free (frag->buffer);
}
}
OBJ_CLASS_INSTANCE(ompi_osc_rdma_frag_t, opal_list_item_t,
ompi_osc_rdma_frag_constructor, ompi_osc_rdma_frag_destructor);
static int frag_send_cb (ompi_request_t *request)
{
ompi_osc_rdma_frag_t *frag =
(ompi_osc_rdma_frag_t*) request->req_complete_cb_data;
ompi_osc_rdma_module_t *module = frag->module;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: frag_send complete to %d, frag = %p, request = %p",
frag->target, (void *) frag, (void *) request));
mark_outgoing_completion(module);
OPAL_FREE_LIST_RETURN(&mca_osc_rdma_component.frags, &frag->super);
/* put this request on the garbage colletion list */
osc_rdma_gc_add_request (request);
return OMPI_SUCCESS;
}
static int
frag_send(ompi_osc_rdma_module_t *module,
ompi_osc_rdma_frag_t *frag)
{
int count;
count = (int)((uintptr_t) frag->top - (uintptr_t) frag->buffer);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: frag_send called to %d, frag = %p, count = %d",
frag->target, (void *) frag, count));
return ompi_osc_rdma_isend_w_cb (frag->buffer, count, MPI_BYTE, frag->target, OSC_RDMA_FRAG_TAG,
module->comm, frag_send_cb, frag);
}
int
ompi_osc_rdma_frag_start(ompi_osc_rdma_module_t *module,
ompi_osc_rdma_frag_t *frag)
{
int ret;
assert(0 == frag->pending);
assert(module->peers[frag->target].active_frag != frag);
/* we need to signal now that a frag is outgoing to ensure the count sent
* with the unlock message is correct */
ompi_osc_signal_outgoing (module, frag->target, 1);
/* if eager sends are not active, can't send yet, so buffer and
get out... */
if (module->passive_target_access_epoch) {
if (!module->passive_eager_send_active[frag->target]) {
opal_list_append(&module->queued_frags, &frag->super);
return OMPI_SUCCESS;
}
} else {
if (!module->active_eager_send_active) {
opal_list_append(&module->queued_frags, &frag->super);
return OMPI_SUCCESS;
}
}
ret = frag_send(module, frag);
opal_condition_broadcast(&module->cond);
return ret;
}
int
ompi_osc_rdma_frag_flush_target(ompi_osc_rdma_module_t *module, int target)
{
int ret = OMPI_SUCCESS;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: frag flush target begin"));
/* flush the active frag */
if (NULL != module->peers[target].active_frag) {
ompi_osc_rdma_frag_t *frag = module->peers[target].active_frag;
if (0 != frag->pending) {
/* communication going on while synchronizing; this is a bug */
return OMPI_ERR_RMA_SYNC;
}
module->peers[target].active_frag = NULL;
ret = ompi_osc_rdma_frag_start(module, frag);
if (OMPI_SUCCESS != ret) return ret;
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: frag flush target finished active frag"));
/* walk through the pending list and send */
ompi_osc_rdma_frag_t *frag, *next;
OPAL_LIST_FOREACH_SAFE(frag, next, &module->queued_frags, ompi_osc_rdma_frag_t) {
if (frag->target == target) {
opal_list_remove_item(&module->queued_frags, &frag->super);
ret = frag_send(module, frag);
if (OMPI_SUCCESS != ret) return ret;
}
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: frag flush target finished"));
return OMPI_SUCCESS;
}
int
ompi_osc_rdma_frag_flush_all(ompi_osc_rdma_module_t *module)
{
int ret = OMPI_SUCCESS;
int i;
ompi_osc_rdma_frag_t *frag, *next;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: frag flush all begin"));
/* flush the active frag */
for (i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
if (NULL != module->peers[i].active_frag) {
ompi_osc_rdma_frag_t *frag = module->peers[i].active_frag;
if (0 != frag->pending) {
/* communication going on while synchronizing; this is a bug */
return OMPI_ERR_RMA_SYNC;
}
module->peers[i].active_frag = NULL;
ret = ompi_osc_rdma_frag_start(module, frag);
if (OMPI_SUCCESS != ret) return ret;
}
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: frag flush all finished active frag"));
/* try to start all the queued frags */
OPAL_LIST_FOREACH_SAFE(frag, next, &module->queued_frags, ompi_osc_rdma_frag_t) {
opal_list_remove_item(&module->queued_frags, &frag->super);
ret = frag_send(module, frag);
if (OMPI_SUCCESS != ret) {
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: failure for frag send: %d", ret));
return ret;
}
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: frag flush all done"));
return OMPI_SUCCESS;
}

Просмотреть файл

@ -1,138 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OSC_RDMA_FRAG_H
#define OSC_RDMA_FRAG_H
#include "ompi/communicator/communicator.h"
#include "osc_rdma_header.h"
#include "osc_rdma_request.h"
#include "opal/align.h"
/** Communication buffer for packing messages */
struct ompi_osc_rdma_frag_t {
opal_list_item_t super;
/* target rank of buffer */
int target;
unsigned char *buffer;
/* space remaining in buffer */
size_t remain_len;
/* start of unused space */
char *top;
/* Number of operations which have started writing into the frag, but not yet completed doing so */
int pending;
ompi_osc_rdma_frag_header_t *header;
ompi_osc_rdma_module_t *module;
};
typedef struct ompi_osc_rdma_frag_t ompi_osc_rdma_frag_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_frag_t);
extern int ompi_osc_rdma_frag_start(ompi_osc_rdma_module_t *module, ompi_osc_rdma_frag_t *buffer);
extern int ompi_osc_rdma_frag_flush_target(ompi_osc_rdma_module_t *module, int target);
extern int ompi_osc_rdma_frag_flush_all(ompi_osc_rdma_module_t *module);
/*
* Note: module lock must be held during this operation
*/
static inline int ompi_osc_rdma_frag_alloc(ompi_osc_rdma_module_t *module, int target,
size_t request_len, ompi_osc_rdma_frag_t **buffer,
char **ptr)
{
ompi_osc_rdma_frag_t *curr = module->peers[target].active_frag;
int ret;
/* osc rdma headers can have 64-bit values. these will need to be aligned
* on an 8-byte boundary on some architectures so we up align the allocation
* size here. */
request_len = OPAL_ALIGN(request_len, 8, size_t);
if (request_len > mca_osc_rdma_component.buffer_size) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
if (NULL == curr || curr->remain_len < request_len) {
opal_free_list_item_t *item;
if (NULL != curr) {
curr->remain_len = 0;
/* If there's something pending, the pending finish will
start the buffer. Otherwise, we need to start it now. */
if (0 == curr->pending) {
module->peers[target].active_frag = NULL;
ret = ompi_osc_rdma_frag_start(module, curr);
}
}
OPAL_FREE_LIST_GET(&mca_osc_rdma_component.frags,
item, ret);
if (OMPI_SUCCESS != ret) return ret;
curr = module->peers[target].active_frag =
(ompi_osc_rdma_frag_t*) item;
curr->target = target;
curr->header = (ompi_osc_rdma_frag_header_t*) curr->buffer;
curr->top = (char*) (curr->header + 1);
curr->remain_len = mca_osc_rdma_component.buffer_size;
curr->module = module;
curr->pending = 0;
curr->header->base.type = OMPI_OSC_RDMA_HDR_TYPE_FRAG;
curr->header->base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
if (module->passive_target_access_epoch) {
curr->header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET;
}
curr->header->source = ompi_comm_rank(module->comm);
curr->header->num_ops = 0;
curr->header->windx = ompi_comm_get_cid(module->comm);
if (curr->remain_len < request_len) {
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
}
*ptr = curr->top;
*buffer = curr;
curr->top += request_len;
curr->remain_len -= request_len;
curr->pending++;
curr->header->num_ops++;
return OMPI_SUCCESS;
}
/*
* Note: module lock must be held for this operation
*/
static inline int ompi_osc_rdma_frag_finish(ompi_osc_rdma_module_t *module,
ompi_osc_rdma_frag_t* buffer)
{
if (0 == --buffer->pending && 0 == buffer->remain_len) {
if (OPAL_LIKELY(buffer == module->peers[buffer->target].active_frag)) {
/* this is the active fragment. need to set the current fragment to null
* or it will be started multiple times */
module->peers[buffer->target].active_frag = NULL;
}
return ompi_osc_rdma_frag_start(module, buffer);
}
return OMPI_SUCCESS;
}
#endif

Просмотреть файл

@ -1,187 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OMPI_MCA_OSC_RDMA_HDR_H
#define OMPI_MCA_OSC_RDMA_HDR_H
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#include "opal/types.h"
enum ompi_osc_rdma_hdr_type_t {
OMPI_OSC_RDMA_HDR_TYPE_PUT = 0x01,
OMPI_OSC_RDMA_HDR_TYPE_PUT_LONG = 0x02,
OMPI_OSC_RDMA_HDR_TYPE_ACC = 0x03,
OMPI_OSC_RDMA_HDR_TYPE_ACC_LONG = 0x04,
OMPI_OSC_RDMA_HDR_TYPE_GET = 0x05,
OMPI_OSC_RDMA_HDR_TYPE_CSWAP = 0x06,
OMPI_OSC_RDMA_HDR_TYPE_CSWAP_LONG = 0x07,
OMPI_OSC_RDMA_HDR_TYPE_GET_ACC = 0x08,
OMPI_OSC_RDMA_HDR_TYPE_GET_ACC_LONG = 0x09,
OMPI_OSC_RDMA_HDR_TYPE_COMPLETE = 0x10,
OMPI_OSC_RDMA_HDR_TYPE_POST = 0x11,
OMPI_OSC_RDMA_HDR_TYPE_LOCK_REQ = 0x12,
OMPI_OSC_RDMA_HDR_TYPE_LOCK_ACK = 0x13,
OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_REQ = 0x14,
OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_ACK = 0x15,
OMPI_OSC_RDMA_HDR_TYPE_FLUSH_REQ = 0x16,
OMPI_OSC_RDMA_HDR_TYPE_FLUSH_ACK = 0x17,
OMPI_OSC_RDMA_HDR_TYPE_FRAG = 0x20,
};
typedef enum ompi_osc_rdma_hdr_type_t ompi_osc_rdma_hdr_type_t;
#define OMPI_OSC_RDMA_HDR_FLAG_NBO 0x01
#define OMPI_OSC_RDMA_HDR_FLAG_VALID 0x02
#define OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET 0x04
#define OMPI_OSC_RDMA_HDR_FLAG_LARGE_DATATYPE 0x08
struct ompi_osc_rdma_header_base_t {
/** fragment type. 8 bits */
uint8_t type;
/** fragment flags. 8 bits */
uint8_t flags;
};
typedef struct ompi_osc_rdma_header_base_t ompi_osc_rdma_header_base_t;
struct ompi_osc_rdma_header_put_t {
ompi_osc_rdma_header_base_t base;
uint16_t tag;
uint32_t count;
uint64_t len;
uint64_t displacement;
};
typedef struct ompi_osc_rdma_header_put_t ompi_osc_rdma_header_put_t;
struct ompi_osc_rdma_header_acc_t {
ompi_osc_rdma_header_base_t base;
uint16_t tag;
uint32_t count;
uint32_t op;
uint64_t len;
uint64_t displacement;
};
typedef struct ompi_osc_rdma_header_acc_t ompi_osc_rdma_header_acc_t;
struct ompi_osc_rdma_header_get_t {
ompi_osc_rdma_header_base_t base;
uint16_t tag;
uint32_t count;
uint64_t len;
uint64_t displacement;
};
typedef struct ompi_osc_rdma_header_get_t ompi_osc_rdma_header_get_t;
struct ompi_osc_rdma_header_complete_t {
ompi_osc_rdma_header_base_t base;
int frag_count;
};
typedef struct ompi_osc_rdma_header_complete_t ompi_osc_rdma_header_complete_t;
struct ompi_osc_rdma_header_cswap_t {
ompi_osc_rdma_header_base_t base;
uint16_t tag;
uint32_t len;
uint64_t displacement;
};
typedef struct ompi_osc_rdma_header_cswap_t ompi_osc_rdma_header_cswap_t;
struct ompi_osc_rdma_header_post_t {
ompi_osc_rdma_header_base_t base;
uint16_t windx;
};
typedef struct ompi_osc_rdma_header_post_t ompi_osc_rdma_header_post_t;
struct ompi_osc_rdma_header_lock_t {
ompi_osc_rdma_header_base_t base;
int32_t lock_type;
uint64_t serial_number;
};
typedef struct ompi_osc_rdma_header_lock_t ompi_osc_rdma_header_lock_t;
struct ompi_osc_rdma_header_lock_ack_t {
ompi_osc_rdma_header_base_t base;
uint16_t windx;
uint32_t source;
uint64_t serial_number;
};
typedef struct ompi_osc_rdma_header_lock_ack_t ompi_osc_rdma_header_lock_ack_t;
struct ompi_osc_rdma_header_unlock_t {
ompi_osc_rdma_header_base_t base;
int32_t lock_type;
uint32_t frag_count;
};
typedef struct ompi_osc_rdma_header_unlock_t ompi_osc_rdma_header_unlock_t;
struct ompi_osc_rdma_header_unlock_ack_t {
ompi_osc_rdma_header_base_t base;
};
typedef struct ompi_osc_rdma_header_unlock_ack_t ompi_osc_rdma_header_unlock_ack_t;
struct ompi_osc_rdma_header_flush_t {
ompi_osc_rdma_header_base_t base;
uint32_t frag_count;
uint64_t serial_number;
};
typedef struct ompi_osc_rdma_header_flush_t ompi_osc_rdma_header_flush_t;
struct ompi_osc_rdma_header_flush_ack_t {
ompi_osc_rdma_header_base_t base;
uint64_t serial_number;
};
typedef struct ompi_osc_rdma_header_flush_ack_t ompi_osc_rdma_header_flush_ack_t;
struct ompi_osc_rdma_frag_header_t {
ompi_osc_rdma_header_base_t base;
uint16_t windx; /* cid of communicator backing window (our window id) */
uint32_t source; /* rank in window of source process */
uint16_t num_ops; /* number of operations in this buffer */
uint16_t pad[3]; /* ensure the fragment header is a multiple of 8 bytes */
};
typedef struct ompi_osc_rdma_frag_header_t ompi_osc_rdma_frag_header_t;
union ompi_osc_rdma_header_t {
ompi_osc_rdma_header_base_t base;
ompi_osc_rdma_header_put_t put;
ompi_osc_rdma_header_acc_t acc;
ompi_osc_rdma_header_get_t get;
ompi_osc_rdma_header_complete_t complete;
ompi_osc_rdma_header_cswap_t cswap;
ompi_osc_rdma_header_post_t post;
ompi_osc_rdma_header_lock_t lock;
ompi_osc_rdma_header_lock_ack_t lock_ack;
ompi_osc_rdma_header_unlock_t unlock;
ompi_osc_rdma_header_unlock_ack_t unlock_ack;
ompi_osc_rdma_header_flush_t flush;
ompi_osc_rdma_header_flush_ack_t flush_ack;
ompi_osc_rdma_frag_header_t frag;
};
typedef union ompi_osc_rdma_header_t ompi_osc_rdma_header_t;
#endif /* OMPI_MCA_OSC_RDMA_HDR_H */

Просмотреть файл

@ -1,47 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/*
* utility functions for dealing with remote datatype and op structures
*/
/**
* Convert a window index number into a module instance.
*/
static inline ompi_osc_rdma_module_t*
ompi_osc_rdma_windx_to_module(uint32_t windx)
{
int ret;
ompi_osc_rdma_module_t *module;
/* find the right module and dispatch */
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
ret = opal_hash_table_get_value_uint32(&mca_osc_rdma_component.modules,
windx,
(void**) (&module));
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
if (OMPI_SUCCESS != ret) {
opal_output(0, "Could not translate windx %d to a local MPI_Win instance",
windx);
return NULL;
}
return module;
}

Просмотреть файл

@ -1,966 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "osc_rdma.h"
#include "osc_rdma_header.h"
#include "osc_rdma_data_move.h"
#include "osc_rdma_frag.h"
#include "mpi.h"
#include "opal/runtime/opal_progress.h"
#include "opal/threads/mutex.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/osc/base/base.h"
#include "opal/include/opal_stdint.h"
/* target-side tracking of a lock request */
struct ompi_osc_rdma_pending_lock_t {
opal_list_item_t super;
int peer;
int lock_type;
uint64_t serial_number;
};
typedef struct ompi_osc_rdma_pending_lock_t ompi_osc_rdma_pending_lock_t;
OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_lock_t, opal_list_item_t,
NULL, NULL);
/* origin-side tracking of a lock request */
struct ompi_osc_rdma_outstanding_lock_t {
opal_list_item_t super;
int target;
int32_t lock_acks_received;
int32_t unlock_acks_received;
int32_t flush_acks_received;
uint64_t serial_number;
int32_t type;
};
typedef struct ompi_osc_rdma_outstanding_lock_t ompi_osc_rdma_outstanding_lock_t;
OBJ_CLASS_INSTANCE(ompi_osc_rdma_outstanding_lock_t, opal_list_item_t,
NULL, NULL);
static int ompi_osc_activate_next_lock (ompi_osc_rdma_module_t *module);
static inline int queue_lock (ompi_osc_rdma_module_t *module, int requestor,
int lock_type, uint64_t serial_number);
/**
* Find the first outstanding lock to a target.
*
* @param[in] module - OSC RDMA module
* @param[in] target - Target rank
*
* @returns an outstanding lock on success
*
* This function traverses the outstanding_locks list in the module
* looking for a lock that matches target. The caller must hold the
* module lock.
*/
static inline ompi_osc_rdma_outstanding_lock_t *find_outstanding_lock (ompi_osc_rdma_module_t *module, int target)
{
ompi_osc_rdma_outstanding_lock_t *lock;
OPAL_LIST_FOREACH(lock, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) {
if (lock->target == target) {
return lock;
}
}
return NULL;
}
static inline ompi_osc_rdma_outstanding_lock_t *find_outstanding_lock_by_serial (ompi_osc_rdma_module_t *module, uint64_t serial_number)
{
ompi_osc_rdma_outstanding_lock_t *lock;
OPAL_LIST_FOREACH(lock, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) {
if (lock->serial_number == serial_number) {
return lock;
}
}
return NULL;
}
static inline int ompi_osc_rdma_lock_self (ompi_osc_rdma_module_t *module, ompi_osc_rdma_outstanding_lock_t *lock)
{
const int my_rank = ompi_comm_rank (module->comm);
if ((MPI_LOCK_SHARED == lock->type && MPI_LOCK_EXCLUSIVE != module->lock_status) ||
(MPI_LOCK_EXCLUSIVE == lock->type && 0 == module->lock_status)) {
/* we can aquire the lock immediately */
module->lock_status = lock->type;
if (MPI_LOCK_SHARED == lock->type) {
module->shared_count++;
}
lock->lock_acks_received++;
} else {
/* queue the lock */
queue_lock (module, my_rank, lock->type, lock->serial_number);
}
/* If locking local, can't be non-blocking according to the
standard. We need to wait for the ack here. */
while (0 == lock->lock_acks_received) {
opal_condition_wait(&module->cond, &module->lock);
}
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"local lock aquired"));
return OMPI_SUCCESS;
}
static inline void ompi_osc_rdma_unlock_self (ompi_osc_rdma_module_t *module, ompi_osc_rdma_outstanding_lock_t *lock)
{
if (!(MPI_LOCK_SHARED == lock->type && 0 == --module->shared_count)) {
module->lock_status = 0;
ompi_osc_activate_next_lock (module);
}
/* need to ensure we make progress */
opal_progress();
lock->unlock_acks_received++;
}
static inline int ompi_osc_rdma_lock_remote (ompi_osc_rdma_module_t *module, int target, ompi_osc_rdma_outstanding_lock_t *lock)
{
ompi_osc_rdma_header_lock_t lock_req;
int ret;
/* generate a lock request */
lock_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_LOCK_REQ;
lock_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID | OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET;
lock_req.lock_type = lock->type;
lock_req.serial_number = lock->serial_number;
ret = ompi_osc_rdma_control_send (module, target, &lock_req, sizeof (lock_req));
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
/* make sure the request gets sent, so we can start eager sending... */
ret = ompi_osc_rdma_frag_flush_target (module, target);
return ret;
}
static inline int ompi_osc_rdma_unlock_remote (ompi_osc_rdma_module_t *module, int target, ompi_osc_rdma_outstanding_lock_t *lock)
{
ompi_osc_rdma_header_unlock_t unlock_req;
unlock_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_REQ;
unlock_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID | OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET;
unlock_req.frag_count = module->epoch_outgoing_frag_count[target];
unlock_req.lock_type = lock->type;
/* send control message with unlock request and count */
return ompi_osc_rdma_control_send (module, target, &unlock_req, sizeof (unlock_req));
}
int ompi_osc_rdma_lock(int lock_type, int target, int assert, ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_rdma_outstanding_lock_t *lock;
ompi_osc_rdma_peer_t *peer = module->peers + target;
int ret = OMPI_SUCCESS;
/* Check if no_locks is set. TODO: we also need to track whether we are in an
* active target epoch. Fence can make this tricky to track. */
if (NULL == module->passive_eager_send_active || module->sc_group) {
return OMPI_ERR_RMA_SYNC;
}
assert(module->epoch_outgoing_frag_count[target] == 0);
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"osc rdma: lock %d %d", target, lock_type));
/* delay all eager sends until we've heard back.. */
OPAL_THREAD_LOCK(&module->lock);
module->passive_eager_send_active[target] = false;
module->passive_target_access_epoch = true;
/* when the lock ack returns we will be in an access epoch with this peer */
peer->access_epoch = true;
/* create lock item */
lock = OBJ_NEW(ompi_osc_rdma_outstanding_lock_t);
if (OPAL_UNLIKELY(NULL == lock)) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
lock->target = target;
lock->lock_acks_received = 0;
lock->unlock_acks_received = 0;
lock->serial_number = module->lock_serial_number++;
lock->type = lock_type;
opal_list_append(&module->outstanding_locks, &lock->super);
if (0 == (assert & MPI_MODE_NOCHECK)) {
if (ompi_comm_rank (module->comm) != target) {
ret = ompi_osc_rdma_lock_remote (module, target, lock);
} else {
ret = ompi_osc_rdma_lock_self (module, lock);
}
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
goto exit_error;
}
} else {
lock->lock_acks_received = 1;
}
OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_SUCCESS;
exit_error:
OPAL_THREAD_UNLOCK(&module->lock);
opal_list_remove_item(&module->outstanding_locks, &lock->super);
OBJ_RELEASE(lock);
/* return */
return ret;
}
int ompi_osc_rdma_unlock(int target, ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_rdma_outstanding_lock_t *lock = NULL;
ompi_osc_rdma_peer_t *peer = module->peers + target;
int ret = OMPI_SUCCESS;
OPAL_THREAD_LOCK(&module->lock);
lock = find_outstanding_lock (module, target);
if (OPAL_UNLIKELY(NULL == lock)) {
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_unlock: target %d is not locked in window %s",
target, win->w_name));
OPAL_THREAD_LOCK(&module->lock);
return OMPI_ERR_RMA_SYNC;
}
if (ompi_comm_rank (module->comm) != target) {
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"osc rdma: unlock %d, lock_acks_received = %d", target,
lock->lock_acks_received));
/* wait until ack has arrived from target */
while (0 == lock->lock_acks_received) {
opal_condition_wait(&module->cond, &module->lock);
}
ret = ompi_osc_rdma_unlock_remote (module, target, lock);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
goto cleanup;
}
/* start all sendreqs to target */
ret = ompi_osc_rdma_frag_flush_target(module, target);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
goto cleanup;
}
/* wait for all the requests and the unlock ack (meaning remote completion) */
while (module->outgoing_frag_count != module->outgoing_frag_signal_count ||
0 == lock->unlock_acks_received) {
opal_condition_wait(&module->cond, &module->lock);
}
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_unlock: unlock of %d complete", target));
} else {
ompi_osc_rdma_unlock_self (module, lock);
}
module->passive_eager_send_active[target] = false;
module->epoch_outgoing_frag_count[target] = 0;
module->passive_target_access_epoch = false;
peer->access_epoch = false;
/* delete the lock */
opal_list_remove_item (&module->outstanding_locks, &lock->super);
OBJ_RELEASE(lock);
cleanup:
OPAL_THREAD_UNLOCK(&module->lock);
return ret;
}
int ompi_osc_rdma_lock_all(int assert, struct ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
int ret, my_rank = ompi_comm_rank (module->comm);
ompi_osc_rdma_outstanding_lock_t *lock;
/* Check if no_locks is set. TODO: we also need to track whether we are in an active
* target epoch. Fence can make this tricky to track. */
if (NULL == module->passive_eager_send_active) {
return OMPI_ERR_RMA_SYNC;
}
/* delay all eager sends until we've heard back.. */
OPAL_THREAD_LOCK(&module->lock);
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
module->passive_eager_send_active[i] = false;
}
module->passive_target_access_epoch = true;
module->all_access_epoch = true;
/* create lock item */
lock = OBJ_NEW(ompi_osc_rdma_outstanding_lock_t);
lock->target = -1;
lock->lock_acks_received = 0;
lock->unlock_acks_received = 0;
lock->serial_number = module->lock_serial_number++;
lock->type = MPI_LOCK_SHARED;
opal_list_append(&module->outstanding_locks, &lock->super);
/* if nocheck is not specified, send a lock request to everyone
and wait for the local response */
if (0 != (assert & MPI_MODE_NOCHECK)) {
ret = ompi_osc_rdma_lock_self (module, lock);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
goto exit_error;
}
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
if (my_rank == i) {
continue;
}
ret = ompi_osc_rdma_lock_remote (module, i, lock);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
opal_list_remove_item(&module->outstanding_locks, &lock->super);
}
}
} else {
lock->lock_acks_received = ompi_comm_size(module->comm);
}
OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_SUCCESS;
exit_error:
OPAL_THREAD_UNLOCK(&module->lock);
opal_list_remove_item(&module->outstanding_locks, &lock->super);
OBJ_RELEASE(lock);
/* return */
return ret;
}
int ompi_osc_rdma_unlock_all (struct ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
int my_rank = ompi_comm_rank (module->comm);
ompi_osc_rdma_outstanding_lock_t *lock;
int ret;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_unlock_all entering..."));
OPAL_THREAD_LOCK(&module->lock);
lock = find_outstanding_lock (module, -1);
if (OPAL_UNLIKELY(NULL == lock)) {
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_unlock_all: not locked in window %s",
win->w_name));
OPAL_THREAD_LOCK(&module->lock);
return OMPI_ERR_RMA_SYNC;
}
/* wait for lock acks */
while (ompi_comm_size(module->comm) != lock->lock_acks_received) {
opal_condition_wait(&module->cond, &module->lock);
}
/* send unlock messages to all of my peers */
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
if (my_rank == i) {
continue;
}
ret = ompi_osc_rdma_unlock_remote (module, i, lock);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
goto cleanup;
}
}
/* unlock myself */
ompi_osc_rdma_unlock_self (module, lock);
/* start all sendreqs to target */
ret = ompi_osc_rdma_frag_flush_all(module);
if (OMPI_SUCCESS != ret) goto cleanup;
/* wait for all the requests and the unlock ack (meaning remote completion) */
while (module->outgoing_frag_count != module->outgoing_frag_signal_count ||
ompi_comm_size(module->comm) != lock->unlock_acks_received) {
opal_condition_wait(&module->cond, &module->lock);
}
/* reset all fragment counters */
memset (module->epoch_outgoing_frag_count, 0, ompi_comm_size(module->comm) * sizeof (module->epoch_outgoing_frag_count[0]));
memset (module->passive_eager_send_active, 0, ompi_comm_size(module->comm) * sizeof (module->passive_eager_send_active[0]));
opal_list_remove_item (&module->outstanding_locks, &lock->super);
OBJ_RELEASE(lock);
module->passive_target_access_epoch = false;
module->all_access_epoch = false;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_unlock_all complete"));
cleanup:
OPAL_THREAD_UNLOCK(&module->lock);
return ret;
}
int ompi_osc_rdma_sync (struct ompi_win_t *win)
{
opal_progress();
return OMPI_SUCCESS;
}
static int ompi_osc_rdma_flush_lock (ompi_osc_rdma_module_t *module, ompi_osc_rdma_outstanding_lock_t *lock,
int target)
{
ompi_osc_rdma_header_flush_t flush_req;
int peer_count, ret, flush_count;
int my_rank = ompi_comm_rank (module->comm);
if (-1 == lock->target) {
peer_count = ompi_comm_size(module->comm);
} else {
peer_count = 1;
}
/* wait until ack has arrived from target, since we need to be
able to eager send before we can transfer all the data... */
while (peer_count > lock->lock_acks_received) {
opal_condition_wait(&module->cond, &module->lock);
}
lock->flush_acks_received = 0;
flush_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_FLUSH_REQ;
flush_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID | OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET;
flush_req.serial_number = lock->serial_number;
if (-1 == target) {
/* NTH: no local flush */
flush_count = ompi_comm_size(module->comm) - 1;
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
if (i == my_rank) {
continue;
}
flush_req.frag_count = module->epoch_outgoing_frag_count[i];
/* send control message with flush request and count */
ret = ompi_osc_rdma_control_send (module, i, &flush_req, sizeof (flush_req));
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
/* start all sendreqs to target */
ret = ompi_osc_rdma_frag_flush_target (module, i);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
}
} else {
flush_req.frag_count = module->epoch_outgoing_frag_count[target];
flush_count = 1;
/* send control message with flush request and count */
ret = ompi_osc_rdma_control_send (module, target, &flush_req, sizeof (flush_req));
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
/* start all sendreqs to target */
ret = ompi_osc_rdma_frag_flush_target (module, target);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
}
/* wait for all the requests and the flush ack (meaning remote completion) */
while (module->outgoing_frag_count != module->outgoing_frag_signal_count ||
flush_count != lock->flush_acks_received) {
opal_condition_wait(&module->cond, &module->lock);
}
if (-1 == target) {
memset (module->epoch_outgoing_frag_count, 0, peer_count * sizeof (module->epoch_outgoing_frag_count[0]));
} else {
module->epoch_outgoing_frag_count[target] = 0;
}
return OMPI_SUCCESS;
}
int ompi_osc_rdma_flush (int target, struct ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_rdma_outstanding_lock_t *lock;
int ret;
assert (0 <= target);
/* flush is only allowed from within a passive target epoch */
if (!module->passive_target_access_epoch) {
return OMPI_ERR_RMA_SYNC;
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_flush starting..."));
if (ompi_comm_rank (module->comm) == target) {
/* nothing to flush */
opal_progress ();
return OMPI_SUCCESS;
}
OPAL_THREAD_LOCK(&module->lock);
lock = find_outstanding_lock (module, target);
if (NULL == lock) {
lock = find_outstanding_lock (module, -1);
}
if (OPAL_UNLIKELY(NULL == lock)) {
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_flush: target %d is not locked in window %s",
target, win->w_name));
OPAL_THREAD_LOCK(&module->lock);
return OMPI_ERR_RMA_SYNC;
}
ret = ompi_osc_rdma_flush_lock (module, lock, target);
OPAL_THREAD_UNLOCK(&module->lock);
return ret;
}
int ompi_osc_rdma_flush_all (struct ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_rdma_outstanding_lock_t *lock;
int ret = OMPI_SUCCESS;
/* flush is only allowed from within a passive target epoch */
if (!module->passive_target_access_epoch) {
return OMPI_ERR_RMA_SYNC;
}
if (OPAL_UNLIKELY(0 == opal_list_get_size (&module->outstanding_locks))) {
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_flush_all: no targets are locked in window %s",
win->w_name));
return OMPI_ERR_RMA_SYNC;
}
OPAL_THREAD_LOCK(&module->lock);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_flush_all entering..."));
/* flush all locks */
OPAL_LIST_FOREACH(lock, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) {
ret = ompi_osc_rdma_flush_lock (module, lock, lock->target);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
break;
}
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_flush_all complete"));
OPAL_THREAD_UNLOCK(&module->lock);
return ret;
}
int ompi_osc_rdma_flush_local (int target, struct ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
int ret;
/* flush is only allowed from within a passive target epoch */
if (!module->passive_target_access_epoch) {
return OMPI_ERR_RMA_SYNC;
}
OPAL_THREAD_LOCK(&module->lock);
ret = ompi_osc_rdma_frag_flush_target(module, target);
if (OMPI_SUCCESS != ret) goto cleanup;
/* wait for all the requests */
while (module->outgoing_frag_count != module->outgoing_frag_signal_count) {
opal_condition_wait(&module->cond, &module->lock);
}
cleanup:
OPAL_THREAD_UNLOCK(&module->lock);
return ret;
}
int ompi_osc_rdma_flush_local_all (struct ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
int ret = OMPI_SUCCESS;
/* flush is only allowed from within a passive target epoch */
if (!module->passive_target_access_epoch) {
return OMPI_ERR_RMA_SYNC;
}
OPAL_THREAD_LOCK(&module->lock);
ret = ompi_osc_rdma_frag_flush_all(module);
if (OMPI_SUCCESS != ret) goto cleanup;
/* wait for all the requests */
while (module->outgoing_frag_count != module->outgoing_frag_signal_count) {
opal_condition_wait(&module->cond, &module->lock);
}
cleanup:
OPAL_THREAD_UNLOCK(&module->lock);
return ret;
}
/* target side operation to acknowledge to initiator side that the
lock is now held by the initiator */
static inline int activate_lock (ompi_osc_rdma_module_t *module, int requestor,
uint64_t serial_number)
{
ompi_osc_rdma_outstanding_lock_t *lock;
if (ompi_comm_rank (module->comm) != requestor) {
ompi_osc_rdma_header_lock_ack_t lock_ack;
lock_ack.base.type = OMPI_OSC_RDMA_HDR_TYPE_LOCK_ACK;
lock_ack.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
lock_ack.source = ompi_comm_rank(module->comm);
lock_ack.windx = ompi_comm_get_cid(module->comm);
lock_ack.serial_number = serial_number;
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"osc rdma: sending lock to %d", requestor));
/* we don't want to send any data, since we're the exposure
epoch only, so use an unbuffered send */
return ompi_osc_rdma_control_send_unbuffered (module, requestor, &lock_ack, sizeof (lock_ack));
}
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"osc rdma: releasing local lock"));
lock = find_outstanding_lock (module, requestor);
if (NULL == lock) {
lock = find_outstanding_lock (module, -1);
if (OPAL_UNLIKELY(NULL == lock)) {
OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output,
"lock could not be located"));
}
}
lock->lock_acks_received++;
opal_condition_broadcast (&module->cond);
return OMPI_SUCCESS;
}
/* target side operation to create a pending lock request for a lock
request that could not be satisfied */
static inline int queue_lock (ompi_osc_rdma_module_t *module, int requestor,
int lock_type, uint64_t serial_number)
{
ompi_osc_rdma_pending_lock_t *pending =
OBJ_NEW(ompi_osc_rdma_pending_lock_t);
if (NULL == pending) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
pending->peer = requestor;
pending->lock_type = lock_type;
pending->serial_number = serial_number;
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"osc rdma: queueing lock request from %d", requestor));
opal_list_append(&module->locks_pending, &pending->super);
return OMPI_SUCCESS;
}
static int ompi_osc_activate_next_lock (ompi_osc_rdma_module_t *module) {
/* release any other pending locks we can */
ompi_osc_rdma_pending_lock_t *pending_lock, *next;
int ret = OMPI_SUCCESS;
OPAL_LIST_FOREACH_SAFE(pending_lock, next, &module->locks_pending,
ompi_osc_rdma_pending_lock_t) {
if (MPI_LOCK_SHARED == pending_lock->lock_type) {
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_activate_next_lock: release pending lock of type MPI_LOCK_SHARED to peer %d\n",
pending_lock->peer));
/* acquire shared lock */
module->lock_status = MPI_LOCK_SHARED;
module->shared_count++;
ret = activate_lock(module, pending_lock->peer, pending_lock->serial_number);
opal_list_remove_item (&module->locks_pending, &pending_lock->super);
OBJ_RELEASE(pending_lock);
} else {
if (0 == module->lock_status) {
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_activate_next_lock: release pending lock of type MPI_LOCK_EXCLUSIVE to peer %d\n",
pending_lock->peer));
/* acquire exclusive lock */
module->lock_status = MPI_LOCK_EXCLUSIVE;
ret = activate_lock(module, pending_lock->peer, pending_lock->serial_number);
opal_list_remove_item (&module->locks_pending, &pending_lock->super);
OBJ_RELEASE(pending_lock);
}
/* if the lock was acquired (ie, status was 0), then
we're done. If the lock was not acquired, we're
also done, because all the shared locks have to
finish first */
break;
}
if (OMPI_SUCCESS != ret) {
break;
}
}
return ret;
}
/* target side function called when the initiator sends a lock
request. Lock will either be activated and acknowledged or
queued. */
int ompi_osc_rdma_process_lock (ompi_osc_rdma_module_t* module, int source,
ompi_osc_rdma_header_lock_t* lock_header)
{
int ret;
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_process_lock: processing lock request from %d. current lock state = %d, shared_count = %d",
source, module->lock_status, module->shared_count));
if (MPI_LOCK_SHARED == lock_header->lock_type) {
if (module->lock_status != MPI_LOCK_EXCLUSIVE) {
/* acquire shared lock */
module->lock_status = MPI_LOCK_SHARED;
module->shared_count++;
ret = activate_lock(module, source, lock_header->serial_number);
} else {
/* lock not available, queue */
ret = queue_lock(module, source, lock_header->lock_type, lock_header->serial_number);
}
} else {
if (0 == module->lock_status) {
/* acquire exclusive lock */
module->lock_status = MPI_LOCK_EXCLUSIVE;
ret = activate_lock(module, source, lock_header->serial_number);
} else {
/* lock not available, queue */
ret = queue_lock(module, source, lock_header->lock_type, lock_header->serial_number);
}
}
return ret;
}
/* initiator-side function called when the target acks the lock
request. */
void ompi_osc_rdma_process_lock_ack (ompi_osc_rdma_module_t *module,
ompi_osc_rdma_header_lock_ack_t *lock_ack_header)
{
ompi_osc_rdma_outstanding_lock_t *lock, *next;
OPAL_LIST_FOREACH_SAFE(lock, next, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) {
if (lock->serial_number == lock_ack_header->serial_number) {
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"osc rdma: lock ack %d", lock_ack_header->source));
lock->lock_acks_received++;
module->passive_eager_send_active[lock_ack_header->source] = true;
return;
}
}
opal_output(ompi_osc_base_framework.framework_output,
"osc rdma: lock ack %d, %ld for unfindable lock request",
lock_ack_header->source, (unsigned long) lock_ack_header->serial_number);
}
void ompi_osc_rdma_process_flush_ack (ompi_osc_rdma_module_t *module, int source,
ompi_osc_rdma_header_flush_ack_t *flush_ack_header) {
ompi_osc_rdma_outstanding_lock_t *lock;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_process_unlock_ack: processing flush ack from %d for lock %" PRIu64,
source, flush_ack_header->serial_number));
/* NTH: need to verify that this will work as expected */
lock = find_outstanding_lock_by_serial (module, flush_ack_header->serial_number);
assert (NULL != lock);
lock->flush_acks_received++;
opal_condition_broadcast(&module->cond);
}
void ompi_osc_rdma_process_unlock_ack (ompi_osc_rdma_module_t *module, int source,
ompi_osc_rdma_header_unlock_ack_t *unlock_ack_header) {
ompi_osc_rdma_outstanding_lock_t *lock;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_process_unlock_ack: processing unlock ack from %d",
source));
/* NTH: need to verify that this will work as expected */
lock = find_outstanding_lock (module, source);
if (NULL == lock) {
lock = find_outstanding_lock(module, -1);
assert (NULL != lock);
}
lock->unlock_acks_received++;
}
/**
* Process an unlock request.
*
* @param[in] module - OSC RDMA module
* @param[in] source - Source rank
* @param[in] unlock_header - Incoming unlock header
*
* This functions is the target-side functio for handling an unlock
* request. Once all pending operations from the target are complete
* this functions sends an unlock acknowledgement then attempts to
* active a pending lock if the lock becomes free.
*/
int ompi_osc_rdma_process_unlock (ompi_osc_rdma_module_t *module, int source,
ompi_osc_rdma_header_unlock_t *unlock_header)
{
ompi_osc_rdma_header_unlock_ack_t unlock_ack;
int ret;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_process_unlock entering (finished %d/%d)...",
module->passive_incoming_frag_count[source],
module->passive_incoming_frag_signal_count[source]));
/* we cannot block when processing an incoming request */
if (module->passive_incoming_frag_signal_count[source] !=
module->passive_incoming_frag_count[source]) {
return OMPI_ERR_WOULD_BLOCK;
}
unlock_ack.base.type = OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_ACK;
unlock_ack.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
ret = ompi_osc_rdma_control_send_unbuffered (module, source, &unlock_ack, sizeof (unlock_ack));
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
module->passive_incoming_frag_signal_count[source] = 0;
module->passive_incoming_frag_count[source] = 0;
OPAL_THREAD_LOCK(&module->lock);
if (unlock_header->lock_type == MPI_LOCK_EXCLUSIVE || 0 == --module->shared_count) {
module->lock_status = 0;
ompi_osc_activate_next_lock (module);
}
OPAL_THREAD_UNLOCK(&module->lock);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: finished processing unlock fragment"));
return ret;
}
int ompi_osc_rdma_process_flush (ompi_osc_rdma_module_t *module, int source,
ompi_osc_rdma_header_flush_t *flush_header)
{
ompi_osc_rdma_header_flush_ack_t flush_ack;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_process_flush entering (finished %d/%d)...",
module->passive_incoming_frag_count[source],
module->passive_incoming_frag_signal_count[source]));
/* we cannot block when processing an incoming request */
if (module->passive_incoming_frag_signal_count[source] !=
module->passive_incoming_frag_count[source]) {
return OMPI_ERR_WOULD_BLOCK;
}
module->passive_incoming_frag_signal_count[source] = 0;
module->passive_incoming_frag_count[source] = 0;
flush_ack.base.type = OMPI_OSC_RDMA_HDR_TYPE_FLUSH_ACK;
flush_ack.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
flush_ack.serial_number = flush_header->serial_number;
return ompi_osc_rdma_control_send_unbuffered (module, source, &flush_ack, sizeof (flush_ack));
}

0
ompi/mca/pml/bfo/.opal_ignore Обычный файл
Просмотреть файл

Просмотреть файл

@ -513,7 +513,7 @@ int mca_pml_bfo_send_fin( ompi_proc_t* proc,
fin->des_cbdata = NULL;
/* fill in header */
hdr = (mca_pml_bfo_fin_hdr_t*)fin->des_local->seg_addr.pval;
hdr = (mca_pml_bfo_fin_hdr_t*)fin->des_segments->seg_addr.pval;
hdr->hdr_common.hdr_flags = 0;
hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FIN;
hdr->hdr_des = hdr_des;

Просмотреть файл

@ -284,7 +284,7 @@ void mca_pml_bfo_repost_fin(struct mca_btl_base_descriptor_t* des) {
proc = (ompi_proc_t*) des->des_cbdata;
bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
hdr = (mca_pml_bfo_fin_hdr_t*)des->des_local->seg_addr.pval;
hdr = (mca_pml_bfo_fin_hdr_t*)des->des_segments->seg_addr.pval;
opal_output_verbose(20, mca_pml_bfo_output,
"REPOST: BFO_HDR_TYPE_FIN: seq=%d,myrank=%d,peer=%d,hdr->hdr_fail=%d,src=%d",
@ -376,7 +376,7 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t* btl
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
mca_pml_bfo_recv_request_t* recvreq;
ompi_proc_t* ompi_proc;
@ -461,7 +461,7 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartack(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
mca_pml_bfo_send_request_t* sendreq;
@ -522,7 +522,7 @@ void mca_pml_bfo_recv_frag_callback_recverrnotify(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
mca_pml_bfo_send_request_t* sendreq;
@ -607,7 +607,7 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartnack(mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des,
void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
mca_pml_bfo_send_request_t* sendreq;
@ -701,7 +701,7 @@ void mca_pml_bfo_send_request_rndvrestartnotify(mca_pml_bfo_send_request_t* send
}
/* fill out header */
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval;
restart = (mca_pml_bfo_restart_hdr_t*)des->des_segments->seg_addr.pval;
restart->hdr_match.hdr_common.hdr_flags = 0;
restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY;
restart->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
@ -915,7 +915,7 @@ void mca_pml_bfo_repost_match_fragment(struct mca_btl_base_descriptor_t* des)
mca_btl_base_segment_t* oldseg;
mca_btl_base_segment_t* newseg;
oldseg = des->des_local;
oldseg = des->des_segments;
/* The alloc routine must be called with the MCA_BTL_NO_ORDER
* flag so that the allocation routine works. The allocation
* will fill in the order flag in the descriptor. */
@ -928,7 +928,7 @@ void mca_pml_bfo_repost_match_fragment(struct mca_btl_base_descriptor_t* des)
__FILE__, __LINE__);
ompi_rte_abort(-1, NULL);
}
newseg = newdes->des_local;
newseg = newdes->des_segments;
/* Copy over all the data that is actually sent over the wire */
memcpy(newseg->seg_addr.pval, oldseg->seg_addr.pval, oldseg->seg_len);
newseg->seg_len = oldseg->seg_len;
@ -972,7 +972,7 @@ mca_pml_bfo_rndvrestartnotify_completion(mca_btl_base_module_t* btl,
mca_pml_bfo_restart_hdr_t* restart;
mca_pml_bfo_send_request_t* sendreq;
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval;
restart = (mca_pml_bfo_restart_hdr_t*)des->des_segments->seg_addr.pval;
sendreq = (mca_pml_bfo_send_request_t*) restart->hdr_src_req.pval;
/* Need to resend this message in the case that it fails */
@ -1061,7 +1061,7 @@ void mca_pml_bfo_recv_request_recverrnotify(mca_pml_bfo_recv_request_t* recvreq,
}
/* fill out header */
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval;
restart = (mca_pml_bfo_restart_hdr_t*)des->des_segments->seg_addr.pval;
restart->hdr_match.hdr_common.hdr_flags = 0;
restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY;
restart->hdr_match.hdr_ctx = recvreq->req_recv.req_base.req_comm->c_contextid;
@ -1145,7 +1145,7 @@ void mca_pml_bfo_recv_request_rndvrestartack(mca_pml_bfo_recv_request_t* recvreq
}
/* fill out header */
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval;
restart = (mca_pml_bfo_restart_hdr_t*)des->des_segments->seg_addr.pval;
restart->hdr_match.hdr_common.hdr_flags = 0;
restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK;
restart->hdr_match.hdr_ctx = recvreq->req_recv.req_base.req_comm->c_contextid;
@ -1208,7 +1208,7 @@ void mca_pml_bfo_recv_request_rndvrestartnack(mca_btl_base_descriptor_t* olddes,
ompi_proc = olddes->des_cbdata;
}
segments = olddes->des_local;
segments = olddes->des_segments;
hdr = (mca_pml_bfo_restart_hdr_t*)segments->seg_addr.pval;
bml_endpoint = ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
@ -1226,7 +1226,7 @@ void mca_pml_bfo_recv_request_rndvrestartnack(mca_btl_base_descriptor_t* olddes,
}
/* fill out header */
nack = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval;
nack = (mca_pml_bfo_restart_hdr_t*)des->des_segments->seg_addr.pval;
nack->hdr_match.hdr_common.hdr_flags = 0;
nack->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK;
nack->hdr_match.hdr_ctx = hdr->hdr_match.hdr_ctx;
@ -1317,13 +1317,13 @@ void mca_pml_bfo_recv_restart_completion( mca_btl_base_module_t* btl,
int status )
{
if(OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
mca_pml_bfo_common_hdr_t* common = des->des_local->seg_addr.pval;
mca_pml_bfo_common_hdr_t* common = des->des_segments->seg_addr.pval;
mca_pml_bfo_restart_hdr_t* restart; /* RESTART header */
mca_pml_bfo_recv_request_t* recvreq;
switch (common->hdr_type) {
case MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK:
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval;
restart = (mca_pml_bfo_restart_hdr_t*)des->des_segments->seg_addr.pval;
recvreq = (mca_pml_bfo_recv_request_t*) restart->hdr_dst_req.pval;
opal_output_verbose(30, mca_pml_bfo_output,
"RNDVRESTARTACK: completion failed: try again "
@ -1351,7 +1351,7 @@ void mca_pml_bfo_recv_restart_completion( mca_btl_base_module_t* btl,
mca_pml_bfo_recv_request_rndvrestartnack(des, NULL, true);
break;
case MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY:
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval;
restart = (mca_pml_bfo_restart_hdr_t*)des->des_segments->seg_addr.pval;
recvreq = (mca_pml_bfo_recv_request_t*) restart->hdr_dst_req.pval;
/* With just two BTLs, this should never happen as we are
* typically sending the RECVERRNOTIFY message on the
@ -1759,7 +1759,7 @@ void mca_pml_bfo_check_recv_ctl_completion_status(mca_btl_base_module_t* btl,
struct mca_btl_base_descriptor_t* des,
int status)
{
mca_pml_bfo_common_hdr_t * common = des->des_local->seg_addr.pval;
mca_pml_bfo_common_hdr_t * common = des->des_segments->seg_addr.pval;
mca_pml_bfo_rdma_hdr_t* hdr; /* PUT header */
struct mca_btl_base_descriptor_t* rdma_des;
mca_pml_bfo_recv_request_t* recvreq;
@ -1789,7 +1789,7 @@ void mca_pml_bfo_check_recv_ctl_completion_status(mca_btl_base_module_t* btl,
break;
case MCA_PML_BFO_HDR_TYPE_PUT:
hdr = (mca_pml_bfo_rdma_hdr_t*)des->des_local->seg_addr.pval;
hdr = (mca_pml_bfo_rdma_hdr_t*)des->des_segments->seg_addr.pval;
rdma_des = hdr->hdr_des.pval;
recvreq = des->des_cbdata;
if ((NULL != rdma_des->des_cbdata) && (recvreq == rdma_des->des_cbdata)) {
@ -1947,14 +1947,14 @@ void mca_pml_bfo_update_eager_bml_btl_recv_ctl(mca_bml_base_btl_t** bml_btl,
struct mca_btl_base_descriptor_t* des)
{
if ((*bml_btl)->btl != btl) {
mca_pml_bfo_common_hdr_t * common = des->des_local->seg_addr.pval;
mca_pml_bfo_common_hdr_t * common = des->des_segments->seg_addr.pval;
mca_pml_bfo_ack_hdr_t* ack; /* ACK header */
mca_pml_bfo_recv_request_t* recvreq = NULL;
char *type = NULL;
switch (common->hdr_type) {
case MCA_PML_BFO_HDR_TYPE_ACK:
ack = (mca_pml_bfo_ack_hdr_t*)des->des_local->seg_addr.pval;
ack = (mca_pml_bfo_ack_hdr_t*)des->des_segments->seg_addr.pval;
recvreq = (mca_pml_bfo_recv_request_t*) ack->hdr_dst_req.pval;
type = "ACK";
break;
@ -2106,11 +2106,11 @@ void mca_pml_bfo_find_recvreq_rdma_bml_btl(mca_bml_base_btl_t** bml_btl,
bool mca_pml_bfo_rndv_completion_status_error(struct mca_btl_base_descriptor_t* des,
mca_pml_bfo_send_request_t* sendreq)
{
assert(((mca_pml_bfo_hdr_t*)((des)->des_local->seg_addr.pval))->hdr_match.hdr_ctx ==
assert(((mca_pml_bfo_hdr_t*)((des)->des_segments->seg_addr.pval))->hdr_match.hdr_ctx ==
(sendreq)->req_send.req_base.req_comm->c_contextid);
assert(((mca_pml_bfo_hdr_t*)((des)->des_local->seg_addr.pval))->hdr_match.hdr_src ==
assert(((mca_pml_bfo_hdr_t*)((des)->des_segments->seg_addr.pval))->hdr_match.hdr_src ==
(sendreq)->req_send.req_base.req_comm->c_my_rank);
assert(((mca_pml_bfo_hdr_t*)((des)->des_local->seg_addr.pval))->hdr_match.hdr_seq ==
assert(((mca_pml_bfo_hdr_t*)((des)->des_segments->seg_addr.pval))->hdr_match.hdr_seq ==
(uint16_t)(sendreq)->req_send.req_base.req_sequence);
if ((!(sendreq)->req_error) && (NULL == (sendreq)->req_recv.pval)) {
(sendreq)->req_events--;
@ -2157,7 +2157,7 @@ void mca_pml_bfo_completion_sendreq_has_error(mca_pml_bfo_send_request_t* sendre
void mca_pml_bfo_send_ctl_completion_status_error(struct mca_btl_base_descriptor_t* des)
{
mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata;
mca_pml_bfo_hdr_t* hdr = des->des_local->seg_addr.pval;
mca_pml_bfo_hdr_t* hdr = des->des_segments->seg_addr.pval;
switch (hdr->hdr_common.hdr_type) {
case MCA_PML_BFO_HDR_TYPE_RGET:
if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) ||

Просмотреть файл

@ -104,13 +104,13 @@ void mca_pml_bfo_recv_frag_callback_match(mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des,
void* cbdata )
{
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_bfo_match_hdr_t* hdr = (mca_pml_bfo_match_hdr_t*)segments->seg_addr.pval;
ompi_communicator_t *comm_ptr;
mca_pml_bfo_recv_request_t *match = NULL;
mca_pml_bfo_comm_t *comm;
mca_pml_bfo_comm_proc_t *proc;
size_t num_segments = des->des_local_count;
size_t num_segments = des->des_segment_count;
size_t bytes_received = 0;
assert(num_segments <= MCA_BTL_DES_MAX_SEGMENTS);
@ -257,7 +257,7 @@ void mca_pml_bfo_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des,
void* cbdata )
{
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) {
@ -265,7 +265,7 @@ void mca_pml_bfo_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
}
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RNDV);
mca_pml_bfo_recv_frag_match(btl, &hdr->hdr_match, segments,
des->des_local_count, MCA_PML_BFO_HDR_TYPE_RNDV);
des->des_segment_count, MCA_PML_BFO_HDR_TYPE_RNDV);
return;
}
@ -274,7 +274,7 @@ void mca_pml_bfo_recv_frag_callback_rget(mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des,
void* cbdata )
{
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) {
@ -282,7 +282,7 @@ void mca_pml_bfo_recv_frag_callback_rget(mca_btl_base_module_t* btl,
}
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RGET);
mca_pml_bfo_recv_frag_match(btl, &hdr->hdr_match, segments,
des->des_local_count, MCA_PML_BFO_HDR_TYPE_RGET);
des->des_segment_count, MCA_PML_BFO_HDR_TYPE_RGET);
return;
}
@ -293,7 +293,7 @@ void mca_pml_bfo_recv_frag_callback_ack(mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des,
void* cbdata )
{
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
mca_pml_bfo_send_request_t* sendreq;
@ -341,7 +341,7 @@ void mca_pml_bfo_recv_frag_callback_frag(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
mca_pml_bfo_recv_request_t* recvreq;
@ -353,7 +353,7 @@ void mca_pml_bfo_recv_frag_callback_frag(mca_btl_base_module_t* btl,
#if PML_BFO
MCA_PML_BFO_ERROR_CHECK_ON_FRAG_CALLBACK(recvreq);
#endif /* PML_BFO */
mca_pml_bfo_recv_request_progress_frag(recvreq,btl,segments,des->des_local_count);
mca_pml_bfo_recv_request_progress_frag(recvreq,btl,segments,des->des_segment_count);
return;
}
@ -363,7 +363,7 @@ void mca_pml_bfo_recv_frag_callback_put(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
mca_pml_bfo_send_request_t* sendreq;
@ -386,7 +386,7 @@ void mca_pml_bfo_recv_frag_callback_fin(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
mca_btl_base_descriptor_t* rdma;

Просмотреть файл

@ -246,7 +246,7 @@ int mca_pml_bfo_recv_request_ack_send_btl(
}
/* fill out header */
ack = (mca_pml_bfo_ack_hdr_t*)des->des_local->seg_addr.pval;
ack = (mca_pml_bfo_ack_hdr_t*)des->des_segments->seg_addr.pval;
ack->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_ACK;
ack->hdr_common.hdr_flags = nordma ? MCA_PML_BFO_HDR_FLAGS_NORDMA : 0;
ack->hdr_src_req.lval = hdr_src_req;
@ -851,7 +851,7 @@ int mca_pml_bfo_recv_request_schedule_once( mca_pml_bfo_recv_request_t* recvreq,
dst->des_cbfunc = mca_pml_bfo_put_completion;
dst->des_cbdata = recvreq;
seg_size = btl->btl_seg_size * dst->des_local_count;
seg_size = btl->btl_seg_size * dst->des_segment_count;
/* prepare a descriptor for rdma control message */
mca_bml_base_alloc(bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof(mca_pml_bfo_rdma_hdr_t) + seg_size,
@ -867,7 +867,7 @@ int mca_pml_bfo_recv_request_schedule_once( mca_pml_bfo_recv_request_t* recvreq,
#endif /* PML_BFO */
/* fill in rdma header */
hdr = (mca_pml_bfo_rdma_hdr_t*)ctl->des_local->seg_addr.pval;
hdr = (mca_pml_bfo_rdma_hdr_t*)ctl->des_segments->seg_addr.pval;
hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_PUT;
hdr->hdr_common.hdr_flags =
(!recvreq->req_ack_sent) ? MCA_PML_BFO_HDR_TYPE_ACK : 0;
@ -877,10 +877,10 @@ int mca_pml_bfo_recv_request_schedule_once( mca_pml_bfo_recv_request_t* recvreq,
#endif /* PML_BFO */
hdr->hdr_des.pval = dst;
hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
hdr->hdr_seg_cnt = dst->des_local_count;
hdr->hdr_seg_cnt = dst->des_segment_count;
/* copy segments */
memmove (hdr + 1, dst->des_local, seg_size);
memmove (hdr + 1, dst->des_segments, seg_size);
if(!recvreq->req_ack_sent)
recvreq->req_ack_sent = true;

Просмотреть файл

@ -257,8 +257,8 @@ mca_pml_bfo_rndv_completion( mca_btl_base_module_t* btl,
* have to be atomic.
*/
req_bytes_delivered = mca_pml_bfo_compute_segment_length (btl->btl_seg_size,
(void *) des->des_local,
des->des_local_count,
(void *) des->des_segments,
des->des_segment_count,
sizeof(mca_pml_bfo_rendezvous_hdr_t));
#if PML_BFO
@ -287,8 +287,8 @@ mca_pml_bfo_rget_completion( mca_btl_base_module_t* btl,
/* count bytes of user data actually delivered and check for request completion */
req_bytes_delivered = mca_pml_bfo_compute_segment_length (btl->btl_seg_size,
(void *) des->des_local,
des->des_local_count, 0);
(void *) des->des_segments,
des->des_segment_count, 0);
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
send_request_pml_complete_check(sendreq);
@ -357,8 +357,8 @@ mca_pml_bfo_frag_completion( mca_btl_base_module_t* btl,
/* count bytes of user data actually delivered */
req_bytes_delivered = mca_pml_bfo_compute_segment_length (btl->btl_seg_size,
(void *) des->des_local,
des->des_local_count,
(void *) des->des_segments,
des->des_segment_count,
sizeof(mca_pml_bfo_frag_hdr_t));
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, -1);
@ -409,7 +409,7 @@ int mca_pml_bfo_send_request_start_buffered(
if( OPAL_UNLIKELY(NULL == des) ) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
segment = des->des_local;
segment = des->des_segments;
/* pack the data into the BTL supplied buffer */
iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval +
@ -562,7 +562,7 @@ int mca_pml_bfo_send_request_start_copy( mca_pml_bfo_send_request_t* sendreq,
return OMPI_ERR_OUT_OF_RESOURCE;
}
segment = des->des_local;
segment = des->des_segments;
if(size > 0) {
/* pack the data into the supplied buffer */
@ -657,7 +657,7 @@ int mca_pml_bfo_send_request_start_prepare( mca_pml_bfo_send_request_t* sendreq,
if( OPAL_UNLIKELY(NULL == des) ) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
segment = des->des_local;
segment = des->des_segments;
/* build match header */
hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval;
@ -747,7 +747,7 @@ int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq,
src->des_cbfunc = mca_pml_bfo_rget_completion;
src->des_cbdata = sendreq;
seg_size = bml_btl->btl->btl_seg_size * src->des_local_count;
seg_size = bml_btl->btl->btl_seg_size * src->des_segment_count;
/* allocate space for get hdr + segment list */
mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER,
@ -759,7 +759,7 @@ int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq,
mca_bml_base_free(bml_btl, src);
return OMPI_ERR_OUT_OF_RESOURCE;
}
segment = des->des_local;
segment = des->des_segments;
/* build match header */
hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval;
@ -775,13 +775,13 @@ int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq,
MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, "RGET");
#endif /* PML_BFO */
hdr->hdr_rget.hdr_des.pval = src;
hdr->hdr_rget.hdr_seg_cnt = src->des_local_count;
hdr->hdr_rget.hdr_seg_cnt = src->des_segment_count;
bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RGET,
sendreq->req_send.req_base.req_proc);
/* copy segment data */
memmove (&hdr->hdr_rget + 1, src->des_local, seg_size);
memmove (&hdr->hdr_rget + 1, src->des_segments, seg_size);
des->des_cbfunc = mca_pml_bfo_send_ctl_completion;
@ -808,7 +808,7 @@ int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq,
if( OPAL_UNLIKELY(NULL == des)) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
segment = des->des_local;
segment = des->des_segments;
/* build hdr */
hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval;
@ -912,7 +912,7 @@ int mca_pml_bfo_send_request_start_rndv( mca_pml_bfo_send_request_t* sendreq,
if( OPAL_UNLIKELY(NULL == des) ) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
segment = des->des_local;
segment = des->des_segments;
/* build hdr */
hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval;
@ -1145,7 +1145,7 @@ cannot_pack:
des->des_cbdata = sendreq;
/* setup header */
hdr = (mca_pml_bfo_frag_hdr_t*)des->des_local->seg_addr.pval;
hdr = (mca_pml_bfo_frag_hdr_t*)des->des_segments->seg_addr.pval;
hdr->hdr_common.hdr_flags = 0;
hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FRAG;
hdr->hdr_frag_offset = range->range_send_offset;

Просмотреть файл

@ -14,7 +14,7 @@
* Copyright (c) 2006-2008 University of Houston. All rights reserved.
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2012 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
@ -500,17 +500,17 @@ static void mca_pml_ob1_dump_hdr(mca_pml_ob1_hdr_t* hdr)
case MCA_PML_OB1_HDR_TYPE_RGET:
type = "RGET";
snprintf( header, 128, "ctx %5d src %d tag %d seq %d msg_length %" PRIu64
"seg_cnt %d hdr_des %" PRIu64,
"frag %" PRIu64 " src_ptr %" PRIu64,
hdr->hdr_rndv.hdr_match.hdr_ctx, hdr->hdr_rndv.hdr_match.hdr_src,
hdr->hdr_rndv.hdr_match.hdr_tag, hdr->hdr_rndv.hdr_match.hdr_seq,
hdr->hdr_rndv.hdr_msg_length,
hdr->hdr_rget.hdr_seg_cnt, hdr->hdr_rget.hdr_des.lval);
hdr->hdr_rndv.hdr_msg_length, hdr->hdr_rget.hdr_frag.lval,
hdr->hdr_rget.hdr_src_ptr);
break;
case MCA_PML_OB1_HDR_TYPE_ACK:
type = "ACK";
snprintf( header, 128, "src_req %p dst_req %p offset %" PRIu64,
snprintf( header, 128, "src_req %p dst_req %p offset %" PRIu64 " size %" PRIu64,
hdr->hdr_ack.hdr_src_req.pval, hdr->hdr_ack.hdr_dst_req.pval,
hdr->hdr_ack.hdr_send_offset);
hdr->hdr_ack.hdr_send_offset, hdr->hdr_ack.hdr_send_size);
break;
case MCA_PML_OB1_HDR_TYPE_FRAG:
type = "FRAG";
@ -520,10 +520,11 @@ static void mca_pml_ob1_dump_hdr(mca_pml_ob1_hdr_t* hdr)
break;
case MCA_PML_OB1_HDR_TYPE_PUT:
type = "PUT";
snprintf( header, 128, "seg_cnt %d dst_req %p src_des %p recv_req %p offset %" PRIu64 " [%p %" PRIu64 "]",
hdr->hdr_rdma.hdr_seg_cnt, hdr->hdr_rdma.hdr_req.pval, hdr->hdr_rdma.hdr_des.pval,
snprintf( header, 128, "dst_req %p src_frag %p recv_req %p offset %" PRIu64
" dst_ptr %" PRIu64 " dst_size %" PRIu64,
hdr->hdr_rdma.hdr_req.pval, hdr->hdr_rdma.hdr_frag.pval,
hdr->hdr_rdma.hdr_recv_req.pval, hdr->hdr_rdma.hdr_rdma_offset,
hdr->hdr_rdma.hdr_segs[0].seg_addr.pval, hdr->hdr_rdma.hdr_segs[0].seg_len);
hdr->hdr_rdma.hdr_dst_ptr, hdr->hdr_rdma.hdr_dst_size);
break;
case MCA_PML_OB1_HDR_TYPE_FIN:
type = "FIN";
@ -638,37 +639,32 @@ static void mca_pml_ob1_fin_completion( mca_btl_base_module_t* btl,
*/
int mca_pml_ob1_send_fin( ompi_proc_t* proc,
mca_bml_base_btl_t* bml_btl,
opal_ptr_t hdr_des,
opal_ptr_t hdr_frag,
uint64_t rdma_size,
uint8_t order,
uint32_t status )
int status )
{
mca_btl_base_descriptor_t* fin;
mca_pml_ob1_fin_hdr_t* hdr;
int rc;
mca_bml_base_alloc(bml_btl, &fin, order, sizeof(mca_pml_ob1_fin_hdr_t),
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
if(NULL == fin) {
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status);
return OMPI_ERR_OUT_OF_RESOURCE;
}
fin->des_cbfunc = mca_pml_ob1_fin_completion;
fin->des_cbdata = NULL;
/* fill in header */
hdr = (mca_pml_ob1_fin_hdr_t*)fin->des_local->seg_addr.pval;
hdr->hdr_common.hdr_flags = 0;
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN;
hdr->hdr_des = hdr_des;
hdr->hdr_fail = status;
mca_pml_ob1_fin_hdr_prepare ((mca_pml_ob1_fin_hdr_t *) fin->des_segments->seg_addr.pval,
0, hdr_frag.lval, status ? status : (int64_t) rdma_size);
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FIN, proc);
/* queue request */
rc = mca_bml_base_send( bml_btl,
fin,
MCA_PML_OB1_HDR_TYPE_FIN );
rc = mca_bml_base_send( bml_btl, fin, MCA_PML_OB1_HDR_TYPE_FIN );
if( OPAL_LIKELY( rc >= 0 ) ) {
if( OPAL_LIKELY( 1 == rc ) ) {
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
@ -676,7 +672,7 @@ int mca_pml_ob1_send_fin( ompi_proc_t* proc,
return OMPI_SUCCESS;
}
mca_bml_base_free(bml_btl, fin);
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status);
return OMPI_ERR_OUT_OF_RESOURCE;
}
@ -717,6 +713,7 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
pckt->hdr.hdr_ack.hdr_src_req.lval,
pckt->hdr.hdr_ack.hdr_dst_req.pval,
pckt->hdr.hdr_ack.hdr_send_offset,
pckt->hdr.hdr_ack.hdr_send_size,
pckt->hdr.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA);
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
@ -728,9 +725,10 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
break;
case MCA_PML_OB1_HDR_TYPE_FIN:
rc = mca_pml_ob1_send_fin(pckt->proc, send_dst,
pckt->hdr.hdr_fin.hdr_des,
pckt->hdr.hdr_fin.hdr_frag,
pckt->hdr.hdr_fin.hdr_size,
pckt->order,
pckt->hdr.hdr_fin.hdr_fail);
pckt->status);
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
return;
}

Просмотреть файл

@ -216,6 +216,7 @@ struct mca_pml_ob1_pckt_pending_t {
mca_pml_ob1_hdr_t hdr;
struct mca_bml_base_btl_t *bml_btl;
uint8_t order;
int status;
};
typedef struct mca_pml_ob1_pckt_pending_t mca_pml_ob1_pckt_pending_t;
OBJ_CLASS_DECLARATION(mca_pml_ob1_pckt_pending_t);
@ -234,17 +235,17 @@ do { \
(ompi_free_list_item_t*)pckt); \
} while(0)
#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D, B, O, S) \
#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D, Sz, B, O, S) \
do { \
mca_pml_ob1_pckt_pending_t *_pckt; \
\
MCA_PML_OB1_PCKT_PENDING_ALLOC(_pckt); \
_pckt->hdr.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN; \
_pckt->hdr.hdr_fin.hdr_des = (D); \
_pckt->hdr.hdr_fin.hdr_fail = (S); \
mca_pml_ob1_fin_hdr_prepare (&_pckt->hdr.hdr_fin, 0, \
(D).lval, (Sz)); \
_pckt->proc = (P); \
_pckt->bml_btl = (B); \
_pckt->order = (O); \
_pckt->status = (S); \
OPAL_THREAD_LOCK(&mca_pml_ob1.lock); \
opal_list_append(&mca_pml_ob1.pckt_pending, \
(opal_list_item_t*)_pckt); \
@ -253,7 +254,7 @@ do { \
int mca_pml_ob1_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
opal_ptr_t hdr_des, uint8_t order, uint32_t status);
opal_ptr_t hdr_frag, uint64_t size, uint8_t order, int status);
/* This function tries to resend FIN/ACK packets from pckt_pending queue.
* Packets are added to the queue when sending of FIN or ACK is failed due to
@ -283,20 +284,6 @@ void mca_pml_ob1_process_pending_rdma(void);
/*
* Compute the total number of bytes on supplied descriptor
*/
static inline size_t
mca_pml_ob1_compute_segment_length(size_t seg_size, void *segments,
size_t count, size_t hdrlen)
{
size_t i, length = 0;
mca_btl_base_segment_t *segment = (mca_btl_base_segment_t*)segments;
for (i = 0; i < count ; ++i) {
length += segment->seg_len;
segment = (mca_btl_base_segment_t *)((char *)segment + seg_size);
}
return (length - hdrlen);
}
static inline size_t
mca_pml_ob1_compute_segment_length_base(mca_btl_base_segment_t *segments,
size_t count, size_t hdrlen)
@ -338,7 +325,7 @@ mca_pml_ob1_compute_segment_length_remote (size_t seg_size, void *segments,
/* represent BTL chosen for sending request */
struct mca_pml_ob1_com_btl_t {
mca_bml_base_btl_t *bml_btl;
struct mca_mpool_base_registration_t* btl_reg;
struct mca_btl_base_registration_handle_t *btl_reg;
size_t length;
};
typedef struct mca_pml_ob1_com_btl_t mca_pml_ob1_com_btl_t;

Просмотреть файл

@ -11,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@ -63,6 +63,13 @@ struct mca_pml_ob1_common_hdr_t {
};
typedef struct mca_pml_ob1_common_hdr_t mca_pml_ob1_common_hdr_t;
static inline void mca_pml_ob1_common_hdr_prepare (mca_pml_ob1_common_hdr_t *hdr, uint8_t hdr_type,
uint8_t hdr_flags)
{
hdr->hdr_type = hdr_type;
hdr->hdr_flags = hdr_flags;
}
#define MCA_PML_OB1_COMMON_HDR_NTOH(h)
#define MCA_PML_OB1_COMMON_HDR_HTON(h)
@ -88,15 +95,19 @@ struct mca_pml_ob1_match_hdr_t {
typedef struct mca_pml_ob1_match_hdr_t mca_pml_ob1_match_hdr_t;
static inline void mca_pml_ob1_match_hdr_prepare (mca_pml_ob1_match_hdr_t *hdr, uint8_t hdr_type, uint8_t hdr_flags,
uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq)
{
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, hdr_type, hdr_flags);
hdr->hdr_ctx = hdr_ctx;
hdr->hdr_src = hdr_src;
hdr->hdr_tag = hdr_tag;
hdr->hdr_seq = hdr_seq;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_OB1_MATCH_HDR_FILL(h) \
do { \
(h).hdr_padding[0] = 0; \
(h).hdr_padding[1] = 0; \
} while(0)
#else
#define MCA_PML_OB1_MATCH_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
hdr->hdr_padding[0] = 0;
hdr->hdr_padding[1] = 0;
#endif
}
#define MCA_PML_OB1_MATCH_HDR_NTOH(h) \
do { \
@ -110,7 +121,6 @@ do { \
#define MCA_PML_OB1_MATCH_HDR_HTON(h) \
do { \
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
MCA_PML_OB1_MATCH_HDR_FILL(h); \
(h).hdr_ctx = htons((h).hdr_ctx); \
(h).hdr_src = htonl((h).hdr_src); \
(h).hdr_tag = htonl((h).hdr_tag); \
@ -129,12 +139,14 @@ struct mca_pml_ob1_rendezvous_hdr_t {
};
typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_OB1_RNDV_HDR_FILL(h) \
MCA_PML_OB1_MATCH_HDR_FILL((h).hdr_match)
#else
#define MCA_PML_OB1_RNDV_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
static inline void mca_pml_ob1_rendezvous_hdr_prepare (mca_pml_ob1_rendezvous_hdr_t *hdr, uint8_t hdr_type, uint8_t hdr_flags,
uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq,
uint64_t hdr_msg_length, void *hdr_src_req)
{
mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, hdr_type, hdr_flags, hdr_ctx, hdr_src, hdr_tag, hdr_seq);
hdr->hdr_msg_length = hdr_msg_length;
hdr->hdr_src_req.pval = hdr_src_req;
}
/* Note that hdr_src_req is not put in network byte order because it
is never processed by the receiver, other than being copied into
@ -148,7 +160,6 @@ typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t;
#define MCA_PML_OB1_RNDV_HDR_HTON(h) \
do { \
MCA_PML_OB1_MATCH_HDR_HTON((h).hdr_match); \
MCA_PML_OB1_RNDV_HDR_FILL(h); \
(h).hdr_msg_length = hton64((h).hdr_msg_length); \
} while (0)
@ -157,38 +168,47 @@ typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t;
*/
struct mca_pml_ob1_rget_hdr_t {
mca_pml_ob1_rendezvous_hdr_t hdr_rndv;
uint32_t hdr_seg_cnt; /**< number of segments for rdma */
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
uint8_t hdr_padding[4];
#endif
opal_ptr_t hdr_des; /**< source descriptor */
opal_ptr_t hdr_frag; /**< source fragment (for fin) */
uint64_t hdr_src_ptr; /**< source pointer */
/* btl registration handle data follows */
};
typedef struct mca_pml_ob1_rget_hdr_t mca_pml_ob1_rget_hdr_t;
static inline void mca_pml_ob1_rget_hdr_prepare (mca_pml_ob1_rget_hdr_t *hdr, uint8_t hdr_flags,
uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq,
uint64_t hdr_msg_length, void *hdr_src_req, void *hdr_frag,
void *hdr_src_ptr, void *local_handle, size_t local_handle_size)
{
mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RGET, hdr_flags,
hdr_ctx, hdr_src, hdr_tag, hdr_seq, hdr_msg_length, hdr_src_req);
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_OB1_RGET_HDR_FILL(h) \
do { \
MCA_PML_OB1_RNDV_HDR_FILL((h).hdr_rndv); \
(h).hdr_padding[0] = 0; \
(h).hdr_padding[1] = 0; \
(h).hdr_padding[2] = 0; \
(h).hdr_padding[3] = 0; \
} while(0)
#else
#define MCA_PML_OB1_RGET_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
hdr->hdr_padding[0] = 0;
hdr->hdr_padding[1] = 0;
hdr->hdr_padding[2] = 0;
hdr->hdr_padding[3] = 0;
#endif
hdr->hdr_frag.pval = hdr_frag;
hdr->hdr_src_ptr = (uint64_t)(intptr_t) hdr_src_ptr;
#define MCA_PML_OB1_RGET_HDR_NTOH(h) \
do { \
MCA_PML_OB1_RNDV_HDR_NTOH((h).hdr_rndv); \
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
/* copy registration handle */
memcpy (hdr + 1, local_handle, local_handle_size);
}
#define MCA_PML_OB1_RGET_HDR_NTOH(h) \
do { \
MCA_PML_OB1_RNDV_HDR_NTOH((h).hdr_rndv); \
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
(h).hdr_src_ptr = ntoh64((h).hdr_src_ptr); \
} while (0)
#define MCA_PML_OB1_RGET_HDR_HTON(h) \
do { \
MCA_PML_OB1_RNDV_HDR_HTON((h).hdr_rndv); \
MCA_PML_OB1_RGET_HDR_FILL(h); \
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
#define MCA_PML_OB1_RGET_HDR_HTON(h) \
do { \
MCA_PML_OB1_RNDV_HDR_HTON((h).hdr_rndv); \
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
(h).hdr_src_ptr = hton64((h).hdr_src_ptr); \
} while (0)
/**
@ -205,19 +225,23 @@ struct mca_pml_ob1_frag_hdr_t {
};
typedef struct mca_pml_ob1_frag_hdr_t mca_pml_ob1_frag_hdr_t;
static inline void mca_pml_ob1_frag_hdr_prepare (mca_pml_ob1_frag_hdr_t *hdr, uint8_t hdr_flags,
uint64_t hdr_frag_offset, void *hdr_src_req,
uint64_t hdr_dst_req)
{
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_FRAG, hdr_flags);
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_OB1_FRAG_HDR_FILL(h) \
do { \
(h).hdr_padding[0] = 0; \
(h).hdr_padding[1] = 0; \
(h).hdr_padding[2] = 0; \
(h).hdr_padding[3] = 0; \
(h).hdr_padding[4] = 0; \
(h).hdr_padding[5] = 0; \
} while(0)
#else
#define MCA_PML_OB1_FRAG_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
hdr->hdr_padding[0] = 0;
hdr->hdr_padding[1] = 0;
hdr->hdr_padding[2] = 0;
hdr->hdr_padding[3] = 0;
hdr->hdr_padding[4] = 0;
hdr->hdr_padding[5] = 0;
#endif
hdr->hdr_frag_offset = hdr_frag_offset;
hdr->hdr_src_req.pval = hdr_src_req;
hdr->hdr_dst_req.lval = hdr_dst_req;
}
#define MCA_PML_OB1_FRAG_HDR_NTOH(h) \
do { \
@ -228,7 +252,6 @@ do { \
#define MCA_PML_OB1_FRAG_HDR_HTON(h) \
do { \
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
MCA_PML_OB1_FRAG_HDR_FILL(h); \
(h).hdr_frag_offset = hton64((h).hdr_frag_offset); \
} while (0)
@ -244,38 +267,45 @@ struct mca_pml_ob1_ack_hdr_t {
opal_ptr_t hdr_src_req; /**< source request */
opal_ptr_t hdr_dst_req; /**< matched receive request */
uint64_t hdr_send_offset; /**< starting point of copy in/out */
uint64_t hdr_send_size; /**< number of bytes requested (0: all remaining) */
};
typedef struct mca_pml_ob1_ack_hdr_t mca_pml_ob1_ack_hdr_t;
static inline void mca_pml_ob1_ack_hdr_prepare (mca_pml_ob1_ack_hdr_t *hdr, uint8_t hdr_flags,
uint64_t hdr_src_req, void *hdr_dst_req,
uint64_t hdr_send_offset, uint64_t hdr_send_size)
{
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_ACK, hdr_flags);
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_OB1_ACK_HDR_FILL(h) \
do { \
(h).hdr_padding[0] = 0; \
(h).hdr_padding[1] = 0; \
(h).hdr_padding[2] = 0; \
(h).hdr_padding[3] = 0; \
(h).hdr_padding[4] = 0; \
(h).hdr_padding[5] = 0; \
} while (0)
#else
#define MCA_PML_OB1_ACK_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
hdr->hdr_padding[0] = 0;
hdr->hdr_padding[1] = 0;
hdr->hdr_padding[2] = 0;
hdr->hdr_padding[3] = 0;
hdr->hdr_padding[4] = 0;
hdr->hdr_padding[5] = 0;
#endif
hdr->hdr_src_req.lval = hdr_src_req;
hdr->hdr_dst_req.pval = hdr_dst_req;
hdr->hdr_send_offset = hdr_send_offset;
hdr->hdr_send_size = hdr_send_size;
}
/* Note that the request headers are not put in NBO because the
src_req is already in receiver's byte order and the dst_req is not
used by the receiver for anything other than backpointers in return
headers */
#define MCA_PML_OB1_ACK_HDR_NTOH(h) \
do { \
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
#define MCA_PML_OB1_ACK_HDR_NTOH(h) \
do { \
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
(h).hdr_send_offset = ntoh64((h).hdr_send_offset); \
(h).hdr_send_size = ntoh64((h).hdr_send_size); \
} while (0)
#define MCA_PML_OB1_ACK_HDR_HTON(h) \
do { \
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
MCA_PML_OB1_ACK_HDR_FILL(h); \
#define MCA_PML_OB1_ACK_HDR_HTON(h) \
do { \
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
(h).hdr_send_offset = hton64((h).hdr_send_offset); \
(h).hdr_send_size = hton64((h).hdr_send_size); \
} while (0)
/**
@ -287,38 +317,55 @@ struct mca_pml_ob1_rdma_hdr_t {
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
uint8_t hdr_padding[2]; /** two to pad out the hdr to a 4 byte alignment. hdr_req will then be 8 byte aligned after 4 for hdr_seg_cnt */
#endif
uint32_t hdr_seg_cnt; /**< number of segments for rdma */
/* TODO: add real support for multiple destination segments */
opal_ptr_t hdr_req; /**< destination request */
opal_ptr_t hdr_des; /**< source descriptor */
opal_ptr_t hdr_frag; /**< receiver fragment */
opal_ptr_t hdr_recv_req; /**< receive request (NTH: needed for put fallback on send) */
uint64_t hdr_rdma_offset; /**< current offset into user buffer */
mca_btl_base_segment_t hdr_segs[1]; /**< list of segments for rdma */
uint64_t hdr_rdma_offset; /**< current offset into user buffer */
uint64_t hdr_dst_ptr; /**< destination address */
uint64_t hdr_dst_size; /**< destination size */
/* registration data follows */
};
typedef struct mca_pml_ob1_rdma_hdr_t mca_pml_ob1_rdma_hdr_t;
static inline void mca_pml_ob1_rdma_hdr_prepare (mca_pml_ob1_rdma_hdr_t *hdr, uint8_t hdr_flags,
uint64_t hdr_req, void *hdr_frag, void *hdr_recv_req,
uint64_t hdr_rdma_offset, void *hdr_dst_ptr,
uint64_t hdr_dst_size, void *local_handle,
size_t local_handle_size)
{
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_PUT, hdr_flags);
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_OB1_RDMA_HDR_FILL(h) \
do { \
(h).hdr_padding[0] = 0; \
(h).hdr_padding[1] = 0; \
} while(0)
#else
#define MCA_PML_OB1_RDMA_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
hdr->hdr_padding[0] = 0;
hdr->hdr_padding[1] = 0;
#endif
hdr->hdr_req.lval = hdr_req;
hdr->hdr_frag.pval = hdr_frag;
hdr->hdr_recv_req.pval = hdr_recv_req;
hdr->hdr_rdma_offset = hdr_rdma_offset;
hdr->hdr_dst_ptr = (uint64_t)(intptr_t) hdr_dst_ptr;
hdr->hdr_dst_size = hdr_dst_size;
#define MCA_PML_OB1_RDMA_HDR_NTOH(h) \
do { \
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
/* copy segments */
memcpy (hdr + 1, local_handle, local_handle_size);
}
#define MCA_PML_OB1_RDMA_HDR_NTOH(h) \
do { \
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
(h).hdr_rdma_offset = ntoh64((h).hdr_rdma_offset); \
(h).hdr_dst_ptr = ntoh64((h).hdr_dst_ptr); \
(h).hdr_dst_size = ntoh64((h).hdr_dst_size); \
} while (0)
#define MCA_PML_OB1_RDMA_HDR_HTON(h) \
do { \
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
MCA_PML_OB1_RDMA_HDR_FILL(h); \
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
#define MCA_PML_OB1_RDMA_HDR_HTON(h) \
do { \
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
(h).hdr_rdma_offset = hton64((h).hdr_rdma_offset); \
(h).hdr_dst_ptr = hton64((h).hdr_dst_ptr); \
(h).hdr_dst_size = hton64((h).hdr_dst_size); \
} while (0)
/**
@ -330,31 +377,34 @@ struct mca_pml_ob1_fin_hdr_t {
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
uint8_t hdr_padding[2];
#endif
uint32_t hdr_fail; /**< RDMA operation failed */
opal_ptr_t hdr_des; /**< completed descriptor */
int64_t hdr_size; /**< number of bytes completed (positive), error code (negative) */
opal_ptr_t hdr_frag; /**< completed RDMA fragment */
};
typedef struct mca_pml_ob1_fin_hdr_t mca_pml_ob1_fin_hdr_t;
static inline void mca_pml_ob1_fin_hdr_prepare (mca_pml_ob1_fin_hdr_t *hdr, uint8_t hdr_flags,
uint64_t hdr_frag, int64_t hdr_size)
{
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_FIN, hdr_flags);
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_OB1_FIN_HDR_FILL(h) \
do { \
(h).hdr_padding[0] = 0; \
(h).hdr_padding[1] = 0; \
} while (0)
#else
#define MCA_PML_OB1_FIN_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
hdr->hdr_padding[0] = 0;
hdr->hdr_padding[1] = 0;
#endif
hdr->hdr_frag.lval = hdr_frag;
hdr->hdr_size = hdr_size;
}
#define MCA_PML_OB1_FIN_HDR_NTOH(h) \
do { \
#define MCA_PML_OB1_FIN_HDR_NTOH(h) \
do { \
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
(h).hdr_size = ntoh64((h).hdr_size); \
} while (0)
#define MCA_PML_OB1_FIN_HDR_HTON(h) \
do { \
#define MCA_PML_OB1_FIN_HDR_HTON(h) \
do { \
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
MCA_PML_OB1_FIN_HDR_FILL(h); \
} while (0)
(h).hdr_size = hton64((h).hdr_size); \
} while (0)
/**
* Union of defined hdr types.

Просмотреть файл

@ -66,7 +66,6 @@ static inline int mca_pml_ob1_send_inline (void *buf, size_t count,
ompi_proc_t *dst_proc, mca_bml_base_endpoint_t* endpoint,
ompi_communicator_t * comm)
{
mca_btl_base_descriptor_t *des = NULL;
mca_pml_ob1_match_hdr_t match;
mca_bml_base_btl_t *bml_btl;
OPAL_PTRDIFF_TYPE lb, extent;
@ -94,28 +93,21 @@ static inline int mca_pml_ob1_send_inline (void *buf, size_t count,
opal_convertor_get_packed_size (&convertor, &size);
}
match.hdr_common.hdr_flags = 0;
match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
match.hdr_ctx = comm->c_contextid;
match.hdr_src = comm->c_my_rank;
match.hdr_tag = tag;
match.hdr_seq = seqn;
mca_pml_ob1_match_hdr_prepare (&match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
comm->c_contextid, comm->c_my_rank,
tag, seqn);
ob1_hdr_hton(&match, MCA_PML_OB1_HDR_TYPE_MATCH, dst_proc);
/* try to send immediately */
rc = mca_bml_base_sendi (bml_btl, &convertor, &match, OMPI_PML_OB1_MATCH_HDR_LEN,
size, MCA_BTL_NO_ORDER, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
MCA_PML_OB1_HDR_TYPE_MATCH, &des);
MCA_PML_OB1_HDR_TYPE_MATCH, NULL);
if (count > 0) {
opal_convertor_cleanup (&convertor);
}
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
if (des) {
mca_bml_base_free (bml_btl, des);
}
return rc;
}
@ -220,7 +212,7 @@ int mca_pml_ob1_send(void *buf,
OBJ_CONSTRUCT(sendreq, mca_pml_ob1_send_request_t);
sendreq->req_send.req_base.req_proc = dst_proc;
sendreq->src_des = NULL;
sendreq->rdma_frag = NULL;
MCA_PML_OB1_SEND_REQUEST_INIT(sendreq,
buf,

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
@ -9,6 +10,8 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -27,11 +30,6 @@
#include "pml_ob1.h"
#include "pml_ob1_rdma.h"
/* Use this registration if no registration needed for a BTL instead of NULL.
* This will help other code to distinguish case when memory is not registered
* from case when registration is not needed */
static mca_mpool_base_registration_t pml_ob1_dummy_reg;
/*
* Check to see if memory is registered or can be registered. Build a
* set of registrations on the request.
@ -45,7 +43,7 @@ size_t mca_pml_ob1_rdma_btls(
{
int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
double weight_total = 0;
int num_btls_used = 0, n;
int num_btls_used = 0;
/* shortcut when there are no rdma capable btls */
if(num_btls == 0) {
@ -53,29 +51,25 @@ size_t mca_pml_ob1_rdma_btls(
}
/* check to see if memory is registered */
for(n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request;
n++) {
for (int n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request; n++) {
mca_bml_base_btl_t* bml_btl =
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma,
(bml_endpoint->btl_rdma_index + n) % num_btls);
mca_mpool_base_registration_t* reg = &pml_ob1_dummy_reg;
mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;
(bml_endpoint->btl_rdma_index + n) % num_btls);
mca_btl_base_registration_handle_t *reg_handle = NULL;
mca_btl_base_module_t *btl = bml_btl->btl;
if( NULL != btl_mpool ) {
if(!mca_pml_ob1.leave_pinned) {
/* look through existing registrations */
btl_mpool->mpool_find(btl_mpool, base, size, &reg);
} else {
/* register the memory */
btl_mpool->mpool_register(btl_mpool, base, size, 0, &reg);
}
if(NULL == reg)
if (btl->btl_register_mem) {
/* try to register the memory with the btl */
reg_handle = btl->btl_register_mem (btl, bml_btl->btl_endpoint, base,
size, MCA_BTL_REG_FLAG_REMOTE_READ);
if (NULL == reg_handle) {
/* btl requires registration but the registration failed */
continue;
}
}
} /* else no registration is needed */
rdma_btls[num_btls_used].bml_btl = bml_btl;
rdma_btls[num_btls_used].btl_reg = reg;
rdma_btls[num_btls_used].btl_reg = reg_handle;
weight_total += bml_btl->btl_weight;
num_btls_used++;
}
@ -83,7 +77,7 @@ size_t mca_pml_ob1_rdma_btls(
/* if we don't use leave_pinned and all BTLs that already have this memory
* registered amount to less then half of available bandwidth - fall back to
* pipeline protocol */
if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5))
if (0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5))
return 0;
mca_pml_ob1_calc_weighted_length(rdma_btls, num_btls_used, size,
@ -103,10 +97,6 @@ size_t mca_pml_ob1_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint,
for(i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; i++) {
rdma_btls[i].bml_btl =
mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
if(NULL != rdma_btls[i].bml_btl->btl->btl_mpool)
rdma_btls[i].btl_reg = NULL;
else
rdma_btls[i].btl_reg = &pml_ob1_dummy_reg;
weight_total += rdma_btls[i].bml_btl->btl_weight;
}

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
@ -9,6 +10,8 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -21,9 +24,13 @@
#include "pml_ob1.h"
#include "pml_ob1_rdmafrag.h"
static void mca_pml_ob1_rdma_frag_constructor (mca_pml_ob1_rdma_frag_t *frag)
{
frag->local_handle = NULL;
}
OBJ_CLASS_INSTANCE(
mca_pml_ob1_rdma_frag_t,
ompi_free_list_item_t,
NULL,
mca_pml_ob1_rdma_frag_constructor,
NULL);

Просмотреть файл

@ -10,6 +10,8 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -32,38 +34,52 @@ typedef enum {
MCA_PML_OB1_RDMA_GET
} mca_pml_ob1_rdma_state_t;
struct mca_pml_ob1_rdma_frag_t;
typedef void (*mca_pml_ob1_rdma_frag_callback_t)(struct mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_length);
/**
* Used to keep track of local and remote RDMA operations.
*/
struct mca_pml_ob1_rdma_frag_t {
ompi_free_list_item_t super;
mca_bml_base_btl_t* rdma_bml;
mca_bml_base_btl_t *rdma_bml;
mca_pml_ob1_hdr_t rdma_hdr;
mca_pml_ob1_rdma_state_t rdma_state;
size_t rdma_length;
uint8_t rdma_segs[MCA_BTL_SEG_MAX_SIZE * MCA_BTL_DES_MAX_SEGMENTS];
void *rdma_req;
struct mca_bml_base_endpoint_t* rdma_ep;
opal_convertor_t convertor;
mca_mpool_base_registration_t* reg;
uint32_t retries;
mca_pml_ob1_rdma_frag_callback_t cbfunc;
uint64_t rdma_offset;
void *local_address;
mca_btl_base_registration_handle_t *local_handle;
uint64_t remote_address;
uint8_t remote_handle[MCA_BTL_REG_HANDLE_MAX_SIZE];
};
typedef struct mca_pml_ob1_rdma_frag_t mca_pml_ob1_rdma_frag_t;
OBJ_CLASS_DECLARATION(mca_pml_ob1_rdma_frag_t);
#define MCA_PML_OB1_RDMA_FRAG_ALLOC(frag) \
do { \
ompi_free_list_item_t* item; \
#define MCA_PML_OB1_RDMA_FRAG_ALLOC(frag) \
do { \
ompi_free_list_item_t* item; \
OMPI_FREE_LIST_WAIT_MT(&mca_pml_ob1.rdma_frags, item); \
frag = (mca_pml_ob1_rdma_frag_t*)item; \
} while(0)
#define MCA_PML_OB1_RDMA_FRAG_RETURN(frag) \
do { \
/* return fragment */ \
OMPI_FREE_LIST_RETURN_MT(&mca_pml_ob1.rdma_frags, \
(ompi_free_list_item_t*)frag); \
frag = (mca_pml_ob1_rdma_frag_t*)item; \
} while(0)
#define MCA_PML_OB1_RDMA_FRAG_RETURN(frag) \
do { \
/* return fragment */ \
if (frag->local_handle) { \
mca_bml_base_deregister_mem (frag->rdma_bml, frag->local_handle); \
frag->local_handle = NULL; \
} \
OMPI_FREE_LIST_RETURN_MT(&mca_pml_ob1.rdma_frags, \
(ompi_free_list_item_t*)frag); \
} while (0)
END_C_DECLS

Просмотреть файл

@ -108,13 +108,13 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des,
void* cbdata )
{
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_ob1_match_hdr_t* hdr = (mca_pml_ob1_match_hdr_t*)segments->seg_addr.pval;
ompi_communicator_t *comm_ptr;
mca_pml_ob1_recv_request_t *match = NULL;
mca_pml_ob1_comm_t *comm;
mca_pml_ob1_comm_proc_t *proc;
size_t num_segments = des->des_local_count;
size_t num_segments = des->des_segment_count;
size_t bytes_received = 0;
assert(num_segments <= MCA_BTL_DES_MAX_SEGMENTS);
@ -256,7 +256,7 @@ void mca_pml_ob1_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des,
void* cbdata )
{
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
@ -264,7 +264,7 @@ void mca_pml_ob1_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
}
ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_RNDV);
mca_pml_ob1_recv_frag_match(btl, &hdr->hdr_match, segments,
des->des_local_count, MCA_PML_OB1_HDR_TYPE_RNDV);
des->des_segment_count, MCA_PML_OB1_HDR_TYPE_RNDV);
return;
}
@ -273,7 +273,7 @@ void mca_pml_ob1_recv_frag_callback_rget(mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des,
void* cbdata )
{
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
@ -281,7 +281,7 @@ void mca_pml_ob1_recv_frag_callback_rget(mca_btl_base_module_t* btl,
}
ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_RGET);
mca_pml_ob1_recv_frag_match(btl, &hdr->hdr_match, segments,
des->des_local_count, MCA_PML_OB1_HDR_TYPE_RGET);
des->des_segment_count, MCA_PML_OB1_HDR_TYPE_RGET);
return;
}
@ -292,9 +292,10 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des,
void* cbdata )
{
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
mca_pml_ob1_send_request_t* sendreq;
size_t size;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
return;
@ -307,19 +308,25 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
/* if the request should be delivered entirely by copy in/out
* then throttle sends */
if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA) {
if (NULL != sendreq->src_des) {
/* release registered memory */
mca_bml_base_free (sendreq->req_rdma[0].bml_btl, sendreq->src_des);
sendreq->src_des = NULL;
if (NULL != sendreq->rdma_frag) {
if (NULL != sendreq->rdma_frag->local_handle) {
mca_bml_base_deregister_mem (sendreq->req_rdma[0].bml_btl, sendreq->rdma_frag->local_handle);
sendreq->rdma_frag->local_handle = NULL;
}
MCA_PML_OB1_RDMA_FRAG_RETURN(sendreq->rdma_frag);
sendreq->rdma_frag = NULL;
}
sendreq->req_throttle_sends = true;
}
mca_pml_ob1_send_request_copy_in_out(sendreq,
hdr->hdr_ack.hdr_send_offset,
sendreq->req_send.req_bytes_packed -
hdr->hdr_ack.hdr_send_offset);
if (hdr->hdr_ack.hdr_send_size) {
size = hdr->hdr_ack.hdr_send_size;
} else {
size = sendreq->req_send.req_bytes_packed - hdr->hdr_ack.hdr_send_offset;
}
mca_pml_ob1_send_request_copy_in_out(sendreq, hdr->hdr_ack.hdr_send_offset, size);
if (sendreq->req_state != 0) {
/* Typical receipt of an ACK message causes req_state to be
@ -355,13 +362,14 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
mca_pml_ob1_recv_request_t* recvreq;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
return;
}
ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FRAG);
recvreq = (mca_pml_ob1_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
@ -372,7 +380,7 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
assert(btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV);
/* This will trigger the opal_convertor_pack to start asynchronous copy. */
mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,des->des_local_count,des);
mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,des->des_segment_count,des);
/* Let BTL know that it CANNOT free the frag */
des->des_flags |= MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC;
@ -380,7 +388,8 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
return;
}
#endif /* OPAL_CUDA_SUPPORT */
mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_local_count);
mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_segment_count);
return;
}
@ -390,7 +399,7 @@ void mca_pml_ob1_recv_frag_callback_put(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
mca_pml_ob1_send_request_t* sendreq;
@ -410,20 +419,17 @@ void mca_pml_ob1_recv_frag_callback_fin(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local;
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
mca_btl_base_descriptor_t* rdma;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_ob1_fin_hdr_t* hdr = (mca_pml_ob1_fin_hdr_t *) segments->seg_addr.pval;
mca_pml_ob1_rdma_frag_t *frag;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_fin_hdr_t)) ) {
return;
}
ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FIN);
rdma = (mca_btl_base_descriptor_t*)hdr->hdr_fin.hdr_des.pval;
rdma->des_cbfunc(btl, NULL, rdma,
hdr->hdr_fin.hdr_fail ? OMPI_ERROR : OMPI_SUCCESS);
return;
frag = (mca_pml_ob1_rdma_frag_t *) hdr->hdr_frag.pval;
frag->cbfunc (frag, hdr->hdr_size);
}
@ -699,7 +705,7 @@ out_of_order_match:
OPAL_THREAD_UNLOCK(&comm->matching_lock);
if(OPAL_LIKELY(match)) {
switch(type) {
switch(type) {
case MCA_PML_OB1_HDR_TYPE_MATCH:
mca_pml_ob1_recv_request_progress_match(match, btl, segments, num_segments);
break;

Просмотреть файл

@ -13,7 +13,7 @@
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2012 FUJITSU LIMITED. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
@ -150,12 +150,17 @@ static void mca_pml_ob1_recv_request_construct(mca_pml_ob1_recv_request_t* reque
request->req_recv.req_base.req_ompi.req_free = mca_pml_ob1_recv_request_free;
request->req_recv.req_base.req_ompi.req_cancel = mca_pml_ob1_recv_request_cancel;
request->req_rdma_cnt = 0;
request->local_handle = NULL;
OBJ_CONSTRUCT(&request->lock, opal_mutex_t);
}
static void mca_pml_ob1_recv_request_destruct(mca_pml_ob1_recv_request_t* request)
{
OBJ_DESTRUCT(&request->lock);
if (OPAL_UNLIKELY(request->local_handle)) {
mca_bml_base_deregister_mem (request->rdma_bml, request->local_handle);
request->local_handle = NULL;
}
}
OBJ_CLASS_INSTANCE(
@ -183,31 +188,27 @@ static void mca_pml_ob1_recv_ctl_completion( mca_btl_base_module_t* btl,
* Put operation has completed remotely - update request status
*/
static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_descriptor_t* des,
int status )
static void mca_pml_ob1_put_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_size)
{
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)des->des_cbdata;
size_t bytes_received = 0;
mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
if( OPAL_LIKELY(status == OMPI_SUCCESS) ) {
bytes_received = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
(void *) des->des_local,
des->des_local_count, 0);
}
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,-1);
mca_bml_base_free(bml_btl, des);
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
/* check completion status */
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received);
if(recv_request_pml_complete_check(recvreq) == false &&
if (OPAL_LIKELY(0 < rdma_size)) {
assert ((uint64_t) rdma_size == frag->rdma_length);
/* check completion status */
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, (size_t) rdma_size);
if (recv_request_pml_complete_check(recvreq) == false &&
recvreq->req_rdma_offset < recvreq->req_send_offset) {
/* schedule additional rdma operations */
mca_pml_ob1_recv_request_schedule(recvreq, bml_btl);
/* schedule additional rdma operations */
mca_pml_ob1_recv_request_schedule(recvreq, bml_btl);
}
}
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
}
@ -218,7 +219,7 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
int mca_pml_ob1_recv_request_ack_send_btl(
ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset,
bool nordma)
uint64_t size, bool nordma)
{
mca_btl_base_descriptor_t* des;
mca_pml_ob1_ack_hdr_t* ack;
@ -233,12 +234,9 @@ int mca_pml_ob1_recv_request_ack_send_btl(
}
/* fill out header */
ack = (mca_pml_ob1_ack_hdr_t*)des->des_local->seg_addr.pval;
ack->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_ACK;
ack->hdr_common.hdr_flags = nordma ? MCA_PML_OB1_HDR_FLAGS_NORDMA : 0;
ack->hdr_src_req.lval = hdr_src_req;
ack->hdr_dst_req.pval = hdr_dst_req;
ack->hdr_send_offset = hdr_send_offset;
ack = (mca_pml_ob1_ack_hdr_t*)des->des_segments->seg_addr.pval;
mca_pml_ob1_ack_hdr_prepare (ack, nordma ? MCA_PML_OB1_HDR_FLAGS_NORDMA : 0,
hdr_src_req, hdr_dst_req, hdr_send_offset, size);
ob1_hdr_hton(ack, MCA_PML_OB1_HDR_TYPE_ACK, proc);
@ -312,63 +310,99 @@ static int mca_pml_ob1_recv_request_ack(
if(recvreq->req_send_offset == hdr->hdr_msg_length)
return OMPI_SUCCESS;
}
/* let know to shedule function there is no need to put ACK flag */
recvreq->req_ack_sent = true;
return mca_pml_ob1_recv_request_ack_send(proc, hdr->hdr_src_req.lval,
recvreq, recvreq->req_send_offset,
recvreq, recvreq->req_send_offset, 0,
recvreq->req_send_offset == bytes_received);
}
static int mca_pml_ob1_recv_request_put_frag (mca_pml_ob1_rdma_frag_t *frag);
static int mca_pml_ob1_recv_request_get_frag_failed (mca_pml_ob1_rdma_frag_t *frag, int rc)
{
mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
ompi_proc_t *proc = (ompi_proc_t *) recvreq->req_recv.req_base.req_proc;
if (OMPI_ERR_NOT_AVAILABLE == rc) {
/* get isn't supported for this transfer. tell peer to fallback on put */
rc = mca_pml_ob1_recv_request_put_frag (frag);
if (OMPI_ERR_OUT_OF_RESOURCE == rc) {
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
opal_list_append (&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
return OMPI_SUCCESS;
}
}
if (++frag->retries < mca_pml_ob1.rdma_retries_limit &&
OMPI_ERR_OUT_OF_RESOURCE == rc) {
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
return OMPI_SUCCESS;
}
/* tell peer to fall back on send for this region */
rc = mca_pml_ob1_recv_request_ack_send(proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval,
recvreq, frag->rdma_offset, frag->rdma_length, false);
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
return rc;
}
/**
* Return resources used by the RDMA
*/
static void mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_descriptor_t* des,
int status )
static void mca_pml_ob1_rget_completion (mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep,
void *local_address, mca_btl_base_registration_handle_t *local_handle,
void *context, void *cbdata, int status)
{
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
mca_pml_ob1_rdma_frag_t* frag = (mca_pml_ob1_rdma_frag_t*)des->des_cbdata;
mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)frag->rdma_req;
mca_bml_base_btl_t *bml_btl = (mca_bml_base_btl_t *) context;
mca_pml_ob1_rdma_frag_t *frag = (mca_pml_ob1_rdma_frag_t *) cbdata;
mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
/* check completion status */
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
/* TSW - FIX */
OMPI_ERROR_LOG(status);
ompi_rte_abort(-1, NULL);
if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
status = mca_pml_ob1_recv_request_get_frag_failed (frag, status);
if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
/* TSW - FIX */
OMPI_ERROR_LOG(status);
ompi_rte_abort(-1, NULL);
}
} else {
/* is receive request complete */
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length);
/* TODO: re-add order */
mca_pml_ob1_send_fin (recvreq->req_recv.req_base.req_proc,
bml_btl, frag->rdma_hdr.hdr_rget.hdr_frag,
frag->rdma_length, 0, 0);
recv_request_pml_complete_check(recvreq);
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
}
/* is receive request complete */
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length);
if (recvreq->req_recv.req_bytes_packed <= recvreq->req_bytes_received) {
mca_pml_ob1_send_fin(recvreq->req_recv.req_base.req_proc,
bml_btl,
frag->rdma_hdr.hdr_rget.hdr_des,
des->order, 0);
}
recv_request_pml_complete_check(recvreq);
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
}
static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag,
mca_btl_base_descriptor_t *dst) {
static int mca_pml_ob1_recv_request_put_frag (mca_pml_ob1_rdma_frag_t *frag)
{
mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
mca_btl_base_descriptor_t *ctl;
mca_pml_ob1_rdma_hdr_t *hdr;
size_t seg_size;
size_t reg_size;
int rc;
seg_size = bml_btl->btl->btl_seg_size * dst->des_local_count;
reg_size = bml_btl->btl->btl_registration_handle_size;
/* prepare a descriptor for rdma control message */
mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof (mca_pml_ob1_rdma_hdr_t) + seg_size,
mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof (mca_pml_ob1_rdma_hdr_t) + reg_size,
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
if (OPAL_UNLIKELY(NULL == ctl)) {
@ -377,26 +411,19 @@ static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag,
ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion;
/* fill in rdma header */
hdr = (mca_pml_ob1_rdma_hdr_t *) ctl->des_local->seg_addr.pval;
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT;
hdr->hdr_common.hdr_flags =
(!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0;
hdr = (mca_pml_ob1_rdma_hdr_t *) ctl->des_segments->seg_addr.pval;
mca_pml_ob1_rdma_hdr_prepare (hdr, (!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0,
recvreq->remote_req_send.lval, frag, recvreq, frag->rdma_offset,
frag->local_address, frag->rdma_length, frag->local_handle,
reg_size);
hdr->hdr_req = frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req;
hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
hdr->hdr_des.pval = dst;
hdr->hdr_recv_req.pval = recvreq;
frag->cbfunc = mca_pml_ob1_put_completion;
hdr->hdr_seg_cnt = dst->des_local_count;
recvreq->req_ack_sent = true;
/* copy segments */
memcpy (hdr + 1, dst->des_local, seg_size);
dst->des_cbfunc = mca_pml_ob1_put_completion;
dst->des_cbdata = recvreq;
if (!recvreq->req_ack_sent)
recvreq->req_ack_sent = true;
PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
&(recvreq->req_recv.req_base), size,
PERUSE_RECV);
/* send rdma request to peer */
rc = mca_bml_base_send (bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT);
@ -411,71 +438,38 @@ static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag,
/*
*
*/
int mca_pml_ob1_recv_request_get_frag( mca_pml_ob1_rdma_frag_t* frag )
int mca_pml_ob1_recv_request_get_frag (mca_pml_ob1_rdma_frag_t *frag)
{
mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)frag->rdma_req;
mca_bml_base_btl_t* bml_btl = frag->rdma_bml;
mca_btl_base_descriptor_t* descriptor;
size_t save_size = frag->rdma_length;
mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
mca_btl_base_registration_handle_t *local_handle = NULL;
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
int rc;
/* prepare descriptor */
mca_bml_base_prepare_dst( bml_btl,
NULL,
&recvreq->req_recv.req_base.req_convertor,
MCA_BTL_NO_ORDER,
0,
&frag->rdma_length,
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK |
MCA_BTL_DES_FLAGS_GET,
&descriptor );
if( OPAL_UNLIKELY(NULL == descriptor) ) {
if (frag->retries < mca_pml_ob1.rdma_retries_limit) {
frag->rdma_length = save_size;
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
return OMPI_ERR_OUT_OF_RESOURCE;
} else {
ompi_proc_t *proc = (ompi_proc_t *) recvreq->req_recv.req_base.req_proc;
/* tell peer to fall back on send */
recvreq->req_send_offset = 0;
rc = mca_pml_ob1_recv_request_ack_send(proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval,
recvreq, recvreq->req_send_offset, true);
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
return rc;
if (bml_btl->btl->btl_register_mem && !frag->local_handle && !recvreq->local_handle) {
mca_bml_base_register_mem (bml_btl, frag->local_address, frag->rdma_length, MCA_BTL_REG_FLAG_LOCAL_WRITE |
MCA_BTL_REG_FLAG_REMOTE_WRITE, &frag->local_handle);
if (OPAL_UNLIKELY(NULL == frag->local_handle)) {
return mca_pml_ob1_recv_request_get_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE);
}
}
descriptor->des_remote = (mca_btl_base_segment_t *) frag->rdma_segs;
descriptor->des_remote_count = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt;
descriptor->des_cbfunc = mca_pml_ob1_rget_completion;
descriptor->des_cbdata = frag;
if (frag->local_handle) {
local_handle = frag->local_handle;
} else if (recvreq->local_handle) {
local_handle = recvreq->local_handle;
}
PERUSE_TRACE_COMM_OMPI_EVENT(PERUSE_COMM_REQ_XFER_CONTINUE,
&(recvreq->req_recv.req_base),
&(((mca_pml_ob1_recv_request_t *) frag->rdma_req)->req_recv.req_base),
frag->rdma_length, PERUSE_RECV);
/* queue up get request */
rc = mca_bml_base_get(bml_btl,descriptor);
rc = mca_bml_base_get (bml_btl, frag->local_address, frag->remote_address, local_handle,
(mca_btl_base_registration_handle_t *) frag->remote_handle, frag->rdma_length,
0, MCA_BTL_NO_ORDER, mca_pml_ob1_rget_completion, frag);
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
if (OPAL_UNLIKELY(OMPI_ERR_NOT_AVAILABLE == rc)) {
/* get isn't supported for this transfer. tell peer to fallback on put */
rc = mca_pml_ob1_init_get_fallback (frag, descriptor);
}
if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
mca_bml_base_free(bml_btl, descriptor);
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
opal_list_append(&mca_pml_ob1.rdma_pending,
(opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
return OMPI_ERR_OUT_OF_RESOURCE;
} else if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
OMPI_ERROR_LOG(rc);
ompi_rte_abort(-1, NULL);
}
return mca_pml_ob1_recv_request_get_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE);
}
return OMPI_SUCCESS;
@ -501,6 +495,7 @@ void mca_pml_ob1_recv_request_progress_frag( mca_pml_ob1_recv_request_t* recvreq
bytes_received = mca_pml_ob1_compute_segment_length_base (segments, num_segments,
sizeof(mca_pml_ob1_frag_hdr_t));
data_offset = hdr->hdr_frag.hdr_frag_offset;
/*
* Make user buffer accessible(defined) before unpacking.
*/
@ -628,7 +623,6 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
mca_pml_ob1_rget_hdr_t* hdr = (mca_pml_ob1_rget_hdr_t*)segments->seg_addr.pval;
mca_bml_base_endpoint_t* bml_endpoint = NULL;
size_t bytes_remaining, prev_sent, offset;
mca_btl_base_segment_t *r_segments;
mca_pml_ob1_rdma_frag_t *frag;
mca_bml_base_btl_t *rdma_bml;
int rc;
@ -636,6 +630,7 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
prev_sent = offset = 0;
bytes_remaining = hdr->hdr_rndv.hdr_msg_length;
recvreq->req_recv.req_bytes_packed = hdr->hdr_rndv.hdr_msg_length;
recvreq->req_send_offset = 0;
MCA_PML_OB1_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_rndv.hdr_match);
@ -679,8 +674,28 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
ompi_rte_abort(-1, NULL);
}
bytes_remaining = mca_pml_ob1_compute_segment_length_remote (btl->btl_seg_size, (void *)(hdr + 1),
hdr->hdr_seg_cnt, recvreq->req_recv.req_base.req_proc);
bytes_remaining = hdr->hdr_rndv.hdr_msg_length;
/* save the request for put fallback */
recvreq->remote_req_send = hdr->hdr_rndv.hdr_src_req;
recvreq->rdma_bml = rdma_bml;
/* try to register the entire buffer */
if (rdma_bml->btl->btl_register_mem) {
void *data_ptr;
offset = 0;
OPAL_THREAD_LOCK(&recvreq->lock);
opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, &offset);
opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &data_ptr);
OPAL_THREAD_UNLOCK(&recvreq->lock);
mca_bml_base_register_mem (rdma_bml, data_ptr, bytes_remaining, MCA_BTL_REG_FLAG_LOCAL_WRITE |
MCA_BTL_REG_FLAG_REMOTE_WRITE, &recvreq->local_handle);
/* It is not an error if the memory region can not be registered here. The registration will
* be attempted again for each get fragment. */
}
/* The while loop adds a fragmentation mechanism. The variable bytes_remaining holds the num
* of bytes left to be send. In each iteration we send the max possible bytes supported
@ -689,7 +704,12 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
* the next iteration with the updated size.
* Also - In each iteration we update the location in the buffer to be used for writing
* the message ,and the location to read from. This is done using the offset variable that
* accumulates the number of bytes that were sent so far. */
* accumulates the number of bytes that were sent so far.
*
* NTH: This fragmentation may go away if we change the btls to require them to handle
* get fragmentation internally. This is a reasonable solution since some btls do not
* need any fragmentation (sm, vader, self, etc). Remove this loop if this ends up
* being the case. */
while (bytes_remaining > 0) {
/* allocate/initialize a fragment */
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
@ -699,29 +719,31 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
ompi_rte_abort(-1, NULL);
}
assert (btl->btl_seg_size * hdr->hdr_seg_cnt <= sizeof (frag->rdma_segs));
memcpy (frag->remote_handle, hdr + 1, btl->btl_registration_handle_size);
memcpy (frag->rdma_segs, hdr + 1, btl->btl_seg_size * hdr->hdr_seg_cnt);
/* update the read location -- NTH: note this will only work if there is exactly one
segment. TODO -- make this work with multiple segments */
r_segments = (mca_btl_base_segment_t *) frag->rdma_segs;
r_segments->seg_addr.lval += offset;
/* update the read location */
frag->remote_address = hdr->hdr_src_ptr + offset;
/* updating the write location */
OPAL_THREAD_LOCK(&recvreq->lock);
opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, &offset);
opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &frag->local_address);
OPAL_THREAD_UNLOCK(&recvreq->lock);
frag->rdma_bml = rdma_bml;
frag->rdma_hdr.hdr_rget = *hdr;
frag->retries = 0;
frag->rdma_req = recvreq;
frag->rdma_ep = bml_endpoint;
frag->rdma_state = MCA_PML_OB1_RDMA_GET;
frag->reg = NULL;
frag->rdma_length = bytes_remaining;
frag->retries = 0;
frag->rdma_req = recvreq;
frag->rdma_state = MCA_PML_OB1_RDMA_GET;
frag->local_handle = NULL;
frag->rdma_offset = offset;
if (bytes_remaining > rdma_bml->btl->btl_get_limit) {
frag->rdma_length = rdma_bml->btl->btl_get_limit;
} else {
frag->rdma_length = bytes_remaining;
}
/* NTH: TODO -- handle error conditions gracefully */
rc = mca_pml_ob1_recv_request_get_frag(frag);
@ -920,13 +942,11 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
while(bytes_remaining > 0 &&
recvreq->req_pipeline_depth < mca_pml_ob1.recv_pipeline_depth) {
size_t size, seg_size;
mca_pml_ob1_rdma_hdr_t* hdr;
mca_btl_base_descriptor_t* dst;
mca_btl_base_descriptor_t* ctl;
mca_mpool_base_registration_t * reg = NULL;
mca_btl_base_module_t* btl;
mca_pml_ob1_rdma_frag_t *frag = NULL;
mca_btl_base_module_t *btl;
int rc, rdma_idx;
void *data_ptr;
size_t size;
if(prev_bytes_remaining == bytes_remaining) {
if(++num_fail == num_tries) {
@ -947,85 +967,62 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
do {
rdma_idx = recvreq->req_rdma_idx;
bml_btl = recvreq->req_rdma[rdma_idx].bml_btl;
reg = recvreq->req_rdma[rdma_idx].btl_reg;
size = recvreq->req_rdma[rdma_idx].length;
if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt)
recvreq->req_rdma_idx = 0;
} while(!size);
btl = bml_btl->btl;
/* makes sure that we don't exceed BTL max rdma size
* if memory is not pinned already */
if( (NULL == reg) && (btl->btl_rdma_pipeline_frag_size != 0) &&
(size > btl->btl_rdma_pipeline_frag_size)) {
/* NTH: This conditional used to check if there was a registration in
* recvreq->req_rdma[rdma_idx].btl_reg. If once existed it was due to
* the btl not needed registration (equivalent to btl->btl_register_mem
* != NULL. This new check is equivalent. Note: I feel this protocol
* needs work to better improve resource usage when running with a
* leave pinned protocol. */
if (btl->btl_register_mem && (btl->btl_rdma_pipeline_frag_size != 0) &&
(size > btl->btl_rdma_pipeline_frag_size)) {
size = btl->btl_rdma_pipeline_frag_size;
}
/* take lock to protect converter against concurrent access
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
if (OPAL_UNLIKELY(NULL == frag)) {
continue;
}
/* take lock to protect convertor against concurrent access
* from unpack */
OPAL_THREAD_LOCK(&recvreq->lock);
opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor,
&recvreq->req_rdma_offset );
/* prepare a descriptor for RDMA */
mca_bml_base_prepare_dst(bml_btl, reg,
&recvreq->req_recv.req_base.req_convertor,
MCA_BTL_NO_ORDER, 0, &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
MCA_BTL_DES_FLAGS_PUT, &dst);
opal_convertor_set_position (&recvreq->req_recv.req_base.req_convertor,
&recvreq->req_rdma_offset);
opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &data_ptr);
OPAL_THREAD_UNLOCK(&recvreq->lock);
if(OPAL_UNLIKELY(dst == NULL)) {
continue;
if (btl->btl_register_mem) {
mca_bml_base_register_mem (bml_btl, data_ptr, size, MCA_BTL_REG_FLAG_REMOTE_WRITE,
&frag->local_handle);
if (OPAL_UNLIKELY(NULL == frag->local_handle)) {
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
continue;
}
}
dst->des_cbfunc = mca_pml_ob1_put_completion;
dst->des_cbdata = recvreq;
/* fill in the minimum information needed to handle the fin message */
frag->cbfunc = mca_pml_ob1_put_completion;
frag->rdma_length = size;
frag->rdma_req = recvreq;
frag->rdma_bml = bml_btl;
frag->local_address = data_ptr;
frag->rdma_offset = recvreq->req_rdma_offset;
seg_size = btl->btl_seg_size * dst->des_local_count;
/* prepare a descriptor for rdma control message */
mca_bml_base_alloc(bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_rdma_hdr_t) + seg_size,
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
if( OPAL_UNLIKELY(NULL == ctl) ) {
mca_bml_base_free(bml_btl,dst);
continue;
}
ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion;
/* fill in rdma header */
hdr = (mca_pml_ob1_rdma_hdr_t*)ctl->des_local->seg_addr.pval;
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT;
hdr->hdr_common.hdr_flags =
(!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0;
hdr->hdr_req = recvreq->remote_req_send;
hdr->hdr_des.pval = dst;
hdr->hdr_recv_req.pval = recvreq;
hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
hdr->hdr_seg_cnt = dst->des_local_count;
/* copy segments */
memmove (hdr + 1, dst->des_local, seg_size);
if(!recvreq->req_ack_sent)
recvreq->req_ack_sent = true;
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_PUT, recvreq->req_recv.req_base.req_proc);
PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
&(recvreq->req_recv.req_base), size,
PERUSE_RECV);
/* send rdma request to peer */
rc = mca_bml_base_send(bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT);
if( OPAL_LIKELY( rc >= 0 ) ) {
rc = mca_pml_ob1_recv_request_put_frag (frag);
if (OPAL_LIKELY(OMPI_SUCCESS == rc)) {
/* update request state */
recvreq->req_rdma_offset += size;
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth, 1);
recvreq->req_rdma[rdma_idx].length -= size;
bytes_remaining -= size;
} else {
mca_bml_base_free(bml_btl,ctl);
mca_bml_base_free(bml_btl,dst);
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
}
}

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
@ -10,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
@ -52,6 +53,8 @@ struct mca_pml_ob1_recv_request_t {
bool req_ack_sent; /**< whether ack was sent to the sender */
bool req_match_received; /**< Prevent request to be completed prematurely */
opal_mutex_t lock;
mca_bml_base_btl_t *rdma_bml;
mca_btl_base_registration_handle_t *local_handle;
mca_pml_ob1_com_btl_t req_rdma[1];
};
typedef struct mca_pml_ob1_recv_request_t mca_pml_ob1_recv_request_t;
@ -131,8 +134,12 @@ do { \
#define MCA_PML_OB1_RECV_REQUEST_RETURN(recvreq) \
{ \
MCA_PML_BASE_RECV_REQUEST_FINI(&(recvreq)->req_recv); \
OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_recv_requests, \
(ompi_free_list_item_t*)(recvreq)); \
if ((recvreq)->local_handle) { \
mca_bml_base_deregister_mem ((recvreq)->rdma_bml, (recvreq)->local_handle); \
(recvreq)->local_handle = NULL; \
} \
OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_recv_requests, \
(ompi_free_list_item_t*)(recvreq)); \
}
/**
@ -154,9 +161,11 @@ recv_request_pml_complete(mca_pml_ob1_recv_request_t *recvreq)
}
for(i = 0; i < recvreq->req_rdma_cnt; i++) {
mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[i].btl_reg;
if( NULL != btl_reg && btl_reg->mpool != NULL) {
btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg );
struct mca_btl_base_registration_handle_t *handle = recvreq->req_rdma[i].btl_reg;
mca_bml_base_btl_t *bml_btl = recvreq->req_rdma[i].bml_btl;
if (NULL != handle) {
mca_bml_base_deregister_mem (bml_btl, handle);
}
}
recvreq->req_rdma_cnt = 0;
@ -178,6 +187,10 @@ recv_request_pml_complete(mca_pml_ob1_recv_request_t *recvreq)
recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR =
MPI_ERR_TRUNCATE;
}
if (OPAL_UNLIKELY(recvreq->local_handle)) {
mca_bml_base_deregister_mem (recvreq->rdma_bml, recvreq->local_handle);
recvreq->local_handle = NULL;
}
MCA_PML_OB1_RECV_REQUEST_MPI_COMPLETE(recvreq);
}
OPAL_THREAD_UNLOCK(&ompi_request_lock);
@ -387,7 +400,7 @@ static inline void mca_pml_ob1_recv_request_schedule(
(void)mca_pml_ob1_recv_request_schedule_exclusive(req, start_bml_btl);
}
#define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O) \
#define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O, Sz) \
do { \
mca_pml_ob1_pckt_pending_t *_pckt; \
\
@ -396,6 +409,7 @@ static inline void mca_pml_ob1_recv_request_schedule(
_pckt->hdr.hdr_ack.hdr_src_req.lval = (S); \
_pckt->hdr.hdr_ack.hdr_dst_req.pval = (D); \
_pckt->hdr.hdr_ack.hdr_send_offset = (O); \
_pckt->hdr.hdr_ack.hdr_send_size = (Sz); \
_pckt->proc = (P); \
_pckt->bml_btl = NULL; \
OPAL_THREAD_LOCK(&mca_pml_ob1.lock); \
@ -406,11 +420,11 @@ static inline void mca_pml_ob1_recv_request_schedule(
int mca_pml_ob1_recv_request_ack_send_btl(ompi_proc_t* proc,
mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req,
uint64_t hdr_rdma_offset, bool nordma);
uint64_t hdr_rdma_offset, uint64_t size, bool nordma);
static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc,
uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset,
bool nordma)
uint64_t size, bool nordma)
{
size_t i;
mca_bml_base_btl_t* bml_btl;
@ -420,12 +434,12 @@ static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc,
for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
if(mca_pml_ob1_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req,
hdr_dst_req, hdr_send_offset, nordma) == OMPI_SUCCESS)
hdr_dst_req, hdr_send_offset, size, nordma) == OMPI_SUCCESS)
return OMPI_SUCCESS;
}
MCA_PML_OB1_ADD_ACK_TO_PENDING(proc, hdr_src_req, hdr_dst_req,
hdr_send_offset);
hdr_send_offset, size);
return OMPI_ERR_OUT_OF_RESOURCE;
}

Просмотреть файл

@ -137,6 +137,7 @@ static void mca_pml_ob1_send_request_construct(mca_pml_ob1_send_request_t* req)
req->req_send.req_base.req_ompi.req_cancel = mca_pml_ob1_send_request_cancel;
req->req_rdma_cnt = 0;
req->req_throttle_sends = false;
req->rdma_frag = NULL;
OBJ_CONSTRUCT(&req->req_send_ranges, opal_list_t);
OBJ_CONSTRUCT(&req->req_send_range_lock, opal_mutex_t);
}
@ -145,6 +146,10 @@ static void mca_pml_ob1_send_request_destruct(mca_pml_ob1_send_request_t* req)
{
OBJ_DESTRUCT(&req->req_send_ranges);
OBJ_DESTRUCT(&req->req_send_range_lock);
if (req->rdma_frag) {
MCA_PML_OB1_RDMA_FRAG_RETURN(req->rdma_frag);
req->rdma_frag = NULL;
}
}
OBJ_CLASS_INSTANCE( mca_pml_ob1_send_request_t,
@ -236,10 +241,9 @@ mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl,
* happens in one thread, the increase of the req_bytes_delivered does not
* have to be atomic.
*/
req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
(void *) des->des_local,
des->des_local_count,
sizeof(mca_pml_ob1_rendezvous_hdr_t));
req_bytes_delivered = mca_pml_ob1_compute_segment_length_base ((void *) des->des_segments,
des->des_segment_count,
sizeof(mca_pml_ob1_rendezvous_hdr_t));
mca_pml_ob1_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered );
}
@ -250,27 +254,18 @@ mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl,
*/
static void
mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_descriptor_t* des,
int status )
mca_pml_ob1_rget_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_length)
{
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)des->des_cbdata;
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
size_t req_bytes_delivered;
mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
/* count bytes of user data actually delivered and check for request completion */
if (OPAL_LIKELY(OMPI_SUCCESS == status)) {
req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
(void *) des->des_local,
des->des_local_count, 0);
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
if (OPAL_LIKELY(0 < rdma_length)) {
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, (size_t) rdma_length);
}
sendreq->src_des = NULL;
send_request_pml_complete_check(sendreq);
/* free the descriptor */
mca_bml_base_free(bml_btl, des);
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
}
@ -314,10 +309,9 @@ mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl,
}
/* count bytes of user data actually delivered */
req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
(void *) des->des_local,
des->des_local_count,
sizeof(mca_pml_ob1_frag_hdr_t));
req_bytes_delivered = mca_pml_ob1_compute_segment_length_base ((void *) des->des_segments,
des->des_segment_count,
sizeof(mca_pml_ob1_frag_hdr_t));
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, -1);
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
@ -388,7 +382,7 @@ int mca_pml_ob1_send_request_start_buffered(
if( OPAL_UNLIKELY(NULL == des) ) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
segment = des->des_local;
segment = des->des_segments;
/* pack the data into the BTL supplied buffer */
iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval +
@ -407,17 +401,14 @@ int mca_pml_ob1_send_request_start_buffered(
/* build rendezvous header */
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
hdr->hdr_common.hdr_flags = 0;
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV;
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RNDV, 0,
sendreq->req_send.req_base.req_comm->c_contextid,
sendreq->req_send.req_base.req_comm->c_my_rank,
sendreq->req_send.req_base.req_tag,
(uint16_t)sendreq->req_send.req_base.req_sequence,
sendreq->req_send.req_bytes_packed, sendreq);
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV,
sendreq->req_send.req_base.req_proc);
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc);
/* update lengths */
segment->seg_len = sizeof(mca_pml_ob1_rendezvous_hdr_t) + max_data;
@ -490,15 +481,13 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
if(NULL != bml_btl->btl->btl_sendi) {
mca_pml_ob1_match_hdr_t match;
match.hdr_common.hdr_flags = 0;
match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
match.hdr_tag = sendreq->req_send.req_base.req_tag;
match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
mca_pml_ob1_match_hdr_prepare (&match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
sendreq->req_send.req_base.req_comm->c_contextid,
sendreq->req_send.req_base.req_comm->c_my_rank,
sendreq->req_send.req_base.req_tag,
(uint16_t)sendreq->req_send.req_base.req_sequence);
ob1_hdr_hton(&match, MCA_PML_OB1_HDR_TYPE_MATCH,
sendreq->req_send.req_base.req_proc);
ob1_hdr_hton (&match, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);
/* try to send immediately */
rc = mca_bml_base_sendi( bml_btl, &sendreq->req_send.req_base.req_convertor,
@ -531,7 +520,7 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
return OMPI_ERR_OUT_OF_RESOURCE;
}
segment = des->des_local;
segment = des->des_segments;
if(size > 0) {
/* pack the data into the supplied buffer */
@ -565,15 +554,13 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
/* build match header */
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
hdr->hdr_common.hdr_flags = 0;
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
sendreq->req_send.req_base.req_comm->c_contextid,
sendreq->req_send.req_base.req_comm->c_my_rank,
sendreq->req_send.req_base.req_tag,
(uint16_t)sendreq->req_send.req_base.req_sequence);
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH,
sendreq->req_send.req_base.req_proc);
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);
/* update lengths */
segment->seg_len = OMPI_PML_OB1_MATCH_HDR_LEN + max_data;
@ -617,7 +604,6 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq,
/* prepare descriptor */
mca_bml_base_prepare_src( bml_btl,
NULL,
&sendreq->req_send.req_base.req_convertor,
MCA_BTL_NO_ORDER,
OMPI_PML_OB1_MATCH_HDR_LEN,
@ -627,19 +613,17 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq,
if( OPAL_UNLIKELY(NULL == des) ) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
segment = des->des_local;
segment = des->des_segments;
/* build match header */
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
hdr->hdr_common.hdr_flags = 0;
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
sendreq->req_send.req_base.req_comm->c_contextid,
sendreq->req_send.req_base.req_comm->c_my_rank,
sendreq->req_send.req_base.req_tag,
(uint16_t)sendreq->req_send.req_base.req_sequence);
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH,
sendreq->req_send.req_base.req_proc);
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);
/* short message */
des->des_cbfunc = mca_pml_ob1_match_completion_free;
@ -673,79 +657,67 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,
* one RDMA capable BTLs). This way round robin distribution of RDMA
* operation is achieved.
*/
mca_btl_base_descriptor_t *des, *src = NULL;
mca_btl_base_registration_handle_t *local_handle;
mca_btl_base_descriptor_t *des;
mca_pml_ob1_rdma_frag_t *frag;
mca_pml_ob1_rget_hdr_t *hdr;
size_t seg_size;
size_t reg_size;
void *data_ptr;
int rc;
sendreq->src_des = NULL;
bml_btl = sendreq->req_rdma[0].bml_btl;
if (!(bml_btl->btl_flags & (MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_CUDA_GET))) {
sendreq->rdma_frag = NULL;
/* This BTL does not support get. Use rendezvous to start the RDMA operation using put instead. */
return mca_pml_ob1_send_request_start_rndv (sendreq, bml_btl, 0, MCA_PML_OB1_HDR_FLAGS_CONTIG |
MCA_PML_OB1_HDR_FLAGS_PIN);
}
MEMCHECKER(
memchecker_call(&opal_memchecker_base_mem_defined,
sendreq->req_send.req_base.req_addr,
sendreq->req_send.req_base.req_count,
sendreq->req_send.req_base.req_datatype);
);
/* prepare source descriptor/segment(s) */
/* PML owns this descriptor and will free it in */
/* mca_pml_ob1_rget_completion */
mca_bml_base_prepare_src( bml_btl, sendreq->req_rdma[0].btl_reg,
&sendreq->req_send.req_base.req_convertor,
MCA_BTL_NO_ORDER, 0, &size, MCA_BTL_DES_FLAGS_GET |
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, &src );
MEMCHECKER(
memchecker_call(&opal_memchecker_base_mem_noaccess,
sendreq->req_send.req_base.req_addr,
sendreq->req_send.req_base.req_count,
sendreq->req_send.req_base.req_datatype);
);
if( OPAL_UNLIKELY(NULL == src) ) {
return OMPI_ERR_OUT_OF_RESOURCE;
/* at this time ob1 does not support non-contiguous gets. the convertor represents a
* contiguous block of memory */
opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr);
local_handle = sendreq->req_rdma[0].btl_reg;
/* allocate an rdma fragment to keep track of the request size for use in the fin message */
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
if (OPAL_UNLIKELY(NULL == frag)) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
src->des_cbfunc = mca_pml_ob1_rget_completion;
src->des_cbdata = sendreq;
sendreq->src_des = src;
/* fill in necessary fragment data */
frag->rdma_req = sendreq;
frag->rdma_bml = bml_btl;
frag->rdma_length = size;
frag->cbfunc = mca_pml_ob1_rget_completion;
/* do not store the local handle in the fragment. it will be released by mca_pml_ob1_free_rdma_resources */
seg_size = bml_btl->btl->btl_seg_size * src->des_local_count;
/* save the fragment for get->put fallback */
sendreq->rdma_frag = frag;
reg_size = bml_btl->btl->btl_registration_handle_size;
/* allocate space for get hdr + segment list */
mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof (*hdr) + seg_size,
mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof (*hdr) + reg_size,
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
if( OPAL_UNLIKELY(NULL == des) ) {
/* NTH: no need to reset the converter here. it will be reset before it is retried */
mca_bml_base_free(bml_btl, src);
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* build match header */
hdr = (mca_pml_ob1_rget_hdr_t *) des->des_local->seg_addr.pval;
hdr->hdr_rndv.hdr_match.hdr_common.hdr_flags = MCA_PML_OB1_HDR_FLAGS_CONTIG|MCA_PML_OB1_HDR_FLAGS_PIN;
hdr->hdr_rndv.hdr_match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RGET;
hdr->hdr_rndv.hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
hdr->hdr_rndv.hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
hdr->hdr_rndv.hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
hdr->hdr_rndv.hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
hdr->hdr_des.pval = src;
hdr->hdr_seg_cnt = src->des_local_count;
hdr = (mca_pml_ob1_rget_hdr_t *) des->des_segments->seg_addr.pval;
/* TODO -- Add support for multiple segments for get */
mca_pml_ob1_rget_hdr_prepare (hdr, MCA_PML_OB1_HDR_FLAGS_CONTIG | MCA_PML_OB1_HDR_FLAGS_PIN,
sendreq->req_send.req_base.req_comm->c_contextid,
sendreq->req_send.req_base.req_comm->c_my_rank,
sendreq->req_send.req_base.req_tag,
(uint16_t)sendreq->req_send.req_base.req_sequence,
sendreq->req_send.req_bytes_packed, sendreq,
frag, data_ptr, local_handle, reg_size);
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RGET, sendreq->req_send.req_base.req_proc);
/* copy segment data */
memcpy (hdr + 1, src->des_local, seg_size);
des->des_cbfunc = mca_pml_ob1_send_ctl_completion;
des->des_cbdata = sendreq;
@ -763,12 +735,6 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,
rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RGET);
if (OPAL_UNLIKELY(rc < 0)) {
mca_bml_base_free(bml_btl, des);
if (sendreq->src_des) {
mca_bml_base_free (bml_btl, sendreq->src_des);
sendreq->src_des = NULL;
}
return rc;
}
@ -806,7 +772,6 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq,
sendreq->req_send.req_base.req_datatype);
);
mca_bml_base_prepare_src( bml_btl,
NULL,
&sendreq->req_send.req_base.req_convertor,
MCA_BTL_NO_ORDER,
sizeof(mca_pml_ob1_rendezvous_hdr_t),
@ -824,21 +789,18 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq,
if( OPAL_UNLIKELY(NULL == des) ) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
segment = des->des_local;
segment = des->des_segments;
/* build hdr */
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
hdr->hdr_common.hdr_flags = flags;
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV;
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RNDV, flags,
sendreq->req_send.req_base.req_comm->c_contextid,
sendreq->req_send.req_base.req_comm->c_my_rank,
sendreq->req_send.req_base.req_tag,
(uint16_t)sendreq->req_send.req_base.req_sequence,
sendreq->req_send.req_bytes_packed, sendreq);
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV,
sendreq->req_send.req_base.req_proc);
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc);
/* first fragment of a long message */
des->des_cbdata = sendreq;
@ -1019,10 +981,8 @@ cannot_pack:
sendreq->req_send.req_base.req_count,
sendreq->req_send.req_base.req_datatype);
);
mca_bml_base_prepare_src(bml_btl, NULL,
&sendreq->req_send.req_base.req_convertor,
MCA_BTL_NO_ORDER,
sizeof(mca_pml_ob1_frag_hdr_t),
mca_bml_base_prepare_src(bml_btl, &sendreq->req_send.req_base.req_convertor,
MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_frag_hdr_t),
&size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK, &des);
MEMCHECKER(
memchecker_call(&opal_memchecker_base_mem_noaccess,
@ -1046,12 +1006,9 @@ cannot_pack:
des->des_cbdata = sendreq;
/* setup header */
hdr = (mca_pml_ob1_frag_hdr_t*)des->des_local->seg_addr.pval;
hdr->hdr_common.hdr_flags = 0;
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FRAG;
hdr->hdr_frag_offset = range->range_send_offset;
hdr->hdr_src_req.pval = sendreq;
hdr->hdr_dst_req = sendreq->req_recv;
hdr = (mca_pml_ob1_frag_hdr_t*)des->des_segments->seg_addr.pval;
mca_pml_ob1_frag_hdr_prepare (hdr, 0, range->range_send_offset, sendreq,
sendreq->req_recv.lval);
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FRAG,
sendreq->req_send.req_base.req_proc);
@ -1108,38 +1065,66 @@ cannot_pack:
}
/**
* A put fragment could not be started. Queue the fragment to be retried later or
* fall back on send/recv.
*/
static void mca_pml_ob1_send_request_put_frag_failed (mca_pml_ob1_rdma_frag_t *frag, int rc)
{
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
if (++frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) {
/* queue the frag for later if there was a resource error */
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
} else {
/* tell receiver to deregister memory */
mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER,
OPAL_ERR_TEMP_OUT_OF_RESOURCE);
/* send fragment by copy in/out */
mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset,
frag->rdma_length);
/* if a pointer to a receive request is not set it means that
* ACK was not yet received. Don't schedule sends before ACK */
if (NULL != sendreq->req_recv.pval)
mca_pml_ob1_send_request_schedule (sendreq);
}
}
/**
* An RDMA put operation has completed:
* (1) Update request status and if required set completed
* (2) Send FIN control message to the destination
* (2) Send FIN control message to the destination
*/
static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_descriptor_t* des,
int status )
static void mca_pml_ob1_put_completion (mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep,
void *local_address, mca_btl_base_registration_handle_t *local_handle,
void *context, void *cbdata, int status)
{
mca_pml_ob1_rdma_frag_t* frag = (mca_pml_ob1_rdma_frag_t*)des->des_cbdata;
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)frag->rdma_req;
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
mca_pml_ob1_rdma_frag_t *frag = (mca_pml_ob1_rdma_frag_t *) cbdata;
mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
mca_bml_base_btl_t *bml_btl = (mca_bml_base_btl_t *) context;
/* check completion status */
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
/* TSW - FIX */
OMPI_ERROR_LOG(status);
ompi_rte_abort(-1, NULL);
if( OPAL_UNLIKELY(OMPI_SUCCESS == status) ) {
/* TODO -- readd ordering */
mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
frag->rdma_hdr.hdr_rdma.hdr_frag, frag->rdma_length,
0, 0);
/* check for request completion */
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length);
send_request_pml_complete_check(sendreq);
} else {
/* try to fall back on send/recv */
mca_pml_ob1_send_request_put_frag_failed (frag, status);
}
mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
bml_btl,
frag->rdma_hdr.hdr_rdma.hdr_des,
des->order, 0);
/* check for request completion */
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length);
send_request_pml_complete_check(sendreq);
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
@ -1147,81 +1132,45 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag )
{
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)frag->rdma_req;
mca_mpool_base_registration_t *reg = NULL;
mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
mca_btl_base_registration_handle_t *local_handle = NULL;
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
mca_btl_base_descriptor_t *des;
size_t save_size = frag->rdma_length;
int rc;
if (OPAL_LIKELY(NULL == sendreq->src_des)) {
/* setup descriptor */
mca_bml_base_prepare_src( bml_btl,
reg,
&frag->convertor,
MCA_BTL_NO_ORDER,
0,
&frag->rdma_length,
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
MCA_BTL_DES_FLAGS_PUT,
&des );
if( OPAL_UNLIKELY(NULL == des) ) {
if(frag->retries < mca_pml_ob1.rdma_retries_limit) {
size_t offset = (size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset;
frag->rdma_length = save_size;
opal_convertor_set_position(&frag->convertor, &offset);
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
} else {
mca_pml_ob1_send_request_t *sendreq =
(mca_pml_ob1_send_request_t*)frag->rdma_req;
if (bml_btl->btl->btl_register_mem && NULL == frag->local_handle) {
/* Check if the segment is already registered */
for (size_t i = 0 ; i < sendreq->req_rdma_cnt ; ++i) {
if (sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
/* do not copy the handle to the fragment to avoid deregistring it twice */
local_handle = sendreq->req_rdma[i].btl_reg;
break;
}
}
/* tell receiver to unregister memory */
mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des,
MCA_BTL_NO_ORDER, 1);
if (NULL == frag->local_handle) {
/* Not already registered. Register the region with the BTL. */
mca_bml_base_register_mem (bml_btl, frag->local_address, frag->rdma_length, 0,
&frag->local_handle);
/* send fragment by copy in/out */
mca_pml_ob1_send_request_copy_in_out(sendreq,
frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, frag->rdma_length);
/* if a pointer to a receive request is not set it means that
* ACK was not yet received. Don't schedule sends before ACK */
if(NULL != sendreq->req_recv.pval)
mca_pml_ob1_send_request_schedule(sendreq);
if (OPAL_UNLIKELY(NULL == frag->local_handle)) {
mca_pml_ob1_send_request_put_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE);
return OMPI_ERR_OUT_OF_RESOURCE;
}
return OMPI_ERR_OUT_OF_RESOURCE;
local_handle = frag->local_handle;
}
} else {
/* already have a source descriptor */
des = sendreq->src_des;
sendreq->src_des = NULL;
}
des->des_remote = (mca_btl_base_segment_t *) frag->rdma_segs;
des->des_remote_count = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt;
des->des_cbfunc = mca_pml_ob1_put_completion;
des->des_cbdata = frag;
PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
&(((mca_pml_ob1_send_request_t*)frag->rdma_req)->req_send.req_base), save_size, PERUSE_SEND );
rc = mca_bml_base_put(bml_btl, des);
rc = mca_bml_base_put (bml_btl, frag->local_address, frag->remote_address, local_handle,
(mca_btl_base_registration_handle_t *) frag->remote_handle, frag->rdma_length,
0, MCA_BTL_NO_ORDER, mca_pml_ob1_put_completion, frag);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
mca_bml_base_free(bml_btl, des);
frag->rdma_length = save_size;
if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
return OMPI_ERR_OUT_OF_RESOURCE;
} else {
/* TSW - FIX */
OMPI_ERROR_LOG(rc);
ompi_rte_abort(-1, NULL);
}
mca_pml_ob1_send_request_put_frag_failed (frag, rc);
return rc;
}
return OMPI_SUCCESS;
@ -1235,12 +1184,11 @@ int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag )
*/
void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq,
mca_btl_base_module_t* btl,
mca_btl_base_module_t* btl,
mca_pml_ob1_rdma_hdr_t* hdr )
{
mca_bml_base_endpoint_t *bml_endpoint = sendreq->req_endpoint;
mca_pml_ob1_rdma_frag_t* frag;
size_t i, size = 0;
if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_TYPE_ACK) {
OPAL_THREAD_ADD32(&sendreq->req_state, -1);
@ -1248,61 +1196,36 @@ void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq,
sendreq->req_recv.pval = hdr->hdr_recv_req.pval;
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
if (NULL == sendreq->rdma_frag) {
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
if( OPAL_UNLIKELY(NULL == frag) ) {
/* TSW - FIX */
OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
ompi_rte_abort(-1, NULL);
}
assert (btl->btl_seg_size * hdr->hdr_seg_cnt <= sizeof (frag->rdma_segs));
/* setup fragment */
memcpy (frag->rdma_segs, hdr + 1, btl->btl_seg_size * hdr->hdr_seg_cnt);
for( i = 0; i < hdr->hdr_seg_cnt; i++ ) {
mca_btl_base_segment_t *seg = (mca_btl_base_segment_t *) ((uintptr_t)(frag->rdma_segs) + i * btl->btl_seg_size);
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
if ((sendreq->req_send.req_base.req_proc->super.proc_arch & OPAL_ARCH_ISBIGENDIAN) !=
(ompi_proc_local()->super.proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
size += opal_swap_bytes4(seg->seg_len);
} else
#endif
{
size += seg->seg_len;
if( OPAL_UNLIKELY(NULL == frag) ) {
/* TSW - FIX */
OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
ompi_rte_abort(-1, NULL);
}
} else {
/* rget fallback on put */
frag = sendreq->rdma_frag;
sendreq->rdma_frag = NULL;
sendreq->req_state = 0;
}
/* copy registration data */
memcpy (frag->remote_handle, hdr + 1, btl->btl_registration_handle_size);
frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
frag->rdma_hdr.hdr_rdma = *hdr;
frag->rdma_req = sendreq;
frag->rdma_ep = bml_endpoint;
frag->rdma_length = size;
frag->rdma_length = hdr->hdr_dst_size;
frag->rdma_state = MCA_PML_OB1_RDMA_PUT;
frag->reg = NULL;
frag->remote_address = hdr->hdr_dst_ptr;
frag->retries = 0;
if (OPAL_UNLIKELY(NULL != sendreq->src_des)) {
/* get fallback path */
sendreq->req_state = 0;
}
/* lookup the corresponding registration */
for(i=0; i<sendreq->req_rdma_cnt; i++) {
if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
frag->reg = sendreq->req_rdma[i].btl_reg;
break;
}
}
/* RDMA writes may proceed in parallel to send and to each other, so
* create clone of the convertor for each RDMA fragment
*/
size = hdr->hdr_rdma_offset;
opal_convertor_clone_with_position(&sendreq->req_send.req_base.req_convertor,
&frag->convertor, 0, &size);
/* Get the address of the current offset. Note: at this time ob1 CAN NOT handle
* non-contiguous RDMA. If that changes this code will be wrong. */
opal_convertor_get_offset_pointer (&sendreq->req_send.req_base.req_convertor,
hdr->hdr_rdma_offset, &frag->local_address);
mca_pml_ob1_send_request_put_frag(frag);
}

Просмотреть файл

@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2011-2012 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@ -54,7 +54,7 @@ struct mca_pml_ob1_send_request_t {
mca_pml_ob1_send_pending_t req_pending;
opal_mutex_t req_send_range_lock;
opal_list_t req_send_ranges;
mca_btl_base_descriptor_t *src_des;
mca_pml_ob1_rdma_frag_t *rdma_frag;
mca_pml_ob1_com_btl_t req_rdma[1];
};
typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t;
@ -124,10 +124,9 @@ get_request_from_send_pending(mca_pml_ob1_send_pending_t *type)
ompi_free_list_item_t* item; \
\
if( OPAL_LIKELY(NULL != proc) ) { \
OMPI_FREE_LIST_WAIT_MT(&mca_pml_base_send_requests, item); \
OMPI_FREE_LIST_WAIT_MT(&mca_pml_base_send_requests, item); \
sendreq = (mca_pml_ob1_send_request_t*)item; \
sendreq->req_send.req_base.req_proc = proc; \
sendreq->src_des = NULL; \
} \
}
@ -163,15 +162,18 @@ get_request_from_send_pending(mca_pml_ob1_send_pending_t *type)
assert( 0 == _position ); \
}
static inline void mca_pml_ob1_free_rdma_resources(mca_pml_ob1_send_request_t* sendreq)
static inline void mca_pml_ob1_free_rdma_resources (mca_pml_ob1_send_request_t* sendreq)
{
size_t r;
/* return mpool resources */
for(r = 0; r < sendreq->req_rdma_cnt; r++) {
mca_mpool_base_registration_t* reg = sendreq->req_rdma[r].btl_reg;
if( NULL != reg && reg->mpool != NULL ) {
reg->mpool->mpool_deregister(reg->mpool, reg);
struct mca_btl_base_registration_handle_t *handle = sendreq->req_rdma[r].btl_reg;
mca_bml_base_btl_t *bml_btl = sendreq->req_rdma[r].bml_btl;
if (NULL != handle) {
mca_bml_base_deregister_mem (bml_btl, handle);
sendreq->req_rdma[r].btl_reg = NULL;
}
}
sendreq->req_rdma_cnt = 0;
@ -218,10 +220,14 @@ do {
#define MCA_PML_OB1_SEND_REQUEST_RETURN(sendreq) \
do { \
/* Let the base handle the reference counts */ \
MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send)); \
OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_send_requests, \
(ompi_free_list_item_t*)sendreq); \
/* Let the base handle the reference counts */ \
MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send)); \
if (sendreq->rdma_frag) { \
MCA_PML_OB1_RDMA_FRAG_RETURN (sendreq->rdma_frag); \
sendreq->rdma_frag = NULL; \
} \
OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_send_requests, \
(ompi_free_list_item_t*)sendreq); \
} while(0)

Просмотреть файл

@ -1,4 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology
@ -217,6 +217,14 @@ static inline void opal_convertor_get_current_pointer( const opal_convertor_t* p
*position = (void*)base;
}
static inline void opal_convertor_get_offset_pointer( const opal_convertor_t* pConv,
size_t offset, void** position )
{
unsigned char* base = pConv->pBaseBuf + offset + pConv->pDesc->true_lb;
*position = (void*)base;
}
/*
*
*/

Просмотреть файл

@ -36,10 +36,8 @@ mca_btl_active_message_callback_t mca_btl_base_active_message_trigger[MCA_BTL_TA
static void mca_btl_base_descriptor_constructor(mca_btl_base_descriptor_t* des)
{
des->des_local = NULL;
des->des_local_count = 0;
des->des_remote = NULL;
des->des_remote_count = 0;
des->des_segments = NULL;
des->des_segment_count = 0;
des->des_cbfunc = NULL;
des->des_cbdata = NULL;
des->des_flags = 0;

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
@ -45,13 +46,15 @@ int mca_btl_base_param_register(mca_base_component_t *version,
MCA_BASE_VAR_SCOPE_READONLY,
&module->btl_exclusivity);
asprintf(&msg, "BTL bit flags (general flags: SEND=%d, PUT=%d, GET=%d, SEND_INPLACE=%d, RDMA_MATCHED=%d, HETEROGENEOUS_RDMA=%d; flags only used by the \"dr\" PML (ignored by others): ACK=%d, CHECKSUM=%d, RDMA_COMPLETION=%d; flags only used by the \"bfo\" PML (ignored by others): FAILOVER_SUPPORT=%d)",
asprintf(&msg, "BTL bit flags (general flags: SEND=%d, PUT=%d, GET=%d, SEND_INPLACE=%d, HETEROGENEOUS_RDMA=%d, "
"ATOMIC_OPS=%d; flags only used by the \"dr\" PML (ignored by others): ACK=%d, CHECKSUM=%d, "
"RDMA_COMPLETION=%d; flags only used by the \"bfo\" PML (ignored by others): FAILOVER_SUPPORT=%d)",
MCA_BTL_FLAGS_SEND,
MCA_BTL_FLAGS_PUT,
MCA_BTL_FLAGS_GET,
MCA_BTL_FLAGS_SEND_INPLACE,
MCA_BTL_FLAGS_RDMA_MATCHED,
MCA_BTL_FLAGS_HETEROGENEOUS_RDMA,
MCA_BTL_FLAGS_ATOMIC_OPS,
MCA_BTL_FLAGS_NEED_ACK,
MCA_BTL_FLAGS_NEED_CSUM,
MCA_BTL_FLAGS_RDMA_COMPLETION,
@ -63,6 +66,14 @@ int mca_btl_base_param_register(mca_base_component_t *version,
&module->btl_flags);
free(msg);
asprintf (&msg, "BTL atomic bit flags (general flags: ADD=%d, AND=%d, OR=%d, XOR=%d",
MCA_BTL_ATOMIC_SUPPORTS_ADD, MCA_BTL_ATOMIC_SUPPORTS_AND, MCA_BTL_ATOMIC_SUPPORTS_OR,
MCA_BTL_ATOMIC_SUPPORTS_XOR);
(void) mca_base_component_var_register(version, "atomic_flags", msg, MCA_BASE_VAR_TYPE_UNSIGNED_INT,
NULL, 0, MCA_BASE_VAR_FLAG_DEFAULT_ONLY, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT, &module->btl_atomic_flags);
free(msg);
(void) mca_base_component_var_register(version, "rndv_eager_limit", "Size (in bytes, including header) of \"phase 1\" fragment sent for all large messages (must be >= 0 and <= eager_limit)",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
OPAL_INFO_LVL_4,
@ -74,6 +85,39 @@ int mca_btl_base_param_register(mca_base_component_t *version,
OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_READONLY,
&module->btl_eager_limit);
if ((module->btl_flags & MCA_BTL_FLAGS_GET) && module->btl_get) {
if (0 == module->btl_get_limit) {
module->btl_get_limit = SIZE_MAX;
}
(void) mca_base_component_var_register(version, "get_limit", "Maximum size (in bytes) for btl get",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_READONLY, &module->btl_get_limit);
/* Allow the user to set the alignment. The BTL should double-check the alignment in its open
* function. */
(void) mca_base_component_var_register(version, "get_alignment", "Alignment required for btl get",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_CONSTANT, &module->btl_get_alignment);
}
if ((module->btl_flags & MCA_BTL_FLAGS_PUT) && module->btl_put) {
if (0 == module->btl_put_limit) {
module->btl_put_limit = SIZE_MAX;
}
(void) mca_base_component_var_register(version, "put_limit", "Maximum size (in bytes) for btl put",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_READONLY, &module->btl_put_limit);
/* Allow the user to set the alignment. The BTL should double-check the alignment in its open
* function. */
(void) mca_base_component_var_register(version, "put_alignment", "Alignment required for btl put",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_CONSTANT, &module->btl_put_alignment);
}
#if OPAL_CUDA_GDR_SUPPORT
/* If no CUDA RDMA support, zero them out */
if (!(MCA_BTL_FLAGS_CUDA_GET & module->btl_flags)) {
@ -144,5 +188,17 @@ int mca_btl_base_param_verify(mca_btl_base_module_t *module)
module->btl_flags &= ~MCA_BTL_FLAGS_GET;
}
if (0 == module->btl_atomic_flags) {
module->btl_flags &= ~MCA_BTL_FLAGS_ATOMIC_OPS;
}
if (0 == module->btl_get_limit) {
module->btl_get_limit = SIZE_MAX;
}
if (0 == module->btl_put_limit) {
module->btl_put_limit = SIZE_MAX;
}
return OPAL_SUCCESS;
}

Просмотреть файл

@ -134,6 +134,23 @@ struct mca_btl_base_descriptor_t;
struct mca_mpool_base_resources_t;
struct opal_proc_t;
/**
* Opaque registration handle for executing RDMA and atomic
* operations on a memory region.
*
* This data inside this handle is appropriate for passing
* to remote peers to execute RDMA and atomic operations. The
* size needed to send the registration handle can be
* obtained from the btl via the btl_registration_handle_size
* member. If this size is 0 then no registration data is
* needed to execute RDMA or atomic operations.
*/
struct mca_btl_base_registration_handle_t;
typedef struct mca_btl_base_registration_handle_t mca_btl_base_registration_handle_t;
/* Wildcard endpoint for use in the register_mem function */
#define MCA_BTL_ENDPOINT_ANY (struct mca_btl_base_endpoint_t *) -1
/* send/recv operations require tag matching */
typedef uint8_t mca_btl_base_tag_t;
@ -173,6 +190,9 @@ typedef uint8_t mca_btl_base_tag_t;
#define MCA_BTL_FLAGS_SEND 0x0001
#define MCA_BTL_FLAGS_PUT 0x0002
#define MCA_BTL_FLAGS_GET 0x0004
/* btls that set the MCA_BTL_FLAGS_RDMA will always get added to the BML
* rdma_btls list. This allows the updated one-sided component to
* use btls that are not otherwise used for send/recv. */
#define MCA_BTL_FLAGS_RDMA (MCA_BTL_FLAGS_GET|MCA_BTL_FLAGS_PUT)
/* btl can send directly from user buffer w/out registration */
@ -209,6 +229,12 @@ typedef uint8_t mca_btl_base_tag_t;
*/
#define MCA_BTL_FLAGS_SIGNALED 0x4000
/** The BTL supports network atomic operations */
#define MCA_BTL_FLAGS_ATOMIC_OPS 0x08000
/** The BTL supports fetching network atomic operations */
#define MCA_BTL_FLAGS_ATOMIC_FOPS 0x10000
/* Default exclusivity levels */
#define MCA_BTL_EXCLUSIVITY_HIGH (64*1024) /* internal loopback */
#define MCA_BTL_EXCLUSIVITY_DEFAULT 1024 /* GM/IB/etc. */
@ -219,6 +245,62 @@ typedef uint8_t mca_btl_base_tag_t;
#define MCA_BTL_ERROR_FLAGS_NONFATAL 0x2
#define MCA_BTL_ERROR_FLAGS_ADD_CUDA_IPC 0x4
/** registration flags */
enum {
/** Allow local write on the registered region. If a region is registered
* with this flag the registration can be used as the local handle for a
* btl_get operation. */
MCA_BTL_REG_FLAG_LOCAL_WRITE = 0x00000001,
/** Allow remote read on the registered region. If a region is registered
* with this flag the registration can be used as the remote handle for a
* btl_get operation. */
MCA_BTL_REG_FLAG_REMOTE_READ = 0x00000002,
/** Allow remote write on the registered region. If a region is registered
* with this flag the registration can be used as the remote handle for a
* btl_put operation. */
MCA_BTL_REG_FLAG_REMOTE_WRITE = 0x00000004,
/** Allow remote atomic operations on the registered region. If a region is
* registered with this flag the registration can be used as the remote
* handle for a btl_atomic_op or btl_atomic_fop operation. */
MCA_BTL_REG_FLAG_REMOTE_ATOMIC = 0x00000008,
/** Allow any btl operation on the registered region. If a region is registered
* with this flag the registration can be used as the local or remote handle for
* any btl operation. */
MCA_BTL_REG_FLAG_ACCESS_ANY = 0x0000000f,
#if OPAL_CUDA_GDR_SUPPORT
/** Region is in GPU memory */
MCA_BTL_REG_FLAG_CUDA_GPU_MEM = 0x00010000,
#endif
};
/** supported atomic operations */
enum {
/** The btl supports atomic add */
MCA_BTL_ATOMIC_SUPPORTS_ADD = 0x00000001,
/** The btl supports atomic bitwise and */
MCA_BTL_ATOMIC_SUPPORTS_AND = 0x00000200,
/** The btl supports atomic bitwise or */
MCA_BTL_ATOMIC_SUPPORTS_OR = 0x00000400,
/** The btl supports atomic bitwise exclusive or */
MCA_BTL_ATOMIC_SUPPORTS_XOR = 0x00000800,
/** The btl supports atomic compare-and-swap */
MCA_BTL_ATOMIC_SUPPORTS_CSWAP = 0x10000000,
/** The btl guarantees global atomicity (can mix btl atomics with cpu atomics) */
MCA_BTL_ATOMIC_SUPPORTS_GLOB = 0x20000000,
};
enum mca_btl_base_atomic_op_t {
/** Atomic add: (*remote_address) = (*remote_address) + operand */
MCA_BTL_ATOMIC_ADD = 0x0001,
/** Atomic and: (*remote_address) = (*remote_address) & operand */
MCA_BTL_ATOMIC_AND = 0x0011,
/** Atomic or: (*remote_address) = (*remote_address) | operand */
MCA_BTL_ATOMIC_OR = 0x0012,
/** Atomic xor: (*remote_address) = (*remote_address) ^ operand */
MCA_BTL_ATOMIC_XOR = 0x0014,
};
typedef enum mca_btl_base_atomic_op_t mca_btl_base_atomic_op_t;
/**
* Asynchronous callback function on completion of an operation.
* Completion Semantics: The descriptor can be reused or returned to the
@ -237,6 +319,32 @@ typedef void (*mca_btl_base_completion_fn_t)(
struct mca_btl_base_descriptor_t* descriptor,
int status);
/**
* Asynchronous callback function on completion of an rdma or atomic operation.
* Completion Semantics: The rdma or atomic memory operation has completed
* remotely (i.e.) is remotely visible and the caller is free to deregister
* the local_handle or modify the memory in local_address.
*
* @param[IN] module the BTL module
* @param[IN] endpoint the BTL endpoint
* @param[IN] local_address local address for the operation (if any)
* @param[IN] local_handle local handle associated with the local_address
* @param[IN] context callback context supplied to the rdma/atomic operation
* @param[IN] cbdata callback data supplied to the rdma/atomic operation
* @param[IN] status status of the operation
*
*/
typedef void (*mca_btl_base_rdma_completion_fn_t)(
struct mca_btl_base_module_t* module,
struct mca_btl_base_endpoint_t* endpoint,
void *local_address,
struct mca_btl_base_registration_handle_t *local_handle,
void *context,
void *cbdata,
int status);
/**
* Describes a region/segment of memory that is addressable
* by an BTL.
@ -262,20 +370,19 @@ struct mca_btl_base_segment_t {
};
typedef struct mca_btl_base_segment_t mca_btl_base_segment_t;
/**
* A descriptor that holds the parameters to a send/put/get
* operation along w/ a callback routine that is called on
* completion of the request.
* Note: receive callbacks will store the incomming data segments in
* des_local
* des_segments
*/
struct mca_btl_base_descriptor_t {
ompi_free_list_item_t super;
mca_btl_base_segment_t *des_local; /**< local segments */
size_t des_local_count; /**< number of local segments */
mca_btl_base_segment_t *des_remote; /**< remote segments */
size_t des_remote_count; /**< number of destination segments */
mca_btl_base_segment_t *des_segments; /**< local segments */
size_t des_segment_count; /**< number of local segments */
mca_btl_base_completion_fn_t des_cbfunc; /**< local callback function */
void* des_cbdata; /**< opaque callback data */
void* des_context; /**< more opaque callback data */
@ -329,6 +436,11 @@ OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_base_descriptor_t);
*/
#define MCA_BTL_SEG_MAX_SIZE 256
/**
* Maximum size of a BTL registration handle in bytes
*/
#define MCA_BTL_REG_HANDLE_MAX_SIZE 256
/*
* BTL base header, stores the tag at a minimum
*/
@ -395,7 +507,7 @@ typedef int (*mca_btl_base_component_progress_fn_t)(void);
* completion function, this implies that all data payload in the
* mca_btl_base_descriptor_t must be copied out within this callback or
* forfeited back to the BTL.
* Note also that descriptor segments (des_local) must be base
* Note also that descriptor segments (des_segments) must be base
* segments for all callbacks.
*
* @param[IN] btl BTL module
@ -647,7 +759,6 @@ typedef int (*mca_btl_base_module_free_fn_t)(
typedef struct mca_btl_base_descriptor_t* (*mca_btl_base_module_prepare_fn_t)(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
@ -655,6 +766,43 @@ typedef struct mca_btl_base_descriptor_t* (*mca_btl_base_module_prepare_fn_t)(
uint32_t flags
);
/**
* @brief Register a memory region for put/get/atomic operations.
*
* @param btl (IN) BTL module
* @param endpoint(IN) BTL addressing information (or NULL for all endpoints)
* @param base (IN) Pointer to start of region
* @param size (IN) Size of region
* @param flags (IN) Flags indicating what operation will be performed. Valid
* values are MCA_BTL_DES_FLAGS_PUT, MCA_BTL_DES_FLAGS_GET,
* and MCA_BTL_DES_FLAGS_ATOMIC
*
* @returns a memory registration handle valid for both local and remote operations
* @returns NULL if the region could not be registered
*
* This function registers the specified region with the hardware for use with
* the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop
* functions. Care should be taken to not hold an excessive number of registrations
* as they may use limited system/NIC resources.
*/
typedef struct mca_btl_base_registration_handle_t *(*mca_btl_base_module_register_mem_fn_t)(
struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base,
size_t size, uint32_t flags);
/**
* @brief Deregister a memory region
*
* @param btl (IN) BTL module region was registered with
* @param handle (IN) BTL registration handle to deregister
*
* This function deregisters the memory region associated with the specified handle. Care
* should be taken to not perform any RDMA or atomic operation on this memory region
* after it is deregistered. It is erroneous to specify a memory handle associated with
* a remote node.
*/
typedef int (*mca_btl_base_module_deregister_mem_fn_t)(
struct mca_btl_base_module_t* btl, struct mca_btl_base_registration_handle_t *handle);
/**
* Initiate an asynchronous send.
* Completion Semantics: the descriptor has been queued for a send operation
@ -698,7 +846,8 @@ typedef int (*mca_btl_base_module_send_fn_t)(
* @param flags (IN) Flags.
* @param tag (IN) The tag value used to notify the peer.
* @param descriptor (OUT) The descriptor to be returned unable to be sent immediately
* (may be NULL).
*
* @retval OPAL_SUCCESS The send was successfully queued
* @retval OPAL_ERROR The send failed
* @retval OPAL_ERR_UNREACH The endpoint is not reachable
@ -722,58 +871,210 @@ typedef int (*mca_btl_base_module_sendi_fn_t)(
/**
* Initiate an asynchronous put.
* Completion Semantics: the descriptor has been queued for a put operation
* the BTL now controls the descriptor until local
* completion callback is made on the descriptor
* Completion Semantics: if this function returns a 1 then the operation
* is complete. a return of OPAL_SUCCESS indicates
* the put operation has been queued with the
* network. the local_handle can not be deregistered
* until all outstanding operations on that handle
* have been completed.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param local_address (IN) Local address to put from (registered)
* @param remote_address (IN) Remote address to put to (registered remotely)
* @param local_handle (IN) Registration handle for region containing
* (local_address, local_address + size)
* @param remote_handle (IN) Remote registration handle for region containing
* (remote_address, remote_address + size)
* @param size (IN) Number of bytes to put
* @param flags (IN) Flags for this put operation
* @param order (IN) Ordering
* @param cbfunc (IN) Function to call on completion (if queued)
* @param cbcontext (IN) Context for the callback
* @param cbdata (IN) Data for callback
*
* BTLs that do not have the MCA_BTL_FLAGS_RDMA_MATCHED flag set
* allow multiple concurrent put operations on the same descriptor.
* BTLs that do have the MCA_BTL_FLAGS_RDMA_MATCHED flag set require
* a corresponding prepare_src/dst call for each put operation and
* therefore prohibit multiple concurrent put operations.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*
* @retval OPAL_SUCCESS The descriptor was successfully queued for a put
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a put
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put
* operation. Try again later
* @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or
* alignment restrictions.
*/
typedef int (*mca_btl_base_module_put_fn_t)(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_btl_base_descriptor_t* descriptor
);
typedef int (*mca_btl_base_module_put_fn_t) (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
/**
* Initiate an asynchronous get.
* Completion Semantics: if this function returns a 1 then the operation
* is complete. a return of OPAL_SUCCESS indicates
* the get operation has been queued with the
* network. the local_handle can not be deregistered
* until all outstanding operations on that handle
* have been completed.
*
* Completion Semantics: the descriptor has been queued for a get operation
* the BTL now controls the descriptor until local
* completion callback is made on the descriptor
*
* BTLs that do not have the MCA_BTL_FLAGS_RDMA_MATCHED flag set
* allow multiple concurrent get operations on the same descriptor.
* BTLs that do have the MCA_BTL_FLAGS_RDMA_MATCHED flag set require
* a corresponding prepare_src/dst call for each get operation and
* therefore prohibit multiple concurrent get operations.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*
* @retval OPAL_SUCCESS The descriptor was successfully queued for a get
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a get
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param local_address (IN) Local address to put from (registered)
* @param remote_address (IN) Remote address to put to (registered remotely)
* @param local_handle (IN) Registration handle for region containing
* (local_address, local_address + size)
* @param remote_handle (IN) Remote registration handle for region containing
* (remote_address, remote_address + size)
* @param size (IN) Number of bytes to put
* @param flags (IN) Flags for this put operation
* @param order (IN) Ordering
* @param cbfunc (IN) Function to call on completion (if queued)
* @param cbcontext (IN) Context for the callback
* @param cbdata (IN) Data for callback
*
* @retval OPAL_SUCCESS The descriptor was successfully queued for a put
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a put
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put
* operation. Try again later
* @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or
* alignment restrictions.
*/
typedef int (*mca_btl_base_module_get_fn_t) (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
typedef int (*mca_btl_base_module_get_fn_t)(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_btl_base_descriptor_t* descriptor
);
/**
* Initiate an asynchronous atomic operation.
* Completion Semantics: if this function returns a 1 then the operation
* is complete. a return of OPAL_SUCCESS indicates
* the atomic operation has been queued with the
* network.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param remote_address (IN) Remote address to put to (registered remotely)
* @param remote_handle (IN) Remote registration handle for region containing
* (remote_address, remote_address + 8)
* @param op (IN) Operation to perform
* @param operand (IN) Operand for the operation
* @param flags (IN) Flags for this put operation
* @param order (IN) Ordering
* @param cbfunc (IN) Function to call on completion (if queued)
* @param cbcontext (IN) Context for the callback
* @param cbdata (IN) Data for callback
*
* @retval OPAL_SUCCESS The operation was successfully queued
* @retval 1 The operation is complete
* @retval OPAL_ERROR The operation was NOT successfully queued
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
* operation. Try again later
* @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
* alignment restrictions or the operation {op} is not supported
* by the hardware.
*
* After the operation is complete the remote address specified by {remote_address} and
* {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand.
* The btl will guarantee consistency of atomic operations performed via the btl. Note,
* however, that not all btls will provide consistency between btl atomic operations and
* cpu atomics.
*/
typedef int (*mca_btl_base_module_atomic_op64_fn_t) (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint, uint64_t remote_address,
struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata);
/**
* Initiate an asynchronous fetching atomic operation.
* Completion Semantics: if this function returns a 1 then the operation
* is complete. a return of OPAL_SUCCESS indicates
* the atomic operation has been queued with the
* network.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param local_address (OUT) Local address to store the result in
* @param remote_address (IN) Remote address perfom operation on to (registered remotely)
* @param local_handle (IN) Local registration handle for region containing
* (local_address, local_address + 8)
* @param remote_handle (IN) Remote registration handle for region containing
* (remote_address, remote_address + 8)
* @param op (IN) Operation to perform
* @param operand (IN) Operand for the operation
* @param flags (IN) Flags for this put operation
* @param order (IN) Ordering
* @param cbfunc (IN) Function to call on completion (if queued)
* @param cbcontext (IN) Context for the callback
* @param cbdata (IN) Data for callback
*
* @retval OPAL_SUCCESS The operation was successfully queued
* @retval 1 The operation is complete
* @retval OPAL_ERROR The operation was NOT successfully queued
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
* operation. Try again later
* @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
* alignment restrictions or the operation {op} is not supported
* by the hardware.
*
* After the operation is complete the remote address specified by {remote_address} and
* {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand.
* {local_address} will be updated with the previous value stored in {remote_address}.
* The btl will guarantee consistency of atomic operations performed via the btl. Note,
* however, that not all btls will provide consistency between btl atomic operations and
* cpu atomics.
*/
typedef int (*mca_btl_base_module_atomic_fop64_fn_t) (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address,
struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata);
/**
* Initiate an asynchronous compare and swap operation.
* Completion Semantics: if this function returns a 1 then the operation
* is complete. a return of OPAL_SUCCESS indicates
* the atomic operation has been queued with the
* network.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param local_address (OUT) Local address to store the result in
* @param remote_address (IN) Remote address perfom operation on to (registered remotely)
* @param local_handle (IN) Local registration handle for region containing
* (local_address, local_address + 8)
* @param remote_handle (IN) Remote registration handle for region containing
* (remote_address, remote_address + 8)
* @param compare (IN) Operand for the operation
* @param value (IN) Value to store on success
* @param flags (IN) Flags for this put operation
* @param order (IN) Ordering
* @param cbfunc (IN) Function to call on completion (if queued)
* @param cbcontext (IN) Context for the callback
* @param cbdata (IN) Data for callback
*
* @retval OPAL_SUCCESS The operation was successfully queued
* @retval 1 The operation is complete
* @retval OPAL_ERROR The operation was NOT successfully queued
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
* operation. Try again later
* @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
* alignment restrictions or the operation {op} is not supported
* by the hardware.
*
* After the operation is complete the remote address specified by {remote_address} and
* {remote_handle} will be updated with {value} if *remote_address == compare.
* {local_address} will be updated with the previous value stored in {remote_address}.
* The btl will guarantee consistency of atomic operations performed via the btl. Note,
* however, that not all btls will provide consistency between btl atomic operations and
* cpu atomics.
*/
typedef int (*mca_btl_base_module_atomic_cswap_fn_t) (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address,
struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, uint64_t compare,
uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata);
/**
* Diagnostic dump of btl state.
@ -813,7 +1114,14 @@ struct mca_btl_base_module_t {
uint32_t btl_latency; /**< relative ranking of latency used to prioritize btls */
uint32_t btl_bandwidth; /**< bandwidth (Mbytes/sec) supported by each endpoint */
uint32_t btl_flags; /**< flags (put/get...) */
size_t btl_seg_size; /**< size of a btl segment */
uint32_t btl_atomic_flags; /**< atomic operations supported (add, and, xor, etc) */
size_t btl_registration_handle_size; /**< size of the BTLs registration handles */
/* One-sided limitations (0 for no alignment, SIZE_MAX for no limit ) */
size_t btl_get_limit; /**< maximum size supported by the btl_get function */
size_t btl_get_alignment; /**< minimum alignment/size needed by btl_get (power of 2) */
size_t btl_put_limit; /**< maximum size supported by the btl_put function */
size_t btl_put_alignment; /**< minimum alignment/size needed by btl_put (power of 2) */
/* BTL function table */
mca_btl_base_module_add_procs_fn_t btl_add_procs;
@ -824,13 +1132,21 @@ struct mca_btl_base_module_t {
mca_btl_base_module_alloc_fn_t btl_alloc;
mca_btl_base_module_free_fn_t btl_free;
mca_btl_base_module_prepare_fn_t btl_prepare_src;
mca_btl_base_module_prepare_fn_t btl_prepare_dst;
mca_btl_base_module_send_fn_t btl_send;
mca_btl_base_module_sendi_fn_t btl_sendi;
mca_btl_base_module_put_fn_t btl_put;
mca_btl_base_module_get_fn_t btl_get;
mca_btl_base_module_dump_fn_t btl_dump;
/* atomic operations */
mca_btl_base_module_atomic_op64_fn_t btl_atomic_op;
mca_btl_base_module_atomic_fop64_fn_t btl_atomic_fop;
mca_btl_base_module_atomic_cswap_fn_t btl_atomic_cswap;
/* new memory registration functions */
mca_btl_base_module_register_mem_fn_t btl_register_mem; /**< memory registration function (NULL if not needed) */
mca_btl_base_module_deregister_mem_fn_t btl_deregister_mem; /**< memory deregistration function (NULL if not needed) */
/** the mpool associated with this btl (optional) */
mca_mpool_base_module_t* btl_mpool;
/** register a default error handler */

Просмотреть файл

@ -59,6 +59,9 @@ sources = \
btl_openib_fd.c \
btl_openib_ip.h \
btl_openib_ip.c \
btl_openib_put.c \
btl_openib_get.c \
btl_openib_atomic.c \
connect/base.h \
connect/btl_openib_connect_base.c \
connect/btl_openib_connect_empty.c \

Просмотреть файл

@ -91,6 +91,11 @@
#define MIN(a,b) ((a)<(b)?(a):(b))
#endif
static mca_btl_base_registration_handle_t *mca_btl_openib_register_mem (mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
void *base, size_t size, uint32_t flags);
static int mca_btl_openib_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle);
mca_btl_openib_module_t mca_btl_openib_module = {
.super = {
.btl_component = &mca_btl_openib_component.super,
@ -101,14 +106,19 @@ mca_btl_openib_module_t mca_btl_openib_module = {
.btl_alloc = mca_btl_openib_alloc,
.btl_free = mca_btl_openib_free,
.btl_prepare_src = mca_btl_openib_prepare_src,
.btl_prepare_dst = mca_btl_openib_prepare_dst,
.btl_send = mca_btl_openib_send,
.btl_sendi = mca_btl_openib_sendi, /* send immediate */
.btl_put = mca_btl_openib_put,
.btl_get = mca_btl_openib_get,
.btl_dump = mca_btl_base_dump,
.btl_register_error = mca_btl_openib_register_error_cb, /* error call back registration */
.btl_ft_event = mca_btl_openib_ft_event
.btl_ft_event = mca_btl_openib_ft_event,
.btl_register_mem = mca_btl_openib_register_mem,
.btl_deregister_mem = mca_btl_openib_deregister_mem,
#if HAVE_DECL_IBV_ATOMIC_HCA
.btl_atomic_fop = mca_btl_openib_atomic_fop,
.btl_atomic_cswap = mca_btl_openib_atomic_cswap,
#endif
}
};
@ -514,10 +524,12 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
if a user distributes different INI files or parameters for different node/procs,
it is on his own responsibility */
switch(mca_btl_openib_component.receive_queues_source) {
case MCA_BASE_VAR_SOURCE_COMMAND_LINE:
case MCA_BASE_VAR_SOURCE_ENV:
case MCA_BASE_VAR_SOURCE_MAX:
break;
case MCA_BASE_VAR_SOURCE_COMMAND_LINE:
case MCA_BASE_VAR_SOURCE_ENV:
case MCA_BASE_VAR_SOURCE_FILE:
case MCA_BASE_VAR_SOURCE_SET:
case MCA_BASE_VAR_SOURCE_OVERRIDE:
break;
/* If the queues configuration was set from command line
(with --mca btl_openib_receive_queues parameter) => both sides have a same configuration */
@ -526,40 +538,38 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
not possible that remote side got its queues configuration from command line =>
(by prio) the configuration was set from INI file or (if not configure)
by default queues configuration */
case MCA_BASE_VAR_SOURCE_FILE:
case MCA_BASE_VAR_SOURCE_SET:
case MCA_BASE_VAR_SOURCE_OVERRIDE:
if(NULL != values.receive_queues) {
recv_qps = values.receive_queues;
} else {
recv_qps = mca_btl_openib_component.default_recv_qps;
}
case BTL_OPENIB_RQ_SOURCE_DEVICE_INI:
if(NULL != values.receive_queues) {
recv_qps = values.receive_queues;
} else {
recv_qps = mca_btl_openib_component.default_recv_qps;
}
if(0 != strcmp(mca_btl_openib_component.receive_queues,
recv_qps)) {
opal_show_help("help-mpi-btl-openib.txt",
"unsupported queues configuration", true,
opal_process_info.nodename,
ibv_get_device_name(openib_btl->device->ib_dev),
(openib_btl->device->ib_dev_attr).vendor_id,
(openib_btl->device->ib_dev_attr).vendor_part_id,
mca_btl_openib_component.receive_queues,
opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
endpoint->rem_info.rem_vendor_id,
endpoint->rem_info.rem_vendor_part_id,
recv_qps);
if(0 != strcmp(mca_btl_openib_component.receive_queues,
recv_qps)) {
opal_show_help("help-mpi-btl-openib.txt",
"unsupported queues configuration", true,
opal_process_info.nodename,
ibv_get_device_name(openib_btl->device->ib_dev),
(openib_btl->device->ib_dev_attr).vendor_id,
(openib_btl->device->ib_dev_attr).vendor_part_id,
mca_btl_openib_component.receive_queues,
opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
endpoint->rem_info.rem_vendor_id,
endpoint->rem_info.rem_vendor_part_id,
recv_qps);
return OPAL_ERROR;
}
break;
return OPAL_ERROR;
}
break;
/* If the local queues configuration was set
by default queues => check all possible cases for remote side and compare */
case MCA_BASE_VAR_SOURCE_DEFAULT:
if(NULL != values.receive_queues) {
if(0 != strcmp(mca_btl_openib_component.receive_queues,
values.receive_queues)) {
opal_show_help("help-mpi-btl-openib.txt",
case MCA_BASE_VAR_SOURCE_DEFAULT:
if(NULL != values.receive_queues) {
if(0 != strcmp(mca_btl_openib_component.receive_queues,
values.receive_queues)) {
opal_show_help("help-mpi-btl-openib.txt",
"unsupported queues configuration", true,
opal_process_info.nodename,
ibv_get_device_name(openib_btl->device->ib_dev),
@ -571,10 +581,10 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
endpoint->rem_info.rem_vendor_part_id,
values.receive_queues);
return OPAL_ERROR;
}
return OPAL_ERROR;
}
break;
}
break;
}
return OPAL_SUCCESS;
@ -724,7 +734,7 @@ static int prepare_device_for_use (mca_btl_openib_device_t *device)
#if OPAL_HAVE_THREADS
if(mca_btl_openib_component.use_async_event_thread) {
mca_btl_openib_async_cmd_t async_command;
mca_btl_openib_async_cmd_t async_command;
/* start the async even thread if it is not already started */
if (start_async_event_thread() != OPAL_SUCCESS)
@ -732,8 +742,8 @@ static int prepare_device_for_use (mca_btl_openib_device_t *device)
device->got_fatal_event = false;
device->got_port_event = false;
async_command.a_cmd = OPENIB_ASYNC_CMD_FD_ADD;
async_command.fd = device->ib_dev_context->async_fd;
async_command.a_cmd = OPENIB_ASYNC_CMD_FD_ADD;
async_command.fd = device->ib_dev_context->async_fd;
if (write(mca_btl_openib_component.async_pipe[1],
&async_command, sizeof(mca_btl_openib_async_cmd_t))<0){
BTL_ERROR(("Failed to write to pipe [%d]",errno));
@ -948,6 +958,12 @@ int mca_btl_openib_add_procs(
return rc;
}
rc = mca_btl_openib_size_queues(openib_btl, nprocs);
if (OPAL_SUCCESS != rc) {
BTL_ERROR(("error creating cqs"));
return rc;
}
for (i = 0, local_procs = 0 ; i < (int) nprocs; i++) {
struct opal_proc_t* proc = procs[i];
mca_btl_openib_proc_t* ib_proc;
@ -959,11 +975,6 @@ int mca_btl_openib_add_procs(
local_procs ++;
}
/* OOB, XOOB, and RDMACM do not support SELF comunication, so
* mark the prco as unreachable by openib btl */
if (0 == opal_compare_proc(OPAL_PROC_MY_NAME, proc->proc_name)) {
continue;
}
#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
/* Most current iWARP adapters (June 2008) cannot handle
talking to other processes on the same host (!) -- so mark
@ -1133,7 +1144,7 @@ int mca_btl_openib_add_procs(
return OPAL_ERROR;
}
return mca_btl_openib_size_queues(openib_btl, nprocs);
return OPAL_SUCCESS;
}
/*
@ -1226,15 +1237,16 @@ ib_frag_alloc(mca_btl_openib_module_t *btl, size_t size, uint8_t order,
/* check if pending fragment has enough space for coalescing */
static mca_btl_openib_send_frag_t *check_coalescing(opal_list_t *frag_list,
opal_mutex_t *lock, mca_btl_base_endpoint_t *ep, size_t size)
opal_mutex_t *lock, struct mca_btl_base_endpoint_t *ep, size_t size,
mca_btl_openib_coalesced_frag_t **cfrag)
{
mca_btl_openib_send_frag_t *frag = NULL;
if(opal_list_is_empty(frag_list))
if (opal_list_is_empty(frag_list))
return NULL;
OPAL_THREAD_LOCK(lock);
if(!opal_list_is_empty(frag_list)) {
if (!opal_list_is_empty(frag_list)) {
int qp;
size_t total_length;
opal_list_item_t *i = opal_list_get_first(frag_list);
@ -1251,10 +1263,20 @@ static mca_btl_openib_send_frag_t *check_coalescing(opal_list_t *frag_list,
qp = to_base_frag(frag)->base.order;
if(total_length <= mca_btl_openib_component.qp_infos[qp].size)
opal_list_remove_first(frag_list);
else
if(total_length <= mca_btl_openib_component.qp_infos[qp].size) {
/* make sure we can allocate a coalescing frag before returning success */
*cfrag = alloc_coalesced_frag();
if (OPAL_LIKELY(NULL != cfrag)) {
(*cfrag)->send_frag = frag;
(*cfrag)->sent = false;
opal_list_remove_first(frag_list);
} else {
frag = NULL;
}
} else {
frag = NULL;
}
}
OPAL_THREAD_UNLOCK(lock);
@ -1281,7 +1303,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
mca_btl_openib_module_t *obtl = (mca_btl_openib_module_t*)btl;
int qp = frag_size_to_order(obtl, size);
mca_btl_openib_send_frag_t *sfrag = NULL;
mca_btl_openib_coalesced_frag_t *cfrag;
mca_btl_openib_coalesced_frag_t *cfrag = NULL;
assert(qp != MCA_BTL_NO_ORDER);
@ -1290,26 +1312,25 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
int prio = !(flags & MCA_BTL_DES_FLAGS_PRIORITY);
sfrag = check_coalescing(&ep->qps[qp].no_wqe_pending_frags[prio],
&ep->endpoint_lock, ep, size);
&ep->endpoint_lock, ep, size, &cfrag);
if(NULL == sfrag) {
if (NULL == sfrag) {
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
sfrag = check_coalescing(&ep->qps[qp].no_credits_pending_frags[prio],
&ep->endpoint_lock, ep, size);
&ep->endpoint_lock, ep, size, &cfrag);
} else {
sfrag = check_coalescing(
&obtl->qps[qp].u.srq_qp.pending_frags[prio],
&obtl->ib_lock, ep, size);
&obtl->ib_lock, ep, size, &cfrag);
}
}
}
if(NULL == sfrag)
if (NULL == sfrag) {
return ib_frag_alloc((mca_btl_openib_module_t*)btl, size, order, flags);
}
/* begin coalescing message */
cfrag = alloc_coalesced_frag();
cfrag->send_frag = sfrag;
/* fix up new coalescing header if this is the first coalesced frag */
if(sfrag->hdr != sfrag->chdr) {
@ -1343,10 +1364,9 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
to_base_frag(cfrag)->segment.base.seg_addr.pval = cfrag->hdr + 1;
to_base_frag(cfrag)->segment.base.seg_len = size;
/* save coalesced fragment on a main fragment; we will need it after send
* completion to free it and to call upper layer callback */
opal_list_append(&sfrag->coalesced_frags, (opal_list_item_t*)cfrag);
sfrag->coalesced_length += (size+sizeof(mca_btl_openib_header_coalesced_t));
/* NTH: there is no reason to append the coalesced fragment here. No more
* fragments will be added until either send or free has been called on
* the coalesced frag. */
to_base_frag(cfrag)->base.des_flags = flags;
@ -1363,18 +1383,6 @@ int mca_btl_openib_free(
struct mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des)
{
/* is this fragment pointing at user memory? */
if(MCA_BTL_OPENIB_FRAG_SEND_USER == openib_frag_type(des) ||
MCA_BTL_OPENIB_FRAG_RECV_USER == openib_frag_type(des)) {
mca_btl_openib_com_frag_t* frag = to_com_frag(des);
if(frag->registration != NULL) {
btl->btl_mpool->mpool_deregister(btl->btl_mpool,
(mca_mpool_base_registration_t*)frag->registration);
frag->registration = NULL;
}
}
/* reset those field on free so we will not have to do it on alloc */
to_base_frag(des)->base.des_flags = 0;
switch(openib_frag_type(des)) {
@ -1390,15 +1398,18 @@ int mca_btl_openib_free(
to_send_frag(des)->hdr + 1;
assert(!opal_list_get_size(&to_send_frag(des)->coalesced_frags));
/* fall through */
case MCA_BTL_OPENIB_FRAG_RECV:
case MCA_BTL_OPENIB_FRAG_RECV_USER:
case MCA_BTL_OPENIB_FRAG_SEND_USER:
to_base_frag(des)->base.des_remote = NULL;
to_base_frag(des)->base.des_remote_count = 0;
break;
default:
break;
}
if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED && !to_coalesced_frag(des)->sent) {
mca_btl_openib_send_frag_t *sfrag = to_coalesced_frag(des)->send_frag;
/* the coalesced fragment would have sent the original fragment but that
* will not happen so send the fragment now */
mca_btl_openib_endpoint_send(to_com_frag(sfrag)->endpoint, sfrag);
}
MCA_BTL_IB_FRAG_RETURN(des);
return OPAL_SUCCESS;
@ -1430,7 +1441,6 @@ int mca_btl_openib_free(
mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
@ -1438,7 +1448,6 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
uint32_t flags)
{
mca_btl_openib_module_t *openib_btl;
mca_btl_openib_reg_t *openib_reg;
mca_btl_openib_com_frag_t *frag = NULL;
struct iovec iov;
uint32_t iov_count = 1;
@ -1448,83 +1457,20 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
openib_btl = (mca_btl_openib_module_t*)btl;
#if OPAL_CUDA_GDR_SUPPORT
if(opal_convertor_cuda_need_buffers(convertor) == false && 0 == reserve) {
#else
if(opal_convertor_need_buffers(convertor) == false && 0 == reserve) {
#endif /* OPAL_CUDA_GDR_SUPPORT */
/* GMS bloody HACK! */
if(registration != NULL || max_data > btl->btl_max_send_size) {
frag = alloc_send_user_frag();
if(NULL == frag) {
return NULL;
}
iov.iov_len = max_data;
iov.iov_base = NULL;
opal_convertor_pack(convertor, &iov, &iov_count, &max_data);
*size = max_data;
if(NULL == registration) {
rc = btl->btl_mpool->mpool_register(btl->btl_mpool,
iov.iov_base, max_data, 0, &registration);
if(OPAL_SUCCESS != rc || NULL == registration) {
MCA_BTL_IB_FRAG_RETURN(frag);
return NULL;
}
/* keep track of the registration we did */
to_com_frag(frag)->registration =
(mca_btl_openib_reg_t*)registration;
}
openib_reg = (mca_btl_openib_reg_t*)registration;
frag->sg_entry.length = max_data;
frag->sg_entry.lkey = openib_reg->mr->lkey;
frag->sg_entry.addr = (uint64_t)(uintptr_t)iov.iov_base;
to_base_frag(frag)->base.order = order;
to_base_frag(frag)->base.des_flags = flags;
to_base_frag(frag)->segment.base.seg_len = max_data;
to_base_frag(frag)->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) iov.iov_base;
to_base_frag(frag)->segment.key = frag->sg_entry.lkey;
assert(MCA_BTL_NO_ORDER == order);
BTL_VERBOSE(("frag->sg_entry.lkey = %" PRIu32 " .addr = %" PRIx64,
frag->sg_entry.lkey, frag->sg_entry.addr));
return &to_base_frag(frag)->base;
}
}
assert(MCA_BTL_NO_ORDER == order);
if(max_data + reserve > btl->btl_max_send_size) {
if (max_data + reserve > btl->btl_max_send_size) {
max_data = btl->btl_max_send_size - reserve;
}
if (OPAL_UNLIKELY(0 == reserve)) {
frag = (mca_btl_openib_com_frag_t *) ib_frag_alloc(openib_btl, max_data, order, flags);
if(NULL == frag)
return NULL;
/* NTH: this frag will be ue used for either a get or put so we need to set the lval to be
consistent with the usage in get and put. the pval will be restored in mca_btl_openib_free */
ptr = to_base_frag(frag)->segment.base.seg_addr.pval;
to_base_frag(frag)->segment.base.seg_addr.lval =
(uint64_t)(uintptr_t) ptr;
} else {
frag =
(mca_btl_openib_com_frag_t *) mca_btl_openib_alloc(btl, endpoint, order,
frag = (mca_btl_openib_com_frag_t *) mca_btl_openib_alloc (btl, endpoint, order,
max_data + reserve, flags);
if(NULL == frag)
return NULL;
ptr = to_base_frag(frag)->segment.base.seg_addr.pval;
if (NULL == frag) {
return NULL;
}
ptr = to_base_frag(frag)->segment.base.seg_addr.pval;
iov.iov_len = max_data;
iov.iov_base = (IOVBASE_TYPE *) ( (unsigned char*) ptr + reserve );
rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data);
@ -1547,103 +1493,6 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
return &to_base_frag(frag)->base;
}
/**
* Prepare the dst buffer
*
* @param btl (IN) BTL module
* @param peer (IN) BTL peer addressing
* prepare dest's behavior depends on the following:
* Has a valid memory registration been passed to prepare_src?
* if so we attempt to use the pre-registered user-buffer, if the memory registration
* is to small (only a portion of the user buffer) then we must reregister the user buffer
* Has the user requested the memory to be left pinned?
* if so we insert the memory registration into a memory tree for later lookup, we
* may also remove a previous registration if a MRU (most recently used) list of
* registrations is full, this prevents resources from being exhausted.
*/
mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size,
uint32_t flags)
{
mca_btl_openib_module_t *openib_btl;
mca_btl_openib_component_t *openib_component;
mca_btl_openib_com_frag_t *frag;
mca_btl_openib_reg_t *openib_reg;
uint32_t max_msg_sz;
int rc;
void *buffer;
openib_btl = (mca_btl_openib_module_t*)btl;
openib_component = (mca_btl_openib_component_t*)btl->btl_component;
frag = alloc_recv_user_frag();
if(NULL == frag) {
return NULL;
}
/* max_msg_sz is the maximum message size of the HCA (hw limitation)
set the minimum between local max_msg_sz and the remote */
max_msg_sz = MIN(openib_btl->ib_port_attr.max_msg_sz,
endpoint->endpoint_btl->ib_port_attr.max_msg_sz);
/* check if user has explicitly limited the max message size */
if (openib_component->max_hw_msg_size > 0 &&
max_msg_sz > (size_t)openib_component->max_hw_msg_size) {
max_msg_sz = openib_component->max_hw_msg_size;
}
/* limit the message so to max_msg_sz */
if (*size > (size_t)max_msg_sz) {
*size = (size_t)max_msg_sz;
BTL_VERBOSE(("message size limited to %" PRIsize_t "\n", *size));
}
opal_convertor_get_current_pointer(convertor, &buffer);
if(NULL == registration){
/* we didn't get a memory registration passed in, so we have to
* register the region ourselves
*/
uint32_t mflags = 0;
#if OPAL_CUDA_GDR_SUPPORT
if (convertor->flags & CONVERTOR_CUDA) {
mflags |= MCA_MPOOL_FLAGS_CUDA_GPU_MEM;
}
#endif /* OPAL_CUDA_GDR_SUPPORT */
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, buffer, *size, mflags,
&registration);
if(OPAL_SUCCESS != rc || NULL == registration) {
MCA_BTL_IB_FRAG_RETURN(frag);
return NULL;
}
/* keep track of the registration we did */
frag->registration = (mca_btl_openib_reg_t*)registration;
}
openib_reg = (mca_btl_openib_reg_t*)registration;
frag->sg_entry.length = *size;
frag->sg_entry.lkey = openib_reg->mr->lkey;
frag->sg_entry.addr = (uint64_t)(uintptr_t)buffer;
to_base_frag(frag)->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) buffer;
to_base_frag(frag)->segment.base.seg_len = *size;
to_base_frag(frag)->segment.key = openib_reg->mr->rkey;
to_base_frag(frag)->base.order = order;
to_base_frag(frag)->base.des_flags = flags;
BTL_VERBOSE(("frag->sg_entry.lkey = %" PRIu32 " .addr = %" PRIx64 " "
"rkey = %" PRIu32, frag->sg_entry.lkey, frag->sg_entry.addr,
openib_reg->mr->rkey));
return &to_base_frag(frag)->base;
}
static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl) {
mca_btl_openib_module_t* openib_btl;
mca_btl_openib_endpoint_t* endpoint;
@ -1796,16 +1645,15 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
{
mca_btl_openib_module_t *obtl = (mca_btl_openib_module_t*)btl;
size_t size = payload_size + header_size;
size_t eager_limit;
int qp = frag_size_to_order(obtl, size),
prio = !(flags & MCA_BTL_DES_FLAGS_PRIORITY),
ib_rc;
int32_t cm_return;
bool do_rdma = false;
ompi_free_list_item_t* item = NULL;
mca_btl_openib_frag_t *frag;
mca_btl_openib_header_t *hdr;
int send_signaled;
int rc;
OPAL_THREAD_LOCK(&ep->endpoint_lock);
@ -1827,45 +1675,26 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
/* Allocate WQE */
if(OPAL_UNLIKELY(qp_get_wqe(ep, qp) < 0)) {
goto no_credits_or_wqe;
}
/* eager rdma or send ? Check eager rdma credits */
/* Note: Maybe we want to implement isend only for eager rdma ?*/
eager_limit = mca_btl_openib_component.eager_limit +
sizeof(mca_btl_openib_header_coalesced_t) +
sizeof(mca_btl_openib_control_header_t);
if(OPAL_LIKELY(size <= eager_limit)) {
if(acquire_eager_rdma_send_credit(ep) == OPAL_SUCCESS) {
do_rdma = true;
}
}
/* if(!do_rdma && acquire_send_credit(ep, frag) != OPAL_SUCCESS) { */
/* Check send credits if it is no rdma */
if(!do_rdma) {
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
if(OPAL_UNLIKELY(OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.sd_credits, -1) < 0)){
OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.sd_credits, 1);
goto no_credits_or_wqe;
}
} else {
if(OPAL_UNLIKELY(OPAL_THREAD_ADD32(&obtl->qps[qp].u.srq_qp.sd_credits, -1) < 0)){
OPAL_THREAD_ADD32(&obtl->qps[qp].u.srq_qp.sd_credits, 1);
goto no_credits_or_wqe;
}
}
goto cant_send_wqe;
}
/* Allocate fragment */
OMPI_FREE_LIST_GET_MT(&obtl->device->qps[qp].send_free, item);
if(OPAL_UNLIKELY(NULL == item)) {
/* we don't return NULL because maybe later we will try to coalesce */
goto no_frags;
goto cant_send_wqe;
}
frag = to_base_frag(item);
hdr = to_send_frag(item)->hdr;
/* eager rdma or send ? Check eager rdma credits */
/* Note: Maybe we want to implement isend only for eager rdma ?*/
rc = mca_btl_openib_endpoint_credit_acquire (ep, qp, prio, size, &do_rdma,
to_send_frag(frag), false);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
goto cant_send_frag;
}
frag->segment.base.seg_len = size;
frag->base.order = qp;
frag->base.des_flags = flags;
@ -1890,29 +1719,6 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
assert(max_data == payload_size);
}
/* Set all credits */
BTL_OPENIB_GET_CREDITS(ep->eager_rdma_local.credits, hdr->credits);
if(hdr->credits)
hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG;
if(!do_rdma) {
if(BTL_OPENIB_QP_TYPE_PP(qp) && 0 == hdr->credits) {
BTL_OPENIB_GET_CREDITS(ep->qps[qp].u.pp_qp.rd_credits, hdr->credits);
}
} else {
hdr->credits |= (qp << 11);
}
BTL_OPENIB_GET_CREDITS(ep->qps[qp].u.pp_qp.cm_return, cm_return);
/* cm_seen is only 8 bytes, but cm_return is 32 bytes */
if(cm_return > 255) {
hdr->cm_seen = 255;
cm_return -= 255;
OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.cm_return, cm_return);
} else {
hdr->cm_seen = cm_return;
}
#if BTL_OPENIB_FAILOVER_ENABLED
send_signaled = 1;
#else
@ -1920,7 +1726,7 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
#endif
ib_rc = post_send(ep, to_send_frag(item), do_rdma, send_signaled);
if(!ib_rc) {
if (!ib_rc) {
if (0 == send_signaled) {
MCA_BTL_IB_FRAG_RETURN(frag);
}
@ -1931,37 +1737,28 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
}
#endif
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
return OPAL_SUCCESS;
}
/* Failed to send, do clean up all allocated resources */
if(ep->nbo) {
if (ep->nbo) {
BTL_OPENIB_HEADER_NTOH(*hdr);
}
if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
OPAL_THREAD_ADD32(&ep->eager_rdma_local.credits,
BTL_OPENIB_CREDITS(hdr->credits));
}
if (!do_rdma && BTL_OPENIB_QP_TYPE_PP(qp)) {
OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.rd_credits,
hdr->credits);
}
no_frags:
if(do_rdma) {
OPAL_THREAD_ADD32(&ep->eager_rdma_remote.tokens, 1);
} else {
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.sd_credits, 1);
} else if BTL_OPENIB_QP_TYPE_SRQ(qp){
OPAL_THREAD_ADD32(&obtl->qps[qp].u.srq_qp.sd_credits, 1);
}
}
no_credits_or_wqe:
qp_put_wqe(ep, qp);
mca_btl_openib_endpoint_credit_release (ep, qp, do_rdma, to_send_frag(frag));
cant_send_frag:
MCA_BTL_IB_FRAG_RETURN(frag);
cant_send_wqe:
qp_put_wqe (ep, qp);
cant_send:
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
/* We can not send the data directly, so we just return descriptor */
*descriptor = mca_btl_openib_alloc(btl, ep, order, size, flags);
if (NULL != descriptor) {
*descriptor = mca_btl_openib_alloc(btl, ep, order, size, flags);
}
return OPAL_ERR_RESOURCE_BUSY;
}
/*
@ -1981,11 +1778,19 @@ int mca_btl_openib_send(
openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED);
if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED) {
frag = to_coalesced_frag(des)->send_frag;
/* save coalesced fragment on a main fragment; we will need it after send
* completion to free it and to call upper layer callback */
opal_list_append(&frag->coalesced_frags, (opal_list_item_t*) des);
frag->coalesced_length += to_coalesced_frag(des)->hdr->alloc_size +
sizeof(mca_btl_openib_header_coalesced_t);
to_coalesced_frag(des)->sent = true;
to_coalesced_frag(des)->hdr->tag = tag;
to_coalesced_frag(des)->hdr->size = des->des_local->seg_len;
to_coalesced_frag(des)->hdr->size = des->des_segments->seg_len;
if(ep->nbo)
BTL_OPENIB_HEADER_COALESCED_HTON(*to_coalesced_frag(des)->hdr);
frag = to_coalesced_frag(des)->send_frag;
} else {
frag = to_send_frag(des);
to_com_frag(des)->endpoint = ep;
@ -1997,161 +1802,34 @@ int mca_btl_openib_send(
return mca_btl_openib_endpoint_send(ep, frag);
}
/*
* RDMA WRITE local buffer to remote buffer address.
*/
int mca_btl_openib_put( mca_btl_base_module_t* btl,
mca_btl_base_endpoint_t* ep,
mca_btl_base_descriptor_t* descriptor)
static mca_btl_base_registration_handle_t *mca_btl_openib_register_mem (mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
void *base, size_t size, uint32_t flags)
{
mca_btl_openib_segment_t *src_seg = (mca_btl_openib_segment_t *) descriptor->des_local;
mca_btl_openib_segment_t *dst_seg = (mca_btl_openib_segment_t *) descriptor->des_remote;
struct ibv_send_wr* bad_wr;
mca_btl_openib_out_frag_t* frag = to_out_frag(descriptor);
int qp = descriptor->order;
uint64_t rem_addr = dst_seg->base.seg_addr.lval;
uint32_t rkey = dst_seg->key;
mca_btl_openib_reg_t *reg;
uint32_t mflags = 0;
int rc;
assert(openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_SEND_USER ||
openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_SEND);
#if OPAL_CUDA_GDR_SUPPORT
if (flags & MCA_BTL_REG_FLAG_CUDA_GPU_MEM) {
mflags |= MCA_MPOOL_FLAGS_CUDA_GPU_MEM;
}
#endif /* OPAL_CUDA_GDR_SUPPORT */
descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
if(ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
int rc;
OPAL_THREAD_LOCK(&ep->endpoint_lock);
rc = check_endpoint_state(ep, descriptor, &ep->pending_put_frags);
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
if(OPAL_ERR_RESOURCE_BUSY == rc)
return OPAL_SUCCESS;
if(OPAL_SUCCESS != rc)
return rc;
rc = btl->btl_mpool->mpool_register (btl->btl_mpool, base, size, mflags,
(mca_mpool_base_registration_t **) &reg);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc || NULL == reg)) {
return NULL;
}
if(MCA_BTL_NO_ORDER == qp)
qp = mca_btl_openib_component.rdma_qp;
/* check for a send wqe */
if (qp_get_wqe(ep, qp) < 0) {
qp_put_wqe(ep, qp);
OPAL_THREAD_LOCK(&ep->endpoint_lock);
opal_list_append(&ep->pending_put_frags, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
return OPAL_SUCCESS;
}
/* post descriptor */
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
if((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
!= (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
rem_addr = opal_swap_bytes8(rem_addr);
rkey = opal_swap_bytes4(rkey);
}
#endif
frag->sr_desc.wr.rdma.remote_addr = rem_addr;
frag->sr_desc.wr.rdma.rkey = rkey;
to_com_frag(frag)->sg_entry.addr = src_seg->base.seg_addr.lval;
to_com_frag(frag)->sg_entry.length = src_seg->base.seg_len;
to_com_frag(frag)->endpoint = ep;
#if HAVE_XRC
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp))
frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num;
#endif
descriptor->order = qp;
/* Setting opcode on a frag constructor isn't enough since prepare_src
* may return send_frag instead of put_frag */
frag->sr_desc.opcode = IBV_WR_RDMA_WRITE;
frag->sr_desc.send_flags = ib_send_flags(src_seg->base.seg_len, &(ep->qps[qp]), 1);
qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
qp_reset_signal_count(ep, qp);
if(ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr))
return OPAL_ERROR;
return OPAL_SUCCESS;
return &reg->btl_handle;
}
/*
* RDMA READ remote buffer to local buffer address.
*/
int mca_btl_openib_get(mca_btl_base_module_t* btl,
mca_btl_base_endpoint_t* ep,
mca_btl_base_descriptor_t* descriptor)
static int mca_btl_openib_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle)
{
mca_btl_openib_segment_t *src_seg = (mca_btl_openib_segment_t *) descriptor->des_remote;
mca_btl_openib_segment_t *dst_seg = (mca_btl_openib_segment_t *) descriptor->des_local;
struct ibv_send_wr* bad_wr;
mca_btl_openib_get_frag_t* frag = to_get_frag(descriptor);
int qp = descriptor->order;
uint64_t rem_addr = src_seg->base.seg_addr.lval;
uint32_t rkey = src_seg->key;
mca_btl_openib_reg_t *reg = (mca_btl_openib_reg_t *)((intptr_t) handle - offsetof (mca_btl_openib_reg_t, btl_handle));
assert(openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_RECV_USER);
descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
if(ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
int rc;
OPAL_THREAD_LOCK(&ep->endpoint_lock);
rc = check_endpoint_state(ep, descriptor, &ep->pending_get_frags);
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
if(OPAL_ERR_RESOURCE_BUSY == rc)
return OPAL_SUCCESS;
if(OPAL_SUCCESS != rc)
return rc;
}
if(MCA_BTL_NO_ORDER == qp)
qp = mca_btl_openib_component.rdma_qp;
/* check for a send wqe */
if (qp_get_wqe(ep, qp) < 0) {
qp_put_wqe(ep, qp);
OPAL_THREAD_LOCK(&ep->endpoint_lock);
opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
return OPAL_SUCCESS;
}
/* check for a get token */
if(OPAL_THREAD_ADD32(&ep->get_tokens,-1) < 0) {
qp_put_wqe(ep, qp);
OPAL_THREAD_ADD32(&ep->get_tokens,1);
OPAL_THREAD_LOCK(&ep->endpoint_lock);
opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
return OPAL_SUCCESS;
}
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
if((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
!= (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
rem_addr = opal_swap_bytes8(rem_addr);
rkey = opal_swap_bytes4(rkey);
}
#endif
frag->sr_desc.wr.rdma.remote_addr = rem_addr;
frag->sr_desc.wr.rdma.rkey = rkey;
to_com_frag(frag)->sg_entry.addr = dst_seg->base.seg_addr.lval;
to_com_frag(frag)->sg_entry.length = dst_seg->base.seg_len;
to_com_frag(frag)->endpoint = ep;
#if HAVE_XRC
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp))
frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num;
#endif
descriptor->order = qp;
qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
qp_reset_signal_count(ep, qp);
if(ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr))
return OPAL_ERROR;
btl->btl_mpool->mpool_deregister (btl->btl_mpool, (mca_mpool_base_registration_t *) reg);
return OPAL_SUCCESS;
}

Просмотреть файл

@ -126,10 +126,7 @@ struct mca_btl_openib_qp_info_t {
(BTL_OPENIB_QP_TYPE(Q) == MCA_BTL_OPENIB_XRC_QP)
typedef enum {
BTL_OPENIB_RQ_SOURCE_DEFAULT,
BTL_OPENIB_RQ_SOURCE_MCA,
BTL_OPENIB_RQ_SOURCE_DEVICE_INI,
BTL_OPENIB_RQ_SOURCE_MAX
BTL_OPENIB_RQ_SOURCE_DEVICE_INI = MCA_BASE_VAR_SOURCE_MAX,
} btl_openib_receive_queues_source_t;
typedef enum {
@ -497,9 +494,15 @@ typedef struct mca_btl_openib_module_t mca_btl_openib_module_t;
extern mca_btl_openib_module_t mca_btl_openib_module;
struct mca_btl_base_registration_handle_t {
uint32_t rkey;
uint32_t lkey;
};
struct mca_btl_openib_reg_t {
mca_mpool_base_registration_t base;
struct ibv_mr *mr;
mca_btl_base_registration_handle_t btl_handle;
};
typedef struct mca_btl_openib_reg_t mca_btl_openib_reg_t;
@ -612,32 +615,182 @@ extern int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t** descriptor
);
/**
* PML->BTL Initiate a put of the specified size.
*
* @param btl (IN) BTL instance
* @param btl_peer (IN) BTL peer addressing
* @param descriptor (IN) Descriptor of data to be transmitted.
*/
extern int mca_btl_openib_put(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* btl_peer,
struct mca_btl_base_descriptor_t* descriptor
);
/* forward decaration for internal put/get */
struct mca_btl_openib_put_frag_t;
struct mca_btl_openib_get_frag_t;
/**
* PML->BTL Initiate a get of the specified size.
* @brief Schedule a put fragment with the HCA (internal)
*
* @param btl (IN) BTL instance
* @param btl_base_peer (IN) BTL peer addressing
* @param descriptor (IN) Descriptor of data to be transmitted.
* @param ep (IN) BTL endpoint
* @param frag (IN) Fragment prepared by mca_btl_openib_put
*
* If the fragment can not be scheduled due to resource limitations then
* the fragment will be put on the pending put fragment list and retried
* when another get/put fragment has completed.
*/
extern int mca_btl_openib_get(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* btl_peer,
struct mca_btl_base_descriptor_t* descriptor
);
int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
struct mca_btl_openib_put_frag_t *frag);
/**
* @brief Schedule an RDMA write with the HCA
*
* @param btl (IN) BTL instance
* @param ep (IN) BTL endpoint
* @param local_address (IN) Source address
* @param remote_address (IN) Destination address
* @param local_handle (IN) Registration handle for region containing the region {local_address, size}
* @param remote_handle (IN) Registration handle for region containing the region {remote_address, size}
* @param size (IN) Number of bytes to write
* @param flags (IN) Transfer flags
* @param order (IN) Ordering
* @param cbfunc (IN) Function to call on completion
* @param cbcontext (IN) Context for completion callback
* @param cbdata (IN) Data for completion callback
*
* @return OPAL_ERR_BAD_PARAM if a bad parameter was passed
* @return OPAL_SUCCCESS if the operation was successfully scheduled
*
* This function will attempt to schedule a put operation with the HCA.
*/
int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
/**
* @brief Schedule a get fragment with the HCA (internal)
*
* @param btl (IN) BTL instance
* @param ep (IN) BTL endpoint
* @param qp (IN) ID of queue pair to schedule the get on
* @param frag (IN) Fragment prepared by mca_btl_openib_get
*
* If the fragment can not be scheduled due to resource limitations then
* the fragment will be put on the pending get fragment list and retried
* when another get/put fragment has completed.
*/
int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
struct mca_btl_openib_get_frag_t *frag);
/**
* @brief Schedule an RDMA read with the HCA
*
* @param btl (IN) BTL instance
* @param ep (IN) BTL endpoint
* @param local_address (IN) Destination address
* @param remote_address (IN) Source address
* @param local_handle (IN) Registration handle for region containing the region {local_address, size}
* @param remote_handle (IN) Registration handle for region containing the region {remote_address, size}
* @param size (IN) Number of bytes to read
* @param flags (IN) Transfer flags
* @param order (IN) Ordering
* @param cbfunc (IN) Function to call on completion
* @param cbcontext (IN) Context for completion callback
* @param cbdata (IN) Data for completion callback
*
* @return OPAL_ERR_BAD_PARAM if a bad parameter was passed
* @return OPAL_SUCCCESS if the operation was successfully scheduled
*
* This function will attempt to schedule a get operation with the HCA.
*/
int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
/**
* Initiate an asynchronous fetching atomic operation.
* Completion Semantics: if this function returns a 1 then the operation
* is complete. a return of OPAL_SUCCESS indicates
* the atomic operation has been queued with the
* network.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param local_address (OUT) Local address to store the result in
* @param remote_address (IN) Remote address perfom operation on to (registered remotely)
* @param local_handle (IN) Local registration handle for region containing
* (local_address, local_address + 8)
* @param remote_handle (IN) Remote registration handle for region containing
* (remote_address, remote_address + 8)
* @param op (IN) Operation to perform
* @param operand (IN) Operand for the operation
* @param flags (IN) Flags for this put operation
* @param order (IN) Ordering
* @param cbfunc (IN) Function to call on completion (if queued)
* @param cbcontext (IN) Context for the callback
* @param cbdata (IN) Data for callback
*
* @retval OPAL_SUCCESS The operation was successfully queued
* @retval 1 The operation is complete
* @retval OPAL_ERROR The operation was NOT successfully queued
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
* operation. Try again later
* @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
* alignment restrictions or the operation {op} is not supported
* by the hardware.
*
* After the operation is complete the remote address specified by {remote_address} and
* {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand.
* {local_address} will be updated with the previous value stored in {remote_address}.
* The btl will guarantee consistency of atomic operations performed via the btl. Note,
* however, that not all btls will provide consistency between btl atomic operations and
* cpu atomics.
*/
int mca_btl_openib_atomic_fop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address,
struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
int64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata);
/**
* Initiate an asynchronous compare and swap operation.
* Completion Semantics: if this function returns a 1 then the operation
* is complete. a return of OPAL_SUCCESS indicates
* the atomic operation has been queued with the
* network.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param local_address (OUT) Local address to store the result in
* @param remote_address (IN) Remote address perfom operation on to (registered remotely)
* @param local_handle (IN) Local registration handle for region containing
* (local_address, local_address + 8)
* @param remote_handle (IN) Remote registration handle for region containing
* (remote_address, remote_address + 8)
* @param compare (IN) Operand for the operation
* @param value (IN) Value to store on success
* @param flags (IN) Flags for this put operation
* @param order (IN) Ordering
* @param cbfunc (IN) Function to call on completion (if queued)
* @param cbcontext (IN) Context for the callback
* @param cbdata (IN) Data for callback
*
* @retval OPAL_SUCCESS The operation was successfully queued
* @retval 1 The operation is complete
* @retval OPAL_ERROR The operation was NOT successfully queued
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
* operation. Try again later
* @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
* alignment restrictions or the operation {op} is not supported
* by the hardware.
*
* After the operation is complete the remote address specified by {remote_address} and
* {remote_handle} will be updated with {value} if *remote_address == compare.
* {local_address} will be updated with the previous value stored in {remote_address}.
* The btl will guarantee consistency of atomic operations performed via the btl. Note,
* however, that not all btls will provide consistency between btl atomic operations and
* cpu atomics.
*/
int mca_btl_openib_atomic_cswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address,
struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, int64_t compare,
int64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata);
/**
* Allocate a descriptor.
@ -674,7 +827,6 @@ extern int mca_btl_openib_free(
mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* peer,
mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
@ -682,22 +834,6 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
uint32_t flags
);
/**
* Allocate a descriptor initialized for RDMA write.
*
* @param btl (IN) BTL module
* @param peer (IN) BTL peer addressing
*/
extern mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* peer,
mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size,
uint32_t flags);
extern void mca_btl_openib_frag_progress_pending_put_get(
struct mca_btl_base_endpoint_t*, const int);

135
opal/mca/btl/openib/btl_openib_atomic.c Обычный файл
Просмотреть файл

@ -0,0 +1,135 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_openib.h"
#include "btl_openib_endpoint.h"
#include "btl_openib_xrc.h"
#if HAVE_DECL_IBV_ATOMIC_HCA
static int mca_btl_openib_atomic_internal (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, enum ibv_wr_opcode opcode,
int64_t operand, int operand2, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata)
{
mca_btl_openib_get_frag_t* frag = NULL;
int qp = order;
int rc;
frag = to_get_frag(alloc_recv_user_frag());
if (OPAL_UNLIKELY(NULL == frag)) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
if (MCA_BTL_NO_ORDER == qp) {
qp = mca_btl_openib_component.rdma_qp;
}
/* set base descriptor flags */
to_base_frag(frag)->base.order = qp;
/* free this descriptor when the operation is complete */
to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
/* set up scatter-gather entry */
to_com_frag(frag)->sg_entry.length = 8;
to_com_frag(frag)->sg_entry.lkey = local_handle->lkey;
to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t) local_address;
to_com_frag(frag)->endpoint = endpoint;
/* set up rdma callback */
frag->cb.func = cbfunc;
frag->cb.context = cbcontext;
frag->cb.data = cbdata;
frag->cb.local_handle = local_handle;
/* set up descriptor */
frag->sr_desc.wr.atomic.remote_addr = remote_address;
frag->sr_desc.opcode = opcode;
frag->sr_desc.wr.atomic.compare_add = operand;
frag->sr_desc.wr.atomic.swap = operand2;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
if((endpoint->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
!= (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
frag->sr_desc.wr.atomic.rkey = opal_swap_bytes4 (remote_handle->rkey);
} else
#endif
{
frag->sr_desc.wr.atomic.rkey = remote_handle->rkey;
}
#if HAVE_XRC
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
frag->sr_desc.xrc_remote_srq_num=endpoint->rem_info.rem_srqs[qp].rem_srq_num;
}
#endif
if (endpoint->endpoint_state != MCA_BTL_IB_CONNECTED) {
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
rc = check_endpoint_state(endpoint, &to_base_frag(frag)->base, &endpoint->pending_get_frags);
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
if (OPAL_ERR_RESOURCE_BUSY == rc) {
return OPAL_SUCCESS;
}
if (OPAL_SUCCESS != rc) {
MCA_BTL_IB_FRAG_RETURN (frag);
return rc;
}
}
rc = mca_btl_openib_get_internal (btl, endpoint, frag);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
rc = OPAL_SUCCESS;
OPAL_THREAD_SCOPED_LOCK(&endpoint->endpoint_lock,
opal_list_append(&endpoint->pending_get_frags, (opal_list_item_t*)frag));
} else {
MCA_BTL_IB_FRAG_RETURN (frag);
}
}
return rc;
}
int mca_btl_openib_atomic_fop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address,
struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
int64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata)
{
if (OPAL_UNLIKELY(MCA_BTL_ATOMIC_ADD != op)) {
return OPAL_ERR_NOT_SUPPORTED;
}
return mca_btl_openib_atomic_internal (btl, endpoint, local_address, remote_address, local_handle,
remote_handle, IBV_WR_ATOMIC_FETCH_AND_ADD, operand, 0,
flags, order, cbfunc, cbcontext, cbdata);
}
int mca_btl_openib_atomic_cswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address,
struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, int64_t compare,
int64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata)
{
return mca_btl_openib_atomic_internal (btl, endpoint, local_address, remote_address, local_handle,
remote_handle, IBV_WR_ATOMIC_CMP_AND_SWP, compare, value,
flags, order, cbfunc, cbcontext, cbdata);
}
#endif

Просмотреть файл

@ -471,7 +471,7 @@ static void btl_openib_control(mca_btl_base_module_t* btl,
mca_btl_openib_header_coalesced_t *clsc_hdr =
(mca_btl_openib_header_coalesced_t*)(ctl_hdr + 1);
mca_btl_active_message_callback_t* reg;
size_t len = des->des_local->seg_len - sizeof(*ctl_hdr);
size_t len = des->des_segments->seg_len - sizeof(*ctl_hdr);
switch (ctl_hdr->type) {
case MCA_BTL_OPENIB_CONTROL_CREDITS:
@ -522,8 +522,8 @@ static void btl_openib_control(mca_btl_base_module_t* btl,
skip = (sizeof(*clsc_hdr) + clsc_hdr->alloc_size - pad);
tmp_des.des_local = &tmp_seg;
tmp_des.des_local_count = 1;
tmp_des.des_segments = &tmp_seg;
tmp_des.des_segment_count = 1;
tmp_seg.seg_addr.pval = clsc_hdr + 1;
tmp_seg.seg_len = clsc_hdr->size;
@ -583,6 +583,10 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size,
enum ibv_access_flags access_flag = (enum ibv_access_flags) (IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
#if HAVE_DECL_IBV_ATOMIC_HCA
access_flag |= IBV_ACCESS_REMOTE_ATOMIC;
#endif
if (device->mem_reg_max &&
device->mem_reg_max < (device->mem_reg_active + size)) {
return OPAL_ERR_OUT_OF_RESOURCE;
@ -605,6 +609,9 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size,
return OPAL_ERR_OUT_OF_RESOURCE;
}
openib_reg->btl_handle.lkey = openib_reg->mr->lkey;
openib_reg->btl_handle.rkey = openib_reg->mr->rkey;
OPAL_OUTPUT_VERBOSE((30, mca_btl_openib_component.memory_registration_verbose,
"openib_reg_mr: base=%p, bound=%p, size=%d, flags=0x%x", reg->base, reg->bound,
(int) (reg->bound - reg->base + 1), reg->flags));
@ -804,7 +811,30 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbfunc = btl_openib_control;
mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbdata = NULL;
openib_btl->super.btl_seg_size = sizeof (mca_btl_openib_segment_t);
if (openib_btl->super.btl_get_limit > openib_btl->ib_port_attr.max_msg_sz) {
openib_btl->super.btl_get_limit = openib_btl->ib_port_attr.max_msg_sz;
}
openib_btl->super.btl_get_alignment = 0;
if (openib_btl->super.btl_put_limit > openib_btl->ib_port_attr.max_msg_sz) {
openib_btl->super.btl_put_limit = openib_btl->ib_port_attr.max_msg_sz;
}
#if HAVE_DECL_IBV_ATOMIC_HCA
if (openib_btl->device->ib_dev_attr.atomic_cap == IBV_ATOMIC_NONE) {
openib_btl->super.btl_flags &= ~MCA_BTL_FLAGS_ATOMIC_FOPS;
openib_btl->super.btl_atomic_flags = 0;
openib_btl->super.btl_atomic_fop = NULL;
openib_btl->super.btl_atomic_cswap = NULL;
} else if (IBV_ATOMIC_GLOB == openib_btl->device->ib_dev_attr.atomic_cap) {
openib_btl->super.btl_flags |= MCA_BTL_ATOMIC_SUPPORTS_GLOB;
}
#endif
openib_btl->super.btl_put_alignment = 0;
openib_btl->super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
/* Check bandwidth configured for this device */
sprintf(param, "bandwidth_%s", ibv_get_device_name(device->ib_dev));
@ -1960,9 +1990,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
}
/* If the MCA param was specified, skip all the checks */
if ( MCA_BASE_VAR_SOURCE_COMMAND_LINE ||
MCA_BASE_VAR_SOURCE_ENV ==
mca_btl_openib_component.receive_queues_source) {
if (MCA_BASE_VAR_SOURCE_DEFAULT != mca_btl_openib_component.receive_queues_source) {
goto good;
}
@ -1980,7 +2008,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
mca_btl_openib_component.receive_queues =
strdup(values.receive_queues);
mca_btl_openib_component.receive_queues_source =
MCA_BASE_VAR_SOURCE_FILE;
BTL_OPENIB_RQ_SOURCE_DEVICE_INI;
}
}
@ -2881,17 +2909,20 @@ void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep,
size_t i, len = opal_list_get_size(&ep->pending_get_frags);
int rc;
for(i = 0; i < len && ep->qps[qp].qp->sd_wqe > 0 && ep->get_tokens > 0; i++)
{
for(i = 0; i < len && ep->qps[qp].qp->sd_wqe > 0 && ep->get_tokens > 0; i++) {
OPAL_THREAD_LOCK(&ep->endpoint_lock);
frag = opal_list_remove_first(&(ep->pending_get_frags));
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
if(NULL == frag)
if (NULL == frag)
break;
rc = mca_btl_openib_get((mca_btl_base_module_t *)openib_btl, ep,
&to_base_frag(frag)->base);
if(OPAL_ERR_OUT_OF_RESOURCE == rc)
rc = mca_btl_openib_get_internal ((mca_btl_base_module_t *)openib_btl, ep,
to_get_frag(frag));
if (OPAL_ERR_OUT_OF_RESOURCE == rc) {
OPAL_THREAD_LOCK(&ep->endpoint_lock);
opal_list_prepend (&ep->pending_get_frags, frag);
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
break;
}
}
len = opal_list_get_size(&ep->pending_put_frags);
@ -2899,12 +2930,16 @@ void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep,
OPAL_THREAD_LOCK(&ep->endpoint_lock);
frag = opal_list_remove_first(&(ep->pending_put_frags));
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
if(NULL == frag)
if (NULL == frag)
break;
rc = mca_btl_openib_put((mca_btl_base_module_t*)openib_btl, ep,
&to_base_frag(frag)->base);
if(OPAL_ERR_OUT_OF_RESOURCE == rc)
rc = mca_btl_openib_put_internal ((mca_btl_base_module_t*)openib_btl, ep,
to_put_frag(frag));
if (OPAL_ERR_OUT_OF_RESOURCE == rc) {
OPAL_THREAD_LOCK(&ep->endpoint_lock);
opal_list_prepend (&ep->pending_put_frags, frag);
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
break;
}
}
}
@ -2925,7 +2960,7 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
/* advance the segment address past the header and subtract from the
* length.*/
des->des_local->seg_len = byte_len - sizeof(mca_btl_openib_header_t);
des->des_segments->seg_len = byte_len - sizeof(mca_btl_openib_header_t);
if(OPAL_LIKELY(!(is_credit_msg = is_credit_message(frag)))) {
/* call registered callback */
@ -2960,7 +2995,7 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
}
} else {
mca_btl_openib_rdma_credits_header_t *chdr =
(mca_btl_openib_rdma_credits_header_t *) des->des_local->seg_addr.pval;
(mca_btl_openib_rdma_credits_header_t *) des->des_segments->seg_addr.pval;
if(ep->nbo) {
BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*chdr);
}
@ -3266,11 +3301,27 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
/* Handle work completions */
switch(wc->opcode) {
case IBV_WC_RDMA_READ:
OPAL_OUTPUT((-1, "Got WC: RDMA_READ"));
OPAL_THREAD_ADD32(&endpoint->get_tokens, 1);
/* fall through */
case IBV_WC_COMP_SWAP:
case IBV_WC_FETCH_ADD:
OPAL_OUTPUT((-1, "Got WC: RDMA_READ or RDMA_WRITE"));
OPAL_THREAD_ADD32(&endpoint->get_tokens, 1);
mca_btl_openib_get_frag_t *get_frag = to_get_frag(des);
get_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr,
get_frag->cb.local_handle, get_frag->cb.context, get_frag->cb.data,
OPAL_SUCCESS);
case IBV_WC_RDMA_WRITE:
if (MCA_BTL_OPENIB_FRAG_SEND_USER == openib_frag_type(des)) {
mca_btl_openib_put_frag_t *put_frag = to_put_frag(des);
put_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr,
put_frag->cb.local_handle, put_frag->cb.context, put_frag->cb.data,
OPAL_SUCCESS);
put_frag->cb.func = NULL;
}
/* fall through */
case IBV_WC_SEND:
OPAL_OUTPUT((-1, "Got WC: RDMA_WRITE or SEND"));
if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
@ -3299,7 +3350,7 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
/* Process a completed send/put/get */
btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
des->des_cbfunc(&openib_btl->super, endpoint, des,OPAL_SUCCESS);
des->des_cbfunc(&openib_btl->super, endpoint, des, OPAL_SUCCESS);
}
if( btl_ownership ) {
mca_btl_openib_free(&openib_btl->super, des);

Просмотреть файл

@ -1,4 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
@ -11,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2006-2009 Mellanox Technologies, Inc. All rights reserved.
@ -51,7 +51,7 @@
static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint);
static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint);
static inline int acruire_wqe(mca_btl_openib_endpoint_t *ep,
static inline int acquire_wqe(mca_btl_openib_endpoint_t *ep,
mca_btl_openib_send_frag_t *frag)
{
int qp = to_base_frag(frag)->base.order;
@ -67,91 +67,34 @@ static inline int acruire_wqe(mca_btl_openib_endpoint_t *ep,
return OPAL_SUCCESS;
}
static int acquire_send_credit(mca_btl_openib_endpoint_t *endpoint,
mca_btl_openib_send_frag_t *frag)
{
mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
int qp = to_base_frag(frag)->base.order;
int prio = !(to_base_frag(frag)->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY);
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
if(OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, -1) < 0) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
opal_list_append(&endpoint->qps[qp].no_credits_pending_frags[prio],
(opal_list_item_t *)frag);
return OPAL_ERR_OUT_OF_RESOURCE;
}
} else {
if(OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, -1) < 0)
{
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
OPAL_THREAD_LOCK(&openib_btl->ib_lock);
opal_list_append(&openib_btl->qps[qp].u.srq_qp.pending_frags[prio],
(opal_list_item_t *)frag);
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
return OPAL_ERR_OUT_OF_RESOURCE;
}
}
return OPAL_SUCCESS;
}
/* this function is called with endpoint->endpoint_lock held */
int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint,
mca_btl_openib_send_frag_t *frag)
{
int prio = !(to_base_frag(frag)->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY);
mca_btl_openib_header_t *hdr = frag->hdr;
mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
int qp, ib_rc;
int32_t cm_return;
int qp, ib_rc, rc;
bool do_rdma = false;
size_t eager_limit;
size_t size;
if(OPAL_LIKELY(des->order == MCA_BTL_NO_ORDER))
des->order = frag->qp_idx;
qp = des->order;
if(acruire_wqe(endpoint, frag) != OPAL_SUCCESS)
if(acquire_wqe(endpoint, frag) != OPAL_SUCCESS)
return OPAL_ERR_RESOURCE_BUSY;
eager_limit = mca_btl_openib_component.eager_limit +
sizeof(mca_btl_openib_header_coalesced_t) +
sizeof(mca_btl_openib_control_header_t);
if(des->des_local->seg_len + frag->coalesced_length <= eager_limit &&
(des->des_flags & MCA_BTL_DES_FLAGS_PRIORITY)) {
/* High priority frag. Try to send over eager RDMA */
if(acquire_eager_rdma_send_credit(endpoint) == OPAL_SUCCESS)
do_rdma = true;
}
size = des->des_segments->seg_len + frag->coalesced_length;
if(!do_rdma && acquire_send_credit(endpoint, frag) != OPAL_SUCCESS) {
rc = mca_btl_openib_endpoint_credit_acquire (endpoint, qp, prio, size,
&do_rdma, frag, true);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
qp_put_wqe(endpoint, qp);
return OPAL_ERR_RESOURCE_BUSY;
}
BTL_OPENIB_GET_CREDITS(endpoint->eager_rdma_local.credits, hdr->credits);
if(hdr->credits)
hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG;
if(!do_rdma) {
if(BTL_OPENIB_QP_TYPE_PP(qp) && 0 == hdr->credits) {
BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits);
}
} else {
hdr->credits |= (qp << 11);
}
BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
/* cm_seen is only 8 bytes, but cm_return is 32 bytes */
if(cm_return > 255) {
hdr->cm_seen = 255;
cm_return -= 255;
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
} else {
hdr->cm_seen = cm_return;
}
qp_reset_signal_count(endpoint, qp);
ib_rc = post_send(endpoint, frag, do_rdma, 1);
@ -161,27 +104,12 @@ int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint,
if(endpoint->nbo)
BTL_OPENIB_HEADER_NTOH(*hdr);
if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
BTL_OPENIB_CREDITS(hdr->credits));
}
mca_btl_openib_endpoint_credit_release (endpoint, qp, do_rdma, frag);
qp_put_wqe(endpoint, qp);
if(do_rdma) {
OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
} else {
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits,
hdr->credits);
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
} else if BTL_OPENIB_QP_TYPE_SRQ(qp){
mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
}
}
BTL_ERROR(("error posting send request error %d: %s\n",
ib_rc, strerror(ib_rc)));
BTL_ERROR(("error posting send request error %d: %s. size = %lu\n",
ib_rc, strerror(ib_rc), size));
return OPAL_ERROR;
}
@ -690,8 +618,8 @@ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
/* We need to post this one */
if (OPAL_ERROR == mca_btl_openib_endpoint_post_send(endpoint, frag)) {
BTL_ERROR(("Error posting send"));
}
BTL_ERROR(("Error posting send"));
}
}
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
@ -10,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved.
@ -610,6 +611,101 @@ static inline int post_send(mca_btl_openib_endpoint_t *ep,
return ibv_post_send(ep->qps[qp].qp->lcl_qp, sr_desc, &bad_wr);
}
/* called with the endpoint lock held */
static inline int mca_btl_openib_endpoint_credit_acquire (struct mca_btl_base_endpoint_t *endpoint, int qp,
int prio, size_t size, bool *do_rdma,
mca_btl_openib_send_frag_t *frag, bool queue_frag)
{
mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
mca_btl_openib_header_t *hdr = frag->hdr;
size_t eager_limit;
int32_t cm_return;
eager_limit = mca_btl_openib_component.eager_limit +
sizeof(mca_btl_openib_header_coalesced_t) +
sizeof(mca_btl_openib_control_header_t);
if (!(prio && size < eager_limit && acquire_eager_rdma_send_credit(endpoint) == OPAL_SUCCESS)) {
*do_rdma = false;
if (BTL_OPENIB_QP_TYPE_PP(qp)) {
if (OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, -1) < 0) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
if (queue_frag) {
opal_list_append(&endpoint->qps[qp].no_credits_pending_frags[prio],
(opal_list_item_t *)frag);
}
return OPAL_ERR_OUT_OF_RESOURCE;
}
} else {
if(OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, -1) < 0) {
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
if (queue_frag) {
OPAL_THREAD_LOCK(&openib_btl->ib_lock);
opal_list_append(&openib_btl->qps[qp].u.srq_qp.pending_frags[prio],
(opal_list_item_t *)frag);
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
}
return OPAL_ERR_OUT_OF_RESOURCE;
}
}
} else {
/* High priority frag. Try to send over eager RDMA */
*do_rdma = true;
}
/* Set all credits */
BTL_OPENIB_GET_CREDITS(endpoint->eager_rdma_local.credits, hdr->credits);
if (hdr->credits) {
hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG;
}
if (!*do_rdma) {
if (BTL_OPENIB_QP_TYPE_PP(qp) && 0 == hdr->credits) {
BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits);
}
} else {
hdr->credits |= (qp << 11);
}
BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
/* cm_seen is only 8 bytes, but cm_return is 32 bytes */
if(cm_return > 255) {
hdr->cm_seen = 255;
cm_return -= 255;
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
} else {
hdr->cm_seen = cm_return;
}
return OPAL_SUCCESS;
}
/* called with the endpoint lock held. */
static inline void mca_btl_openib_endpoint_credit_release (struct mca_btl_base_endpoint_t *endpoint, int qp,
bool do_rdma, mca_btl_openib_send_frag_t *frag)
{
mca_btl_openib_header_t *hdr = frag->hdr;
if (BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits, BTL_OPENIB_CREDITS(hdr->credits));
}
if (do_rdma) {
OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
} else {
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
OPAL_THREAD_ADD32 (&endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits);
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
} else if BTL_OPENIB_QP_TYPE_SRQ(qp){
mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
}
}
}
END_C_DECLS
#endif

Просмотреть файл

@ -153,8 +153,8 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
if (NULL != btlname) free(btlname);
/* Since we believe we have done a send, read or write, then the
* des_local fields should have valid data. */
assert(des->des_local != NULL);
* des_segments fields should have valid data. */
assert(des->des_segments != NULL);
/* If the endpoint is not yet in the MCA_BTL_IB_CLOSED state, then
* change the status. Since this connection was mapped out in the

Просмотреть файл

@ -68,8 +68,8 @@ static void out_constructor(mca_btl_openib_out_frag_t *frag)
{
mca_btl_openib_frag_t *base_frag = to_base_frag(frag);
base_frag->base.des_local = &base_frag->segment.base;
base_frag->base.des_local_count = 1;
base_frag->base.des_segments = &base_frag->segment.base;
base_frag->base.des_segment_count = 1;
frag->sr_desc.wr_id = (uint64_t)(uintptr_t)frag;
frag->sr_desc.sg_list = &to_com_frag(frag)->sg_entry;
@ -83,8 +83,8 @@ static void in_constructor(mca_btl_openib_in_frag_t *frag)
{
mca_btl_openib_frag_t *base_frag = to_base_frag(frag);
base_frag->base.des_local = &base_frag->segment.base;
base_frag->base.des_local_count = 1;
base_frag->base.des_segments = &base_frag->segment.base;
base_frag->base.des_segment_count = 1;
}
static void send_constructor(mca_btl_openib_send_frag_t *frag)
@ -134,6 +134,7 @@ static void put_constructor(mca_btl_openib_put_frag_t *frag)
{
to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_SEND_USER;
to_out_frag(frag)->sr_desc.opcode = IBV_WR_RDMA_WRITE;
frag->cb.func = NULL;
}
static void get_constructor(mca_btl_openib_get_frag_t *frag)
@ -154,8 +155,8 @@ static void coalesced_constructor(mca_btl_openib_coalesced_frag_t *frag)
base_frag->type = MCA_BTL_OPENIB_FRAG_COALESCED;
base_frag->base.des_local = &base_frag->segment.base;
base_frag->base.des_local_count = 1;
base_frag->base.des_segments = &base_frag->segment.base;
base_frag->base.des_segment_count = 1;
}
OBJ_CLASS_INSTANCE(

Просмотреть файл

@ -349,7 +349,15 @@ OBJ_CLASS_DECLARATION(mca_btl_openib_recv_frag_t);
#define to_recv_frag(f) ((mca_btl_openib_recv_frag_t*)(f))
typedef struct mca_btl_openib_out_frag_t mca_btl_openib_put_frag_t;
typedef struct mca_btl_openib_put_frag_t {
mca_btl_openib_out_frag_t super;
struct {
mca_btl_base_rdma_completion_fn_t func;
mca_btl_base_registration_handle_t *local_handle;
void *context;
void *data;
} cb;
} mca_btl_openib_put_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_put_frag_t);
#define to_put_frag(f) ((mca_btl_openib_put_frag_t*)(f))
@ -357,6 +365,12 @@ OBJ_CLASS_DECLARATION(mca_btl_openib_put_frag_t);
typedef struct mca_btl_openib_get_frag_t {
mca_btl_openib_in_frag_t super;
struct ibv_send_wr sr_desc;
struct {
mca_btl_base_rdma_completion_fn_t func;
mca_btl_base_registration_handle_t *local_handle;
void *context;
void *data;
} cb;
} mca_btl_openib_get_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_get_frag_t);
@ -371,6 +385,7 @@ typedef struct mca_btl_openib_coalesced_frag_t {
mca_btl_openib_frag_t super;
mca_btl_openib_send_frag_t *send_frag;
mca_btl_openib_header_coalesced_t *hdr;
bool sent;
} mca_btl_openib_coalesced_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_coalesced_frag_t);

159
opal/mca/btl/openib/btl_openib_get.c Обычный файл
Просмотреть файл

@ -0,0 +1,159 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_openib.h"
#include "btl_openib_frag.h"
#include "btl_openib_endpoint.h"
#include "btl_openib_xrc.h"
/*
* RDMA READ remote buffer to local buffer address.
*/
int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
mca_btl_openib_get_frag_t* frag = NULL;
int qp = order;
int rc;
if (OPAL_UNLIKELY(size > btl->btl_get_limit)) {
return OPAL_ERR_BAD_PARAM;
}
frag = to_get_frag(alloc_recv_user_frag());
if (OPAL_UNLIKELY(NULL == frag)) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
if (MCA_BTL_NO_ORDER == qp) {
qp = mca_btl_openib_component.rdma_qp;
}
/* set base descriptor flags */
to_base_frag(frag)->base.order = qp;
/* free this descriptor when the operation is complete */
to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
/* set up scatter-gather entry */
to_com_frag(frag)->sg_entry.length = size;
to_com_frag(frag)->sg_entry.lkey = local_handle->lkey;
to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t) local_address;
to_com_frag(frag)->endpoint = ep;
/* set up rdma callback */
frag->cb.func = cbfunc;
frag->cb.context = cbcontext;
frag->cb.data = cbdata;
frag->cb.local_handle = local_handle;
/* set up descriptor */
frag->sr_desc.wr.rdma.remote_addr = remote_address;
/* the opcode may have been changed by an atomic operation */
frag->sr_desc.opcode = IBV_WR_RDMA_READ;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
if((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
!= (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
frag->sr_desc.wr.rdma.rkey = opal_swap_bytes4 (remote_handle->rkey);
} else
#endif
{
frag->sr_desc.wr.rdma.rkey = remote_handle->rkey;
}
#if HAVE_XRC
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num;
}
#endif
if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
OPAL_THREAD_LOCK(&ep->endpoint_lock);
rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_get_frags);
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
if (OPAL_ERR_RESOURCE_BUSY == rc) {
return OPAL_SUCCESS;
}
if (OPAL_SUCCESS != rc) {
MCA_BTL_IB_FRAG_RETURN (frag);
return rc;
}
}
rc = mca_btl_openib_get_internal (btl, ep, frag);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
rc = OPAL_SUCCESS;
OPAL_THREAD_LOCK(&ep->endpoint_lock);
opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
} else {
MCA_BTL_IB_FRAG_RETURN (frag);
}
}
return rc;
}
int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
mca_btl_openib_get_frag_t *frag)
{
int qp = to_base_frag(frag)->base.order;
struct ibv_send_wr *bad_wr;
/* check for a send wqe */
if (qp_get_wqe(ep, qp) < 0) {
qp_put_wqe(ep, qp);
return OPAL_ERR_OUT_OF_RESOURCE;
}
/* check for a get token */
if (OPAL_THREAD_ADD32(&ep->get_tokens,-1) < 0) {
qp_put_wqe(ep, qp);
OPAL_THREAD_ADD32(&ep->get_tokens,1);
return OPAL_ERR_OUT_OF_RESOURCE;
}
qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
qp_reset_signal_count(ep, qp);
if (ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr)) {
qp_put_wqe(ep, qp);
OPAL_THREAD_ADD32(&ep->get_tokens,1);
return OPAL_ERROR;
}
return OPAL_SUCCESS;
}

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
@ -11,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
@ -567,10 +568,16 @@ int btl_openib_register_mca_params(void)
mca_btl_openib_module.super.btl_rdma_pipeline_frag_size = 1024 * 1024;
mca_btl_openib_module.super.btl_min_rdma_pipeline_size = 256 * 1024;
mca_btl_openib_module.super.btl_flags = MCA_BTL_FLAGS_RDMA |
MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA;
MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA;
#if BTL_OPENIB_FAILOVER_ENABLED
mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_FAILOVER_SUPPORT;
#endif
#if HAVE_DECL_IBV_ATOMIC_HCA
mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS;
mca_btl_openib_module.super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_CSWAP;
#endif
/* Default to bandwidth auto-detection */
mca_btl_openib_module.super.btl_bandwidth = 0;
mca_btl_openib_module.super.btl_latency = 4;

152
opal/mca/btl/openib/btl_openib_put.c Обычный файл
Просмотреть файл

@ -0,0 +1,152 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_openib.h"
#include "btl_openib_frag.h"
#include "btl_openib_endpoint.h"
#include "btl_openib_xrc.h"
/*
* RDMA WRITE local buffer to remote buffer address.
*/
int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
mca_btl_openib_put_frag_t *frag = NULL;
int rc, qp = order;
if (OPAL_UNLIKELY(size > btl->btl_put_limit)) {
return OPAL_ERR_BAD_PARAM;
}
frag = to_put_frag(alloc_send_user_frag ());
if (OPAL_UNLIKELY(NULL == frag)) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
if (MCA_BTL_NO_ORDER == qp) {
qp = mca_btl_openib_component.rdma_qp;
}
/* set base descriptor flags */
to_base_frag(frag)->base.order = qp;
/* free this descriptor when the operation is complete */
to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
/* set up scatter-gather entry */
to_com_frag(frag)->sg_entry.length = size;
to_com_frag(frag)->sg_entry.lkey = local_handle->lkey;
to_com_frag(frag)->sg_entry.addr = (uint64_t)(intptr_t) local_address;
to_com_frag(frag)->endpoint = ep;
/* set up rdma callback */
frag->cb.func = cbfunc;
frag->cb.context = cbcontext;
frag->cb.data = cbdata;
frag->cb.local_handle = local_handle;
/* post descriptor */
to_out_frag(frag)->sr_desc.opcode = IBV_WR_RDMA_WRITE;
to_out_frag(frag)->sr_desc.send_flags = ib_send_flags(size, &(ep->qps[qp]), 1);
to_out_frag(frag)->sr_desc.wr.rdma.remote_addr = remote_address;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
if ((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
!= (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
to_out_frag(frag)->sr_desc.wr.rdma.rkey = opal_swap_bytes4(remote_handle->rkey);
} else
#endif
{
to_out_frag(frag)->sr_desc.wr.rdma.rkey = remote_handle->rkey;
}
#if HAVE_XRC
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp))
to_out_frag(frag)->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
#endif
if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
OPAL_THREAD_LOCK(&ep->endpoint_lock);
rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_put_frags);
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
if (OPAL_ERR_RESOURCE_BUSY == rc) {
/* descriptor was queued pending connection */
return OPAL_SUCCESS;
}
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
MCA_BTL_IB_FRAG_RETURN (frag);
return rc;
}
}
rc = mca_btl_openib_put_internal (btl, ep, frag);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
rc = OPAL_SUCCESS;
/* queue the fragment for when resources are available */
OPAL_THREAD_LOCK(&ep->endpoint_lock);
opal_list_append(&ep->pending_put_frags, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
} else {
MCA_BTL_IB_FRAG_RETURN (frag);
}
}
return rc;
}
int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
mca_btl_openib_put_frag_t *frag)
{
int qp = to_base_frag(frag)->base.order;
struct ibv_send_wr *bad_wr;
int rc;
/* check for a send wqe */
if (qp_get_wqe(ep, qp) < 0) {
qp_put_wqe(ep, qp);
return OPAL_ERR_OUT_OF_RESOURCE;
}
qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
qp_reset_signal_count(ep, qp);
if (0 != (rc = ibv_post_send(ep->qps[qp].qp->lcl_qp, &to_out_frag(frag)->sr_desc, &bad_wr))) {
qp_put_wqe(ep, qp);
return OPAL_ERROR;;
}
return OPAL_SUCCESS;
}

Просмотреть файл

@ -126,7 +126,6 @@ AC_DEFUN([MCA_opal_btl_openib_CONFIG],[
[enable openib BTL failover])
AM_CONDITIONAL([MCA_btl_openib_enable_failover], [test "x$btl_openib_failover_enabled" = "x1"])
# Check for __malloc_hook availability
AC_ARG_ENABLE(btl-openib-malloc-alignment,
AC_HELP_STRING([--enable-btl-openib-malloc-alignment], [Enable support for allocated memory alignment. Default: enabled if supported, disabled otherwise.]))

Просмотреть файл

@ -321,16 +321,17 @@ static int udcm_send_request (mca_btl_base_endpoint_t *lcl_ep,
static void udcm_send_timeout (evutil_socket_t fd, short event, void *arg);
static int udcm_finish_connection (mca_btl_openib_endpoint_t *lcl_ep);
static int udcm_rc_qps_to_rts(mca_btl_openib_endpoint_t *lcl_ep);
/* XRC support */
#if HAVE_XRC
static int udcm_xrc_start_connect (opal_btl_openib_connect_base_module_t *cpc,
mca_btl_base_endpoint_t *lcl_ep);
static int udcm_xrc_restart_connect (mca_btl_base_endpoint_t *lcl_ep);
static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr);
static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn);
static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep);
static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep);
static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr);
static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn);
static int udcm_xrc_send_request (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_endpoint_t *rem_ep,
uint8_t msg_type);
static int udcm_xrc_send_xresponse (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_endpoint_t *rem_ep,
@ -507,6 +508,93 @@ static int udcm_component_finalize(void)
/* mark: udcm module */
#if HAVE_XRC
static int udcm_endpoint_init_self_xrc (struct mca_btl_base_endpoint_t *lcl_ep)
{
udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep);
int rc;
opal_mutex_lock (&udep->udep_lock);
do {
rc = udcm_xrc_recv_qp_connect (lcl_ep);
if (OPAL_SUCCESS != rc) {
BTL_VERBOSE(("error connecting loopback XRC receive queue pair"));
break;
}
lcl_ep->xrc_recv_qp_num = lcl_ep->qps[0].qp->lcl_qp->qp_num;
rc = mca_btl_openib_endpoint_post_recvs (lcl_ep);
if (OPAL_SUCCESS != rc) {
BTL_VERBOSE(("error posting receives for loopback queue pair"));
break;
}
rc = udcm_xrc_recv_qp_create (lcl_ep, lcl_ep->qps[0].qp->lcl_qp->qp_num,
lcl_ep->qps[0].qp->lcl_psn);
if (OPAL_SUCCESS != rc) {
BTL_VERBOSE(("error creating loopback XRC receive queue pair"));
break;
}
rc = udcm_xrc_send_qp_connect (lcl_ep, lcl_ep->qps[0].qp->lcl_qp->qp_num,
lcl_ep->qps[0].qp->lcl_psn);
if (OPAL_SUCCESS != rc) {
BTL_VERBOSE(("error creating loopback XRC send queue pair"));
break;
}
lcl_ep->endpoint_state = MCA_BTL_IB_CONNECTED;
rc = udcm_finish_connection (lcl_ep);
} while (0);
opal_mutex_unlock (&udep->udep_lock);
return rc;
}
#endif
static int udcm_endpoint_init_self (struct mca_btl_base_endpoint_t *lcl_ep)
{
udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep);
int rc;
opal_mutex_lock (&udep->udep_lock);
do {
if (OPAL_SUCCESS != (rc = udcm_endpoint_init_data (lcl_ep))) {
BTL_VERBOSE(("error initializing loopback endpoint cpc data"));
break;
}
if (OPAL_SUCCESS != (rc = udcm_rc_qp_create_all (lcl_ep))) {
BTL_VERBOSE(("error initializing loopback endpoint qps"));
break;
}
/* save queue pair info */
lcl_ep->rem_info.rem_index = lcl_ep->index;
for (int i = 0 ; i < mca_btl_openib_component.num_qps ; ++i) {
lcl_ep->rem_info.rem_qps[i].rem_psn = lcl_ep->qps[i].qp->lcl_psn;
lcl_ep->rem_info.rem_qps[i].rem_qp_num = lcl_ep->qps[i].qp->lcl_qp->qp_num;
}
if (OPAL_SUCCESS != (rc = udcm_rc_qps_to_rts (lcl_ep))) {
BTL_VERBOSE(("error moving loopback endpoint qps to RTS"));
break;
}
lcl_ep->endpoint_state = MCA_BTL_IB_CONNECTED;
rc = udcm_finish_connection (lcl_ep);
return OPAL_SUCCESS;
} while (0);
opal_mutex_unlock (&udep->udep_lock);
return rc;
}
static int udcm_endpoint_init (struct mca_btl_base_endpoint_t *lcl_ep)
{
udcm_endpoint_t *udep = lcl_ep->endpoint_local_cpc_data =
@ -518,6 +606,16 @@ static int udcm_endpoint_init (struct mca_btl_base_endpoint_t *lcl_ep)
OBJ_CONSTRUCT(&udep->udep_lock, opal_mutex_t);
if (lcl_ep->endpoint_proc->proc_opal == opal_proc_local_get ()) {
/* go ahead and try to create a loopback queue pair */
#if HAVE_XRC
if (mca_btl_openib_component.num_xrc_qps > 0) {
return udcm_endpoint_init_self_xrc (lcl_ep);
} else
#endif
return udcm_endpoint_init_self (lcl_ep);
}
return OPAL_SUCCESS;
}
@ -1069,6 +1167,9 @@ static inline int udcm_rc_qp_to_init (struct ibv_qp *qp,
attr.pkey_index = btl->pkey_index;
attr.port_num = btl->port_num;
attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
#if HAVE_DECL_IBV_ATOMIC_HCA
attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC;
#endif
attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT |
IBV_QP_ACCESS_FLAGS;
@ -2307,7 +2408,7 @@ static int udcm_xrc_restart_connect (mca_btl_base_endpoint_t *lcl_ep)
/* mark: xrc send qp */
/* Send qp connect */
static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr)
static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn)
{
mca_btl_openib_module_t *openib_btl = lcl_ep->endpoint_btl;
struct ibv_qp_attr attr;
@ -2316,7 +2417,7 @@ static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg
int ret;
BTL_VERBOSE(("Connecting send qp: %p, remote qp: %d", (void *)lcl_ep->qps[0].qp->lcl_qp,
msg_hdr->data.xres.rem_qp_num));
rem_qp_num));
assert(NULL != lcl_ep->qps);
qp = lcl_ep->qps[0].qp->lcl_qp;
psn = lcl_ep->qps[0].qp->lcl_psn;
@ -2326,8 +2427,8 @@ static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg
attr.qp_state = IBV_QPS_RTR;
attr.path_mtu = (openib_btl->device->mtu < lcl_ep->rem_info.rem_mtu) ?
openib_btl->device->mtu : lcl_ep->rem_info.rem_mtu;
attr.dest_qp_num = msg_hdr->data.xres.rem_qp_num;
attr.rq_psn = msg_hdr->data.xres.rem_psn;
attr.dest_qp_num = rem_qp_num;
attr.rq_psn = rem_psn;
attr.max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops;
attr.min_rnr_timer = mca_btl_openib_component.ib_min_rnr_timer;
attr.ah_attr.is_global = 0;
@ -2460,6 +2561,9 @@ static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep)
attr.pkey_index = openib_btl->pkey_index;
attr.port_num = openib_btl->port_num;
attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
#if HAVE_DECL_IBV_ATOMIC_HCA
attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC;
#endif
ret = ibv_modify_qp(*qp, &attr,
IBV_QP_STATE |
IBV_QP_PKEY_INDEX |
@ -2501,7 +2605,7 @@ static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep)
}
/* Recv qp create */
static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr)
static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn)
{
mca_btl_openib_module_t* openib_btl = lcl_ep->endpoint_btl;
struct ibv_qp_init_attr qp_init_attr;
@ -2525,6 +2629,11 @@ static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_
attr.pkey_index = openib_btl->pkey_index;
attr.port_num = openib_btl->port_num;
attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
#if HAVE_DECL_IBV_ATOMIC_HCA
attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC;
#endif
ret = ibv_modify_xrc_rcv_qp(openib_btl->device->xrc_domain,
lcl_ep->xrc_recv_qp_num, &attr,
IBV_QP_STATE | IBV_QP_PKEY_INDEX |
@ -2540,8 +2649,8 @@ static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_
attr.qp_state = IBV_QPS_RTR;
attr.path_mtu = (openib_btl->device->mtu < lcl_ep->rem_info.rem_mtu) ?
openib_btl->device->mtu : lcl_ep->rem_info.rem_mtu;
attr.dest_qp_num = msg_hdr->data.xreq.rem_qp_num;
attr.rq_psn = msg_hdr->data.xreq.rem_psn;
attr.dest_qp_num = rem_qp_num;
attr.rq_psn = rem_psn;
attr.max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops;
attr.min_rnr_timer = mca_btl_openib_component.ib_min_rnr_timer;
attr.ah_attr.is_global = 0;
@ -2715,7 +2824,7 @@ static int udcm_xrc_handle_xconnect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg
response_type = UDCM_MESSAGE_XRESPONSE;
rc = udcm_xrc_recv_qp_create (lcl_ep, msg_hdr);
rc = udcm_xrc_recv_qp_create (lcl_ep, msg_hdr->data.xreq.rem_qp_num, msg_hdr->data.xreq.rem_psn);
if (OPAL_SUCCESS != rc) {
break;
}
@ -2761,7 +2870,7 @@ static int udcm_xrc_handle_xresponse (mca_btl_openib_endpoint_t *lcl_ep, udcm_ms
udep->recv_resp = true;
rc = udcm_xrc_send_qp_connect (lcl_ep, msg_hdr);
rc = udcm_xrc_send_qp_connect (lcl_ep, msg_hdr->data.xres.rem_qp_num, msg_hdr->data.xres.rem_psn);
if (OPAL_SUCCESS != rc) {
mca_btl_openib_endpoint_invoke_error (lcl_ep);
}

Просмотреть файл

@ -183,7 +183,7 @@ mca_btl_portals4_alloc(struct mca_btl_base_module_t* btl_base,
}
frag->md_h = PTL_INVALID_HANDLE;
frag->base.des_local_count = 1;
frag->base.des_segment_count = 1;
frag->base.des_flags = flags | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
frag->base.order = MCA_BTL_NO_ORDER;
@ -272,7 +272,7 @@ mca_btl_portals4_prepare_src(struct mca_btl_base_module_t* btl_base,
}
frag->segments[0].base.seg_len = max_data + reserve;
frag->base.des_local_count = 1;
frag->base.des_segment_count = 1;
} else {
/* no need to pack - rdma operation out of user's buffer */
@ -302,7 +302,7 @@ mca_btl_portals4_prepare_src(struct mca_btl_base_module_t* btl_base,
frag->segments[0].base.seg_len = max_data;
frag->segments[0].base.seg_addr.pval = iov.iov_base;
frag->segments[0].key = OPAL_THREAD_ADD64(&(portals4_btl->portals_rdma_key), 1);
frag->base.des_local_count = 1;
frag->base.des_segment_count = 1;
/* either a put or get. figure out which later */
OPAL_OUTPUT_VERBOSE((90, opal_btl_base_framework.framework_output,
@ -348,7 +348,7 @@ mca_btl_portals4_prepare_src(struct mca_btl_base_module_t* btl_base,
(void *)frag, frag->me_h, me.start, me.length,
me.match_id.phys.nid, me.match_id.phys.pid, me.match_bits));
}
frag->base.des_local = &frag->segments[0].base;
frag->base.des_segments = &frag->segments[0].base;
frag->base.des_remote = NULL;
frag->base.des_remote_count = 0;
frag->base.des_flags = flags | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
@ -390,8 +390,8 @@ mca_btl_portals4_prepare_dst(struct mca_btl_base_module_t* btl_base,
frag->segments[0].key = OPAL_THREAD_ADD64(&(portals4_btl->portals_rdma_key), 1);
frag->base.des_remote = NULL;
frag->base.des_remote_count = 0;
frag->base.des_local = &frag->segments[0].base;
frag->base.des_local_count = 1;
frag->base.des_segments = &frag->segments[0].base;
frag->base.des_segment_count = 1;
frag->base.des_flags = flags | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
frag->base.order = MCA_BTL_NO_ORDER;
frag->md_h = PTL_INVALID_HANDLE;

Просмотреть файл

@ -725,11 +725,11 @@ mca_btl_portals4_component_progress(void)
frag = ev.user_ptr;
tag = (unsigned char) (ev.hdr_data);
frag->base.des_local = seg;
frag->base.des_segments = seg;
seg[0].seg_addr.pval = ev.start;
seg[0].seg_len = ev.mlength;
frag->base.des_local_count = 1;
frag->base.des_segment_count = 1;
reg = mca_btl_base_active_message_trigger + tag;
OPAL_OUTPUT_VERBOSE((50, opal_btl_base_framework.framework_output,

Просмотреть файл

@ -26,8 +26,8 @@ static void
mca_btl_portals4_frag_common_send_constructor(mca_btl_portals4_frag_t* frag)
{
frag->base.des_flags = 0;
frag->base.des_local = &frag->segments[0].base;
frag->base.des_local_count = 2;
frag->base.des_segments = &frag->segments[0].base;
frag->base.des_segment_count = 2;
frag->segments[0].base.seg_addr.pval = frag + 1;
frag->segments[0].base.seg_len = frag->size;

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
@ -197,29 +197,21 @@ int mca_btl_scif_sendi (struct mca_btl_base_module_t *btl,
* Initiate a get operation.
*
* location: btl_scif_get.c
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/
int
mca_btl_scif_get (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des);
int mca_btl_scif_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
/**
* Initiate a put operation.
*
* location: btl_scif_put.c
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/
int
mca_btl_scif_put (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des);
int mca_btl_scif_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
mca_btl_base_descriptor_t *
mca_btl_scif_alloc(struct mca_btl_base_module_t *btl,
@ -228,9 +220,25 @@ mca_btl_scif_alloc(struct mca_btl_base_module_t *btl,
int mca_btl_scif_progress_send_wait_list (struct mca_btl_base_endpoint_t *endpoint);
struct mca_btl_scif_reg_t;
struct mca_btl_base_registration_handle_t {
/** scif offset */
off_t scif_offset;
/** base address of this scif region */
uintptr_t scif_base;
};
struct mca_btl_scif_registration_handle_t {
mca_btl_base_registration_handle_t btl_handle;
struct mca_btl_scif_reg_t *reg;
};
typedef struct mca_btl_scif_registration_handle_t mca_btl_scif_registration_handle_t;
typedef struct mca_btl_scif_reg_t {
mca_mpool_base_registration_t base;
off_t *registrations;
/** per-endpoint btl handles for this registration */
mca_btl_scif_registration_handle_t *handles;
} mca_btl_scif_reg_t;
/* Global structures */

Просмотреть файл

@ -165,14 +165,14 @@ static int scif_dereg_mem (void *reg_data, mca_mpool_base_registration_t *reg)
/* register the fragment with all connected endpoints */
for (i = 0 ; i < (int) mca_btl_scif_module.endpoint_count ; ++i) {
if ((off_t)-1 != scif_reg->registrations[i] &&
if ((off_t)-1 != scif_reg->handles[i].btl_handle.scif_offset &&
MCA_BTL_SCIF_EP_STATE_CONNECTED == mca_btl_scif_module.endpoints[i].state) {
(void) scif_unregister(mca_btl_scif_module.endpoints[i].scif_epd,
scif_reg->registrations[i], size);
scif_reg->handles[i].btl_handle.scif_offset, size);
}
}
free (scif_reg->registrations);
free (scif_reg->handles);
return OPAL_SUCCESS;
}
@ -184,17 +184,22 @@ static int scif_reg_mem (void *reg_data, void *base, size_t size,
int rc = OPAL_SUCCESS;
unsigned int i;
scif_reg->registrations = calloc (mca_btl_scif_module.endpoint_count,
sizeof (off_t));
memset (scif_reg->registrations, -1, mca_btl_scif_module.endpoint_count * sizeof (off_t));
scif_reg->handles = calloc (mca_btl_scif_module.endpoint_count, sizeof (scif_reg->handles[0]));
/* intialize all scif offsets to -1 and initialize the pointer back to the mpool registration */
for (i = 0 ; i < mca_btl_scif_module.endpoint_count ; ++i) {
scif_reg->handles[i].btl_handle.scif_offset = -1;
scif_reg->handles[i].btl_handle.scif_base = (intptr_t) base;
scif_reg->handles[i].reg = scif_reg;
}
/* register the pointer with all connected endpoints */
for (i = 0 ; i < mca_btl_scif_module.endpoint_count ; ++i) {
if (MCA_BTL_SCIF_EP_STATE_CONNECTED == mca_btl_scif_module.endpoints[i].state) {
scif_reg->registrations[i] = scif_register(mca_btl_scif_module.endpoints[i].scif_epd,
base, size, 0, SCIF_PROT_READ |
SCIF_PROT_WRITE, 0);
if (SCIF_REGISTER_FAILED == scif_reg->registrations[i]) {
scif_reg->handles[i].btl_handle.scif_offset = scif_register (mca_btl_scif_module.endpoints[i].scif_epd,
base, size, 0, SCIF_PROT_READ |
SCIF_PROT_WRITE, 0);
if (SCIF_REGISTER_FAILED == scif_reg->handles[i].btl_handle.scif_offset) {
/* cleanup */
scif_dereg_mem (reg_data, reg);
rc = OPAL_ERR_OUT_OF_RESOURCE;

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
@ -171,7 +171,7 @@ static int btl_scif_component_register(void)
mca_btl_scif_module.super.btl_flags = MCA_BTL_FLAGS_SEND |
MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE;
mca_btl_scif_module.super.btl_seg_size = sizeof (mca_btl_scif_segment_t);
mca_btl_scif_module.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
mca_btl_scif_module.super.btl_bandwidth = 50000; /* Mbs */
mca_btl_scif_module.super.btl_latency = 2; /* Microsecs */
@ -329,11 +329,11 @@ static int mca_btl_scif_progress_recvs (mca_btl_base_endpoint_t *ep)
* the fragment without introducing another copy here. this
* limitation has not appeared to cause any performance
* problems. */
frag.base.des_local_count = 1;
frag.segments[0].base.seg_len = hdr->size;
frag.segments[0].base.seg_addr.pval = (void *) (hdr + 1);
frag.base.des_segment_count = 1;
frag.segments[0].seg_len = hdr->size;
frag.segments[0].seg_addr.pval = (void *) (hdr + 1);
frag.base.des_local = &frag.segments[0].base;
frag.base.des_segments = frag.segments;
/* call the registered callback function */
reg->cbfunc(&mca_btl_scif_module.super, hdr->tag, &frag.base, reg->cbdata);

Просмотреть файл

@ -15,13 +15,13 @@
static inline void mca_btl_scif_base_frag_constructor (mca_btl_scif_base_frag_t *frag)
{
memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base));
frag->segments[0].base.seg_addr.pval = frag->base.super.ptr;
frag->segments[0].seg_addr.pval = frag->base.super.ptr;
}
static inline void mca_btl_scif_eager_frag_constructor (mca_btl_scif_base_frag_t *frag)
{
memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base));
frag->segments[0].base.seg_addr.pval = frag->base.super.ptr;
frag->segments[0].seg_addr.pval = frag->base.super.ptr;
}
OBJ_CLASS_INSTANCE(mca_btl_scif_eager_frag_t, mca_btl_base_descriptor_t,

Просмотреть файл

@ -15,16 +15,6 @@
#include "btl_scif.h"
#include "btl_scif_endpoint.h"
typedef struct mca_btl_scif_segment_t {
mca_btl_base_segment_t base;
/* scif offset */
off_t scif_offset;
/* original pointer */
uint64_t orig_ptr;
} mca_btl_scif_segment_t;
typedef struct mca_btl_scif_frag_hdr_t {
#if defined(SCIF_USE_SEQ)
uint32_t seq;
@ -41,7 +31,7 @@ typedef void (*frag_cb_t) (struct mca_btl_scif_base_frag_t *, int);
typedef struct mca_btl_scif_base_frag_t {
mca_btl_base_descriptor_t base;
mca_btl_scif_frag_hdr_t hdr;
mca_btl_scif_segment_t segments[2];
mca_btl_base_segment_t segments[2];
mca_btl_base_endpoint_t *endpoint;
mca_btl_scif_reg_t *registration;
ompi_free_list_t *my_list;
@ -78,9 +68,9 @@ static inline int mca_btl_scif_frag_return (mca_btl_scif_base_frag_t *frag)
frag->registration = NULL;
}
frag->segments[0].base.seg_addr.pval = frag->base.super.ptr;
frag->segments[0].base.seg_len = 0;
frag->segments[1].base.seg_len = 0;
frag->segments[0].seg_addr.pval = frag->base.super.ptr;
frag->segments[0].seg_len = 0;
frag->segments[1].seg_len = 0;
OMPI_FREE_LIST_RETURN_MT(frag->my_list, (ompi_free_list_item_t *) frag);

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
@ -20,18 +20,13 @@
/**
* Initiate a get operation.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/
int mca_btl_scif_get (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des) {
mca_btl_scif_segment_t *src = (mca_btl_scif_segment_t *) des->des_remote;
mca_btl_scif_segment_t *dst = (mca_btl_scif_segment_t *) des->des_local;
size_t len = lmin (src->base.seg_len, dst->base.seg_len);
int rc, mark, flags = 0;
int mca_btl_scif_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
int rc, mark, scif_flags = 0;
off_t roffset, loffset;
#if defined(SCIF_TIMING)
struct timespec ts;
@ -41,30 +36,27 @@ int mca_btl_scif_get (struct mca_btl_base_module_t *btl,
mca_btl_scif_component.get_count++;
#endif
BTL_VERBOSE(("Using DMA Get for frag %p from offset %lu", (void *) des,
(unsigned long) src->scif_offset));
BTL_VERBOSE(("Using DMA Get from remote address %" PRIx64 " to local address %p",
remote_address, local_address));
roffset = src->scif_offset + (off_t)(src->orig_ptr - src->base.seg_addr.lval);
loffset = dst->scif_offset + (off_t)(dst->orig_ptr - dst->base.seg_addr.lval);
roffset = remote_handle->scif_offset + (off_t)(remote_address - remote_handle->scif_base);
loffset = local_handle->scif_offset + (off_t)((intptr_t)local_address - local_handle->scif_base);
if (mca_btl_scif_component.rma_use_cpu) {
flags = SCIF_RMA_USECPU;
scif_flags = SCIF_RMA_USECPU;
}
if (mca_btl_scif_component.rma_sync) {
flags |= SCIF_RMA_SYNC;
scif_flags |= SCIF_RMA_SYNC;
}
/* start the read */
rc = scif_readfrom (endpoint->scif_epd, loffset, len, roffset, flags);
rc = scif_readfrom (endpoint->scif_epd, loffset, size, roffset, scif_flags);
if (OPAL_UNLIKELY(-1 == rc)) {
return OPAL_ERROR;
}
/* always call the callback function */
des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
if (!(flags & SCIF_RMA_SYNC)) {
if (!(scif_flags & SCIF_RMA_SYNC)) {
/* according to the scif documentation is is better to use a fence rather
* than using the SCIF_RMA_SYNC flag with scif_readfrom */
scif_fence_mark (endpoint->scif_epd, SCIF_FENCE_INIT_SELF, &mark);
@ -76,8 +68,8 @@ int mca_btl_scif_get (struct mca_btl_base_module_t *btl,
mca_btl_scif_component.get_time_max, ts);
#endif
/* since we completed the fence the RMA operation is complete */
mca_btl_scif_frag_complete ((mca_btl_scif_base_frag_t *) des, OPAL_SUCCESS);
/* always call the callback function */
cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);
return OPAL_SUCCESS;
}

Просмотреть файл

@ -24,17 +24,14 @@ mca_btl_scif_free (struct mca_btl_base_module_t *btl,
static int
mca_btl_scif_module_finalize (struct mca_btl_base_module_t* btl);
static mca_btl_base_descriptor_t *
mca_btl_scif_prepare_dst (mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
mca_mpool_base_registration_t *registration,
opal_convertor_t *convertor, uint8_t order,
size_t reserve, size_t *size, uint32_t flags);
static mca_btl_base_registration_handle_t *mca_btl_scif_register_mem (struct mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
void *base, size_t size, uint32_t flags);
static int mca_btl_scif_deregister_mem (struct mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle);
static struct mca_btl_base_descriptor_t *
mca_btl_scif_prepare_src (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
mca_mpool_base_registration_t *registration,
struct opal_convertor_t *convertor,
uint8_t order, size_t reserve, size_t *size,
uint32_t flags);
@ -48,11 +45,12 @@ mca_btl_scif_module_t mca_btl_scif_module = {
.btl_alloc = mca_btl_scif_alloc,
.btl_free = mca_btl_scif_free,
.btl_prepare_src = mca_btl_scif_prepare_src,
.btl_prepare_dst = mca_btl_scif_prepare_dst,
.btl_send = mca_btl_scif_send,
.btl_sendi = mca_btl_scif_sendi,
.btl_put = mca_btl_scif_put,
.btl_get = mca_btl_scif_get,
.btl_register_mem = mca_btl_scif_register_mem,
.btl_deregister_mem = mca_btl_scif_deregister_mem,
}
};
@ -163,10 +161,10 @@ mca_btl_scif_alloc(struct mca_btl_base_module_t *btl,
frag->base.des_flags = flags;
frag->base.order = order;
frag->base.des_local = &frag->segments[0].base;
frag->base.des_local_count = 1;
frag->base.des_segments = frag->segments;
frag->base.des_segment_count = 1;
frag->segments[0].base.seg_len = size;
frag->segments[0].seg_len = size;
return &frag->base;
}
@ -178,16 +176,19 @@ mca_btl_scif_free (struct mca_btl_base_module_t *btl,
return mca_btl_scif_frag_return ((mca_btl_scif_base_frag_t *) des);
}
static inline mca_btl_base_descriptor_t *mca_btl_scif_prepare_dma (struct mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
void *data_ptr, size_t size,
mca_mpool_base_registration_t *registration,
uint8_t order, uint32_t flags)
static mca_btl_base_registration_handle_t *mca_btl_scif_register_mem (struct mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
void *base, size_t size, uint32_t flags)
{
mca_btl_scif_base_frag_t *frag;
mca_btl_scif_reg_t *scif_reg;
int rc;
if (MCA_BTL_ENDPOINT_ANY == endpoint) {
/* it probably isn't possible to support registering memory to use with any endpoint so
* return NULL */
return NULL;
}
if (OPAL_LIKELY(MCA_BTL_SCIF_EP_STATE_CONNECTED != endpoint->state)) {
/* the endpoint needs to be connected before the fragment can be
* registered. */
@ -198,67 +199,36 @@ static inline mca_btl_base_descriptor_t *mca_btl_scif_prepare_dma (struct mca_bt
}
}
(void) MCA_BTL_SCIF_FRAG_ALLOC_DMA(endpoint, frag);
if (OPAL_UNLIKELY(NULL == frag)) {
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, base, size, 0,
(mca_mpool_base_registration_t **) &scif_reg);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
return NULL;
}
if (NULL == registration) {
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, data_ptr, size, 0,
(mca_mpool_base_registration_t **) &registration);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
mca_btl_scif_frag_return (frag);
return NULL;
}
frag->registration = (mca_btl_scif_reg_t *) registration;
}
scif_reg = (mca_btl_scif_reg_t *) registration;
/* register the memory location with this peer if it isn't already */
if ((off_t) -1 == scif_reg->registrations[endpoint->id]) {
size_t seg_size = (size_t)((uintptr_t) registration->bound - (uintptr_t) registration->base) + 1;
scif_reg->registrations[endpoint->id] = scif_register (endpoint->scif_epd, registration->base,
seg_size, 0, SCIF_PROT_READ |
SCIF_PROT_WRITE, 0);
if ((off_t) -1 == scif_reg->handles[endpoint->id].btl_handle.scif_offset) {
size_t seg_size = (size_t)((uintptr_t) scif_reg->base.bound - (uintptr_t) scif_reg->base.base) + 1;
/* NTH: until we determine a way to pass permissions to the mpool just make all segments
* read/write */
scif_reg->handles[endpoint->id].btl_handle.scif_offset =
scif_register (endpoint->scif_epd, scif_reg->base.base, seg_size, 0, SCIF_PROT_READ |
SCIF_PROT_WRITE, 0);
BTL_VERBOSE(("registered fragment for scif DMA transaction. offset = %lu",
(unsigned long) scif_reg->registrations[endpoint->id]));
(unsigned long) scif_reg->handles[endpoint->id].btl_handle.scif_offset));
}
if (OPAL_UNLIKELY((off_t) -1 == scif_reg->registrations[endpoint->id])) {
mca_btl_scif_frag_return (frag);
return NULL;
}
frag->segments[0].base.seg_addr.lval = (uint64_t)(uintptr_t) data_ptr;
frag->segments[0].base.seg_len = size;
frag->segments[0].scif_offset = scif_reg->registrations[endpoint->id] +
(off_t) ((ptrdiff_t) data_ptr - (ptrdiff_t) registration->base);
/* save the original pointer so the offset can be adjusted if needed (this is
* required for osc/rdma) */
frag->segments[0].orig_ptr = (uint64_t)(uintptr_t) data_ptr;
frag->base.order = order;
frag->base.des_flags = flags;
frag->base.des_local = &frag->segments->base;
frag->base.des_local_count = 1;
return &frag->base;
return &scif_reg->handles[endpoint->id].btl_handle;
}
static inline mca_btl_base_descriptor_t *mca_btl_scif_prepare_dma_conv (struct mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
mca_mpool_base_registration_t *registration,
struct opal_convertor_t *convertor,
uint8_t order, size_t *size,
uint32_t flags)
static int mca_btl_scif_deregister_mem (struct mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle)
{
void *data_ptr;
mca_btl_scif_registration_handle_t *scif_handle = (mca_btl_scif_registration_handle_t *) handle;
mca_btl_scif_reg_t *scif_reg = scif_handle->reg;
opal_convertor_get_current_pointer (convertor, &data_ptr);
btl->btl_mpool->mpool_deregister (btl->btl_mpool, &scif_reg->base);
return mca_btl_scif_prepare_dma (btl, endpoint, data_ptr, *size, registration, order, flags);
return OPAL_SUCCESS;
}
static inline struct mca_btl_base_descriptor_t *
@ -286,10 +256,10 @@ mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl,
return NULL;
}
frag->segments[0].base.seg_len = reserve;
frag->segments[1].base.seg_addr.pval = data_ptr;
frag->segments[1].base.seg_len = *size;
frag->base.des_local_count = 2;
frag->segments[0].seg_len = reserve;
frag->segments[1].seg_addr.pval = data_ptr;
frag->segments[1].seg_len = *size;
frag->base.des_segment_count = 2;
} else {
/* buffered send */
(void) MCA_BTL_SCIF_FRAG_ALLOC_EAGER(endpoint, frag);
@ -299,7 +269,7 @@ mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl,
if (*size) {
iov.iov_len = *size;
iov.iov_base = (IOVBASE_TYPE *) ((uintptr_t) frag->segments[0].base.seg_addr.pval + reserve);
iov.iov_base = (IOVBASE_TYPE *) ((uintptr_t) frag->segments[0].seg_addr.pval + reserve);
rc = opal_convertor_pack (convertor, &iov, &iov_count, &max_size);
if (OPAL_UNLIKELY(rc < 0)) {
@ -309,37 +279,22 @@ mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl,
*size = max_size;
}
frag->segments[0].base.seg_len = reserve + *size;
frag->base.des_local_count = 1;
frag->segments[0].seg_len = reserve + *size;
frag->base.des_segment_count = 1;
}
frag->base.des_local = &frag->segments->base;
frag->base.order = order;
frag->base.des_flags = flags;
frag->base.des_segments = frag->segments;
frag->base.order = order;
frag->base.des_flags = flags;
return &frag->base;
}
static mca_btl_base_descriptor_t *mca_btl_scif_prepare_src (struct mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
mca_mpool_base_registration_t *registration,
struct opal_convertor_t *convertor,
uint8_t order, size_t reserve, size_t *size,
uint32_t flags)
{
if (OPAL_LIKELY(reserve)) {
return mca_btl_scif_prepare_src_send (btl, endpoint, convertor,
order, reserve, size, flags);
} else {
return mca_btl_scif_prepare_dma_conv (btl, endpoint, registration, convertor, order, size, flags);
}
}
static mca_btl_base_descriptor_t *mca_btl_scif_prepare_dst (mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
mca_mpool_base_registration_t *registration,
opal_convertor_t *convertor, uint8_t order,
size_t reserve, size_t *size, uint32_t flags)
{
return mca_btl_scif_prepare_dma_conv (btl, endpoint, registration, convertor, order, size, flags);
return mca_btl_scif_prepare_src_send (btl, endpoint, convertor, order, reserve, size, flags);
}

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@ -16,63 +16,57 @@
/**
* Initiate a put operation.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/
int mca_btl_scif_put (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des) {
mca_btl_scif_segment_t *src = (mca_btl_scif_segment_t *) des->des_local;
mca_btl_scif_segment_t *dst = (mca_btl_scif_segment_t *) des->des_remote;
size_t len = lmin (src->base.seg_len, dst->base.seg_len);
int rc, mark, flags = 0;
int mca_btl_scif_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
int rc, mark, scif_flags = 0;
off_t roffset, loffset;
#if defined(SCIF_TIMING)
struct timespec ts;
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
mca_btl_scif_component.put_count++;
mca_btl_scif_component.get_count++;
#endif
BTL_VERBOSE(("Using DMA Put for frag %p", (void *) des));
BTL_VERBOSE(("Using DMA Put from local address %p to remote address %" PRIx64,
local_address, remote_address));
roffset = dst->scif_offset + (off_t)(dst->orig_ptr - dst->base.seg_addr.lval);
loffset = src->scif_offset + (off_t)(src->orig_ptr - src->base.seg_addr.lval);
roffset = remote_handle->scif_offset + (off_t)(remote_address - remote_handle->scif_base);
loffset = local_handle->scif_offset + (off_t)((intptr_t) local_address - local_handle->scif_base);
if (mca_btl_scif_component.rma_use_cpu) {
flags = SCIF_RMA_USECPU;
scif_flags = SCIF_RMA_USECPU;
}
if (mca_btl_scif_component.rma_sync) {
flags |= SCIF_RMA_SYNC;
scif_flags |= SCIF_RMA_SYNC;
}
/* start the write */
rc = scif_writeto (endpoint->scif_epd, loffset, len, roffset, flags);
rc = scif_writeto (endpoint->scif_epd, loffset, size, roffset, scif_flags);
rc = scif_readfrom (endpoint->scif_epd, loffset, size, roffset, scif_flags);
if (OPAL_UNLIKELY(-1 == rc)) {
return OPAL_ERROR;
}
/* always call the callback function */
des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
/* according to the scif documentation is is better to use a fence rather
* than using the SCIF_RMA_SYNC flag with scif_writeto */
if (!(flags & SCIF_RMA_SYNC)) {
if (!(scif_flags & SCIF_RMA_SYNC)) {
/* according to the scif documentation is is better to use a fence rather
* than using the SCIF_RMA_SYNC flag with scif_readfrom */
scif_fence_mark (endpoint->scif_epd, SCIF_FENCE_INIT_SELF, &mark);
scif_fence_wait (endpoint->scif_epd, mark);
}
#if defined(SCIF_TIMING)
SCIF_UPDATE_TIMER(mca_btl_scif_component.put_time,
mca_btl_scif_component.put_time_max, ts);
SCIF_UPDATE_TIMER(mca_btl_scif_component.get_time,
mca_btl_scif_component.get_time_max, ts);
#endif
/* since we completed the fence the RMA operation is complete */
mca_btl_scif_frag_complete ((mca_btl_scif_base_frag_t *) des, OPAL_SUCCESS);
/* always call the callback function */
cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);
return OPAL_SUCCESS;
}

Просмотреть файл

@ -118,22 +118,22 @@ static int mca_btl_scif_send_frag (struct mca_btl_base_endpoint_t *endpoint,
unsigned char * restrict dst;
BTL_VERBOSE(("btl/scif sending descriptor %p from %d -> %d. length = %" PRIu64, (void *) frag,
OPAL_PROC_MY_NAME.vpid, endpoint->peer_proc->proc_name.vpid, frag->segments[0].base.seg_len));
opal_process_name_vpid(OPAL_PROC_MY_NAME), opal_process_name_vpid(endpoint->peer_proc->proc_name), frag->segments[0].seg_len));
if (OPAL_LIKELY(OPAL_SUCCESS == mca_btl_scif_send_get_buffer (endpoint, size, &dst))) {
unsigned char * restrict data = (unsigned char * restrict) frag->segments[0].base.seg_addr.pval;
unsigned char * restrict data = (unsigned char * restrict) frag->segments[0].seg_addr.pval;
#if defined(SCIF_TIMING)
struct timespec ts;
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
#endif
memcpy (dst + sizeof (frag->hdr), data, frag->segments[0].base.seg_len);
memcpy (dst + sizeof (frag->hdr), data, frag->segments[0].seg_len);
if (frag->segments[1].base.seg_len) {
memcpy (dst + sizeof (frag->hdr) + frag->segments[0].base.seg_len,
frag->segments[1].base.seg_addr.pval,
frag->segments[1].base.seg_len);
if (frag->segments[1].seg_len) {
memcpy (dst + sizeof (frag->hdr) + frag->segments[0].seg_len,
frag->segments[1].seg_addr.pval,
frag->segments[1].seg_len);
}
#if defined(SCIF_USE_SEQ)
@ -165,7 +165,7 @@ int mca_btl_scif_send (struct mca_btl_base_module_t *btl,
mca_btl_base_tag_t tag)
{
mca_btl_scif_base_frag_t *frag = (mca_btl_scif_base_frag_t *) descriptor;
size_t size = frag->segments[0].base.seg_len + frag->segments[1].base.seg_len;
size_t size = frag->segments[0].seg_len + frag->segments[1].seg_len;
int rc;
frag->hdr.tag = tag;
@ -223,7 +223,9 @@ int mca_btl_scif_sendi (struct mca_btl_base_module_t *btl,
rc = mca_btl_scif_send_get_buffer (endpoint, length, &base);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
*descriptor = NULL;
if (NULL != descriptor) {
*descriptor = NULL;
}
return OPAL_ERR_OUT_OF_RESOURCE;
}

Просмотреть файл

@ -38,13 +38,15 @@
#include "btl_self_frag.h"
#include "opal/util/proc.h"
static int mca_btl_self_put (struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_btl_base_descriptor_t* des);
static int mca_btl_self_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
static int mca_btl_self_get (struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_btl_base_descriptor_t* des);
static int mca_btl_self_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
mca_btl_base_module_t mca_btl_self = {
.btl_component = &mca_btl_self_component.super,
@ -54,7 +56,6 @@ mca_btl_base_module_t mca_btl_self = {
.btl_alloc = mca_btl_self_alloc,
.btl_free = mca_btl_self_free,
.btl_prepare_src = mca_btl_self_prepare_src,
.btl_prepare_dst = mca_btl_self_prepare_dst,
.btl_send = mca_btl_self_send,
.btl_put = mca_btl_self_put,
.btl_get = mca_btl_self_get,
@ -135,8 +136,8 @@ mca_btl_base_descriptor_t* mca_btl_self_alloc(
frag->segment.seg_len = size;
frag->base.des_flags = flags;
frag->base.des_local = &(frag->segment);
frag->base.des_local_count = 1;
frag->base.des_segments = &(frag->segment);
frag->base.des_segment_count = 1;
return (mca_btl_base_descriptor_t*)frag;
}
@ -151,10 +152,8 @@ int mca_btl_self_free( struct mca_btl_base_module_t* btl,
{
mca_btl_self_frag_t* frag = (mca_btl_self_frag_t*)des;
frag->base.des_local = NULL;
frag->base.des_local_count = 0;
frag->base.des_remote = NULL;
frag->base.des_remote_count = 0;
frag->base.des_segments = NULL;
frag->base.des_segment_count = 0;
if(frag->size == mca_btl_self.btl_eager_limit) {
MCA_BTL_SELF_FRAG_RETURN_EAGER(frag);
@ -175,7 +174,6 @@ int mca_btl_self_free( struct mca_btl_base_module_t* btl,
struct mca_btl_base_descriptor_t*
mca_btl_self_prepare_src( struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
@ -231,44 +229,11 @@ mca_btl_self_prepare_src( struct mca_btl_base_module_t* btl,
*size = max_data;
}
frag->base.des_flags = flags;
frag->base.des_local = &frag->segment;
frag->base.des_local_count = 1;
frag->base.des_segments = &frag->segment;
frag->base.des_segment_count = 1;
return &frag->base;
}
/**
* Prepare data for receive.
*/
struct mca_btl_base_descriptor_t*
mca_btl_self_prepare_dst( struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size,
uint32_t flags )
{
mca_btl_self_frag_t* frag;
size_t max_data = *size;
void *ptr;
MCA_BTL_SELF_FRAG_ALLOC_RDMA(frag);
if(OPAL_UNLIKELY(NULL == frag)) {
return NULL;
}
/* setup descriptor to point directly to user buffer */
opal_convertor_get_current_pointer( convertor, &ptr );
frag->segment.seg_addr.lval = (uint64_t)(uintptr_t) ptr;
frag->segment.seg_len = reserve + max_data;
frag->base.des_local = &frag->segment;
frag->base.des_local_count = 1;
frag->base.des_flags = flags;
return &frag->base;
}
/**
* Initiate a send to the peer.
@ -285,12 +250,6 @@ int mca_btl_self_send( struct mca_btl_base_module_t* btl,
mca_btl_active_message_callback_t* reg;
int btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
/**
* We have to set the dst before the call to the function and reset them
* after.
*/
des->des_remote = des->des_local;
des->des_remote_count = des->des_local_count;
/* upcall */
reg = mca_btl_base_active_message_trigger + tag;
reg->cbfunc( btl, tag, des, reg->cbdata );
@ -305,100 +264,29 @@ int mca_btl_self_send( struct mca_btl_base_module_t* btl,
return 1;
}
/**
* Initiate a put to the peer.
*
* @param btl (IN) BTL module
* @param peer (IN) BTL peer addressing
*/
static int mca_btl_self_rdma( struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_btl_base_descriptor_t* des,
mca_btl_base_segment_t* src, size_t src_cnt,
mca_btl_base_segment_t* dst, size_t dst_cnt)
static int mca_btl_self_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
unsigned char* src_addr = (unsigned char *)(uintptr_t) src->seg_addr.lval;
size_t src_len = src->seg_len;
unsigned char* dst_addr = (unsigned char *)(uintptr_t) dst->seg_addr.lval;
size_t dst_len = dst->seg_len;
int btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
memcpy ((void *)(intptr_t) remote_address, local_address, size);
while(src_len && dst_len) {
cbfunc (btl, endpoint, local_address, NULL, cbcontext, cbdata, OPAL_SUCCESS);
if(src_len == dst_len) {
memcpy(dst_addr, src_addr, src_len);
/* advance src */
if(--src_cnt != 0) {
src++;
src_addr = (unsigned char*)src->seg_addr.pval;
src_len = src->seg_len;
} else {
src_len = 0;
}
/* advance dst */
if(--dst_cnt != 0) {
dst++;
dst_addr = (unsigned char*)dst->seg_addr.pval;
dst_len = dst->seg_len;
} else {
dst_len = 0;
}
} else {
size_t bytes = src_len < dst_len ? src_len : dst_len;
memcpy(dst_addr, src_addr, bytes);
/* advance src */
src_len -= bytes;
if(src_len == 0) {
if(--src_cnt != 0) {
src++;
src_addr = (unsigned char*)src->seg_addr.pval;
src_len = src->seg_len;
}
} else {
src_addr += bytes;
}
/* advance dst */
dst_len -= bytes;
if(dst_len == 0) {
if(--dst_cnt != 0) {
dst++;
dst_addr = (unsigned char*)src->seg_addr.pval;
dst_len = src->seg_len;
}
} else {
dst_addr += bytes;
}
}
}
/* rdma completion */
des->des_cbfunc( btl, endpoint, des, OPAL_SUCCESS );
if( btl_ownership ) {
mca_btl_self_free( btl, des );
}
return OPAL_SUCCESS;
}
static int mca_btl_self_put (struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_btl_base_descriptor_t* des)
static int mca_btl_self_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
return mca_btl_self_rdma (btl, endpoint, des, des->des_local, des->des_local_count,
des->des_remote, des->des_remote_count);
}
memcpy (local_address, (void *)(intptr_t) remote_address, size);
static int mca_btl_self_get (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des)
{
return mca_btl_self_rdma (btl, endpoint, des, des->des_remote, des->des_remote_count,
des->des_local, des->des_local_count);
cbfunc (btl, endpoint, local_address, NULL, cbcontext, cbdata, OPAL_SUCCESS);
return OPAL_SUCCESS;
}
int mca_btl_self_ft_event(int state) {

Просмотреть файл

@ -165,24 +165,6 @@ int mca_btl_self_free(
struct mca_btl_base_descriptor_t* mca_btl_self_prepare_src(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size,
uint32_t flags
);
/**
* Prepare data for RDMA
*
* @param btl (IN) BTL module
* @param peer (IN) BTL peer addressing
*/
struct mca_btl_base_descriptor_t* mca_btl_self_prepare_dst(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,

Просмотреть файл

@ -99,7 +99,6 @@ static int mca_btl_self_component_register(void)
mca_btl_self.btl_rdma_pipeline_frag_size = INT_MAX;
mca_btl_self.btl_min_rdma_pipeline_size = 0;
mca_btl_self.btl_flags = MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND_INPLACE;
mca_btl_self.btl_seg_size = sizeof (mca_btl_base_segment_t);
mca_btl_self.btl_bandwidth = 100;
mca_btl_self.btl_latency = 0;
mca_btl_base_param_register(&mca_btl_self_component.super.btl_version,

Просмотреть файл

@ -23,8 +23,8 @@ static inline void mca_btl_self_frag_constructor(mca_btl_self_frag_t* frag)
{
frag->segment.seg_addr.pval = frag+1;
frag->segment.seg_len = (uint32_t)frag->size;
frag->base.des_local = &frag->segment;
frag->base.des_local_count = 1;
frag->base.des_segments = &frag->segment;
frag->base.des_segment_count = 1;
frag->base.des_flags = 0;
}

Просмотреть файл

@ -57,6 +57,9 @@
#include "opal/mca/mpool/base/base.h"
#include "opal/mca/mpool/sm/mpool_sm.h"
#include "opal/align.h"
#include "opal/util/sys_limits.h"
#if OPAL_ENABLE_FT_CR == 1
#include "opal/util/basename.h"
#include "opal/mca/crs/base/base.h"
@ -81,9 +84,6 @@ mca_btl_sm_t mca_btl_sm = {
.btl_alloc = mca_btl_sm_alloc,
.btl_free = mca_btl_sm_free,
.btl_prepare_src = mca_btl_sm_prepare_src,
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
.btl_prepare_dst = mca_btl_sm_prepare_dst,
#endif /* OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA */
.btl_send = mca_btl_sm_send,
.btl_sendi = mca_btl_sm_sendi,
.btl_dump = mca_btl_sm_dump,
@ -743,7 +743,6 @@ extern int mca_btl_sm_free(
struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
@ -828,11 +827,9 @@ struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src(
}
#endif /* OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA */
frag->base.des_local = &(frag->segment.base);
frag->base.des_local_count = 1;
frag->base.des_segments = &(frag->segment.base);
frag->base.des_segment_count = 1;
frag->base.order = MCA_BTL_NO_ORDER;
frag->base.des_remote = NULL;
frag->base.des_remote_count = 0;
frag->base.des_flags = flags;
*size = max_data;
return &frag->base;
@ -950,9 +947,12 @@ int mca_btl_sm_sendi( struct mca_btl_base_module_t* btl,
return OPAL_SUCCESS;
}
/* presumably, this code path will never get executed */
*descriptor = mca_btl_sm_alloc( btl, endpoint, order,
payload_size + header_size, flags);
if (NULL != descriptor) {
/* presumably, this code path will never get executed */
*descriptor = mca_btl_sm_alloc( btl, endpoint, order,
payload_size + header_size, flags);
}
return OPAL_ERR_RESOURCE_BUSY;
}
@ -1001,51 +1001,87 @@ int mca_btl_sm_send( struct mca_btl_base_module_t* btl,
}
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_dst(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size,
uint32_t flags)
mca_btl_base_registration_handle_t *mca_btl_sm_register_mem (struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
void *base, size_t size, uint32_t flags)
{
void *ptr;
mca_btl_sm_frag_t* frag;
mca_btl_sm_registration_handle_t *handle;
mca_btl_sm_t *sm_btl = (mca_btl_sm_t *) btl;
ompi_free_list_item_t *item = NULL;
MCA_BTL_SM_FRAG_ALLOC_USER(frag);
if(OPAL_UNLIKELY(NULL == frag)) {
OMPI_FREE_LIST_GET_MT(&mca_btl_sm_component.registration_handles, item);
if (OPAL_UNLIKELY(NULL == item)) {
return NULL;
}
frag->segment.base.seg_len = *size;
opal_convertor_get_current_pointer( convertor, &ptr );
frag->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) ptr;
frag->base.des_remote = NULL;
frag->base.des_remote_count = 0;
frag->base.des_local = (mca_btl_base_segment_t*)&frag->segment;
frag->base.des_local_count = 1;
frag->base.des_flags = flags;
return &frag->base;
handle = (mca_btl_sm_registration_handle_t *) item;
#if OPAL_BTL_SM_HAVE_KNEM
if (OPAL_LIKELY(mca_btl_sm_component.use_knem)) {
struct knem_cmd_create_region knem_cr;
struct knem_cmd_param_iovec knem_iov;
knem_iov.base = (uintptr_t)base & ~(opal_getpagesize() - 1);
knem_iov.len = OPAL_ALIGN(size + ((intptr_t) base - knem_iov.base), opal_getpagesize(), intptr_t);
knem_cr.iovec_array = (uintptr_t)&knem_iov;
knem_cr.iovec_nr = 1;
knem_cr.flags = 0;
knem_cr.protection = 0;
if (flags & MCA_BTL_REG_FLAG_REMOTE_READ) {
knem_cr.protection |= PROT_READ;
}
if (flags & MCA_BTL_REG_FLAG_REMOTE_WRITE) {
knem_cr.protection |= PROT_WRITE;
}
if (OPAL_UNLIKELY(ioctl(sm_btl->knem_fd, KNEM_CMD_CREATE_REGION, &knem_cr) < 0)) {
OMPI_FREE_LIST_RETURN_MT(&mca_btl_sm_component.registration_handles, item);
return NULL;
}
handle->btl_handle.data.knem.cookie = knem_cr.cookie;
handle->btl_handle.data.knem.base_addr = knem_iov.base;
} else
#endif
{
/* the pid could be included in a modex but this will work until btl/sm is
* deleted */
handle->btl_handle.data.pid = getpid ();
}
/* return the public part of the handle */
return &handle->btl_handle;
}
int mca_btl_sm_deregister_mem (struct mca_btl_base_module_t* btl, mca_btl_base_registration_handle_t *handle)
{
mca_btl_sm_registration_handle_t *sm_handle =
(mca_btl_sm_registration_handle_t *)((intptr_t) handle - offsetof (mca_btl_sm_registration_handle_t, btl_handle));
mca_btl_sm_t* sm_btl = (mca_btl_sm_t*) btl;
#if OPAL_BTL_SM_HAVE_KNEM
if (OPAL_LIKELY(mca_btl_sm_component.use_knem)) {
(void) ioctl(sm_btl->knem_fd, KNEM_CMD_DESTROY_REGION, &handle->data.knem.cookie);
}
#endif
OMPI_FREE_LIST_RETURN_MT(&mca_btl_sm_component.registration_handles, &sm_handle->super);
return OPAL_SUCCESS;
}
#endif /* OPAL_BTL_SM_HAVE_KNEM */
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
/**
* Initiate an synchronous get.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/
int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_btl_base_descriptor_t* des)
int mca_btl_sm_get_sync (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
int btl_ownership;
mca_btl_sm_frag_t* frag = (mca_btl_sm_frag_t*)des;
mca_btl_sm_segment_t *src = (mca_btl_sm_segment_t*)des->des_remote;
mca_btl_sm_segment_t *dst = (mca_btl_sm_segment_t*)des->des_local;
#if OPAL_BTL_SM_HAVE_KNEM
mca_btl_sm_t* sm_btl = (mca_btl_sm_t*) btl;
if (OPAL_LIKELY(mca_btl_sm_component.use_knem)) {
@ -1054,12 +1090,12 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl,
/* Fill in the ioctl data fields. There's no async completion, so
we don't need to worry about getting a slot, etc. */
recv_iovec.base = (uintptr_t) dst->base.seg_addr.lval;
recv_iovec.len = dst->base.seg_len;
recv_iovec.base = (uintptr_t) local_address;
recv_iovec.len = size;
icopy.local_iovec_array = (uintptr_t)&recv_iovec;
icopy.local_iovec_nr = 1;
icopy.remote_cookie = src->key;
icopy.remote_offset = 0;
icopy.remote_cookie = remote_handle->data.knem.cookie;
icopy.remote_offset = remote_address - remote_handle->data.knem.base_addr;
icopy.write = 0;
/* Use the DMA flag if knem supports it *and* the segment length
@ -1067,7 +1103,7 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl,
value is 0 (i.e., the MCA param was set to 0), the segment size
will never be larger than it, so DMA will never be used. */
icopy.flags = 0;
if (mca_btl_sm_component.knem_dma_min <= dst->base.seg_len) {
if (mca_btl_sm_component.knem_dma_min <= size) {
icopy.flags = mca_btl_sm_component.knem_dma_flag;
}
/* synchronous flags only, no need to specify icopy.async_status_index */
@ -1085,27 +1121,19 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl,
#if OPAL_BTL_SM_HAVE_CMA
if (OPAL_LIKELY(mca_btl_sm_component.use_cma)) {
char *remote_address, *local_address;
int remote_length, local_length;
struct iovec local, remote;
pid_t remote_pid;
int val;
remote_address = (char *)(uintptr_t) src->base.seg_addr.lval;
remote_length = src->base.seg_len;
local_address = (char *)(uintptr_t) dst->base.seg_addr.lval;
local_length = dst->base.seg_len;
remote_pid = src->key;
remote.iov_base = remote_address;
remote.iov_len = remote_length;
remote_pid = remote_handle->data.pid;
remote.iov_base = (void *) (intptr_t) remote_address;
remote.iov_len = size;
local.iov_base = local_address;
local.iov_len = local_length;
local.iov_len = size;
val = process_vm_readv(remote_pid, &local, 1, &remote, 1, 0);
if (val != local_length) {
if (val != size) {
if (val<0) {
opal_output(0, "mca_btl_sm_get_sync: process_vm_readv failed: %i",
errno);
@ -1119,15 +1147,7 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl,
}
#endif /* OPAL_BTL_SM_HAVE_CMA */
btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) {
frag->base.des_cbfunc(&mca_btl_sm.super,
frag->endpoint, &frag->base,
OPAL_SUCCESS);
}
if (btl_ownership) {
MCA_BTL_SM_FRAG_RETURN(frag);
}
cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);
return OPAL_SUCCESS;
}
@ -1139,34 +1159,42 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl,
/**
* Initiate an asynchronous get.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/
int mca_btl_sm_get_async(struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_btl_base_descriptor_t* des)
int mca_btl_sm_get_async (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
int btl_ownership;
mca_btl_sm_t* sm_btl = (mca_btl_sm_t*) btl;
mca_btl_sm_frag_t* frag = (mca_btl_sm_frag_t*)des;
mca_btl_sm_segment_t *src = (mca_btl_sm_segment_t*)des->des_remote;
mca_btl_sm_segment_t *dst = (mca_btl_sm_segment_t*)des->des_local;
mca_btl_sm_frag_t* frag;
struct knem_cmd_inline_copy icopy;
struct knem_cmd_param_iovec recv_iovec;
/* If we have no knem slots available, return
TEMP_OUT_OF_RESOURCE */
/* If we have no knem slots available, fall back to synchronous */
if (sm_btl->knem_status_num_used >=
mca_btl_sm_component.knem_max_simultaneous) {
return OPAL_ERR_TEMP_OUT_OF_RESOURCE;
return mca_btl_sm_get_sync (btl, endpoint, local_address, remote_address, local_handle,
remote_handle, size, flags, order, cbfunc, cbcontext, cbdata);
}
/* allocate a fragment to keep track of this transaction */
MCA_BTL_SM_FRAG_ALLOC_USER(frag);
if (OPAL_UNLIKELY(NULL == frag)) {
return mca_btl_sm_get_sync (btl, endpoint, local_address, remote_address, local_handle,
remote_handle, size, flags, order, cbfunc, cbcontext, cbdata);
}
/* fill in callback data */
frag->cb.func = cbfunc;
frag->cb.context = cbcontext;
frag->cb.data = cbdata;
frag->cb.local_address = local_address;
frag->cb.local_handle = local_handle;
/* We have a slot, so fill in the data fields. Bump the
first_avail and num_used counters. */
recv_iovec.base = (uintptr_t) dst->base.seg_addr.lval;
recv_iovec.len = dst->base.seg_len;
recv_iovec.base = (uintptr_t) local_address;
recv_iovec.len = size;
icopy.local_iovec_array = (uintptr_t)&recv_iovec;
icopy.local_iovec_nr = 1;
icopy.write = 0;
@ -1176,13 +1204,13 @@ int mca_btl_sm_get_async(struct mca_btl_base_module_t* btl,
sm_btl->knem_status_first_avail = 0;
}
++sm_btl->knem_status_num_used;
icopy.remote_cookie = src->key;
icopy.remote_offset = 0;
icopy.remote_cookie = remote_handle->data.knem.cookie;
icopy.remote_offset = remote_address - remote_handle->data.knem.base_addr;
/* Use the DMA flag if knem supports it *and* the segment length
is greater than the cutoff */
icopy.flags = KNEM_FLAG_ASYNCDMACOMPLETE;
if (mca_btl_sm_component.knem_dma_min <= dst->base.seg_len) {
if (mca_btl_sm_component.knem_dma_min <= size) {
icopy.flags = mca_btl_sm_component.knem_dma_flag;
}
@ -1190,19 +1218,11 @@ int mca_btl_sm_get_async(struct mca_btl_base_module_t* btl,
if (OPAL_LIKELY(0 == ioctl(sm_btl->knem_fd,
KNEM_CMD_INLINE_COPY, &icopy))) {
if (icopy.current_status != KNEM_STATUS_PENDING) {
MCA_BTL_SM_FRAG_RETURN(frag);
/* request completed synchronously */
/* FIXME: what if icopy.current_status == KNEM_STATUS_FAILED? */
btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) {
frag->base.des_cbfunc(&mca_btl_sm.super,
frag->endpoint, &frag->base,
OPAL_SUCCESS);
}
if (btl_ownership) {
MCA_BTL_SM_FRAG_RETURN(frag);
}
cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);
--sm_btl->knem_status_num_used;
++sm_btl->knem_status_first_used;

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
@ -11,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
* Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2013 Los Alamos National Security, LLC.
* Copyright (c) 2010-2014 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2010-2012 IBM Corporation. All rights reserved.
* $COPYRIGHT$
@ -182,6 +183,10 @@ struct mca_btl_sm_component_t {
#if OPAL_BTL_SM_HAVE_KNEM
/* Knem capabilities info */
struct knem_cmd_info knem_info;
#endif
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
/** registration handles to hold knem cookies */
ompi_free_list_t registration_handles;
#endif /* OPAL_BTL_SM_HAVE_KNEM */
/** MCA: should we be using knem or not? neg=try but continue if
@ -461,7 +466,6 @@ extern int mca_btl_sm_free(
struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
@ -504,30 +508,20 @@ extern int mca_btl_sm_send(
/*
* Synchronous knem/cma get
*/
extern int mca_btl_sm_get_sync(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_btl_base_descriptor_t* des );
extern struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_dst(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size,
uint32_t flags);
int mca_btl_sm_get_sync (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
#endif /* OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA */
#if OPAL_BTL_SM_HAVE_KNEM
/*
* Asynchronous knem get
*/
extern int mca_btl_sm_get_async(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_btl_base_descriptor_t* des );
int mca_btl_sm_get_async (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
#endif /* OPAL_BTL_SM_HAVE_KNEM */
@ -558,6 +552,31 @@ void mca_btl_sm_component_event_thread(opal_object_t*);
#define MCA_BTL_SM_SIGNAL_PEER(peer)
#endif
#if OPAL_BTL_SM_HAVE_KNEM | OPAL_BTL_SM_HAVE_CMA
struct mca_btl_base_registration_handle_t {
union {
struct {
uint64_t cookie;
intptr_t base_addr;
} knem;
pid_t pid;
} data;
};
struct mca_btl_sm_registration_handle_t {
ompi_free_list_item_t super;
mca_btl_base_registration_handle_t btl_handle;
};
typedef struct mca_btl_sm_registration_handle_t mca_btl_sm_registration_handle_t;
mca_btl_base_registration_handle_t *mca_btl_sm_register_mem (struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
void *base, size_t size, uint32_t flags);
int mca_btl_sm_deregister_mem (struct mca_btl_base_module_t* btl, mca_btl_base_registration_handle_t *handle);
#endif
END_C_DECLS
#endif

Просмотреть файл

@ -67,6 +67,10 @@
#include "opal/mca/common/cuda/common_cuda.h"
#endif /* OPAL_CUDA_SUPPORT */
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
static OBJ_CLASS_INSTANCE(mca_btl_sm_registration_handle_t, ompi_free_list_item_t, NULL, NULL);
#endif
static int mca_btl_sm_component_open(void);
static int mca_btl_sm_component_close(void);
static int sm_register(void);
@ -251,10 +255,13 @@ static int sm_register(void)
mca_btl_sm.super.btl_rdma_pipeline_frag_size = 64*1024;
mca_btl_sm.super.btl_min_rdma_pipeline_size = 64*1024;
mca_btl_sm.super.btl_flags = MCA_BTL_FLAGS_SEND;
mca_btl_sm.super.btl_seg_size = sizeof (mca_btl_sm_segment_t);
mca_btl_sm.super.btl_bandwidth = 9000; /* Mbs */
mca_btl_sm.super.btl_latency = 1; /* Microsecs */
#if OPAL_BTL_SM_HAVE_KNEM
mca_btl_sm.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
#endif
/* Call the BTL based to register its MCA params */
mca_btl_base_param_register(&mca_btl_sm_component.super.btl_version,
&mca_btl_sm.super);
@ -295,6 +302,11 @@ static int mca_btl_sm_component_open(void)
OBJ_CONSTRUCT(&mca_btl_sm_component.pending_send_fl, opal_free_list_t);
mca_btl_sm_component.sm_seg = NULL;
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
OBJ_CONSTRUCT(&mca_btl_sm_component.registration_handles, ompi_free_list_t);
#endif
#if OPAL_BTL_SM_HAVE_KNEM
mca_btl_sm.knem_fd = -1;
mca_btl_sm.knem_status_array = NULL;
@ -332,6 +344,10 @@ static int mca_btl_sm_component_close(void)
}
#endif /* OPAL_BTL_SM_HAVE_KNEM */
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
OBJ_DESTRUCT(&mca_btl_sm_component.registration_handles);
#endif
OBJ_DESTRUCT(&mca_btl_sm_component.sm_lock);
/**
* We don't have to destroy the fragment lists. They are allocated
@ -904,6 +920,9 @@ mca_btl_sm_component_init(int *num_btls,
} else {
mca_btl_sm.super.btl_get = mca_btl_sm_get_sync;
}
mca_btl_sm.super.btl_register_mem = mca_btl_sm_register_mem;
mca_btl_sm.super.btl_deregister_mem = mca_btl_sm_deregister_mem;
}
#else
/* If the user explicitly asked for knem and we can't provide it,
@ -918,6 +937,8 @@ mca_btl_sm_component_init(int *num_btls,
/* Will only ever have either cma or knem enabled at runtime
so no problems with accidentally overwriting this set earlier */
mca_btl_sm.super.btl_get = mca_btl_sm_get_sync;
mca_btl_sm.super.btl_register_mem = mca_btl_sm_register_mem;
mca_btl_sm.super.btl_deregister_mem = mca_btl_sm_deregister_mem;
}
#else
/* If the user explicitly asked for CMA and we can't provide itm
@ -931,6 +952,21 @@ mca_btl_sm_component_init(int *num_btls,
}
#endif /* OPAL_BTL_SM_HAVE_CMA */
#if OPAL_BTL_SM_HAVE_KNEM | OPAL_BTL_SM_HAVE_CMA
if (mca_btl_sm_component.use_cma || mca_btl_sm_component.use_knem) {
rc = ompi_free_list_init_new (&mca_btl_sm_component.registration_handles,
sizeof (mca_btl_sm_registration_handle_t),
8, OBJ_CLASS(mca_btl_sm_registration_handle_t),
0, 0, mca_btl_sm_component.sm_free_list_num,
mca_btl_sm_component.sm_free_list_max,
mca_btl_sm_component.sm_free_list_inc, NULL);
if (OPAL_SUCCESS != rc) {
free (btls);
return NULL;
}
}
#endif
return btls;
no_knem:
@ -963,6 +999,7 @@ mca_btl_sm_component_init(int *num_btls,
/* disable get when not using knem or cma */
mca_btl_sm.super.btl_get = NULL;
mca_btl_sm.super.btl_flags &= ~MCA_BTL_FLAGS_GET;
mca_btl_sm_component.use_knem = 0;
}
/* Otherwise, use_knem was 0 (and we didn't get here) or use_knem
@ -1090,8 +1127,8 @@ int mca_btl_sm_component_progress(void)
reg = mca_btl_base_active_message_trigger + hdr->tag;
seg.seg_addr.pval = ((char *)hdr) + sizeof(mca_btl_sm_hdr_t);
seg.seg_len = hdr->len;
Frag.base.des_local_count = 1;
Frag.base.des_local = &seg;
Frag.base.des_segment_count = 1;
Frag.base.des_segments = &seg;
reg->cbfunc(&mca_btl_sm.super, hdr->tag, &(Frag.base),
reg->cbdata);
/* return the fragment */
@ -1176,22 +1213,14 @@ int mca_btl_sm_component_progress(void)
mca_btl_sm.knem_status_array[mca_btl_sm.knem_status_first_used]) {
if (KNEM_STATUS_SUCCESS ==
mca_btl_sm.knem_status_array[mca_btl_sm.knem_status_first_used]) {
int btl_ownership;
/* Handle the completed fragment */
frag =
mca_btl_sm.knem_frag_array[mca_btl_sm.knem_status_first_used];
btl_ownership = (frag->base.des_flags &
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK &
frag->base.des_flags)) {
frag->base.des_cbfunc(&mca_btl_sm.super,
frag->endpoint, &frag->base,
OPAL_SUCCESS);
}
if (btl_ownership) {
MCA_BTL_SM_FRAG_RETURN(frag);
}
frag->cb.func (&mca_btl_sm.super, frag->endpoint,
frag->cb.local_address, frag->cb.local_handle,
frag->cb.context, frag->cb.data, OPAL_SUCCESS);
MCA_BTL_SM_FRAG_RETURN(frag);
/* Bump counters, loop around the circular buffer if
necessary */

Просмотреть файл

@ -10,6 +10,8 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow

Просмотреть файл

@ -31,8 +31,8 @@ static inline void mca_btl_sm_frag_common_constructor(mca_btl_sm_frag_t* frag)
frag->hdr->my_smp_rank = mca_btl_sm_component.my_smp_rank;
}
frag->segment.base.seg_len = frag->size;
frag->base.des_local = &frag->segment.base;
frag->base.des_local_count = 1;
frag->base.des_segments = &frag->segment.base;
frag->base.des_segment_count = 1;
frag->base.des_flags = 0;
}

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
@ -11,6 +12,8 @@
* All rights reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -64,6 +67,16 @@ struct mca_btl_sm_frag_t {
/* pointer written to the FIFO, this is the base of the shared memory region */
mca_btl_sm_hdr_t *hdr;
ompi_free_list_t* my_list;
#if OPAL_BTL_SM_HAVE_KNEM
/* rdma callback data. required for async get */
struct {
mca_btl_base_rdma_completion_fn_t func;
void *local_address;
struct mca_btl_base_registration_handle_t *local_handle;
void *context;
void *data;
} cb;
#endif
};
typedef struct mca_btl_sm_frag_t mca_btl_sm_frag_t;
typedef struct mca_btl_sm_frag_t mca_btl_sm_frag1_t;

Просмотреть файл

@ -832,8 +832,8 @@ struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src(
}
#endif /* OPAL_CUDA_SUPPORT */
frag->base.des_local = &(frag->segment.base);
frag->base.des_local_count = 1;
frag->base.des_segments = &(frag->segment.base);
frag->base.des_segment_count = 1;
frag->base.order = MCA_BTL_NO_ORDER;
frag->base.des_remote = NULL;
frag->base.des_remote_count = 0;
@ -1045,8 +1045,8 @@ struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_dst(
frag->base.des_remote = NULL;
frag->base.des_remote_count = 0;
frag->base.des_local = &frag->segment.base;
frag->base.des_local_count = 1;
frag->base.des_segments = &frag->segment.base;
frag->base.des_segment_count = 1;
frag->base.des_flags = flags;
return &frag->base;
}
@ -1059,7 +1059,7 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t* btl,
struct mca_btl_base_descriptor_t* descriptor)
{
mca_btl_smcuda_segment_t *src_seg = (mca_btl_smcuda_segment_t *) descriptor->des_remote;
mca_btl_smcuda_segment_t *dst_seg = (mca_btl_smcuda_segment_t *) descriptor->des_local;
mca_btl_smcuda_segment_t *dst_seg = (mca_btl_smcuda_segment_t *) descriptor->des_segments;
mca_mpool_common_cuda_reg_t rget_reg;
mca_mpool_common_cuda_reg_t *reg_ptr = &rget_reg;
int btl_ownership;

Просмотреть файл

@ -691,7 +691,7 @@ static void btl_smcuda_control(mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t *endpoint;
mca_btl_smcuda_t *smcuda_btl = (mca_btl_smcuda_t *)btl;
mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des;
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
/* Use the rank of the peer that sent the data to get to the endpoint
* structure. This is needed for PML callback. */
@ -1065,8 +1065,8 @@ int mca_btl_smcuda_component_progress(void)
reg = mca_btl_base_active_message_trigger + hdr->tag;
seg.seg_addr.pval = ((char *)hdr) + sizeof(mca_btl_smcuda_hdr_t);
seg.seg_len = hdr->len;
Frag.base.des_local_count = 1;
Frag.base.des_local = &seg;
Frag.base.des_segment_count = 1;
Frag.base.des_segments = &seg;
#if OPAL_CUDA_SUPPORT
Frag.hdr = hdr; /* needed for peer rank in control messages */
#endif /* OPAL_CUDA_SUPPORT */

Просмотреть файл

@ -32,8 +32,8 @@ static inline void mca_btl_smcuda_frag_common_constructor(mca_btl_smcuda_frag_t*
frag->hdr->my_smp_rank = mca_btl_smcuda_component.my_smp_rank;
}
frag->segment.base.seg_len = frag->size;
frag->base.des_local = &frag->segment.base;
frag->base.des_local_count = 1;
frag->base.des_segments = &frag->segment.base;
frag->base.des_segment_count = 1;
frag->base.des_flags = 0;
#if OPAL_CUDA_SUPPORT
frag->registration = NULL;

Просмотреть файл

@ -42,7 +42,6 @@ mca_btl_tcp_module_t mca_btl_tcp_module = {
.btl_alloc = mca_btl_tcp_alloc,
.btl_free = mca_btl_tcp_free,
.btl_prepare_src = mca_btl_tcp_prepare_src,
.btl_prepare_dst = mca_btl_tcp_prepare_dst,
.btl_send = mca_btl_tcp_send,
.btl_put = mca_btl_tcp_put,
.btl_dump = mca_btl_base_dump,
@ -170,8 +169,8 @@ mca_btl_base_descriptor_t* mca_btl_tcp_alloc(
frag->segments[0].seg_len = size;
frag->segments[0].seg_addr.pval = frag+1;
frag->base.des_local = frag->segments;
frag->base.des_local_count = 1;
frag->base.des_segments = frag->segments;
frag->base.des_segment_count = 1;
frag->base.des_flags = flags;
frag->base.order = MCA_BTL_NO_ORDER;
frag->btl = (mca_btl_tcp_module_t*)btl;
@ -202,7 +201,6 @@ int mca_btl_tcp_free(
mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
@ -238,7 +236,7 @@ mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src(
frag->segments[0].seg_addr.pval = (frag + 1);
frag->segments[0].seg_len = reserve;
frag->base.des_local_count = 1;
frag->base.des_segment_count = 1;
if(opal_convertor_need_buffers(convertor)) {
if (max_data + reserve > frag->size) {
@ -268,66 +266,16 @@ mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src(
frag->segments[1].seg_addr.pval = iov.iov_base;
frag->segments[1].seg_len = max_data;
frag->base.des_local_count = 2;
frag->base.des_segment_count = 2;
}
frag->base.des_local = frag->segments;
frag->base.des_remote = NULL;
frag->base.des_remote_count = 0;
frag->base.des_segments = frag->segments;
frag->base.des_flags = flags;
frag->base.order = MCA_BTL_NO_ORDER;
*size = max_data;
return &frag->base;
}
/**
* Prepare a descriptor for send/rdma using the supplied
* convertor. If the convertor references data that is contigous,
* the descriptor may simply point to the user buffer. Otherwise,
* this routine is responsible for allocating buffer space and
* packing if required.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL peer addressing
* @param convertor (IN) Data type convertor
* @param reserve (IN) Additional bytes requested by upper layer to precede user data
* @param size (IN/OUT) Number of bytes to prepare (IN), number of bytes actually prepared (OUT)
*/
mca_btl_base_descriptor_t* mca_btl_tcp_prepare_dst(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size,
uint32_t flags)
{
mca_btl_tcp_frag_t* frag;
if( OPAL_UNLIKELY((*size) > UINT32_MAX) ) { /* limit the size to what we support */
*size = (size_t)UINT32_MAX;
}
MCA_BTL_TCP_FRAG_ALLOC_USER(frag);
if( OPAL_UNLIKELY(NULL == frag) ) {
return NULL;
}
frag->segments->seg_len = *size;
opal_convertor_get_current_pointer( convertor, (void**)&(frag->segments->seg_addr.pval) );
frag->base.des_remote = NULL;
frag->base.des_remote_count = 0;
frag->base.des_local = frag->segments;
frag->base.des_local_count = 1;
frag->base.des_flags = flags;
frag->base.order = MCA_BTL_NO_ORDER;
return &frag->base;
}
/**
* Initiate an asynchronous send.
*
@ -355,7 +303,7 @@ int mca_btl_tcp_send( struct mca_btl_base_module_t* btl,
frag->iov[0].iov_base = (IOVBASE_TYPE*)&frag->hdr;
frag->iov[0].iov_len = sizeof(frag->hdr);
frag->hdr.size = 0;
for( i = 0; i < (int)frag->base.des_local_count; i++) {
for( i = 0; i < (int)frag->base.des_segment_count; i++) {
frag->hdr.size += frag->segments[i].seg_len;
frag->iov[i+1].iov_len = frag->segments[i].seg_len;
frag->iov[i+1].iov_base = (IOVBASE_TYPE*)frag->segments[i].seg_addr.pval;
@ -368,23 +316,55 @@ int mca_btl_tcp_send( struct mca_btl_base_module_t* btl,
return mca_btl_tcp_endpoint_send(endpoint,frag);
}
static void fake_rdma_complete (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
mca_btl_base_descriptor_t *desc, int rc)
{
mca_btl_tcp_frag_t *frag = (mca_btl_tcp_frag_t *) desc;
frag->cb.func (btl, endpoint, frag->segments[0].seg_addr.pval, NULL, frag->cb.context, frag->cb.data,
rc);
}
/**
* Initiate an asynchronous put.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/
int mca_btl_tcp_put( mca_btl_base_module_t* btl,
mca_btl_base_endpoint_t* endpoint,
mca_btl_base_descriptor_t* descriptor )
int mca_btl_tcp_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*) btl;
mca_btl_tcp_frag_t* frag = (mca_btl_tcp_frag_t*)descriptor;
mca_btl_tcp_frag_t *frag = NULL;
int i;
MCA_BTL_TCP_FRAG_ALLOC_USER(frag);
if( OPAL_UNLIKELY(NULL == frag) ) {
return OPAL_ERR_OUT_OF_RESOURCE;;
}
frag->endpoint = endpoint;
frag->segments->seg_len = size;
frag->segments->seg_addr.pval = local_address;
frag->base.des_segments = frag->segments;
frag->base.des_segment_count = 1;
frag->base.order = MCA_BTL_NO_ORDER;
frag->segments[0].seg_addr.pval = local_address;
frag->segments[0].seg_len = size;
frag->segments[1].seg_addr.lval = remote_address;
frag->segments[1].seg_len = size;
frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
frag->base.des_cbfunc = fake_rdma_complete;
frag->cb.func = cbfunc;
frag->cb.data = cbdata;
frag->cb.context = cbcontext;
frag->btl = tcp_btl;
frag->endpoint = endpoint;
frag->rc = 0;
@ -394,9 +374,9 @@ int mca_btl_tcp_put( mca_btl_base_module_t* btl,
frag->iov_ptr = frag->iov;
frag->iov[0].iov_base = (IOVBASE_TYPE*)&frag->hdr;
frag->iov[0].iov_len = sizeof(frag->hdr);
frag->iov[1].iov_base = (IOVBASE_TYPE*)frag->base.des_remote;
frag->iov[1].iov_len = frag->base.des_remote_count * sizeof(mca_btl_base_segment_t);
for( i = 0; i < (int)frag->base.des_local_count; i++ ) {
frag->iov[1].iov_base = (IOVBASE_TYPE*) (frag->segments + 1);
frag->iov[1].iov_len = sizeof(mca_btl_base_segment_t);
for( i = 0; i < (int)frag->base.des_segment_count; i++ ) {
frag->hdr.size += frag->segments[i].seg_len;
frag->iov[i+2].iov_len = frag->segments[i].seg_len;
frag->iov[i+2].iov_base = (IOVBASE_TYPE*)frag->segments[i].seg_addr.pval;
@ -404,7 +384,7 @@ int mca_btl_tcp_put( mca_btl_base_module_t* btl,
}
frag->hdr.base.tag = MCA_BTL_TAG_BTL;
frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_PUT;
frag->hdr.count = frag->base.des_remote_count;
frag->hdr.count = 1;
if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr);
return ((i = mca_btl_tcp_endpoint_send(endpoint,frag)) >= 0 ? OPAL_SUCCESS : i);
}
@ -412,22 +392,46 @@ int mca_btl_tcp_put( mca_btl_base_module_t* btl,
/**
* Initiate an asynchronous get.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*
*/
int mca_btl_tcp_get(
mca_btl_base_module_t* btl,
mca_btl_base_endpoint_t* endpoint,
mca_btl_base_descriptor_t* descriptor)
int mca_btl_tcp_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*) btl;
mca_btl_tcp_frag_t* frag = (mca_btl_tcp_frag_t*)descriptor;
mca_btl_tcp_frag_t* frag = NULL;
int rc;
MCA_BTL_TCP_FRAG_ALLOC_USER(frag);
if( OPAL_UNLIKELY(NULL == frag) ) {
return OPAL_ERR_OUT_OF_RESOURCE;;
}
frag->endpoint = endpoint;
frag->segments->seg_len = size;
frag->segments->seg_addr.pval = local_address;
frag->base.des_segments = frag->segments;
frag->base.des_segment_count = 1;
frag->base.order = MCA_BTL_NO_ORDER;
frag->segments[0].seg_addr.pval = local_address;
frag->segments[0].seg_len = size;
frag->segments[1].seg_addr.lval = remote_address;
frag->segments[1].seg_len = size;
/* call the rdma callback through the descriptor callback. this is
* tcp so the extra latency is not an issue */
frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
frag->base.des_cbfunc = fake_rdma_complete;
frag->cb.func = cbfunc;
frag->cb.data = cbdata;
frag->cb.context = cbcontext;
frag->btl = tcp_btl;
frag->endpoint = endpoint;
frag->rc = 0;
@ -437,11 +441,11 @@ int mca_btl_tcp_get(
frag->iov_ptr = frag->iov;
frag->iov[0].iov_base = (IOVBASE_TYPE*)&frag->hdr;
frag->iov[0].iov_len = sizeof(frag->hdr);
frag->iov[1].iov_base = (IOVBASE_TYPE*)frag->base.des_remote;
frag->iov[1].iov_len = frag->base.des_remote_count * sizeof(mca_btl_base_segment_t);
frag->iov[1].iov_base = (IOVBASE_TYPE*) &frag->segments[1];
frag->iov[1].iov_len = sizeof(mca_btl_base_segment_t);
frag->hdr.base.tag = MCA_BTL_TAG_BTL;
frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_GET;
frag->hdr.count = frag->base.des_remote_count;
frag->hdr.count = 1;
if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr);
return ((rc = mca_btl_tcp_endpoint_send(endpoint,frag)) >= 0 ? OPAL_SUCCESS : rc);
}

Просмотреть файл

@ -217,32 +217,22 @@ extern int mca_btl_tcp_send(
/**
* Initiate an asynchronous put.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/
extern int mca_btl_tcp_put(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* btl_peer,
struct mca_btl_base_descriptor_t* decriptor
);
int mca_btl_tcp_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
/**
* Initiate an asynchronous get.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/
extern int mca_btl_tcp_get(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* btl_peer,
struct mca_btl_base_descriptor_t* decriptor
);
int mca_btl_tcp_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
/**
* Allocate a descriptor with a segment of the requested size.
@ -290,7 +280,6 @@ extern int mca_btl_tcp_free(
mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* peer,
struct mca_mpool_base_registration_t*,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
@ -298,16 +287,6 @@ mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src(
uint32_t flags
);
extern mca_btl_base_descriptor_t* mca_btl_tcp_prepare_dst(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* peer,
struct mca_mpool_base_registration_t*,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size,
uint32_t flags);
/**
* Fault Tolerance Event Notification Function

Просмотреть файл

@ -287,7 +287,7 @@ static int mca_btl_tcp_component_register(void)
MCA_BTL_FLAGS_NEED_CSUM |
MCA_BTL_FLAGS_NEED_ACK |
MCA_BTL_FLAGS_HETEROGENEOUS_RDMA;
mca_btl_tcp_module.super.btl_seg_size = sizeof (mca_btl_base_segment_t);
mca_btl_tcp_module.super.btl_bandwidth = 100;
mca_btl_tcp_module.super.btl_latency = 100;

Просмотреть файл

@ -58,6 +58,12 @@ struct mca_btl_tcp_frag_t {
size_t size;
int rc;
ompi_free_list_t* my_list;
/* fake rdma completion */
struct {
mca_btl_base_rdma_completion_fn_t func;
void *data;
void *context;
} cb;
};
typedef struct mca_btl_tcp_frag_t mca_btl_tcp_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_tcp_frag_t);
@ -116,10 +122,8 @@ do { \
frag->iov_cnt = 1; \
frag->iov_idx = 0; \
frag->iov_ptr = frag->iov; \
frag->base.des_remote = NULL; \
frag->base.des_remote_count = 0; \
frag->base.des_local = frag->segments; \
frag->base.des_local_count = 1; \
frag->base.des_segments = frag->segments; \
frag->base.des_segment_count = 1; \
} while(0)

Просмотреть файл

@ -270,8 +270,8 @@ mca_btl_base_descriptor_t* mca_btl_template_prepare_src(
frag->segment.seg_len = max_data + reserve;
}
frag->base.des_local = &frag->segment;
frag->base.des_local_count = 1;
frag->base.des_segments = &frag->segment;
frag->base.des_segment_count = 1;
frag->base.des_flags = 0;
return &frag->base;
}
@ -311,8 +311,8 @@ mca_btl_base_descriptor_t* mca_btl_template_prepare_dst(
frag->segment.seg_len = *size;
opal_convertor_get_current_pointer( convertor, (void**)&(frag->segment.seg_addr.pval) );
frag->base.des_local = &frag->segment;
frag->base.des_local_count = 1;
frag->base.des_segments = &frag->segment;
frag->base.des_segment_count = 1;
frag->base.des_flags = 0;
return &frag->base;
}

Просмотреть файл

@ -38,7 +38,8 @@ ugni_SOURCES = \
btl_ugni.h \
btl_ugni_smsg.h \
btl_ugni_smsg.c \
btl_ugni_prepare.h
btl_ugni_prepare.h \
btl_ugni_atomic.c
mcacomponentdir = $(opallibdir)
mcacomponent_LTLIBRARIES = $(component_install)

Просмотреть файл

@ -33,6 +33,7 @@
#include "opal/mca/btl/base/btl_base_error.h"
#include "opal/class/opal_hash_table.h"
#include "opal/class/ompi_free_list.h"
#include "opal/class/opal_free_list.h"
#include "opal/mca/common/ugni/common_ugni.h"
#include <errno.h>
@ -80,6 +81,11 @@ typedef struct mca_btl_ugni_module_t {
opal_mutex_t eager_get_pending_lock;
opal_list_t eager_get_pending;
opal_mutex_t pending_descriptors_lock;
opal_list_t pending_descriptors;
ompi_free_list_t post_descriptors;
mca_mpool_base_module_t *smsg_mpool;
ompi_free_list_t smsg_mboxes;
@ -143,8 +149,6 @@ typedef struct mca_btl_ugni_component_t {
/* After this message size switch to BTE protocols */
size_t ugni_fma_limit;
/* Switch to put when trying to GET at or above this size */
size_t ugni_get_limit;
/* Switch to get when sending above this size */
size_t ugni_smsg_limit;
@ -260,33 +264,31 @@ mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl,
uint32_t flags, mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t **descriptor);
/**
* Initiate a get operation.
*
* location: btl_ugni_get.c
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/
int
mca_btl_ugni_get (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des);
int mca_btl_ugni_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
/**
* Initiate a put operation.
*
* location: btl_ugni_put.c
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/
int
mca_btl_ugni_put (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des);
int mca_btl_ugni_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata);
int mca_btl_ugni_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value,
int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
int mca_btl_ugni_progress_send_wait_list (struct mca_btl_base_endpoint_t *endpoint);
@ -295,9 +297,14 @@ mca_btl_ugni_alloc(struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
uint8_t order, size_t size, uint32_t flags);
struct mca_btl_base_registration_handle_t {
/** uGNI memory handle */
gni_mem_handle_t gni_handle;
};
typedef struct mca_btl_ugni_reg_t {
mca_mpool_base_registration_t base;
gni_mem_handle_t memory_hdl;
mca_btl_base_registration_handle_t handle;
} mca_btl_ugni_reg_t;
/* Global structures */

Просмотреть файл

@ -34,7 +34,6 @@ int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t **peers,
opal_bitmap_t *reachable) {
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
opal_proc_t *my_proc = opal_proc_local_get();
size_t i;
int rc;
@ -66,11 +65,8 @@ int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl,
if (OPAL_PROC_ON_LOCAL_NODE(ompi_proc->proc_flags)) {
ugni_module->nlocal_procs++;
/* Do not use uGNI to communicate with local procs unless we are adding more ranks.
* Change this when sm and vader are updated to handle additional add procs. */
if (!ugni_module->initialized || my_proc == ompi_proc) {
continue;
}
/* ugni is allowed on local processes to provide support for network
* atomic operations */
}
/* Create and Init endpoints */
@ -188,7 +184,7 @@ static int ugni_reg_rdma_mem (void *reg_data, void *base, size_t size,
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base,
size, NULL, GNI_MEM_READWRITE | GNI_MEM_RELAXED_PI_ORDERING,
-1, &(ugni_reg->memory_hdl));
-1, &(ugni_reg->handle.gni_handle));
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
@ -211,7 +207,7 @@ static int ugni_reg_smsg_mem (void *reg_data, void *base, size_t size,
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base,
size, ugni_module->smsg_remote_cq, GNI_MEM_READWRITE, -1,
&(ugni_reg->memory_hdl));
&(ugni_reg->handle.gni_handle));
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
return opal_common_rc_ugni_to_opal (rc);
}
@ -224,7 +220,7 @@ ugni_dereg_mem (void *reg_data, mca_mpool_base_registration_t *reg)
gni_return_t rc;
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
rc = GNI_MemDeregister (ugni_module->device->dev_handle, &ugni_reg->memory_hdl);
rc = GNI_MemDeregister (ugni_module->device->dev_handle, &ugni_reg->handle.gni_handle);
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
if (GNI_RC_SUCCESS != rc) {
return OPAL_ERROR;
@ -401,6 +397,15 @@ mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module)
return rc;
}
rc = ompi_free_list_init_new (&ugni_module->post_descriptors,
sizeof (mca_btl_ugni_post_descriptor_t),
8, OBJ_CLASS(mca_btl_ugni_post_descriptor_t),
0, 0, 0, -1, 256, NULL);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
BTL_ERROR(("error creating post descriptor free list"));
return rc;
}
return OPAL_SUCCESS;
}

135
opal/mca/btl/ugni/btl_ugni_atomic.c Обычный файл
Просмотреть файл

@ -0,0 +1,135 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_ugni_rdma.h"
static gni_fma_cmd_type_t famo_cmds[] = {
[MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC_FADD,
[MCA_BTL_ATOMIC_AND] = GNI_FMA_ATOMIC_FAND,
[MCA_BTL_ATOMIC_OR] = GNI_FMA_ATOMIC_FOR,
[MCA_BTL_ATOMIC_XOR] = GNI_FMA_ATOMIC_FXOR,
};
static gni_fma_cmd_type_t amo_cmds[] = {
[MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC_ADD,
[MCA_BTL_ATOMIC_AND] = GNI_FMA_ATOMIC_AND,
[MCA_BTL_ATOMIC_OR] = GNI_FMA_ATOMIC_OR,
[MCA_BTL_ATOMIC_XOR] = GNI_FMA_ATOMIC_XOR,
};
int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
gni_mem_handle_t dummy = {0, 0};
mca_btl_ugni_post_descriptor_t *post_desc;
int rc;
rc = mca_btl_ugni_check_endpoint_state_rdma (endpoint);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
return rc;
}
mca_btl_ugni_alloc_post_descriptor (endpoint, NULL, cbfunc, cbcontext, cbdata, &post_desc);
if (OPAL_UNLIKELY(NULL == post_desc)) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
init_gni_post_desc (&post_desc->desc, order, GNI_POST_AMO, 0, dummy, remote_address,
remote_handle->gni_handle, 8, 0);
post_desc->desc.base.amo_cmd = amo_cmds[op];
post_desc->desc.base.first_operand = operand;
OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock);
rc = GNI_PostFma (endpoint->rdma_ep_handle, &post_desc->desc.base);
OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock);
if (GNI_RC_SUCCESS != rc) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
return OPAL_SUCCESS;
}
int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata)
{
mca_btl_ugni_post_descriptor_t *post_desc;
int rc;
rc = mca_btl_ugni_check_endpoint_state_rdma (endpoint);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
return rc;
}
mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata, &post_desc);
if (OPAL_UNLIKELY(NULL == post_desc)) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
init_gni_post_desc (&post_desc->desc, order, GNI_POST_AMO, (intptr_t) local_address, local_handle->gni_handle,
remote_address, remote_handle->gni_handle, 8, 0);
post_desc->desc.base.amo_cmd = famo_cmds[op];
post_desc->desc.base.first_operand = operand;
OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock);
rc = GNI_PostFma (endpoint->rdma_ep_handle, &post_desc->desc.base);
OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock);
if (GNI_RC_SUCCESS != rc) {
mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc);
return OPAL_ERR_OUT_OF_RESOURCE;
}
return OPAL_SUCCESS;
}
int mca_btl_ugni_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
mca_btl_ugni_post_descriptor_t *post_desc;
int rc;
rc = mca_btl_ugni_check_endpoint_state_rdma (endpoint);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
return rc;
}
mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata, &post_desc);
if (OPAL_UNLIKELY(NULL == post_desc)) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
init_gni_post_desc (&post_desc->desc, order, GNI_POST_AMO, (intptr_t) local_address, local_handle->gni_handle,
remote_address, remote_handle->gni_handle, 8, 0);
post_desc->desc.base.amo_cmd = GNI_FMA_ATOMIC_CSWAP;
post_desc->desc.base.first_operand = compare;
post_desc->desc.base.second_operand = value;
OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock);
rc = GNI_PostFma (endpoint->rdma_ep_handle, &post_desc->desc.base);
OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock);
if (GNI_RC_SUCCESS != rc) {
mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc);
return OPAL_ERR_OUT_OF_RESOURCE;
}
return OPAL_SUCCESS;
}

Просмотреть файл

@ -52,6 +52,7 @@ static int
btl_ugni_component_register(void)
{
mca_base_var_enum_t *new_enum;
gni_nic_device_t device_type;
int rc;
(void) mca_base_var_group_component_register(&mca_btl_ugni_component.super.btl_version,
@ -139,15 +140,6 @@ btl_ugni_component_register(void)
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_fma_limit);
mca_btl_ugni_component.ugni_get_limit = 1 * 1024 * 1024;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"get_limit", "Maximum size message that "
"will be sent using a get protocol "
"(default 1M)", MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_get_limit);
mca_btl_ugni_component.rdma_max_retries = 16;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"rdma_max_retries", NULL, MCA_BASE_VAR_TYPE_INT,
@ -212,13 +204,28 @@ btl_ugni_component_register(void)
mca_btl_ugni_module.super.btl_max_send_size = 8 * 1024;
mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = 8 * 1024;
mca_btl_ugni_module.super.btl_get_limit = 1 * 1024 * 1024;
/* determine if there are get alignment restrictions */
GNI_GetDeviceType (&device_type);
if (GNI_DEVICE_GEMINI == device_type) {
mca_btl_ugni_module.super.btl_get_alignment = 4;
} else {
mca_btl_ugni_module.super.btl_get_alignment = 0;
}
/* threshold for put */
mca_btl_ugni_module.super.btl_min_rdma_pipeline_size = 8 * 1024;
mca_btl_ugni_module.super.btl_flags = MCA_BTL_FLAGS_SEND |
MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE;
MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_ATOMIC_OPS |
MCA_BTL_FLAGS_ATOMIC_FOPS;
mca_btl_ugni_module.super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD |
MCA_BTL_ATOMIC_SUPPORTS_AND | MCA_BTL_ATOMIC_SUPPORTS_OR | MCA_BTL_ATOMIC_SUPPORTS_XOR |
MCA_BTL_ATOMIC_SUPPORTS_CSWAP;
mca_btl_ugni_module.super.btl_seg_size = sizeof (mca_btl_ugni_segment_t);
mca_btl_ugni_module.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
mca_btl_ugni_module.super.btl_bandwidth = 40000; /* Mbs */
mca_btl_ugni_module.super.btl_latency = 2; /* Microsecs */
@ -425,89 +432,107 @@ mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module)
return count;
}
static inline int
mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, int which_cq)
#if OPAL_ENABLE_DEBUG
static inline void btl_ugni_dump_post_desc (mca_btl_ugni_post_descriptor_t *desc)
{
opal_common_ugni_post_desc_t *desc;
mca_btl_ugni_base_frag_t *frag;
gni_cq_entry_t event_data = 0;
uint32_t recoverable = 1;
gni_return_t rc;
gni_cq_handle_t the_cq;
the_cq = (which_cq == 0) ? ugni_module->rdma_local_cq : ugni_module->rdma_local_irq_cq;
fprintf (stderr, "desc->desc.base.post_id = %" PRIx64 "\n", desc->desc.base.post_id);
fprintf (stderr, "desc->desc.base.status = %" PRIx64 "\n", desc->desc.base.status);
fprintf (stderr, "desc->desc.base.cq_mode_complete = %hu\n", desc->desc.base.cq_mode_complete);
fprintf (stderr, "desc->desc.base.type = %d\n", desc->desc.base.type);
fprintf (stderr, "desc->desc.base.cq_mode = %hu\n", desc->desc.base.cq_mode);
fprintf (stderr, "desc->desc.base.dlvr_mode = %hu\n", desc->desc.base.dlvr_mode);
fprintf (stderr, "desc->desc.base.local_addr = %" PRIx64 "\n", desc->desc.base.local_addr);
fprintf (stderr, "desc->desc.base.local_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->desc.base.local_mem_hndl.qword1,
desc->desc.base.local_mem_hndl.qword2);
fprintf (stderr, "desc->desc.base.remote_addr = %" PRIx64 "\n", desc->desc.base.remote_addr);
fprintf (stderr, "desc->desc.base.remote_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->desc.base.remote_mem_hndl.qword1,
desc->desc.base.remote_mem_hndl.qword2);
fprintf (stderr, "desc->desc.base.length = %" PRIu64 "\n", desc->desc.base.length);
fprintf (stderr, "desc->desc.base.rdma_mode = %hu\n", desc->desc.base.rdma_mode);
fprintf (stderr, "desc->desc.base.amo_cmd = %d\n", desc->desc.base.amo_cmd);
}
#endif
static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, int which_cq)
{
mca_btl_ugni_post_descriptor_t *post_desc = NULL;
gni_cq_entry_t event_data = 0;
gni_post_descriptor_t *desc;
uint32_t recoverable = 1;
gni_return_t grc;
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
rc = GNI_CqGetEvent (the_cq, &event_data);
if (GNI_RC_NOT_DONE == rc) {
grc = GNI_CqGetEvent (ugni_module->rdma_local_cq, &event_data);
if (GNI_RC_NOT_DONE == grc) {
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
return 0;
}
if (OPAL_UNLIKELY((GNI_RC_SUCCESS != rc && !event_data) || GNI_CQ_OVERRUN(event_data))) {
if (OPAL_UNLIKELY((GNI_RC_SUCCESS != grc && !event_data) || GNI_CQ_OVERRUN(event_data))) {
/* TODO -- need to handle overrun -- how do we do this without an event?
will the event eventually come back? Ask Cray */
BTL_ERROR(("unhandled post error! ugni rc = %d %s", rc,gni_err_str[rc]));
BTL_ERROR(("unhandled post error! ugni rc = %d %s", grc, gni_err_str[grc]));
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
return opal_common_rc_ugni_to_opal (rc);
return opal_common_rc_ugni_to_opal (grc);
}
rc = GNI_GetCompleted (the_cq, event_data, (gni_post_descriptor_t **) &desc);
grc = GNI_GetCompleted (ugni_module->rdma_local_cq, event_data, &desc);
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc && GNI_RC_TRANSACTION_ERROR != rc)) {
BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[rc]));
return opal_common_rc_ugni_to_opal (rc);
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc && GNI_RC_TRANSACTION_ERROR != grc)) {
BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[grc]));
return opal_common_rc_ugni_to_opal (grc);
}
frag = MCA_BTL_UGNI_DESC_TO_FRAG(desc);
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc || !GNI_CQ_STATUS_OK(event_data))) {
char buffer[1024];
post_desc = MCA_BTL_UGNI_DESC_TO_PDESC(desc);
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc || !GNI_CQ_STATUS_OK(event_data))) {
(void) GNI_CqErrorRecoverable (event_data, &recoverable);
GNI_CqErrorStr(event_data,buffer,sizeof(buffer));
if (OPAL_UNLIKELY(++frag->post_desc.tries >= mca_btl_ugni_component.rdma_max_retries ||
if (OPAL_UNLIKELY(++post_desc->desc.tries >= mca_btl_ugni_component.rdma_max_retries ||
!recoverable)) {
char char_buffer[1024];
GNI_CqErrorStr (event_data, char_buffer, 1024);
/* give up */
BTL_ERROR(("giving up on frag %p type %d CQE error %s", (void *) frag, frag->post_desc.base.type, buffer));
mca_btl_ugni_frag_complete (frag, OPAL_ERROR);
BTL_ERROR(("giving up on desciptor %p, recoverable %d: %s", (void *) post_desc,
recoverable, char_buffer));
#if OPAL_ENABLE_DEBUG
btl_ugni_dump_post_desc (post_desc);
#endif
mca_btl_ugni_post_desc_complete (ugni_module, post_desc, OPAL_ERROR);
return OPAL_ERROR;
}
/* repost transaction */
mca_btl_ugni_repost (frag);
mca_btl_ugni_repost (ugni_module, post_desc);
return 0;
}
BTL_VERBOSE(("RDMA/FMA complete for frag %p", (void *) frag));
mca_btl_ugni_frag_complete (frag, opal_common_rc_ugni_to_opal (rc));
mca_btl_ugni_post_desc_complete (ugni_module, post_desc, opal_common_rc_ugni_to_opal (grc));
return 1;
}
static inline int
mca_btl_ugni_retry_failed (mca_btl_ugni_module_t *ugni_module)
mca_btl_ugni_post_pending (mca_btl_ugni_module_t *ugni_module)
{
int count = opal_list_get_size (&ugni_module->failed_frags);
int count = opal_list_get_size (&ugni_module->pending_descriptors);
int i;
for (i = 0 ; i < count ; ++i) {
OPAL_THREAD_LOCK(&ugni_module->failed_frags_lock);
mca_btl_ugni_base_frag_t *frag =
(mca_btl_ugni_base_frag_t *) opal_list_remove_first (&ugni_module->failed_frags);
OPAL_THREAD_UNLOCK(&ugni_module->failed_frags_lock);
if (NULL == frag) {
OPAL_THREAD_LOCK(&ugni_module->pending_descriptors_lock);
mca_btl_ugni_post_descriptor_t *post_desc =
(mca_btl_ugni_post_descriptor_t *) opal_list_remove_first (&ugni_module->pending_descriptors);
OPAL_THREAD_UNLOCK(&ugni_module->pending_descriptors_lock);
if (OPAL_SUCCESS != mca_btl_ugni_repost (ugni_module, post_desc)) {
break;
}
mca_btl_ugni_repost (frag);
}
return count;
return i;
}
static inline int
@ -557,7 +582,6 @@ static int mca_btl_ugni_component_progress (void)
for (i = 0 ; i < mca_btl_ugni_component.ugni_num_btls ; ++i) {
ugni_module = mca_btl_ugni_component.modules + i;
mca_btl_ugni_retry_failed (ugni_module);
mca_btl_ugni_progress_wait_list (ugni_module);
count += mca_btl_ugni_progress_datagram (ugni_module);
@ -565,6 +589,8 @@ static int mca_btl_ugni_component_progress (void)
count += mca_btl_ugni_progress_remote_smsg (ugni_module);
count += mca_btl_ugni_progress_rdma (ugni_module, 0);
/* post pending after progressing rdma */
mca_btl_ugni_post_pending (ugni_module);
}
return count;

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше