1
1

Merge pull request #275 from hjelmn/btlmod

Updated the btl interface. Please update your components.
Этот коммит содержится в:
Nathan Hjelm 2014-11-19 15:01:40 -07:00
родитель 6a19bf85dd 5a0a48c3c4
Коммит ccaecf0fd6
129 изменённых файлов: 6476 добавлений и 5681 удалений

Просмотреть файл

@ -152,7 +152,7 @@ AC_DEFUN([OPAL_CHECK_OPENFABRICS],[
# If we have the openib stuff available, find out what we've got # If we have the openib stuff available, find out what we've got
AS_IF([test "$ompi_check_openib_happy" = "yes"], AS_IF([test "$ompi_check_openib_happy" = "yes"],
[AC_CHECK_DECLS([IBV_EVENT_CLIENT_REREGISTER, IBV_ACCESS_SO, IBV_TRANSPORT_USNIC, IBV_TRANSPORT_USNIC_UDP, IBV_NODE_USNIC], [], [], [AC_CHECK_DECLS([IBV_EVENT_CLIENT_REREGISTER, IBV_ACCESS_SO, IBV_TRANSPORT_USNIC, IBV_TRANSPORT_USNIC_UDP, IBV_NODE_USNIC, IBV_ATOMIC_HCA], [], [],
[#include <infiniband/verbs.h>]) [#include <infiniband/verbs.h>])
AC_CHECK_FUNCS([ibv_get_device_list ibv_resize_cq]) AC_CHECK_FUNCS([ibv_get_device_list ibv_resize_cq])

Просмотреть файл

@ -47,6 +47,10 @@ static ompi_errcode_intern_t ompi_err_request;
static ompi_errcode_intern_t ompi_err_buffer; static ompi_errcode_intern_t ompi_err_buffer;
static ompi_errcode_intern_t ompi_err_rma_sync; static ompi_errcode_intern_t ompi_err_rma_sync;
static ompi_errcode_intern_t ompi_err_rma_shared; static ompi_errcode_intern_t ompi_err_rma_shared;
static ompi_errcode_intern_t ompi_err_rma_attach;
static ompi_errcode_intern_t ompi_err_rma_range;
static ompi_errcode_intern_t ompi_err_rma_conflict;
static ompi_errcode_intern_t ompi_err_win;
static void ompi_errcode_intern_construct(ompi_errcode_intern_t* errcode); static void ompi_errcode_intern_construct(ompi_errcode_intern_t* errcode);
static void ompi_errcode_intern_destruct(ompi_errcode_intern_t* errcode); static void ompi_errcode_intern_destruct(ompi_errcode_intern_t* errcode);
@ -210,6 +214,38 @@ int ompi_errcode_intern_init (void)
opal_pointer_array_set_item(&ompi_errcodes_intern, ompi_err_rma_shared.index, opal_pointer_array_set_item(&ompi_errcodes_intern, ompi_err_rma_shared.index,
&ompi_err_rma_shared); &ompi_err_rma_shared);
OBJ_CONSTRUCT(&ompi_err_rma_attach, ompi_errcode_intern_t);
ompi_err_rma_attach.code = OMPI_ERR_RMA_ATTACH;
ompi_err_rma_attach.mpi_code = MPI_ERR_RMA_ATTACH;
ompi_err_rma_attach.index = pos++;
strncpy(ompi_err_rma_attach.errstring, "OMPI_ERR_RMA_ATTACH", OMPI_MAX_ERROR_STRING);
opal_pointer_array_set_item(&ompi_errcodes_intern, ompi_err_rma_attach.index,
&ompi_err_rma_attach);
OBJ_CONSTRUCT(&ompi_err_rma_range, ompi_errcode_intern_t);
ompi_err_rma_range.code = OMPI_ERR_RMA_RANGE;
ompi_err_rma_range.mpi_code = MPI_ERR_RMA_RANGE;
ompi_err_rma_range.index = pos++;
strncpy(ompi_err_rma_range.errstring, "OMPI_ERR_RMA_RANGE", OMPI_MAX_ERROR_STRING);
opal_pointer_array_set_item(&ompi_errcodes_intern, ompi_err_rma_range.index,
&ompi_err_rma_range);
OBJ_CONSTRUCT(&ompi_err_rma_conflict, ompi_errcode_intern_t);
ompi_err_rma_conflict.code = OMPI_ERR_RMA_CONFLICT;
ompi_err_rma_conflict.mpi_code = MPI_ERR_RMA_CONFLICT;
ompi_err_rma_conflict.index = pos++;
strncpy(ompi_err_rma_conflict.errstring, "OMPI_ERR_RMA_CONFLICT", OMPI_MAX_ERROR_STRING);
opal_pointer_array_set_item(&ompi_errcodes_intern, ompi_err_rma_conflict.index,
&ompi_err_rma_conflict);
OBJ_CONSTRUCT(&ompi_err_win, ompi_errcode_intern_t);
ompi_err_win.code = OMPI_ERR_WIN;
ompi_err_win.mpi_code = MPI_ERR_WIN;
ompi_err_win.index = pos++;
strncpy(ompi_err_win.errstring, "OMPI_ERR_WIN", OMPI_MAX_ERROR_STRING);
opal_pointer_array_set_item(&ompi_errcodes_intern, ompi_err_win.index,
&ompi_err_win);
ompi_errcode_intern_lastused=pos; ompi_errcode_intern_lastused=pos;
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
@ -235,6 +271,10 @@ int ompi_errcode_intern_finalize(void)
OBJ_DESTRUCT(&ompi_err_request); OBJ_DESTRUCT(&ompi_err_request);
OBJ_DESTRUCT(&ompi_err_rma_sync); OBJ_DESTRUCT(&ompi_err_rma_sync);
OBJ_DESTRUCT(&ompi_err_rma_shared); OBJ_DESTRUCT(&ompi_err_rma_shared);
OBJ_DESTRUCT(&ompi_err_rma_attach);
OBJ_DESTRUCT(&ompi_err_rma_range);
OBJ_DESTRUCT(&ompi_err_rma_conflict);
OBJ_DESTRUCT(&ompi_err_win);
OBJ_DESTRUCT(&ompi_errcodes_intern); OBJ_DESTRUCT(&ompi_errcodes_intern);
return OMPI_SUCCESS; return OMPI_SUCCESS;

Просмотреть файл

@ -66,7 +66,11 @@ enum {
OMPI_ERR_REQUEST = OMPI_ERR_BASE - 1, OMPI_ERR_REQUEST = OMPI_ERR_BASE - 1,
OMPI_ERR_RMA_SYNC = OMPI_ERR_BASE - 2, OMPI_ERR_RMA_SYNC = OMPI_ERR_BASE - 2,
OMPI_ERR_RMA_SHARED = OMPI_ERR_BASE - 3 OMPI_ERR_RMA_SHARED = OMPI_ERR_BASE - 3,
OMPI_ERR_RMA_ATTACH = OMPI_ERR_BASE - 4,
OMPI_ERR_RMA_RANGE = OMPI_ERR_BASE - 5,
OMPI_ERR_RMA_CONFLICT = OMPI_ERR_BASE - 6,
OMPI_ERR_WIN = OMPI_ERR_BASE - 7,
}; };
#define OMPI_ERR_MAX (OMPI_ERR_BASE - 100) #define OMPI_ERR_MAX (OMPI_ERR_BASE - 100)

Просмотреть файл

@ -91,7 +91,7 @@ static void mca_bml_base_completion(
{ {
mca_bml_base_context_t* ctx = (mca_bml_base_context_t*) des->des_cbdata; mca_bml_base_context_t* ctx = (mca_bml_base_context_t*) des->des_cbdata;
/* restore original state */ /* restore original state */
((unsigned char*)des->des_local[0].seg_addr.pval)[ctx->index] ^= ~0; ((unsigned char*)des->des_segments[0].seg_addr.pval)[ctx->index] ^= ~0;
des->des_cbdata = ctx->cbdata; des->des_cbdata = ctx->cbdata;
des->des_cbfunc = ctx->cbfunc; des->des_cbfunc = ctx->cbfunc;
free(ctx); free(ctx);
@ -121,11 +121,11 @@ int mca_bml_base_send( mca_bml_base_btl_t* bml_btl,
malloc(sizeof(mca_bml_base_context_t)); malloc(sizeof(mca_bml_base_context_t));
if(NULL != ctx) { if(NULL != ctx) {
opal_output(0, "%s:%d: corrupting data\n", __FILE__, __LINE__); opal_output(0, "%s:%d: corrupting data\n", __FILE__, __LINE__);
ctx->index = (size_t) ((des->des_local[0].seg_len * ctx->index = (size_t) ((des->des_segments[0].seg_len *
opal_rand(&mca_bml_base_rand_buff) * 1.0) / (UINT32_MAX + 1.0)); opal_rand(&mca_bml_base_rand_buff) * 1.0) / (UINT32_MAX + 1.0));
ctx->cbfunc = des->des_cbfunc; ctx->cbfunc = des->des_cbfunc;
ctx->cbdata = des->des_cbdata; ctx->cbdata = des->des_cbdata;
((unsigned char*)des->des_local[0].seg_addr.pval)[ctx->index] ^= ~0; ((unsigned char*)des->des_segments[0].seg_addr.pval)[ctx->index] ^= ~0;
des->des_cbdata = ctx; des->des_cbdata = ctx;
des->des_cbfunc = mca_bml_base_completion; des->des_cbfunc = mca_bml_base_completion;
} }

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
@ -10,7 +11,7 @@
* Copyright (c) 2004-2006 The Regents of the University of California. * Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights * Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
@ -307,27 +308,30 @@ static inline int mca_bml_base_sendi( mca_bml_base_btl_t* bml_btl,
payload_size, order, flags, tag, descriptor); payload_size, order, flags, tag, descriptor);
} }
static inline int mca_bml_base_put( mca_bml_base_btl_t* bml_btl, static inline int mca_bml_base_put( mca_bml_base_btl_t* bml_btl, void *local_address, uint64_t remote_address,
mca_btl_base_descriptor_t* des) struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, size_t size,
int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbdata)
{ {
mca_btl_base_module_t* btl = bml_btl->btl; mca_btl_base_module_t* btl = bml_btl->btl;
des->des_context = (void*) bml_btl; return btl->btl_put( btl, bml_btl->btl_endpoint, local_address, remote_address, local_handle,
return btl->btl_put( btl, bml_btl->btl_endpoint, des ); remote_handle, size, flags, order, cbfunc, (void *) bml_btl, cbdata);
} }
static inline int mca_bml_base_get( mca_bml_base_btl_t* bml_btl, static inline int mca_bml_base_get( mca_bml_base_btl_t* bml_btl, void *local_address, uint64_t remote_address,
mca_btl_base_descriptor_t* des) struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, size_t size,
int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbdata)
{ {
mca_btl_base_module_t* btl = bml_btl->btl; mca_btl_base_module_t* btl = bml_btl->btl;
des->des_context = (void*) bml_btl; return btl->btl_get( btl, bml_btl->btl_endpoint, local_address, remote_address, local_handle,
return btl->btl_get( btl, bml_btl->btl_endpoint, des ); remote_handle, size, flags, order, cbfunc, (void *) bml_btl, cbdata);
} }
static inline void mca_bml_base_prepare_src(mca_bml_base_btl_t* bml_btl, static inline void mca_bml_base_prepare_src(mca_bml_base_btl_t* bml_btl,
mca_mpool_base_registration_t* reg,
struct opal_convertor_t* conv, struct opal_convertor_t* conv,
uint8_t order, uint8_t order,
size_t reserve, size_t reserve,
@ -337,29 +341,27 @@ static inline void mca_bml_base_prepare_src(mca_bml_base_btl_t* bml_btl,
{ {
mca_btl_base_module_t* btl = bml_btl->btl; mca_btl_base_module_t* btl = bml_btl->btl;
*des = btl->btl_prepare_src( btl, bml_btl->btl_endpoint, reg, conv, *des = btl->btl_prepare_src( btl, bml_btl->btl_endpoint, conv,
order, reserve, size, flags ); order, reserve, size, flags );
if( OPAL_LIKELY((*des) != NULL) ) { if( OPAL_LIKELY((*des) != NULL) ) {
(*des)->des_context = (void*) bml_btl; (*des)->des_context = (void*) bml_btl;
} }
} }
static inline void mca_bml_base_prepare_dst(mca_bml_base_btl_t* bml_btl, static inline void mca_bml_base_register_mem (mca_bml_base_btl_t* bml_btl, void *base,
mca_mpool_base_registration_t* reg, size_t size, uint32_t flags,
struct opal_convertor_t* conv, mca_btl_base_registration_handle_t **handle)
uint8_t order, {
size_t reserve,
size_t *size,
uint32_t flags,
mca_btl_base_descriptor_t** des)
{
mca_btl_base_module_t* btl = bml_btl->btl; mca_btl_base_module_t* btl = bml_btl->btl;
*des = btl->btl_prepare_dst( btl, bml_btl->btl_endpoint, reg, conv, *handle = btl->btl_register_mem (btl, bml_btl->btl_endpoint, base, size, flags);
order, reserve, size, flags ); }
if( OPAL_LIKELY((*des) != NULL) ) {
(*des)->des_context = (void*) bml_btl; static inline void mca_bml_base_deregister_mem (mca_bml_base_btl_t* bml_btl, mca_btl_base_registration_handle_t *handle)
} {
mca_btl_base_module_t* btl = bml_btl->btl;
btl->btl_deregister_mem (btl, handle);
} }
/* /*

Просмотреть файл

@ -86,9 +86,7 @@ static int mca_bml_r2_add_btls( void )
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
for(selected_btl = (mca_btl_base_selected_module_t*)opal_list_get_first(btls); OPAL_LIST_FOREACH(selected_btl, btls, mca_btl_base_selected_module_t) {
selected_btl != (mca_btl_base_selected_module_t*)opal_list_get_end(btls);
selected_btl = (mca_btl_base_selected_module_t*)opal_list_get_next(selected_btl)) {
mca_btl_base_module_t *btl = selected_btl->btl_module; mca_btl_base_module_t *btl = selected_btl->btl_module;
mca_bml_r2.btl_modules[mca_bml_r2.num_btl_modules++] = btl; mca_bml_r2.btl_modules[mca_bml_r2.num_btl_modules++] = btl;
for (i = 0; NULL != btl_names_argv && NULL != btl_names_argv[i]; ++i) { for (i = 0; NULL != btl_names_argv && NULL != btl_names_argv[i]; ++i) {
@ -127,6 +125,23 @@ static int btl_bandwidth_compare(const void *v1, const void *v2)
return b2->btl->btl_bandwidth - b1->btl->btl_bandwidth; return b2->btl->btl_bandwidth - b1->btl->btl_bandwidth;
} }
static void mca_bml_r2_calculate_bandwidth_latency (mca_bml_base_btl_array_t *btl_array, double *total_bandwidth, uint32_t *latency)
{
const size_t array_length = mca_bml_base_btl_array_get_size (btl_array);
*latency = UINT_MAX;
*total_bandwidth = 0.;
for (size_t i = 0 ; i < array_length ; ++i) {
mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_get_index (btl_array, i);
mca_btl_base_module_t *btl = bml_btl->btl;
*total_bandwidth += btl->btl_bandwidth;
if (btl->btl_latency < *latency) {
*latency = btl->btl_latency;
}
}
}
/* /*
* For each proc setup a datastructure that indicates the BTLs * For each proc setup a datastructure that indicates the BTLs
* that can be used to reach the destination. * that can be used to reach the destination.
@ -189,6 +204,7 @@ static int mca_bml_r2_add_procs( size_t nprocs,
for(p_index = 0; p_index < mca_bml_r2.num_btl_modules; p_index++) { for(p_index = 0; p_index < mca_bml_r2.num_btl_modules; p_index++) {
mca_btl_base_module_t* btl = mca_bml_r2.btl_modules[p_index]; mca_btl_base_module_t* btl = mca_bml_r2.btl_modules[p_index];
int btl_inuse = 0; int btl_inuse = 0;
int btl_flags;
/* if the r2 can reach the destination proc it sets the /* if the r2 can reach the destination proc it sets the
* corresponding bit (proc index) in the reachable bitmap * corresponding bit (proc index) in the reachable bitmap
@ -212,7 +228,7 @@ static int mca_bml_r2_add_procs( size_t nprocs,
ompi_proc_t *proc = new_procs[p]; ompi_proc_t *proc = new_procs[p];
mca_bml_base_endpoint_t * bml_endpoint = mca_bml_base_endpoint_t * bml_endpoint =
(mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
mca_bml_base_btl_t* bml_btl; mca_bml_base_btl_t* bml_btl = NULL;
size_t size; size_t size;
if(NULL == bml_endpoint) { if(NULL == bml_endpoint) {
@ -236,12 +252,35 @@ static int mca_bml_r2_add_procs( size_t nprocs,
bml_endpoint->btl_flags_or = 0; bml_endpoint->btl_flags_or = 0;
} }
btl_flags = btl->btl_flags;
if( (btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put) ) {
opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for"
" the %s BTL without any PUT function attached. Discard the flag !",
bml_btl->btl->btl_component->btl_version.mca_component_name);
btl_flags ^= MCA_BTL_FLAGS_PUT;
}
if( (btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get) ) {
opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for"
" the %s BTL without any GET function attached. Discard the flag !",
bml_btl->btl->btl_component->btl_version.mca_component_name);
btl_flags ^= MCA_BTL_FLAGS_GET;
}
if( (btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0 ) {
/**
* If no protocol specified, we have 2 choices: we ignore the BTL
* as we don't know which protocl to use, or we suppose that all
* BTLs support the send protocol.
*/
btl_flags |= MCA_BTL_FLAGS_SEND;
}
/* dont allow an additional BTL with a lower exclusivity ranking */ /* dont allow an additional BTL with a lower exclusivity ranking */
size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
if(size > 0) { if(size > 0) {
bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, size-1); bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, size-1);
/* skip this btl if the exclusivity is less than the previous */ /* skip this btl if the exclusivity is less than the previous only if the btl does not provide full rdma (for one-sided) */
if(bml_btl->btl->btl_exclusivity > btl->btl_exclusivity) { if(bml_btl->btl->btl_exclusivity > btl->btl_exclusivity && ((btl_flags & MCA_BTL_FLAGS_RDMA) != MCA_BTL_FLAGS_RDMA)) {
btl->btl_del_procs(btl, 1, (opal_proc_t**)&proc, &btl_endpoints[p]); btl->btl_del_procs(btl, 1, (opal_proc_t**)&proc, &btl_endpoints[p]);
opal_output_verbose(20, opal_btl_base_framework.framework_output, opal_output_verbose(20, opal_btl_base_framework.framework_output,
"mca: bml: Not using %s btl to %s on node %s " "mca: bml: Not using %s btl to %s on node %s "
@ -261,39 +300,44 @@ static int mca_bml_r2_add_procs( size_t nprocs,
proc->super.proc_hostname); proc->super.proc_hostname);
/* cache the endpoint on the proc */ /* cache the endpoint on the proc */
bml_btl = mca_bml_base_btl_array_insert(&bml_endpoint->btl_send); if (NULL == bml_btl || (bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity)) {
bml_btl->btl = btl; bml_btl = mca_bml_base_btl_array_insert(&bml_endpoint->btl_send);
bml_btl->btl_endpoint = btl_endpoints[p]; bml_btl->btl = btl;
bml_btl->btl_weight = 0; bml_btl->btl_endpoint = btl_endpoints[p];
bml_btl->btl_flags = btl->btl_flags; bml_btl->btl_weight = 0;
if( (bml_btl->btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put) ) { bml_btl->btl_flags = btl_flags;
opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for"
" the %s BTL without any PUT function attached. Discard the flag !",
bml_btl->btl->btl_component->btl_version.mca_component_name);
bml_btl->btl_flags ^= MCA_BTL_FLAGS_PUT;
}
if( (bml_btl->btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get) ) {
opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for"
" the %s BTL without any GET function attached. Discard the flag !",
bml_btl->btl->btl_component->btl_version.mca_component_name);
bml_btl->btl_flags ^= MCA_BTL_FLAGS_GET;
}
if( (bml_btl->btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0 ) {
/** /**
* If no protocol specified, we have 2 choices: we ignore the BTL * calculate the bitwise OR of the btl flags
* as we don't know which protocl to use, or we suppose that all
* BTLs support the send protocol.
*/ */
bml_btl->btl_flags |= MCA_BTL_FLAGS_SEND; bml_endpoint->btl_flags_or |= bml_btl->btl_flags;
} }
/**
* calculate the bitwise OR of the btl flags /* always add rdma endpoints */
*/ if ((btl_flags & MCA_BTL_FLAGS_RDMA) &&
bml_endpoint->btl_flags_or |= bml_btl->btl_flags; !((proc->super.proc_arch != ompi_proc_local_proc->super.proc_arch) &&
(0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) {
mca_bml_base_btl_t *bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma);
bml_btl_rdma->btl = btl;
bml_btl_rdma->btl_endpoint = btl_endpoints[p];
bml_btl_rdma->btl_weight = 0;
bml_btl_rdma->btl_flags = btl_flags;
if (bml_endpoint->btl_pipeline_send_length < btl->btl_rdma_pipeline_send_length) {
bml_endpoint->btl_pipeline_send_length = btl->btl_rdma_pipeline_send_length;
}
if (bml_endpoint->btl_send_limit < btl->btl_min_rdma_pipeline_size) {
bml_endpoint->btl_send_limit = btl->btl_min_rdma_pipeline_size;
}
}
/* This BTL is in use, allow the progress registration */ /* This BTL is in use, allow the progress registration */
btl_inuse++; btl_inuse++;
} }
} }
if(btl_inuse > 0 && NULL != btl->btl_component->btl_progress) { if(btl_inuse > 0 && NULL != btl->btl_component->btl_progress) {
size_t p; size_t p;
bool found = false; bool found = false;
@ -319,9 +363,8 @@ static int mca_bml_r2_add_procs( size_t nprocs,
mca_bml_base_endpoint_t* bml_endpoint = mca_bml_base_endpoint_t* bml_endpoint =
(mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
double total_bandwidth = 0; double total_bandwidth = 0;
uint32_t latency = 0xffffffff; uint32_t latency;
size_t n_index; size_t n_send, n_rdma;
size_t n_size;
/* skip over procs w/ no btl's registered */ /* skip over procs w/ no btl's registered */
if(NULL == bml_endpoint) { if(NULL == bml_endpoint) {
@ -335,28 +378,22 @@ static int mca_bml_r2_add_procs( size_t nprocs,
* weighting. Once the left over is smaller than this number we will * weighting. Once the left over is smaller than this number we will
* start using the weight to compute the correct amount. * start using the weight to compute the correct amount.
*/ */
n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); n_send = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
n_rdma = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
/* sort BTLs in descending order according to bandwidth value */ /* sort BTLs in descending order according to bandwidth value */
qsort(bml_endpoint->btl_send.bml_btls, n_size, qsort(bml_endpoint->btl_send.bml_btls, n_send,
sizeof(mca_bml_base_btl_t), btl_bandwidth_compare); sizeof(mca_bml_base_btl_t), btl_bandwidth_compare);
bml_endpoint->btl_rdma_index = 0; bml_endpoint->btl_rdma_index = 0;
for(n_index = 0; n_index < n_size; n_index++) {
mca_bml_base_btl_t* bml_btl = mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_send, &total_bandwidth, &latency);
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index);
mca_btl_base_module_t* btl = bml_btl->btl;
total_bandwidth += bml_btl->btl->btl_bandwidth;
if(btl->btl_latency < latency) {
latency = btl->btl_latency;
}
}
/* (1) set the weight of each btl as a percentage of overall bandwidth /* (1) set the weight of each btl as a percentage of overall bandwidth
* (2) copy all btl instances at the highest priority ranking into the * (2) copy all btl instances at the highest priority ranking into the
* list of btls used for first fragments * list of btls used for first fragments
*/ */
for(n_index = 0; n_index < n_size; n_index++) { for (size_t n_index = 0 ; n_index < n_send ; ++n_index) {
mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_t* bml_btl =
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index); mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index);
mca_btl_base_module_t *btl = bml_btl->btl; mca_btl_base_module_t *btl = bml_btl->btl;
@ -365,7 +402,7 @@ static int mca_bml_r2_add_procs( size_t nprocs,
if(btl->btl_bandwidth > 0) { if(btl->btl_bandwidth > 0) {
bml_btl->btl_weight = (float)(btl->btl_bandwidth / total_bandwidth); bml_btl->btl_weight = (float)(btl->btl_bandwidth / total_bandwidth);
} else { } else {
bml_btl->btl_weight = (float)(1.0 / n_size); bml_btl->btl_weight = (float)(1.0 / n_send);
} }
/* check to see if this r2 is already in the array of r2s /* check to see if this r2 is already in the array of r2s
@ -380,21 +417,24 @@ static int mca_bml_r2_add_procs( size_t nprocs,
/* set endpoint max send size as min of available btls */ /* set endpoint max send size as min of available btls */
if(bml_endpoint->btl_max_send_size > btl->btl_max_send_size) if(bml_endpoint->btl_max_send_size > btl->btl_max_send_size)
bml_endpoint->btl_max_send_size = btl->btl_max_send_size; bml_endpoint->btl_max_send_size = btl->btl_max_send_size;
}
/* check flags - is rdma prefered */ /* sort BTLs in descending order according to bandwidth value */
if ((btl->btl_flags & (MCA_BTL_FLAGS_PUT|MCA_BTL_FLAGS_GET)) && qsort(bml_endpoint->btl_rdma.bml_btls, n_rdma,
!((proc->super.proc_arch != ompi_proc_local_proc->super.proc_arch) && sizeof(mca_bml_base_btl_t), btl_bandwidth_compare);
(0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) {
mca_bml_base_btl_t* bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma);
mca_btl_base_module_t* btl_rdma = bml_btl->btl;
*bml_btl_rdma = *bml_btl; mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_rdma, &total_bandwidth, &latency);
if(bml_endpoint->btl_pipeline_send_length < btl_rdma->btl_rdma_pipeline_send_length) {
bml_endpoint->btl_pipeline_send_length = btl_rdma->btl_rdma_pipeline_send_length; /* set rdma btl weights */
} for (size_t n_index = 0 ; n_index < n_rdma ; ++n_index) {
if(bml_endpoint->btl_send_limit < btl_rdma->btl_min_rdma_pipeline_size) { mca_bml_base_btl_t *bml_btl =
bml_endpoint->btl_send_limit = btl_rdma->btl_min_rdma_pipeline_size; mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma, n_index);
}
/* compute weighting factor for this r2 */
if (bml_btl->btl->btl_bandwidth > 0.0) {
bml_btl->btl_weight = (float)(bml_btl->btl->btl_bandwidth / total_bandwidth);
} else {
bml_btl->btl_weight = (float)(1.0 / n_rdma);
} }
} }
} }

Просмотреть файл

@ -8,6 +8,8 @@
# Copyright (c) 2004-2005 The Regents of the University of California. # Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved. # All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2014 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #
# Additional copyrights may follow # Additional copyrights may follow
@ -15,39 +17,39 @@
# $HEADER$ # $HEADER$
# #
rdma_sources = \ pt2pt_sources = \
osc_rdma.h \ osc_pt2pt.h \
osc_rdma.c \ osc_pt2pt_module.c \
osc_rdma_comm.c \ osc_pt2pt_comm.c \
osc_rdma_component.c \ osc_pt2pt_component.c \
osc_rdma_data_move.h \ osc_pt2pt_data_move.h \
osc_rdma_data_move.c \ osc_pt2pt_data_move.c \
osc_rdma_frag.h \ osc_pt2pt_frag.h \
osc_rdma_frag.c \ osc_pt2pt_frag.c \
osc_rdma_header.h \ osc_pt2pt_header.h \
osc_rdma_obj_convert.h \ osc_pt2pt_obj_convert.h \
osc_rdma_request.h \ osc_pt2pt_request.h \
osc_rdma_request.c \ osc_pt2pt_request.c \
osc_rdma_active_target.c \ osc_pt2pt_active_target.c \
osc_rdma_passive_target.c osc_pt2pt_passive_target.c
# Make the output library in this directory, and name it either # Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la # mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds). # (for static builds).
if MCA_BUILD_ompi_osc_rdma_DSO if MCA_BUILD_ompi_osc_pt2pt_DSO
component_noinst = component_noinst =
component_install = mca_osc_rdma.la component_install = mca_osc_pt2pt.la
else else
component_noinst = libmca_osc_rdma.la component_noinst = libmca_osc_pt2pt.la
component_install = component_install =
endif endif
mcacomponentdir = $(ompilibdir) mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install) mcacomponent_LTLIBRARIES = $(component_install)
mca_osc_rdma_la_SOURCES = $(rdma_sources) mca_osc_pt2pt_la_SOURCES = $(pt2pt_sources)
mca_osc_rdma_la_LDFLAGS = -module -avoid-version mca_osc_pt2pt_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst) noinst_LTLIBRARIES = $(component_noinst)
libmca_osc_rdma_la_SOURCES = $(rdma_sources) libmca_osc_pt2pt_la_SOURCES = $(pt2pt_sources)
libmca_osc_rdma_la_LDFLAGS = -module -avoid-version libmca_osc_pt2pt_la_LDFLAGS = -module -avoid-version

20
ompi/mca/osc/pt2pt/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,20 @@
# -*- shell-script -*-
#
# Copyright (c) 2013 Sandia National Laboratories. All rights reserved.
# Copyright (c) 2014 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_ompi_osc_pt2pt_CONFIG([action-if-can-compile],
# [action-if-cant-compile])
# ------------------------------------------------
# We can always build, unless we were explicitly disabled.
AC_DEFUN([MCA_ompi_osc_pt2pt_CONFIG],[
AC_CONFIG_FILES([ompi/mca/osc/pt2pt/Makefile])
[$1]
])dnl

Просмотреть файл

@ -19,8 +19,8 @@
* $HEADER$ * $HEADER$
*/ */
#ifndef OMPI_OSC_RDMA_H #ifndef OMPI_OSC_PT2PT_H
#define OMPI_OSC_RDMA_H #define OMPI_OSC_PT2PT_H
#include "ompi_config.h" #include "ompi_config.h"
#include "opal/class/opal_list.h" #include "opal/class/opal_list.h"
@ -39,13 +39,13 @@
#include "ompi/mca/bml/bml.h" #include "ompi/mca/bml/bml.h"
#include "ompi/memchecker.h" #include "ompi/memchecker.h"
#include "osc_rdma_header.h" #include "osc_pt2pt_header.h"
BEGIN_C_DECLS BEGIN_C_DECLS
struct ompi_osc_rdma_frag_t; struct ompi_osc_pt2pt_frag_t;
struct ompi_osc_rdma_component_t { struct ompi_osc_pt2pt_component_t {
/** Extend the basic osc component interface */ /** Extend the basic osc component interface */
ompi_osc_base_component_t super; ompi_osc_base_component_t super;
@ -58,46 +58,45 @@ struct ompi_osc_rdma_component_t {
/** module count */ /** module count */
int module_count; int module_count;
/** free list of ompi_osc_rdma_frag_t structures */ /** free list of ompi_osc_pt2pt_frag_t structures */
opal_free_list_t frags; ompi_free_list_t frags;
/** Free list of requests */ /** Free list of requests */
ompi_free_list_t requests; ompi_free_list_t requests;
/** RDMA component buffer size */ /** PT2PT component buffer size */
unsigned int buffer_size; unsigned int buffer_size;
/** Lock for pending_operations */
opal_mutex_t pending_operations_lock;
/** List of operations that need to be processed */ /** List of operations that need to be processed */
opal_list_t pending_operations; opal_list_t pending_operations;
/** Is the progress function enabled? */ /** Is the progress function enabled? */
bool progress_enable; bool progress_enable;
/** List of requests that need to be freed */
opal_list_t request_gc;
/** List of buffers that need to be freed */
opal_list_t buffer_gc;
}; };
typedef struct ompi_osc_rdma_component_t ompi_osc_rdma_component_t; typedef struct ompi_osc_pt2pt_component_t ompi_osc_pt2pt_component_t;
struct ompi_osc_rdma_peer_t { struct ompi_osc_pt2pt_peer_t {
/** Pointer to the current send fragment for each outgoing target */ /** Pointer to the current send fragment for each outgoing target */
struct ompi_osc_rdma_frag_t *active_frag; struct ompi_osc_pt2pt_frag_t *active_frag;
/** Number of acks pending. New requests can not be sent out if there are /** Number of acks pending. New requests can not be sent out if there are
* acks pending (to fulfill the ordering constraints of accumulate) */ * acks pending (to fulfill the ordering constraints of accumulate) */
uint32_t num_acks_pending; uint32_t num_acks_pending;
int32_t passive_incoming_frag_count;
bool access_epoch; bool access_epoch;
bool eager_send_active;
}; };
typedef struct ompi_osc_rdma_peer_t ompi_osc_rdma_peer_t; typedef struct ompi_osc_pt2pt_peer_t ompi_osc_pt2pt_peer_t;
#define SEQ_INVALID 0xFFFFFFFFFFFFFFFFULL #define SEQ_INVALID 0xFFFFFFFFFFFFFFFFULL
/** Module structure. Exactly one of these is associated with each /** Module structure. Exactly one of these is associated with each
RDMA window */ PT2PT window */
struct ompi_osc_rdma_module_t { struct ompi_osc_pt2pt_module_t {
/** Extend the basic osc module interface */ /** Extend the basic osc module interface */
ompi_osc_base_module_t super; ompi_osc_base_module_t super;
@ -127,12 +126,15 @@ struct ompi_osc_rdma_module_t {
opal_mutex_t acc_lock; opal_mutex_t acc_lock;
/** peer data */ /** peer data */
ompi_osc_rdma_peer_t *peers; ompi_osc_pt2pt_peer_t *peers;
/** Nmber of communication fragments started for this epoch, by /** Nmber of communication fragments started for this epoch, by
peer. Not in peer data to make fence more manageable. */ peer. Not in peer data to make fence more manageable. */
uint32_t *epoch_outgoing_frag_count; uint32_t *epoch_outgoing_frag_count;
/** Lock for queued_frags */
opal_mutex_t queued_frags_lock;
/** List of full communication buffers queued to be sent. Should /** List of full communication buffers queued to be sent. Should
be maintained in order (at least in per-target order). */ be maintained in order (at least in per-target order). */
opal_list_t queued_frags; opal_list_t queued_frags;
@ -152,9 +154,6 @@ struct ompi_osc_rdma_module_t {
/* Next incoming buffer count at which we want a signal on cond */ /* Next incoming buffer count at which we want a signal on cond */
uint32_t active_incoming_frag_signal_count; uint32_t active_incoming_frag_signal_count;
uint32_t *passive_incoming_frag_count;
uint32_t *passive_incoming_frag_signal_count;
/* Number of flush ack requests send since beginning of time */ /* Number of flush ack requests send since beginning of time */
uint64_t flush_ack_requested_count; uint64_t flush_ack_requested_count;
/* Number of flush ack replies received since beginning of /* Number of flush ack replies received since beginning of
@ -171,8 +170,6 @@ struct ompi_osc_rdma_module_t {
/** Indicates the window is in an all access epoch (fence, lock_all) */ /** Indicates the window is in an all access epoch (fence, lock_all) */
bool all_access_epoch; bool all_access_epoch;
bool *passive_eager_send_active;
/* ********************* PWSC data ************************ */ /* ********************* PWSC data ************************ */
struct ompi_group_t *pw_group; struct ompi_group_t *pw_group;
struct ompi_group_t *sc_group; struct ompi_group_t *sc_group;
@ -189,9 +186,11 @@ struct ompi_osc_rdma_module_t {
/** Status of the local window lock. One of 0 (unlocked), /** Status of the local window lock. One of 0 (unlocked),
MPI_LOCK_EXCLUSIVE, or MPI_LOCK_SHARED. */ MPI_LOCK_EXCLUSIVE, or MPI_LOCK_SHARED. */
int lock_status; int32_t lock_status;
/** number of peers who hold a shared lock on the local window */
int32_t shared_count; /** lock for locks_pending list */
opal_mutex_t locks_pending_lock;
/** target side list of lock requests we couldn't satisfy yet */ /** target side list of lock requests we couldn't satisfy yet */
opal_list_t locks_pending; opal_list_t locks_pending;
@ -210,29 +209,38 @@ struct ompi_osc_rdma_module_t {
/* enforce pscw matching */ /* enforce pscw matching */
/** list of unmatched post messages */ /** list of unmatched post messages */
opal_list_t pending_posts; opal_list_t pending_posts;
};
typedef struct ompi_osc_rdma_module_t ompi_osc_rdma_module_t;
OMPI_MODULE_DECLSPEC extern ompi_osc_rdma_component_t mca_osc_rdma_component;
struct ompi_osc_rdma_pending_t { /** Lock for garbage collection lists */
opal_mutex_t gc_lock;
/** List of requests that need to be freed */
opal_list_t request_gc;
/** List of buffers that need to be freed */
opal_list_t buffer_gc;
};
typedef struct ompi_osc_pt2pt_module_t ompi_osc_pt2pt_module_t;
OMPI_MODULE_DECLSPEC extern ompi_osc_pt2pt_component_t mca_osc_pt2pt_component;
struct ompi_osc_pt2pt_pending_t {
opal_list_item_t super; opal_list_item_t super;
ompi_osc_rdma_module_t *module; ompi_osc_pt2pt_module_t *module;
int source; int source;
ompi_osc_rdma_header_t header; ompi_osc_pt2pt_header_t header;
}; };
typedef struct ompi_osc_rdma_pending_t ompi_osc_rdma_pending_t; typedef struct ompi_osc_pt2pt_pending_t ompi_osc_pt2pt_pending_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_pending_t); OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_pending_t);
#define GET_MODULE(win) ((ompi_osc_rdma_module_t*) win->w_osc_module) #define GET_MODULE(win) ((ompi_osc_pt2pt_module_t*) win->w_osc_module)
extern bool ompi_osc_rdma_no_locks; extern bool ompi_osc_pt2pt_no_locks;
int ompi_osc_rdma_attach(struct ompi_win_t *win, void *base, size_t len); int ompi_osc_pt2pt_attach(struct ompi_win_t *win, void *base, size_t len);
int ompi_osc_rdma_detach(struct ompi_win_t *win, void *base); int ompi_osc_pt2pt_detach(struct ompi_win_t *win, void *base);
int ompi_osc_rdma_free(struct ompi_win_t *win); int ompi_osc_pt2pt_free(struct ompi_win_t *win);
int ompi_osc_rdma_put(void *origin_addr, int ompi_osc_pt2pt_put(void *origin_addr,
int origin_count, int origin_count,
struct ompi_datatype_t *origin_dt, struct ompi_datatype_t *origin_dt,
int target, int target,
@ -241,7 +249,7 @@ int ompi_osc_rdma_put(void *origin_addr,
struct ompi_datatype_t *target_dt, struct ompi_datatype_t *target_dt,
struct ompi_win_t *win); struct ompi_win_t *win);
int ompi_osc_rdma_accumulate(void *origin_addr, int ompi_osc_pt2pt_accumulate(void *origin_addr,
int origin_count, int origin_count,
struct ompi_datatype_t *origin_dt, struct ompi_datatype_t *origin_dt,
int target, int target,
@ -251,7 +259,7 @@ int ompi_osc_rdma_accumulate(void *origin_addr,
struct ompi_op_t *op, struct ompi_op_t *op,
struct ompi_win_t *win); struct ompi_win_t *win);
int ompi_osc_rdma_get(void *origin_addr, int ompi_osc_pt2pt_get(void *origin_addr,
int origin_count, int origin_count,
struct ompi_datatype_t *origin_dt, struct ompi_datatype_t *origin_dt,
int target, int target,
@ -260,7 +268,7 @@ int ompi_osc_rdma_get(void *origin_addr,
struct ompi_datatype_t *target_dt, struct ompi_datatype_t *target_dt,
struct ompi_win_t *win); struct ompi_win_t *win);
int ompi_osc_rdma_compare_and_swap(void *origin_addr, int ompi_osc_pt2pt_compare_and_swap(void *origin_addr,
void *compare_addr, void *compare_addr,
void *result_addr, void *result_addr,
struct ompi_datatype_t *dt, struct ompi_datatype_t *dt,
@ -268,7 +276,7 @@ int ompi_osc_rdma_compare_and_swap(void *origin_addr,
OPAL_PTRDIFF_TYPE target_disp, OPAL_PTRDIFF_TYPE target_disp,
struct ompi_win_t *win); struct ompi_win_t *win);
int ompi_osc_rdma_fetch_and_op(void *origin_addr, int ompi_osc_pt2pt_fetch_and_op(void *origin_addr,
void *result_addr, void *result_addr,
struct ompi_datatype_t *dt, struct ompi_datatype_t *dt,
int target, int target,
@ -276,7 +284,7 @@ int ompi_osc_rdma_fetch_and_op(void *origin_addr,
struct ompi_op_t *op, struct ompi_op_t *op,
struct ompi_win_t *win); struct ompi_win_t *win);
int ompi_osc_rdma_get_accumulate(void *origin_addr, int ompi_osc_pt2pt_get_accumulate(void *origin_addr,
int origin_count, int origin_count,
struct ompi_datatype_t *origin_datatype, struct ompi_datatype_t *origin_datatype,
void *result_addr, void *result_addr,
@ -289,7 +297,7 @@ int ompi_osc_rdma_get_accumulate(void *origin_addr,
struct ompi_op_t *op, struct ompi_op_t *op,
struct ompi_win_t *win); struct ompi_win_t *win);
int ompi_osc_rdma_rput(void *origin_addr, int ompi_osc_pt2pt_rput(void *origin_addr,
int origin_count, int origin_count,
struct ompi_datatype_t *origin_dt, struct ompi_datatype_t *origin_dt,
int target, int target,
@ -299,7 +307,7 @@ int ompi_osc_rdma_rput(void *origin_addr,
struct ompi_win_t *win, struct ompi_win_t *win,
struct ompi_request_t **request); struct ompi_request_t **request);
int ompi_osc_rdma_rget(void *origin_addr, int ompi_osc_pt2pt_rget(void *origin_addr,
int origin_count, int origin_count,
struct ompi_datatype_t *origin_dt, struct ompi_datatype_t *origin_dt,
int target, int target,
@ -309,7 +317,7 @@ int ompi_osc_rdma_rget(void *origin_addr,
struct ompi_win_t *win, struct ompi_win_t *win,
struct ompi_request_t **request); struct ompi_request_t **request);
int ompi_osc_rdma_raccumulate(void *origin_addr, int ompi_osc_pt2pt_raccumulate(void *origin_addr,
int origin_count, int origin_count,
struct ompi_datatype_t *origin_dt, struct ompi_datatype_t *origin_dt,
int target, int target,
@ -320,7 +328,7 @@ int ompi_osc_rdma_raccumulate(void *origin_addr,
struct ompi_win_t *win, struct ompi_win_t *win,
struct ompi_request_t **request); struct ompi_request_t **request);
int ompi_osc_rdma_rget_accumulate(void *origin_addr, int ompi_osc_pt2pt_rget_accumulate(void *origin_addr,
int origin_count, int origin_count,
struct ompi_datatype_t *origin_datatype, struct ompi_datatype_t *origin_datatype,
void *result_addr, void *result_addr,
@ -334,51 +342,51 @@ int ompi_osc_rdma_rget_accumulate(void *origin_addr,
struct ompi_win_t *win, struct ompi_win_t *win,
struct ompi_request_t **request); struct ompi_request_t **request);
int ompi_osc_rdma_fence(int assert, struct ompi_win_t *win); int ompi_osc_pt2pt_fence(int assert, struct ompi_win_t *win);
/* received a post message */ /* received a post message */
int osc_rdma_incoming_post (ompi_osc_rdma_module_t *module, int source); int osc_pt2pt_incoming_post (ompi_osc_pt2pt_module_t *module, int source);
int ompi_osc_rdma_start(struct ompi_group_t *group, int ompi_osc_pt2pt_start(struct ompi_group_t *group,
int assert, int assert,
struct ompi_win_t *win); struct ompi_win_t *win);
int ompi_osc_rdma_complete(struct ompi_win_t *win); int ompi_osc_pt2pt_complete(struct ompi_win_t *win);
int ompi_osc_rdma_post(struct ompi_group_t *group, int ompi_osc_pt2pt_post(struct ompi_group_t *group,
int assert, int assert,
struct ompi_win_t *win); struct ompi_win_t *win);
int ompi_osc_rdma_wait(struct ompi_win_t *win); int ompi_osc_pt2pt_wait(struct ompi_win_t *win);
int ompi_osc_rdma_test(struct ompi_win_t *win, int ompi_osc_pt2pt_test(struct ompi_win_t *win,
int *flag); int *flag);
int ompi_osc_rdma_lock(int lock_type, int ompi_osc_pt2pt_lock(int lock_type,
int target, int target,
int assert, int assert,
struct ompi_win_t *win); struct ompi_win_t *win);
int ompi_osc_rdma_unlock(int target, int ompi_osc_pt2pt_unlock(int target,
struct ompi_win_t *win); struct ompi_win_t *win);
int ompi_osc_rdma_lock_all(int assert, int ompi_osc_pt2pt_lock_all(int assert,
struct ompi_win_t *win); struct ompi_win_t *win);
int ompi_osc_rdma_unlock_all(struct ompi_win_t *win); int ompi_osc_pt2pt_unlock_all(struct ompi_win_t *win);
int ompi_osc_rdma_sync(struct ompi_win_t *win); int ompi_osc_pt2pt_sync(struct ompi_win_t *win);
int ompi_osc_rdma_flush(int target, int ompi_osc_pt2pt_flush(int target,
struct ompi_win_t *win); struct ompi_win_t *win);
int ompi_osc_rdma_flush_all(struct ompi_win_t *win); int ompi_osc_pt2pt_flush_all(struct ompi_win_t *win);
int ompi_osc_rdma_flush_local(int target, int ompi_osc_pt2pt_flush_local(int target,
struct ompi_win_t *win); struct ompi_win_t *win);
int ompi_osc_rdma_flush_local_all(struct ompi_win_t *win); int ompi_osc_pt2pt_flush_local_all(struct ompi_win_t *win);
int ompi_osc_rdma_set_info(struct ompi_win_t *win, struct ompi_info_t *info); int ompi_osc_pt2pt_set_info(struct ompi_win_t *win, struct ompi_info_t *info);
int ompi_osc_rdma_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used); int ompi_osc_pt2pt_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used);
int ompi_osc_rdma_component_irecv(ompi_osc_rdma_module_t *module, int ompi_osc_pt2pt_component_irecv(ompi_osc_pt2pt_module_t *module,
void *buf, void *buf,
size_t count, size_t count,
struct ompi_datatype_t *datatype, struct ompi_datatype_t *datatype,
@ -386,7 +394,7 @@ int ompi_osc_rdma_component_irecv(ompi_osc_rdma_module_t *module,
int tag, int tag,
struct ompi_communicator_t *comm); struct ompi_communicator_t *comm);
int ompi_osc_rdma_component_isend(ompi_osc_rdma_module_t *module, int ompi_osc_pt2pt_component_isend(ompi_osc_pt2pt_module_t *module,
void *buf, void *buf,
size_t count, size_t count,
struct ompi_datatype_t *datatype, struct ompi_datatype_t *datatype,
@ -395,16 +403,16 @@ int ompi_osc_rdma_component_isend(ompi_osc_rdma_module_t *module,
struct ompi_communicator_t *comm); struct ompi_communicator_t *comm);
/** /**
* ompi_osc_rdma_progress_pending_acc: * ompi_osc_pt2pt_progress_pending_acc:
* *
* @short Progress one pending accumulation or compare and swap operation. * @short Progress one pending accumulation or compare and swap operation.
* *
* @param[in] module - OSC RDMA module * @param[in] module - OSC PT2PT module
* *
* @long If the accumulation lock can be aquired progress one pending * @long If the accumulation lock can be aquired progress one pending
* accumulate or compare and swap operation. * accumulate or compare and swap operation.
*/ */
int ompi_osc_rdma_progress_pending_acc (ompi_osc_rdma_module_t *module); int ompi_osc_pt2pt_progress_pending_acc (ompi_osc_pt2pt_module_t *module);
/** /**
@ -412,7 +420,7 @@ int ompi_osc_rdma_progress_pending_acc (ompi_osc_rdma_module_t *module);
* *
* @short Increment incoming completeion count. * @short Increment incoming completeion count.
* *
* @param[in] module - OSC RDMA module * @param[in] module - OSC PT2PT module
* @param[in] source - Passive target source or MPI_PROC_NULL (active target) * @param[in] source - Passive target source or MPI_PROC_NULL (active target)
* *
* @long This function incremements either the passive or active incoming counts. * @long This function incremements either the passive or active incoming counts.
@ -420,7 +428,7 @@ int ompi_osc_rdma_progress_pending_acc (ompi_osc_rdma_module_t *module);
* This function uses atomics if necessary so it is not necessary to hold * This function uses atomics if necessary so it is not necessary to hold
* the module lock before calling this function. * the module lock before calling this function.
*/ */
static inline void mark_incoming_completion (ompi_osc_rdma_module_t *module, int source) static inline void mark_incoming_completion (ompi_osc_pt2pt_module_t *module, int source)
{ {
if (MPI_PROC_NULL == source) { if (MPI_PROC_NULL == source) {
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
@ -431,11 +439,12 @@ static inline void mark_incoming_completion (ompi_osc_rdma_module_t *module, int
opal_condition_broadcast(&module->cond); opal_condition_broadcast(&module->cond);
} }
} else { } else {
ompi_osc_pt2pt_peer_t *peer = module->peers + source;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"mark_incoming_completion marking passive incoming complete. source = %d, count = %d", "mark_incoming_completion marking passive incoming complete. source = %d, count = %d",
source, (int) module->passive_incoming_frag_count[source] + 1)); source, (int) peer->passive_incoming_frag_count + 1));
OPAL_THREAD_ADD32((int32_t *) (module->passive_incoming_frag_count + source), 1); OPAL_THREAD_ADD32((int32_t *) &peer->passive_incoming_frag_count, 1);
if (module->passive_incoming_frag_count[source] >= module->passive_incoming_frag_signal_count[source]) { if (0 == peer->passive_incoming_frag_count) {
opal_condition_broadcast(&module->cond); opal_condition_broadcast(&module->cond);
} }
} }
@ -446,7 +455,7 @@ static inline void mark_incoming_completion (ompi_osc_rdma_module_t *module, int
* *
* @short Increment outgoing count. * @short Increment outgoing count.
* *
* @param[in] module - OSC RDMA module * @param[in] module - OSC PT2PT module
* *
* @long This function is used to signal that an outgoing send is complete. It * @long This function is used to signal that an outgoing send is complete. It
* incrememnts only the outgoing fragment count and signals the module * incrememnts only the outgoing fragment count and signals the module
@ -454,7 +463,7 @@ static inline void mark_incoming_completion (ompi_osc_rdma_module_t *module, int
* uses atomics if necessary so it is not necessary to hold the module * uses atomics if necessary so it is not necessary to hold the module
* lock before calling this function. * lock before calling this function.
*/ */
static inline void mark_outgoing_completion (ompi_osc_rdma_module_t *module) static inline void mark_outgoing_completion (ompi_osc_pt2pt_module_t *module)
{ {
OPAL_THREAD_ADD32((int32_t *) &module->outgoing_frag_count, 1); OPAL_THREAD_ADD32((int32_t *) &module->outgoing_frag_count, 1);
if (module->outgoing_frag_count >= module->outgoing_frag_signal_count) { if (module->outgoing_frag_count >= module->outgoing_frag_signal_count) {
@ -467,14 +476,14 @@ static inline void mark_outgoing_completion (ompi_osc_rdma_module_t *module)
* *
* @short Increment outgoing signal counters. * @short Increment outgoing signal counters.
* *
* @param[in] module - OSC RDMA module * @param[in] module - OSC PT2PT module
* @param[in] target - Passive target rank or MPI_PROC_NULL (active target) * @param[in] target - Passive target rank or MPI_PROC_NULL (active target)
* @param[in] count - Number of outgoing messages to signal. * @param[in] count - Number of outgoing messages to signal.
* *
* @long This function uses atomics if necessary so it is not necessary to hold * @long This function uses atomics if necessary so it is not necessary to hold
* the module lock before calling this function. * the module lock before calling this function.
*/ */
static inline void ompi_osc_signal_outgoing (ompi_osc_rdma_module_t *module, int target, int count) static inline void ompi_osc_signal_outgoing (ompi_osc_pt2pt_module_t *module, int target, int count)
{ {
OPAL_THREAD_ADD32((int32_t *) &module->outgoing_frag_signal_count, count); OPAL_THREAD_ADD32((int32_t *) &module->outgoing_frag_signal_count, count);
if (MPI_PROC_NULL != target) { if (MPI_PROC_NULL != target) {
@ -486,7 +495,7 @@ static inline void ompi_osc_signal_outgoing (ompi_osc_rdma_module_t *module, int
} }
/** /**
* osc_rdma_copy_on_recv: * osc_pt2pt_copy_on_recv:
* *
* @short Helper function. Copies data from source to target through the * @short Helper function. Copies data from source to target through the
* convertor. * convertor.
@ -502,7 +511,7 @@ static inline void ompi_osc_signal_outgoing (ompi_osc_rdma_module_t *module, int
* buffer. The copy is done with a convertor generated from proc, * buffer. The copy is done with a convertor generated from proc,
* datatype, and count. * datatype, and count.
*/ */
static inline void osc_rdma_copy_on_recv (void *target, void *source, size_t source_len, ompi_proc_t *proc, static inline void osc_pt2pt_copy_on_recv (void *target, void *source, size_t source_len, ompi_proc_t *proc,
int count, ompi_datatype_t *datatype) int count, ompi_datatype_t *datatype)
{ {
opal_convertor_t convertor; opal_convertor_t convertor;
@ -530,7 +539,7 @@ static inline void osc_rdma_copy_on_recv (void *target, void *source, size_t sou
} }
/** /**
* osc_rdma_copy_for_send: * osc_pt2pt_copy_for_send:
* *
* @short: Helper function. Copies data from source to target through the * @short: Helper function. Copies data from source to target through the
* convertor. * convertor.
@ -546,7 +555,7 @@ static inline void osc_rdma_copy_on_recv (void *target, void *source, size_t sou
* buffer. The copy is done with a convertor generated from proc, * buffer. The copy is done with a convertor generated from proc,
* datatype, and count. * datatype, and count.
*/ */
static inline void osc_rdma_copy_for_send (void *target, size_t target_len, void *source, ompi_proc_t *proc, static inline void osc_pt2pt_copy_for_send (void *target, size_t target_len, void *source, ompi_proc_t *proc,
int count, ompi_datatype_t *datatype) int count, ompi_datatype_t *datatype)
{ {
opal_convertor_t convertor; opal_convertor_t convertor;
@ -567,7 +576,7 @@ static inline void osc_rdma_copy_for_send (void *target, size_t target_len, void
} }
/** /**
* osc_rdma_request_gc_clean: * osc_pt2pt_request_gc_clean:
* *
* @short Release finished PML requests and accumulate buffers. * @short Release finished PML requests and accumulate buffers.
* *
@ -576,71 +585,77 @@ static inline void osc_rdma_copy_for_send (void *target, size_t target_len, void
* and buffers on the module's garbage collection lists and release then * and buffers on the module's garbage collection lists and release then
* at a later time. * at a later time.
*/ */
static inline void osc_rdma_gc_clean (void) static inline void osc_pt2pt_gc_clean (ompi_osc_pt2pt_module_t *module)
{ {
ompi_request_t *request; ompi_request_t *request;
opal_list_item_t *item; opal_list_item_t *item;
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock); OPAL_THREAD_LOCK(&module->gc_lock);
while (NULL != (request = (ompi_request_t *) opal_list_remove_first (&mca_osc_rdma_component.request_gc))) { while (NULL != (request = (ompi_request_t *) opal_list_remove_first (&module->request_gc))) {
OPAL_THREAD_UNLOCK(&module->gc_lock);
ompi_request_free (&request); ompi_request_free (&request);
OPAL_THREAD_LOCK(&module->gc_lock);
} }
while (NULL != (item = opal_list_remove_first (&mca_osc_rdma_component.buffer_gc))) { while (NULL != (item = opal_list_remove_first (&module->buffer_gc))) {
OBJ_RELEASE(item); OBJ_RELEASE(item);
} }
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock); OPAL_THREAD_UNLOCK(&module->gc_lock);
} }
static inline void osc_rdma_gc_add_request (ompi_request_t *request) static inline void osc_pt2pt_gc_add_request (ompi_osc_pt2pt_module_t *module, ompi_request_t *request)
{ {
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock); OPAL_THREAD_SCOPED_LOCK(&module->gc_lock,
opal_list_append (&mca_osc_rdma_component.request_gc, (opal_list_item_t *) request); opal_list_append (&module->request_gc, (opal_list_item_t *) request));
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
} }
static inline void osc_rdma_gc_add_buffer (opal_list_item_t *buffer) static inline void osc_pt2pt_gc_add_buffer (ompi_osc_pt2pt_module_t *module, opal_list_item_t *buffer)
{ {
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock); OPAL_THREAD_SCOPED_LOCK(&module->gc_lock,
opal_list_append (&mca_osc_rdma_component.buffer_gc, buffer); opal_list_append (&module->buffer_gc, buffer));
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
} }
#define OSC_RDMA_FRAG_TAG 0x10000 static inline void osc_pt2pt_add_pending (ompi_osc_pt2pt_pending_t *pending)
#define OSC_RDMA_FRAG_MASK 0x0ffff {
OPAL_THREAD_SCOPED_LOCK(&mca_osc_pt2pt_component.pending_operations_lock,
opal_list_append (&mca_osc_pt2pt_component.pending_operations, &pending->super));
}
#define OSC_PT2PT_FRAG_TAG 0x10000
#define OSC_PT2PT_FRAG_MASK 0x0ffff
/** /**
* get_tag: * get_tag:
* *
* @short Get a send/recv tag for large memory operations. * @short Get a send/recv tag for large memory operations.
* *
* @param[in] module - OSC RDMA module * @param[in] module - OSC PT2PT module
* *
* @long This function aquires a 16-bit tag for use with large memory operations. The * @long This function aquires a 16-bit tag for use with large memory operations. The
* tag will be odd or even depending on if this is in a passive target access * tag will be odd or even depending on if this is in a passive target access
* or not. * or not.
*/ */
static inline int get_tag(ompi_osc_rdma_module_t *module) static inline int get_tag(ompi_osc_pt2pt_module_t *module)
{ {
/* the LSB of the tag is used be the receiver to determine if the /* the LSB of the tag is used be the receiver to determine if the
message is a passive or active target (ie, where to mark message is a passive or active target (ie, where to mark
completion). */ completion). */
int tmp = module->tag_counter + !!(module->passive_target_access_epoch); int tmp = module->tag_counter + !!(module->passive_target_access_epoch);
module->tag_counter = (module->tag_counter + 2) & OSC_RDMA_FRAG_MASK; module->tag_counter = (module->tag_counter + 2) & OSC_PT2PT_FRAG_MASK;
return tmp; return tmp;
} }
/** /**
* ompi_osc_rdma_accumulate_lock: * ompi_osc_pt2pt_accumulate_lock:
* *
* @short Internal function that spins until the accumulation lock has * @short Internal function that spins until the accumulation lock has
* been aquired. * been aquired.
* *
* @param[in] module - OSC RDMA module * @param[in] module - OSC PT2PT module
* *
* @returns 0 * @returns 0
* *
@ -648,9 +663,9 @@ static inline int get_tag(ompi_osc_rdma_module_t *module)
* behavior is only acceptable from a user-level call as blocking in a * behavior is only acceptable from a user-level call as blocking in a
* callback may cause deadlock. If a callback needs the accumulate lock and * callback may cause deadlock. If a callback needs the accumulate lock and
* it is not available it should be placed on the pending_acc list of the * it is not available it should be placed on the pending_acc list of the
* module. It will be released by ompi_osc_rdma_accumulate_unlock(). * module. It will be released by ompi_osc_pt2pt_accumulate_unlock().
*/ */
static inline int ompi_osc_rdma_accumulate_lock (ompi_osc_rdma_module_t *module) static inline int ompi_osc_pt2pt_accumulate_lock (ompi_osc_pt2pt_module_t *module)
{ {
while (opal_atomic_trylock (&module->accumulate_lock)) { while (opal_atomic_trylock (&module->accumulate_lock)) {
opal_progress (); opal_progress ();
@ -660,11 +675,11 @@ static inline int ompi_osc_rdma_accumulate_lock (ompi_osc_rdma_module_t *module)
} }
/** /**
* ompi_osc_rdma_accumulate_trylock: * ompi_osc_pt2pt_accumulate_trylock:
* *
* @short Try to aquire the accumulation lock. * @short Try to aquire the accumulation lock.
* *
* @param[in] module - OSC RDMA module * @param[in] module - OSC PT2PT module
* *
* @returns 0 if the accumulation lock was aquired * @returns 0 if the accumulation lock was aquired
* @returns 1 if the lock was not available * @returns 1 if the lock was not available
@ -672,34 +687,34 @@ static inline int ompi_osc_rdma_accumulate_lock (ompi_osc_rdma_module_t *module)
* @long This function will try to aquire the accumulation lock. This function * @long This function will try to aquire the accumulation lock. This function
* is safe to call from a callback. * is safe to call from a callback.
*/ */
static inline int ompi_osc_rdma_accumulate_trylock (ompi_osc_rdma_module_t *module) static inline int ompi_osc_pt2pt_accumulate_trylock (ompi_osc_pt2pt_module_t *module)
{ {
return opal_atomic_trylock (&module->accumulate_lock); return opal_atomic_trylock (&module->accumulate_lock);
} }
/** /**
* ompi_osc_rdma_accumulate_unlock: * ompi_osc_pt2pt_accumulate_unlock:
* *
* @short Unlock the accumulation lock and release a pending accumulation operation. * @short Unlock the accumulation lock and release a pending accumulation operation.
* *
* @param[in] module - OSC RDMA module * @param[in] module - OSC PT2PT module
* *
* @long This function unlocks the accumulation lock and release a single pending * @long This function unlocks the accumulation lock and release a single pending
* accumulation operation if one exists. This function may be called recursively. * accumulation operation if one exists. This function may be called recursively.
*/ */
static inline void ompi_osc_rdma_accumulate_unlock (ompi_osc_rdma_module_t *module) static inline void ompi_osc_pt2pt_accumulate_unlock (ompi_osc_pt2pt_module_t *module)
{ {
opal_atomic_unlock (&module->accumulate_lock); opal_atomic_unlock (&module->accumulate_lock);
if (0 != opal_list_get_size (&module->pending_acc)) { if (0 != opal_list_get_size (&module->pending_acc)) {
ompi_osc_rdma_progress_pending_acc (module); ompi_osc_pt2pt_progress_pending_acc (module);
} }
} }
static inline bool ompi_osc_rdma_check_access_epoch (ompi_osc_rdma_module_t *module, int rank) static inline bool ompi_osc_pt2pt_check_access_epoch (ompi_osc_pt2pt_module_t *module, int rank)
{ {
return module->all_access_epoch || module->peers[rank].access_epoch; return module->all_access_epoch || module->peers[rank].access_epoch;
} }
END_C_DECLS END_C_DECLS
#endif /* OMPI_OSC_RDMA_H */ #endif /* OMPI_OSC_PT2PT_H */

Просмотреть файл

@ -21,10 +21,10 @@
#include "ompi_config.h" #include "ompi_config.h"
#include "osc_rdma.h" #include "osc_pt2pt.h"
#include "osc_rdma_header.h" #include "osc_pt2pt_header.h"
#include "osc_rdma_data_move.h" #include "osc_pt2pt_data_move.h"
#include "osc_rdma_frag.h" #include "osc_pt2pt_frag.h"
#include "mpi.h" #include "mpi.h"
#include "opal/runtime/opal_progress.h" #include "opal/runtime/opal_progress.h"
@ -33,19 +33,19 @@
#include "ompi/mca/osc/base/base.h" #include "ompi/mca/osc/base/base.h"
/** /**
* ompi_osc_rdma_pending_post_t: * ompi_osc_pt2pt_pending_post_t:
* *
* Describes a post operation that was encountered outside its * Describes a post operation that was encountered outside its
* matching start operation. * matching start operation.
*/ */
struct ompi_osc_rdma_pending_post_t { struct ompi_osc_pt2pt_pending_post_t {
opal_list_item_t super; opal_list_item_t super;
int rank; int rank;
}; };
typedef struct ompi_osc_rdma_pending_post_t ompi_osc_rdma_pending_post_t; typedef struct ompi_osc_pt2pt_pending_post_t ompi_osc_pt2pt_pending_post_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_pending_post_t); OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_pending_post_t);
OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_post_t, opal_list_item_t, NULL, NULL); OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_pending_post_t, opal_list_item_t, NULL, NULL);
static bool group_contains_proc (ompi_group_t *group, ompi_proc_t *proc) static bool group_contains_proc (ompi_group_t *group, ompi_proc_t *proc)
{ {
@ -64,7 +64,7 @@ static bool group_contains_proc (ompi_group_t *group, ompi_proc_t *proc)
} }
static int* static int*
get_comm_ranks(ompi_osc_rdma_module_t *module, get_comm_ranks(ompi_osc_pt2pt_module_t *module,
ompi_group_t *sub_group) ompi_group_t *sub_group)
{ {
int *ranks1 = NULL, *ranks2 = NULL; int *ranks1 = NULL, *ranks2 = NULL;
@ -100,14 +100,14 @@ get_comm_ranks(ompi_osc_rdma_module_t *module,
} }
int int
ompi_osc_rdma_fence(int assert, ompi_win_t *win) ompi_osc_pt2pt_fence(int assert, ompi_win_t *win)
{ {
ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
uint32_t incoming_reqs; uint32_t incoming_reqs;
int ret = OMPI_SUCCESS; int ret = OMPI_SUCCESS;
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"osc rdma: fence start")); "osc pt2pt: fence start"));
/* can't enter an active target epoch when in a passive target epoch */ /* can't enter an active target epoch when in a passive target epoch */
if (module->passive_target_access_epoch) { if (module->passive_target_access_epoch) {
@ -122,34 +122,36 @@ ompi_osc_rdma_fence(int assert, ompi_win_t *win)
/* short-circuit the noprecede case */ /* short-circuit the noprecede case */
if (0 != (assert & MPI_MODE_NOPRECEDE)) { if (0 != (assert & MPI_MODE_NOPRECEDE)) {
ret = module->comm->c_coll.coll_barrier(module->comm,
module->comm->c_coll.coll_barrier_module);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: fence end (short circuit)")); "osc pt2pt: fence end (short circuit)"));
return ret; return ret;
} }
/* try to start all the requests. */ /* try to start all requests. */
ret = ompi_osc_rdma_frag_flush_all(module); ret = ompi_osc_pt2pt_frag_flush_all(module);
if (OMPI_SUCCESS != ret) goto cleanup; if (OMPI_SUCCESS != ret) {
return ret;
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: fence done sending")); "osc pt2pt: fence done sending"));
/* find out how much data everyone is going to send us. */ /* find out how much data everyone is going to send us. */
ret = module->comm->c_coll.coll_reduce_scatter_block (module->epoch_outgoing_frag_count, ret = module->comm->c_coll.coll_reduce_scatter_block (module->epoch_outgoing_frag_count,
&incoming_reqs, 1, MPI_UINT32_T, &incoming_reqs, 1, MPI_UINT32_T,
MPI_SUM, module->comm, MPI_SUM, module->comm,
module->comm->c_coll.coll_reduce_scatter_block_module); module->comm->c_coll.coll_reduce_scatter_block_module);
if (OMPI_SUCCESS != ret) goto cleanup; if (OMPI_SUCCESS != ret) {
OPAL_THREAD_UNLOCK(&module->lock);
return ret;
}
OPAL_THREAD_LOCK(&module->lock); OPAL_THREAD_LOCK(&module->lock);
bzero(module->epoch_outgoing_frag_count, bzero(module->epoch_outgoing_frag_count,
sizeof(uint32_t) * ompi_comm_size(module->comm)); sizeof(uint32_t) * ompi_comm_size(module->comm));
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: fence expects %d requests", "osc pt2pt: fence expects %d requests",
incoming_reqs)); incoming_reqs));
/* set our complete condition for incoming requests */ /* set our complete condition for incoming requests */
@ -161,32 +163,31 @@ ompi_osc_rdma_fence(int assert, ompi_win_t *win)
opal_condition_wait(&module->cond, &module->lock); opal_condition_wait(&module->cond, &module->lock);
} }
ret = OMPI_SUCCESS; module->active_incoming_frag_signal_count = 0;
if (assert & MPI_MODE_NOSUCCEED) { if (assert & MPI_MODE_NOSUCCEED) {
/* as specified in MPI-3 p 438 3-5 the fence can end an epoch. it isn't explicitly /* as specified in MPI-3 p 438 3-5 the fence can end an epoch. it isn't explicitly
* stated that MPI_MODE_NOSUCCEED ends the epoch but it is a safe assumption. */ * stated that MPI_MODE_NOSUCCEED ends the epoch but it is a safe assumption. */
module->active_eager_send_active = false; module->active_eager_send_active = false;
module->all_access_epoch = false; module->all_access_epoch = false;
} }
opal_condition_broadcast (&module->cond);
cleanup:
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"osc rdma: fence end: %d", ret));
OPAL_THREAD_UNLOCK(&module->lock); OPAL_THREAD_UNLOCK(&module->lock);
return ret; OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"osc pt2pt: fence end: %d", ret));
return OMPI_SUCCESS;
} }
int int
ompi_osc_rdma_start(ompi_group_t *group, ompi_osc_pt2pt_start(ompi_group_t *group,
int assert, int assert,
ompi_win_t *win) ompi_win_t *win)
{ {
ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
ompi_osc_rdma_pending_post_t *pending_post, *next; ompi_osc_pt2pt_pending_post_t *pending_post, *next;
int group_size; int group_size;
int *ranks; int *ranks;
@ -209,7 +210,7 @@ ompi_osc_rdma_start(ompi_group_t *group,
group_size = ompi_group_size (module->sc_group); group_size = ompi_group_size (module->sc_group);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_start entering with group size %d...", "ompi_osc_pt2pt_start entering with group size %d...",
group_size)); group_size));
ranks = get_comm_ranks(module, module->sc_group); ranks = get_comm_ranks(module, module->sc_group);
@ -222,13 +223,17 @@ ompi_osc_rdma_start(ompi_group_t *group,
free (ranks); free (ranks);
OPAL_LIST_FOREACH_SAFE(pending_post, next, &module->pending_posts, ompi_osc_rdma_pending_post_t) { OPAL_LIST_FOREACH_SAFE(pending_post, next, &module->pending_posts, ompi_osc_pt2pt_pending_post_t) {
ompi_proc_t *pending_proc = ompi_comm_peer_lookup (module->comm, pending_post->rank); ompi_proc_t *pending_proc = ompi_comm_peer_lookup (module->comm, pending_post->rank);
if (group_contains_proc (module->sc_group, pending_proc)) { if (group_contains_proc (module->sc_group, pending_proc)) {
ompi_osc_pt2pt_peer_t *peer = module->peers + pending_post->rank;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "Consumed unexpected post message from %d", OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "Consumed unexpected post message from %d",
pending_post->rank)); pending_post->rank));
++module->num_post_msgs; ++module->num_post_msgs;
peer->eager_send_active = true;
opal_list_remove_item (&module->pending_posts, &pending_post->super); opal_list_remove_item (&module->pending_posts, &pending_post->super);
OBJ_RELEASE(pending_post); OBJ_RELEASE(pending_post);
} }
@ -254,7 +259,7 @@ ompi_osc_rdma_start(ompi_group_t *group,
} }
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_start complete")); "ompi_osc_pt2pt_start complete"));
OPAL_THREAD_UNLOCK(&module->lock); OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_SUCCESS; return OMPI_SUCCESS;
@ -262,11 +267,11 @@ ompi_osc_rdma_start(ompi_group_t *group,
int int
ompi_osc_rdma_complete(ompi_win_t *win) ompi_osc_pt2pt_complete(ompi_win_t *win)
{ {
ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
ompi_osc_rdma_header_complete_t complete_req; ompi_osc_pt2pt_header_complete_t complete_req;
ompi_osc_rdma_peer_t *peer; ompi_osc_pt2pt_peer_t *peer;
int ret = OMPI_SUCCESS; int ret = OMPI_SUCCESS;
int i; int i;
int *ranks = NULL; int *ranks = NULL;
@ -274,7 +279,7 @@ ompi_osc_rdma_complete(ompi_win_t *win)
int my_rank = ompi_comm_rank (module->comm); int my_rank = ompi_comm_rank (module->comm);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_complete entering...")); "ompi_osc_pt2pt_complete entering..."));
if (NULL == module->sc_group) { if (NULL == module->sc_group) {
return OMPI_ERR_RMA_SYNC; return OMPI_ERR_RMA_SYNC;
@ -291,9 +296,10 @@ ompi_osc_rdma_complete(ompi_win_t *win)
"waiting for post messages. num_post_msgs = %d", module->num_post_msgs)); "waiting for post messages. num_post_msgs = %d", module->num_post_msgs));
opal_condition_wait(&module->cond, &module->lock); opal_condition_wait(&module->cond, &module->lock);
} }
OPAL_THREAD_UNLOCK(&module->lock);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_complete sending complete message")); "ompi_osc_pt2pt_complete sending complete message"));
/* for each process in group, send a control message with number /* for each process in group, send a control message with number
of updates coming, then start all the requests. Note that the of updates coming, then start all the requests. Note that the
@ -302,38 +308,39 @@ ompi_osc_rdma_complete(ompi_win_t *win)
At the same time, clean out the outgoing count for the next At the same time, clean out the outgoing count for the next
round. */ round. */
OPAL_THREAD_UNLOCK(&module->lock);
for (i = 0 ; i < ompi_group_size(module->sc_group) ; ++i) { for (i = 0 ; i < ompi_group_size(module->sc_group) ; ++i) {
if (my_rank == ranks[i]) { if (my_rank == ranks[i]) {
/* shortcut for self */ /* shortcut for self */
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_rdma_complete self complete")); OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_pt2pt_complete self complete"));
module->num_complete_msgs++; module->num_complete_msgs++;
continue; continue;
} }
complete_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_COMPLETE; complete_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_COMPLETE;
complete_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID; complete_req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
complete_req.frag_count = module->epoch_outgoing_frag_count[ranks[i]]; complete_req.frag_count = module->epoch_outgoing_frag_count[ranks[i]];
peer = module->peers + ranks[i]; peer = module->peers + ranks[i];
peer->access_epoch = false; peer->access_epoch = false;
ret = ompi_osc_rdma_control_send(module, ret = ompi_osc_pt2pt_control_send(module,
ranks[i], ranks[i],
&complete_req, &complete_req,
sizeof(ompi_osc_rdma_header_complete_t)); sizeof(ompi_osc_pt2pt_header_complete_t));
if (OMPI_SUCCESS != ret) goto cleanup; if (OMPI_SUCCESS != ret) goto cleanup;
} }
OPAL_THREAD_LOCK(&module->lock);
/* start all requests */ /* start all requests */
ret = ompi_osc_rdma_frag_flush_all(module); ret = ompi_osc_pt2pt_frag_flush_all(module);
if (OMPI_SUCCESS != ret) goto cleanup; if (OMPI_SUCCESS != ret) goto cleanup;
OPAL_THREAD_LOCK(&module->lock);
/* zero the fragment counts here to ensure they are zerod */ /* zero the fragment counts here to ensure they are zerod */
for (i = 0 ; i < ompi_group_size(module->sc_group) ; ++i) { for (i = 0 ; i < ompi_group_size(module->sc_group) ; ++i) {
peer = module->peers + ranks[i];
module->epoch_outgoing_frag_count[ranks[i]] = 0; module->epoch_outgoing_frag_count[ranks[i]] = 0;
peer->eager_send_active = false;
} }
/* wait for outgoing requests to complete. Don't wait for incoming, as /* wait for outgoing requests to complete. Don't wait for incoming, as
@ -347,14 +354,14 @@ ompi_osc_rdma_complete(ompi_win_t *win)
module->sc_group = NULL; module->sc_group = NULL;
/* unlock here, as group cleanup can take a while... */ /* unlock here, as group cleanup can take a while... */
OPAL_THREAD_UNLOCK(&(module->lock)); OPAL_THREAD_UNLOCK(&module->lock);
/* phase 2 cleanup group */ /* phase 2 cleanup group */
ompi_group_decrement_proc_count(group); ompi_group_decrement_proc_count(group);
OBJ_RELEASE(group); OBJ_RELEASE(group);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_complete complete")); "ompi_osc_pt2pt_complete complete"));
free (ranks); free (ranks);
return OMPI_SUCCESS; return OMPI_SUCCESS;
@ -362,21 +369,19 @@ ompi_osc_rdma_complete(ompi_win_t *win)
cleanup: cleanup:
if (NULL != ranks) free(ranks); if (NULL != ranks) free(ranks);
OPAL_THREAD_UNLOCK(&(module->lock));
return ret; return ret;
} }
int int
ompi_osc_rdma_post(ompi_group_t *group, ompi_osc_pt2pt_post(ompi_group_t *group,
int assert, int assert,
ompi_win_t *win) ompi_win_t *win)
{ {
int *ranks; int *ranks;
int ret = OMPI_SUCCESS; int ret = OMPI_SUCCESS;
ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
ompi_osc_rdma_header_post_t post_req; ompi_osc_pt2pt_header_post_t post_req;
int my_rank = ompi_comm_rank(module->comm); int my_rank = ompi_comm_rank(module->comm);
/* can't check for all access epoch here due to fence */ /* can't check for all access epoch here due to fence */
@ -385,7 +390,7 @@ ompi_osc_rdma_post(ompi_group_t *group,
} }
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_post entering with group size %d...", "ompi_osc_pt2pt_post entering with group size %d...",
ompi_group_size (group))); ompi_group_size (group)));
/* save the group */ /* save the group */
@ -422,19 +427,19 @@ ompi_osc_rdma_post(ompi_group_t *group,
/* shortcut for self */ /* shortcut for self */
if (my_rank == ranks[i]) { if (my_rank == ranks[i]) {
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_rdma_complete self post")); OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_pt2pt_complete self post"));
osc_rdma_incoming_post (module, my_rank); osc_pt2pt_incoming_post (module, my_rank);
continue; continue;
} }
post_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_POST; post_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_POST;
post_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID; post_req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
post_req.windx = ompi_comm_get_cid(module->comm); post_req.windx = ompi_comm_get_cid(module->comm);
/* we don't want to send any data, since we're the exposure /* we don't want to send any data, since we're the exposure
epoch only, so use an unbuffered send */ epoch only, so use an unbuffered send */
ret = ompi_osc_rdma_control_send_unbuffered(module, ranks[i], &post_req, ret = ompi_osc_pt2pt_control_send_unbuffered(module, ranks[i], &post_req,
sizeof(ompi_osc_rdma_header_post_t)); sizeof(ompi_osc_pt2pt_header_post_t));
if (OMPI_SUCCESS != ret) { if (OMPI_SUCCESS != ret) {
break; break;
} }
@ -447,9 +452,9 @@ ompi_osc_rdma_post(ompi_group_t *group,
int int
ompi_osc_rdma_wait(ompi_win_t *win) ompi_osc_pt2pt_wait(ompi_win_t *win)
{ {
ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
ompi_group_t *group; ompi_group_t *group;
if (NULL == module->pw_group) { if (NULL == module->pw_group) {
@ -457,7 +462,7 @@ ompi_osc_rdma_wait(ompi_win_t *win)
} }
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_wait entering...")); "ompi_osc_pt2pt_wait entering..."));
OPAL_THREAD_LOCK(&module->lock); OPAL_THREAD_LOCK(&module->lock);
while (0 != module->num_complete_msgs || while (0 != module->num_complete_msgs ||
@ -476,17 +481,17 @@ ompi_osc_rdma_wait(ompi_win_t *win)
OBJ_RELEASE(group); OBJ_RELEASE(group);
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_wait complete")); "ompi_osc_pt2pt_wait complete"));
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
int int
ompi_osc_rdma_test(ompi_win_t *win, ompi_osc_pt2pt_test(ompi_win_t *win,
int *flag) int *flag)
{ {
ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
ompi_group_t *group; ompi_group_t *group;
int ret = OMPI_SUCCESS; int ret = OMPI_SUCCESS;
@ -504,7 +509,6 @@ ompi_osc_rdma_test(ompi_win_t *win,
module->active_incoming_frag_count != module->active_incoming_frag_signal_count) { module->active_incoming_frag_count != module->active_incoming_frag_signal_count) {
*flag = 0; *flag = 0;
ret = OMPI_SUCCESS; ret = OMPI_SUCCESS;
goto cleanup;
} else { } else {
*flag = 1; *flag = 1;
@ -519,21 +523,21 @@ ompi_osc_rdma_test(ompi_win_t *win,
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
cleanup:
OPAL_THREAD_UNLOCK(&(module->lock)); OPAL_THREAD_UNLOCK(&(module->lock));
return ret; return ret;
} }
int osc_rdma_incoming_post (ompi_osc_rdma_module_t *module, int source) int osc_pt2pt_incoming_post (ompi_osc_pt2pt_module_t *module, int source)
{ {
ompi_proc_t *source_proc = ompi_comm_peer_lookup (module->comm, source); ompi_proc_t *source_proc = ompi_comm_peer_lookup (module->comm, source);
ompi_osc_pt2pt_peer_t *peer = module->peers + source;
OPAL_THREAD_LOCK(&module->lock); OPAL_THREAD_LOCK(&module->lock);
/* verify that this proc is part of the current start group */ /* verify that this proc is part of the current start group */
if (!module->sc_group || !group_contains_proc (module->sc_group, source_proc)) { if (!module->sc_group || !group_contains_proc (module->sc_group, source_proc)) {
ompi_osc_rdma_pending_post_t *pending_post = OBJ_NEW(ompi_osc_rdma_pending_post_t); ompi_osc_pt2pt_pending_post_t *pending_post = OBJ_NEW(ompi_osc_pt2pt_pending_post_t);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"received unexpected post message from %d. module->sc_group = %p, size = %d", "received unexpected post message from %d. module->sc_group = %p, size = %d",
@ -547,6 +551,9 @@ int osc_rdma_incoming_post (ompi_osc_rdma_module_t *module, int source)
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
assert (!peer->eager_send_active);
peer->eager_send_active = true;
module->num_post_msgs++; module->num_post_msgs++;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"received post message. num_post_msgs = %d", module->num_post_msgs)); "received post message. num_post_msgs = %d", module->num_post_msgs));

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -25,10 +25,10 @@
#include <string.h> #include <string.h>
#include "osc_rdma.h" #include "osc_pt2pt.h"
#include "osc_rdma_data_move.h" #include "osc_pt2pt_data_move.h"
#include "osc_rdma_frag.h" #include "osc_pt2pt_frag.h"
#include "osc_rdma_request.h" #include "osc_pt2pt_request.h"
#include "opal/threads/condition.h" #include "opal/threads/condition.h"
#include "opal/threads/mutex.h" #include "opal/threads/mutex.h"
@ -55,11 +55,11 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in
struct ompi_communicator_t *comm, struct ompi_info_t *info, struct ompi_communicator_t *comm, struct ompi_info_t *info,
int flavor, int *model); int flavor, int *model);
ompi_osc_rdma_component_t mca_osc_rdma_component = { ompi_osc_pt2pt_component_t mca_osc_pt2pt_component = {
{ /* ompi_osc_base_component_t */ { /* ompi_osc_base_component_t */
{ /* ompi_base_component_t */ { /* ompi_base_component_t */
OMPI_OSC_BASE_VERSION_3_0_0, OMPI_OSC_BASE_VERSION_3_0_0,
"rdma", "pt2pt",
OMPI_MAJOR_VERSION, /* MCA component major version */ OMPI_MAJOR_VERSION, /* MCA component major version */
OMPI_MINOR_VERSION, /* MCA component minor version */ OMPI_MINOR_VERSION, /* MCA component minor version */
OMPI_RELEASE_VERSION, /* MCA component release version */ OMPI_RELEASE_VERSION, /* MCA component release version */
@ -80,51 +80,51 @@ ompi_osc_rdma_component_t mca_osc_rdma_component = {
}; };
ompi_osc_rdma_module_t ompi_osc_rdma_module_template = { ompi_osc_pt2pt_module_t ompi_osc_pt2pt_module_template = {
{ {
NULL, /* shared_query */ NULL, /* shared_query */
ompi_osc_rdma_attach, ompi_osc_pt2pt_attach,
ompi_osc_rdma_detach, ompi_osc_pt2pt_detach,
ompi_osc_rdma_free, ompi_osc_pt2pt_free,
ompi_osc_rdma_put, ompi_osc_pt2pt_put,
ompi_osc_rdma_get, ompi_osc_pt2pt_get,
ompi_osc_rdma_accumulate, ompi_osc_pt2pt_accumulate,
ompi_osc_rdma_compare_and_swap, ompi_osc_pt2pt_compare_and_swap,
ompi_osc_rdma_fetch_and_op, ompi_osc_pt2pt_fetch_and_op,
ompi_osc_rdma_get_accumulate, ompi_osc_pt2pt_get_accumulate,
ompi_osc_rdma_rput, ompi_osc_pt2pt_rput,
ompi_osc_rdma_rget, ompi_osc_pt2pt_rget,
ompi_osc_rdma_raccumulate, ompi_osc_pt2pt_raccumulate,
ompi_osc_rdma_rget_accumulate, ompi_osc_pt2pt_rget_accumulate,
ompi_osc_rdma_fence, ompi_osc_pt2pt_fence,
ompi_osc_rdma_start, ompi_osc_pt2pt_start,
ompi_osc_rdma_complete, ompi_osc_pt2pt_complete,
ompi_osc_rdma_post, ompi_osc_pt2pt_post,
ompi_osc_rdma_wait, ompi_osc_pt2pt_wait,
ompi_osc_rdma_test, ompi_osc_pt2pt_test,
ompi_osc_rdma_lock, ompi_osc_pt2pt_lock,
ompi_osc_rdma_unlock, ompi_osc_pt2pt_unlock,
ompi_osc_rdma_lock_all, ompi_osc_pt2pt_lock_all,
ompi_osc_rdma_unlock_all, ompi_osc_pt2pt_unlock_all,
ompi_osc_rdma_sync, ompi_osc_pt2pt_sync,
ompi_osc_rdma_flush, ompi_osc_pt2pt_flush,
ompi_osc_rdma_flush_all, ompi_osc_pt2pt_flush_all,
ompi_osc_rdma_flush_local, ompi_osc_pt2pt_flush_local,
ompi_osc_rdma_flush_local_all, ompi_osc_pt2pt_flush_local_all,
ompi_osc_rdma_set_info, ompi_osc_pt2pt_set_info,
ompi_osc_rdma_get_info ompi_osc_pt2pt_get_info
} }
}; };
bool ompi_osc_rdma_no_locks; bool ompi_osc_pt2pt_no_locks;
/* look up parameters for configuring this window. The code first /* look up parameters for configuring this window. The code first
looks in the info structure passed by the user, then through mca looks in the info structure passed by the user, then through mca
@ -157,7 +157,7 @@ check_config_value_bool(char *key, ompi_info_t *info)
return result; return result;
info_not_found: info_not_found:
param = mca_base_var_find("ompi", "osc", "rdma", key); param = mca_base_var_find("ompi", "osc", "pt2pt", key);
if (0 > param) return false; if (0 > param) return false;
ret = mca_base_var_get_value(param, &flag_value, NULL, NULL); ret = mca_base_var_get_value(param, &flag_value, NULL, NULL);
@ -177,8 +177,8 @@ component_open(void)
static int static int
component_register(void) component_register(void)
{ {
ompi_osc_rdma_no_locks = false; ompi_osc_pt2pt_no_locks = false;
(void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, (void) mca_base_component_var_register(&mca_osc_pt2pt_component.super.osc_version,
"no_locks", "no_locks",
"Enable optimizations available only if MPI_LOCK is " "Enable optimizations available only if MPI_LOCK is "
"not used. " "not used. "
@ -186,38 +186,39 @@ component_register(void)
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, MCA_BASE_VAR_SCOPE_READONLY,
&ompi_osc_rdma_no_locks); &ompi_osc_pt2pt_no_locks);
mca_osc_rdma_component.buffer_size = 8192; mca_osc_pt2pt_component.buffer_size = 8192;
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "buffer_size", (void) mca_base_component_var_register (&mca_osc_pt2pt_component.super.osc_version, "buffer_size",
"Data transfers smaller than this limit may be coalesced before " "Data transfers smaller than this limit may be coalesced before "
"being transferred (default: 8k)", MCA_BASE_VAR_TYPE_UNSIGNED_INT, "being transferred (default: 8k)", MCA_BASE_VAR_TYPE_UNSIGNED_INT,
NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
&mca_osc_rdma_component.buffer_size); &mca_osc_pt2pt_component.buffer_size);
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
static int component_progress (void) static int component_progress (void)
{ {
ompi_osc_rdma_pending_t *pending, *next; int count = opal_list_get_size (&mca_osc_pt2pt_component.pending_operations);
ompi_osc_pt2pt_pending_t *pending, *next;
if (0 == opal_list_get_size (&mca_osc_rdma_component.pending_operations)) { if (0 == count) {
return 0; return 0;
} }
/* process one incoming request */ /* process one incoming request */
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock); OPAL_THREAD_LOCK(&mca_osc_pt2pt_component.pending_operations_lock);
OPAL_LIST_FOREACH_SAFE(pending, next, &mca_osc_rdma_component.pending_operations, ompi_osc_rdma_pending_t) { OPAL_LIST_FOREACH_SAFE(pending, next, &mca_osc_pt2pt_component.pending_operations, ompi_osc_pt2pt_pending_t) {
int ret; int ret;
switch (pending->header.base.type) { switch (pending->header.base.type) {
case OMPI_OSC_RDMA_HDR_TYPE_FLUSH_REQ: case OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_REQ:
ret = ompi_osc_rdma_process_flush (pending->module, pending->source, ret = ompi_osc_pt2pt_process_flush (pending->module, pending->source,
&pending->header.flush); &pending->header.flush);
break; break;
case OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_REQ: case OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_REQ:
ret = ompi_osc_rdma_process_unlock (pending->module, pending->source, ret = ompi_osc_pt2pt_process_unlock (pending->module, pending->source,
&pending->header.unlock); &pending->header.unlock);
break; break;
default: default:
@ -227,11 +228,11 @@ static int component_progress (void)
} }
if (OMPI_SUCCESS == ret) { if (OMPI_SUCCESS == ret) {
opal_list_remove_item (&mca_osc_rdma_component.pending_operations, &pending->super); opal_list_remove_item (&mca_osc_pt2pt_component.pending_operations, &pending->super);
OBJ_RELEASE(pending); OBJ_RELEASE(pending);
} }
} }
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock); OPAL_THREAD_UNLOCK(&mca_osc_pt2pt_component.pending_operations_lock);
return 1; return 1;
} }
@ -242,23 +243,24 @@ component_init(bool enable_progress_threads,
{ {
int ret; int ret;
OBJ_CONSTRUCT(&mca_osc_rdma_component.lock, opal_mutex_t); OBJ_CONSTRUCT(&mca_osc_pt2pt_component.lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_osc_rdma_component.pending_operations, opal_list_t); OBJ_CONSTRUCT(&mca_osc_pt2pt_component.pending_operations, opal_list_t);
OBJ_CONSTRUCT(&mca_osc_rdma_component.request_gc, opal_list_t); OBJ_CONSTRUCT(&mca_osc_pt2pt_component.pending_operations_lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_osc_rdma_component.buffer_gc, opal_list_t);
OBJ_CONSTRUCT(&mca_osc_rdma_component.modules, OBJ_CONSTRUCT(&mca_osc_pt2pt_component.modules,
opal_hash_table_t); opal_hash_table_t);
opal_hash_table_init(&mca_osc_rdma_component.modules, 2); opal_hash_table_init(&mca_osc_pt2pt_component.modules, 2);
mca_osc_rdma_component.progress_enable = false; mca_osc_pt2pt_component.progress_enable = false;
mca_osc_rdma_component.module_count = 0; mca_osc_pt2pt_component.module_count = 0;
OBJ_CONSTRUCT(&mca_osc_rdma_component.frags, opal_free_list_t); OBJ_CONSTRUCT(&mca_osc_pt2pt_component.frags, ompi_free_list_t);
ret = opal_free_list_init(&mca_osc_rdma_component.frags, ret = ompi_free_list_init_new (&mca_osc_pt2pt_component.frags,
sizeof(ompi_osc_rdma_frag_t), sizeof(ompi_osc_pt2pt_frag_t), 8,
OBJ_CLASS(ompi_osc_rdma_frag_t), OBJ_CLASS(ompi_osc_pt2pt_frag_t),
1, -1, 1); mca_osc_pt2pt_component.buffer_size +
sizeof (ompi_osc_pt2pt_frag_header_t),
8, 1, -1, 1, 0);
if (OMPI_SUCCESS != ret) { if (OMPI_SUCCESS != ret) {
opal_output_verbose(1, ompi_osc_base_framework.framework_output, opal_output_verbose(1, ompi_osc_base_framework.framework_output,
"%s:%d: ompi_free_list_init failed: %d", "%s:%d: ompi_free_list_init failed: %d",
@ -266,10 +268,10 @@ component_init(bool enable_progress_threads,
return ret; return ret;
} }
OBJ_CONSTRUCT(&mca_osc_rdma_component.requests, ompi_free_list_t); OBJ_CONSTRUCT(&mca_osc_pt2pt_component.requests, ompi_free_list_t);
ret = ompi_free_list_init(&mca_osc_rdma_component.requests, ret = ompi_free_list_init(&mca_osc_pt2pt_component.requests,
sizeof(ompi_osc_rdma_request_t), sizeof(ompi_osc_pt2pt_request_t),
OBJ_CLASS(ompi_osc_rdma_request_t), OBJ_CLASS(ompi_osc_pt2pt_request_t),
0, -1, 32, NULL); 0, -1, 32, NULL);
if (OMPI_SUCCESS != ret) { if (OMPI_SUCCESS != ret) {
opal_output_verbose(1, ompi_osc_base_framework.framework_output, opal_output_verbose(1, ompi_osc_base_framework.framework_output,
@ -287,24 +289,23 @@ component_finalize(void)
{ {
size_t num_modules; size_t num_modules;
if (mca_osc_rdma_component.progress_enable) { if (mca_osc_pt2pt_component.progress_enable) {
opal_progress_unregister (component_progress); opal_progress_unregister (component_progress);
} }
if (0 != if (0 !=
(num_modules = opal_hash_table_get_size(&mca_osc_rdma_component.modules))) { (num_modules = opal_hash_table_get_size(&mca_osc_pt2pt_component.modules))) {
opal_output(ompi_osc_base_framework.framework_output, opal_output(ompi_osc_base_framework.framework_output,
"WARNING: There were %d Windows created but not freed.", "WARNING: There were %d Windows created but not freed.",
(int) num_modules); (int) num_modules);
} }
OBJ_DESTRUCT(&mca_osc_rdma_component.frags); OBJ_DESTRUCT(&mca_osc_pt2pt_component.frags);
OBJ_DESTRUCT(&mca_osc_rdma_component.modules); OBJ_DESTRUCT(&mca_osc_pt2pt_component.modules);
OBJ_DESTRUCT(&mca_osc_rdma_component.lock); OBJ_DESTRUCT(&mca_osc_pt2pt_component.lock);
OBJ_DESTRUCT(&mca_osc_rdma_component.requests); OBJ_DESTRUCT(&mca_osc_pt2pt_component.requests);
OBJ_DESTRUCT(&mca_osc_rdma_component.pending_operations); OBJ_DESTRUCT(&mca_osc_pt2pt_component.pending_operations);
OBJ_DESTRUCT(&mca_osc_rdma_component.request_gc); OBJ_DESTRUCT(&mca_osc_pt2pt_component.pending_operations_lock);
OBJ_DESTRUCT(&mca_osc_rdma_component.buffer_gc);
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
@ -326,27 +327,21 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
struct ompi_communicator_t *comm, struct ompi_info_t *info, struct ompi_communicator_t *comm, struct ompi_info_t *info,
int flavor, int *model) int flavor, int *model)
{ {
ompi_osc_rdma_module_t *module = NULL; ompi_osc_pt2pt_module_t *module = NULL;
int ret; int ret;
char *name; char *name;
bool no_locks = false;
/* We don't support shared windows; that's for the sm onesided /* We don't support shared windows; that's for the sm onesided
component */ component */
if (MPI_WIN_FLAVOR_SHARED == flavor) return OMPI_ERR_NOT_SUPPORTED; if (MPI_WIN_FLAVOR_SHARED == flavor) return OMPI_ERR_NOT_SUPPORTED;
if (check_config_value_bool("no_locks", info)) {
no_locks = true;
ompi_osc_rdma_no_locks = true;
}
/* create module structure with all fields initialized to zero */ /* create module structure with all fields initialized to zero */
module = (ompi_osc_rdma_module_t*) module = (ompi_osc_pt2pt_module_t*)
calloc(1, sizeof(ompi_osc_rdma_module_t)); calloc(1, sizeof(ompi_osc_pt2pt_module_t));
if (NULL == module) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; if (NULL == module) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
/* fill in the function pointer part */ /* fill in the function pointer part */
memcpy(module, &ompi_osc_rdma_module_template, memcpy(module, &ompi_osc_pt2pt_module_template,
sizeof(ompi_osc_base_module_t)); sizeof(ompi_osc_base_module_t));
/* initialize the objects, so that always free in cleanup */ /* initialize the objects, so that always free in cleanup */
@ -354,10 +349,15 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
OBJ_CONSTRUCT(&module->cond, opal_condition_t); OBJ_CONSTRUCT(&module->cond, opal_condition_t);
OBJ_CONSTRUCT(&module->acc_lock, opal_mutex_t); OBJ_CONSTRUCT(&module->acc_lock, opal_mutex_t);
OBJ_CONSTRUCT(&module->queued_frags, opal_list_t); OBJ_CONSTRUCT(&module->queued_frags, opal_list_t);
OBJ_CONSTRUCT(&module->queued_frags_lock, opal_mutex_t);
OBJ_CONSTRUCT(&module->locks_pending, opal_list_t); OBJ_CONSTRUCT(&module->locks_pending, opal_list_t);
OBJ_CONSTRUCT(&module->locks_pending_lock, opal_mutex_t);
OBJ_CONSTRUCT(&module->outstanding_locks, opal_list_t); OBJ_CONSTRUCT(&module->outstanding_locks, opal_list_t);
OBJ_CONSTRUCT(&module->pending_acc, opal_list_t); OBJ_CONSTRUCT(&module->pending_acc, opal_list_t);
OBJ_CONSTRUCT(&module->pending_posts, opal_list_t); OBJ_CONSTRUCT(&module->pending_posts, opal_list_t);
OBJ_CONSTRUCT(&module->request_gc, opal_list_t);
OBJ_CONSTRUCT(&module->buffer_gc, opal_list_t);
OBJ_CONSTRUCT(&module->gc_lock, opal_mutex_t);
/* options */ /* options */
/* FIX ME: should actually check this value... */ /* FIX ME: should actually check this value... */
@ -385,14 +385,14 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
if (OMPI_SUCCESS != ret) goto cleanup; if (OMPI_SUCCESS != ret) goto cleanup;
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output,
"rdma component creating window with id %d", "pt2pt component creating window with id %d",
ompi_comm_get_cid(module->comm))); ompi_comm_get_cid(module->comm)));
/* record my displacement unit. Always resolved at target */ /* record my displacement unit. Always resolved at target */
module->disp_unit = disp_unit; module->disp_unit = disp_unit;
/* peer data */ /* peer data */
module->peers = calloc(ompi_comm_size(comm), sizeof(ompi_osc_rdma_peer_t)); module->peers = calloc(ompi_comm_size(comm), sizeof(ompi_osc_pt2pt_peer_t));
if (NULL == module->peers) { if (NULL == module->peers) {
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
goto cleanup; goto cleanup;
@ -405,20 +405,6 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
goto cleanup; goto cleanup;
} }
if (!no_locks) {
module->passive_incoming_frag_count = calloc(ompi_comm_size(comm), sizeof(uint32_t));
if (NULL == module->passive_incoming_frag_count) {
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
goto cleanup;
}
module->passive_incoming_frag_signal_count = calloc(ompi_comm_size(comm), sizeof(uint32_t));
if (NULL == module->passive_incoming_frag_signal_count) {
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
goto cleanup;
}
}
/* the statement below (from Brian) does not seem correct so disable active target on the /* the statement below (from Brian) does not seem correct so disable active target on the
* window. if this end up being incorrect please revert this one change */ * window. if this end up being incorrect please revert this one change */
module->active_eager_send_active = false; module->active_eager_send_active = false;
@ -429,43 +415,35 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
module->active_eager_send_active = true; module->active_eager_send_active = true;
#endif #endif
if (!no_locks) {
module->passive_eager_send_active = malloc(sizeof(bool) * ompi_comm_size(comm));
if (NULL == module->passive_eager_send_active) {
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
goto cleanup;
}
}
/* lock data */ /* lock data */
if (check_config_value_bool("no_locks", info)) { if (check_config_value_bool("no_locks", info)) {
win->w_flags |= OMPI_WIN_NO_LOCKS; win->w_flags |= OMPI_WIN_NO_LOCKS;
} }
/* update component data */ /* update component data */
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock); OPAL_THREAD_LOCK(&mca_osc_pt2pt_component.lock);
ret = opal_hash_table_set_value_uint32(&mca_osc_rdma_component.modules, ret = opal_hash_table_set_value_uint32(&mca_osc_pt2pt_component.modules,
ompi_comm_get_cid(module->comm), ompi_comm_get_cid(module->comm),
module); module);
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock); OPAL_THREAD_UNLOCK(&mca_osc_pt2pt_component.lock);
if (OMPI_SUCCESS != ret) goto cleanup; if (OMPI_SUCCESS != ret) goto cleanup;
/* fill in window information */ /* fill in window information */
*model = MPI_WIN_UNIFIED; *model = MPI_WIN_UNIFIED;
win->w_osc_module = (ompi_osc_base_module_t*) module; win->w_osc_module = (ompi_osc_base_module_t*) module;
asprintf(&name, "rdma window %d", ompi_comm_get_cid(module->comm)); asprintf(&name, "pt2pt window %d", ompi_comm_get_cid(module->comm));
ompi_win_set_name(win, name); ompi_win_set_name(win, name);
free(name); free(name);
/* sync memory - make sure all initialization completed */ /* sync memory - make sure all initialization completed */
opal_atomic_mb(); opal_atomic_mb();
module->incoming_buffer = malloc (mca_osc_rdma_component.buffer_size + sizeof (ompi_osc_rdma_frag_header_t)); module->incoming_buffer = malloc (mca_osc_pt2pt_component.buffer_size + sizeof (ompi_osc_pt2pt_frag_header_t));
if (OPAL_UNLIKELY(NULL == module->incoming_buffer)) { if (OPAL_UNLIKELY(NULL == module->incoming_buffer)) {
goto cleanup; goto cleanup;
} }
ret = ompi_osc_rdma_frag_start_receive (module); ret = ompi_osc_pt2pt_frag_start_receive (module);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
goto cleanup; goto cleanup;
} }
@ -476,30 +454,30 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
module->comm->c_coll.coll_barrier_module); module->comm->c_coll.coll_barrier_module);
if (OMPI_SUCCESS != ret) goto cleanup; if (OMPI_SUCCESS != ret) goto cleanup;
if (!mca_osc_rdma_component.progress_enable) { if (!mca_osc_pt2pt_component.progress_enable) {
opal_progress_register (component_progress); opal_progress_register (component_progress);
mca_osc_rdma_component.progress_enable = true; mca_osc_pt2pt_component.progress_enable = true;
} }
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output,
"done creating rdma window %d", ompi_comm_get_cid(module->comm))); "done creating pt2pt window %d", ompi_comm_get_cid(module->comm)));
return OMPI_SUCCESS; return OMPI_SUCCESS;
cleanup: cleanup:
/* set the module so we properly cleanup */ /* set the module so we properly cleanup */
win->w_osc_module = (ompi_osc_base_module_t*) module; win->w_osc_module = (ompi_osc_base_module_t*) module;
ompi_osc_rdma_free (win); ompi_osc_pt2pt_free (win);
return ret; return ret;
} }
int int
ompi_osc_rdma_set_info(struct ompi_win_t *win, struct ompi_info_t *info) ompi_osc_pt2pt_set_info(struct ompi_win_t *win, struct ompi_info_t *info)
{ {
ompi_osc_rdma_module_t *module = ompi_osc_pt2pt_module_t *module =
(ompi_osc_rdma_module_t*) win->w_osc_module; (ompi_osc_pt2pt_module_t*) win->w_osc_module;
/* enforce collectiveness... */ /* enforce collectiveness... */
return module->comm->c_coll.coll_barrier(module->comm, return module->comm->c_coll.coll_barrier(module->comm,
@ -508,7 +486,7 @@ ompi_osc_rdma_set_info(struct ompi_win_t *win, struct ompi_info_t *info)
int int
ompi_osc_rdma_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used) ompi_osc_pt2pt_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used)
{ {
ompi_info_t *info = OBJ_NEW(ompi_info_t); ompi_info_t *info = OBJ_NEW(ompi_info_t);
if (NULL == info) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; if (NULL == info) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
@ -518,4 +496,4 @@ ompi_osc_rdma_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used)
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_t, opal_list_item_t, NULL, NULL); OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_pending_t, opal_list_item_t, NULL, NULL);

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -18,22 +18,22 @@
* $HEADER$ * $HEADER$
*/ */
#ifndef OMPI_MCA_OSC_RDMA_DATA_MOVE_H #ifndef OMPI_MCA_OSC_PT2PT_DATA_MOVE_H
#define OMPI_MCA_OSC_RDMA_DATA_MOVE_H #define OMPI_MCA_OSC_PT2PT_DATA_MOVE_H
#include "osc_rdma_header.h" #include "osc_pt2pt_header.h"
int ompi_osc_rdma_control_send(ompi_osc_rdma_module_t *module, int ompi_osc_pt2pt_control_send(ompi_osc_pt2pt_module_t *module,
int target, int target,
void *data, void *data,
size_t len); size_t len);
/** /**
* ompi_osc_rdma_control_send_unbuffered: * ompi_osc_pt2pt_control_send_unbuffered:
* *
* @short Send an unbuffered control message to a peer. * @short Send an unbuffered control message to a peer.
* *
* @param[in] module - OSC RDMA module * @param[in] module - OSC PT2PT module
* @param[in] target - Target rank * @param[in] target - Target rank
* @param[in] data - Data to send * @param[in] data - Data to send
* @param[in] len - Length of data * @param[in] len - Length of data
@ -45,11 +45,11 @@ int ompi_osc_rdma_control_send(ompi_osc_rdma_module_t *module,
* from its peer). The buffer specified by data will be available * from its peer). The buffer specified by data will be available
* when this call returns. * when this call returns.
*/ */
int ompi_osc_rdma_control_send_unbuffered (ompi_osc_rdma_module_t *module, int ompi_osc_pt2pt_control_send_unbuffered (ompi_osc_pt2pt_module_t *module,
int target, void *data, size_t len); int target, void *data, size_t len);
/** /**
* ompi_osc_rdma_isend_w_cb: * ompi_osc_pt2pt_isend_w_cb:
* *
* @short Post a non-blocking send with a specified callback. * @short Post a non-blocking send with a specified callback.
* *
@ -66,11 +66,11 @@ int ompi_osc_rdma_control_send_unbuffered (ompi_osc_rdma_module_t *module,
* be called with the associated request. The context specified in ctx will be stored in * be called with the associated request. The context specified in ctx will be stored in
* the req_completion_cb_data member of the ompi_request_t for use by the callback. * the req_completion_cb_data member of the ompi_request_t for use by the callback.
*/ */
int ompi_osc_rdma_isend_w_cb (void *ptr, int count, ompi_datatype_t *datatype, int target, int tag, int ompi_osc_pt2pt_isend_w_cb (void *ptr, int count, ompi_datatype_t *datatype, int target, int tag,
ompi_communicator_t *comm, ompi_request_complete_fn_t cb, void *ctx); ompi_communicator_t *comm, ompi_request_complete_fn_t cb, void *ctx);
/** /**
* ompi_osc_rdma_irecv_w_cb: * ompi_osc_pt2pt_irecv_w_cb:
* *
* @short Post a non-blocking receive with a specified callback. * @short Post a non-blocking receive with a specified callback.
* *
@ -89,49 +89,49 @@ int ompi_osc_rdma_isend_w_cb (void *ptr, int count, ompi_datatype_t *datatype, i
* request. The context specified in ctx will be stored in the req_completion_cb_data * request. The context specified in ctx will be stored in the req_completion_cb_data
* member of the ompi_request_t for use by the callback. * member of the ompi_request_t for use by the callback.
*/ */
int ompi_osc_rdma_irecv_w_cb (void *ptr, int count, ompi_datatype_t *datatype, int source, int tag, int ompi_osc_pt2pt_irecv_w_cb (void *ptr, int count, ompi_datatype_t *datatype, int source, int tag,
ompi_communicator_t *comm, ompi_request_t **request_out, ompi_communicator_t *comm, ompi_request_t **request_out,
ompi_request_complete_fn_t cb, void *ctx); ompi_request_complete_fn_t cb, void *ctx);
int ompi_osc_rdma_process_lock(ompi_osc_rdma_module_t* module, int ompi_osc_pt2pt_process_lock(ompi_osc_pt2pt_module_t* module,
int source, int source,
struct ompi_osc_rdma_header_lock_t* lock_header); struct ompi_osc_pt2pt_header_lock_t* lock_header);
void ompi_osc_rdma_process_lock_ack(ompi_osc_rdma_module_t* module, void ompi_osc_pt2pt_process_lock_ack(ompi_osc_pt2pt_module_t* module,
struct ompi_osc_rdma_header_lock_ack_t* lock_header); struct ompi_osc_pt2pt_header_lock_ack_t* lock_header);
int ompi_osc_rdma_process_unlock(ompi_osc_rdma_module_t* module, int ompi_osc_pt2pt_process_unlock(ompi_osc_pt2pt_module_t* module,
int source, int source,
struct ompi_osc_rdma_header_unlock_t* lock_header); struct ompi_osc_pt2pt_header_unlock_t* lock_header);
int ompi_osc_rdma_process_flush (ompi_osc_rdma_module_t *module, int source, int ompi_osc_pt2pt_process_flush (ompi_osc_pt2pt_module_t *module, int source,
ompi_osc_rdma_header_flush_t *flush_header); ompi_osc_pt2pt_header_flush_t *flush_header);
/** /**
* ompi_osc_rdma_process_unlock_ack: * ompi_osc_pt2pt_process_unlock_ack:
* *
* @short Process an incoming unlock acknowledgement. * @short Process an incoming unlock acknowledgement.
* *
* @param[in] module - OSC RDMA module * @param[in] module - OSC PT2PT module
* @param[in] source - Source rank * @param[in] source - Source rank
* @param[in] unlock_ack_header - Incoming unlock ack header * @param[in] unlock_ack_header - Incoming unlock ack header
*/ */
void ompi_osc_rdma_process_unlock_ack (ompi_osc_rdma_module_t *module, int source, void ompi_osc_pt2pt_process_unlock_ack (ompi_osc_pt2pt_module_t *module, int source,
ompi_osc_rdma_header_unlock_ack_t *unlock_ack_header); ompi_osc_pt2pt_header_unlock_ack_t *unlock_ack_header);
/** /**
* ompi_osc_rdma_process_flush_ack: * ompi_osc_pt2pt_process_flush_ack:
* *
* @short Process an incoming flush acknowledgement. * @short Process an incoming flush acknowledgement.
* *
* @param[in] module - OSC RDMA module * @param[in] module - OSC PT2PT module
* @param[in] source - Source rank * @param[in] source - Source rank
* @param[in] flush_ack_header - Incoming flush ack header * @param[in] flush_ack_header - Incoming flush ack header
*/ */
void ompi_osc_rdma_process_flush_ack (ompi_osc_rdma_module_t *module, int source, void ompi_osc_pt2pt_process_flush_ack (ompi_osc_pt2pt_module_t *module, int source,
ompi_osc_rdma_header_flush_ack_t *flush_ack_header); ompi_osc_pt2pt_header_flush_ack_t *flush_ack_header);
/** /**
* ompi_osc_rdma_frag_start_receive: * ompi_osc_pt2pt_frag_start_receive:
* *
* @short Start receiving fragments on the OSC module. * @short Start receiving fragments on the OSC module.
* *
@ -140,6 +140,6 @@ void ompi_osc_rdma_process_flush_ack (ompi_osc_rdma_module_t *module, int source
* @long This function starts receiving eager fragments on the module. The current * @long This function starts receiving eager fragments on the module. The current
* implementation uses the pml to transfer eager fragments. * implementation uses the pml to transfer eager fragments.
*/ */
int ompi_osc_rdma_frag_start_receive (ompi_osc_rdma_module_t *module); int ompi_osc_pt2pt_frag_start_receive (ompi_osc_pt2pt_module_t *module);
#endif #endif

203
ompi/mca/osc/pt2pt/osc_pt2pt_frag.c Обычный файл
Просмотреть файл

@ -0,0 +1,203 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "opal/class/opal_list.h"
#include "ompi/mca/osc/base/base.h"
#include "ompi/mca/pml/pml.h"
#include "osc_pt2pt.h"
#include "osc_pt2pt_frag.h"
#include "osc_pt2pt_data_move.h"
/* Free-list constructor for a fragment: point the packing buffer at the
 * storage already attached to the free-list item (super.ptr). */
static void ompi_osc_pt2pt_frag_constructor (ompi_osc_pt2pt_frag_t *frag){
    frag->buffer = frag->super.ptr;
}

OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_frag_t, ompi_free_list_item_t,
                   ompi_osc_pt2pt_frag_constructor, NULL);
/* Completion callback for an outgoing fragment send: credit the module's
 * outgoing-completion count, recycle the fragment onto the component free
 * list, and hand the request to the garbage collector. */
static int frag_send_cb (ompi_request_t *request)
{
    ompi_osc_pt2pt_frag_t *sent_frag = (ompi_osc_pt2pt_frag_t *) request->req_complete_cb_data;
    ompi_osc_pt2pt_module_t *owner = sent_frag->module;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: frag_send complete to %d, frag = %p, request = %p",
                         sent_frag->target, (void *) sent_frag, (void *) request));

    mark_outgoing_completion(owner);

    OMPI_FREE_LIST_RETURN_MT(&mca_osc_pt2pt_component.frags, &sent_frag->super);

    /* put this request on the garbage collection list */
    osc_pt2pt_gc_add_request (owner, request);

    return OMPI_SUCCESS;
}
/* Compute how many bytes have been packed into the fragment and hand the
 * buffer to the pml via the isend wrapper; frag_send_cb fires on completion. */
static int
frag_send(ompi_osc_pt2pt_module_t *module,
          ompi_osc_pt2pt_frag_t *frag)
{
    int nbytes = (int)((uintptr_t) frag->top - (uintptr_t) frag->buffer);

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: frag_send called to %d, frag = %p, count = %d",
                         frag->target, (void *) frag, nbytes));

    /* we need to signal now that a frag is outgoing to ensure the count sent
     * with the unlock message is correct */
    ompi_osc_signal_outgoing (module, frag->target, 1);

    return ompi_osc_pt2pt_isend_w_cb (frag->buffer, nbytes, MPI_BYTE, frag->target,
                                      OSC_PT2PT_FRAG_TAG, module->comm,
                                      frag_send_cb, frag);
}
/* Start an outgoing fragment, or queue it when the peer cannot yet accept
 * eager sends.
 *
 * NOTE(review): frag_send() also calls ompi_osc_signal_outgoing() for the
 * same fragment, so the eager path appears to signal twice -- confirm this
 * double-count is intended. */
int
ompi_osc_pt2pt_frag_start(ompi_osc_pt2pt_module_t *module,
                          ompi_osc_pt2pt_frag_t *frag)
{
    ompi_osc_pt2pt_peer_t *target_peer = module->peers + frag->target;
    int rc;

    assert(0 == frag->pending && target_peer->active_frag != frag);

    /* we need to signal now that a frag is outgoing to ensure the count sent
     * with the unlock message is correct */
    ompi_osc_signal_outgoing (module, frag->target, 1);

    /* no eager send active and no all-access epoch: the fragment cannot go
       out yet, so park it on the queued list and get out */
    if (!target_peer->eager_send_active && !module->all_access_epoch) {
        OPAL_THREAD_SCOPED_LOCK(&module->queued_frags_lock,
                                opal_list_append(&module->queued_frags, (opal_list_item_t *) frag));
        return OMPI_SUCCESS;
    }

    rc = frag_send(module, frag);

    opal_condition_broadcast(&module->cond);

    return rc;
}
/**
 * Flush the active and queued fragments destined for a single target.
 *
 * @param[in] module - OSC PT2PT module
 * @param[in] target - Target rank whose fragments should be flushed
 *
 * @returns OMPI_SUCCESS on success, OMPI_ERR_RMA_SYNC if the active
 *          fragment still has operations writing into it (an RMA usage
 *          error), or the error returned by starting/sending a fragment.
 */
int
ompi_osc_pt2pt_frag_flush_target(ompi_osc_pt2pt_module_t *module, int target)
{
    ompi_osc_pt2pt_frag_t *next, *frag = module->peers[target].active_frag;
    int ret = OMPI_SUCCESS;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: frag flush target begin"));

    /* flush the active frag */
    if (NULL != frag) {
        if (1 != frag->pending) {
            /* communication going on while synchronizing; this is an rma usage bug */
            return OMPI_ERR_RMA_SYNC;
        }

        /* BUGFIX: use the pointer variant of compare-and-swap -- active_frag
         * is a pointer and the 32-bit cmpset is wrong on LP64; this also
         * matches ompi_osc_pt2pt_frag_flush_all. */
        if (opal_atomic_cmpset_ptr (&module->peers[target].active_frag, frag, NULL)) {
            OPAL_THREAD_ADD32(&frag->pending, -1);

            ret = ompi_osc_pt2pt_frag_start(module, frag);
            if (OMPI_SUCCESS != ret) {
                return ret;
            }
        }
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: frag flush target finished active frag"));

    /* walk through the pending list and send */
    OPAL_THREAD_LOCK(&module->queued_frags_lock);
    if (opal_list_get_size (&module->queued_frags)) {
        OPAL_LIST_FOREACH_SAFE(frag, next, &module->queued_frags, ompi_osc_pt2pt_frag_t) {
            if (frag->target == target) {
                opal_list_remove_item(&module->queued_frags, (opal_list_item_t *) frag);
                ret = frag_send(module, frag);
                /* BUGFIX: was "OMPI_SUCCESS != frag" -- compared the return
                 * code against the fragment pointer, so send errors were
                 * silently ignored. */
                if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                    break;
                }
            }
        }
    }
    OPAL_THREAD_UNLOCK(&module->queued_frags_lock);

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: frag flush target finished"));

    return ret;
}
/**
 * Flush the active and queued fragments for every peer in the window.
 *
 * @param[in] module - OSC PT2PT module
 *
 * @returns OMPI_SUCCESS on success, OMPI_ERR_RMA_SYNC if an active
 *          fragment still has operations writing into it, or the error
 *          from starting/sending a fragment.
 *
 * NOTE(review): the OPAL_THREAD_UNLOCK(&module->lock) on the RMA_SYNC
 * error path implies the caller holds module->lock when calling this
 * function -- confirm against callers; the sibling
 * ompi_osc_pt2pt_frag_flush_target returns the same error without
 * unlocking.
 */
int
ompi_osc_pt2pt_frag_flush_all(ompi_osc_pt2pt_module_t *module)
{
    int ret = OMPI_SUCCESS;
    int i;
    /* NOTE(review): this 'frag' is shadowed by the loop-local 'frag' in the
     * first loop below; only the OPAL_LIST_FOREACH_SAFE loop uses it. */
    ompi_osc_pt2pt_frag_t *frag, *next;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: frag flush all begin"));

    /* flush the active frag */
    for (i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
        ompi_osc_pt2pt_frag_t *frag = module->peers[i].active_frag;

        if (NULL != frag) {
            if (1 != frag->pending) {
                OPAL_THREAD_UNLOCK(&module->lock);
                /* communication going on while synchronizing; this is a bug */
                return OMPI_ERR_RMA_SYNC;
            }

            /* atomically detach the active fragment; if another thread got
             * here first, move on to the next peer */
            if (!opal_atomic_cmpset_ptr (&module->peers[i].active_frag, frag, NULL)) {
                continue;
            }

            OPAL_THREAD_ADD32(&frag->pending, -1);

            ret = ompi_osc_pt2pt_frag_start(module, frag);
            if (OMPI_SUCCESS != ret) {
                return ret;
            }
        }
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: frag flush all finished active frag"));

    /* try to start all the queued frags */
    OPAL_THREAD_LOCK(&module->queued_frags_lock);
    if (opal_list_get_size (&module->queued_frags)) {
        OPAL_LIST_FOREACH_SAFE(frag, next, &module->queued_frags, ompi_osc_pt2pt_frag_t) {
            opal_list_remove_item(&module->queued_frags, (opal_list_item_t *) frag);
            ret = frag_send(module, frag);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                break;
            }
        }
    }
    OPAL_THREAD_UNLOCK(&module->queued_frags_lock);

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: frag flush all done"));

    return ret;
}

143
ompi/mca/osc/pt2pt/osc_pt2pt_frag.h Обычный файл
Просмотреть файл

@ -0,0 +1,143 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OSC_PT2PT_FRAG_H
#define OSC_PT2PT_FRAG_H
#include "ompi/communicator/communicator.h"
#include "osc_pt2pt_header.h"
#include "osc_pt2pt_request.h"
#include "opal/align.h"
/** Communication buffer for packing messages */
struct ompi_osc_pt2pt_frag_t {
    ompi_free_list_item_t super;   /* free-list linkage; super.ptr provides the backing storage */

    /* target rank of buffer */
    int target;
    unsigned char *buffer;         /* start of the packing buffer (set to super.ptr by the constructor) */

    /* space remaining in buffer */
    size_t remain_len;

    /* start of unused space */
    char *top;

    /* Number of operations which have started writing into the frag, but not yet completed doing so */
    int32_t pending;

    ompi_osc_pt2pt_frag_header_t *header;  /* wire header placed at the start of buffer */
    ompi_osc_pt2pt_module_t *module;       /* owning OSC module */
};
typedef struct ompi_osc_pt2pt_frag_t ompi_osc_pt2pt_frag_t;

OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_frag_t);
extern int ompi_osc_pt2pt_frag_start(ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_frag_t *buffer);
extern int ompi_osc_pt2pt_frag_flush_target(ompi_osc_pt2pt_module_t *module, int target);
extern int ompi_osc_pt2pt_frag_flush_all(ompi_osc_pt2pt_module_t *module);
/*
 * Allocate request_len bytes in the target's active fragment, starting a new
 * fragment when the current one is missing or too small.
 *
 * Returns OMPI_SUCCESS with *buffer/*ptr set on success,
 * OMPI_ERR_OUT_OF_RESOURCE when the request exceeds the component buffer
 * size or no fragment is available, or OMPI_ERR_TEMP_OUT_OF_RESOURCE when a
 * fresh fragment is still too small for the request.
 *
 * Note: this function acquires module->lock internally; it must not be
 * called with that lock already held.
 */
static inline int ompi_osc_pt2pt_frag_alloc(ompi_osc_pt2pt_module_t *module, int target,
                                            size_t request_len, ompi_osc_pt2pt_frag_t **buffer,
                                            char **ptr)
{
    ompi_osc_pt2pt_frag_t *curr = module->peers[target].active_frag;
    int ret;

    /* osc pt2pt headers can have 64-bit values. these will need to be aligned
     * on an 8-byte boundary on some architectures so we up align the allocation
     * size here. */
    request_len = OPAL_ALIGN(request_len, 8, size_t);

    if (request_len > mca_osc_pt2pt_component.buffer_size) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    OPAL_THREAD_LOCK(&module->lock);
    if (NULL == curr || curr->remain_len < request_len) {
        ompi_free_list_item_t *item = NULL;

        if (NULL != curr) {
            curr->remain_len = 0;
            module->peers[target].active_frag = NULL;
            opal_atomic_mb ();

            /* If there's something pending, the pending finish will
               start the buffer.  Otherwise, we need to start it now. */
            if (0 == OPAL_THREAD_ADD32(&curr->pending, -1)) {
                ret = ompi_osc_pt2pt_frag_start(module, curr);
                if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                    /* BUGFIX: release the module lock before returning
                     * (it was previously leaked on this path) */
                    OPAL_THREAD_UNLOCK(&module->lock);
                    return ret;
                }
            }
        }

        OMPI_FREE_LIST_GET_MT(&mca_osc_pt2pt_component.frags, item);
        if (OPAL_UNLIKELY(NULL == item)) {
            /* BUGFIX: release the module lock before returning
             * (it was previously leaked on this path) */
            OPAL_THREAD_UNLOCK(&module->lock);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        curr = module->peers[target].active_frag =
            (ompi_osc_pt2pt_frag_t*) item;

        curr->target = target;

        curr->header = (ompi_osc_pt2pt_frag_header_t*) curr->buffer;
        curr->top = (char*) (curr->header + 1);
        curr->remain_len = mca_osc_pt2pt_component.buffer_size;
        curr->module = module;
        curr->pending = 1;

        curr->header->base.type = OMPI_OSC_PT2PT_HDR_TYPE_FRAG;
        curr->header->base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
        if (module->passive_target_access_epoch) {
            curr->header->base.flags |= OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET;
        }
        curr->header->source = ompi_comm_rank(module->comm);
        curr->header->num_ops = 0;
        curr->header->windx = ompi_comm_get_cid(module->comm);

        if (curr->remain_len < request_len) {
            OPAL_THREAD_UNLOCK(&module->lock);
            return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        }
    }

    *ptr = curr->top;
    *buffer = curr;

    curr->top += request_len;
    curr->remain_len -= request_len;
    OPAL_THREAD_UNLOCK(&module->lock);

    OPAL_THREAD_ADD32(&curr->pending, 1);
    OPAL_THREAD_ADD32(&curr->header->num_ops, 1);

    return OMPI_SUCCESS;
}
/*
 * Release one reference on the fragment; if no operation is still writing
 * into it, start the send immediately.
 *
 * Note: module lock must be held for this operation.
 */
static inline int ompi_osc_pt2pt_frag_finish(ompi_osc_pt2pt_module_t *module,
                                             ompi_osc_pt2pt_frag_t* buffer)
{
    int32_t still_pending = OPAL_THREAD_ADD32(&buffer->pending, -1);

    if (0 != still_pending) {
        return OMPI_SUCCESS;
    }

    return ompi_osc_pt2pt_frag_start(module, buffer);
}
#endif

189
ompi/mca/osc/pt2pt/osc_pt2pt_header.h Обычный файл
Просмотреть файл

@ -0,0 +1,189 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OMPI_MCA_OSC_PT2PT_HDR_H
#define OMPI_MCA_OSC_PT2PT_HDR_H
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#include "opal/types.h"
/* Wire header type codes, stored in ompi_osc_pt2pt_header_base_t.type.
 * The _LONG variants pair with their short forms (payload sent separately
 * rather than packed inline -- presumably; confirm in the data-move code). */
enum ompi_osc_pt2pt_hdr_type_t {
    OMPI_OSC_PT2PT_HDR_TYPE_PUT = 0x01,
    OMPI_OSC_PT2PT_HDR_TYPE_PUT_LONG = 0x02,
    OMPI_OSC_PT2PT_HDR_TYPE_ACC = 0x03,
    OMPI_OSC_PT2PT_HDR_TYPE_ACC_LONG = 0x04,
    OMPI_OSC_PT2PT_HDR_TYPE_GET = 0x05,
    OMPI_OSC_PT2PT_HDR_TYPE_CSWAP = 0x06,
    OMPI_OSC_PT2PT_HDR_TYPE_CSWAP_LONG = 0x07,
    OMPI_OSC_PT2PT_HDR_TYPE_GET_ACC = 0x08,
    OMPI_OSC_PT2PT_HDR_TYPE_GET_ACC_LONG = 0x09,
    OMPI_OSC_PT2PT_HDR_TYPE_COMPLETE = 0x10,
    OMPI_OSC_PT2PT_HDR_TYPE_POST = 0x11,
    OMPI_OSC_PT2PT_HDR_TYPE_LOCK_REQ = 0x12,
    OMPI_OSC_PT2PT_HDR_TYPE_LOCK_ACK = 0x13,
    OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_REQ = 0x14,
    OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_ACK = 0x15,
    OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_REQ = 0x16,
    OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_ACK = 0x17,
    OMPI_OSC_PT2PT_HDR_TYPE_FRAG = 0x20,
};
typedef enum ompi_osc_pt2pt_hdr_type_t ompi_osc_pt2pt_hdr_type_t;

/* Flag bits stored in ompi_osc_pt2pt_header_base_t.flags. */
#define OMPI_OSC_PT2PT_HDR_FLAG_NBO            0x01 /* header fields in network byte order -- presumably; confirm */
#define OMPI_OSC_PT2PT_HDR_FLAG_VALID          0x02 /* header fully written (set at fragment allocation) */
#define OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET 0x04 /* sent inside a passive-target access epoch */
#define OMPI_OSC_PT2PT_HDR_FLAG_LARGE_DATATYPE 0x08
/* Common first member of every OSC pt2pt wire header; lets receivers
 * dispatch on type before interpreting the rest. */
struct ompi_osc_pt2pt_header_base_t {
    /** fragment type. 8 bits */
    uint8_t type;

    /** fragment flags. 8 bits */
    uint8_t flags;
};
typedef struct ompi_osc_pt2pt_header_base_t ompi_osc_pt2pt_header_base_t;

/* Header for a put operation. */
struct ompi_osc_pt2pt_header_put_t {
    ompi_osc_pt2pt_header_base_t base;

    uint16_t tag;           /* tag for the matching data message -- confirm against data-move code */
    uint32_t count;
    uint64_t len;           /* payload length in bytes */
    uint64_t displacement;  /* displacement into the target window */
};
typedef struct ompi_osc_pt2pt_header_put_t ompi_osc_pt2pt_header_put_t;

/* Header for an accumulate operation; op identifies the reduction. */
struct ompi_osc_pt2pt_header_acc_t {
    ompi_osc_pt2pt_header_base_t base;

    uint16_t tag;
    uint32_t count;
    uint32_t op;            /* MPI_Op encoded as an integer -- confirm encoding */
    uint64_t len;
    uint64_t displacement;
};
typedef struct ompi_osc_pt2pt_header_acc_t ompi_osc_pt2pt_header_acc_t;

/* Header for a get operation. */
struct ompi_osc_pt2pt_header_get_t {
    ompi_osc_pt2pt_header_base_t base;

    uint16_t tag;
    uint32_t count;
    uint64_t len;
    uint64_t displacement;
};
typedef struct ompi_osc_pt2pt_header_get_t ompi_osc_pt2pt_header_get_t;

/* Active-target completion notice carrying the fragment count. */
struct ompi_osc_pt2pt_header_complete_t {
    ompi_osc_pt2pt_header_base_t base;
    int frag_count;
};
typedef struct ompi_osc_pt2pt_header_complete_t ompi_osc_pt2pt_header_complete_t;

/* Header for a compare-and-swap operation. */
struct ompi_osc_pt2pt_header_cswap_t {
    ompi_osc_pt2pt_header_base_t base;

    uint16_t tag;

    uint32_t len;
    uint64_t displacement;
};
typedef struct ompi_osc_pt2pt_header_cswap_t ompi_osc_pt2pt_header_cswap_t;
/* Active-target post notice; windx identifies the window. */
struct ompi_osc_pt2pt_header_post_t {
    ompi_osc_pt2pt_header_base_t base;
    uint16_t windx;
};
typedef struct ompi_osc_pt2pt_header_post_t ompi_osc_pt2pt_header_post_t;

/* Passive-target lock request. */
struct ompi_osc_pt2pt_header_lock_t {
    ompi_osc_pt2pt_header_base_t base;
    int32_t lock_type;   /* MPI_LOCK_SHARED or MPI_LOCK_EXCLUSIVE -- confirm */
    uint64_t lock_ptr;   /* origin-side lock object pointer, echoed back in acks */
};
typedef struct ompi_osc_pt2pt_header_lock_t ompi_osc_pt2pt_header_lock_t;

/* Acknowledgement that a lock request was granted. */
struct ompi_osc_pt2pt_header_lock_ack_t {
    ompi_osc_pt2pt_header_base_t base;
    uint16_t windx;
    uint32_t source;
    uint64_t lock_ptr;
};
typedef struct ompi_osc_pt2pt_header_lock_ack_t ompi_osc_pt2pt_header_lock_ack_t;

/* Passive-target unlock request; frag_count tells the target how many
 * fragments to expect before releasing the lock. */
struct ompi_osc_pt2pt_header_unlock_t {
    ompi_osc_pt2pt_header_base_t base;
    int32_t lock_type;
    uint32_t frag_count;
    uint64_t lock_ptr;
};
typedef struct ompi_osc_pt2pt_header_unlock_t ompi_osc_pt2pt_header_unlock_t;

/* Acknowledgement that an unlock completed. */
struct ompi_osc_pt2pt_header_unlock_ack_t {
    ompi_osc_pt2pt_header_base_t base;
    uint64_t lock_ptr;
};
typedef struct ompi_osc_pt2pt_header_unlock_ack_t ompi_osc_pt2pt_header_unlock_ack_t;

/* Flush request; serial_number matches the request with its ack. */
struct ompi_osc_pt2pt_header_flush_t {
    ompi_osc_pt2pt_header_base_t base;
    uint32_t frag_count;
    uint64_t serial_number;
};
typedef struct ompi_osc_pt2pt_header_flush_t ompi_osc_pt2pt_header_flush_t;

/* Acknowledgement that a flush completed. */
struct ompi_osc_pt2pt_header_flush_ack_t {
    ompi_osc_pt2pt_header_base_t base;
    uint64_t serial_number;
};
typedef struct ompi_osc_pt2pt_header_flush_ack_t ompi_osc_pt2pt_header_flush_ack_t;
/* Leading header of every packed fragment buffer. */
struct ompi_osc_pt2pt_frag_header_t {
    ompi_osc_pt2pt_header_base_t base;
    uint16_t windx; /* cid of communicator backing window (our window id) */
    uint32_t source; /* rank in window of source process */
    int32_t num_ops; /* number of operations in this buffer */
    uint32_t pad; /* ensure the fragment header is a multiple of 8 bytes */
};
typedef struct ompi_osc_pt2pt_frag_header_t ompi_osc_pt2pt_frag_header_t;

/* Union over all wire headers; base.type selects the active member. */
union ompi_osc_pt2pt_header_t {
    ompi_osc_pt2pt_header_base_t base;
    ompi_osc_pt2pt_header_put_t put;
    ompi_osc_pt2pt_header_acc_t acc;
    ompi_osc_pt2pt_header_get_t get;
    ompi_osc_pt2pt_header_complete_t complete;
    ompi_osc_pt2pt_header_cswap_t cswap;
    ompi_osc_pt2pt_header_post_t post;
    ompi_osc_pt2pt_header_lock_t lock;
    ompi_osc_pt2pt_header_lock_ack_t lock_ack;
    ompi_osc_pt2pt_header_unlock_t unlock;
    ompi_osc_pt2pt_header_unlock_ack_t unlock_ack;
    ompi_osc_pt2pt_header_flush_t flush;
    ompi_osc_pt2pt_header_flush_ack_t flush_ack;
    ompi_osc_pt2pt_frag_header_t frag;
};
typedef union ompi_osc_pt2pt_header_t ompi_osc_pt2pt_header_t;
#endif /* OMPI_MCA_OSC_PT2PT_HDR_H */

Просмотреть файл

@ -20,7 +20,7 @@
#include "ompi_config.h" #include "ompi_config.h"
#include "osc_rdma.h" #include "osc_pt2pt.h"
#include "opal/threads/mutex.h" #include "opal/threads/mutex.h"
#include "opal/mca/btl/btl.h" #include "opal/mca/btl/btl.h"
@ -31,25 +31,24 @@
int int
ompi_osc_rdma_attach(struct ompi_win_t *win, void *base, size_t len) ompi_osc_pt2pt_attach(struct ompi_win_t *win, void *base, size_t len)
{ {
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
int int
ompi_osc_rdma_detach(struct ompi_win_t *win, void *base) ompi_osc_pt2pt_detach(struct ompi_win_t *win, void *base)
{ {
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
int int
ompi_osc_rdma_free(ompi_win_t *win) ompi_osc_pt2pt_free(ompi_win_t *win)
{ {
int ret = OMPI_SUCCESS; int ret = OMPI_SUCCESS;
ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
opal_list_item_t *item;
if (NULL == module) { if (NULL == module) {
return OMPI_SUCCESS; return OMPI_SUCCESS;
@ -57,7 +56,7 @@ ompi_osc_rdma_free(ompi_win_t *win)
if (NULL != module->comm) { if (NULL != module->comm) {
opal_output_verbose(1, ompi_osc_base_framework.framework_output, opal_output_verbose(1, ompi_osc_base_framework.framework_output,
"rdma component destroying window with id %d", "pt2pt component destroying window with id %d",
ompi_comm_get_cid(module->comm)); ompi_comm_get_cid(module->comm));
/* finish with a barrier */ /* finish with a barrier */
@ -67,43 +66,38 @@ ompi_osc_rdma_free(ompi_win_t *win)
} }
/* remove from component information */ /* remove from component information */
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock); OPAL_THREAD_SCOPED_LOCK(&mca_osc_pt2pt_component.lock,
opal_hash_table_remove_value_uint32(&mca_osc_rdma_component.modules, opal_hash_table_remove_value_uint32(&mca_osc_pt2pt_component.modules,
ompi_comm_get_cid(module->comm)); ompi_comm_get_cid(module->comm)));
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
} }
win->w_osc_module = NULL; win->w_osc_module = NULL;
OBJ_DESTRUCT(&module->outstanding_locks); OBJ_DESTRUCT(&module->outstanding_locks);
OBJ_DESTRUCT(&module->locks_pending); OBJ_DESTRUCT(&module->locks_pending);
OBJ_DESTRUCT(&module->locks_pending_lock);
OBJ_DESTRUCT(&module->acc_lock); OBJ_DESTRUCT(&module->acc_lock);
OBJ_DESTRUCT(&module->cond); OBJ_DESTRUCT(&module->cond);
OBJ_DESTRUCT(&module->lock); OBJ_DESTRUCT(&module->lock);
/* it is erroneous to close a window with active operations on it so we should /* it is erroneous to close a window with active operations on it so we should
* probably produce an error here instead of cleaning up */ * probably produce an error here instead of cleaning up */
while (NULL != (item = opal_list_remove_first (&module->pending_acc))) { OPAL_LIST_DESTRUCT(&module->pending_acc);
OBJ_RELEASE(item); OPAL_LIST_DESTRUCT(&module->pending_posts);
} OPAL_LIST_DESTRUCT(&module->queued_frags);
OBJ_DESTRUCT(&module->queued_frags_lock);
OBJ_DESTRUCT(&module->pending_acc); osc_pt2pt_gc_clean (module);
OPAL_LIST_DESTRUCT(&module->request_gc);
while (NULL != (item = opal_list_remove_first (&module->pending_posts))) { OPAL_LIST_DESTRUCT(&module->buffer_gc);
OBJ_RELEASE(item); OBJ_DESTRUCT(&module->gc_lock);
}
OBJ_DESTRUCT(&module->pending_posts);
osc_rdma_gc_clean ();
if (NULL != module->peers) { if (NULL != module->peers) {
free(module->peers); free(module->peers);
} }
if (NULL != module->passive_eager_send_active) free(module->passive_eager_send_active);
if (NULL != module->passive_incoming_frag_count) free(module->passive_incoming_frag_count);
if (NULL != module->passive_incoming_frag_signal_count) free(module->passive_incoming_frag_signal_count);
if (NULL != module->epoch_outgoing_frag_count) free(module->epoch_outgoing_frag_count); if (NULL != module->epoch_outgoing_frag_count) free(module->epoch_outgoing_frag_count);
if (NULL != module->frag_request) { if (NULL != module->frag_request) {
module->frag_request->req_complete_cb = NULL; module->frag_request->req_complete_cb = NULL;
ompi_request_cancel (module->frag_request); ompi_request_cancel (module->frag_request);

925
ompi/mca/osc/pt2pt/osc_pt2pt_passive_target.c Обычный файл
Просмотреть файл

@ -0,0 +1,925 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "osc_pt2pt.h"
#include "osc_pt2pt_header.h"
#include "osc_pt2pt_data_move.h"
#include "osc_pt2pt_frag.h"
#include "mpi.h"
#include "opal/runtime/opal_progress.h"
#include "opal/threads/mutex.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/osc/base/base.h"
#include "opal/include/opal_stdint.h"
static bool ompi_osc_pt2pt_lock_try_acquire (ompi_osc_pt2pt_module_t* module, int source, int lock_type,
uint64_t serial_number);
/* target-side tracking of a lock request */
struct ompi_osc_pt2pt_pending_lock_t {
    opal_list_item_t super;   /* list linkage (locks_pending list) */
    int peer;                 /* rank that requested the lock */
    int lock_type;            /* requested lock type (shared/exclusive) */
    uint64_t lock_ptr;        /* origin-side lock object pointer, echoed in the ack */
};
typedef struct ompi_osc_pt2pt_pending_lock_t ompi_osc_pt2pt_pending_lock_t;
OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_pending_lock_t, opal_list_item_t,
                   NULL, NULL);
/* origin-side tracking of a lock request */
struct ompi_osc_pt2pt_outstanding_lock_t {
    opal_list_item_t super;        /* list linkage (outstanding_locks list) */
    int target;                    /* rank the lock was requested on */
    int assert;                    /* MPI assert flags supplied with the lock */
    bool flushing;                 /* a flush is in progress on this lock */
    int32_t lock_acks_received;    /* count of lock acks received so far */
    int32_t unlock_acks_received;  /* count of unlock acks received so far */
    int32_t flush_acks_received;   /* count of flush acks received so far */
    uint64_t serial_number;        /* matches flush requests with their acks */
    int32_t type;                  /* lock type (shared/exclusive) */
};
typedef struct ompi_osc_pt2pt_outstanding_lock_t ompi_osc_pt2pt_outstanding_lock_t;
OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_outstanding_lock_t, opal_list_item_t,
                   NULL, NULL);
static int ompi_osc_activate_next_lock (ompi_osc_pt2pt_module_t *module);
static inline int queue_lock (ompi_osc_pt2pt_module_t *module, int requestor, int lock_type, uint64_t lock_ptr);
static int ompi_osc_pt2pt_flush_lock (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_outstanding_lock_t *lock,
int target);
/**
 * Locate the first outstanding lock directed at a target.
 *
 * @param[in] module - OSC PT2PT module
 * @param[in] target - Target rank
 *
 * @returns the matching lock on success, NULL when none exists
 *
 * Scans module->outstanding_locks for a lock whose target matches.
 * The caller must hold the module lock.
 */
static inline ompi_osc_pt2pt_outstanding_lock_t *find_outstanding_lock_st (ompi_osc_pt2pt_module_t *module, int target)
{
    ompi_osc_pt2pt_outstanding_lock_t *candidate;

    OPAL_LIST_FOREACH(candidate, &module->outstanding_locks, ompi_osc_pt2pt_outstanding_lock_t) {
        if (candidate->target == target) {
            return candidate;
        }
    }

    return NULL;
}
/* Thread-safe wrapper around find_outstanding_lock_st: holds the module
 * lock for the duration of the list scan. */
static inline ompi_osc_pt2pt_outstanding_lock_t *find_outstanding_lock (ompi_osc_pt2pt_module_t *module, int target)
{
    ompi_osc_pt2pt_outstanding_lock_t *match;

    OPAL_THREAD_LOCK(&module->lock);
    match = find_outstanding_lock_st (module, target);
    OPAL_THREAD_UNLOCK(&module->lock);

    return match;
}
/* Find the outstanding lock whose serial number matches. Serial numbers
 * are used to pair flush acks with their lock (see process_flush_ack).
 * Takes the module lock internally; returns NULL when no match exists. */
static inline ompi_osc_pt2pt_outstanding_lock_t *find_outstanding_lock_by_serial (ompi_osc_pt2pt_module_t *module, uint64_t serial_number)
{
    ompi_osc_pt2pt_outstanding_lock_t *item, *match = NULL;

    OPAL_THREAD_LOCK(&module->lock);
    OPAL_LIST_FOREACH(item, &module->outstanding_locks, ompi_osc_pt2pt_outstanding_lock_t) {
        if (serial_number == item->serial_number) {
            match = item;
            break;
        }
    }
    OPAL_THREAD_UNLOCK(&module->lock);

    return match;
}
/**
 * Acquire the window lock on the local process.
 *
 * @param[in] module - OSC PT2PT module
 * @param[in] lock   - Outstanding lock being acquired
 *
 * Tries to take the local lock immediately; if another lock holds the
 * window, the request is queued and this function blocks until the ack
 * arrives (MPI requires a local lock to be acquired before returning).
 *
 * Fix: corrected the misspelled verbose message ("aquired").
 */
static inline int ompi_osc_pt2pt_lock_self (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_outstanding_lock_t *lock)
{
    const int my_rank = ompi_comm_rank (module->comm);
    bool acquired = false;

    acquired = ompi_osc_pt2pt_lock_try_acquire (module, my_rank, lock->type, (uint64_t) (uintptr_t) lock);
    if (!acquired) {
        /* queue the lock */
        queue_lock (module, my_rank, lock->type, (uint64_t) (uintptr_t) lock);

        /* If locking local, can't be non-blocking according to the
           standard. We need to wait for the ack here. */
        OPAL_THREAD_LOCK(&module->lock);
        while (0 == lock->lock_acks_received) {
            opal_condition_wait(&module->cond, &module->lock);
        }
        OPAL_THREAD_UNLOCK(&module->lock);
    }

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "local lock acquired"));

    return OMPI_SUCCESS;
}
/**
 * Release the window lock held by the local process.
 *
 * module->lock_status encodes the lock state: -1 means an exclusive
 * lock is held, positive values count shared holders (see
 * ompi_osc_pt2pt_lock_try_acquire, which does cmpset(0, -1) for
 * exclusive and atomic increments for shared).
 */
static inline void ompi_osc_pt2pt_unlock_self (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_outstanding_lock_t *lock)
{
    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_unlock_self: unlocking myself. lock state = %d", module->lock_status));

    if (MPI_LOCK_EXCLUSIVE == lock->type) {
        /* exclusive release: -1 -> 0, then hand the lock to any queued requests */
        OPAL_THREAD_ADD32(&module->lock_status, 1);
        ompi_osc_activate_next_lock (module);
    } else if (0 == OPAL_THREAD_ADD32(&module->lock_status, -1)) {
        /* last shared holder released the lock */
        ompi_osc_activate_next_lock (module);
    }

    /* need to ensure we make progress */
    opal_progress();

    OPAL_THREAD_ADD32(&lock->unlock_acks_received, 1);
}
/* Send a lock request to a remote target. The lock pointer is carried
 * in the header so the target can echo it back in the ack. After
 * queueing the control message the fragment queue for the target is
 * flushed so the request actually goes out on the wire. */
static inline int ompi_osc_pt2pt_lock_remote (ompi_osc_pt2pt_module_t *module, int target, ompi_osc_pt2pt_outstanding_lock_t *lock)
{
    ompi_osc_pt2pt_header_lock_t req;
    int rc;

    /* generate a lock request */
    req.base.type  = OMPI_OSC_PT2PT_HDR_TYPE_LOCK_REQ;
    req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID | OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET;
    req.lock_type  = lock->type;
    req.lock_ptr   = (uint64_t) (uintptr_t) lock;

    rc = ompi_osc_pt2pt_control_send (module, target, &req, sizeof (req));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    /* make sure the request gets sent, so we can start eager sending... */
    return ompi_osc_pt2pt_frag_flush_target (module, target);
}
/* Send an unlock request to a remote target. The number of fragments
 * sent this epoch is swapped out atomically (the counter is replaced
 * with -1) and carried in the header so the target knows how many
 * fragments must complete before it may release the lock. */
static inline int ompi_osc_pt2pt_unlock_remote (ompi_osc_pt2pt_module_t *module, int target, ompi_osc_pt2pt_outstanding_lock_t *lock)
{
    ompi_osc_pt2pt_header_unlock_t req;
    int32_t outgoing = opal_atomic_swap_32 ((int32_t *) module->epoch_outgoing_frag_count + target, -1);

    req.base.type  = OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_REQ;
    req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID | OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET;
    req.frag_count = outgoing;
    req.lock_type  = lock->type;
    req.lock_ptr   = (uint64_t) (uintptr_t) lock;

    /* send control message with unlock request and count */
    return ompi_osc_pt2pt_control_send (module, target, &req, sizeof (req));
}
/**
 * Issue the lock messages for an outstanding lock.
 *
 * @param[in] module - OSC PT2PT module
 * @param[in] lock   - Outstanding lock (target == -1 means lock_all)
 *
 * Without MPI_MODE_NOCHECK: locks self first (also the lock_all case),
 * then, for lock_all, sends lock requests to every other rank. With
 * MPI_MODE_NOCHECK no messages are sent; the acks are simply counted
 * as already received.
 */
static int ompi_osc_pt2pt_lock_internal_execute (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_outstanding_lock_t *lock)
{
    int my_rank = ompi_comm_rank (module->comm);
    int target = lock->target;
    int assert = lock->assert;
    int ret;

    if (0 == (assert & MPI_MODE_NOCHECK)) {
        /* lock_all (-1) and self-lock both start with the local lock */
        if (my_rank != target && target != -1) {
            ret = ompi_osc_pt2pt_lock_remote (module, target, lock);
        } else {
            ret = ompi_osc_pt2pt_lock_self (module, lock);
        }

        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            /* return */
            return ret;
        }

        if (-1 == target) {
            /* lock_all: request the lock from every other rank */
            for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
                if (my_rank == i) {
                    continue;
                }

                ret = ompi_osc_pt2pt_lock_remote (module, i, lock);
                if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                    return ret;
                }
            }
        }
    } else {
        /* MPI_MODE_NOCHECK: no handshake needed, mark all acks received */
        if (-1 == target) {
            lock->lock_acks_received = ompi_comm_size(module->comm);
        } else {
            lock->lock_acks_received = 1;
        }
    }

    return OMPI_SUCCESS;
}
/**
 * Common implementation of MPI_Win_lock and MPI_Win_lock_all.
 *
 * @param[in] lock_type - MPI_LOCK_SHARED or MPI_LOCK_EXCLUSIVE
 * @param[in] target    - Target rank, or -1 for lock_all
 * @param[in] assert    - MPI assert flags (e.g. MPI_MODE_NOCHECK)
 * @param[in] win       - MPI window
 *
 * Creates the origin-side lock tracking object, registers it on
 * module->outstanding_locks, marks the access epoch, and sends the
 * lock message(s). On failure the lock object is removed and freed.
 */
static int ompi_osc_pt2pt_lock_internal (int lock_type, int target, int assert, ompi_win_t *win)
{
    ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
    ompi_osc_pt2pt_outstanding_lock_t *lock;
    ompi_osc_pt2pt_peer_t *peer = NULL;
    int ret = OMPI_SUCCESS;

    if (-1 != target) {
        peer = module->peers + target;
    }

    /* can not lock while a PSCW (start/complete) group is active.
     * TODO: we also need to track whether we are in an active target
     * epoch generally; fence can make this tricky to track. */
    if (module->sc_group) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: lock %d %d", target, lock_type));

    /* create lock item */
    lock = OBJ_NEW(ompi_osc_pt2pt_outstanding_lock_t);
    if (OPAL_UNLIKELY(NULL == lock)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    lock->target = target;
    lock->lock_acks_received = 0;
    lock->unlock_acks_received = 0;
    lock->serial_number = OPAL_THREAD_ADD64((int64_t *) &module->lock_serial_number, 1);
    lock->type = lock_type;
    lock->assert = assert;

    /* delay all eager sends until we've heard back.. */
    OPAL_THREAD_LOCK(&module->lock);

    /* check for conflicting lock */
    if (find_outstanding_lock_st (module, target)) {
        OBJ_RELEASE(lock);
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_CONFLICT;
    }

    /* when the lock ack returns we will be in an access epoch with this peer/all peers (target = -1) */
    if (-1 == target) {
        module->all_access_epoch = true;
    } else {
        peer->access_epoch = true;
    }
    module->passive_target_access_epoch = true;

    opal_list_append(&module->outstanding_locks, &lock->super);
    OPAL_THREAD_UNLOCK(&module->lock);

    ret = ompi_osc_pt2pt_lock_internal_execute (module, lock);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        /* back out the registration on failure */
        OPAL_THREAD_SCOPED_LOCK(&module->lock,
                                opal_list_remove_item(&module->outstanding_locks, &lock->super));
        OBJ_RELEASE(lock);
    }

    return ret;
}
/**
 * Common implementation of MPI_Win_unlock and MPI_Win_unlock_all.
 *
 * @param[in] target - Target rank, or -1 for unlock_all
 * @param[in] win    - MPI window
 *
 * Waits for all lock acks, sends unlock requests carrying the epoch
 * fragment counts, flushes outgoing fragments, then waits for unlock
 * acks (remote completion) before tearing down the epoch state. With
 * MPI_MODE_NOCHECK only a flush is performed since no lock messages
 * were exchanged.
 */
static int ompi_osc_pt2pt_unlock_internal (int target, ompi_win_t *win)
{
    ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
    ompi_osc_pt2pt_outstanding_lock_t *lock = NULL;
    int my_rank = ompi_comm_rank (module->comm);
    ompi_osc_pt2pt_peer_t *peer = NULL;
    int lock_acks_expected;
    int ret = OMPI_SUCCESS;

    if (-1 != target) {
        lock_acks_expected = 1;
        peer = module->peers + target;
    } else {
        lock_acks_expected = ompi_comm_size (module->comm);
    }

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_unlock_internal: unlocking target %d", target));

    OPAL_THREAD_LOCK(&module->lock);
    lock = find_outstanding_lock_st (module, target);
    if (OPAL_UNLIKELY(NULL == lock)) {
        /* unlock without a matching lock is an RMA synchronization error */
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_pt2pt_unlock: target %d is not locked in window %s",
                             target, win->w_name));
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }
    opal_list_remove_item (&module->outstanding_locks, &lock->super);

    /* wait until ack has arrived from target */
    while (lock->lock_acks_received != lock_acks_expected) {
        opal_condition_wait(&module->cond, &module->lock);
    }
    OPAL_THREAD_UNLOCK(&module->lock);

    if (lock->assert & MPI_MODE_NOCHECK) {
        /* flush instead */
        ompi_osc_pt2pt_flush_lock (module, lock, target);
    } else if (my_rank != target) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "osc pt2pt: unlock %d, lock_acks_received = %d", target,
                             lock->lock_acks_received));

        if (-1 == target) {
            /* send unlock messages to all of my peers */
            for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
                if (my_rank == i) {
                    continue;
                }

                ret = ompi_osc_pt2pt_unlock_remote (module, i, lock);
                if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                    return ret;
                }
            }
        } else {
            ret = ompi_osc_pt2pt_unlock_remote (module, target, lock);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                return ret;
            }
        }

        /* start all sendreqs to target */
        if (-1 == target) {
            ret = ompi_osc_pt2pt_frag_flush_all (module);
        } else {
            ret = ompi_osc_pt2pt_frag_flush_target(module, target);
        }
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            return ret;
        }

        /* wait for unlock acks. this signals remote completion of fragments */
        OPAL_THREAD_LOCK(&module->lock);
        while (lock->unlock_acks_received != lock_acks_expected) {
            opal_condition_wait(&module->cond, &module->lock);
        }
        OPAL_THREAD_UNLOCK(&module->lock);

        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_pt2pt_unlock: unlock of %d complete", target));
    }

    /* release the local lock last (self-unlock or unlock_all) */
    if ((target == my_rank || target == -1) && !(lock->assert & MPI_MODE_NOCHECK)) {
        ompi_osc_pt2pt_unlock_self (module, lock);
    }

    /* tear down the access epoch state */
    OPAL_THREAD_LOCK(&module->lock);
    if (-1 != target) {
        peer->access_epoch = false;
        module->passive_target_access_epoch = false;
    } else {
        module->passive_target_access_epoch = false;
        module->all_access_epoch = false;
    }
    OPAL_THREAD_UNLOCK(&module->lock);

    OBJ_RELEASE(lock);

    return ret;
}
/* MPI_Win_lock entry point. NOTE(review): the parameter is named
 * `assert`, the same identifier as the <assert.h> macro; the macro call
 * below still expands correctly (function-like macros only expand when
 * followed by '('), but the name is worth confirming against project
 * convention. */
int ompi_osc_pt2pt_lock(int lock_type, int target, int assert, ompi_win_t *win)
{
    assert(target >= 0);

    return ompi_osc_pt2pt_lock_internal (lock_type, target, assert, win);
}
/* MPI_Win_unlock entry point: thin wrapper over the common unlock path */
int ompi_osc_pt2pt_unlock (int target, struct ompi_win_t *win)
{
    return ompi_osc_pt2pt_unlock_internal (target, win);
}
/* MPI_Win_lock_all entry point: a shared lock on all ranks (target -1) */
int ompi_osc_pt2pt_lock_all(int assert, struct ompi_win_t *win)
{
    return ompi_osc_pt2pt_lock_internal (MPI_LOCK_SHARED, -1, assert, win);
}
/* MPI_Win_unlock_all entry point: releases the lock_all (-1) lock */
int ompi_osc_pt2pt_unlock_all (struct ompi_win_t *win)
{
    return ompi_osc_pt2pt_unlock_internal (-1, win);
}
/* MPI_Win_sync entry point: only drives communication progress here;
 * no other local synchronization is performed by this component */
int ompi_osc_pt2pt_sync (struct ompi_win_t *win)
{
    opal_progress();

    return OMPI_SUCCESS;
}
/**
 * Flush all outstanding operations covered by a lock.
 *
 * @param[in] module - OSC PT2PT module
 * @param[in] lock   - Outstanding lock (per-target or lock_all)
 * @param[in] target - Target rank to flush, or -1 for all ranks
 *
 * Sends a flush request (carrying the epoch's outgoing fragment count)
 * to each covered peer, pushes queued fragments, then waits for the
 * matching flush acks, which signal remote completion.
 *
 * Fix: the wait condition below used '&&', which allowed the flush to
 * proceed before all lock acks had arrived whenever no other flush was
 * in progress -- contradicting the comment's intent. It must wait while
 * acks are missing OR another flush is active, hence '||'.
 */
static int ompi_osc_pt2pt_flush_lock (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_outstanding_lock_t *lock,
                                      int target)
{
    ompi_osc_pt2pt_header_flush_t flush_req;
    int peer_count, ret, flush_count;
    int my_rank = ompi_comm_rank (module->comm);

    if (-1 == lock->target) {
        peer_count = ompi_comm_size(module->comm);
    } else {
        peer_count = 1;
    }

    /* wait until ack has arrived from target, since we need to be
       able to eager send before we can transfer all the data... also
       wait for any flush already in progress on this lock */
    OPAL_THREAD_LOCK(&module->lock);
    while (peer_count > lock->lock_acks_received || lock->flushing) {
        opal_condition_wait(&module->cond, &module->lock);
    }

    lock->flush_acks_received = 0;
    lock->flushing = true;
    OPAL_THREAD_UNLOCK(&module->lock);

    flush_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_REQ;
    flush_req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID | OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET;
    flush_req.serial_number = lock->serial_number;

    if (-1 == target) {
        /* NTH: no local flush */
        flush_count = ompi_comm_size(module->comm) - 1;
        for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
            if (i == my_rank) {
                continue;
            }

            flush_req.frag_count = opal_atomic_swap_32 ((int32_t *) module->epoch_outgoing_frag_count + i, -1);

            /* send control message with flush request and count */
            ret = ompi_osc_pt2pt_control_send (module, i, &flush_req, sizeof (flush_req));
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                return ret;
            }

            /* start all sendreqs to target */
            ret = ompi_osc_pt2pt_frag_flush_target (module, i);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                return ret;
            }
        }
    } else {
        flush_req.frag_count = opal_atomic_swap_32 ((int32_t *) module->epoch_outgoing_frag_count + target, -1);
        flush_count = 1;

        /* send control message with flush request and count */
        ret = ompi_osc_pt2pt_control_send (module, target, &flush_req, sizeof (flush_req));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            return ret;
        }

        /* start all sendreqs to target */
        ret = ompi_osc_pt2pt_frag_flush_target (module, target);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            return ret;
        }
    }

    /* wait for all the requests and the flush ack (meaning remote completion) */
    OPAL_THREAD_LOCK(&module->lock);
    while (flush_count != lock->flush_acks_received) {
        opal_condition_wait(&module->cond, &module->lock);
    }

    lock->flushing = false;
    opal_condition_broadcast(&module->cond);
    OPAL_THREAD_UNLOCK(&module->lock);

    return OMPI_SUCCESS;
}
/* MPI_Win_flush entry point. Finds the lock covering the target (a
 * per-target lock, or the lock_all lock as a fallback) and flushes it.
 * Flushing the local rank is a no-op beyond driving progress. */
int ompi_osc_pt2pt_flush (int target, struct ompi_win_t *win)
{
    ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
    ompi_osc_pt2pt_outstanding_lock_t *lock;

    assert (0 <= target);

    /* flush is only allowed from within a passive target epoch */
    if (!module->passive_target_access_epoch) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_flush starting..."));

    if (target == ompi_comm_rank (module->comm)) {
        /* nothing to flush */
        opal_progress ();
        return OMPI_SUCCESS;
    }

    /* prefer a per-target lock; fall back to the lock_all lock */
    lock = find_outstanding_lock (module, target);
    if (NULL == lock) {
        lock = find_outstanding_lock (module, -1);
    }

    if (OPAL_UNLIKELY(NULL == lock)) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_pt2pt_flush: target %d is not locked in window %s",
                             target, win->w_name));
        return OMPI_ERR_RMA_SYNC;
    }

    return ompi_osc_pt2pt_flush_lock (module, lock, target);
}
/* MPI_Win_flush_all entry point: flushes every outstanding lock.
 * NOTE(review): outstanding_locks is traversed here without holding
 * module->lock, unlike the find_outstanding_lock* helpers -- confirm
 * this is safe against concurrent lock/unlock. */
int ompi_osc_pt2pt_flush_all (struct ompi_win_t *win)
{
    ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
    ompi_osc_pt2pt_outstanding_lock_t *lock;
    int ret = OMPI_SUCCESS;

    /* flush is only allowed from within a passive target epoch */
    if (OPAL_UNLIKELY(!module->passive_target_access_epoch ||
                      0 == opal_list_get_size (&module->outstanding_locks))) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_pt2pt_flush_all: no targets are locked in window %s",
                             win->w_name));
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_flush_all entering..."));

    /* flush all locks */
    OPAL_LIST_FOREACH(lock, &module->outstanding_locks, ompi_osc_pt2pt_outstanding_lock_t) {
        ret = ompi_osc_pt2pt_flush_lock (module, lock, lock->target);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            break;
        }
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_flush_all complete"));

    return ret;
}
/* MPI_Win_flush_local entry point: pushes queued fragments for the
 * target and waits until every signalled outgoing fragment has
 * completed locally (local completion only; no remote ack). */
int ompi_osc_pt2pt_flush_local (int target, struct ompi_win_t *win)
{
    ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
    int rc;

    /* flush is only allowed from within a passive target epoch */
    if (!module->passive_target_access_epoch) {
        return OMPI_ERR_RMA_SYNC;
    }

    rc = ompi_osc_pt2pt_frag_flush_target(module, target);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    /* block until all signalled fragments have gone out */
    OPAL_THREAD_LOCK(&module->lock);
    while (module->outgoing_frag_count != module->outgoing_frag_signal_count) {
        opal_condition_wait(&module->cond, &module->lock);
    }
    OPAL_THREAD_UNLOCK(&module->lock);

    return OMPI_SUCCESS;
}
/* MPI_Win_flush_local_all entry point: same as flush_local but pushes
 * fragments for every peer before waiting for local completion. */
int ompi_osc_pt2pt_flush_local_all (struct ompi_win_t *win)
{
    ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
    int rc;

    /* flush is only allowed from within a passive target epoch */
    if (!module->passive_target_access_epoch) {
        return OMPI_ERR_RMA_SYNC;
    }

    rc = ompi_osc_pt2pt_frag_flush_all(module);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    /* block until all signalled fragments have gone out */
    OPAL_THREAD_LOCK(&module->lock);
    while (module->outgoing_frag_count != module->outgoing_frag_signal_count) {
        opal_condition_wait(&module->cond, &module->lock);
    }
    OPAL_THREAD_UNLOCK(&module->lock);

    return OMPI_SUCCESS;
}
/* target side operation to acknowledge to initiator side that the
   lock is now held by the initiator. For a remote requestor a lock-ack
   control message is sent; for the local rank the origin-side lock
   object (recovered from lock_ptr) is updated directly.

   Fix: the original logged a NULL lock_ptr but then dereferenced it
   anyway; we now return an error instead of crashing. */
static inline int activate_lock (ompi_osc_pt2pt_module_t *module, int requestor,
                                 uint64_t lock_ptr)
{
    ompi_osc_pt2pt_outstanding_lock_t *lock;

    if (ompi_comm_rank (module->comm) != requestor) {
        ompi_osc_pt2pt_header_lock_ack_t lock_ack;

        lock_ack.base.type = OMPI_OSC_PT2PT_HDR_TYPE_LOCK_ACK;
        lock_ack.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
        lock_ack.source = ompi_comm_rank(module->comm);
        lock_ack.windx = ompi_comm_get_cid(module->comm);
        lock_ack.lock_ptr = lock_ptr;

        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "osc pt2pt: sending lock to %d", requestor));

        /* we don't want to send any data, since we're the exposure
           epoch only, so use an unbuffered send */
        return ompi_osc_pt2pt_control_send_unbuffered (module, requestor, &lock_ack, sizeof (lock_ack));
    }

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: releasing local lock"));

    lock = (ompi_osc_pt2pt_outstanding_lock_t *) (uintptr_t) lock_ptr;
    if (OPAL_UNLIKELY(NULL == lock)) {
        OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output,
                             "lock could not be located"));
        /* do not fall through and dereference a NULL pointer */
        return OMPI_ERROR;
    }

    OPAL_THREAD_ADD32(&lock->lock_acks_received, 1);
    opal_condition_broadcast (&module->cond);

    return OMPI_SUCCESS;
}
/* target side operation to create a pending lock request for a lock
   request that could not be satisfied immediately. Returns
   OMPI_ERR_OUT_OF_RESOURCE if the pending entry cannot be allocated. */
static inline int queue_lock (ompi_osc_pt2pt_module_t *module, int requestor,
                              int lock_type, uint64_t lock_ptr)
{
    ompi_osc_pt2pt_pending_lock_t *entry = OBJ_NEW(ompi_osc_pt2pt_pending_lock_t);

    if (NULL == entry) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    entry->peer = requestor;
    entry->lock_type = lock_type;
    entry->lock_ptr = lock_ptr;

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: queueing lock request from %d", requestor));

    OPAL_THREAD_SCOPED_LOCK(&module->locks_pending_lock, opal_list_append(&module->locks_pending, &entry->super));

    return OMPI_SUCCESS;
}
/**
 * Try to take the window lock on behalf of a requestor.
 *
 * module->lock_status: -1 = exclusive lock held, >0 = number of shared
 * holders, 0 = free (see the cmpset calls below).
 *
 * @returns true if the lock was taken and activated (ack sent or local
 *          lock object updated), false if the request must be queued.
 */
static bool ompi_osc_pt2pt_lock_try_acquire (ompi_osc_pt2pt_module_t* module, int source, int lock_type, uint64_t lock_ptr)
{
    bool queue = false;

    if (MPI_LOCK_SHARED == lock_type) {
        int32_t lock_status = module->lock_status;
        /* CAS loop: retry until we either see an exclusive holder
         * (negative status -> queue) or successfully increment the
         * shared-holder count */
        do {
            if (lock_status < 0) {
                queue = true;
                break;
            }

            if (opal_atomic_cmpset_32 (&module->lock_status, lock_status, lock_status + 1)) {
                break;
            }

            lock_status = module->lock_status;
        } while (1);
    } else {
        /* exclusive: only take the lock when it is completely free (0 -> -1) */
        queue = !opal_atomic_cmpset_32 (&module->lock_status, 0, -1);
    }

    if (queue) {
        return false;
    }

    activate_lock(module, source, lock_ptr);

    /* activated the lock */
    return true;
}
/* Grant as many queued lock requests as the current lock state allows.
 * Called after a lock release. Stops at the first pending request that
 * cannot be satisfied (preserves FIFO fairness); granted entries are
 * removed and freed. Runs with locks_pending_lock held. */
static int ompi_osc_activate_next_lock (ompi_osc_pt2pt_module_t *module) {
    /* release any other pending locks we can */
    ompi_osc_pt2pt_pending_lock_t *pending_lock, *next;
    int ret = OMPI_SUCCESS;

    OPAL_THREAD_LOCK(&module->locks_pending_lock);
    OPAL_LIST_FOREACH_SAFE(pending_lock, next, &module->locks_pending,
                           ompi_osc_pt2pt_pending_lock_t) {
        bool acquired = ompi_osc_pt2pt_lock_try_acquire (module, pending_lock->peer, pending_lock->lock_type,
                                                         pending_lock->lock_ptr);
        if (!acquired) {
            break;
        }

        opal_list_remove_item (&module->locks_pending, &pending_lock->super);
        OBJ_RELEASE(pending_lock);
    }
    OPAL_THREAD_UNLOCK(&module->locks_pending_lock);

    return ret;
}
/* target side function called when the initiator sends a lock
   request. Lock will either be activated and acknowledged or
   queued.

   Fix: the return value of queue_lock (which can fail with
   OMPI_ERR_OUT_OF_RESOURCE) was previously discarded; it is now
   propagated to the caller. */
int ompi_osc_pt2pt_process_lock (ompi_osc_pt2pt_module_t* module, int source,
                                 ompi_osc_pt2pt_header_lock_t* lock_header)
{
    bool acquired;

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_process_lock: processing lock request from %d. current lock state = %d",
                         source, module->lock_status));

    acquired = ompi_osc_pt2pt_lock_try_acquire (module, source, lock_header->lock_type, lock_header->lock_ptr);
    if (!acquired) {
        return queue_lock(module, source, lock_header->lock_type, lock_header->lock_ptr);
    }

    return OMPI_SUCCESS;
}
/* initiator-side function called when the target acks the lock
   request. Recovers the origin-side lock object from the echoed
   lock_ptr, enables eager sends to the peer, and wakes any waiters.

   Fix: the verbose message previously named the wrong function
   ("process_unlock_ack"). */
void ompi_osc_pt2pt_process_lock_ack (ompi_osc_pt2pt_module_t *module,
                                      ompi_osc_pt2pt_header_lock_ack_t *lock_ack_header)
{
    ompi_osc_pt2pt_peer_t *peer = module->peers + lock_ack_header->source;
    ompi_osc_pt2pt_outstanding_lock_t *lock;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_process_lock_ack: processing lock ack from %d for lock %" PRIu64,
                         lock_ack_header->source, lock_ack_header->lock_ptr));

    lock = (ompi_osc_pt2pt_outstanding_lock_t *) (uintptr_t) lock_ack_header->lock_ptr;
    assert (NULL != lock);

    /* no need to hold the lock to set this */
    peer->eager_send_active = true;

    OPAL_THREAD_ADD32(&lock->lock_acks_received, 1);
    opal_condition_broadcast(&module->cond);
}
/* initiator-side function called when the target acks a flush request.
   The lock is located by the serial number echoed in the ack.

   Fix: the verbose message previously named the wrong function
   ("process_unlock_ack"). */
void ompi_osc_pt2pt_process_flush_ack (ompi_osc_pt2pt_module_t *module, int source,
                                       ompi_osc_pt2pt_header_flush_ack_t *flush_ack_header) {
    ompi_osc_pt2pt_outstanding_lock_t *lock;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_process_flush_ack: processing flush ack from %d for lock %" PRIu64,
                         source, flush_ack_header->serial_number));

    /* NTH: need to verify that this will work as expected */
    lock = find_outstanding_lock_by_serial (module, flush_ack_header->serial_number);
    assert (NULL != lock);

    OPAL_THREAD_ADD32(&lock->flush_acks_received, 1);
    opal_condition_broadcast(&module->cond);
}
/* initiator-side function called when the target acks the unlock
   request (all fragments remotely complete).

   Fix: the broadcast was gated on the incremented counter being 0,
   which an increment from a non-negative count never produces -- the
   waiter in ompi_osc_pt2pt_unlock_internal waits for the count to
   reach a positive lock_acks_expected and could miss its wake-up.
   Broadcast unconditionally, matching process_lock_ack. */
void ompi_osc_pt2pt_process_unlock_ack (ompi_osc_pt2pt_module_t *module, int source,
                                        ompi_osc_pt2pt_header_unlock_ack_t *unlock_ack_header)
{
    ompi_osc_pt2pt_peer_t *peer = module->peers + source;
    ompi_osc_pt2pt_outstanding_lock_t *lock;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_process_unlock_ack: processing unlock ack from %d",
                         source));

    /* NTH: need to verify that this will work as expected */
    lock = (ompi_osc_pt2pt_outstanding_lock_t *) (intptr_t) unlock_ack_header->lock_ptr;
    assert (NULL != lock);

    peer->eager_send_active = false;

    OPAL_THREAD_ADD32(&lock->unlock_acks_received, 1);
    opal_condition_broadcast(&module->cond);
}
/**
 * Process an unlock request.
 *
 * @param[in] module - OSC PT2PT module
 * @param[in] source - Source rank
 * @param[in] unlock_header - Incoming unlock header
 *
 * This functions is the target-side function for handling an unlock
 * request. Once all pending operations from the target are complete
 * this functions sends an unlock acknowledgement then attempts to
 * active a pending lock if the lock becomes free.
 */
int ompi_osc_pt2pt_process_unlock (ompi_osc_pt2pt_module_t *module, int source,
                                   ompi_osc_pt2pt_header_unlock_t *unlock_header)
{
    ompi_osc_pt2pt_header_unlock_ack_t unlock_ack;
    ompi_osc_pt2pt_peer_t *peer = module->peers + source;
    int ret;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_process_unlock entering (passive_incoming_frag_count: %d)...",
                         peer->passive_incoming_frag_count));

    /* we cannot block when processing an incoming request */
    if (0 != peer->passive_incoming_frag_count) {
        return OMPI_ERR_WOULD_BLOCK;
    }

    /* ack remote completion back to the origin (echo the lock pointer) */
    unlock_ack.base.type = OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_ACK;
    unlock_ack.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
    unlock_ack.lock_ptr = unlock_header->lock_ptr;

    ret = ompi_osc_pt2pt_control_send_unbuffered (module, source, &unlock_ack, sizeof (unlock_ack));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    /* release the lock: -1 (exclusive) -> 0, or decrement the shared
     * count; in either case hand the lock to queued requests when free */
    if (-1 == module->lock_status) {
        OPAL_THREAD_ADD32(&module->lock_status, 1);
        ompi_osc_activate_next_lock (module);
    } else if (0 == OPAL_THREAD_ADD32(&module->lock_status, -1)) {
        ompi_osc_activate_next_lock (module);
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: finished processing unlock fragment"));

    return ret;
}
/* Target-side handler for an incoming flush request. Defers (returns
 * OMPI_ERR_WOULD_BLOCK) while fragments from the origin are still
 * pending; otherwise acks remote completion, echoing the serial number
 * so the origin can match the ack to its lock. */
int ompi_osc_pt2pt_process_flush (ompi_osc_pt2pt_module_t *module, int source,
                                  ompi_osc_pt2pt_header_flush_t *flush_header)
{
    ompi_osc_pt2pt_peer_t *peer = module->peers + source;
    ompi_osc_pt2pt_header_flush_ack_t ack;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_process_flush entering (passive_incoming_frag_count: %d)...",
                         peer->passive_incoming_frag_count));

    /* we cannot block when processing an incoming request */
    if (0 != peer->passive_incoming_frag_count) {
        return OMPI_ERR_WOULD_BLOCK;
    }

    ack.base.type = OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_ACK;
    ack.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
    ack.serial_number = flush_header->serial_number;

    return ompi_osc_pt2pt_control_send_unbuffered (module, source, &ack, sizeof (ack));
}

Просмотреть файл

@ -1,9 +1,12 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2013 Sandia National Laboratories. All rights reserved. * Copyright (c) 2013 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
* *
* $HEADER$ * $HEADER$
* *
* Pending frags are fragments that have been received on the target, * Pending frags are fragments that have been received on the target,
@ -14,35 +17,35 @@
* message. * message.
*/ */
#ifndef OSC_RDMA_PENDING_FRAG_H #ifndef OSC_PT2PT_PENDING_FRAG_H
#define OSC_RDMA_PENDING_FRAG_H #define OSC_PT2PT_PENDING_FRAG_H
/** Incoming fragment that has to be queued */ /** Incoming fragment that has to be queued */
struct ompi_osc_rdma_pending_frag_t { struct ompi_osc_pt2pt_pending_frag_t {
opal_list_item_t super; opal_list_item_t super;
/* This is a pointer to the top of the fragment (which is always /* This is a pointer to the top of the fragment (which is always
the header). Save as a header to make the casting a bit less the header). Save as a header to make the casting a bit less
onerous during sequence number lookups. */ onerous during sequence number lookups. */
ompi_osc_rdma_frag_header_t *header; ompi_osc_pt2pt_frag_header_t *header;
}; };
typedef struct ompi_osc_rdma_pending_frag_t ompi_osc_rdma_pending_frag_t; typedef struct ompi_osc_pt2pt_pending_frag_t ompi_osc_pt2pt_pending_frag_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_pending_frag_t); OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_pending_frag_t);
/* /*
* Note: module lock must be held during this operation * Note: module lock must be held during this operation
*/ */
static inline ompi_osc_rdma_pending_frag_t* static inline ompi_osc_pt2pt_pending_frag_t*
ompi_osc_rdma_pending_frag_create(ompi_osc_rdma_module_t *module, ompi_osc_pt2pt_pending_frag_create(ompi_osc_pt2pt_module_t *module,
void *ptr, void *ptr,
size_t size) size_t size)
{ {
size_t total_size = sizeof(ompi_osc_rdma_pending_frag_t) + size; size_t total_size = sizeof(ompi_osc_pt2pt_pending_frag_t) + size;
ompi_osc_rdma_pending_frag_t *ret = ompi_osc_pt2pt_pending_frag_t *ret =
(ompi_osc_rdma_pending_frag_t*) malloc(total_size); (ompi_osc_pt2pt_pending_frag_t*) malloc(total_size);
if (NULL == ret) return NULL; if (NULL == ret) return NULL;
OBJ_CONSTRUCT(&ret, ompi_osc_rdma_pending_frag_t); OBJ_CONSTRUCT(&ret, ompi_osc_pt2pt_pending_frag_t);
memcpy(ret->header, ptr, size); memcpy(ret->header, ptr, size);
return ret; return ret;
@ -50,11 +53,11 @@ ompi_osc_rdma_pending_frag_create(ompi_osc_rdma_module_t *module,
/* /*
* Note: module lock must be held for this operation * Note: module lock must be held for this operation
*/ */
static inline int static inline int
ompi_osc_rdma_pending_frag_destroy(ompi_osc_rdma_module_t *module, ompi_osc_pt2pt_pending_frag_destroy(ompi_osc_pt2pt_module_t *module,
ompi_osc_rdma_pending_frag_t* frag) ompi_osc_pt2pt_pending_frag_t* frag)
{ {
OBJ_DESTRUCT(&frag); OBJ_DESTRUCT(&frag);
free(frag); free(frag);

Просмотреть файл

@ -1,9 +1,12 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2011-2012 Sandia National Laboratories. All rights reserved. * Copyright (c) 2011-2012 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
* *
* $HEADER$ * $HEADER$
*/ */
@ -14,8 +17,8 @@
#include "ompi/mca/osc/base/base.h" #include "ompi/mca/osc/base/base.h"
#include "ompi/mca/osc/base/osc_base_obj_convert.h" #include "ompi/mca/osc/base/osc_base_obj_convert.h"
#include "osc_rdma.h" #include "osc_pt2pt.h"
#include "osc_rdma_request.h" #include "osc_pt2pt_request.h"
static int static int
request_cancel(struct ompi_request_t *request, int complete) request_cancel(struct ompi_request_t *request, int complete)
@ -26,14 +29,14 @@ request_cancel(struct ompi_request_t *request, int complete)
static int static int
request_free(struct ompi_request_t **ompi_req) request_free(struct ompi_request_t **ompi_req)
{ {
ompi_osc_rdma_request_t *request = ompi_osc_pt2pt_request_t *request =
(ompi_osc_rdma_request_t*) *ompi_req; (ompi_osc_pt2pt_request_t*) *ompi_req;
if (true != request->super.req_complete) { if (true != request->super.req_complete) {
return MPI_ERR_REQUEST; return MPI_ERR_REQUEST;
} }
OMPI_OSC_RDMA_REQUEST_RETURN(request); OMPI_OSC_PT2PT_REQUEST_RETURN(request);
*ompi_req = MPI_REQUEST_NULL; *ompi_req = MPI_REQUEST_NULL;
@ -42,7 +45,7 @@ request_free(struct ompi_request_t **ompi_req)
static static
void void
request_construct(ompi_osc_rdma_request_t *request) request_construct(ompi_osc_pt2pt_request_t *request)
{ {
request->super.req_type = OMPI_REQUEST_WIN; request->super.req_type = OMPI_REQUEST_WIN;
request->super.req_status._cancelled = 0; request->super.req_status._cancelled = 0;
@ -50,7 +53,7 @@ request_construct(ompi_osc_rdma_request_t *request)
request->super.req_cancel = request_cancel; request->super.req_cancel = request_cancel;
} }
OBJ_CLASS_INSTANCE(ompi_osc_rdma_request_t, OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_request_t,
ompi_request_t, ompi_request_t,
request_construct, request_construct,
NULL); NULL);

Просмотреть файл

@ -4,46 +4,46 @@
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights * Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
* *
* $HEADER$ * $HEADER$
*/ */
#ifndef OMPI_OSC_RDMA_REQUEST_H #ifndef OMPI_OSC_PT2PT_REQUEST_H
#define OMPI_OSC_RDMA_REQUEST_H #define OMPI_OSC_PT2PT_REQUEST_H
#include "osc_rdma.h" #include "osc_pt2pt.h"
#include "ompi/request/request.h" #include "ompi/request/request.h"
#include "opal/util/output.h" #include "opal/util/output.h"
struct ompi_osc_rdma_request_t { struct ompi_osc_pt2pt_request_t {
ompi_request_t super; ompi_request_t super;
int type; int type;
void *origin_addr; void *origin_addr;
int origin_count; int origin_count;
struct ompi_datatype_t *origin_dt; struct ompi_datatype_t *origin_dt;
ompi_osc_rdma_module_t* module; ompi_osc_pt2pt_module_t* module;
int outstanding_requests; int32_t outstanding_requests;
bool internal; bool internal;
}; };
typedef struct ompi_osc_rdma_request_t ompi_osc_rdma_request_t; typedef struct ompi_osc_pt2pt_request_t ompi_osc_pt2pt_request_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_request_t); OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_request_t);
/* REQUEST_ALLOC is only called from "top-level" functions (rdma_rput, /* REQUEST_ALLOC is only called from "top-level" functions (pt2pt_rput,
rdma_rget, etc.), so it's ok to spin here... */ pt2pt_rget, etc.), so it's ok to spin here... */
#define OMPI_OSC_RDMA_REQUEST_ALLOC(win, req) \ #define OMPI_OSC_PT2PT_REQUEST_ALLOC(win, req) \
do { \ do { \
ompi_free_list_item_t *item; \ ompi_free_list_item_t *item; \
do { \ do { \
OMPI_FREE_LIST_GET_MT(&mca_osc_rdma_component.requests, item); \ OMPI_FREE_LIST_GET_MT(&mca_osc_pt2pt_component.requests, item); \
if (NULL == item) { \ if (NULL == item) { \
opal_progress(); \ opal_progress(); \
} \ } \
} while (NULL == item); \ } while (NULL == item); \
req = (ompi_osc_rdma_request_t*) item; \ req = (ompi_osc_pt2pt_request_t*) item; \
OMPI_REQUEST_INIT(&req->super, false); \ OMPI_REQUEST_INIT(&req->super, false); \
req->super.req_mpi_object.win = win; \ req->super.req_mpi_object.win = win; \
req->super.req_complete = false; \ req->super.req_complete = false; \
@ -52,14 +52,14 @@ OBJ_CLASS_DECLARATION(ompi_osc_rdma_request_t);
req->internal = false; \ req->internal = false; \
} while (0) } while (0)
#define OMPI_OSC_RDMA_REQUEST_RETURN(req) \ #define OMPI_OSC_PT2PT_REQUEST_RETURN(req) \
do { \ do { \
OMPI_REQUEST_FINI(&(req)->super); \ OMPI_REQUEST_FINI(&(req)->super); \
OMPI_FREE_LIST_RETURN_MT(&mca_osc_rdma_component.requests, \ OMPI_FREE_LIST_RETURN_MT(&mca_osc_pt2pt_component.requests, \
(ompi_free_list_item_t *) (req)); \ (ompi_free_list_item_t *) (req)); \
} while (0) } while (0)
static inline void ompi_osc_rdma_request_complete (ompi_osc_rdma_request_t *request, int mpi_error) static inline void ompi_osc_pt2pt_request_complete (ompi_osc_pt2pt_request_t *request, int mpi_error)
{ {
if (!request->internal) { if (!request->internal) {
request->super.req_status.MPI_ERROR = mpi_error; request->super.req_status.MPI_ERROR = mpi_error;
@ -67,8 +67,8 @@ static inline void ompi_osc_rdma_request_complete (ompi_osc_rdma_request_t *requ
/* mark the request complete at the mpi level */ /* mark the request complete at the mpi level */
ompi_request_complete (&request->super, true); ompi_request_complete (&request->super, true);
} else { } else {
OMPI_OSC_RDMA_REQUEST_RETURN (request); OMPI_OSC_PT2PT_REQUEST_RETURN (request);
} }
} }
#endif /* OMPI_OSC_RDMA_REQUEST_H */ #endif /* OMPI_OSC_PT2PT_REQUEST_H */

Просмотреть файл

@ -1,26 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2013 Sandia National Laboratories. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_ompi_osc_rdma_POST_CONFIG(will_build)
# ----------------------------------------
# Only require the tag if we're actually going to be built, since bml
# is one of the ones frequently disabled for large installs.
AC_DEFUN([MCA_ompi_osc_rdma_POST_CONFIG], [
    # $1 is "1" only when the component will actually be built; the BML
    # endpoint tag is needed only in that case.
    AS_IF([test "$1" = "1"], [OMPI_REQUIRE_ENDPOINT_TAG([BML])])
])dnl

# MCA_ompi_osc_rdma_CONFIG(action-if-can-compile,
#                          [action-if-cant-compile])
# ------------------------------------------------
# We can always build, unless we were explicitly disabled.
AC_DEFUN([MCA_ompi_osc_rdma_CONFIG],[
    AC_CONFIG_FILES([ompi/mca/osc/rdma/Makefile])
    # no compile test is needed: unconditionally run the success action
    [$1]
])dnl

Просмотреть файл

@ -1,197 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "opal/class/opal_list.h"
#include "ompi/mca/osc/base/base.h"
#include "ompi/mca/pml/pml.h"
#include "osc_rdma.h"
#include "osc_rdma_frag.h"
#include "osc_rdma_data_move.h"
/* OBJ constructor: allocate the fragment's packing buffer -- room for the
 * fragment header plus the component's configured payload size.
 * NOTE(review): the malloc result is checked only with assert(), which is
 * compiled out under NDEBUG and would leave buffer == NULL -- confirm the
 * intended out-of-memory policy. */
static void ompi_osc_rdma_frag_constructor (ompi_osc_rdma_frag_t *frag){
    frag->buffer = malloc (mca_osc_rdma_component.buffer_size + sizeof (ompi_osc_rdma_frag_header_t));
    assert (frag->buffer);
}
/* OBJ destructor: release the packing buffer allocated by the constructor.
 * The NULL guard the original carried is redundant -- free(NULL) is defined
 * to be a no-op by the C standard. */
static void ompi_osc_rdma_frag_destructor (ompi_osc_rdma_frag_t *frag) {
    free (frag->buffer);
}
OBJ_CLASS_INSTANCE(ompi_osc_rdma_frag_t, opal_list_item_t,
ompi_osc_rdma_frag_constructor, ompi_osc_rdma_frag_destructor);
/* Completion callback for a fragment send: count the completion against the
 * module, recycle the fragment onto the component free list, and hand the
 * request to the component's garbage-collection list. */
static int frag_send_cb (ompi_request_t *request)
{
    ompi_osc_rdma_frag_t *frag =
        (ompi_osc_rdma_frag_t*) request->req_complete_cb_data;
    ompi_osc_rdma_module_t *module = frag->module;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc rdma: frag_send complete to %d, frag = %p, request = %p",
                         frag->target, (void *) frag, (void *) request));

    mark_outgoing_completion(module);
    OPAL_FREE_LIST_RETURN(&mca_osc_rdma_component.frags, &frag->super);

    /* put this request on the garbage collection list */
    osc_rdma_gc_add_request (request);

    return OMPI_SUCCESS;
}
/* Send the bytes packed so far in `frag` (header through `top`) to the
 * fragment's target rank; frag_send_cb runs when the isend completes. */
static int
frag_send(ompi_osc_rdma_module_t *module,
          ompi_osc_rdma_frag_t *frag)
{
    const int packed_bytes = (int)((uintptr_t) frag->top - (uintptr_t) frag->buffer);

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc rdma: frag_send called to %d, frag = %p, count = %d",
                         frag->target, (void *) frag, packed_bytes));

    return ompi_osc_rdma_isend_w_cb (frag->buffer, packed_bytes, MPI_BYTE, frag->target,
                                     OSC_RDMA_FRAG_TAG, module->comm, frag_send_cb, frag);
}
/*
 * Hand a completed fragment to the communication layer, or queue it if
 * eager sends to its target are not yet enabled.
 *
 * The fragment must have no pending writers (asserted below).  An outgoing
 * completion is signalled up front so the count carried with a later unlock
 * message is correct.  NOTE(review): callers appear to hold the module
 * lock when calling this -- confirm.
 */
int
ompi_osc_rdma_frag_start(ompi_osc_rdma_module_t *module,
                       ompi_osc_rdma_frag_t *frag)
{
    int ret;

    assert(0 == frag->pending);
    assert(module->peers[frag->target].active_frag != frag);

    /* we need to signal now that a frag is outgoing to ensure the count sent
     * with the unlock message is correct */
    ompi_osc_signal_outgoing (module, frag->target, 1);

    /* if eager sends are not active, can't send yet, so buffer and
       get out... */
    if (module->passive_target_access_epoch) {
        if (!module->passive_eager_send_active[frag->target]) {
            opal_list_append(&module->queued_frags, &frag->super);
            return OMPI_SUCCESS;
        }
    } else {
        if (!module->active_eager_send_active) {
            opal_list_append(&module->queued_frags, &frag->super);
            return OMPI_SUCCESS;
        }
    }

    ret = frag_send(module, frag);

    /* wake any thread blocked on the module condition variable */
    opal_condition_broadcast(&module->cond);

    return ret;
}
/*
 * Push the active fragment and every queued fragment destined for `target`
 * onto the wire.
 *
 * Returns OMPI_ERR_RMA_SYNC if the active fragment still has pending
 * writers -- communication during synchronization, a caller bug.
 */
int
ompi_osc_rdma_frag_flush_target(ompi_osc_rdma_module_t *module, int target)
{
    int ret = OMPI_SUCCESS;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc rdma: frag flush target begin"));

    /* flush the active frag */
    if (NULL != module->peers[target].active_frag) {
        ompi_osc_rdma_frag_t *frag = module->peers[target].active_frag;

        if (0 != frag->pending) {
            /* communication going on while synchronizing; this is a bug */
            return OMPI_ERR_RMA_SYNC;
        }

        /* clear first so frag_start's assertion on active_frag holds */
        module->peers[target].active_frag = NULL;

        ret = ompi_osc_rdma_frag_start(module, frag);
        if (OMPI_SUCCESS != ret) return ret;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc rdma: frag flush target finished active frag"));

    /* walk through the pending list and send */
    ompi_osc_rdma_frag_t *frag, *next;
    OPAL_LIST_FOREACH_SAFE(frag, next, &module->queued_frags, ompi_osc_rdma_frag_t) {
        if (frag->target == target) {
            opal_list_remove_item(&module->queued_frags, &frag->super);
            ret = frag_send(module, frag);
            if (OMPI_SUCCESS != ret) return ret;
        }
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc rdma: frag flush target finished"));

    return OMPI_SUCCESS;
}
/*
 * Push the active fragment of every peer and the entire queued-fragment
 * list onto the wire.
 *
 * Returns OMPI_ERR_RMA_SYNC if any active fragment still has pending
 * writers (a caller synchronization bug), or the first send error.
 */
int
ompi_osc_rdma_frag_flush_all(ompi_osc_rdma_module_t *module)
{
    int ret = OMPI_SUCCESS;
    int i;
    ompi_osc_rdma_frag_t *frag, *next;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc rdma: frag flush all begin"));

    /* flush the active frag */
    for (i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
        if (NULL != module->peers[i].active_frag) {
            ompi_osc_rdma_frag_t *frag = module->peers[i].active_frag;

            if (0 != frag->pending) {
                /* communication going on while synchronizing; this is a bug */
                return OMPI_ERR_RMA_SYNC;
            }

            module->peers[i].active_frag = NULL;

            ret = ompi_osc_rdma_frag_start(module, frag);
            if (OMPI_SUCCESS != ret) return ret;
        }
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc rdma: frag flush all finished active frag"));

    /* try to start all the queued frags */
    OPAL_LIST_FOREACH_SAFE(frag, next, &module->queued_frags, ompi_osc_rdma_frag_t) {
        opal_list_remove_item(&module->queued_frags, &frag->super);

        ret = frag_send(module, frag);
        if (OMPI_SUCCESS != ret) {
            OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                                 "osc rdma: failure for frag send: %d", ret));
            return ret;
        }
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc rdma: frag flush all done"));

    return OMPI_SUCCESS;
}

Просмотреть файл

@ -1,138 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OSC_RDMA_FRAG_H
#define OSC_RDMA_FRAG_H
#include "ompi/communicator/communicator.h"
#include "osc_rdma_header.h"
#include "osc_rdma_request.h"
#include "opal/align.h"
/** Communication buffer for packing messages */
struct ompi_osc_rdma_frag_t {
    opal_list_item_t super;

    /* target rank of buffer */
    int target;
    /* start of the allocation; the fragment header is placed here, followed
       by the packed operations (see ompi_osc_rdma_frag_alloc) */
    unsigned char *buffer;

    /* space remaining in buffer */
    size_t remain_len;

    /* start of unused space */
    char *top;

    /* Number of operations which have started writing into the frag, but not yet completed doing so */
    int pending;

    ompi_osc_rdma_frag_header_t *header;
    /* owning module; needed by the send-completion callback */
    ompi_osc_rdma_module_t *module;
};
typedef struct ompi_osc_rdma_frag_t ompi_osc_rdma_frag_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_frag_t);

/* hand a completed fragment to the communication layer (or queue it) */
extern int ompi_osc_rdma_frag_start(ompi_osc_rdma_module_t *module, ompi_osc_rdma_frag_t *buffer);
/* send all active/queued fragments destined for one target */
extern int ompi_osc_rdma_frag_flush_target(ompi_osc_rdma_module_t *module, int target);
/* send all active/queued fragments for every target */
extern int ompi_osc_rdma_frag_flush_all(ompi_osc_rdma_module_t *module);
/*
 * Reserve request_len bytes (8-byte aligned) in the active fragment for
 * `target`, allocating and initializing a fresh fragment -- and flushing the
 * exhausted one -- when the current fragment is absent or too small.
 *
 * On success *ptr points at the reserved space, *buffer at the owning
 * fragment, and the fragment's pending/num_ops counters are bumped; the
 * caller must later pair this with ompi_osc_rdma_frag_finish().
 *
 * Note: module lock must be held during this operation
 */
static inline int ompi_osc_rdma_frag_alloc(ompi_osc_rdma_module_t *module, int target,
                                           size_t request_len, ompi_osc_rdma_frag_t **buffer,
                                           char **ptr)
{
    ompi_osc_rdma_frag_t *curr = module->peers[target].active_frag;
    int ret;

    /* osc rdma headers can have 64-bit values. these will need to be aligned
     * on an 8-byte boundary on some architectures so we up align the allocation
     * size here. */
    request_len = OPAL_ALIGN(request_len, 8, size_t);

    if (request_len > mca_osc_rdma_component.buffer_size) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    if (NULL == curr || curr->remain_len < request_len) {
        opal_free_list_item_t *item;

        if (NULL != curr) {
            curr->remain_len = 0;
            /* If there's something pending, the pending finish will
               start the buffer.  Otherwise, we need to start it now. */
            if (0 == curr->pending) {
                module->peers[target].active_frag = NULL;
                /* NOTE(review): this return code is discarded -- ret is
                 * overwritten by OPAL_FREE_LIST_GET below.  Confirm a send
                 * failure here can really be ignored. */
                ret = ompi_osc_rdma_frag_start(module, curr);
            }
        }

        OPAL_FREE_LIST_GET(&mca_osc_rdma_component.frags,
                           item, ret);
        if (OMPI_SUCCESS != ret) return ret;
        curr = module->peers[target].active_frag =
            (ompi_osc_rdma_frag_t*) item;

        /* (re)initialize the fragment and its on-wire header */
        curr->target = target;
        curr->header = (ompi_osc_rdma_frag_header_t*) curr->buffer;
        curr->top = (char*) (curr->header + 1);
        curr->remain_len = mca_osc_rdma_component.buffer_size;
        curr->module = module;
        curr->pending = 0;

        curr->header->base.type = OMPI_OSC_RDMA_HDR_TYPE_FRAG;
        curr->header->base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
        if (module->passive_target_access_epoch) {
            curr->header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET;
        }
        curr->header->source = ompi_comm_rank(module->comm);
        curr->header->num_ops = 0;
        curr->header->windx = ompi_comm_get_cid(module->comm);

        if (curr->remain_len < request_len) {
            return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        }
    }

    *ptr = curr->top;
    *buffer = curr;

    curr->top += request_len;
    curr->remain_len -= request_len;
    curr->pending++;
    curr->header->num_ops++;

    return OMPI_SUCCESS;
}
/*
 * Release one writer's claim on `buffer` (taken by ompi_osc_rdma_frag_alloc).
 * When the last writer finishes and the fragment has no space left, the
 * fragment is started immediately.
 *
 * Note: module lock must be held for this operation
 */
static inline int ompi_osc_rdma_frag_finish(ompi_osc_rdma_module_t *module,
                                            ompi_osc_rdma_frag_t* buffer)
{
    if (0 == --buffer->pending && 0 == buffer->remain_len) {
        if (OPAL_LIKELY(buffer == module->peers[buffer->target].active_frag)) {
            /* this is the active fragment. need to set the current fragment to null
             * or it will be started multiple times */
            module->peers[buffer->target].active_frag = NULL;
        }

        return ompi_osc_rdma_frag_start(module, buffer);
    }

    return OMPI_SUCCESS;
}
#endif

Просмотреть файл

@ -1,187 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OMPI_MCA_OSC_RDMA_HDR_H
#define OMPI_MCA_OSC_RDMA_HDR_H
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#include "opal/types.h"
/* On-the-wire message types; every osc rdma message begins with a base
 * header whose `type` field carries one of these values. */
enum ompi_osc_rdma_hdr_type_t {
    OMPI_OSC_RDMA_HDR_TYPE_PUT          = 0x01,
    OMPI_OSC_RDMA_HDR_TYPE_PUT_LONG     = 0x02,
    OMPI_OSC_RDMA_HDR_TYPE_ACC          = 0x03,
    OMPI_OSC_RDMA_HDR_TYPE_ACC_LONG     = 0x04,
    OMPI_OSC_RDMA_HDR_TYPE_GET          = 0x05,
    OMPI_OSC_RDMA_HDR_TYPE_CSWAP        = 0x06,
    OMPI_OSC_RDMA_HDR_TYPE_CSWAP_LONG   = 0x07,
    OMPI_OSC_RDMA_HDR_TYPE_GET_ACC      = 0x08,
    OMPI_OSC_RDMA_HDR_TYPE_GET_ACC_LONG = 0x09,
    OMPI_OSC_RDMA_HDR_TYPE_COMPLETE     = 0x10,
    OMPI_OSC_RDMA_HDR_TYPE_POST         = 0x11,
    OMPI_OSC_RDMA_HDR_TYPE_LOCK_REQ     = 0x12,
    OMPI_OSC_RDMA_HDR_TYPE_LOCK_ACK     = 0x13,
    OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_REQ   = 0x14,
    OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_ACK   = 0x15,
    OMPI_OSC_RDMA_HDR_TYPE_FLUSH_REQ    = 0x16,
    OMPI_OSC_RDMA_HDR_TYPE_FLUSH_ACK    = 0x17,
    OMPI_OSC_RDMA_HDR_TYPE_FRAG         = 0x20,
};
typedef enum ompi_osc_rdma_hdr_type_t ompi_osc_rdma_hdr_type_t;

/* flag bits for ompi_osc_rdma_header_base_t.flags */
#define OMPI_OSC_RDMA_HDR_FLAG_NBO            0x01
#define OMPI_OSC_RDMA_HDR_FLAG_VALID          0x02
#define OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET 0x04
#define OMPI_OSC_RDMA_HDR_FLAG_LARGE_DATATYPE 0x08
/* common prefix of every osc rdma wire header */
struct ompi_osc_rdma_header_base_t {
    /** fragment type. 8 bits */
    uint8_t type;
    /** fragment flags. 8 bits */
    uint8_t flags;
};
typedef struct ompi_osc_rdma_header_base_t ompi_osc_rdma_header_base_t;

/* header for HDR_TYPE_PUT / PUT_LONG messages */
struct ompi_osc_rdma_header_put_t {
    ompi_osc_rdma_header_base_t base;

    uint16_t tag;
    uint32_t count;
    uint64_t len;
    uint64_t displacement;
};
typedef struct ompi_osc_rdma_header_put_t ompi_osc_rdma_header_put_t;

/* header for accumulate-class messages (ACC, GET_ACC and long variants) */
struct ompi_osc_rdma_header_acc_t {
    ompi_osc_rdma_header_base_t base;

    uint16_t tag;
    uint32_t count;
    uint32_t op;
    uint64_t len;
    uint64_t displacement;
};
typedef struct ompi_osc_rdma_header_acc_t ompi_osc_rdma_header_acc_t;

/* header for HDR_TYPE_GET messages */
struct ompi_osc_rdma_header_get_t {
    ompi_osc_rdma_header_base_t base;

    uint16_t tag;
    uint32_t count;
    uint64_t len;
    uint64_t displacement;
};
typedef struct ompi_osc_rdma_header_get_t ompi_osc_rdma_header_get_t;

/* header for HDR_TYPE_COMPLETE messages */
struct ompi_osc_rdma_header_complete_t {
    ompi_osc_rdma_header_base_t base;
    int frag_count;
};
typedef struct ompi_osc_rdma_header_complete_t ompi_osc_rdma_header_complete_t;

/* header for compare-and-swap messages */
struct ompi_osc_rdma_header_cswap_t {
    ompi_osc_rdma_header_base_t base;

    uint16_t tag;

    uint32_t len;
    uint64_t displacement;
};
typedef struct ompi_osc_rdma_header_cswap_t ompi_osc_rdma_header_cswap_t;

/* header for HDR_TYPE_POST messages */
struct ompi_osc_rdma_header_post_t {
    ompi_osc_rdma_header_base_t base;
    uint16_t windx;
};
typedef struct ompi_osc_rdma_header_post_t ompi_osc_rdma_header_post_t;

/* header for lock requests (origin -> target) */
struct ompi_osc_rdma_header_lock_t {
    ompi_osc_rdma_header_base_t base;
    int32_t lock_type;
    uint64_t serial_number;
};
typedef struct ompi_osc_rdma_header_lock_t ompi_osc_rdma_header_lock_t;

/* header for lock acks (target -> origin) */
struct ompi_osc_rdma_header_lock_ack_t {
    ompi_osc_rdma_header_base_t base;
    uint16_t windx;
    uint32_t source;
    uint64_t serial_number;
};
typedef struct ompi_osc_rdma_header_lock_ack_t ompi_osc_rdma_header_lock_ack_t;

/* header for unlock requests; frag_count tells the target how many
 * fragments were sent during the epoch */
struct ompi_osc_rdma_header_unlock_t {
    ompi_osc_rdma_header_base_t base;

    int32_t lock_type;
    uint32_t frag_count;
};
typedef struct ompi_osc_rdma_header_unlock_t ompi_osc_rdma_header_unlock_t;

/* header for unlock acks (no payload beyond the base header) */
struct ompi_osc_rdma_header_unlock_ack_t {
    ompi_osc_rdma_header_base_t base;
};
typedef struct ompi_osc_rdma_header_unlock_ack_t ompi_osc_rdma_header_unlock_ack_t;

/* header for flush requests */
struct ompi_osc_rdma_header_flush_t {
    ompi_osc_rdma_header_base_t base;

    uint32_t frag_count;
    uint64_t serial_number;
};
typedef struct ompi_osc_rdma_header_flush_t ompi_osc_rdma_header_flush_t;

/* header for flush acks */
struct ompi_osc_rdma_header_flush_ack_t {
    ompi_osc_rdma_header_base_t base;
    uint64_t serial_number;
};
typedef struct ompi_osc_rdma_header_flush_ack_t ompi_osc_rdma_header_flush_ack_t;

struct ompi_osc_rdma_frag_header_t {
    ompi_osc_rdma_header_base_t base;
    uint16_t windx; /* cid of communicator backing window (our window id) */
    uint32_t source; /* rank in window of source process */
    uint16_t num_ops; /* number of operations in this buffer */
    uint16_t pad[3]; /* ensure the fragment header is a multiple of 8 bytes */
};
typedef struct ompi_osc_rdma_frag_header_t ompi_osc_rdma_frag_header_t;

/* convenience union over all wire headers; `base` is always valid */
union ompi_osc_rdma_header_t {
    ompi_osc_rdma_header_base_t base;
    ompi_osc_rdma_header_put_t put;
    ompi_osc_rdma_header_acc_t acc;
    ompi_osc_rdma_header_get_t get;
    ompi_osc_rdma_header_complete_t complete;
    ompi_osc_rdma_header_cswap_t cswap;
    ompi_osc_rdma_header_post_t post;
    ompi_osc_rdma_header_lock_t lock;
    ompi_osc_rdma_header_lock_ack_t lock_ack;
    ompi_osc_rdma_header_unlock_t unlock;
    ompi_osc_rdma_header_unlock_ack_t unlock_ack;
    ompi_osc_rdma_header_flush_t flush;
    ompi_osc_rdma_header_flush_ack_t flush_ack;
    ompi_osc_rdma_frag_header_t frag;
};
typedef union ompi_osc_rdma_header_t ompi_osc_rdma_header_t;

#endif /* OMPI_MCA_OSC_RDMA_HDR_H */

Просмотреть файл

@ -1,47 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/*
* utility functions for dealing with remote datatype and op structures
*/
/**
 * Convert a window index number into a module instance.
 *
 * @param[in] windx  window id from an incoming header (the CID of the
 *                   communicator backing the window)
 * @returns the module registered under windx in the component hash table,
 *          or NULL (after logging an error) if no local window matches
 */
static inline ompi_osc_rdma_module_t*
ompi_osc_rdma_windx_to_module(uint32_t windx)
{
    int ret;
    ompi_osc_rdma_module_t *module;

    /* find the right module and dispatch */
    OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
    ret = opal_hash_table_get_value_uint32(&mca_osc_rdma_component.modules,
                                           windx,
                                           (void**) (&module));
    OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
    if (OMPI_SUCCESS != ret) {
        opal_output(0, "Could not translate windx %d to a local MPI_Win instance",
                    windx);
        return NULL;
    }

    return module;
}

Просмотреть файл

@ -1,966 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "osc_rdma.h"
#include "osc_rdma_header.h"
#include "osc_rdma_data_move.h"
#include "osc_rdma_frag.h"
#include "mpi.h"
#include "opal/runtime/opal_progress.h"
#include "opal/threads/mutex.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/osc/base/base.h"
#include "opal/include/opal_stdint.h"
/* target-side tracking of a lock request that could not be granted
 * immediately and is waiting for the current holder(s) to release */
struct ompi_osc_rdma_pending_lock_t {
    opal_list_item_t super;
    /* rank of the requesting origin process */
    int peer;
    /* MPI_LOCK_SHARED or MPI_LOCK_EXCLUSIVE */
    int lock_type;
    uint64_t serial_number;
};
typedef struct ompi_osc_rdma_pending_lock_t ompi_osc_rdma_pending_lock_t;
OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_lock_t, opal_list_item_t,
                   NULL, NULL);
/* origin-side tracking of a lock request; target == -1 represents a
 * lock_all epoch covering every rank */
struct ompi_osc_rdma_outstanding_lock_t {
    opal_list_item_t super;
    int target;
    /* counts of the acks received so far for this lock */
    int32_t lock_acks_received;
    int32_t unlock_acks_received;
    int32_t flush_acks_received;
    uint64_t serial_number;
    /* MPI_LOCK_SHARED or MPI_LOCK_EXCLUSIVE */
    int32_t type;
};
typedef struct ompi_osc_rdma_outstanding_lock_t ompi_osc_rdma_outstanding_lock_t;
OBJ_CLASS_INSTANCE(ompi_osc_rdma_outstanding_lock_t, opal_list_item_t,
                   NULL, NULL);
static int ompi_osc_activate_next_lock (ompi_osc_rdma_module_t *module);
static inline int queue_lock (ompi_osc_rdma_module_t *module, int requestor,
int lock_type, uint64_t serial_number);
/**
* Find the first outstanding lock to a target.
*
* @param[in] module - OSC RDMA module
* @param[in] target - Target rank
*
* @returns an outstanding lock on success
*
* This function traverses the outstanding_locks list in the module
* looking for a lock that matches target. The caller must hold the
* module lock.
*/
static inline ompi_osc_rdma_outstanding_lock_t *find_outstanding_lock (ompi_osc_rdma_module_t *module, int target)
{
    ompi_osc_rdma_outstanding_lock_t *lock;

    /* linear scan; first lock matching the target wins */
    OPAL_LIST_FOREACH(lock, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) {
        if (lock->target == target) {
            return lock;
        }
    }

    return NULL;
}
/* Like find_outstanding_lock() but matches on the lock's serial number.
 * The caller must hold the module lock. */
static inline ompi_osc_rdma_outstanding_lock_t *find_outstanding_lock_by_serial (ompi_osc_rdma_module_t *module, uint64_t serial_number)
{
    ompi_osc_rdma_outstanding_lock_t *lock;

    OPAL_LIST_FOREACH(lock, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) {
        if (lock->serial_number == serial_number) {
            return lock;
        }
    }

    return NULL;
}
/*
 * Acquire the passive-target lock on the local process.
 *
 * The lock is granted immediately when compatible with the current
 * lock_status (shared while not exclusively held, or exclusive while
 * unlocked); otherwise the request is queued.  A local lock cannot be
 * non-blocking, so this waits on module->cond until the ack arrives.
 * The caller must hold module->lock (released while waiting).
 */
static inline int ompi_osc_rdma_lock_self (ompi_osc_rdma_module_t *module, ompi_osc_rdma_outstanding_lock_t *lock)
{
    const int my_rank = ompi_comm_rank (module->comm);

    if ((MPI_LOCK_SHARED == lock->type && MPI_LOCK_EXCLUSIVE != module->lock_status) ||
        (MPI_LOCK_EXCLUSIVE == lock->type && 0 == module->lock_status)) {
        /* we can aquire the lock immediately */
        module->lock_status = lock->type;
        if (MPI_LOCK_SHARED == lock->type) {
            module->shared_count++;
        }

        lock->lock_acks_received++;
    } else {
        /* queue the lock */
        queue_lock (module, my_rank, lock->type, lock->serial_number);
    }

    /* If locking local, can't be non-blocking according to the
       standard.  We need to wait for the ack here. */
    while (0 == lock->lock_acks_received) {
        opal_condition_wait(&module->cond, &module->lock);
    }

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "local lock aquired"));

    return OMPI_SUCCESS;
}
/*
 * Release the passive-target lock held on the local process and record the
 * (local) unlock ack.  The caller must hold module->lock.
 */
static inline void ompi_osc_rdma_unlock_self (ompi_osc_rdma_module_t *module, ompi_osc_rdma_outstanding_lock_t *lock)
{
    /* Release the window lock when this was an exclusive lock, or when the
     * last shared holder is gone.  BUG FIX: the original tested
     * (0 == --module->shared_count) inside the negation, which released the
     * lock while shared holders remained and kept it held when the count
     * reached zero -- exactly inverted. */
    if (!(MPI_LOCK_SHARED == lock->type && 0 != --module->shared_count)) {
        module->lock_status = 0;
        ompi_osc_activate_next_lock (module);
    }

    /* need to ensure we make progress */
    opal_progress();

    lock->unlock_acks_received++;
}
/*
 * Send a lock request to a remote target and flush it onto the wire so the
 * target can respond; eager sends to the peer stay disabled until the lock
 * ack comes back.
 */
static inline int ompi_osc_rdma_lock_remote (ompi_osc_rdma_module_t *module, int target, ompi_osc_rdma_outstanding_lock_t *lock)
{
    ompi_osc_rdma_header_lock_t lock_req;
    int ret;

    /* generate a lock request */
    lock_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_LOCK_REQ;
    lock_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID | OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET;
    lock_req.lock_type = lock->type;
    lock_req.serial_number = lock->serial_number;

    ret = ompi_osc_rdma_control_send (module, target, &lock_req, sizeof (lock_req));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    /* make sure the request gets sent, so we can start eager sending... */
    ret = ompi_osc_rdma_frag_flush_target (module, target);

    return ret;
}
/*
 * Send an unlock request to a remote target.  The request carries the number
 * of fragments sent to that target during this epoch
 * (epoch_outgoing_frag_count).
 */
static inline int ompi_osc_rdma_unlock_remote (ompi_osc_rdma_module_t *module, int target, ompi_osc_rdma_outstanding_lock_t *lock)
{
    ompi_osc_rdma_header_unlock_t unlock_req;

    unlock_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_REQ;
    unlock_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID | OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET;
    unlock_req.frag_count = module->epoch_outgoing_frag_count[target];
    unlock_req.lock_type = lock->type;

    /* send control message with unlock request and count */
    return ompi_osc_rdma_control_send (module, target, &unlock_req, sizeof (unlock_req));
}
/**
 * Implementation of MPI_Win_lock: start a passive target access epoch with
 * `target`.
 *
 * @param[in] lock_type  MPI_LOCK_SHARED or MPI_LOCK_EXCLUSIVE
 * @param[in] target     rank to lock
 * @param[in] assert     MPI assertion bits; MPI_MODE_NOCHECK skips the
 *                       lock protocol
 * @param[in] win        window handle
 *
 * @returns OMPI_SUCCESS, OMPI_ERR_RMA_SYNC when locks are disabled or an
 *          active-target epoch is in progress, or a communication error.
 */
int ompi_osc_rdma_lock(int lock_type, int target, int assert, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_outstanding_lock_t *lock;
    ompi_osc_rdma_peer_t *peer = module->peers + target;
    int ret = OMPI_SUCCESS;

    /* Check if no_locks is set. TODO: we also need to track whether we are in an
     * active target epoch. Fence can make this tricky to track. */
    if (NULL == module->passive_eager_send_active || module->sc_group) {
        return OMPI_ERR_RMA_SYNC;
    }

    assert(module->epoch_outgoing_frag_count[target] == 0);

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "osc rdma: lock %d %d", target, lock_type));

    /* delay all eager sends until we've heard back.. */
    OPAL_THREAD_LOCK(&module->lock);
    module->passive_eager_send_active[target] = false;
    module->passive_target_access_epoch = true;

    /* when the lock ack returns we will be in an access epoch with this peer */
    peer->access_epoch = true;

    /* create lock item */
    lock = OBJ_NEW(ompi_osc_rdma_outstanding_lock_t);
    if (OPAL_UNLIKELY(NULL == lock)) {
        /* BUG FIX: the original returned here with module->lock still held,
         * deadlocking the next acquirer. */
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    lock->target = target;
    lock->lock_acks_received = 0;
    lock->unlock_acks_received = 0;
    lock->serial_number = module->lock_serial_number++;
    lock->type = lock_type;
    opal_list_append(&module->outstanding_locks, &lock->super);

    if (0 == (assert & MPI_MODE_NOCHECK)) {
        if (ompi_comm_rank (module->comm) != target) {
            ret = ompi_osc_rdma_lock_remote (module, target, lock);
        } else {
            ret = ompi_osc_rdma_lock_self (module, lock);
        }

        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            goto exit_error;
        }
    } else {
        /* with MPI_MODE_NOCHECK the user guarantees no conflicting lock
         * exists, so treat the lock as immediately granted */
        lock->lock_acks_received = 1;
    }

    OPAL_THREAD_UNLOCK(&module->lock);

    return OMPI_SUCCESS;

 exit_error:
    OPAL_THREAD_UNLOCK(&module->lock);
    opal_list_remove_item(&module->outstanding_locks, &lock->super);
    OBJ_RELEASE(lock);

    /* return */
    return ret;
}
/**
 * Implementation of MPI_Win_unlock: end the passive target access epoch with
 * `target` started by ompi_osc_rdma_lock.
 *
 * For a remote target: wait for the lock ack, send the unlock request
 * (carrying the epoch's outgoing fragment count), flush queued fragments,
 * then wait for local send completion and the remote unlock ack.  For the
 * local rank the lock is released directly.
 *
 * @param[in] target  rank whose lock is being released
 * @param[in] win     window handle
 * @returns OMPI_SUCCESS, OMPI_ERR_RMA_SYNC when target is not locked, or a
 *          communication-layer error code.
 */
int ompi_osc_rdma_unlock(int target, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_outstanding_lock_t *lock = NULL;
    ompi_osc_rdma_peer_t *peer = module->peers + target;
    int ret = OMPI_SUCCESS;

    OPAL_THREAD_LOCK(&module->lock);

    lock = find_outstanding_lock (module, target);
    if (OPAL_UNLIKELY(NULL == lock)) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_unlock: target %d is not locked in window %s",
                             target, win->w_name));
        /* BUG FIX: this error path called OPAL_THREAD_LOCK again, returning
         * with the module lock held (self-deadlock on the next acquire).
         * Release the lock taken above instead. */
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    if (ompi_comm_rank (module->comm) != target) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "osc rdma: unlock %d, lock_acks_received = %d", target,
                             lock->lock_acks_received));

        /* wait until ack has arrived from target */
        while (0 == lock->lock_acks_received) {
            opal_condition_wait(&module->cond, &module->lock);
        }

        ret = ompi_osc_rdma_unlock_remote (module, target, lock);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            goto cleanup;
        }

        /* start all sendreqs to target */
        ret = ompi_osc_rdma_frag_flush_target(module, target);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            goto cleanup;
        }

        /* wait for all the requests and the unlock ack (meaning remote completion) */
        while (module->outgoing_frag_count != module->outgoing_frag_signal_count ||
               0 == lock->unlock_acks_received) {
            opal_condition_wait(&module->cond, &module->lock);
        }

        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_unlock: unlock of %d complete", target));
    } else {
        ompi_osc_rdma_unlock_self (module, lock);
    }

    /* tear down the per-target epoch state */
    module->passive_eager_send_active[target] = false;
    module->epoch_outgoing_frag_count[target] = 0;
    module->passive_target_access_epoch = false;
    peer->access_epoch = false;

    /* delete the lock */
    opal_list_remove_item (&module->outstanding_locks, &lock->super);
    OBJ_RELEASE(lock);

 cleanup:
    OPAL_THREAD_UNLOCK(&module->lock);

    return ret;
}
int ompi_osc_rdma_lock_all(int assert, struct ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
int ret, my_rank = ompi_comm_rank (module->comm);
ompi_osc_rdma_outstanding_lock_t *lock;
/* Check if no_locks is set. TODO: we also need to track whether we are in an active
* target epoch. Fence can make this tricky to track. */
if (NULL == module->passive_eager_send_active) {
return OMPI_ERR_RMA_SYNC;
}
/* delay all eager sends until we've heard back.. */
OPAL_THREAD_LOCK(&module->lock);
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
module->passive_eager_send_active[i] = false;
}
module->passive_target_access_epoch = true;
module->all_access_epoch = true;
/* create lock item */
lock = OBJ_NEW(ompi_osc_rdma_outstanding_lock_t);
lock->target = -1;
lock->lock_acks_received = 0;
lock->unlock_acks_received = 0;
lock->serial_number = module->lock_serial_number++;
lock->type = MPI_LOCK_SHARED;
opal_list_append(&module->outstanding_locks, &lock->super);
/* if nocheck is not specified, send a lock request to everyone
and wait for the local response */
if (0 != (assert & MPI_MODE_NOCHECK)) {
ret = ompi_osc_rdma_lock_self (module, lock);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
goto exit_error;
}
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
if (my_rank == i) {
continue;
}
ret = ompi_osc_rdma_lock_remote (module, i, lock);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
opal_list_remove_item(&module->outstanding_locks, &lock->super);
}
}
} else {
lock->lock_acks_received = ompi_comm_size(module->comm);
}
OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_SUCCESS;
exit_error:
OPAL_THREAD_UNLOCK(&module->lock);
opal_list_remove_item(&module->outstanding_locks, &lock->super);
OBJ_RELEASE(lock);
/* return */
return ret;
}
/**
 * Implementation of MPI_Win_unlock_all: end the passive target access epoch
 * started by ompi_osc_rdma_lock_all.
 *
 * Waits for every lock ack, sends an unlock request (with the per-peer
 * outgoing fragment count) to each remote peer, releases the local lock,
 * flushes all queued fragments, and waits for local and remote completion
 * before tearing the epoch state down.
 *
 * @param[in] win  window handle
 * @returns OMPI_SUCCESS, OMPI_ERR_RMA_SYNC if no lock_all epoch is active,
 *          or a communication error code.
 */
int ompi_osc_rdma_unlock_all (struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    int my_rank = ompi_comm_rank (module->comm);
    ompi_osc_rdma_outstanding_lock_t *lock;
    int ret = OMPI_SUCCESS;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_unlock_all entering..."));

    OPAL_THREAD_LOCK(&module->lock);

    lock = find_outstanding_lock (module, -1);
    if (OPAL_UNLIKELY(NULL == lock)) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_unlock_all: not locked in window %s",
                             win->w_name));
        /* BUG FIX: this error path used OPAL_THREAD_LOCK, re-acquiring the
         * module lock we already hold and returning with it held. */
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    /* wait for lock acks */
    while (ompi_comm_size(module->comm) != lock->lock_acks_received) {
        opal_condition_wait(&module->cond, &module->lock);
    }

    /* send unlock messages to all of my peers */
    for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
        if (my_rank == i) {
            continue;
        }

        ret = ompi_osc_rdma_unlock_remote (module, i, lock);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            goto cleanup;
        }
    }

    /* unlock myself */
    ompi_osc_rdma_unlock_self (module, lock);

    /* start all sendreqs to target */
    ret = ompi_osc_rdma_frag_flush_all(module);
    if (OMPI_SUCCESS != ret) goto cleanup;

    /* wait for all the requests and the unlock ack (meaning remote completion) */
    while (module->outgoing_frag_count != module->outgoing_frag_signal_count ||
           ompi_comm_size(module->comm) != lock->unlock_acks_received) {
        opal_condition_wait(&module->cond, &module->lock);
    }

    /* reset all fragment counters */
    memset (module->epoch_outgoing_frag_count, 0, ompi_comm_size(module->comm) * sizeof (module->epoch_outgoing_frag_count[0]));
    memset (module->passive_eager_send_active, 0, ompi_comm_size(module->comm) * sizeof (module->passive_eager_send_active[0]));

    opal_list_remove_item (&module->outstanding_locks, &lock->super);
    OBJ_RELEASE(lock);

    module->passive_target_access_epoch = false;
    module->all_access_epoch = false;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_unlock_all complete"));

 cleanup:
    OPAL_THREAD_UNLOCK(&module->lock);

    return ret;
}
/* Implementation of MPI_Win_sync for this component: simply drive the
 * progress engine; the window argument is unused. */
int ompi_osc_rdma_sync (struct ompi_win_t *win)
{
    opal_progress();

    return OMPI_SUCCESS;
}
/*
 * Complete all outstanding RMA operations covered by `lock`.
 *
 * @param[in] module  osc rdma module; module->lock must be held by the
 *                    caller (released while waiting on module->cond)
 * @param[in] lock    outstanding lock; lock->target == -1 means lock_all
 * @param[in] target  rank to flush, or -1 to flush every remote peer
 *
 * Waits for the lock ack(s), sends a FLUSH_REQ (carrying the epoch's
 * outgoing fragment count) to each affected peer, pushes all queued
 * fragments, then waits for local send completion and the flush acks
 * before resetting the per-epoch fragment counters.
 */
static int ompi_osc_rdma_flush_lock (ompi_osc_rdma_module_t *module, ompi_osc_rdma_outstanding_lock_t *lock,
                                     int target)
{
    ompi_osc_rdma_header_flush_t flush_req;
    int peer_count, ret, flush_count;
    int my_rank = ompi_comm_rank (module->comm);

    if (-1 == lock->target) {
        peer_count = ompi_comm_size(module->comm);
    } else {
        peer_count = 1;
    }

    /* wait until ack has arrived from target, since we need to be
       able to eager send before we can transfer all the data... */
    while (peer_count > lock->lock_acks_received) {
        opal_condition_wait(&module->cond, &module->lock);
    }

    lock->flush_acks_received = 0;

    flush_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_FLUSH_REQ;
    flush_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID | OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET;
    flush_req.serial_number = lock->serial_number;

    if (-1 == target) {
        /* NTH: no local flush */
        flush_count = ompi_comm_size(module->comm) - 1;
        for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
            if (i == my_rank) {
                continue;
            }

            flush_req.frag_count = module->epoch_outgoing_frag_count[i];

            /* send control message with flush request and count */
            ret = ompi_osc_rdma_control_send (module, i, &flush_req, sizeof (flush_req));
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                return ret;
            }

            /* start all sendreqs to target */
            ret = ompi_osc_rdma_frag_flush_target (module, i);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                return ret;
            }
        }
    } else {
        flush_req.frag_count = module->epoch_outgoing_frag_count[target];
        flush_count = 1;
        /* send control message with flush request and count */
        ret = ompi_osc_rdma_control_send (module, target, &flush_req, sizeof (flush_req));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            return ret;
        }

        /* start all sendreqs to target */
        ret = ompi_osc_rdma_frag_flush_target (module, target);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            return ret;
        }
    }

    /* wait for all the requests and the flush ack (meaning remote completion) */
    while (module->outgoing_frag_count != module->outgoing_frag_signal_count ||
           flush_count != lock->flush_acks_received) {
        opal_condition_wait(&module->cond, &module->lock);
    }

    /* the epoch's outgoing counts have been accounted for; reset them */
    if (-1 == target) {
        memset (module->epoch_outgoing_frag_count, 0, peer_count * sizeof (module->epoch_outgoing_frag_count[0]));
    } else {
        module->epoch_outgoing_frag_count[target] = 0;
    }

    return OMPI_SUCCESS;
}
/**
 * MPI_Win_flush: complete all outstanding RMA operations from this process
 * to @target (remote completion is confirmed via a flush ack).
 *
 * @param[in] target - target rank (must be >= 0)
 * @param[in] win    - MPI window
 *
 * @return OMPI_SUCCESS on success, OMPI_ERR_RMA_SYNC if called outside a
 *         passive target access epoch or if @target is not locked.
 */
int ompi_osc_rdma_flush (int target, struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_outstanding_lock_t *lock;
    int ret;

    assert (0 <= target);

    /* flush is only allowed from within a passive target epoch */
    if (!module->passive_target_access_epoch) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_flush starting..."));

    if (ompi_comm_rank (module->comm) == target) {
        /* nothing to flush for the local rank; just poke progress */
        opal_progress ();
        return OMPI_SUCCESS;
    }

    OPAL_THREAD_LOCK(&module->lock);

    /* look for a per-target lock first, then for a lock-all (-1) lock */
    lock = find_outstanding_lock (module, target);
    if (NULL == lock) {
        lock = find_outstanding_lock (module, -1);
    }
    if (OPAL_UNLIKELY(NULL == lock)) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_flush: target %d is not locked in window %s",
                             target, win->w_name));
        /* BUG FIX: this path used OPAL_THREAD_LOCK, re-acquiring (or
         * deadlocking on) the module lock held since above and leaking
         * it on return. it must release the lock instead. */
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    ret = ompi_osc_rdma_flush_lock (module, lock, target);

    OPAL_THREAD_UNLOCK(&module->lock);

    return ret;
}
/**
 * MPI_Win_flush_all: complete all outstanding RMA operations from this
 * process to every locked target in the window.
 *
 * @param[in] win - MPI window
 *
 * @return OMPI_SUCCESS on success, OMPI_ERR_RMA_SYNC if called outside a
 *         passive target access epoch or no targets are locked, or the
 *         first error returned by ompi_osc_rdma_flush_lock().
 */
int ompi_osc_rdma_flush_all (struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_outstanding_lock_t *lock;
    int ret = OMPI_SUCCESS;

    /* flush is only allowed from within a passive target epoch */
    if (!module->passive_target_access_epoch) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_THREAD_LOCK(&module->lock);

    /* BUG FIX: inspect the outstanding lock list while holding the module
     * lock. the list can be modified concurrently (other functions in this
     * file mutate it under module->lock), so the previous unlocked check
     * was racy. */
    if (OPAL_UNLIKELY(0 == opal_list_get_size (&module->outstanding_locks))) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_flush_all: no targets are locked in window %s",
                             win->w_name));
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_flush_all entering..."));

    /* flush all locks, stopping at the first failure */
    OPAL_LIST_FOREACH(lock, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) {
        ret = ompi_osc_rdma_flush_lock (module, lock, lock->target);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            break;
        }
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_flush_all complete"));

    OPAL_THREAD_UNLOCK(&module->lock);

    return ret;
}
/**
 * MPI_Win_flush_local: wait for local completion of all outstanding
 * operations queued to @target. Only valid inside a passive target
 * access epoch.
 *
 * @param[in] target - target rank
 * @param[in] win    - MPI window
 *
 * @return OMPI_SUCCESS, OMPI_ERR_RMA_SYNC outside a passive target epoch,
 *         or an error from ompi_osc_rdma_frag_flush_target().
 */
int ompi_osc_rdma_flush_local (int target, struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    int ret;

    /* flush is only allowed from within a passive target epoch */
    if (!module->passive_target_access_epoch) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_THREAD_LOCK(&module->lock);

    ret = ompi_osc_rdma_frag_flush_target(module, target);
    if (OMPI_SUCCESS == ret) {
        /* wait until every started fragment has signalled completion */
        while (module->outgoing_frag_count != module->outgoing_frag_signal_count) {
            opal_condition_wait(&module->cond, &module->lock);
        }
    }

    OPAL_THREAD_UNLOCK(&module->lock);

    return ret;
}
/**
 * MPI_Win_flush_local_all: wait for local completion of all outstanding
 * operations queued to any target. Only valid inside a passive target
 * access epoch.
 *
 * @param[in] win - MPI window
 *
 * @return OMPI_SUCCESS, OMPI_ERR_RMA_SYNC outside a passive target epoch,
 *         or an error from ompi_osc_rdma_frag_flush_all().
 */
int ompi_osc_rdma_flush_local_all (struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    int ret = OMPI_SUCCESS;

    /* flush is only allowed from within a passive target epoch */
    if (!module->passive_target_access_epoch) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_THREAD_LOCK(&module->lock);

    ret = ompi_osc_rdma_frag_flush_all(module);
    if (OMPI_SUCCESS == ret) {
        /* wait until every started fragment has signalled completion */
        while (module->outgoing_frag_count != module->outgoing_frag_signal_count) {
            opal_condition_wait(&module->cond, &module->lock);
        }
    }

    OPAL_THREAD_UNLOCK(&module->lock);

    return ret;
}
/* target side operation to acknowledge to initiator side that the
   lock is now held by the initiator */
static inline int activate_lock (ompi_osc_rdma_module_t *module, int requestor,
                                 uint64_t serial_number)
{
    ompi_osc_rdma_outstanding_lock_t *lock;

    if (ompi_comm_rank (module->comm) != requestor) {
        /* remote requestor: send a lock ack carrying the request's serial number */
        ompi_osc_rdma_header_lock_ack_t lock_ack;

        lock_ack.base.type = OMPI_OSC_RDMA_HDR_TYPE_LOCK_ACK;
        lock_ack.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
        lock_ack.source = ompi_comm_rank(module->comm);
        lock_ack.windx = ompi_comm_get_cid(module->comm);
        lock_ack.serial_number = serial_number;

        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "osc rdma: sending lock to %d", requestor));

        /* we don't want to send any data, since we're the exposure
           epoch only, so use an unbuffered send */
        return ompi_osc_rdma_control_send_unbuffered (module, requestor, &lock_ack, sizeof (lock_ack));
    }

    /* requestor is this process: satisfy the local lock request directly */
    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "osc rdma: releasing local lock"));

    /* look for a per-target lock first, then for a lock-all (-1) lock */
    lock = find_outstanding_lock (module, requestor);
    if (NULL == lock) {
        lock = find_outstanding_lock (module, -1);
    }

    if (OPAL_UNLIKELY(NULL == lock)) {
        OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output,
                             "lock could not be located"));
        /* BUG FIX: previously this case only logged and then fell through
         * to dereference the NULL lock pointer below */
        return OMPI_ERROR;
    }

    lock->lock_acks_received++;
    opal_condition_broadcast (&module->cond);

    return OMPI_SUCCESS;
}
/* target side operation to create a pending lock request for a lock
   request that could not be satisfied */
static inline int queue_lock (ompi_osc_rdma_module_t *module, int requestor,
                              int lock_type, uint64_t serial_number)
{
    ompi_osc_rdma_pending_lock_t *pending_req;

    /* record the request so it can be granted once the lock frees up */
    pending_req = OBJ_NEW(ompi_osc_rdma_pending_lock_t);
    if (OPAL_UNLIKELY(NULL == pending_req)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    pending_req->peer = requestor;
    pending_req->lock_type = lock_type;
    pending_req->serial_number = serial_number;

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "osc rdma: queueing lock request from %d", requestor));

    opal_list_append(&module->locks_pending, &pending_req->super);

    return OMPI_SUCCESS;
}
/* Grant as many queued lock requests as the current lock state allows.
 *
 * Called with module->lock held (see ompi_osc_rdma_process_unlock). Walks
 * the pending-lock queue in list order: shared requests are granted one
 * after another; an exclusive request is granted only when the window is
 * completely unlocked (lock_status == 0), and always ends the scan since
 * later requests must wait for it to be released either way.
 *
 * @param[in] module - OSC RDMA module
 *
 * @return OMPI_SUCCESS or the first error returned by activate_lock().
 */
static int ompi_osc_activate_next_lock (ompi_osc_rdma_module_t *module) {
    /* release any other pending locks we can */
    ompi_osc_rdma_pending_lock_t *pending_lock, *next;
    int ret = OMPI_SUCCESS;

    /* SAFE variant is required: granted entries are removed mid-iteration */
    OPAL_LIST_FOREACH_SAFE(pending_lock, next, &module->locks_pending,
                           ompi_osc_rdma_pending_lock_t) {
        if (MPI_LOCK_SHARED == pending_lock->lock_type) {
            OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                                 "ompi_osc_activate_next_lock: release pending lock of type MPI_LOCK_SHARED to peer %d\n",
                                 pending_lock->peer));
            /* acquire shared lock */
            module->lock_status = MPI_LOCK_SHARED;
            module->shared_count++;
            ret = activate_lock(module, pending_lock->peer, pending_lock->serial_number);

            opal_list_remove_item (&module->locks_pending, &pending_lock->super);
            OBJ_RELEASE(pending_lock);
        } else {
            if (0 == module->lock_status) {
                OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                                     "ompi_osc_activate_next_lock: release pending lock of type MPI_LOCK_EXCLUSIVE to peer %d\n",
                                     pending_lock->peer));
                /* acquire exclusive lock */
                module->lock_status = MPI_LOCK_EXCLUSIVE;
                ret = activate_lock(module, pending_lock->peer, pending_lock->serial_number);

                opal_list_remove_item (&module->locks_pending, &pending_lock->super);
                OBJ_RELEASE(pending_lock);
            }
            /* if the lock was acquired (ie, status was 0), then
               we're done. If the lock was not acquired, we're
               also done, because all the shared locks have to
               finish first */
            break;
        }

        if (OMPI_SUCCESS != ret) {
            break;
        }
    }

    return ret;
}
/* target side function called when the initiator sends a lock
   request. Lock will either be activated and acknowledged or
   queued. */
int ompi_osc_rdma_process_lock (ompi_osc_rdma_module_t* module, int source,
                                ompi_osc_rdma_header_lock_t* lock_header)
{
    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_process_lock: processing lock request from %d. current lock state = %d, shared_count = %d",
                         source, module->lock_status, module->shared_count));

    if (MPI_LOCK_SHARED == lock_header->lock_type &&
        MPI_LOCK_EXCLUSIVE != module->lock_status) {
        /* a shared request is compatible with any non-exclusive state */
        module->lock_status = MPI_LOCK_SHARED;
        module->shared_count++;
        return activate_lock(module, source, lock_header->serial_number);
    }

    if (MPI_LOCK_SHARED != lock_header->lock_type && 0 == module->lock_status) {
        /* an exclusive request can be granted only on an unlocked window */
        module->lock_status = MPI_LOCK_EXCLUSIVE;
        return activate_lock(module, source, lock_header->serial_number);
    }

    /* lock not available, queue the request for later */
    return queue_lock(module, source, lock_header->lock_type, lock_header->serial_number);
}
/* initiator-side function called when the target acks the lock
   request. */
void ompi_osc_rdma_process_lock_ack (ompi_osc_rdma_module_t *module,
                                     ompi_osc_rdma_header_lock_ack_t *lock_ack_header)
{
    ompi_osc_rdma_outstanding_lock_t *outstanding, *tmp;

    /* match the ack to an outstanding lock request by serial number */
    OPAL_LIST_FOREACH_SAFE(outstanding, tmp, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) {
        if (outstanding->serial_number != lock_ack_header->serial_number) {
            continue;
        }

        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "osc rdma: lock ack %d", lock_ack_header->source));

        outstanding->lock_acks_received++;
        /* the target has granted the lock; eager sends to it may begin */
        module->passive_eager_send_active[lock_ack_header->source] = true;
        return;
    }

    /* no matching request found -- log unconditionally */
    opal_output(ompi_osc_base_framework.framework_output,
                "osc rdma: lock ack %d, %ld for unfindable lock request",
                lock_ack_header->source, (unsigned long) lock_ack_header->serial_number);
}
/**
 * Initiator-side handler for a flush ack from a target.
 *
 * @param[in] module           - OSC RDMA module
 * @param[in] source           - rank the ack came from
 * @param[in] flush_ack_header - incoming flush ack header
 *
 * Matches the ack to the outstanding lock by serial number, counts it,
 * and wakes any thread waiting in ompi_osc_rdma_flush_lock.
 */
void ompi_osc_rdma_process_flush_ack (ompi_osc_rdma_module_t *module, int source,
                                      ompi_osc_rdma_header_flush_ack_t *flush_ack_header) {
    ompi_osc_rdma_outstanding_lock_t *lock;

    /* BUG FIX: message previously named ompi_osc_rdma_process_unlock_ack */
    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_process_flush_ack: processing flush ack from %d for lock %" PRIu64,
                         source, flush_ack_header->serial_number));

    /* NTH: need to verify that this will work as expected */
    lock = find_outstanding_lock_by_serial (module, flush_ack_header->serial_number);
    assert (NULL != lock);
    if (OPAL_UNLIKELY(NULL == lock)) {
        /* BUG FIX: in NDEBUG builds the assert compiles away and the NULL
         * pointer was dereferenced below. log and drop the stray ack. */
        opal_output(ompi_osc_base_framework.framework_output,
                    "osc rdma: flush ack from %d for unfindable serial number %" PRIu64,
                    source, flush_ack_header->serial_number);
        return;
    }

    lock->flush_acks_received++;

    opal_condition_broadcast(&module->cond);
}
/**
 * Initiator-side handler for an unlock ack from a target.
 *
 * @param[in] module            - OSC RDMA module
 * @param[in] source            - rank the ack came from
 * @param[in] unlock_ack_header - incoming unlock ack header
 *
 * Matches the ack to the outstanding per-target lock (or the lock-all
 * entry) and counts it.
 */
void ompi_osc_rdma_process_unlock_ack (ompi_osc_rdma_module_t *module, int source,
                                       ompi_osc_rdma_header_unlock_ack_t *unlock_ack_header) {
    ompi_osc_rdma_outstanding_lock_t *lock;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_process_unlock_ack: processing unlock ack from %d",
                         source));

    /* NTH: need to verify that this will work as expected */
    lock = find_outstanding_lock (module, source);
    if (NULL == lock) {
        lock = find_outstanding_lock (module, -1);
        assert (NULL != lock);
        if (OPAL_UNLIKELY(NULL == lock)) {
            /* BUG FIX: in NDEBUG builds the assert compiles away and the NULL
             * pointer was dereferenced below. log and drop the stray ack. */
            opal_output(ompi_osc_base_framework.framework_output,
                        "osc rdma: unlock ack from %d for unfindable lock request",
                        source);
            return;
        }
    }

    lock->unlock_acks_received++;
}
/**
 * Process an unlock request.
 *
 * @param[in] module        - OSC RDMA module
 * @param[in] source        - Source rank
 * @param[in] unlock_header - Incoming unlock header
 *
 * This function is the target-side function for handling an unlock
 * request. Once all pending operations from the source are complete
 * this function sends an unlock acknowledgement then attempts to
 * activate a pending lock if the lock becomes free.
 *
 * @return OMPI_ERR_WOULD_BLOCK if fragments from @source are still pending,
 *         otherwise the result of sending the unlock ack.
 */
int ompi_osc_rdma_process_unlock (ompi_osc_rdma_module_t *module, int source,
                                  ompi_osc_rdma_header_unlock_t *unlock_header)
{
    ompi_osc_rdma_header_unlock_ack_t unlock_ack;
    int ret;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_process_unlock entering (finished %d/%d)...",
                         module->passive_incoming_frag_count[source],
                         module->passive_incoming_frag_signal_count[source]));

    /* we cannot block when processing an incoming request; the caller is
       expected to retry once the remaining fragments have been processed */
    if (module->passive_incoming_frag_signal_count[source] !=
        module->passive_incoming_frag_count[source]) {
        return OMPI_ERR_WOULD_BLOCK;
    }

    /* acknowledge the unlock with an unbuffered control message */
    unlock_ack.base.type = OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_ACK;
    unlock_ack.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;

    ret = ompi_osc_rdma_control_send_unbuffered (module, source, &unlock_ack, sizeof (unlock_ack));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    /* reset fragment accounting for this peer's next passive target epoch */
    module->passive_incoming_frag_signal_count[source] = 0;
    module->passive_incoming_frag_count[source] = 0;

    OPAL_THREAD_LOCK(&module->lock);
    /* an exclusive unlock, or the last outstanding shared unlock, frees the
       window; hand the lock to the next queued requestor (if any) */
    if (unlock_header->lock_type == MPI_LOCK_EXCLUSIVE || 0 == --module->shared_count) {
        module->lock_status = 0;
        ompi_osc_activate_next_lock (module);
    }
    OPAL_THREAD_UNLOCK(&module->lock);

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc rdma: finished processing unlock fragment"));

    return ret;
}
/**
 * Target-side handler for a flush request.
 *
 * @param[in] module       - OSC RDMA module
 * @param[in] source       - Source rank
 * @param[in] flush_header - Incoming flush header
 *
 * Once all signalled fragments from @source have been processed, resets the
 * per-peer fragment accounting and replies with a flush ack carrying the
 * request's serial number.
 *
 * @return OMPI_ERR_WOULD_BLOCK if fragments from @source are still pending,
 *         otherwise the result of sending the flush ack.
 */
int ompi_osc_rdma_process_flush (ompi_osc_rdma_module_t *module, int source,
                                 ompi_osc_rdma_header_flush_t *flush_header)
{
    ompi_osc_rdma_header_flush_ack_t ack;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_process_flush entering (finished %d/%d)...",
                         module->passive_incoming_frag_count[source],
                         module->passive_incoming_frag_signal_count[source]));

    /* we cannot block when processing an incoming request */
    if (module->passive_incoming_frag_count[source] !=
        module->passive_incoming_frag_signal_count[source]) {
        return OMPI_ERR_WOULD_BLOCK;
    }

    /* all fragments from this peer are complete; reset the accounting */
    module->passive_incoming_frag_count[source] = 0;
    module->passive_incoming_frag_signal_count[source] = 0;

    ack.base.type = OMPI_OSC_RDMA_HDR_TYPE_FLUSH_ACK;
    ack.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
    ack.serial_number = flush_header->serial_number;

    return ompi_osc_rdma_control_send_unbuffered (module, source, &ack, sizeof (ack));
}

0
ompi/mca/pml/bfo/.opal_ignore Обычный файл
Просмотреть файл

Просмотреть файл

@ -513,7 +513,7 @@ int mca_pml_bfo_send_fin( ompi_proc_t* proc,
fin->des_cbdata = NULL; fin->des_cbdata = NULL;
/* fill in header */ /* fill in header */
hdr = (mca_pml_bfo_fin_hdr_t*)fin->des_local->seg_addr.pval; hdr = (mca_pml_bfo_fin_hdr_t*)fin->des_segments->seg_addr.pval;
hdr->hdr_common.hdr_flags = 0; hdr->hdr_common.hdr_flags = 0;
hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FIN; hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FIN;
hdr->hdr_des = hdr_des; hdr->hdr_des = hdr_des;

Просмотреть файл

@ -284,7 +284,7 @@ void mca_pml_bfo_repost_fin(struct mca_btl_base_descriptor_t* des) {
proc = (ompi_proc_t*) des->des_cbdata; proc = (ompi_proc_t*) des->des_cbdata;
bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
hdr = (mca_pml_bfo_fin_hdr_t*)des->des_local->seg_addr.pval; hdr = (mca_pml_bfo_fin_hdr_t*)des->des_segments->seg_addr.pval;
opal_output_verbose(20, mca_pml_bfo_output, opal_output_verbose(20, mca_pml_bfo_output,
"REPOST: BFO_HDR_TYPE_FIN: seq=%d,myrank=%d,peer=%d,hdr->hdr_fail=%d,src=%d", "REPOST: BFO_HDR_TYPE_FIN: seq=%d,myrank=%d,peer=%d,hdr->hdr_fail=%d,src=%d",
@ -376,7 +376,7 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t* btl
mca_btl_base_tag_t tag, mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des, mca_btl_base_descriptor_t* des,
void* cbdata ) { void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local; mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
mca_pml_bfo_recv_request_t* recvreq; mca_pml_bfo_recv_request_t* recvreq;
ompi_proc_t* ompi_proc; ompi_proc_t* ompi_proc;
@ -461,7 +461,7 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartack(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag, mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des, mca_btl_base_descriptor_t* des,
void* cbdata ) { void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local; mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
mca_pml_bfo_send_request_t* sendreq; mca_pml_bfo_send_request_t* sendreq;
@ -522,7 +522,7 @@ void mca_pml_bfo_recv_frag_callback_recverrnotify(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag, mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des, mca_btl_base_descriptor_t* des,
void* cbdata ) { void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local; mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
mca_pml_bfo_send_request_t* sendreq; mca_pml_bfo_send_request_t* sendreq;
@ -607,7 +607,7 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartnack(mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des, mca_btl_base_descriptor_t* des,
void* cbdata ) { void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local; mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
mca_pml_bfo_send_request_t* sendreq; mca_pml_bfo_send_request_t* sendreq;
@ -701,7 +701,7 @@ void mca_pml_bfo_send_request_rndvrestartnotify(mca_pml_bfo_send_request_t* send
} }
/* fill out header */ /* fill out header */
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval; restart = (mca_pml_bfo_restart_hdr_t*)des->des_segments->seg_addr.pval;
restart->hdr_match.hdr_common.hdr_flags = 0; restart->hdr_match.hdr_common.hdr_flags = 0;
restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY; restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY;
restart->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; restart->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
@ -915,7 +915,7 @@ void mca_pml_bfo_repost_match_fragment(struct mca_btl_base_descriptor_t* des)
mca_btl_base_segment_t* oldseg; mca_btl_base_segment_t* oldseg;
mca_btl_base_segment_t* newseg; mca_btl_base_segment_t* newseg;
oldseg = des->des_local; oldseg = des->des_segments;
/* The alloc routine must be called with the MCA_BTL_NO_ORDER /* The alloc routine must be called with the MCA_BTL_NO_ORDER
* flag so that the allocation routine works. The allocation * flag so that the allocation routine works. The allocation
* will fill in the order flag in the descriptor. */ * will fill in the order flag in the descriptor. */
@ -928,7 +928,7 @@ void mca_pml_bfo_repost_match_fragment(struct mca_btl_base_descriptor_t* des)
__FILE__, __LINE__); __FILE__, __LINE__);
ompi_rte_abort(-1, NULL); ompi_rte_abort(-1, NULL);
} }
newseg = newdes->des_local; newseg = newdes->des_segments;
/* Copy over all the data that is actually sent over the wire */ /* Copy over all the data that is actually sent over the wire */
memcpy(newseg->seg_addr.pval, oldseg->seg_addr.pval, oldseg->seg_len); memcpy(newseg->seg_addr.pval, oldseg->seg_addr.pval, oldseg->seg_len);
newseg->seg_len = oldseg->seg_len; newseg->seg_len = oldseg->seg_len;
@ -972,7 +972,7 @@ mca_pml_bfo_rndvrestartnotify_completion(mca_btl_base_module_t* btl,
mca_pml_bfo_restart_hdr_t* restart; mca_pml_bfo_restart_hdr_t* restart;
mca_pml_bfo_send_request_t* sendreq; mca_pml_bfo_send_request_t* sendreq;
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval; restart = (mca_pml_bfo_restart_hdr_t*)des->des_segments->seg_addr.pval;
sendreq = (mca_pml_bfo_send_request_t*) restart->hdr_src_req.pval; sendreq = (mca_pml_bfo_send_request_t*) restart->hdr_src_req.pval;
/* Need to resend this message in the case that it fails */ /* Need to resend this message in the case that it fails */
@ -1061,7 +1061,7 @@ void mca_pml_bfo_recv_request_recverrnotify(mca_pml_bfo_recv_request_t* recvreq,
} }
/* fill out header */ /* fill out header */
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval; restart = (mca_pml_bfo_restart_hdr_t*)des->des_segments->seg_addr.pval;
restart->hdr_match.hdr_common.hdr_flags = 0; restart->hdr_match.hdr_common.hdr_flags = 0;
restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY; restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY;
restart->hdr_match.hdr_ctx = recvreq->req_recv.req_base.req_comm->c_contextid; restart->hdr_match.hdr_ctx = recvreq->req_recv.req_base.req_comm->c_contextid;
@ -1145,7 +1145,7 @@ void mca_pml_bfo_recv_request_rndvrestartack(mca_pml_bfo_recv_request_t* recvreq
} }
/* fill out header */ /* fill out header */
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval; restart = (mca_pml_bfo_restart_hdr_t*)des->des_segments->seg_addr.pval;
restart->hdr_match.hdr_common.hdr_flags = 0; restart->hdr_match.hdr_common.hdr_flags = 0;
restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK; restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK;
restart->hdr_match.hdr_ctx = recvreq->req_recv.req_base.req_comm->c_contextid; restart->hdr_match.hdr_ctx = recvreq->req_recv.req_base.req_comm->c_contextid;
@ -1208,7 +1208,7 @@ void mca_pml_bfo_recv_request_rndvrestartnack(mca_btl_base_descriptor_t* olddes,
ompi_proc = olddes->des_cbdata; ompi_proc = olddes->des_cbdata;
} }
segments = olddes->des_local; segments = olddes->des_segments;
hdr = (mca_pml_bfo_restart_hdr_t*)segments->seg_addr.pval; hdr = (mca_pml_bfo_restart_hdr_t*)segments->seg_addr.pval;
bml_endpoint = ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; bml_endpoint = ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
@ -1226,7 +1226,7 @@ void mca_pml_bfo_recv_request_rndvrestartnack(mca_btl_base_descriptor_t* olddes,
} }
/* fill out header */ /* fill out header */
nack = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval; nack = (mca_pml_bfo_restart_hdr_t*)des->des_segments->seg_addr.pval;
nack->hdr_match.hdr_common.hdr_flags = 0; nack->hdr_match.hdr_common.hdr_flags = 0;
nack->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK; nack->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK;
nack->hdr_match.hdr_ctx = hdr->hdr_match.hdr_ctx; nack->hdr_match.hdr_ctx = hdr->hdr_match.hdr_ctx;
@ -1317,13 +1317,13 @@ void mca_pml_bfo_recv_restart_completion( mca_btl_base_module_t* btl,
int status ) int status )
{ {
if(OPAL_UNLIKELY(OMPI_SUCCESS != status)) { if(OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
mca_pml_bfo_common_hdr_t* common = des->des_local->seg_addr.pval; mca_pml_bfo_common_hdr_t* common = des->des_segments->seg_addr.pval;
mca_pml_bfo_restart_hdr_t* restart; /* RESTART header */ mca_pml_bfo_restart_hdr_t* restart; /* RESTART header */
mca_pml_bfo_recv_request_t* recvreq; mca_pml_bfo_recv_request_t* recvreq;
switch (common->hdr_type) { switch (common->hdr_type) {
case MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK: case MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK:
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval; restart = (mca_pml_bfo_restart_hdr_t*)des->des_segments->seg_addr.pval;
recvreq = (mca_pml_bfo_recv_request_t*) restart->hdr_dst_req.pval; recvreq = (mca_pml_bfo_recv_request_t*) restart->hdr_dst_req.pval;
opal_output_verbose(30, mca_pml_bfo_output, opal_output_verbose(30, mca_pml_bfo_output,
"RNDVRESTARTACK: completion failed: try again " "RNDVRESTARTACK: completion failed: try again "
@ -1351,7 +1351,7 @@ void mca_pml_bfo_recv_restart_completion( mca_btl_base_module_t* btl,
mca_pml_bfo_recv_request_rndvrestartnack(des, NULL, true); mca_pml_bfo_recv_request_rndvrestartnack(des, NULL, true);
break; break;
case MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY: case MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY:
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval; restart = (mca_pml_bfo_restart_hdr_t*)des->des_segments->seg_addr.pval;
recvreq = (mca_pml_bfo_recv_request_t*) restart->hdr_dst_req.pval; recvreq = (mca_pml_bfo_recv_request_t*) restart->hdr_dst_req.pval;
/* With just two BTLs, this should never happen as we are /* With just two BTLs, this should never happen as we are
* typically sending the RECVERRNOTIFY message on the * typically sending the RECVERRNOTIFY message on the
@ -1759,7 +1759,7 @@ void mca_pml_bfo_check_recv_ctl_completion_status(mca_btl_base_module_t* btl,
struct mca_btl_base_descriptor_t* des, struct mca_btl_base_descriptor_t* des,
int status) int status)
{ {
mca_pml_bfo_common_hdr_t * common = des->des_local->seg_addr.pval; mca_pml_bfo_common_hdr_t * common = des->des_segments->seg_addr.pval;
mca_pml_bfo_rdma_hdr_t* hdr; /* PUT header */ mca_pml_bfo_rdma_hdr_t* hdr; /* PUT header */
struct mca_btl_base_descriptor_t* rdma_des; struct mca_btl_base_descriptor_t* rdma_des;
mca_pml_bfo_recv_request_t* recvreq; mca_pml_bfo_recv_request_t* recvreq;
@ -1789,7 +1789,7 @@ void mca_pml_bfo_check_recv_ctl_completion_status(mca_btl_base_module_t* btl,
break; break;
case MCA_PML_BFO_HDR_TYPE_PUT: case MCA_PML_BFO_HDR_TYPE_PUT:
hdr = (mca_pml_bfo_rdma_hdr_t*)des->des_local->seg_addr.pval; hdr = (mca_pml_bfo_rdma_hdr_t*)des->des_segments->seg_addr.pval;
rdma_des = hdr->hdr_des.pval; rdma_des = hdr->hdr_des.pval;
recvreq = des->des_cbdata; recvreq = des->des_cbdata;
if ((NULL != rdma_des->des_cbdata) && (recvreq == rdma_des->des_cbdata)) { if ((NULL != rdma_des->des_cbdata) && (recvreq == rdma_des->des_cbdata)) {
@ -1947,14 +1947,14 @@ void mca_pml_bfo_update_eager_bml_btl_recv_ctl(mca_bml_base_btl_t** bml_btl,
struct mca_btl_base_descriptor_t* des) struct mca_btl_base_descriptor_t* des)
{ {
if ((*bml_btl)->btl != btl) { if ((*bml_btl)->btl != btl) {
mca_pml_bfo_common_hdr_t * common = des->des_local->seg_addr.pval; mca_pml_bfo_common_hdr_t * common = des->des_segments->seg_addr.pval;
mca_pml_bfo_ack_hdr_t* ack; /* ACK header */ mca_pml_bfo_ack_hdr_t* ack; /* ACK header */
mca_pml_bfo_recv_request_t* recvreq = NULL; mca_pml_bfo_recv_request_t* recvreq = NULL;
char *type = NULL; char *type = NULL;
switch (common->hdr_type) { switch (common->hdr_type) {
case MCA_PML_BFO_HDR_TYPE_ACK: case MCA_PML_BFO_HDR_TYPE_ACK:
ack = (mca_pml_bfo_ack_hdr_t*)des->des_local->seg_addr.pval; ack = (mca_pml_bfo_ack_hdr_t*)des->des_segments->seg_addr.pval;
recvreq = (mca_pml_bfo_recv_request_t*) ack->hdr_dst_req.pval; recvreq = (mca_pml_bfo_recv_request_t*) ack->hdr_dst_req.pval;
type = "ACK"; type = "ACK";
break; break;
@ -2106,11 +2106,11 @@ void mca_pml_bfo_find_recvreq_rdma_bml_btl(mca_bml_base_btl_t** bml_btl,
bool mca_pml_bfo_rndv_completion_status_error(struct mca_btl_base_descriptor_t* des, bool mca_pml_bfo_rndv_completion_status_error(struct mca_btl_base_descriptor_t* des,
mca_pml_bfo_send_request_t* sendreq) mca_pml_bfo_send_request_t* sendreq)
{ {
assert(((mca_pml_bfo_hdr_t*)((des)->des_local->seg_addr.pval))->hdr_match.hdr_ctx == assert(((mca_pml_bfo_hdr_t*)((des)->des_segments->seg_addr.pval))->hdr_match.hdr_ctx ==
(sendreq)->req_send.req_base.req_comm->c_contextid); (sendreq)->req_send.req_base.req_comm->c_contextid);
assert(((mca_pml_bfo_hdr_t*)((des)->des_local->seg_addr.pval))->hdr_match.hdr_src == assert(((mca_pml_bfo_hdr_t*)((des)->des_segments->seg_addr.pval))->hdr_match.hdr_src ==
(sendreq)->req_send.req_base.req_comm->c_my_rank); (sendreq)->req_send.req_base.req_comm->c_my_rank);
assert(((mca_pml_bfo_hdr_t*)((des)->des_local->seg_addr.pval))->hdr_match.hdr_seq == assert(((mca_pml_bfo_hdr_t*)((des)->des_segments->seg_addr.pval))->hdr_match.hdr_seq ==
(uint16_t)(sendreq)->req_send.req_base.req_sequence); (uint16_t)(sendreq)->req_send.req_base.req_sequence);
if ((!(sendreq)->req_error) && (NULL == (sendreq)->req_recv.pval)) { if ((!(sendreq)->req_error) && (NULL == (sendreq)->req_recv.pval)) {
(sendreq)->req_events--; (sendreq)->req_events--;
@ -2157,7 +2157,7 @@ void mca_pml_bfo_completion_sendreq_has_error(mca_pml_bfo_send_request_t* sendre
void mca_pml_bfo_send_ctl_completion_status_error(struct mca_btl_base_descriptor_t* des) void mca_pml_bfo_send_ctl_completion_status_error(struct mca_btl_base_descriptor_t* des)
{ {
mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata; mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata;
mca_pml_bfo_hdr_t* hdr = des->des_local->seg_addr.pval; mca_pml_bfo_hdr_t* hdr = des->des_segments->seg_addr.pval;
switch (hdr->hdr_common.hdr_type) { switch (hdr->hdr_common.hdr_type) {
case MCA_PML_BFO_HDR_TYPE_RGET: case MCA_PML_BFO_HDR_TYPE_RGET:
if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) || if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) ||

Просмотреть файл

@ -104,13 +104,13 @@ void mca_pml_bfo_recv_frag_callback_match(mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des, mca_btl_base_descriptor_t* des,
void* cbdata ) void* cbdata )
{ {
mca_btl_base_segment_t* segments = des->des_local; mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_bfo_match_hdr_t* hdr = (mca_pml_bfo_match_hdr_t*)segments->seg_addr.pval; mca_pml_bfo_match_hdr_t* hdr = (mca_pml_bfo_match_hdr_t*)segments->seg_addr.pval;
ompi_communicator_t *comm_ptr; ompi_communicator_t *comm_ptr;
mca_pml_bfo_recv_request_t *match = NULL; mca_pml_bfo_recv_request_t *match = NULL;
mca_pml_bfo_comm_t *comm; mca_pml_bfo_comm_t *comm;
mca_pml_bfo_comm_proc_t *proc; mca_pml_bfo_comm_proc_t *proc;
size_t num_segments = des->des_local_count; size_t num_segments = des->des_segment_count;
size_t bytes_received = 0; size_t bytes_received = 0;
assert(num_segments <= MCA_BTL_DES_MAX_SEGMENTS); assert(num_segments <= MCA_BTL_DES_MAX_SEGMENTS);
@ -257,7 +257,7 @@ void mca_pml_bfo_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des, mca_btl_base_descriptor_t* des,
void* cbdata ) void* cbdata )
{ {
mca_btl_base_segment_t* segments = des->des_local; mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) { if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) {
@ -265,7 +265,7 @@ void mca_pml_bfo_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
} }
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RNDV); bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RNDV);
mca_pml_bfo_recv_frag_match(btl, &hdr->hdr_match, segments, mca_pml_bfo_recv_frag_match(btl, &hdr->hdr_match, segments,
des->des_local_count, MCA_PML_BFO_HDR_TYPE_RNDV); des->des_segment_count, MCA_PML_BFO_HDR_TYPE_RNDV);
return; return;
} }
@ -274,7 +274,7 @@ void mca_pml_bfo_recv_frag_callback_rget(mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des, mca_btl_base_descriptor_t* des,
void* cbdata ) void* cbdata )
{ {
mca_btl_base_segment_t* segments = des->des_local; mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) { if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) {
@ -282,7 +282,7 @@ void mca_pml_bfo_recv_frag_callback_rget(mca_btl_base_module_t* btl,
} }
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RGET); bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RGET);
mca_pml_bfo_recv_frag_match(btl, &hdr->hdr_match, segments, mca_pml_bfo_recv_frag_match(btl, &hdr->hdr_match, segments,
des->des_local_count, MCA_PML_BFO_HDR_TYPE_RGET); des->des_segment_count, MCA_PML_BFO_HDR_TYPE_RGET);
return; return;
} }
@ -293,7 +293,7 @@ void mca_pml_bfo_recv_frag_callback_ack(mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des, mca_btl_base_descriptor_t* des,
void* cbdata ) void* cbdata )
{ {
mca_btl_base_segment_t* segments = des->des_local; mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
mca_pml_bfo_send_request_t* sendreq; mca_pml_bfo_send_request_t* sendreq;
@ -341,7 +341,7 @@ void mca_pml_bfo_recv_frag_callback_frag(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag, mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des, mca_btl_base_descriptor_t* des,
void* cbdata ) { void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local; mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
mca_pml_bfo_recv_request_t* recvreq; mca_pml_bfo_recv_request_t* recvreq;
@ -353,7 +353,7 @@ void mca_pml_bfo_recv_frag_callback_frag(mca_btl_base_module_t* btl,
#if PML_BFO #if PML_BFO
MCA_PML_BFO_ERROR_CHECK_ON_FRAG_CALLBACK(recvreq); MCA_PML_BFO_ERROR_CHECK_ON_FRAG_CALLBACK(recvreq);
#endif /* PML_BFO */ #endif /* PML_BFO */
mca_pml_bfo_recv_request_progress_frag(recvreq,btl,segments,des->des_local_count); mca_pml_bfo_recv_request_progress_frag(recvreq,btl,segments,des->des_segment_count);
return; return;
} }
@ -363,7 +363,7 @@ void mca_pml_bfo_recv_frag_callback_put(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag, mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des, mca_btl_base_descriptor_t* des,
void* cbdata ) { void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local; mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
mca_pml_bfo_send_request_t* sendreq; mca_pml_bfo_send_request_t* sendreq;
@ -386,7 +386,7 @@ void mca_pml_bfo_recv_frag_callback_fin(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag, mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des, mca_btl_base_descriptor_t* des,
void* cbdata ) { void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local; mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
mca_btl_base_descriptor_t* rdma; mca_btl_base_descriptor_t* rdma;

Просмотреть файл

@ -246,7 +246,7 @@ int mca_pml_bfo_recv_request_ack_send_btl(
} }
/* fill out header */ /* fill out header */
ack = (mca_pml_bfo_ack_hdr_t*)des->des_local->seg_addr.pval; ack = (mca_pml_bfo_ack_hdr_t*)des->des_segments->seg_addr.pval;
ack->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_ACK; ack->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_ACK;
ack->hdr_common.hdr_flags = nordma ? MCA_PML_BFO_HDR_FLAGS_NORDMA : 0; ack->hdr_common.hdr_flags = nordma ? MCA_PML_BFO_HDR_FLAGS_NORDMA : 0;
ack->hdr_src_req.lval = hdr_src_req; ack->hdr_src_req.lval = hdr_src_req;
@ -851,7 +851,7 @@ int mca_pml_bfo_recv_request_schedule_once( mca_pml_bfo_recv_request_t* recvreq,
dst->des_cbfunc = mca_pml_bfo_put_completion; dst->des_cbfunc = mca_pml_bfo_put_completion;
dst->des_cbdata = recvreq; dst->des_cbdata = recvreq;
seg_size = btl->btl_seg_size * dst->des_local_count; seg_size = btl->btl_seg_size * dst->des_segment_count;
/* prepare a descriptor for rdma control message */ /* prepare a descriptor for rdma control message */
mca_bml_base_alloc(bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof(mca_pml_bfo_rdma_hdr_t) + seg_size, mca_bml_base_alloc(bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof(mca_pml_bfo_rdma_hdr_t) + seg_size,
@ -867,7 +867,7 @@ int mca_pml_bfo_recv_request_schedule_once( mca_pml_bfo_recv_request_t* recvreq,
#endif /* PML_BFO */ #endif /* PML_BFO */
/* fill in rdma header */ /* fill in rdma header */
hdr = (mca_pml_bfo_rdma_hdr_t*)ctl->des_local->seg_addr.pval; hdr = (mca_pml_bfo_rdma_hdr_t*)ctl->des_segments->seg_addr.pval;
hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_PUT; hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_PUT;
hdr->hdr_common.hdr_flags = hdr->hdr_common.hdr_flags =
(!recvreq->req_ack_sent) ? MCA_PML_BFO_HDR_TYPE_ACK : 0; (!recvreq->req_ack_sent) ? MCA_PML_BFO_HDR_TYPE_ACK : 0;
@ -877,10 +877,10 @@ int mca_pml_bfo_recv_request_schedule_once( mca_pml_bfo_recv_request_t* recvreq,
#endif /* PML_BFO */ #endif /* PML_BFO */
hdr->hdr_des.pval = dst; hdr->hdr_des.pval = dst;
hdr->hdr_rdma_offset = recvreq->req_rdma_offset; hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
hdr->hdr_seg_cnt = dst->des_local_count; hdr->hdr_seg_cnt = dst->des_segment_count;
/* copy segments */ /* copy segments */
memmove (hdr + 1, dst->des_local, seg_size); memmove (hdr + 1, dst->des_segments, seg_size);
if(!recvreq->req_ack_sent) if(!recvreq->req_ack_sent)
recvreq->req_ack_sent = true; recvreq->req_ack_sent = true;

Просмотреть файл

@ -257,8 +257,8 @@ mca_pml_bfo_rndv_completion( mca_btl_base_module_t* btl,
* have to be atomic. * have to be atomic.
*/ */
req_bytes_delivered = mca_pml_bfo_compute_segment_length (btl->btl_seg_size, req_bytes_delivered = mca_pml_bfo_compute_segment_length (btl->btl_seg_size,
(void *) des->des_local, (void *) des->des_segments,
des->des_local_count, des->des_segment_count,
sizeof(mca_pml_bfo_rendezvous_hdr_t)); sizeof(mca_pml_bfo_rendezvous_hdr_t));
#if PML_BFO #if PML_BFO
@ -287,8 +287,8 @@ mca_pml_bfo_rget_completion( mca_btl_base_module_t* btl,
/* count bytes of user data actually delivered and check for request completion */ /* count bytes of user data actually delivered and check for request completion */
req_bytes_delivered = mca_pml_bfo_compute_segment_length (btl->btl_seg_size, req_bytes_delivered = mca_pml_bfo_compute_segment_length (btl->btl_seg_size,
(void *) des->des_local, (void *) des->des_segments,
des->des_local_count, 0); des->des_segment_count, 0);
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered); OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
send_request_pml_complete_check(sendreq); send_request_pml_complete_check(sendreq);
@ -357,8 +357,8 @@ mca_pml_bfo_frag_completion( mca_btl_base_module_t* btl,
/* count bytes of user data actually delivered */ /* count bytes of user data actually delivered */
req_bytes_delivered = mca_pml_bfo_compute_segment_length (btl->btl_seg_size, req_bytes_delivered = mca_pml_bfo_compute_segment_length (btl->btl_seg_size,
(void *) des->des_local, (void *) des->des_segments,
des->des_local_count, des->des_segment_count,
sizeof(mca_pml_bfo_frag_hdr_t)); sizeof(mca_pml_bfo_frag_hdr_t));
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, -1); OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, -1);
@ -409,7 +409,7 @@ int mca_pml_bfo_send_request_start_buffered(
if( OPAL_UNLIKELY(NULL == des) ) { if( OPAL_UNLIKELY(NULL == des) ) {
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
segment = des->des_local; segment = des->des_segments;
/* pack the data into the BTL supplied buffer */ /* pack the data into the BTL supplied buffer */
iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval + iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval +
@ -562,7 +562,7 @@ int mca_pml_bfo_send_request_start_copy( mca_pml_bfo_send_request_t* sendreq,
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
segment = des->des_local; segment = des->des_segments;
if(size > 0) { if(size > 0) {
/* pack the data into the supplied buffer */ /* pack the data into the supplied buffer */
@ -657,7 +657,7 @@ int mca_pml_bfo_send_request_start_prepare( mca_pml_bfo_send_request_t* sendreq,
if( OPAL_UNLIKELY(NULL == des) ) { if( OPAL_UNLIKELY(NULL == des) ) {
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
segment = des->des_local; segment = des->des_segments;
/* build match header */ /* build match header */
hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval; hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval;
@ -747,7 +747,7 @@ int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq,
src->des_cbfunc = mca_pml_bfo_rget_completion; src->des_cbfunc = mca_pml_bfo_rget_completion;
src->des_cbdata = sendreq; src->des_cbdata = sendreq;
seg_size = bml_btl->btl->btl_seg_size * src->des_local_count; seg_size = bml_btl->btl->btl_seg_size * src->des_segment_count;
/* allocate space for get hdr + segment list */ /* allocate space for get hdr + segment list */
mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER,
@ -759,7 +759,7 @@ int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq,
mca_bml_base_free(bml_btl, src); mca_bml_base_free(bml_btl, src);
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
segment = des->des_local; segment = des->des_segments;
/* build match header */ /* build match header */
hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval; hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval;
@ -775,13 +775,13 @@ int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq,
MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, "RGET"); MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, "RGET");
#endif /* PML_BFO */ #endif /* PML_BFO */
hdr->hdr_rget.hdr_des.pval = src; hdr->hdr_rget.hdr_des.pval = src;
hdr->hdr_rget.hdr_seg_cnt = src->des_local_count; hdr->hdr_rget.hdr_seg_cnt = src->des_segment_count;
bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RGET, bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RGET,
sendreq->req_send.req_base.req_proc); sendreq->req_send.req_base.req_proc);
/* copy segment data */ /* copy segment data */
memmove (&hdr->hdr_rget + 1, src->des_local, seg_size); memmove (&hdr->hdr_rget + 1, src->des_segments, seg_size);
des->des_cbfunc = mca_pml_bfo_send_ctl_completion; des->des_cbfunc = mca_pml_bfo_send_ctl_completion;
@ -808,7 +808,7 @@ int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq,
if( OPAL_UNLIKELY(NULL == des)) { if( OPAL_UNLIKELY(NULL == des)) {
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
segment = des->des_local; segment = des->des_segments;
/* build hdr */ /* build hdr */
hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval; hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval;
@ -912,7 +912,7 @@ int mca_pml_bfo_send_request_start_rndv( mca_pml_bfo_send_request_t* sendreq,
if( OPAL_UNLIKELY(NULL == des) ) { if( OPAL_UNLIKELY(NULL == des) ) {
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
segment = des->des_local; segment = des->des_segments;
/* build hdr */ /* build hdr */
hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval; hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval;
@ -1145,7 +1145,7 @@ cannot_pack:
des->des_cbdata = sendreq; des->des_cbdata = sendreq;
/* setup header */ /* setup header */
hdr = (mca_pml_bfo_frag_hdr_t*)des->des_local->seg_addr.pval; hdr = (mca_pml_bfo_frag_hdr_t*)des->des_segments->seg_addr.pval;
hdr->hdr_common.hdr_flags = 0; hdr->hdr_common.hdr_flags = 0;
hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FRAG; hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FRAG;
hdr->hdr_frag_offset = range->range_send_offset; hdr->hdr_frag_offset = range->range_send_offset;

Просмотреть файл

@ -14,7 +14,7 @@
* Copyright (c) 2006-2008 University of Houston. All rights reserved. * Copyright (c) 2006-2008 University of Houston. All rights reserved.
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved. * Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
@ -500,17 +500,17 @@ static void mca_pml_ob1_dump_hdr(mca_pml_ob1_hdr_t* hdr)
case MCA_PML_OB1_HDR_TYPE_RGET: case MCA_PML_OB1_HDR_TYPE_RGET:
type = "RGET"; type = "RGET";
snprintf( header, 128, "ctx %5d src %d tag %d seq %d msg_length %" PRIu64 snprintf( header, 128, "ctx %5d src %d tag %d seq %d msg_length %" PRIu64
"seg_cnt %d hdr_des %" PRIu64, "frag %" PRIu64 " src_ptr %" PRIu64,
hdr->hdr_rndv.hdr_match.hdr_ctx, hdr->hdr_rndv.hdr_match.hdr_src, hdr->hdr_rndv.hdr_match.hdr_ctx, hdr->hdr_rndv.hdr_match.hdr_src,
hdr->hdr_rndv.hdr_match.hdr_tag, hdr->hdr_rndv.hdr_match.hdr_seq, hdr->hdr_rndv.hdr_match.hdr_tag, hdr->hdr_rndv.hdr_match.hdr_seq,
hdr->hdr_rndv.hdr_msg_length, hdr->hdr_rndv.hdr_msg_length, hdr->hdr_rget.hdr_frag.lval,
hdr->hdr_rget.hdr_seg_cnt, hdr->hdr_rget.hdr_des.lval); hdr->hdr_rget.hdr_src_ptr);
break; break;
case MCA_PML_OB1_HDR_TYPE_ACK: case MCA_PML_OB1_HDR_TYPE_ACK:
type = "ACK"; type = "ACK";
snprintf( header, 128, "src_req %p dst_req %p offset %" PRIu64, snprintf( header, 128, "src_req %p dst_req %p offset %" PRIu64 " size %" PRIu64,
hdr->hdr_ack.hdr_src_req.pval, hdr->hdr_ack.hdr_dst_req.pval, hdr->hdr_ack.hdr_src_req.pval, hdr->hdr_ack.hdr_dst_req.pval,
hdr->hdr_ack.hdr_send_offset); hdr->hdr_ack.hdr_send_offset, hdr->hdr_ack.hdr_send_size);
break; break;
case MCA_PML_OB1_HDR_TYPE_FRAG: case MCA_PML_OB1_HDR_TYPE_FRAG:
type = "FRAG"; type = "FRAG";
@ -520,10 +520,11 @@ static void mca_pml_ob1_dump_hdr(mca_pml_ob1_hdr_t* hdr)
break; break;
case MCA_PML_OB1_HDR_TYPE_PUT: case MCA_PML_OB1_HDR_TYPE_PUT:
type = "PUT"; type = "PUT";
snprintf( header, 128, "seg_cnt %d dst_req %p src_des %p recv_req %p offset %" PRIu64 " [%p %" PRIu64 "]", snprintf( header, 128, "dst_req %p src_frag %p recv_req %p offset %" PRIu64
hdr->hdr_rdma.hdr_seg_cnt, hdr->hdr_rdma.hdr_req.pval, hdr->hdr_rdma.hdr_des.pval, " dst_ptr %" PRIu64 " dst_size %" PRIu64,
hdr->hdr_rdma.hdr_req.pval, hdr->hdr_rdma.hdr_frag.pval,
hdr->hdr_rdma.hdr_recv_req.pval, hdr->hdr_rdma.hdr_rdma_offset, hdr->hdr_rdma.hdr_recv_req.pval, hdr->hdr_rdma.hdr_rdma_offset,
hdr->hdr_rdma.hdr_segs[0].seg_addr.pval, hdr->hdr_rdma.hdr_segs[0].seg_len); hdr->hdr_rdma.hdr_dst_ptr, hdr->hdr_rdma.hdr_dst_size);
break; break;
case MCA_PML_OB1_HDR_TYPE_FIN: case MCA_PML_OB1_HDR_TYPE_FIN:
type = "FIN"; type = "FIN";
@ -638,37 +639,32 @@ static void mca_pml_ob1_fin_completion( mca_btl_base_module_t* btl,
*/ */
int mca_pml_ob1_send_fin( ompi_proc_t* proc, int mca_pml_ob1_send_fin( ompi_proc_t* proc,
mca_bml_base_btl_t* bml_btl, mca_bml_base_btl_t* bml_btl,
opal_ptr_t hdr_des, opal_ptr_t hdr_frag,
uint64_t rdma_size,
uint8_t order, uint8_t order,
uint32_t status ) int status )
{ {
mca_btl_base_descriptor_t* fin; mca_btl_base_descriptor_t* fin;
mca_pml_ob1_fin_hdr_t* hdr;
int rc; int rc;
mca_bml_base_alloc(bml_btl, &fin, order, sizeof(mca_pml_ob1_fin_hdr_t), mca_bml_base_alloc(bml_btl, &fin, order, sizeof(mca_pml_ob1_fin_hdr_t),
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
if(NULL == fin) { if(NULL == fin) {
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status); MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status);
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
fin->des_cbfunc = mca_pml_ob1_fin_completion; fin->des_cbfunc = mca_pml_ob1_fin_completion;
fin->des_cbdata = NULL; fin->des_cbdata = NULL;
/* fill in header */ /* fill in header */
hdr = (mca_pml_ob1_fin_hdr_t*)fin->des_local->seg_addr.pval; mca_pml_ob1_fin_hdr_prepare ((mca_pml_ob1_fin_hdr_t *) fin->des_segments->seg_addr.pval,
hdr->hdr_common.hdr_flags = 0; 0, hdr_frag.lval, status ? status : (int64_t) rdma_size);
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN;
hdr->hdr_des = hdr_des;
hdr->hdr_fail = status;
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FIN, proc); ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FIN, proc);
/* queue request */ /* queue request */
rc = mca_bml_base_send( bml_btl, rc = mca_bml_base_send( bml_btl, fin, MCA_PML_OB1_HDR_TYPE_FIN );
fin,
MCA_PML_OB1_HDR_TYPE_FIN );
if( OPAL_LIKELY( rc >= 0 ) ) { if( OPAL_LIKELY( rc >= 0 ) ) {
if( OPAL_LIKELY( 1 == rc ) ) { if( OPAL_LIKELY( 1 == rc ) ) {
MCA_PML_OB1_PROGRESS_PENDING(bml_btl); MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
@ -676,7 +672,7 @@ int mca_pml_ob1_send_fin( ompi_proc_t* proc,
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
mca_bml_base_free(bml_btl, fin); mca_bml_base_free(bml_btl, fin);
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status); MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status);
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
@ -717,6 +713,7 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
pckt->hdr.hdr_ack.hdr_src_req.lval, pckt->hdr.hdr_ack.hdr_src_req.lval,
pckt->hdr.hdr_ack.hdr_dst_req.pval, pckt->hdr.hdr_ack.hdr_dst_req.pval,
pckt->hdr.hdr_ack.hdr_send_offset, pckt->hdr.hdr_ack.hdr_send_offset,
pckt->hdr.hdr_ack.hdr_send_size,
pckt->hdr.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA); pckt->hdr.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA);
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) { if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
OPAL_THREAD_LOCK(&mca_pml_ob1.lock); OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
@ -728,9 +725,10 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
break; break;
case MCA_PML_OB1_HDR_TYPE_FIN: case MCA_PML_OB1_HDR_TYPE_FIN:
rc = mca_pml_ob1_send_fin(pckt->proc, send_dst, rc = mca_pml_ob1_send_fin(pckt->proc, send_dst,
pckt->hdr.hdr_fin.hdr_des, pckt->hdr.hdr_fin.hdr_frag,
pckt->hdr.hdr_fin.hdr_size,
pckt->order, pckt->order,
pckt->hdr.hdr_fin.hdr_fail); pckt->status);
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) { if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
return; return;
} }

Просмотреть файл

@ -216,6 +216,7 @@ struct mca_pml_ob1_pckt_pending_t {
mca_pml_ob1_hdr_t hdr; mca_pml_ob1_hdr_t hdr;
struct mca_bml_base_btl_t *bml_btl; struct mca_bml_base_btl_t *bml_btl;
uint8_t order; uint8_t order;
int status;
}; };
typedef struct mca_pml_ob1_pckt_pending_t mca_pml_ob1_pckt_pending_t; typedef struct mca_pml_ob1_pckt_pending_t mca_pml_ob1_pckt_pending_t;
OBJ_CLASS_DECLARATION(mca_pml_ob1_pckt_pending_t); OBJ_CLASS_DECLARATION(mca_pml_ob1_pckt_pending_t);
@ -234,17 +235,17 @@ do { \
(ompi_free_list_item_t*)pckt); \ (ompi_free_list_item_t*)pckt); \
} while(0) } while(0)
#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D, B, O, S) \ #define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D, Sz, B, O, S) \
do { \ do { \
mca_pml_ob1_pckt_pending_t *_pckt; \ mca_pml_ob1_pckt_pending_t *_pckt; \
\ \
MCA_PML_OB1_PCKT_PENDING_ALLOC(_pckt); \ MCA_PML_OB1_PCKT_PENDING_ALLOC(_pckt); \
_pckt->hdr.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN; \ mca_pml_ob1_fin_hdr_prepare (&_pckt->hdr.hdr_fin, 0, \
_pckt->hdr.hdr_fin.hdr_des = (D); \ (D).lval, (Sz)); \
_pckt->hdr.hdr_fin.hdr_fail = (S); \
_pckt->proc = (P); \ _pckt->proc = (P); \
_pckt->bml_btl = (B); \ _pckt->bml_btl = (B); \
_pckt->order = (O); \ _pckt->order = (O); \
_pckt->status = (S); \
OPAL_THREAD_LOCK(&mca_pml_ob1.lock); \ OPAL_THREAD_LOCK(&mca_pml_ob1.lock); \
opal_list_append(&mca_pml_ob1.pckt_pending, \ opal_list_append(&mca_pml_ob1.pckt_pending, \
(opal_list_item_t*)_pckt); \ (opal_list_item_t*)_pckt); \
@ -253,7 +254,7 @@ do { \
int mca_pml_ob1_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl, int mca_pml_ob1_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
opal_ptr_t hdr_des, uint8_t order, uint32_t status); opal_ptr_t hdr_frag, uint64_t size, uint8_t order, int status);
/* This function tries to resend FIN/ACK packets from pckt_pending queue. /* This function tries to resend FIN/ACK packets from pckt_pending queue.
* Packets are added to the queue when sending of FIN or ACK is failed due to * Packets are added to the queue when sending of FIN or ACK is failed due to
@ -283,20 +284,6 @@ void mca_pml_ob1_process_pending_rdma(void);
/* /*
* Compute the total number of bytes on supplied descriptor * Compute the total number of bytes on supplied descriptor
*/ */
static inline size_t
mca_pml_ob1_compute_segment_length(size_t seg_size, void *segments,
size_t count, size_t hdrlen)
{
size_t i, length = 0;
mca_btl_base_segment_t *segment = (mca_btl_base_segment_t*)segments;
for (i = 0; i < count ; ++i) {
length += segment->seg_len;
segment = (mca_btl_base_segment_t *)((char *)segment + seg_size);
}
return (length - hdrlen);
}
static inline size_t static inline size_t
mca_pml_ob1_compute_segment_length_base(mca_btl_base_segment_t *segments, mca_pml_ob1_compute_segment_length_base(mca_btl_base_segment_t *segments,
size_t count, size_t hdrlen) size_t count, size_t hdrlen)
@ -338,7 +325,7 @@ mca_pml_ob1_compute_segment_length_remote (size_t seg_size, void *segments,
/* represent BTL chosen for sending request */ /* represent BTL chosen for sending request */
struct mca_pml_ob1_com_btl_t { struct mca_pml_ob1_com_btl_t {
mca_bml_base_btl_t *bml_btl; mca_bml_base_btl_t *bml_btl;
struct mca_mpool_base_registration_t* btl_reg; struct mca_btl_base_registration_handle_t *btl_reg;
size_t length; size_t length;
}; };
typedef struct mca_pml_ob1_com_btl_t mca_pml_ob1_com_btl_t; typedef struct mca_pml_ob1_com_btl_t mca_pml_ob1_com_btl_t;

Просмотреть файл

@ -11,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved. * Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights * Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
@ -63,6 +63,13 @@ struct mca_pml_ob1_common_hdr_t {
}; };
typedef struct mca_pml_ob1_common_hdr_t mca_pml_ob1_common_hdr_t; typedef struct mca_pml_ob1_common_hdr_t mca_pml_ob1_common_hdr_t;
static inline void mca_pml_ob1_common_hdr_prepare (mca_pml_ob1_common_hdr_t *hdr, uint8_t hdr_type,
uint8_t hdr_flags)
{
hdr->hdr_type = hdr_type;
hdr->hdr_flags = hdr_flags;
}
#define MCA_PML_OB1_COMMON_HDR_NTOH(h) #define MCA_PML_OB1_COMMON_HDR_NTOH(h)
#define MCA_PML_OB1_COMMON_HDR_HTON(h) #define MCA_PML_OB1_COMMON_HDR_HTON(h)
@ -88,15 +95,19 @@ struct mca_pml_ob1_match_hdr_t {
typedef struct mca_pml_ob1_match_hdr_t mca_pml_ob1_match_hdr_t; typedef struct mca_pml_ob1_match_hdr_t mca_pml_ob1_match_hdr_t;
static inline void mca_pml_ob1_match_hdr_prepare (mca_pml_ob1_match_hdr_t *hdr, uint8_t hdr_type, uint8_t hdr_flags,
uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq)
{
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, hdr_type, hdr_flags);
hdr->hdr_ctx = hdr_ctx;
hdr->hdr_src = hdr_src;
hdr->hdr_tag = hdr_tag;
hdr->hdr_seq = hdr_seq;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_OB1_MATCH_HDR_FILL(h) \ hdr->hdr_padding[0] = 0;
do { \ hdr->hdr_padding[1] = 0;
(h).hdr_padding[0] = 0; \ #endif
(h).hdr_padding[1] = 0; \ }
} while(0)
#else
#define MCA_PML_OB1_MATCH_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
#define MCA_PML_OB1_MATCH_HDR_NTOH(h) \ #define MCA_PML_OB1_MATCH_HDR_NTOH(h) \
do { \ do { \
@ -110,7 +121,6 @@ do { \
#define MCA_PML_OB1_MATCH_HDR_HTON(h) \ #define MCA_PML_OB1_MATCH_HDR_HTON(h) \
do { \ do { \
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \ MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
MCA_PML_OB1_MATCH_HDR_FILL(h); \
(h).hdr_ctx = htons((h).hdr_ctx); \ (h).hdr_ctx = htons((h).hdr_ctx); \
(h).hdr_src = htonl((h).hdr_src); \ (h).hdr_src = htonl((h).hdr_src); \
(h).hdr_tag = htonl((h).hdr_tag); \ (h).hdr_tag = htonl((h).hdr_tag); \
@ -129,12 +139,14 @@ struct mca_pml_ob1_rendezvous_hdr_t {
}; };
typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t; typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG static inline void mca_pml_ob1_rendezvous_hdr_prepare (mca_pml_ob1_rendezvous_hdr_t *hdr, uint8_t hdr_type, uint8_t hdr_flags,
#define MCA_PML_OB1_RNDV_HDR_FILL(h) \ uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq,
MCA_PML_OB1_MATCH_HDR_FILL((h).hdr_match) uint64_t hdr_msg_length, void *hdr_src_req)
#else {
#define MCA_PML_OB1_RNDV_HDR_FILL(h) mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, hdr_type, hdr_flags, hdr_ctx, hdr_src, hdr_tag, hdr_seq);
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */ hdr->hdr_msg_length = hdr_msg_length;
hdr->hdr_src_req.pval = hdr_src_req;
}
/* Note that hdr_src_req is not put in network byte order because it /* Note that hdr_src_req is not put in network byte order because it
is never processed by the receiver, other than being copied into is never processed by the receiver, other than being copied into
@ -148,7 +160,6 @@ typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t;
#define MCA_PML_OB1_RNDV_HDR_HTON(h) \ #define MCA_PML_OB1_RNDV_HDR_HTON(h) \
do { \ do { \
MCA_PML_OB1_MATCH_HDR_HTON((h).hdr_match); \ MCA_PML_OB1_MATCH_HDR_HTON((h).hdr_match); \
MCA_PML_OB1_RNDV_HDR_FILL(h); \
(h).hdr_msg_length = hton64((h).hdr_msg_length); \ (h).hdr_msg_length = hton64((h).hdr_msg_length); \
} while (0) } while (0)
@ -157,38 +168,47 @@ typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t;
*/ */
struct mca_pml_ob1_rget_hdr_t { struct mca_pml_ob1_rget_hdr_t {
mca_pml_ob1_rendezvous_hdr_t hdr_rndv; mca_pml_ob1_rendezvous_hdr_t hdr_rndv;
uint32_t hdr_seg_cnt; /**< number of segments for rdma */
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
uint8_t hdr_padding[4]; uint8_t hdr_padding[4];
#endif #endif
opal_ptr_t hdr_des; /**< source descriptor */ opal_ptr_t hdr_frag; /**< source fragment (for fin) */
uint64_t hdr_src_ptr; /**< source pointer */
/* btl registration handle data follows */
}; };
typedef struct mca_pml_ob1_rget_hdr_t mca_pml_ob1_rget_hdr_t; typedef struct mca_pml_ob1_rget_hdr_t mca_pml_ob1_rget_hdr_t;
static inline void mca_pml_ob1_rget_hdr_prepare (mca_pml_ob1_rget_hdr_t *hdr, uint8_t hdr_flags,
uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq,
uint64_t hdr_msg_length, void *hdr_src_req, void *hdr_frag,
void *hdr_src_ptr, void *local_handle, size_t local_handle_size)
{
mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RGET, hdr_flags,
hdr_ctx, hdr_src, hdr_tag, hdr_seq, hdr_msg_length, hdr_src_req);
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_OB1_RGET_HDR_FILL(h) \ hdr->hdr_padding[0] = 0;
do { \ hdr->hdr_padding[1] = 0;
MCA_PML_OB1_RNDV_HDR_FILL((h).hdr_rndv); \ hdr->hdr_padding[2] = 0;
(h).hdr_padding[0] = 0; \ hdr->hdr_padding[3] = 0;
(h).hdr_padding[1] = 0; \ #endif
(h).hdr_padding[2] = 0; \ hdr->hdr_frag.pval = hdr_frag;
(h).hdr_padding[3] = 0; \ hdr->hdr_src_ptr = (uint64_t)(intptr_t) hdr_src_ptr;
} while(0)
#else
#define MCA_PML_OB1_RGET_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
#define MCA_PML_OB1_RGET_HDR_NTOH(h) \ /* copy registration handle */
do { \ memcpy (hdr + 1, local_handle, local_handle_size);
MCA_PML_OB1_RNDV_HDR_NTOH((h).hdr_rndv); \ }
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
#define MCA_PML_OB1_RGET_HDR_NTOH(h) \
do { \
MCA_PML_OB1_RNDV_HDR_NTOH((h).hdr_rndv); \
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
(h).hdr_src_ptr = ntoh64((h).hdr_src_ptr); \
} while (0) } while (0)
#define MCA_PML_OB1_RGET_HDR_HTON(h) \ #define MCA_PML_OB1_RGET_HDR_HTON(h) \
do { \ do { \
MCA_PML_OB1_RNDV_HDR_HTON((h).hdr_rndv); \ MCA_PML_OB1_RNDV_HDR_HTON((h).hdr_rndv); \
MCA_PML_OB1_RGET_HDR_FILL(h); \ (h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \ (h).hdr_src_ptr = hton64((h).hdr_src_ptr); \
} while (0) } while (0)
/** /**
@ -205,19 +225,23 @@ struct mca_pml_ob1_frag_hdr_t {
}; };
typedef struct mca_pml_ob1_frag_hdr_t mca_pml_ob1_frag_hdr_t; typedef struct mca_pml_ob1_frag_hdr_t mca_pml_ob1_frag_hdr_t;
static inline void mca_pml_ob1_frag_hdr_prepare (mca_pml_ob1_frag_hdr_t *hdr, uint8_t hdr_flags,
uint64_t hdr_frag_offset, void *hdr_src_req,
uint64_t hdr_dst_req)
{
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_FRAG, hdr_flags);
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_OB1_FRAG_HDR_FILL(h) \ hdr->hdr_padding[0] = 0;
do { \ hdr->hdr_padding[1] = 0;
(h).hdr_padding[0] = 0; \ hdr->hdr_padding[2] = 0;
(h).hdr_padding[1] = 0; \ hdr->hdr_padding[3] = 0;
(h).hdr_padding[2] = 0; \ hdr->hdr_padding[4] = 0;
(h).hdr_padding[3] = 0; \ hdr->hdr_padding[5] = 0;
(h).hdr_padding[4] = 0; \ #endif
(h).hdr_padding[5] = 0; \ hdr->hdr_frag_offset = hdr_frag_offset;
} while(0) hdr->hdr_src_req.pval = hdr_src_req;
#else hdr->hdr_dst_req.lval = hdr_dst_req;
#define MCA_PML_OB1_FRAG_HDR_FILL(h) }
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
#define MCA_PML_OB1_FRAG_HDR_NTOH(h) \ #define MCA_PML_OB1_FRAG_HDR_NTOH(h) \
do { \ do { \
@ -228,7 +252,6 @@ do { \
#define MCA_PML_OB1_FRAG_HDR_HTON(h) \ #define MCA_PML_OB1_FRAG_HDR_HTON(h) \
do { \ do { \
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \ MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
MCA_PML_OB1_FRAG_HDR_FILL(h); \
(h).hdr_frag_offset = hton64((h).hdr_frag_offset); \ (h).hdr_frag_offset = hton64((h).hdr_frag_offset); \
} while (0) } while (0)
@ -244,38 +267,45 @@ struct mca_pml_ob1_ack_hdr_t {
opal_ptr_t hdr_src_req; /**< source request */ opal_ptr_t hdr_src_req; /**< source request */
opal_ptr_t hdr_dst_req; /**< matched receive request */ opal_ptr_t hdr_dst_req; /**< matched receive request */
uint64_t hdr_send_offset; /**< starting point of copy in/out */ uint64_t hdr_send_offset; /**< starting point of copy in/out */
uint64_t hdr_send_size; /**< number of bytes requested (0: all remaining) */
}; };
typedef struct mca_pml_ob1_ack_hdr_t mca_pml_ob1_ack_hdr_t; typedef struct mca_pml_ob1_ack_hdr_t mca_pml_ob1_ack_hdr_t;
static inline void mca_pml_ob1_ack_hdr_prepare (mca_pml_ob1_ack_hdr_t *hdr, uint8_t hdr_flags,
uint64_t hdr_src_req, void *hdr_dst_req,
uint64_t hdr_send_offset, uint64_t hdr_send_size)
{
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_ACK, hdr_flags);
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_OB1_ACK_HDR_FILL(h) \ hdr->hdr_padding[0] = 0;
do { \ hdr->hdr_padding[1] = 0;
(h).hdr_padding[0] = 0; \ hdr->hdr_padding[2] = 0;
(h).hdr_padding[1] = 0; \ hdr->hdr_padding[3] = 0;
(h).hdr_padding[2] = 0; \ hdr->hdr_padding[4] = 0;
(h).hdr_padding[3] = 0; \ hdr->hdr_padding[5] = 0;
(h).hdr_padding[4] = 0; \ #endif
(h).hdr_padding[5] = 0; \ hdr->hdr_src_req.lval = hdr_src_req;
} while (0) hdr->hdr_dst_req.pval = hdr_dst_req;
#else hdr->hdr_send_offset = hdr_send_offset;
#define MCA_PML_OB1_ACK_HDR_FILL(h) hdr->hdr_send_size = hdr_send_size;
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */ }
/* Note that the request headers are not put in NBO because the /* Note that the request headers are not put in NBO because the
src_req is already in receiver's byte order and the dst_req is not src_req is already in receiver's byte order and the dst_req is not
used by the receiver for anything other than backpointers in return used by the receiver for anything other than backpointers in return
headers */ headers */
#define MCA_PML_OB1_ACK_HDR_NTOH(h) \ #define MCA_PML_OB1_ACK_HDR_NTOH(h) \
do { \ do { \
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \ MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
(h).hdr_send_offset = ntoh64((h).hdr_send_offset); \ (h).hdr_send_offset = ntoh64((h).hdr_send_offset); \
(h).hdr_send_size = ntoh64((h).hdr_send_size); \
} while (0) } while (0)
#define MCA_PML_OB1_ACK_HDR_HTON(h) \ #define MCA_PML_OB1_ACK_HDR_HTON(h) \
do { \ do { \
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \ MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
MCA_PML_OB1_ACK_HDR_FILL(h); \
(h).hdr_send_offset = hton64((h).hdr_send_offset); \ (h).hdr_send_offset = hton64((h).hdr_send_offset); \
(h).hdr_send_size = hton64((h).hdr_send_size); \
} while (0) } while (0)
/** /**
@ -287,38 +317,55 @@ struct mca_pml_ob1_rdma_hdr_t {
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
uint8_t hdr_padding[2]; /** two to pad out the hdr to a 4 byte alignment. hdr_req will then be 8 byte aligned after 4 for hdr_seg_cnt */ uint8_t hdr_padding[2]; /** two to pad out the hdr to a 4 byte alignment. hdr_req will then be 8 byte aligned after 4 for hdr_seg_cnt */
#endif #endif
uint32_t hdr_seg_cnt; /**< number of segments for rdma */ /* TODO: add real support for multiple destination segments */
opal_ptr_t hdr_req; /**< destination request */ opal_ptr_t hdr_req; /**< destination request */
opal_ptr_t hdr_des; /**< source descriptor */ opal_ptr_t hdr_frag; /**< receiver fragment */
opal_ptr_t hdr_recv_req; /**< receive request (NTH: needed for put fallback on send) */ opal_ptr_t hdr_recv_req; /**< receive request (NTH: needed for put fallback on send) */
uint64_t hdr_rdma_offset; /**< current offset into user buffer */ uint64_t hdr_rdma_offset; /**< current offset into user buffer */
mca_btl_base_segment_t hdr_segs[1]; /**< list of segments for rdma */ uint64_t hdr_dst_ptr; /**< destination address */
uint64_t hdr_dst_size; /**< destination size */
/* registration data follows */
}; };
typedef struct mca_pml_ob1_rdma_hdr_t mca_pml_ob1_rdma_hdr_t; typedef struct mca_pml_ob1_rdma_hdr_t mca_pml_ob1_rdma_hdr_t;
static inline void mca_pml_ob1_rdma_hdr_prepare (mca_pml_ob1_rdma_hdr_t *hdr, uint8_t hdr_flags,
uint64_t hdr_req, void *hdr_frag, void *hdr_recv_req,
uint64_t hdr_rdma_offset, void *hdr_dst_ptr,
uint64_t hdr_dst_size, void *local_handle,
size_t local_handle_size)
{
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_PUT, hdr_flags);
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_OB1_RDMA_HDR_FILL(h) \ hdr->hdr_padding[0] = 0;
do { \ hdr->hdr_padding[1] = 0;
(h).hdr_padding[0] = 0; \ #endif
(h).hdr_padding[1] = 0; \ hdr->hdr_req.lval = hdr_req;
} while(0) hdr->hdr_frag.pval = hdr_frag;
#else hdr->hdr_recv_req.pval = hdr_recv_req;
#define MCA_PML_OB1_RDMA_HDR_FILL(h) hdr->hdr_rdma_offset = hdr_rdma_offset;
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */ hdr->hdr_dst_ptr = (uint64_t)(intptr_t) hdr_dst_ptr;
hdr->hdr_dst_size = hdr_dst_size;
#define MCA_PML_OB1_RDMA_HDR_NTOH(h) \ /* copy segments */
do { \ memcpy (hdr + 1, local_handle, local_handle_size);
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \ }
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
#define MCA_PML_OB1_RDMA_HDR_NTOH(h) \
do { \
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
(h).hdr_rdma_offset = ntoh64((h).hdr_rdma_offset); \ (h).hdr_rdma_offset = ntoh64((h).hdr_rdma_offset); \
(h).hdr_dst_ptr = ntoh64((h).hdr_dst_ptr); \
(h).hdr_dst_size = ntoh64((h).hdr_dst_size); \
} while (0) } while (0)
#define MCA_PML_OB1_RDMA_HDR_HTON(h) \ #define MCA_PML_OB1_RDMA_HDR_HTON(h) \
do { \ do { \
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \ MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
MCA_PML_OB1_RDMA_HDR_FILL(h); \ (h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
(h).hdr_rdma_offset = hton64((h).hdr_rdma_offset); \ (h).hdr_rdma_offset = hton64((h).hdr_rdma_offset); \
(h).hdr_dst_ptr = hton64((h).hdr_dst_ptr); \
(h).hdr_dst_size = hton64((h).hdr_dst_size); \
} while (0) } while (0)
/** /**
@ -330,31 +377,34 @@ struct mca_pml_ob1_fin_hdr_t {
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
uint8_t hdr_padding[2]; uint8_t hdr_padding[2];
#endif #endif
uint32_t hdr_fail; /**< RDMA operation failed */ int64_t hdr_size; /**< number of bytes completed (positive), error code (negative) */
opal_ptr_t hdr_des; /**< completed descriptor */ opal_ptr_t hdr_frag; /**< completed RDMA fragment */
}; };
typedef struct mca_pml_ob1_fin_hdr_t mca_pml_ob1_fin_hdr_t; typedef struct mca_pml_ob1_fin_hdr_t mca_pml_ob1_fin_hdr_t;
static inline void mca_pml_ob1_fin_hdr_prepare (mca_pml_ob1_fin_hdr_t *hdr, uint8_t hdr_flags,
uint64_t hdr_frag, int64_t hdr_size)
{
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_FIN, hdr_flags);
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_OB1_FIN_HDR_FILL(h) \ hdr->hdr_padding[0] = 0;
do { \ hdr->hdr_padding[1] = 0;
(h).hdr_padding[0] = 0; \ #endif
(h).hdr_padding[1] = 0; \ hdr->hdr_frag.lval = hdr_frag;
} while (0) hdr->hdr_size = hdr_size;
#else }
#define MCA_PML_OB1_FIN_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
#define MCA_PML_OB1_FIN_HDR_NTOH(h) \ #define MCA_PML_OB1_FIN_HDR_NTOH(h) \
do { \ do { \
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \ MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
(h).hdr_size = ntoh64((h).hdr_size); \
} while (0) } while (0)
#define MCA_PML_OB1_FIN_HDR_HTON(h) \ #define MCA_PML_OB1_FIN_HDR_HTON(h) \
do { \ do { \
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \ MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
MCA_PML_OB1_FIN_HDR_FILL(h); \ (h).hdr_size = hton64((h).hdr_size); \
} while (0) } while (0)
/** /**
* Union of defined hdr types. * Union of defined hdr types.

Просмотреть файл

@ -66,7 +66,6 @@ static inline int mca_pml_ob1_send_inline (void *buf, size_t count,
ompi_proc_t *dst_proc, mca_bml_base_endpoint_t* endpoint, ompi_proc_t *dst_proc, mca_bml_base_endpoint_t* endpoint,
ompi_communicator_t * comm) ompi_communicator_t * comm)
{ {
mca_btl_base_descriptor_t *des = NULL;
mca_pml_ob1_match_hdr_t match; mca_pml_ob1_match_hdr_t match;
mca_bml_base_btl_t *bml_btl; mca_bml_base_btl_t *bml_btl;
OPAL_PTRDIFF_TYPE lb, extent; OPAL_PTRDIFF_TYPE lb, extent;
@ -94,28 +93,21 @@ static inline int mca_pml_ob1_send_inline (void *buf, size_t count,
opal_convertor_get_packed_size (&convertor, &size); opal_convertor_get_packed_size (&convertor, &size);
} }
match.hdr_common.hdr_flags = 0; mca_pml_ob1_match_hdr_prepare (&match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH; comm->c_contextid, comm->c_my_rank,
match.hdr_ctx = comm->c_contextid; tag, seqn);
match.hdr_src = comm->c_my_rank;
match.hdr_tag = tag;
match.hdr_seq = seqn;
ob1_hdr_hton(&match, MCA_PML_OB1_HDR_TYPE_MATCH, dst_proc); ob1_hdr_hton(&match, MCA_PML_OB1_HDR_TYPE_MATCH, dst_proc);
/* try to send immediately */ /* try to send immediately */
rc = mca_bml_base_sendi (bml_btl, &convertor, &match, OMPI_PML_OB1_MATCH_HDR_LEN, rc = mca_bml_base_sendi (bml_btl, &convertor, &match, OMPI_PML_OB1_MATCH_HDR_LEN,
size, MCA_BTL_NO_ORDER, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, size, MCA_BTL_NO_ORDER, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
MCA_PML_OB1_HDR_TYPE_MATCH, &des); MCA_PML_OB1_HDR_TYPE_MATCH, NULL);
if (count > 0) { if (count > 0) {
opal_convertor_cleanup (&convertor); opal_convertor_cleanup (&convertor);
} }
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
if (des) {
mca_bml_base_free (bml_btl, des);
}
return rc; return rc;
} }
@ -220,7 +212,7 @@ int mca_pml_ob1_send(void *buf,
OBJ_CONSTRUCT(sendreq, mca_pml_ob1_send_request_t); OBJ_CONSTRUCT(sendreq, mca_pml_ob1_send_request_t);
sendreq->req_send.req_base.req_proc = dst_proc; sendreq->req_send.req_base.req_proc = dst_proc;
sendreq->src_des = NULL; sendreq->rdma_frag = NULL;
MCA_PML_OB1_SEND_REQUEST_INIT(sendreq, MCA_PML_OB1_SEND_REQUEST_INIT(sendreq,
buf, buf,

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
@ -9,6 +10,8 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -27,11 +30,6 @@
#include "pml_ob1.h" #include "pml_ob1.h"
#include "pml_ob1_rdma.h" #include "pml_ob1_rdma.h"
/* Use this registration if no registration needed for a BTL instead of NULL.
* This will help other code to distinguish case when memory is not registered
* from case when registration is not needed */
static mca_mpool_base_registration_t pml_ob1_dummy_reg;
/* /*
* Check to see if memory is registered or can be registered. Build a * Check to see if memory is registered or can be registered. Build a
* set of registrations on the request. * set of registrations on the request.
@ -45,7 +43,7 @@ size_t mca_pml_ob1_rdma_btls(
{ {
int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma); int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
double weight_total = 0; double weight_total = 0;
int num_btls_used = 0, n; int num_btls_used = 0;
/* shortcut when there are no rdma capable btls */ /* shortcut when there are no rdma capable btls */
if(num_btls == 0) { if(num_btls == 0) {
@ -53,29 +51,25 @@ size_t mca_pml_ob1_rdma_btls(
} }
/* check to see if memory is registered */ /* check to see if memory is registered */
for(n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request; for (int n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request; n++) {
n++) {
mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_t* bml_btl =
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma, mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma,
(bml_endpoint->btl_rdma_index + n) % num_btls); (bml_endpoint->btl_rdma_index + n) % num_btls);
mca_mpool_base_registration_t* reg = &pml_ob1_dummy_reg; mca_btl_base_registration_handle_t *reg_handle = NULL;
mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool; mca_btl_base_module_t *btl = bml_btl->btl;
if( NULL != btl_mpool ) { if (btl->btl_register_mem) {
if(!mca_pml_ob1.leave_pinned) { /* try to register the memory with the btl */
/* look through existing registrations */ reg_handle = btl->btl_register_mem (btl, bml_btl->btl_endpoint, base,
btl_mpool->mpool_find(btl_mpool, base, size, &reg); size, MCA_BTL_REG_FLAG_REMOTE_READ);
} else { if (NULL == reg_handle) {
/* register the memory */ /* btl requires registration but the registration failed */
btl_mpool->mpool_register(btl_mpool, base, size, 0, &reg);
}
if(NULL == reg)
continue; continue;
} }
} /* else no registration is needed */
rdma_btls[num_btls_used].bml_btl = bml_btl; rdma_btls[num_btls_used].bml_btl = bml_btl;
rdma_btls[num_btls_used].btl_reg = reg; rdma_btls[num_btls_used].btl_reg = reg_handle;
weight_total += bml_btl->btl_weight; weight_total += bml_btl->btl_weight;
num_btls_used++; num_btls_used++;
} }
@ -83,7 +77,7 @@ size_t mca_pml_ob1_rdma_btls(
/* if we don't use leave_pinned and all BTLs that already have this memory /* if we don't use leave_pinned and all BTLs that already have this memory
* registered amount to less then half of available bandwidth - fall back to * registered amount to less then half of available bandwidth - fall back to
* pipeline protocol */ * pipeline protocol */
if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5)) if (0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5))
return 0; return 0;
mca_pml_ob1_calc_weighted_length(rdma_btls, num_btls_used, size, mca_pml_ob1_calc_weighted_length(rdma_btls, num_btls_used, size,
@ -103,10 +97,6 @@ size_t mca_pml_ob1_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint,
for(i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; i++) { for(i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; i++) {
rdma_btls[i].bml_btl = rdma_btls[i].bml_btl =
mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma); mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
if(NULL != rdma_btls[i].bml_btl->btl->btl_mpool)
rdma_btls[i].btl_reg = NULL;
else
rdma_btls[i].btl_reg = &pml_ob1_dummy_reg;
weight_total += rdma_btls[i].bml_btl->btl_weight; weight_total += rdma_btls[i].bml_btl->btl_weight;
} }

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
@ -9,6 +10,8 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -21,9 +24,13 @@
#include "pml_ob1.h" #include "pml_ob1.h"
#include "pml_ob1_rdmafrag.h" #include "pml_ob1_rdmafrag.h"
static void mca_pml_ob1_rdma_frag_constructor (mca_pml_ob1_rdma_frag_t *frag)
{
frag->local_handle = NULL;
}
OBJ_CLASS_INSTANCE( OBJ_CLASS_INSTANCE(
mca_pml_ob1_rdma_frag_t, mca_pml_ob1_rdma_frag_t,
ompi_free_list_item_t, ompi_free_list_item_t,
NULL, mca_pml_ob1_rdma_frag_constructor,
NULL); NULL);

Просмотреть файл

@ -10,6 +10,8 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -32,38 +34,52 @@ typedef enum {
MCA_PML_OB1_RDMA_GET MCA_PML_OB1_RDMA_GET
} mca_pml_ob1_rdma_state_t; } mca_pml_ob1_rdma_state_t;
struct mca_pml_ob1_rdma_frag_t;
typedef void (*mca_pml_ob1_rdma_frag_callback_t)(struct mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_length);
/**
* Used to keep track of local and remote RDMA operations.
*/
struct mca_pml_ob1_rdma_frag_t { struct mca_pml_ob1_rdma_frag_t {
ompi_free_list_item_t super; ompi_free_list_item_t super;
mca_bml_base_btl_t* rdma_bml; mca_bml_base_btl_t *rdma_bml;
mca_pml_ob1_hdr_t rdma_hdr; mca_pml_ob1_hdr_t rdma_hdr;
mca_pml_ob1_rdma_state_t rdma_state; mca_pml_ob1_rdma_state_t rdma_state;
size_t rdma_length; size_t rdma_length;
uint8_t rdma_segs[MCA_BTL_SEG_MAX_SIZE * MCA_BTL_DES_MAX_SEGMENTS];
void *rdma_req; void *rdma_req;
struct mca_bml_base_endpoint_t* rdma_ep;
opal_convertor_t convertor;
mca_mpool_base_registration_t* reg;
uint32_t retries; uint32_t retries;
mca_pml_ob1_rdma_frag_callback_t cbfunc;
uint64_t rdma_offset;
void *local_address;
mca_btl_base_registration_handle_t *local_handle;
uint64_t remote_address;
uint8_t remote_handle[MCA_BTL_REG_HANDLE_MAX_SIZE];
}; };
typedef struct mca_pml_ob1_rdma_frag_t mca_pml_ob1_rdma_frag_t; typedef struct mca_pml_ob1_rdma_frag_t mca_pml_ob1_rdma_frag_t;
OBJ_CLASS_DECLARATION(mca_pml_ob1_rdma_frag_t); OBJ_CLASS_DECLARATION(mca_pml_ob1_rdma_frag_t);
#define MCA_PML_OB1_RDMA_FRAG_ALLOC(frag) \ #define MCA_PML_OB1_RDMA_FRAG_ALLOC(frag) \
do { \ do { \
ompi_free_list_item_t* item; \ ompi_free_list_item_t* item; \
OMPI_FREE_LIST_WAIT_MT(&mca_pml_ob1.rdma_frags, item); \ OMPI_FREE_LIST_WAIT_MT(&mca_pml_ob1.rdma_frags, item); \
frag = (mca_pml_ob1_rdma_frag_t*)item; \ frag = (mca_pml_ob1_rdma_frag_t*)item; \
} while(0)
#define MCA_PML_OB1_RDMA_FRAG_RETURN(frag) \
do { \
/* return fragment */ \
OMPI_FREE_LIST_RETURN_MT(&mca_pml_ob1.rdma_frags, \
(ompi_free_list_item_t*)frag); \
} while(0) } while(0)
#define MCA_PML_OB1_RDMA_FRAG_RETURN(frag) \
do { \
/* return fragment */ \
if (frag->local_handle) { \
mca_bml_base_deregister_mem (frag->rdma_bml, frag->local_handle); \
frag->local_handle = NULL; \
} \
OMPI_FREE_LIST_RETURN_MT(&mca_pml_ob1.rdma_frags, \
(ompi_free_list_item_t*)frag); \
} while (0)
END_C_DECLS END_C_DECLS

Просмотреть файл

@ -108,13 +108,13 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des, mca_btl_base_descriptor_t* des,
void* cbdata ) void* cbdata )
{ {
mca_btl_base_segment_t* segments = des->des_local; mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_ob1_match_hdr_t* hdr = (mca_pml_ob1_match_hdr_t*)segments->seg_addr.pval; mca_pml_ob1_match_hdr_t* hdr = (mca_pml_ob1_match_hdr_t*)segments->seg_addr.pval;
ompi_communicator_t *comm_ptr; ompi_communicator_t *comm_ptr;
mca_pml_ob1_recv_request_t *match = NULL; mca_pml_ob1_recv_request_t *match = NULL;
mca_pml_ob1_comm_t *comm; mca_pml_ob1_comm_t *comm;
mca_pml_ob1_comm_proc_t *proc; mca_pml_ob1_comm_proc_t *proc;
size_t num_segments = des->des_local_count; size_t num_segments = des->des_segment_count;
size_t bytes_received = 0; size_t bytes_received = 0;
assert(num_segments <= MCA_BTL_DES_MAX_SEGMENTS); assert(num_segments <= MCA_BTL_DES_MAX_SEGMENTS);
@ -256,7 +256,7 @@ void mca_pml_ob1_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des, mca_btl_base_descriptor_t* des,
void* cbdata ) void* cbdata )
{ {
mca_btl_base_segment_t* segments = des->des_local; mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval; mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) { if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
@ -264,7 +264,7 @@ void mca_pml_ob1_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
} }
ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_RNDV); ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_RNDV);
mca_pml_ob1_recv_frag_match(btl, &hdr->hdr_match, segments, mca_pml_ob1_recv_frag_match(btl, &hdr->hdr_match, segments,
des->des_local_count, MCA_PML_OB1_HDR_TYPE_RNDV); des->des_segment_count, MCA_PML_OB1_HDR_TYPE_RNDV);
return; return;
} }
@ -273,7 +273,7 @@ void mca_pml_ob1_recv_frag_callback_rget(mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des, mca_btl_base_descriptor_t* des,
void* cbdata ) void* cbdata )
{ {
mca_btl_base_segment_t* segments = des->des_local; mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval; mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) { if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
@ -281,7 +281,7 @@ void mca_pml_ob1_recv_frag_callback_rget(mca_btl_base_module_t* btl,
} }
ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_RGET); ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_RGET);
mca_pml_ob1_recv_frag_match(btl, &hdr->hdr_match, segments, mca_pml_ob1_recv_frag_match(btl, &hdr->hdr_match, segments,
des->des_local_count, MCA_PML_OB1_HDR_TYPE_RGET); des->des_segment_count, MCA_PML_OB1_HDR_TYPE_RGET);
return; return;
} }
@ -292,9 +292,10 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des, mca_btl_base_descriptor_t* des,
void* cbdata ) void* cbdata )
{ {
mca_btl_base_segment_t* segments = des->des_local; mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval; mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
mca_pml_ob1_send_request_t* sendreq; mca_pml_ob1_send_request_t* sendreq;
size_t size;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) { if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
return; return;
@ -307,19 +308,25 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
/* if the request should be delivered entirely by copy in/out /* if the request should be delivered entirely by copy in/out
* then throttle sends */ * then throttle sends */
if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA) { if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA) {
if (NULL != sendreq->src_des) { if (NULL != sendreq->rdma_frag) {
/* release registered memory */ if (NULL != sendreq->rdma_frag->local_handle) {
mca_bml_base_free (sendreq->req_rdma[0].bml_btl, sendreq->src_des); mca_bml_base_deregister_mem (sendreq->req_rdma[0].bml_btl, sendreq->rdma_frag->local_handle);
sendreq->src_des = NULL; sendreq->rdma_frag->local_handle = NULL;
}
MCA_PML_OB1_RDMA_FRAG_RETURN(sendreq->rdma_frag);
sendreq->rdma_frag = NULL;
} }
sendreq->req_throttle_sends = true; sendreq->req_throttle_sends = true;
} }
mca_pml_ob1_send_request_copy_in_out(sendreq, if (hdr->hdr_ack.hdr_send_size) {
hdr->hdr_ack.hdr_send_offset, size = hdr->hdr_ack.hdr_send_size;
sendreq->req_send.req_bytes_packed - } else {
hdr->hdr_ack.hdr_send_offset); size = sendreq->req_send.req_bytes_packed - hdr->hdr_ack.hdr_send_offset;
}
mca_pml_ob1_send_request_copy_in_out(sendreq, hdr->hdr_ack.hdr_send_offset, size);
if (sendreq->req_state != 0) { if (sendreq->req_state != 0) {
/* Typical receipt of an ACK message causes req_state to be /* Typical receipt of an ACK message causes req_state to be
@ -355,13 +362,14 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag, mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des, mca_btl_base_descriptor_t* des,
void* cbdata ) { void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local; mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval; mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
mca_pml_ob1_recv_request_t* recvreq; mca_pml_ob1_recv_request_t* recvreq;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) { if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
return; return;
} }
ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FRAG); ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FRAG);
recvreq = (mca_pml_ob1_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval; recvreq = (mca_pml_ob1_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */ #if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
@ -372,7 +380,7 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
assert(btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV); assert(btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV);
/* This will trigger the opal_convertor_pack to start asynchronous copy. */ /* This will trigger the opal_convertor_pack to start asynchronous copy. */
mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,des->des_local_count,des); mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,des->des_segment_count,des);
/* Let BTL know that it CANNOT free the frag */ /* Let BTL know that it CANNOT free the frag */
des->des_flags |= MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC; des->des_flags |= MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC;
@ -380,7 +388,8 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
return; return;
} }
#endif /* OPAL_CUDA_SUPPORT */ #endif /* OPAL_CUDA_SUPPORT */
mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_local_count);
mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_segment_count);
return; return;
} }
@ -390,7 +399,7 @@ void mca_pml_ob1_recv_frag_callback_put(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag, mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des, mca_btl_base_descriptor_t* des,
void* cbdata ) { void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local; mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval; mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
mca_pml_ob1_send_request_t* sendreq; mca_pml_ob1_send_request_t* sendreq;
@ -410,20 +419,17 @@ void mca_pml_ob1_recv_frag_callback_fin(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag, mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des, mca_btl_base_descriptor_t* des,
void* cbdata ) { void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local; mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval; mca_pml_ob1_fin_hdr_t* hdr = (mca_pml_ob1_fin_hdr_t *) segments->seg_addr.pval;
mca_btl_base_descriptor_t* rdma; mca_pml_ob1_rdma_frag_t *frag;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) { if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_fin_hdr_t)) ) {
return; return;
} }
ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FIN); ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FIN);
rdma = (mca_btl_base_descriptor_t*)hdr->hdr_fin.hdr_des.pval; frag = (mca_pml_ob1_rdma_frag_t *) hdr->hdr_frag.pval;
rdma->des_cbfunc(btl, NULL, rdma, frag->cbfunc (frag, hdr->hdr_size);
hdr->hdr_fin.hdr_fail ? OMPI_ERROR : OMPI_SUCCESS);
return;
} }
@ -699,7 +705,7 @@ out_of_order_match:
OPAL_THREAD_UNLOCK(&comm->matching_lock); OPAL_THREAD_UNLOCK(&comm->matching_lock);
if(OPAL_LIKELY(match)) { if(OPAL_LIKELY(match)) {
switch(type) { switch(type) {
case MCA_PML_OB1_HDR_TYPE_MATCH: case MCA_PML_OB1_HDR_TYPE_MATCH:
mca_pml_ob1_recv_request_progress_match(match, btl, segments, num_segments); mca_pml_ob1_recv_request_progress_match(match, btl, segments, num_segments);
break; break;

Просмотреть файл

@ -13,7 +13,7 @@
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved. * Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved. * Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2012 FUJITSU LIMITED. All rights reserved. * Copyright (c) 2012 FUJITSU LIMITED. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science * Copyright (c) 2014 Research Organization for Information Science
@ -150,12 +150,17 @@ static void mca_pml_ob1_recv_request_construct(mca_pml_ob1_recv_request_t* reque
request->req_recv.req_base.req_ompi.req_free = mca_pml_ob1_recv_request_free; request->req_recv.req_base.req_ompi.req_free = mca_pml_ob1_recv_request_free;
request->req_recv.req_base.req_ompi.req_cancel = mca_pml_ob1_recv_request_cancel; request->req_recv.req_base.req_ompi.req_cancel = mca_pml_ob1_recv_request_cancel;
request->req_rdma_cnt = 0; request->req_rdma_cnt = 0;
request->local_handle = NULL;
OBJ_CONSTRUCT(&request->lock, opal_mutex_t); OBJ_CONSTRUCT(&request->lock, opal_mutex_t);
} }
static void mca_pml_ob1_recv_request_destruct(mca_pml_ob1_recv_request_t* request) static void mca_pml_ob1_recv_request_destruct(mca_pml_ob1_recv_request_t* request)
{ {
OBJ_DESTRUCT(&request->lock); OBJ_DESTRUCT(&request->lock);
if (OPAL_UNLIKELY(request->local_handle)) {
mca_bml_base_deregister_mem (request->rdma_bml, request->local_handle);
request->local_handle = NULL;
}
} }
OBJ_CLASS_INSTANCE( OBJ_CLASS_INSTANCE(
@ -183,31 +188,27 @@ static void mca_pml_ob1_recv_ctl_completion( mca_btl_base_module_t* btl,
* Put operation has completed remotely - update request status * Put operation has completed remotely - update request status
*/ */
static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl, static void mca_pml_ob1_put_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_size)
struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_descriptor_t* des,
int status )
{ {
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)des->des_cbdata; mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
size_t bytes_received = 0;
if( OPAL_LIKELY(status == OMPI_SUCCESS) ) {
bytes_received = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
(void *) des->des_local,
des->des_local_count, 0);
}
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,-1); OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,-1);
mca_bml_base_free(bml_btl, des); MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
/* check completion status */ if (OPAL_LIKELY(0 < rdma_size)) {
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received); assert ((uint64_t) rdma_size == frag->rdma_length);
if(recv_request_pml_complete_check(recvreq) == false &&
/* check completion status */
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, (size_t) rdma_size);
if (recv_request_pml_complete_check(recvreq) == false &&
recvreq->req_rdma_offset < recvreq->req_send_offset) { recvreq->req_rdma_offset < recvreq->req_send_offset) {
/* schedule additional rdma operations */ /* schedule additional rdma operations */
mca_pml_ob1_recv_request_schedule(recvreq, bml_btl); mca_pml_ob1_recv_request_schedule(recvreq, bml_btl);
}
} }
MCA_PML_OB1_PROGRESS_PENDING(bml_btl); MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
} }
@ -218,7 +219,7 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
int mca_pml_ob1_recv_request_ack_send_btl( int mca_pml_ob1_recv_request_ack_send_btl(
ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl, ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset, uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset,
bool nordma) uint64_t size, bool nordma)
{ {
mca_btl_base_descriptor_t* des; mca_btl_base_descriptor_t* des;
mca_pml_ob1_ack_hdr_t* ack; mca_pml_ob1_ack_hdr_t* ack;
@ -233,12 +234,9 @@ int mca_pml_ob1_recv_request_ack_send_btl(
} }
/* fill out header */ /* fill out header */
ack = (mca_pml_ob1_ack_hdr_t*)des->des_local->seg_addr.pval; ack = (mca_pml_ob1_ack_hdr_t*)des->des_segments->seg_addr.pval;
ack->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_ACK; mca_pml_ob1_ack_hdr_prepare (ack, nordma ? MCA_PML_OB1_HDR_FLAGS_NORDMA : 0,
ack->hdr_common.hdr_flags = nordma ? MCA_PML_OB1_HDR_FLAGS_NORDMA : 0; hdr_src_req, hdr_dst_req, hdr_send_offset, size);
ack->hdr_src_req.lval = hdr_src_req;
ack->hdr_dst_req.pval = hdr_dst_req;
ack->hdr_send_offset = hdr_send_offset;
ob1_hdr_hton(ack, MCA_PML_OB1_HDR_TYPE_ACK, proc); ob1_hdr_hton(ack, MCA_PML_OB1_HDR_TYPE_ACK, proc);
@ -312,63 +310,99 @@ static int mca_pml_ob1_recv_request_ack(
if(recvreq->req_send_offset == hdr->hdr_msg_length) if(recvreq->req_send_offset == hdr->hdr_msg_length)
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
/* let know to shedule function there is no need to put ACK flag */ /* let know to shedule function there is no need to put ACK flag */
recvreq->req_ack_sent = true; recvreq->req_ack_sent = true;
return mca_pml_ob1_recv_request_ack_send(proc, hdr->hdr_src_req.lval, return mca_pml_ob1_recv_request_ack_send(proc, hdr->hdr_src_req.lval,
recvreq, recvreq->req_send_offset, recvreq, recvreq->req_send_offset, 0,
recvreq->req_send_offset == bytes_received); recvreq->req_send_offset == bytes_received);
} }
static int mca_pml_ob1_recv_request_put_frag (mca_pml_ob1_rdma_frag_t *frag);
static int mca_pml_ob1_recv_request_get_frag_failed (mca_pml_ob1_rdma_frag_t *frag, int rc)
{
mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
ompi_proc_t *proc = (ompi_proc_t *) recvreq->req_recv.req_base.req_proc;
if (OMPI_ERR_NOT_AVAILABLE == rc) {
/* get isn't supported for this transfer. tell peer to fallback on put */
rc = mca_pml_ob1_recv_request_put_frag (frag);
if (OMPI_ERR_OUT_OF_RESOURCE == rc) {
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
opal_list_append (&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
return OMPI_SUCCESS;
}
}
if (++frag->retries < mca_pml_ob1.rdma_retries_limit &&
OMPI_ERR_OUT_OF_RESOURCE == rc) {
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
return OMPI_SUCCESS;
}
/* tell peer to fall back on send for this region */
rc = mca_pml_ob1_recv_request_ack_send(proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval,
recvreq, frag->rdma_offset, frag->rdma_length, false);
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
return rc;
}
/** /**
* Return resources used by the RDMA * Return resources used by the RDMA
*/ */
static void mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl, static void mca_pml_ob1_rget_completion (mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_endpoint_t* ep, void *local_address, mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_descriptor_t* des, void *context, void *cbdata, int status)
int status )
{ {
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; mca_bml_base_btl_t *bml_btl = (mca_bml_base_btl_t *) context;
mca_pml_ob1_rdma_frag_t* frag = (mca_pml_ob1_rdma_frag_t*)des->des_cbdata; mca_pml_ob1_rdma_frag_t *frag = (mca_pml_ob1_rdma_frag_t *) cbdata;
mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)frag->rdma_req; mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
/* check completion status */ /* check completion status */
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
/* TSW - FIX */ status = mca_pml_ob1_recv_request_get_frag_failed (frag, status);
OMPI_ERROR_LOG(status); if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
ompi_rte_abort(-1, NULL); /* TSW - FIX */
OMPI_ERROR_LOG(status);
ompi_rte_abort(-1, NULL);
}
} else {
/* is receive request complete */
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length);
/* TODO: re-add order */
mca_pml_ob1_send_fin (recvreq->req_recv.req_base.req_proc,
bml_btl, frag->rdma_hdr.hdr_rget.hdr_frag,
frag->rdma_length, 0, 0);
recv_request_pml_complete_check(recvreq);
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
} }
/* is receive request complete */
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length);
if (recvreq->req_recv.req_bytes_packed <= recvreq->req_bytes_received) {
mca_pml_ob1_send_fin(recvreq->req_recv.req_base.req_proc,
bml_btl,
frag->rdma_hdr.hdr_rget.hdr_des,
des->order, 0);
}
recv_request_pml_complete_check(recvreq);
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
MCA_PML_OB1_PROGRESS_PENDING(bml_btl); MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
} }
static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag, static int mca_pml_ob1_recv_request_put_frag (mca_pml_ob1_rdma_frag_t *frag)
mca_btl_base_descriptor_t *dst) { {
mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req; mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
mca_bml_base_btl_t *bml_btl = frag->rdma_bml; mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
mca_btl_base_descriptor_t *ctl; mca_btl_base_descriptor_t *ctl;
mca_pml_ob1_rdma_hdr_t *hdr; mca_pml_ob1_rdma_hdr_t *hdr;
size_t seg_size; size_t reg_size;
int rc; int rc;
seg_size = bml_btl->btl->btl_seg_size * dst->des_local_count; reg_size = bml_btl->btl->btl_registration_handle_size;
/* prepare a descriptor for rdma control message */ /* prepare a descriptor for rdma control message */
mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof (mca_pml_ob1_rdma_hdr_t) + seg_size, mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof (mca_pml_ob1_rdma_hdr_t) + reg_size,
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
MCA_BTL_DES_SEND_ALWAYS_CALLBACK); MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
if (OPAL_UNLIKELY(NULL == ctl)) { if (OPAL_UNLIKELY(NULL == ctl)) {
@ -377,26 +411,19 @@ static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag,
ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion; ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion;
/* fill in rdma header */ /* fill in rdma header */
hdr = (mca_pml_ob1_rdma_hdr_t *) ctl->des_local->seg_addr.pval; hdr = (mca_pml_ob1_rdma_hdr_t *) ctl->des_segments->seg_addr.pval;
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT; mca_pml_ob1_rdma_hdr_prepare (hdr, (!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0,
hdr->hdr_common.hdr_flags = recvreq->remote_req_send.lval, frag, recvreq, frag->rdma_offset,
(!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0; frag->local_address, frag->rdma_length, frag->local_handle,
reg_size);
hdr->hdr_req = frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req; frag->cbfunc = mca_pml_ob1_put_completion;
hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
hdr->hdr_des.pval = dst;
hdr->hdr_recv_req.pval = recvreq;
hdr->hdr_seg_cnt = dst->des_local_count; recvreq->req_ack_sent = true;
/* copy segments */ PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
memcpy (hdr + 1, dst->des_local, seg_size); &(recvreq->req_recv.req_base), size,
PERUSE_RECV);
dst->des_cbfunc = mca_pml_ob1_put_completion;
dst->des_cbdata = recvreq;
if (!recvreq->req_ack_sent)
recvreq->req_ack_sent = true;
/* send rdma request to peer */ /* send rdma request to peer */
rc = mca_bml_base_send (bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT); rc = mca_bml_base_send (bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT);
@ -411,71 +438,38 @@ static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag,
/* /*
* *
*/ */
int mca_pml_ob1_recv_request_get_frag( mca_pml_ob1_rdma_frag_t* frag ) int mca_pml_ob1_recv_request_get_frag (mca_pml_ob1_rdma_frag_t *frag)
{ {
mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)frag->rdma_req; mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
mca_bml_base_btl_t* bml_btl = frag->rdma_bml; mca_btl_base_registration_handle_t *local_handle = NULL;
mca_btl_base_descriptor_t* descriptor; mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
size_t save_size = frag->rdma_length;
int rc; int rc;
/* prepare descriptor */ /* prepare descriptor */
mca_bml_base_prepare_dst( bml_btl, if (bml_btl->btl->btl_register_mem && !frag->local_handle && !recvreq->local_handle) {
NULL, mca_bml_base_register_mem (bml_btl, frag->local_address, frag->rdma_length, MCA_BTL_REG_FLAG_LOCAL_WRITE |
&recvreq->req_recv.req_base.req_convertor, MCA_BTL_REG_FLAG_REMOTE_WRITE, &frag->local_handle);
MCA_BTL_NO_ORDER, if (OPAL_UNLIKELY(NULL == frag->local_handle)) {
0, return mca_pml_ob1_recv_request_get_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE);
&frag->rdma_length,
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK |
MCA_BTL_DES_FLAGS_GET,
&descriptor );
if( OPAL_UNLIKELY(NULL == descriptor) ) {
if (frag->retries < mca_pml_ob1.rdma_retries_limit) {
frag->rdma_length = save_size;
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
return OMPI_ERR_OUT_OF_RESOURCE;
} else {
ompi_proc_t *proc = (ompi_proc_t *) recvreq->req_recv.req_base.req_proc;
/* tell peer to fall back on send */
recvreq->req_send_offset = 0;
rc = mca_pml_ob1_recv_request_ack_send(proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval,
recvreq, recvreq->req_send_offset, true);
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
return rc;
} }
} }
descriptor->des_remote = (mca_btl_base_segment_t *) frag->rdma_segs; if (frag->local_handle) {
descriptor->des_remote_count = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt; local_handle = frag->local_handle;
descriptor->des_cbfunc = mca_pml_ob1_rget_completion; } else if (recvreq->local_handle) {
descriptor->des_cbdata = frag; local_handle = recvreq->local_handle;
}
PERUSE_TRACE_COMM_OMPI_EVENT(PERUSE_COMM_REQ_XFER_CONTINUE, PERUSE_TRACE_COMM_OMPI_EVENT(PERUSE_COMM_REQ_XFER_CONTINUE,
&(recvreq->req_recv.req_base), &(((mca_pml_ob1_recv_request_t *) frag->rdma_req)->req_recv.req_base),
frag->rdma_length, PERUSE_RECV); frag->rdma_length, PERUSE_RECV);
/* queue up get request */ /* queue up get request */
rc = mca_bml_base_get(bml_btl,descriptor); rc = mca_bml_base_get (bml_btl, frag->local_address, frag->remote_address, local_handle,
(mca_btl_base_registration_handle_t *) frag->remote_handle, frag->rdma_length,
0, MCA_BTL_NO_ORDER, mca_pml_ob1_rget_completion, frag);
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
if (OPAL_UNLIKELY(OMPI_ERR_NOT_AVAILABLE == rc)) { return mca_pml_ob1_recv_request_get_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE);
/* get isn't supported for this transfer. tell peer to fallback on put */
rc = mca_pml_ob1_init_get_fallback (frag, descriptor);
}
if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
mca_bml_base_free(bml_btl, descriptor);
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
opal_list_append(&mca_pml_ob1.rdma_pending,
(opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
return OMPI_ERR_OUT_OF_RESOURCE;
} else if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
OMPI_ERROR_LOG(rc);
ompi_rte_abort(-1, NULL);
}
} }
return OMPI_SUCCESS; return OMPI_SUCCESS;
@ -501,6 +495,7 @@ void mca_pml_ob1_recv_request_progress_frag( mca_pml_ob1_recv_request_t* recvreq
bytes_received = mca_pml_ob1_compute_segment_length_base (segments, num_segments, bytes_received = mca_pml_ob1_compute_segment_length_base (segments, num_segments,
sizeof(mca_pml_ob1_frag_hdr_t)); sizeof(mca_pml_ob1_frag_hdr_t));
data_offset = hdr->hdr_frag.hdr_frag_offset; data_offset = hdr->hdr_frag.hdr_frag_offset;
/* /*
* Make user buffer accessible(defined) before unpacking. * Make user buffer accessible(defined) before unpacking.
*/ */
@ -628,7 +623,6 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
mca_pml_ob1_rget_hdr_t* hdr = (mca_pml_ob1_rget_hdr_t*)segments->seg_addr.pval; mca_pml_ob1_rget_hdr_t* hdr = (mca_pml_ob1_rget_hdr_t*)segments->seg_addr.pval;
mca_bml_base_endpoint_t* bml_endpoint = NULL; mca_bml_base_endpoint_t* bml_endpoint = NULL;
size_t bytes_remaining, prev_sent, offset; size_t bytes_remaining, prev_sent, offset;
mca_btl_base_segment_t *r_segments;
mca_pml_ob1_rdma_frag_t *frag; mca_pml_ob1_rdma_frag_t *frag;
mca_bml_base_btl_t *rdma_bml; mca_bml_base_btl_t *rdma_bml;
int rc; int rc;
@ -636,6 +630,7 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
prev_sent = offset = 0; prev_sent = offset = 0;
bytes_remaining = hdr->hdr_rndv.hdr_msg_length; bytes_remaining = hdr->hdr_rndv.hdr_msg_length;
recvreq->req_recv.req_bytes_packed = hdr->hdr_rndv.hdr_msg_length; recvreq->req_recv.req_bytes_packed = hdr->hdr_rndv.hdr_msg_length;
recvreq->req_send_offset = 0;
MCA_PML_OB1_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_rndv.hdr_match); MCA_PML_OB1_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_rndv.hdr_match);
@ -679,8 +674,28 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
ompi_rte_abort(-1, NULL); ompi_rte_abort(-1, NULL);
} }
bytes_remaining = mca_pml_ob1_compute_segment_length_remote (btl->btl_seg_size, (void *)(hdr + 1), bytes_remaining = hdr->hdr_rndv.hdr_msg_length;
hdr->hdr_seg_cnt, recvreq->req_recv.req_base.req_proc);
/* save the request for put fallback */
recvreq->remote_req_send = hdr->hdr_rndv.hdr_src_req;
recvreq->rdma_bml = rdma_bml;
/* try to register the entire buffer */
if (rdma_bml->btl->btl_register_mem) {
void *data_ptr;
offset = 0;
OPAL_THREAD_LOCK(&recvreq->lock);
opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, &offset);
opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &data_ptr);
OPAL_THREAD_UNLOCK(&recvreq->lock);
mca_bml_base_register_mem (rdma_bml, data_ptr, bytes_remaining, MCA_BTL_REG_FLAG_LOCAL_WRITE |
MCA_BTL_REG_FLAG_REMOTE_WRITE, &recvreq->local_handle);
/* It is not an error if the memory region can not be registered here. The registration will
* be attempted again for each get fragment. */
}
/* The while loop adds a fragmentation mechanism. The variable bytes_remaining holds the num /* The while loop adds a fragmentation mechanism. The variable bytes_remaining holds the num
* of bytes left to be send. In each iteration we send the max possible bytes supported * of bytes left to be send. In each iteration we send the max possible bytes supported
@ -689,7 +704,12 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
* the next iteration with the updated size. * the next iteration with the updated size.
* Also - In each iteration we update the location in the buffer to be used for writing * Also - In each iteration we update the location in the buffer to be used for writing
* the message ,and the location to read from. This is done using the offset variable that * the message ,and the location to read from. This is done using the offset variable that
* accumulates the number of bytes that were sent so far. */ * accumulates the number of bytes that were sent so far.
*
* NTH: This fragmentation may go away if we change the btls to require them to handle
* get fragmentation internally. This is a reasonable solution since some btls do not
* need any fragmentation (sm, vader, self, etc). Remove this loop if this ends up
* being the case. */
while (bytes_remaining > 0) { while (bytes_remaining > 0) {
/* allocate/initialize a fragment */ /* allocate/initialize a fragment */
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag); MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
@ -699,29 +719,31 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
ompi_rte_abort(-1, NULL); ompi_rte_abort(-1, NULL);
} }
assert (btl->btl_seg_size * hdr->hdr_seg_cnt <= sizeof (frag->rdma_segs)); memcpy (frag->remote_handle, hdr + 1, btl->btl_registration_handle_size);
memcpy (frag->rdma_segs, hdr + 1, btl->btl_seg_size * hdr->hdr_seg_cnt); /* update the read location */
frag->remote_address = hdr->hdr_src_ptr + offset;
/* update the read location -- NTH: note this will only work if there is exactly one
segment. TODO -- make this work with multiple segments */
r_segments = (mca_btl_base_segment_t *) frag->rdma_segs;
r_segments->seg_addr.lval += offset;
/* updating the write location */ /* updating the write location */
OPAL_THREAD_LOCK(&recvreq->lock); OPAL_THREAD_LOCK(&recvreq->lock);
opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, &offset); opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, &offset);
opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &frag->local_address);
OPAL_THREAD_UNLOCK(&recvreq->lock); OPAL_THREAD_UNLOCK(&recvreq->lock);
frag->rdma_bml = rdma_bml; frag->rdma_bml = rdma_bml;
frag->rdma_hdr.hdr_rget = *hdr; frag->rdma_hdr.hdr_rget = *hdr;
frag->retries = 0; frag->retries = 0;
frag->rdma_req = recvreq; frag->rdma_req = recvreq;
frag->rdma_ep = bml_endpoint; frag->rdma_state = MCA_PML_OB1_RDMA_GET;
frag->rdma_state = MCA_PML_OB1_RDMA_GET; frag->local_handle = NULL;
frag->reg = NULL; frag->rdma_offset = offset;
frag->rdma_length = bytes_remaining;
if (bytes_remaining > rdma_bml->btl->btl_get_limit) {
frag->rdma_length = rdma_bml->btl->btl_get_limit;
} else {
frag->rdma_length = bytes_remaining;
}
/* NTH: TODO -- handle error conditions gracefully */ /* NTH: TODO -- handle error conditions gracefully */
rc = mca_pml_ob1_recv_request_get_frag(frag); rc = mca_pml_ob1_recv_request_get_frag(frag);
@ -920,13 +942,11 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
while(bytes_remaining > 0 && while(bytes_remaining > 0 &&
recvreq->req_pipeline_depth < mca_pml_ob1.recv_pipeline_depth) { recvreq->req_pipeline_depth < mca_pml_ob1.recv_pipeline_depth) {
size_t size, seg_size; mca_pml_ob1_rdma_frag_t *frag = NULL;
mca_pml_ob1_rdma_hdr_t* hdr; mca_btl_base_module_t *btl;
mca_btl_base_descriptor_t* dst;
mca_btl_base_descriptor_t* ctl;
mca_mpool_base_registration_t * reg = NULL;
mca_btl_base_module_t* btl;
int rc, rdma_idx; int rc, rdma_idx;
void *data_ptr;
size_t size;
if(prev_bytes_remaining == bytes_remaining) { if(prev_bytes_remaining == bytes_remaining) {
if(++num_fail == num_tries) { if(++num_fail == num_tries) {
@ -947,85 +967,62 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
do { do {
rdma_idx = recvreq->req_rdma_idx; rdma_idx = recvreq->req_rdma_idx;
bml_btl = recvreq->req_rdma[rdma_idx].bml_btl; bml_btl = recvreq->req_rdma[rdma_idx].bml_btl;
reg = recvreq->req_rdma[rdma_idx].btl_reg;
size = recvreq->req_rdma[rdma_idx].length; size = recvreq->req_rdma[rdma_idx].length;
if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt) if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt)
recvreq->req_rdma_idx = 0; recvreq->req_rdma_idx = 0;
} while(!size); } while(!size);
btl = bml_btl->btl; btl = bml_btl->btl;
/* makes sure that we don't exceed BTL max rdma size /* NTH: This conditional used to check if there was a registration in
* if memory is not pinned already */ * recvreq->req_rdma[rdma_idx].btl_reg. If once existed it was due to
if( (NULL == reg) && (btl->btl_rdma_pipeline_frag_size != 0) && * the btl not needed registration (equivalent to btl->btl_register_mem
(size > btl->btl_rdma_pipeline_frag_size)) { * != NULL. This new check is equivalent. Note: I feel this protocol
* needs work to better improve resource usage when running with a
* leave pinned protocol. */
if (btl->btl_register_mem && (btl->btl_rdma_pipeline_frag_size != 0) &&
(size > btl->btl_rdma_pipeline_frag_size)) {
size = btl->btl_rdma_pipeline_frag_size; size = btl->btl_rdma_pipeline_frag_size;
} }
/* take lock to protect converter against concurrent access MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
if (OPAL_UNLIKELY(NULL == frag)) {
continue;
}
/* take lock to protect convertor against concurrent access
* from unpack */ * from unpack */
OPAL_THREAD_LOCK(&recvreq->lock); OPAL_THREAD_LOCK(&recvreq->lock);
opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, opal_convertor_set_position (&recvreq->req_recv.req_base.req_convertor,
&recvreq->req_rdma_offset ); &recvreq->req_rdma_offset);
opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &data_ptr);
/* prepare a descriptor for RDMA */
mca_bml_base_prepare_dst(bml_btl, reg,
&recvreq->req_recv.req_base.req_convertor,
MCA_BTL_NO_ORDER, 0, &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
MCA_BTL_DES_FLAGS_PUT, &dst);
OPAL_THREAD_UNLOCK(&recvreq->lock); OPAL_THREAD_UNLOCK(&recvreq->lock);
if(OPAL_UNLIKELY(dst == NULL)) { if (btl->btl_register_mem) {
continue; mca_bml_base_register_mem (bml_btl, data_ptr, size, MCA_BTL_REG_FLAG_REMOTE_WRITE,
&frag->local_handle);
if (OPAL_UNLIKELY(NULL == frag->local_handle)) {
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
continue;
}
} }
dst->des_cbfunc = mca_pml_ob1_put_completion; /* fill in the minimum information needed to handle the fin message */
dst->des_cbdata = recvreq; frag->cbfunc = mca_pml_ob1_put_completion;
frag->rdma_length = size;
frag->rdma_req = recvreq;
frag->rdma_bml = bml_btl;
frag->local_address = data_ptr;
frag->rdma_offset = recvreq->req_rdma_offset;
seg_size = btl->btl_seg_size * dst->des_local_count; rc = mca_pml_ob1_recv_request_put_frag (frag);
if (OPAL_LIKELY(OMPI_SUCCESS == rc)) {
/* prepare a descriptor for rdma control message */
mca_bml_base_alloc(bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_rdma_hdr_t) + seg_size,
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
if( OPAL_UNLIKELY(NULL == ctl) ) {
mca_bml_base_free(bml_btl,dst);
continue;
}
ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion;
/* fill in rdma header */
hdr = (mca_pml_ob1_rdma_hdr_t*)ctl->des_local->seg_addr.pval;
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT;
hdr->hdr_common.hdr_flags =
(!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0;
hdr->hdr_req = recvreq->remote_req_send;
hdr->hdr_des.pval = dst;
hdr->hdr_recv_req.pval = recvreq;
hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
hdr->hdr_seg_cnt = dst->des_local_count;
/* copy segments */
memmove (hdr + 1, dst->des_local, seg_size);
if(!recvreq->req_ack_sent)
recvreq->req_ack_sent = true;
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_PUT, recvreq->req_recv.req_base.req_proc);
PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
&(recvreq->req_recv.req_base), size,
PERUSE_RECV);
/* send rdma request to peer */
rc = mca_bml_base_send(bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT);
if( OPAL_LIKELY( rc >= 0 ) ) {
/* update request state */ /* update request state */
recvreq->req_rdma_offset += size; recvreq->req_rdma_offset += size;
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth, 1); OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth, 1);
recvreq->req_rdma[rdma_idx].length -= size; recvreq->req_rdma[rdma_idx].length -= size;
bytes_remaining -= size; bytes_remaining -= size;
} else { } else {
mca_bml_base_free(bml_btl,ctl); MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
mca_bml_base_free(bml_btl,dst);
} }
} }

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
@ -10,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2014 Research Organization for Information Science * Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
@ -52,6 +53,8 @@ struct mca_pml_ob1_recv_request_t {
bool req_ack_sent; /**< whether ack was sent to the sender */ bool req_ack_sent; /**< whether ack was sent to the sender */
bool req_match_received; /**< Prevent request to be completed prematurely */ bool req_match_received; /**< Prevent request to be completed prematurely */
opal_mutex_t lock; opal_mutex_t lock;
mca_bml_base_btl_t *rdma_bml;
mca_btl_base_registration_handle_t *local_handle;
mca_pml_ob1_com_btl_t req_rdma[1]; mca_pml_ob1_com_btl_t req_rdma[1];
}; };
typedef struct mca_pml_ob1_recv_request_t mca_pml_ob1_recv_request_t; typedef struct mca_pml_ob1_recv_request_t mca_pml_ob1_recv_request_t;
@ -131,8 +134,12 @@ do { \
#define MCA_PML_OB1_RECV_REQUEST_RETURN(recvreq) \ #define MCA_PML_OB1_RECV_REQUEST_RETURN(recvreq) \
{ \ { \
MCA_PML_BASE_RECV_REQUEST_FINI(&(recvreq)->req_recv); \ MCA_PML_BASE_RECV_REQUEST_FINI(&(recvreq)->req_recv); \
OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_recv_requests, \ if ((recvreq)->local_handle) { \
(ompi_free_list_item_t*)(recvreq)); \ mca_bml_base_deregister_mem ((recvreq)->rdma_bml, (recvreq)->local_handle); \
(recvreq)->local_handle = NULL; \
} \
OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_recv_requests, \
(ompi_free_list_item_t*)(recvreq)); \
} }
/** /**
@ -154,9 +161,11 @@ recv_request_pml_complete(mca_pml_ob1_recv_request_t *recvreq)
} }
for(i = 0; i < recvreq->req_rdma_cnt; i++) { for(i = 0; i < recvreq->req_rdma_cnt; i++) {
mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[i].btl_reg; struct mca_btl_base_registration_handle_t *handle = recvreq->req_rdma[i].btl_reg;
if( NULL != btl_reg && btl_reg->mpool != NULL) { mca_bml_base_btl_t *bml_btl = recvreq->req_rdma[i].bml_btl;
btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg );
if (NULL != handle) {
mca_bml_base_deregister_mem (bml_btl, handle);
} }
} }
recvreq->req_rdma_cnt = 0; recvreq->req_rdma_cnt = 0;
@ -178,6 +187,10 @@ recv_request_pml_complete(mca_pml_ob1_recv_request_t *recvreq)
recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR =
MPI_ERR_TRUNCATE; MPI_ERR_TRUNCATE;
} }
if (OPAL_UNLIKELY(recvreq->local_handle)) {
mca_bml_base_deregister_mem (recvreq->rdma_bml, recvreq->local_handle);
recvreq->local_handle = NULL;
}
MCA_PML_OB1_RECV_REQUEST_MPI_COMPLETE(recvreq); MCA_PML_OB1_RECV_REQUEST_MPI_COMPLETE(recvreq);
} }
OPAL_THREAD_UNLOCK(&ompi_request_lock); OPAL_THREAD_UNLOCK(&ompi_request_lock);
@ -387,7 +400,7 @@ static inline void mca_pml_ob1_recv_request_schedule(
(void)mca_pml_ob1_recv_request_schedule_exclusive(req, start_bml_btl); (void)mca_pml_ob1_recv_request_schedule_exclusive(req, start_bml_btl);
} }
#define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O) \ #define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O, Sz) \
do { \ do { \
mca_pml_ob1_pckt_pending_t *_pckt; \ mca_pml_ob1_pckt_pending_t *_pckt; \
\ \
@ -396,6 +409,7 @@ static inline void mca_pml_ob1_recv_request_schedule(
_pckt->hdr.hdr_ack.hdr_src_req.lval = (S); \ _pckt->hdr.hdr_ack.hdr_src_req.lval = (S); \
_pckt->hdr.hdr_ack.hdr_dst_req.pval = (D); \ _pckt->hdr.hdr_ack.hdr_dst_req.pval = (D); \
_pckt->hdr.hdr_ack.hdr_send_offset = (O); \ _pckt->hdr.hdr_ack.hdr_send_offset = (O); \
_pckt->hdr.hdr_ack.hdr_send_size = (Sz); \
_pckt->proc = (P); \ _pckt->proc = (P); \
_pckt->bml_btl = NULL; \ _pckt->bml_btl = NULL; \
OPAL_THREAD_LOCK(&mca_pml_ob1.lock); \ OPAL_THREAD_LOCK(&mca_pml_ob1.lock); \
@ -406,11 +420,11 @@ static inline void mca_pml_ob1_recv_request_schedule(
int mca_pml_ob1_recv_request_ack_send_btl(ompi_proc_t* proc, int mca_pml_ob1_recv_request_ack_send_btl(ompi_proc_t* proc,
mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req, mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req,
uint64_t hdr_rdma_offset, bool nordma); uint64_t hdr_rdma_offset, uint64_t size, bool nordma);
static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc, static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc,
uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset, uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset,
bool nordma) uint64_t size, bool nordma)
{ {
size_t i; size_t i;
mca_bml_base_btl_t* bml_btl; mca_bml_base_btl_t* bml_btl;
@ -420,12 +434,12 @@ static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc,
for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) { for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
if(mca_pml_ob1_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req, if(mca_pml_ob1_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req,
hdr_dst_req, hdr_send_offset, nordma) == OMPI_SUCCESS) hdr_dst_req, hdr_send_offset, size, nordma) == OMPI_SUCCESS)
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
MCA_PML_OB1_ADD_ACK_TO_PENDING(proc, hdr_src_req, hdr_dst_req, MCA_PML_OB1_ADD_ACK_TO_PENDING(proc, hdr_src_req, hdr_dst_req,
hdr_send_offset); hdr_send_offset, size);
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }

Просмотреть файл

@ -137,6 +137,7 @@ static void mca_pml_ob1_send_request_construct(mca_pml_ob1_send_request_t* req)
req->req_send.req_base.req_ompi.req_cancel = mca_pml_ob1_send_request_cancel; req->req_send.req_base.req_ompi.req_cancel = mca_pml_ob1_send_request_cancel;
req->req_rdma_cnt = 0; req->req_rdma_cnt = 0;
req->req_throttle_sends = false; req->req_throttle_sends = false;
req->rdma_frag = NULL;
OBJ_CONSTRUCT(&req->req_send_ranges, opal_list_t); OBJ_CONSTRUCT(&req->req_send_ranges, opal_list_t);
OBJ_CONSTRUCT(&req->req_send_range_lock, opal_mutex_t); OBJ_CONSTRUCT(&req->req_send_range_lock, opal_mutex_t);
} }
@ -145,6 +146,10 @@ static void mca_pml_ob1_send_request_destruct(mca_pml_ob1_send_request_t* req)
{ {
OBJ_DESTRUCT(&req->req_send_ranges); OBJ_DESTRUCT(&req->req_send_ranges);
OBJ_DESTRUCT(&req->req_send_range_lock); OBJ_DESTRUCT(&req->req_send_range_lock);
if (req->rdma_frag) {
MCA_PML_OB1_RDMA_FRAG_RETURN(req->rdma_frag);
req->rdma_frag = NULL;
}
} }
OBJ_CLASS_INSTANCE( mca_pml_ob1_send_request_t, OBJ_CLASS_INSTANCE( mca_pml_ob1_send_request_t,
@ -236,10 +241,9 @@ mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl,
* happens in one thread, the increase of the req_bytes_delivered does not * happens in one thread, the increase of the req_bytes_delivered does not
* have to be atomic. * have to be atomic.
*/ */
req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size, req_bytes_delivered = mca_pml_ob1_compute_segment_length_base ((void *) des->des_segments,
(void *) des->des_local, des->des_segment_count,
des->des_local_count, sizeof(mca_pml_ob1_rendezvous_hdr_t));
sizeof(mca_pml_ob1_rendezvous_hdr_t));
mca_pml_ob1_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered ); mca_pml_ob1_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered );
} }
@ -250,27 +254,18 @@ mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl,
*/ */
static void static void
mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl, mca_pml_ob1_rget_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_length)
struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_descriptor_t* des,
int status )
{ {
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)des->des_cbdata; mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
size_t req_bytes_delivered;
/* count bytes of user data actually delivered and check for request completion */ /* count bytes of user data actually delivered and check for request completion */
if (OPAL_LIKELY(OMPI_SUCCESS == status)) { if (OPAL_LIKELY(0 < rdma_length)) {
req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size, OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, (size_t) rdma_length);
(void *) des->des_local,
des->des_local_count, 0);
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
} }
sendreq->src_des = NULL;
send_request_pml_complete_check(sendreq); send_request_pml_complete_check(sendreq);
/* free the descriptor */
mca_bml_base_free(bml_btl, des);
MCA_PML_OB1_PROGRESS_PENDING(bml_btl); MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
} }
@ -314,10 +309,9 @@ mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl,
} }
/* count bytes of user data actually delivered */ /* count bytes of user data actually delivered */
req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size, req_bytes_delivered = mca_pml_ob1_compute_segment_length_base ((void *) des->des_segments,
(void *) des->des_local, des->des_segment_count,
des->des_local_count, sizeof(mca_pml_ob1_frag_hdr_t));
sizeof(mca_pml_ob1_frag_hdr_t));
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, -1); OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, -1);
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered); OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
@ -388,7 +382,7 @@ int mca_pml_ob1_send_request_start_buffered(
if( OPAL_UNLIKELY(NULL == des) ) { if( OPAL_UNLIKELY(NULL == des) ) {
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
segment = des->des_local; segment = des->des_segments;
/* pack the data into the BTL supplied buffer */ /* pack the data into the BTL supplied buffer */
iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval + iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval +
@ -407,17 +401,14 @@ int mca_pml_ob1_send_request_start_buffered(
/* build rendezvous header */ /* build rendezvous header */
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
hdr->hdr_common.hdr_flags = 0; mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RNDV, 0,
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV; sendreq->req_send.req_base.req_comm->c_contextid,
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; sendreq->req_send.req_base.req_comm->c_my_rank,
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; sendreq->req_send.req_base.req_tag,
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; (uint16_t)sendreq->req_send.req_base.req_sequence,
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; sendreq->req_send.req_bytes_packed, sendreq);
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc);
sendreq->req_send.req_base.req_proc);
/* update lengths */ /* update lengths */
segment->seg_len = sizeof(mca_pml_ob1_rendezvous_hdr_t) + max_data; segment->seg_len = sizeof(mca_pml_ob1_rendezvous_hdr_t) + max_data;
@ -490,15 +481,13 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
if(NULL != bml_btl->btl->btl_sendi) { if(NULL != bml_btl->btl->btl_sendi) {
mca_pml_ob1_match_hdr_t match; mca_pml_ob1_match_hdr_t match;
match.hdr_common.hdr_flags = 0; mca_pml_ob1_match_hdr_prepare (&match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH; sendreq->req_send.req_base.req_comm->c_contextid,
match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; sendreq->req_send.req_base.req_comm->c_my_rank,
match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; sendreq->req_send.req_base.req_tag,
match.hdr_tag = sendreq->req_send.req_base.req_tag; (uint16_t)sendreq->req_send.req_base.req_sequence);
match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
ob1_hdr_hton(&match, MCA_PML_OB1_HDR_TYPE_MATCH, ob1_hdr_hton (&match, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);
sendreq->req_send.req_base.req_proc);
/* try to send immediately */ /* try to send immediately */
rc = mca_bml_base_sendi( bml_btl, &sendreq->req_send.req_base.req_convertor, rc = mca_bml_base_sendi( bml_btl, &sendreq->req_send.req_base.req_convertor,
@ -531,7 +520,7 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
segment = des->des_local; segment = des->des_segments;
if(size > 0) { if(size > 0) {
/* pack the data into the supplied buffer */ /* pack the data into the supplied buffer */
@ -565,15 +554,13 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
/* build match header */ /* build match header */
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
hdr->hdr_common.hdr_flags = 0; mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH; sendreq->req_send.req_base.req_comm->c_contextid,
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; sendreq->req_send.req_base.req_comm->c_my_rank,
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; sendreq->req_send.req_base.req_tag,
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; (uint16_t)sendreq->req_send.req_base.req_sequence);
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);
sendreq->req_send.req_base.req_proc);
/* update lengths */ /* update lengths */
segment->seg_len = OMPI_PML_OB1_MATCH_HDR_LEN + max_data; segment->seg_len = OMPI_PML_OB1_MATCH_HDR_LEN + max_data;
@ -617,7 +604,6 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq,
/* prepare descriptor */ /* prepare descriptor */
mca_bml_base_prepare_src( bml_btl, mca_bml_base_prepare_src( bml_btl,
NULL,
&sendreq->req_send.req_base.req_convertor, &sendreq->req_send.req_base.req_convertor,
MCA_BTL_NO_ORDER, MCA_BTL_NO_ORDER,
OMPI_PML_OB1_MATCH_HDR_LEN, OMPI_PML_OB1_MATCH_HDR_LEN,
@ -627,19 +613,17 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq,
if( OPAL_UNLIKELY(NULL == des) ) { if( OPAL_UNLIKELY(NULL == des) ) {
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
segment = des->des_local; segment = des->des_segments;
/* build match header */ /* build match header */
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
hdr->hdr_common.hdr_flags = 0; mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH; sendreq->req_send.req_base.req_comm->c_contextid,
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; sendreq->req_send.req_base.req_comm->c_my_rank,
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; sendreq->req_send.req_base.req_tag,
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; (uint16_t)sendreq->req_send.req_base.req_sequence);
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);
sendreq->req_send.req_base.req_proc);
/* short message */ /* short message */
des->des_cbfunc = mca_pml_ob1_match_completion_free; des->des_cbfunc = mca_pml_ob1_match_completion_free;
@ -673,79 +657,67 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,
* one RDMA capable BTLs). This way round robin distribution of RDMA * one RDMA capable BTLs). This way round robin distribution of RDMA
* operation is achieved. * operation is achieved.
*/ */
mca_btl_base_registration_handle_t *local_handle;
mca_btl_base_descriptor_t *des, *src = NULL; mca_btl_base_descriptor_t *des;
mca_pml_ob1_rdma_frag_t *frag;
mca_pml_ob1_rget_hdr_t *hdr; mca_pml_ob1_rget_hdr_t *hdr;
size_t seg_size; size_t reg_size;
void *data_ptr;
int rc; int rc;
sendreq->src_des = NULL;
bml_btl = sendreq->req_rdma[0].bml_btl; bml_btl = sendreq->req_rdma[0].bml_btl;
if (!(bml_btl->btl_flags & (MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_CUDA_GET))) { if (!(bml_btl->btl_flags & (MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_CUDA_GET))) {
sendreq->rdma_frag = NULL;
/* This BTL does not support get. Use rendezvous to start the RDMA operation using put instead. */ /* This BTL does not support get. Use rendezvous to start the RDMA operation using put instead. */
return mca_pml_ob1_send_request_start_rndv (sendreq, bml_btl, 0, MCA_PML_OB1_HDR_FLAGS_CONTIG | return mca_pml_ob1_send_request_start_rndv (sendreq, bml_btl, 0, MCA_PML_OB1_HDR_FLAGS_CONTIG |
MCA_PML_OB1_HDR_FLAGS_PIN); MCA_PML_OB1_HDR_FLAGS_PIN);
} }
MEMCHECKER( /* at this time ob1 does not support non-contiguous gets. the convertor represents a
memchecker_call(&opal_memchecker_base_mem_defined, * contiguous block of memory */
sendreq->req_send.req_base.req_addr, opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr);
sendreq->req_send.req_base.req_count,
sendreq->req_send.req_base.req_datatype); local_handle = sendreq->req_rdma[0].btl_reg;
);
/* prepare source descriptor/segment(s) */ /* allocate an rdma fragment to keep track of the request size for use in the fin message */
/* PML owns this descriptor and will free it in */ MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
/* mca_pml_ob1_rget_completion */ if (OPAL_UNLIKELY(NULL == frag)) {
mca_bml_base_prepare_src( bml_btl, sendreq->req_rdma[0].btl_reg, return OPAL_ERR_OUT_OF_RESOURCE;
&sendreq->req_send.req_base.req_convertor,
MCA_BTL_NO_ORDER, 0, &size, MCA_BTL_DES_FLAGS_GET |
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, &src );
MEMCHECKER(
memchecker_call(&opal_memchecker_base_mem_noaccess,
sendreq->req_send.req_base.req_addr,
sendreq->req_send.req_base.req_count,
sendreq->req_send.req_base.req_datatype);
);
if( OPAL_UNLIKELY(NULL == src) ) {
return OMPI_ERR_OUT_OF_RESOURCE;
} }
src->des_cbfunc = mca_pml_ob1_rget_completion;
src->des_cbdata = sendreq;
sendreq->src_des = src; /* fill in necessary fragment data */
frag->rdma_req = sendreq;
frag->rdma_bml = bml_btl;
frag->rdma_length = size;
frag->cbfunc = mca_pml_ob1_rget_completion;
/* do not store the local handle in the fragment. it will be released by mca_pml_ob1_free_rdma_resources */
seg_size = bml_btl->btl->btl_seg_size * src->des_local_count; /* save the fragment for get->put fallback */
sendreq->rdma_frag = frag;
reg_size = bml_btl->btl->btl_registration_handle_size;
/* allocate space for get hdr + segment list */ /* allocate space for get hdr + segment list */
mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof (*hdr) + seg_size, mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof (*hdr) + reg_size,
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
if( OPAL_UNLIKELY(NULL == des) ) { if( OPAL_UNLIKELY(NULL == des) ) {
/* NTH: no need to reset the converter here. it will be reset before it is retried */ /* NTH: no need to reset the converter here. it will be reset before it is retried */
mca_bml_base_free(bml_btl, src);
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
/* build match header */ /* build match header */
hdr = (mca_pml_ob1_rget_hdr_t *) des->des_local->seg_addr.pval; hdr = (mca_pml_ob1_rget_hdr_t *) des->des_segments->seg_addr.pval;
/* TODO -- Add support for multiple segments for get */
hdr->hdr_rndv.hdr_match.hdr_common.hdr_flags = MCA_PML_OB1_HDR_FLAGS_CONTIG|MCA_PML_OB1_HDR_FLAGS_PIN; mca_pml_ob1_rget_hdr_prepare (hdr, MCA_PML_OB1_HDR_FLAGS_CONTIG | MCA_PML_OB1_HDR_FLAGS_PIN,
hdr->hdr_rndv.hdr_match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RGET; sendreq->req_send.req_base.req_comm->c_contextid,
hdr->hdr_rndv.hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; sendreq->req_send.req_base.req_comm->c_my_rank,
hdr->hdr_rndv.hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; sendreq->req_send.req_base.req_tag,
hdr->hdr_rndv.hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; (uint16_t)sendreq->req_send.req_base.req_sequence,
hdr->hdr_rndv.hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; sendreq->req_send.req_bytes_packed, sendreq,
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; frag, data_ptr, local_handle, reg_size);
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
hdr->hdr_des.pval = src;
hdr->hdr_seg_cnt = src->des_local_count;
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RGET, sendreq->req_send.req_base.req_proc); ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RGET, sendreq->req_send.req_base.req_proc);
/* copy segment data */
memcpy (hdr + 1, src->des_local, seg_size);
des->des_cbfunc = mca_pml_ob1_send_ctl_completion; des->des_cbfunc = mca_pml_ob1_send_ctl_completion;
des->des_cbdata = sendreq; des->des_cbdata = sendreq;
@ -763,12 +735,6 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,
rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RGET); rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RGET);
if (OPAL_UNLIKELY(rc < 0)) { if (OPAL_UNLIKELY(rc < 0)) {
mca_bml_base_free(bml_btl, des); mca_bml_base_free(bml_btl, des);
if (sendreq->src_des) {
mca_bml_base_free (bml_btl, sendreq->src_des);
sendreq->src_des = NULL;
}
return rc; return rc;
} }
@ -806,7 +772,6 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq,
sendreq->req_send.req_base.req_datatype); sendreq->req_send.req_base.req_datatype);
); );
mca_bml_base_prepare_src( bml_btl, mca_bml_base_prepare_src( bml_btl,
NULL,
&sendreq->req_send.req_base.req_convertor, &sendreq->req_send.req_base.req_convertor,
MCA_BTL_NO_ORDER, MCA_BTL_NO_ORDER,
sizeof(mca_pml_ob1_rendezvous_hdr_t), sizeof(mca_pml_ob1_rendezvous_hdr_t),
@ -824,21 +789,18 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq,
if( OPAL_UNLIKELY(NULL == des) ) { if( OPAL_UNLIKELY(NULL == des) ) {
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
segment = des->des_local; segment = des->des_segments;
/* build hdr */ /* build hdr */
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
hdr->hdr_common.hdr_flags = flags; mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RNDV, flags,
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV; sendreq->req_send.req_base.req_comm->c_contextid,
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; sendreq->req_send.req_base.req_comm->c_my_rank,
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; sendreq->req_send.req_base.req_tag,
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; (uint16_t)sendreq->req_send.req_base.req_sequence,
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; sendreq->req_send.req_bytes_packed, sendreq);
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc);
sendreq->req_send.req_base.req_proc);
/* first fragment of a long message */ /* first fragment of a long message */
des->des_cbdata = sendreq; des->des_cbdata = sendreq;
@ -1019,10 +981,8 @@ cannot_pack:
sendreq->req_send.req_base.req_count, sendreq->req_send.req_base.req_count,
sendreq->req_send.req_base.req_datatype); sendreq->req_send.req_base.req_datatype);
); );
mca_bml_base_prepare_src(bml_btl, NULL, mca_bml_base_prepare_src(bml_btl, &sendreq->req_send.req_base.req_convertor,
&sendreq->req_send.req_base.req_convertor, MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_frag_hdr_t),
MCA_BTL_NO_ORDER,
sizeof(mca_pml_ob1_frag_hdr_t),
&size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK, &des); &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK, &des);
MEMCHECKER( MEMCHECKER(
memchecker_call(&opal_memchecker_base_mem_noaccess, memchecker_call(&opal_memchecker_base_mem_noaccess,
@ -1046,12 +1006,9 @@ cannot_pack:
des->des_cbdata = sendreq; des->des_cbdata = sendreq;
/* setup header */ /* setup header */
hdr = (mca_pml_ob1_frag_hdr_t*)des->des_local->seg_addr.pval; hdr = (mca_pml_ob1_frag_hdr_t*)des->des_segments->seg_addr.pval;
hdr->hdr_common.hdr_flags = 0; mca_pml_ob1_frag_hdr_prepare (hdr, 0, range->range_send_offset, sendreq,
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FRAG; sendreq->req_recv.lval);
hdr->hdr_frag_offset = range->range_send_offset;
hdr->hdr_src_req.pval = sendreq;
hdr->hdr_dst_req = sendreq->req_recv;
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FRAG, ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FRAG,
sendreq->req_send.req_base.req_proc); sendreq->req_send.req_base.req_proc);
@ -1108,38 +1065,66 @@ cannot_pack:
} }
/**
* A put fragment could not be started. Queue the fragment to be retried later or
* fall back on send/recv.
*/
static void mca_pml_ob1_send_request_put_frag_failed (mca_pml_ob1_rdma_frag_t *frag, int rc)
{
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
if (++frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) {
/* queue the frag for later if there was a resource error */
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
} else {
/* tell receiver to deregister memory */
mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER,
OPAL_ERR_TEMP_OUT_OF_RESOURCE);
/* send fragment by copy in/out */
mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset,
frag->rdma_length);
/* if a pointer to a receive request is not set it means that
* ACK was not yet received. Don't schedule sends before ACK */
if (NULL != sendreq->req_recv.pval)
mca_pml_ob1_send_request_schedule (sendreq);
}
}
/** /**
* An RDMA put operation has completed: * An RDMA put operation has completed:
* (1) Update request status and if required set completed * (1) Update request status and if required set completed
* (2) Send FIN control message to the destination * (2) Send FIN control message to the destination
*/ */
static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl, static void mca_pml_ob1_put_completion (mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_endpoint_t* ep, void *local_address, mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_descriptor_t* des, void *context, void *cbdata, int status)
int status )
{ {
mca_pml_ob1_rdma_frag_t* frag = (mca_pml_ob1_rdma_frag_t*)des->des_cbdata; mca_pml_ob1_rdma_frag_t *frag = (mca_pml_ob1_rdma_frag_t *) cbdata;
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)frag->rdma_req; mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context; mca_bml_base_btl_t *bml_btl = (mca_bml_base_btl_t *) context;
/* check completion status */ /* check completion status */
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { if( OPAL_UNLIKELY(OMPI_SUCCESS == status) ) {
/* TSW - FIX */ /* TODO -- readd ordering */
OMPI_ERROR_LOG(status); mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
ompi_rte_abort(-1, NULL); frag->rdma_hdr.hdr_rdma.hdr_frag, frag->rdma_length,
0, 0);
/* check for request completion */
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length);
send_request_pml_complete_check(sendreq);
} else {
/* try to fall back on send/recv */
mca_pml_ob1_send_request_put_frag_failed (frag, status);
} }
mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
bml_btl,
frag->rdma_hdr.hdr_rdma.hdr_des,
des->order, 0);
/* check for request completion */
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length);
send_request_pml_complete_check(sendreq);
MCA_PML_OB1_RDMA_FRAG_RETURN(frag); MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
MCA_PML_OB1_PROGRESS_PENDING(bml_btl); MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
@ -1147,81 +1132,45 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag ) int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag )
{ {
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)frag->rdma_req; mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
mca_mpool_base_registration_t *reg = NULL; mca_btl_base_registration_handle_t *local_handle = NULL;
mca_bml_base_btl_t *bml_btl = frag->rdma_bml; mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
mca_btl_base_descriptor_t *des;
size_t save_size = frag->rdma_length;
int rc; int rc;
if (OPAL_LIKELY(NULL == sendreq->src_des)) { if (bml_btl->btl->btl_register_mem && NULL == frag->local_handle) {
/* setup descriptor */ /* Check if the segment is already registered */
mca_bml_base_prepare_src( bml_btl, for (size_t i = 0 ; i < sendreq->req_rdma_cnt ; ++i) {
reg, if (sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
&frag->convertor, /* do not copy the handle to the fragment to avoid deregistring it twice */
MCA_BTL_NO_ORDER, local_handle = sendreq->req_rdma[i].btl_reg;
0, break;
&frag->rdma_length, }
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | }
MCA_BTL_DES_FLAGS_PUT,
&des );
if( OPAL_UNLIKELY(NULL == des) ) {
if(frag->retries < mca_pml_ob1.rdma_retries_limit) {
size_t offset = (size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset;
frag->rdma_length = save_size;
opal_convertor_set_position(&frag->convertor, &offset);
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
} else {
mca_pml_ob1_send_request_t *sendreq =
(mca_pml_ob1_send_request_t*)frag->rdma_req;
/* tell receiver to unregister memory */ if (NULL == frag->local_handle) {
mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc, /* Not already registered. Register the region with the BTL. */
bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des, mca_bml_base_register_mem (bml_btl, frag->local_address, frag->rdma_length, 0,
MCA_BTL_NO_ORDER, 1); &frag->local_handle);
/* send fragment by copy in/out */ if (OPAL_UNLIKELY(NULL == frag->local_handle)) {
mca_pml_ob1_send_request_copy_in_out(sendreq, mca_pml_ob1_send_request_put_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE);
frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, frag->rdma_length);
/* if a pointer to a receive request is not set it means that return OMPI_ERR_OUT_OF_RESOURCE;
* ACK was not yet received. Don't schedule sends before ACK */
if(NULL != sendreq->req_recv.pval)
mca_pml_ob1_send_request_schedule(sendreq);
} }
return OMPI_ERR_OUT_OF_RESOURCE; local_handle = frag->local_handle;
} }
} else {
/* already have a source descriptor */
des = sendreq->src_des;
sendreq->src_des = NULL;
} }
des->des_remote = (mca_btl_base_segment_t *) frag->rdma_segs;
des->des_remote_count = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt;
des->des_cbfunc = mca_pml_ob1_put_completion;
des->des_cbdata = frag;
PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE, PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
&(((mca_pml_ob1_send_request_t*)frag->rdma_req)->req_send.req_base), save_size, PERUSE_SEND ); &(((mca_pml_ob1_send_request_t*)frag->rdma_req)->req_send.req_base), save_size, PERUSE_SEND );
rc = mca_bml_base_put(bml_btl, des); rc = mca_bml_base_put (bml_btl, frag->local_address, frag->remote_address, local_handle,
(mca_btl_base_registration_handle_t *) frag->remote_handle, frag->rdma_length,
0, MCA_BTL_NO_ORDER, mca_pml_ob1_put_completion, frag);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
mca_bml_base_free(bml_btl, des); mca_pml_ob1_send_request_put_frag_failed (frag, rc);
frag->rdma_length = save_size; return rc;
if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
return OMPI_ERR_OUT_OF_RESOURCE;
} else {
/* TSW - FIX */
OMPI_ERROR_LOG(rc);
ompi_rte_abort(-1, NULL);
}
} }
return OMPI_SUCCESS; return OMPI_SUCCESS;
@ -1235,12 +1184,11 @@ int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag )
*/ */
void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq, void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq,
mca_btl_base_module_t* btl, mca_btl_base_module_t* btl,
mca_pml_ob1_rdma_hdr_t* hdr ) mca_pml_ob1_rdma_hdr_t* hdr )
{ {
mca_bml_base_endpoint_t *bml_endpoint = sendreq->req_endpoint; mca_bml_base_endpoint_t *bml_endpoint = sendreq->req_endpoint;
mca_pml_ob1_rdma_frag_t* frag; mca_pml_ob1_rdma_frag_t* frag;
size_t i, size = 0;
if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_TYPE_ACK) { if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_TYPE_ACK) {
OPAL_THREAD_ADD32(&sendreq->req_state, -1); OPAL_THREAD_ADD32(&sendreq->req_state, -1);
@ -1248,61 +1196,36 @@ void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq,
sendreq->req_recv.pval = hdr->hdr_recv_req.pval; sendreq->req_recv.pval = hdr->hdr_recv_req.pval;
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag); if (NULL == sendreq->rdma_frag) {
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
if( OPAL_UNLIKELY(NULL == frag) ) { if( OPAL_UNLIKELY(NULL == frag) ) {
/* TSW - FIX */ /* TSW - FIX */
OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE); OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
ompi_rte_abort(-1, NULL); ompi_rte_abort(-1, NULL);
}
assert (btl->btl_seg_size * hdr->hdr_seg_cnt <= sizeof (frag->rdma_segs));
/* setup fragment */
memcpy (frag->rdma_segs, hdr + 1, btl->btl_seg_size * hdr->hdr_seg_cnt);
for( i = 0; i < hdr->hdr_seg_cnt; i++ ) {
mca_btl_base_segment_t *seg = (mca_btl_base_segment_t *) ((uintptr_t)(frag->rdma_segs) + i * btl->btl_seg_size);
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
if ((sendreq->req_send.req_base.req_proc->super.proc_arch & OPAL_ARCH_ISBIGENDIAN) !=
(ompi_proc_local()->super.proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
size += opal_swap_bytes4(seg->seg_len);
} else
#endif
{
size += seg->seg_len;
} }
} else {
/* rget fallback on put */
frag = sendreq->rdma_frag;
sendreq->rdma_frag = NULL;
sendreq->req_state = 0;
} }
/* copy registration data */
memcpy (frag->remote_handle, hdr + 1, btl->btl_registration_handle_size);
frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl); frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
frag->rdma_hdr.hdr_rdma = *hdr; frag->rdma_hdr.hdr_rdma = *hdr;
frag->rdma_req = sendreq; frag->rdma_req = sendreq;
frag->rdma_ep = bml_endpoint; frag->rdma_length = hdr->hdr_dst_size;
frag->rdma_length = size;
frag->rdma_state = MCA_PML_OB1_RDMA_PUT; frag->rdma_state = MCA_PML_OB1_RDMA_PUT;
frag->reg = NULL; frag->remote_address = hdr->hdr_dst_ptr;
frag->retries = 0; frag->retries = 0;
if (OPAL_UNLIKELY(NULL != sendreq->src_des)) { /* Get the address of the current offset. Note: at this time ob1 CAN NOT handle
/* get fallback path */ * non-contiguous RDMA. If that changes this code will be wrong. */
sendreq->req_state = 0; opal_convertor_get_offset_pointer (&sendreq->req_send.req_base.req_convertor,
} hdr->hdr_rdma_offset, &frag->local_address);
/* lookup the corresponding registration */
for(i=0; i<sendreq->req_rdma_cnt; i++) {
if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
frag->reg = sendreq->req_rdma[i].btl_reg;
break;
}
}
/* RDMA writes may proceed in parallel to send and to each other, so
* create clone of the convertor for each RDMA fragment
*/
size = hdr->hdr_rdma_offset;
opal_convertor_clone_with_position(&sendreq->req_send.req_base.req_convertor,
&frag->convertor, 0, &size);
mca_pml_ob1_send_request_put_frag(frag); mca_pml_ob1_send_request_put_frag(frag);
} }

Просмотреть файл

@ -12,7 +12,7 @@
* All rights reserved. * All rights reserved.
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2011-2012 NVIDIA Corporation. All rights reserved. * Copyright (c) 2011-2012 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
@ -54,7 +54,7 @@ struct mca_pml_ob1_send_request_t {
mca_pml_ob1_send_pending_t req_pending; mca_pml_ob1_send_pending_t req_pending;
opal_mutex_t req_send_range_lock; opal_mutex_t req_send_range_lock;
opal_list_t req_send_ranges; opal_list_t req_send_ranges;
mca_btl_base_descriptor_t *src_des; mca_pml_ob1_rdma_frag_t *rdma_frag;
mca_pml_ob1_com_btl_t req_rdma[1]; mca_pml_ob1_com_btl_t req_rdma[1];
}; };
typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t; typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t;
@ -124,10 +124,9 @@ get_request_from_send_pending(mca_pml_ob1_send_pending_t *type)
ompi_free_list_item_t* item; \ ompi_free_list_item_t* item; \
\ \
if( OPAL_LIKELY(NULL != proc) ) { \ if( OPAL_LIKELY(NULL != proc) ) { \
OMPI_FREE_LIST_WAIT_MT(&mca_pml_base_send_requests, item); \ OMPI_FREE_LIST_WAIT_MT(&mca_pml_base_send_requests, item); \
sendreq = (mca_pml_ob1_send_request_t*)item; \ sendreq = (mca_pml_ob1_send_request_t*)item; \
sendreq->req_send.req_base.req_proc = proc; \ sendreq->req_send.req_base.req_proc = proc; \
sendreq->src_des = NULL; \
} \ } \
} }
@ -163,15 +162,18 @@ get_request_from_send_pending(mca_pml_ob1_send_pending_t *type)
assert( 0 == _position ); \ assert( 0 == _position ); \
} }
static inline void mca_pml_ob1_free_rdma_resources(mca_pml_ob1_send_request_t* sendreq) static inline void mca_pml_ob1_free_rdma_resources (mca_pml_ob1_send_request_t* sendreq)
{ {
size_t r; size_t r;
/* return mpool resources */ /* return mpool resources */
for(r = 0; r < sendreq->req_rdma_cnt; r++) { for(r = 0; r < sendreq->req_rdma_cnt; r++) {
mca_mpool_base_registration_t* reg = sendreq->req_rdma[r].btl_reg; struct mca_btl_base_registration_handle_t *handle = sendreq->req_rdma[r].btl_reg;
if( NULL != reg && reg->mpool != NULL ) { mca_bml_base_btl_t *bml_btl = sendreq->req_rdma[r].bml_btl;
reg->mpool->mpool_deregister(reg->mpool, reg);
if (NULL != handle) {
mca_bml_base_deregister_mem (bml_btl, handle);
sendreq->req_rdma[r].btl_reg = NULL;
} }
} }
sendreq->req_rdma_cnt = 0; sendreq->req_rdma_cnt = 0;
@ -218,10 +220,14 @@ do {
#define MCA_PML_OB1_SEND_REQUEST_RETURN(sendreq) \ #define MCA_PML_OB1_SEND_REQUEST_RETURN(sendreq) \
do { \ do { \
/* Let the base handle the reference counts */ \ /* Let the base handle the reference counts */ \
MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send)); \ MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send)); \
OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_send_requests, \ if (sendreq->rdma_frag) { \
(ompi_free_list_item_t*)sendreq); \ MCA_PML_OB1_RDMA_FRAG_RETURN (sendreq->rdma_frag); \
sendreq->rdma_frag = NULL; \
} \
OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_send_requests, \
(ompi_free_list_item_t*)sendreq); \
} while(0) } while(0)

Просмотреть файл

@ -1,4 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
@ -217,6 +217,14 @@ static inline void opal_convertor_get_current_pointer( const opal_convertor_t* p
*position = (void*)base; *position = (void*)base;
} }
static inline void opal_convertor_get_offset_pointer( const opal_convertor_t* pConv,
size_t offset, void** position )
{
unsigned char* base = pConv->pBaseBuf + offset + pConv->pDesc->true_lb;
*position = (void*)base;
}
/* /*
* *
*/ */

Просмотреть файл

@ -36,10 +36,8 @@ mca_btl_active_message_callback_t mca_btl_base_active_message_trigger[MCA_BTL_TA
static void mca_btl_base_descriptor_constructor(mca_btl_base_descriptor_t* des) static void mca_btl_base_descriptor_constructor(mca_btl_base_descriptor_t* des)
{ {
des->des_local = NULL; des->des_segments = NULL;
des->des_local_count = 0; des->des_segment_count = 0;
des->des_remote = NULL;
des->des_remote_count = 0;
des->des_cbfunc = NULL; des->des_cbfunc = NULL;
des->des_cbdata = NULL; des->des_cbdata = NULL;
des->des_flags = 0; des->des_flags = 0;

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
@ -45,13 +46,15 @@ int mca_btl_base_param_register(mca_base_component_t *version,
MCA_BASE_VAR_SCOPE_READONLY, MCA_BASE_VAR_SCOPE_READONLY,
&module->btl_exclusivity); &module->btl_exclusivity);
asprintf(&msg, "BTL bit flags (general flags: SEND=%d, PUT=%d, GET=%d, SEND_INPLACE=%d, RDMA_MATCHED=%d, HETEROGENEOUS_RDMA=%d; flags only used by the \"dr\" PML (ignored by others): ACK=%d, CHECKSUM=%d, RDMA_COMPLETION=%d; flags only used by the \"bfo\" PML (ignored by others): FAILOVER_SUPPORT=%d)", asprintf(&msg, "BTL bit flags (general flags: SEND=%d, PUT=%d, GET=%d, SEND_INPLACE=%d, HETEROGENEOUS_RDMA=%d, "
"ATOMIC_OPS=%d; flags only used by the \"dr\" PML (ignored by others): ACK=%d, CHECKSUM=%d, "
"RDMA_COMPLETION=%d; flags only used by the \"bfo\" PML (ignored by others): FAILOVER_SUPPORT=%d)",
MCA_BTL_FLAGS_SEND, MCA_BTL_FLAGS_SEND,
MCA_BTL_FLAGS_PUT, MCA_BTL_FLAGS_PUT,
MCA_BTL_FLAGS_GET, MCA_BTL_FLAGS_GET,
MCA_BTL_FLAGS_SEND_INPLACE, MCA_BTL_FLAGS_SEND_INPLACE,
MCA_BTL_FLAGS_RDMA_MATCHED,
MCA_BTL_FLAGS_HETEROGENEOUS_RDMA, MCA_BTL_FLAGS_HETEROGENEOUS_RDMA,
MCA_BTL_FLAGS_ATOMIC_OPS,
MCA_BTL_FLAGS_NEED_ACK, MCA_BTL_FLAGS_NEED_ACK,
MCA_BTL_FLAGS_NEED_CSUM, MCA_BTL_FLAGS_NEED_CSUM,
MCA_BTL_FLAGS_RDMA_COMPLETION, MCA_BTL_FLAGS_RDMA_COMPLETION,
@ -63,6 +66,14 @@ int mca_btl_base_param_register(mca_base_component_t *version,
&module->btl_flags); &module->btl_flags);
free(msg); free(msg);
asprintf (&msg, "BTL atomic bit flags (general flags: ADD=%d, AND=%d, OR=%d, XOR=%d",
MCA_BTL_ATOMIC_SUPPORTS_ADD, MCA_BTL_ATOMIC_SUPPORTS_AND, MCA_BTL_ATOMIC_SUPPORTS_OR,
MCA_BTL_ATOMIC_SUPPORTS_XOR);
(void) mca_base_component_var_register(version, "atomic_flags", msg, MCA_BASE_VAR_TYPE_UNSIGNED_INT,
NULL, 0, MCA_BASE_VAR_FLAG_DEFAULT_ONLY, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT, &module->btl_atomic_flags);
free(msg);
(void) mca_base_component_var_register(version, "rndv_eager_limit", "Size (in bytes, including header) of \"phase 1\" fragment sent for all large messages (must be >= 0 and <= eager_limit)", (void) mca_base_component_var_register(version, "rndv_eager_limit", "Size (in bytes, including header) of \"phase 1\" fragment sent for all large messages (must be >= 0 and <= eager_limit)",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
OPAL_INFO_LVL_4, OPAL_INFO_LVL_4,
@ -74,6 +85,39 @@ int mca_btl_base_param_register(mca_base_component_t *version,
OPAL_INFO_LVL_4, OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_READONLY, MCA_BASE_VAR_SCOPE_READONLY,
&module->btl_eager_limit); &module->btl_eager_limit);
if ((module->btl_flags & MCA_BTL_FLAGS_GET) && module->btl_get) {
if (0 == module->btl_get_limit) {
module->btl_get_limit = SIZE_MAX;
}
(void) mca_base_component_var_register(version, "get_limit", "Maximum size (in bytes) for btl get",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_READONLY, &module->btl_get_limit);
/* Allow the user to set the alignment. The BTL should double-check the alignment in its open
* function. */
(void) mca_base_component_var_register(version, "get_alignment", "Alignment required for btl get",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_CONSTANT, &module->btl_get_alignment);
}
if ((module->btl_flags & MCA_BTL_FLAGS_PUT) && module->btl_put) {
if (0 == module->btl_put_limit) {
module->btl_put_limit = SIZE_MAX;
}
(void) mca_base_component_var_register(version, "put_limit", "Maximum size (in bytes) for btl put",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_READONLY, &module->btl_put_limit);
/* Allow the user to set the alignment. The BTL should double-check the alignment in its open
* function. */
(void) mca_base_component_var_register(version, "put_alignment", "Alignment required for btl put",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_CONSTANT, &module->btl_put_alignment);
}
#if OPAL_CUDA_GDR_SUPPORT #if OPAL_CUDA_GDR_SUPPORT
/* If no CUDA RDMA support, zero them out */ /* If no CUDA RDMA support, zero them out */
if (!(MCA_BTL_FLAGS_CUDA_GET & module->btl_flags)) { if (!(MCA_BTL_FLAGS_CUDA_GET & module->btl_flags)) {
@ -144,5 +188,17 @@ int mca_btl_base_param_verify(mca_btl_base_module_t *module)
module->btl_flags &= ~MCA_BTL_FLAGS_GET; module->btl_flags &= ~MCA_BTL_FLAGS_GET;
} }
if (0 == module->btl_atomic_flags) {
module->btl_flags &= ~MCA_BTL_FLAGS_ATOMIC_OPS;
}
if (0 == module->btl_get_limit) {
module->btl_get_limit = SIZE_MAX;
}
if (0 == module->btl_put_limit) {
module->btl_put_limit = SIZE_MAX;
}
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }

Просмотреть файл

@ -134,6 +134,23 @@ struct mca_btl_base_descriptor_t;
struct mca_mpool_base_resources_t; struct mca_mpool_base_resources_t;
struct opal_proc_t; struct opal_proc_t;
/**
* Opaque registration handle for executing RDMA and atomic
* operations on a memory region.
*
* This data inside this handle is appropriate for passing
* to remote peers to execute RDMA and atomic operations. The
* size needed to send the registration handle can be
* obtained from the btl via the btl_registration_handle_size
* member. If this size is 0 then no registration data is
* needed to execute RDMA or atomic operations.
*/
struct mca_btl_base_registration_handle_t;
typedef struct mca_btl_base_registration_handle_t mca_btl_base_registration_handle_t;
/* Wildcard endpoint for use in the register_mem function */
#define MCA_BTL_ENDPOINT_ANY (struct mca_btl_base_endpoint_t *) -1
/* send/recv operations require tag matching */ /* send/recv operations require tag matching */
typedef uint8_t mca_btl_base_tag_t; typedef uint8_t mca_btl_base_tag_t;
@ -173,6 +190,9 @@ typedef uint8_t mca_btl_base_tag_t;
#define MCA_BTL_FLAGS_SEND 0x0001 #define MCA_BTL_FLAGS_SEND 0x0001
#define MCA_BTL_FLAGS_PUT 0x0002 #define MCA_BTL_FLAGS_PUT 0x0002
#define MCA_BTL_FLAGS_GET 0x0004 #define MCA_BTL_FLAGS_GET 0x0004
/* btls that set the MCA_BTL_FLAGS_RDMA will always get added to the BML
* rdma_btls list. This allows the updated one-sided component to
* use btls that are not otherwise used for send/recv. */
#define MCA_BTL_FLAGS_RDMA (MCA_BTL_FLAGS_GET|MCA_BTL_FLAGS_PUT) #define MCA_BTL_FLAGS_RDMA (MCA_BTL_FLAGS_GET|MCA_BTL_FLAGS_PUT)
/* btl can send directly from user buffer w/out registration */ /* btl can send directly from user buffer w/out registration */
@ -209,6 +229,12 @@ typedef uint8_t mca_btl_base_tag_t;
*/ */
#define MCA_BTL_FLAGS_SIGNALED 0x4000 #define MCA_BTL_FLAGS_SIGNALED 0x4000
/** The BTL supports network atomic operations */
#define MCA_BTL_FLAGS_ATOMIC_OPS 0x08000
/** The BTL supports fetching network atomic operations */
#define MCA_BTL_FLAGS_ATOMIC_FOPS 0x10000
/* Default exclusivity levels */ /* Default exclusivity levels */
#define MCA_BTL_EXCLUSIVITY_HIGH (64*1024) /* internal loopback */ #define MCA_BTL_EXCLUSIVITY_HIGH (64*1024) /* internal loopback */
#define MCA_BTL_EXCLUSIVITY_DEFAULT 1024 /* GM/IB/etc. */ #define MCA_BTL_EXCLUSIVITY_DEFAULT 1024 /* GM/IB/etc. */
@ -219,6 +245,62 @@ typedef uint8_t mca_btl_base_tag_t;
#define MCA_BTL_ERROR_FLAGS_NONFATAL 0x2 #define MCA_BTL_ERROR_FLAGS_NONFATAL 0x2
#define MCA_BTL_ERROR_FLAGS_ADD_CUDA_IPC 0x4 #define MCA_BTL_ERROR_FLAGS_ADD_CUDA_IPC 0x4
/** registration flags */
enum {
/** Allow local write on the registered region. If a region is registered
* with this flag the registration can be used as the local handle for a
* btl_get operation. */
MCA_BTL_REG_FLAG_LOCAL_WRITE = 0x00000001,
/** Allow remote read on the registered region. If a region is registered
* with this flag the registration can be used as the remote handle for a
* btl_get operation. */
MCA_BTL_REG_FLAG_REMOTE_READ = 0x00000002,
/** Allow remote write on the registered region. If a region is registered
* with this flag the registration can be used as the remote handle for a
* btl_put operation. */
MCA_BTL_REG_FLAG_REMOTE_WRITE = 0x00000004,
/** Allow remote atomic operations on the registered region. If a region is
* registered with this flag the registration can be used as the remote
* handle for a btl_atomic_op or btl_atomic_fop operation. */
MCA_BTL_REG_FLAG_REMOTE_ATOMIC = 0x00000008,
/** Allow any btl operation on the registered region. If a region is registered
* with this flag the registration can be used as the local or remote handle for
* any btl operation. */
MCA_BTL_REG_FLAG_ACCESS_ANY = 0x0000000f,
#if OPAL_CUDA_GDR_SUPPORT
/** Region is in GPU memory */
MCA_BTL_REG_FLAG_CUDA_GPU_MEM = 0x00010000,
#endif
};
/** supported atomic operations */
enum {
/** The btl supports atomic add */
MCA_BTL_ATOMIC_SUPPORTS_ADD = 0x00000001,
/** The btl supports atomic bitwise and */
MCA_BTL_ATOMIC_SUPPORTS_AND = 0x00000200,
/** The btl supports atomic bitwise or */
MCA_BTL_ATOMIC_SUPPORTS_OR = 0x00000400,
/** The btl supports atomic bitwise exclusive or */
MCA_BTL_ATOMIC_SUPPORTS_XOR = 0x00000800,
/** The btl supports atomic compare-and-swap */
MCA_BTL_ATOMIC_SUPPORTS_CSWAP = 0x10000000,
/** The btl guarantees global atomicity (can mix btl atomics with cpu atomics) */
MCA_BTL_ATOMIC_SUPPORTS_GLOB = 0x20000000,
};
enum mca_btl_base_atomic_op_t {
/** Atomic add: (*remote_address) = (*remote_address) + operand */
MCA_BTL_ATOMIC_ADD = 0x0001,
/** Atomic and: (*remote_address) = (*remote_address) & operand */
MCA_BTL_ATOMIC_AND = 0x0011,
/** Atomic or: (*remote_address) = (*remote_address) | operand */
MCA_BTL_ATOMIC_OR = 0x0012,
/** Atomic xor: (*remote_address) = (*remote_address) ^ operand */
MCA_BTL_ATOMIC_XOR = 0x0014,
};
typedef enum mca_btl_base_atomic_op_t mca_btl_base_atomic_op_t;
/** /**
* Asynchronous callback function on completion of an operation. * Asynchronous callback function on completion of an operation.
* Completion Semantics: The descriptor can be reused or returned to the * Completion Semantics: The descriptor can be reused or returned to the
@ -237,6 +319,32 @@ typedef void (*mca_btl_base_completion_fn_t)(
struct mca_btl_base_descriptor_t* descriptor, struct mca_btl_base_descriptor_t* descriptor,
int status); int status);
/**
* Asynchronous callback function on completion of an rdma or atomic operation.
* Completion Semantics: The rdma or atomic memory operation has completed
* remotely (i.e.) is remotely visible and the caller is free to deregister
* the local_handle or modify the memory in local_address.
*
* @param[IN] module the BTL module
* @param[IN] endpoint the BTL endpoint
* @param[IN] local_address local address for the operation (if any)
* @param[IN] local_handle local handle associated with the local_address
* @param[IN] context callback context supplied to the rdma/atomic operation
* @param[IN] cbdata callback data supplied to the rdma/atomic operation
* @param[IN] status status of the operation
*
*/
typedef void (*mca_btl_base_rdma_completion_fn_t)(
struct mca_btl_base_module_t* module,
struct mca_btl_base_endpoint_t* endpoint,
void *local_address,
struct mca_btl_base_registration_handle_t *local_handle,
void *context,
void *cbdata,
int status);
/** /**
* Describes a region/segment of memory that is addressable * Describes a region/segment of memory that is addressable
* by an BTL. * by an BTL.
@ -262,20 +370,19 @@ struct mca_btl_base_segment_t {
}; };
typedef struct mca_btl_base_segment_t mca_btl_base_segment_t; typedef struct mca_btl_base_segment_t mca_btl_base_segment_t;
/** /**
* A descriptor that holds the parameters to a send/put/get * A descriptor that holds the parameters to a send/put/get
* operation along w/ a callback routine that is called on * operation along w/ a callback routine that is called on
* completion of the request. * completion of the request.
* Note: receive callbacks will store the incomming data segments in * Note: receive callbacks will store the incomming data segments in
* des_local * des_segments
*/ */
struct mca_btl_base_descriptor_t { struct mca_btl_base_descriptor_t {
ompi_free_list_item_t super; ompi_free_list_item_t super;
mca_btl_base_segment_t *des_local; /**< local segments */ mca_btl_base_segment_t *des_segments; /**< local segments */
size_t des_local_count; /**< number of local segments */ size_t des_segment_count; /**< number of local segments */
mca_btl_base_segment_t *des_remote; /**< remote segments */
size_t des_remote_count; /**< number of destination segments */
mca_btl_base_completion_fn_t des_cbfunc; /**< local callback function */ mca_btl_base_completion_fn_t des_cbfunc; /**< local callback function */
void* des_cbdata; /**< opaque callback data */ void* des_cbdata; /**< opaque callback data */
void* des_context; /**< more opaque callback data */ void* des_context; /**< more opaque callback data */
@ -329,6 +436,11 @@ OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_base_descriptor_t);
*/ */
#define MCA_BTL_SEG_MAX_SIZE 256 #define MCA_BTL_SEG_MAX_SIZE 256
/**
* Maximum size of a BTL registration handle in bytes
*/
#define MCA_BTL_REG_HANDLE_MAX_SIZE 256
/* /*
* BTL base header, stores the tag at a minimum * BTL base header, stores the tag at a minimum
*/ */
@ -395,7 +507,7 @@ typedef int (*mca_btl_base_component_progress_fn_t)(void);
* completion function, this implies that all data payload in the * completion function, this implies that all data payload in the
* mca_btl_base_descriptor_t must be copied out within this callback or * mca_btl_base_descriptor_t must be copied out within this callback or
* forfeited back to the BTL. * forfeited back to the BTL.
* Note also that descriptor segments (des_local) must be base * Note also that descriptor segments (des_segments) must be base
* segments for all callbacks. * segments for all callbacks.
* *
* @param[IN] btl BTL module * @param[IN] btl BTL module
@ -647,7 +759,6 @@ typedef int (*mca_btl_base_module_free_fn_t)(
typedef struct mca_btl_base_descriptor_t* (*mca_btl_base_module_prepare_fn_t)( typedef struct mca_btl_base_descriptor_t* (*mca_btl_base_module_prepare_fn_t)(
struct mca_btl_base_module_t* btl, struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint, struct mca_btl_base_endpoint_t* endpoint,
mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor, struct opal_convertor_t* convertor,
uint8_t order, uint8_t order,
size_t reserve, size_t reserve,
@ -655,6 +766,43 @@ typedef struct mca_btl_base_descriptor_t* (*mca_btl_base_module_prepare_fn_t)(
uint32_t flags uint32_t flags
); );
/**
* @brief Register a memory region for put/get/atomic operations.
*
* @param btl (IN) BTL module
* @param endpoint(IN) BTL addressing information (or NULL for all endpoints)
* @param base (IN) Pointer to start of region
* @param size (IN) Size of region
* @param flags (IN) Flags indicating what operation will be performed. Valid
* values are MCA_BTL_DES_FLAGS_PUT, MCA_BTL_DES_FLAGS_GET,
* and MCA_BTL_DES_FLAGS_ATOMIC
*
* @returns a memory registration handle valid for both local and remote operations
* @returns NULL if the region could not be registered
*
* This function registers the specified region with the hardware for use with
* the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop
* functions. Care should be taken to not hold an excessive number of registrations
* as they may use limited system/NIC resources.
*/
typedef struct mca_btl_base_registration_handle_t *(*mca_btl_base_module_register_mem_fn_t)(
struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base,
size_t size, uint32_t flags);
/**
* @brief Deregister a memory region
*
* @param btl (IN) BTL module region was registered with
* @param handle (IN) BTL registration handle to deregister
*
* This function deregisters the memory region associated with the specified handle. Care
* should be taken to not perform any RDMA or atomic operation on this memory region
* after it is deregistered. It is erroneous to specify a memory handle associated with
* a remote node.
*/
typedef int (*mca_btl_base_module_deregister_mem_fn_t)(
struct mca_btl_base_module_t* btl, struct mca_btl_base_registration_handle_t *handle);
/** /**
* Initiate an asynchronous send. * Initiate an asynchronous send.
* Completion Semantics: the descriptor has been queued for a send operation * Completion Semantics: the descriptor has been queued for a send operation
@ -698,7 +846,8 @@ typedef int (*mca_btl_base_module_send_fn_t)(
* @param flags (IN) Flags. * @param flags (IN) Flags.
* @param tag (IN) The tag value used to notify the peer. * @param tag (IN) The tag value used to notify the peer.
* @param descriptor (OUT) The descriptor to be returned unable to be sent immediately * @param descriptor (OUT) The descriptor to be returned unable to be sent immediately
* (may be NULL).
*
* @retval OPAL_SUCCESS The send was successfully queued * @retval OPAL_SUCCESS The send was successfully queued
* @retval OPAL_ERROR The send failed * @retval OPAL_ERROR The send failed
* @retval OPAL_ERR_UNREACH The endpoint is not reachable * @retval OPAL_ERR_UNREACH The endpoint is not reachable
@ -722,58 +871,210 @@ typedef int (*mca_btl_base_module_sendi_fn_t)(
/** /**
* Initiate an asynchronous put. * Initiate an asynchronous put.
* Completion Semantics: the descriptor has been queued for a put operation * Completion Semantics: if this function returns a 1 then the operation
* the BTL now controls the descriptor until local * is complete. a return of OPAL_SUCCESS indicates
* completion callback is made on the descriptor * the put operation has been queued with the
* network. the local_handle can not be deregistered
* until all outstanding operations on that handle
* have been completed.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param local_address (IN) Local address to put from (registered)
* @param remote_address (IN) Remote address to put to (registered remotely)
* @param local_handle (IN) Registration handle for region containing
* (local_address, local_address + size)
* @param remote_handle (IN) Remote registration handle for region containing
* (remote_address, remote_address + size)
* @param size (IN) Number of bytes to put
* @param flags (IN) Flags for this put operation
* @param order (IN) Ordering
* @param cbfunc (IN) Function to call on completion (if queued)
* @param cbcontext (IN) Context for the callback
* @param cbdata (IN) Data for callback
* *
* BTLs that do not have the MCA_BTL_FLAGS_RDMA_MATCHED flag set
* allow multiple concurrent put operations on the same descriptor.
* BTLs that do have the MCA_BTL_FLAGS_RDMA_MATCHED flag set require
* a corresponding prepare_src/dst call for each put operation and
* therefore prohibit multiple concurrent put operations.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*
* @retval OPAL_SUCCESS The descriptor was successfully queued for a put * @retval OPAL_SUCCESS The descriptor was successfully queued for a put
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a put * @retval OPAL_ERROR The descriptor was NOT successfully queued for a put
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put
* operation. Try again later
* @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or
* alignment restrictions.
*/ */
typedef int (*mca_btl_base_module_put_fn_t) (struct mca_btl_base_module_t *btl,
typedef int (*mca_btl_base_module_put_fn_t)( struct mca_btl_base_endpoint_t *endpoint, void *local_address,
struct mca_btl_base_module_t* btl, uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_endpoint_t* endpoint, struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
struct mca_btl_base_descriptor_t* descriptor int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
);
/** /**
* Initiate an asynchronous get. * Initiate an asynchronous get.
* Completion Semantics: if this function returns a 1 then the operation
* is complete. a return of OPAL_SUCCESS indicates
* the get operation has been queued with the
* network. the local_handle can not be deregistered
* until all outstanding operations on that handle
* have been completed.
* *
* Completion Semantics: the descriptor has been queued for a get operation * @param btl (IN) BTL module
* the BTL now controls the descriptor until local * @param endpoint (IN) BTL addressing information
* completion callback is made on the descriptor * @param local_address (IN) Local address to put from (registered)
* * @param remote_address (IN) Remote address to put to (registered remotely)
* BTLs that do not have the MCA_BTL_FLAGS_RDMA_MATCHED flag set * @param local_handle (IN) Registration handle for region containing
* allow multiple concurrent get operations on the same descriptor. * (local_address, local_address + size)
* BTLs that do have the MCA_BTL_FLAGS_RDMA_MATCHED flag set require * @param remote_handle (IN) Remote registration handle for region containing
* a corresponding prepare_src/dst call for each get operation and * (remote_address, remote_address + size)
* therefore prohibit multiple concurrent get operations. * @param size (IN) Number of bytes to put
* * @param flags (IN) Flags for this put operation
* @param btl (IN) BTL module * @param order (IN) Ordering
* @param endpoint (IN) BTL addressing information * @param cbfunc (IN) Function to call on completion (if queued)
* @param descriptor (IN) Description of the data to be transferred * @param cbcontext (IN) Context for the callback
* * @param cbdata (IN) Data for callback
* @retval OPAL_SUCCESS The descriptor was successfully queued for a get
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a get
* *
* @retval OPAL_SUCCESS The descriptor was successfully queued for a put
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a put
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put
* operation. Try again later
* @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or
* alignment restrictions.
*/ */
typedef int (*mca_btl_base_module_get_fn_t) (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
typedef int (*mca_btl_base_module_get_fn_t)( /**
struct mca_btl_base_module_t* btl, * Initiate an asynchronous atomic operation.
struct mca_btl_base_endpoint_t* endpoint, * Completion Semantics: if this function returns a 1 then the operation
struct mca_btl_base_descriptor_t* descriptor * is complete. a return of OPAL_SUCCESS indicates
); * the atomic operation has been queued with the
* network.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param remote_address (IN) Remote address to put to (registered remotely)
* @param remote_handle (IN) Remote registration handle for region containing
* (remote_address, remote_address + 8)
* @param op (IN) Operation to perform
* @param operand (IN) Operand for the operation
* @param flags (IN) Flags for this put operation
* @param order (IN) Ordering
* @param cbfunc (IN) Function to call on completion (if queued)
* @param cbcontext (IN) Context for the callback
* @param cbdata (IN) Data for callback
*
* @retval OPAL_SUCCESS The operation was successfully queued
* @retval 1 The operation is complete
* @retval OPAL_ERROR The operation was NOT successfully queued
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
* operation. Try again later
* @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
* alignment restrictions or the operation {op} is not supported
* by the hardware.
*
* After the operation is complete the remote address specified by {remote_address} and
* {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand.
* The btl will guarantee consistency of atomic operations performed via the btl. Note,
* however, that not all btls will provide consistency between btl atomic operations and
* cpu atomics.
*/
typedef int (*mca_btl_base_module_atomic_op64_fn_t) (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint, uint64_t remote_address,
struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata);
/**
* Initiate an asynchronous fetching atomic operation.
* Completion Semantics: if this function returns a 1 then the operation
* is complete. a return of OPAL_SUCCESS indicates
* the atomic operation has been queued with the
* network.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param local_address (OUT) Local address to store the result in
* @param remote_address (IN) Remote address perfom operation on to (registered remotely)
* @param local_handle (IN) Local registration handle for region containing
* (local_address, local_address + 8)
* @param remote_handle (IN) Remote registration handle for region containing
* (remote_address, remote_address + 8)
* @param op (IN) Operation to perform
* @param operand (IN) Operand for the operation
* @param flags (IN) Flags for this put operation
* @param order (IN) Ordering
* @param cbfunc (IN) Function to call on completion (if queued)
* @param cbcontext (IN) Context for the callback
* @param cbdata (IN) Data for callback
*
* @retval OPAL_SUCCESS The operation was successfully queued
* @retval 1 The operation is complete
* @retval OPAL_ERROR The operation was NOT successfully queued
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
* operation. Try again later
* @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
* alignment restrictions or the operation {op} is not supported
* by the hardware.
*
* After the operation is complete the remote address specified by {remote_address} and
* {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand.
* {local_address} will be updated with the previous value stored in {remote_address}.
* The btl will guarantee consistency of atomic operations performed via the btl. Note,
* however, that not all btls will provide consistency between btl atomic operations and
* cpu atomics.
*/
typedef int (*mca_btl_base_module_atomic_fop64_fn_t) (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address,
struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata);
/**
* Initiate an asynchronous compare and swap operation.
* Completion Semantics: if this function returns a 1 then the operation
* is complete. a return of OPAL_SUCCESS indicates
* the atomic operation has been queued with the
* network.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param local_address (OUT) Local address to store the result in
* @param remote_address (IN) Remote address perfom operation on to (registered remotely)
* @param local_handle (IN) Local registration handle for region containing
* (local_address, local_address + 8)
* @param remote_handle (IN) Remote registration handle for region containing
* (remote_address, remote_address + 8)
* @param compare (IN) Operand for the operation
* @param value (IN) Value to store on success
* @param flags (IN) Flags for this put operation
* @param order (IN) Ordering
* @param cbfunc (IN) Function to call on completion (if queued)
* @param cbcontext (IN) Context for the callback
* @param cbdata (IN) Data for callback
*
* @retval OPAL_SUCCESS The operation was successfully queued
* @retval 1 The operation is complete
* @retval OPAL_ERROR The operation was NOT successfully queued
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
* operation. Try again later
* @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
* alignment restrictions or the operation {op} is not supported
* by the hardware.
*
* After the operation is complete the remote address specified by {remote_address} and
* {remote_handle} will be updated with {value} if *remote_address == compare.
* {local_address} will be updated with the previous value stored in {remote_address}.
* The btl will guarantee consistency of atomic operations performed via the btl. Note,
* however, that not all btls will provide consistency between btl atomic operations and
* cpu atomics.
*/
typedef int (*mca_btl_base_module_atomic_cswap_fn_t) (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address,
struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, uint64_t compare,
uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata);
/** /**
* Diagnostic dump of btl state. * Diagnostic dump of btl state.
@ -813,7 +1114,14 @@ struct mca_btl_base_module_t {
uint32_t btl_latency; /**< relative ranking of latency used to prioritize btls */ uint32_t btl_latency; /**< relative ranking of latency used to prioritize btls */
uint32_t btl_bandwidth; /**< bandwidth (Mbytes/sec) supported by each endpoint */ uint32_t btl_bandwidth; /**< bandwidth (Mbytes/sec) supported by each endpoint */
uint32_t btl_flags; /**< flags (put/get...) */ uint32_t btl_flags; /**< flags (put/get...) */
size_t btl_seg_size; /**< size of a btl segment */ uint32_t btl_atomic_flags; /**< atomic operations supported (add, and, xor, etc) */
size_t btl_registration_handle_size; /**< size of the BTLs registration handles */
/* One-sided limitations (0 for no alignment, SIZE_MAX for no limit ) */
size_t btl_get_limit; /**< maximum size supported by the btl_get function */
size_t btl_get_alignment; /**< minimum alignment/size needed by btl_get (power of 2) */
size_t btl_put_limit; /**< maximum size supported by the btl_put function */
size_t btl_put_alignment; /**< minimum alignment/size needed by btl_put (power of 2) */
/* BTL function table */ /* BTL function table */
mca_btl_base_module_add_procs_fn_t btl_add_procs; mca_btl_base_module_add_procs_fn_t btl_add_procs;
@ -824,13 +1132,21 @@ struct mca_btl_base_module_t {
mca_btl_base_module_alloc_fn_t btl_alloc; mca_btl_base_module_alloc_fn_t btl_alloc;
mca_btl_base_module_free_fn_t btl_free; mca_btl_base_module_free_fn_t btl_free;
mca_btl_base_module_prepare_fn_t btl_prepare_src; mca_btl_base_module_prepare_fn_t btl_prepare_src;
mca_btl_base_module_prepare_fn_t btl_prepare_dst;
mca_btl_base_module_send_fn_t btl_send; mca_btl_base_module_send_fn_t btl_send;
mca_btl_base_module_sendi_fn_t btl_sendi; mca_btl_base_module_sendi_fn_t btl_sendi;
mca_btl_base_module_put_fn_t btl_put; mca_btl_base_module_put_fn_t btl_put;
mca_btl_base_module_get_fn_t btl_get; mca_btl_base_module_get_fn_t btl_get;
mca_btl_base_module_dump_fn_t btl_dump; mca_btl_base_module_dump_fn_t btl_dump;
/* atomic operations */
mca_btl_base_module_atomic_op64_fn_t btl_atomic_op;
mca_btl_base_module_atomic_fop64_fn_t btl_atomic_fop;
mca_btl_base_module_atomic_cswap_fn_t btl_atomic_cswap;
/* new memory registration functions */
mca_btl_base_module_register_mem_fn_t btl_register_mem; /**< memory registration function (NULL if not needed) */
mca_btl_base_module_deregister_mem_fn_t btl_deregister_mem; /**< memory deregistration function (NULL if not needed) */
/** the mpool associated with this btl (optional) */ /** the mpool associated with this btl (optional) */
mca_mpool_base_module_t* btl_mpool; mca_mpool_base_module_t* btl_mpool;
/** register a default error handler */ /** register a default error handler */

Просмотреть файл

@ -59,6 +59,9 @@ sources = \
btl_openib_fd.c \ btl_openib_fd.c \
btl_openib_ip.h \ btl_openib_ip.h \
btl_openib_ip.c \ btl_openib_ip.c \
btl_openib_put.c \
btl_openib_get.c \
btl_openib_atomic.c \
connect/base.h \ connect/base.h \
connect/btl_openib_connect_base.c \ connect/btl_openib_connect_base.c \
connect/btl_openib_connect_empty.c \ connect/btl_openib_connect_empty.c \

Просмотреть файл

@ -91,6 +91,11 @@
#define MIN(a,b) ((a)<(b)?(a):(b)) #define MIN(a,b) ((a)<(b)?(a):(b))
#endif #endif
static mca_btl_base_registration_handle_t *mca_btl_openib_register_mem (mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
void *base, size_t size, uint32_t flags);
static int mca_btl_openib_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle);
mca_btl_openib_module_t mca_btl_openib_module = { mca_btl_openib_module_t mca_btl_openib_module = {
.super = { .super = {
.btl_component = &mca_btl_openib_component.super, .btl_component = &mca_btl_openib_component.super,
@ -101,14 +106,19 @@ mca_btl_openib_module_t mca_btl_openib_module = {
.btl_alloc = mca_btl_openib_alloc, .btl_alloc = mca_btl_openib_alloc,
.btl_free = mca_btl_openib_free, .btl_free = mca_btl_openib_free,
.btl_prepare_src = mca_btl_openib_prepare_src, .btl_prepare_src = mca_btl_openib_prepare_src,
.btl_prepare_dst = mca_btl_openib_prepare_dst,
.btl_send = mca_btl_openib_send, .btl_send = mca_btl_openib_send,
.btl_sendi = mca_btl_openib_sendi, /* send immediate */ .btl_sendi = mca_btl_openib_sendi, /* send immediate */
.btl_put = mca_btl_openib_put, .btl_put = mca_btl_openib_put,
.btl_get = mca_btl_openib_get, .btl_get = mca_btl_openib_get,
.btl_dump = mca_btl_base_dump, .btl_dump = mca_btl_base_dump,
.btl_register_error = mca_btl_openib_register_error_cb, /* error call back registration */ .btl_register_error = mca_btl_openib_register_error_cb, /* error call back registration */
.btl_ft_event = mca_btl_openib_ft_event .btl_ft_event = mca_btl_openib_ft_event,
.btl_register_mem = mca_btl_openib_register_mem,
.btl_deregister_mem = mca_btl_openib_deregister_mem,
#if HAVE_DECL_IBV_ATOMIC_HCA
.btl_atomic_fop = mca_btl_openib_atomic_fop,
.btl_atomic_cswap = mca_btl_openib_atomic_cswap,
#endif
} }
}; };
@ -514,10 +524,12 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
if a user distributes different INI files or parameters for different node/procs, if a user distributes different INI files or parameters for different node/procs,
it is on his own responsibility */ it is on his own responsibility */
switch(mca_btl_openib_component.receive_queues_source) { switch(mca_btl_openib_component.receive_queues_source) {
case MCA_BASE_VAR_SOURCE_COMMAND_LINE: case MCA_BASE_VAR_SOURCE_COMMAND_LINE:
case MCA_BASE_VAR_SOURCE_ENV: case MCA_BASE_VAR_SOURCE_ENV:
case MCA_BASE_VAR_SOURCE_MAX: case MCA_BASE_VAR_SOURCE_FILE:
break; case MCA_BASE_VAR_SOURCE_SET:
case MCA_BASE_VAR_SOURCE_OVERRIDE:
break;
/* If the queues configuration was set from command line /* If the queues configuration was set from command line
(with --mca btl_openib_receive_queues parameter) => both sides have a same configuration */ (with --mca btl_openib_receive_queues parameter) => both sides have a same configuration */
@ -526,40 +538,38 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
not possible that remote side got its queues configuration from command line => not possible that remote side got its queues configuration from command line =>
(by prio) the configuration was set from INI file or (if not configure) (by prio) the configuration was set from INI file or (if not configure)
by default queues configuration */ by default queues configuration */
case MCA_BASE_VAR_SOURCE_FILE: case BTL_OPENIB_RQ_SOURCE_DEVICE_INI:
case MCA_BASE_VAR_SOURCE_SET: if(NULL != values.receive_queues) {
case MCA_BASE_VAR_SOURCE_OVERRIDE: recv_qps = values.receive_queues;
if(NULL != values.receive_queues) { } else {
recv_qps = values.receive_queues; recv_qps = mca_btl_openib_component.default_recv_qps;
} else { }
recv_qps = mca_btl_openib_component.default_recv_qps;
}
if(0 != strcmp(mca_btl_openib_component.receive_queues, if(0 != strcmp(mca_btl_openib_component.receive_queues,
recv_qps)) { recv_qps)) {
opal_show_help("help-mpi-btl-openib.txt", opal_show_help("help-mpi-btl-openib.txt",
"unsupported queues configuration", true, "unsupported queues configuration", true,
opal_process_info.nodename, opal_process_info.nodename,
ibv_get_device_name(openib_btl->device->ib_dev), ibv_get_device_name(openib_btl->device->ib_dev),
(openib_btl->device->ib_dev_attr).vendor_id, (openib_btl->device->ib_dev_attr).vendor_id,
(openib_btl->device->ib_dev_attr).vendor_part_id, (openib_btl->device->ib_dev_attr).vendor_part_id,
mca_btl_openib_component.receive_queues, mca_btl_openib_component.receive_queues,
opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal), opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
endpoint->rem_info.rem_vendor_id, endpoint->rem_info.rem_vendor_id,
endpoint->rem_info.rem_vendor_part_id, endpoint->rem_info.rem_vendor_part_id,
recv_qps); recv_qps);
return OPAL_ERROR; return OPAL_ERROR;
} }
break; break;
/* If the local queues configuration was set /* If the local queues configuration was set
by default queues => check all possible cases for remote side and compare */ by default queues => check all possible cases for remote side and compare */
case MCA_BASE_VAR_SOURCE_DEFAULT: case MCA_BASE_VAR_SOURCE_DEFAULT:
if(NULL != values.receive_queues) { if(NULL != values.receive_queues) {
if(0 != strcmp(mca_btl_openib_component.receive_queues, if(0 != strcmp(mca_btl_openib_component.receive_queues,
values.receive_queues)) { values.receive_queues)) {
opal_show_help("help-mpi-btl-openib.txt", opal_show_help("help-mpi-btl-openib.txt",
"unsupported queues configuration", true, "unsupported queues configuration", true,
opal_process_info.nodename, opal_process_info.nodename,
ibv_get_device_name(openib_btl->device->ib_dev), ibv_get_device_name(openib_btl->device->ib_dev),
@ -571,10 +581,10 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
endpoint->rem_info.rem_vendor_part_id, endpoint->rem_info.rem_vendor_part_id,
values.receive_queues); values.receive_queues);
return OPAL_ERROR; return OPAL_ERROR;
}
} }
break; }
break;
} }
return OPAL_SUCCESS; return OPAL_SUCCESS;
@ -724,7 +734,7 @@ static int prepare_device_for_use (mca_btl_openib_device_t *device)
#if OPAL_HAVE_THREADS #if OPAL_HAVE_THREADS
if(mca_btl_openib_component.use_async_event_thread) { if(mca_btl_openib_component.use_async_event_thread) {
mca_btl_openib_async_cmd_t async_command; mca_btl_openib_async_cmd_t async_command;
/* start the async even thread if it is not already started */ /* start the async even thread if it is not already started */
if (start_async_event_thread() != OPAL_SUCCESS) if (start_async_event_thread() != OPAL_SUCCESS)
@ -732,8 +742,8 @@ static int prepare_device_for_use (mca_btl_openib_device_t *device)
device->got_fatal_event = false; device->got_fatal_event = false;
device->got_port_event = false; device->got_port_event = false;
async_command.a_cmd = OPENIB_ASYNC_CMD_FD_ADD; async_command.a_cmd = OPENIB_ASYNC_CMD_FD_ADD;
async_command.fd = device->ib_dev_context->async_fd; async_command.fd = device->ib_dev_context->async_fd;
if (write(mca_btl_openib_component.async_pipe[1], if (write(mca_btl_openib_component.async_pipe[1],
&async_command, sizeof(mca_btl_openib_async_cmd_t))<0){ &async_command, sizeof(mca_btl_openib_async_cmd_t))<0){
BTL_ERROR(("Failed to write to pipe [%d]",errno)); BTL_ERROR(("Failed to write to pipe [%d]",errno));
@ -948,6 +958,12 @@ int mca_btl_openib_add_procs(
return rc; return rc;
} }
rc = mca_btl_openib_size_queues(openib_btl, nprocs);
if (OPAL_SUCCESS != rc) {
BTL_ERROR(("error creating cqs"));
return rc;
}
for (i = 0, local_procs = 0 ; i < (int) nprocs; i++) { for (i = 0, local_procs = 0 ; i < (int) nprocs; i++) {
struct opal_proc_t* proc = procs[i]; struct opal_proc_t* proc = procs[i];
mca_btl_openib_proc_t* ib_proc; mca_btl_openib_proc_t* ib_proc;
@ -959,11 +975,6 @@ int mca_btl_openib_add_procs(
local_procs ++; local_procs ++;
} }
/* OOB, XOOB, and RDMACM do not support SELF comunication, so
* mark the prco as unreachable by openib btl */
if (0 == opal_compare_proc(OPAL_PROC_MY_NAME, proc->proc_name)) {
continue;
}
#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE) #if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
/* Most current iWARP adapters (June 2008) cannot handle /* Most current iWARP adapters (June 2008) cannot handle
talking to other processes on the same host (!) -- so mark talking to other processes on the same host (!) -- so mark
@ -1133,7 +1144,7 @@ int mca_btl_openib_add_procs(
return OPAL_ERROR; return OPAL_ERROR;
} }
return mca_btl_openib_size_queues(openib_btl, nprocs); return OPAL_SUCCESS;
} }
/* /*
@ -1226,15 +1237,16 @@ ib_frag_alloc(mca_btl_openib_module_t *btl, size_t size, uint8_t order,
/* check if pending fragment has enough space for coalescing */ /* check if pending fragment has enough space for coalescing */
static mca_btl_openib_send_frag_t *check_coalescing(opal_list_t *frag_list, static mca_btl_openib_send_frag_t *check_coalescing(opal_list_t *frag_list,
opal_mutex_t *lock, mca_btl_base_endpoint_t *ep, size_t size) opal_mutex_t *lock, struct mca_btl_base_endpoint_t *ep, size_t size,
mca_btl_openib_coalesced_frag_t **cfrag)
{ {
mca_btl_openib_send_frag_t *frag = NULL; mca_btl_openib_send_frag_t *frag = NULL;
if(opal_list_is_empty(frag_list)) if (opal_list_is_empty(frag_list))
return NULL; return NULL;
OPAL_THREAD_LOCK(lock); OPAL_THREAD_LOCK(lock);
if(!opal_list_is_empty(frag_list)) { if (!opal_list_is_empty(frag_list)) {
int qp; int qp;
size_t total_length; size_t total_length;
opal_list_item_t *i = opal_list_get_first(frag_list); opal_list_item_t *i = opal_list_get_first(frag_list);
@ -1251,10 +1263,20 @@ static mca_btl_openib_send_frag_t *check_coalescing(opal_list_t *frag_list,
qp = to_base_frag(frag)->base.order; qp = to_base_frag(frag)->base.order;
if(total_length <= mca_btl_openib_component.qp_infos[qp].size) if(total_length <= mca_btl_openib_component.qp_infos[qp].size) {
opal_list_remove_first(frag_list); /* make sure we can allocate a coalescing frag before returning success */
else *cfrag = alloc_coalesced_frag();
if (OPAL_LIKELY(NULL != cfrag)) {
(*cfrag)->send_frag = frag;
(*cfrag)->sent = false;
opal_list_remove_first(frag_list);
} else {
frag = NULL;
}
} else {
frag = NULL; frag = NULL;
}
} }
OPAL_THREAD_UNLOCK(lock); OPAL_THREAD_UNLOCK(lock);
@ -1281,7 +1303,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
mca_btl_openib_module_t *obtl = (mca_btl_openib_module_t*)btl; mca_btl_openib_module_t *obtl = (mca_btl_openib_module_t*)btl;
int qp = frag_size_to_order(obtl, size); int qp = frag_size_to_order(obtl, size);
mca_btl_openib_send_frag_t *sfrag = NULL; mca_btl_openib_send_frag_t *sfrag = NULL;
mca_btl_openib_coalesced_frag_t *cfrag; mca_btl_openib_coalesced_frag_t *cfrag = NULL;
assert(qp != MCA_BTL_NO_ORDER); assert(qp != MCA_BTL_NO_ORDER);
@ -1290,26 +1312,25 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
int prio = !(flags & MCA_BTL_DES_FLAGS_PRIORITY); int prio = !(flags & MCA_BTL_DES_FLAGS_PRIORITY);
sfrag = check_coalescing(&ep->qps[qp].no_wqe_pending_frags[prio], sfrag = check_coalescing(&ep->qps[qp].no_wqe_pending_frags[prio],
&ep->endpoint_lock, ep, size); &ep->endpoint_lock, ep, size, &cfrag);
if(NULL == sfrag) { if (NULL == sfrag) {
if(BTL_OPENIB_QP_TYPE_PP(qp)) { if(BTL_OPENIB_QP_TYPE_PP(qp)) {
sfrag = check_coalescing(&ep->qps[qp].no_credits_pending_frags[prio], sfrag = check_coalescing(&ep->qps[qp].no_credits_pending_frags[prio],
&ep->endpoint_lock, ep, size); &ep->endpoint_lock, ep, size, &cfrag);
} else { } else {
sfrag = check_coalescing( sfrag = check_coalescing(
&obtl->qps[qp].u.srq_qp.pending_frags[prio], &obtl->qps[qp].u.srq_qp.pending_frags[prio],
&obtl->ib_lock, ep, size); &obtl->ib_lock, ep, size, &cfrag);
} }
} }
} }
if(NULL == sfrag) if (NULL == sfrag) {
return ib_frag_alloc((mca_btl_openib_module_t*)btl, size, order, flags); return ib_frag_alloc((mca_btl_openib_module_t*)btl, size, order, flags);
}
/* begin coalescing message */ /* begin coalescing message */
cfrag = alloc_coalesced_frag();
cfrag->send_frag = sfrag;
/* fix up new coalescing header if this is the first coalesced frag */ /* fix up new coalescing header if this is the first coalesced frag */
if(sfrag->hdr != sfrag->chdr) { if(sfrag->hdr != sfrag->chdr) {
@ -1343,10 +1364,9 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
to_base_frag(cfrag)->segment.base.seg_addr.pval = cfrag->hdr + 1; to_base_frag(cfrag)->segment.base.seg_addr.pval = cfrag->hdr + 1;
to_base_frag(cfrag)->segment.base.seg_len = size; to_base_frag(cfrag)->segment.base.seg_len = size;
/* save coalesced fragment on a main fragment; we will need it after send /* NTH: there is no reason to append the coalesced fragment here. No more
* completion to free it and to call upper layer callback */ * fragments will be added until either send or free has been called on
opal_list_append(&sfrag->coalesced_frags, (opal_list_item_t*)cfrag); * the coalesced frag. */
sfrag->coalesced_length += (size+sizeof(mca_btl_openib_header_coalesced_t));
to_base_frag(cfrag)->base.des_flags = flags; to_base_frag(cfrag)->base.des_flags = flags;
@ -1363,18 +1383,6 @@ int mca_btl_openib_free(
struct mca_btl_base_module_t* btl, struct mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des) mca_btl_base_descriptor_t* des)
{ {
/* is this fragment pointing at user memory? */
if(MCA_BTL_OPENIB_FRAG_SEND_USER == openib_frag_type(des) ||
MCA_BTL_OPENIB_FRAG_RECV_USER == openib_frag_type(des)) {
mca_btl_openib_com_frag_t* frag = to_com_frag(des);
if(frag->registration != NULL) {
btl->btl_mpool->mpool_deregister(btl->btl_mpool,
(mca_mpool_base_registration_t*)frag->registration);
frag->registration = NULL;
}
}
/* reset those field on free so we will not have to do it on alloc */ /* reset those field on free so we will not have to do it on alloc */
to_base_frag(des)->base.des_flags = 0; to_base_frag(des)->base.des_flags = 0;
switch(openib_frag_type(des)) { switch(openib_frag_type(des)) {
@ -1390,15 +1398,18 @@ int mca_btl_openib_free(
to_send_frag(des)->hdr + 1; to_send_frag(des)->hdr + 1;
assert(!opal_list_get_size(&to_send_frag(des)->coalesced_frags)); assert(!opal_list_get_size(&to_send_frag(des)->coalesced_frags));
/* fall through */ /* fall through */
case MCA_BTL_OPENIB_FRAG_RECV:
case MCA_BTL_OPENIB_FRAG_RECV_USER:
case MCA_BTL_OPENIB_FRAG_SEND_USER:
to_base_frag(des)->base.des_remote = NULL;
to_base_frag(des)->base.des_remote_count = 0;
break;
default: default:
break; break;
} }
if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED && !to_coalesced_frag(des)->sent) {
mca_btl_openib_send_frag_t *sfrag = to_coalesced_frag(des)->send_frag;
/* the coalesced fragment would have sent the original fragment but that
* will not happen so send the fragment now */
mca_btl_openib_endpoint_send(to_com_frag(sfrag)->endpoint, sfrag);
}
MCA_BTL_IB_FRAG_RETURN(des); MCA_BTL_IB_FRAG_RETURN(des);
return OPAL_SUCCESS; return OPAL_SUCCESS;
@ -1430,7 +1441,6 @@ int mca_btl_openib_free(
mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
struct mca_btl_base_module_t* btl, struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint, struct mca_btl_base_endpoint_t* endpoint,
mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor, struct opal_convertor_t* convertor,
uint8_t order, uint8_t order,
size_t reserve, size_t reserve,
@ -1438,7 +1448,6 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
uint32_t flags) uint32_t flags)
{ {
mca_btl_openib_module_t *openib_btl; mca_btl_openib_module_t *openib_btl;
mca_btl_openib_reg_t *openib_reg;
mca_btl_openib_com_frag_t *frag = NULL; mca_btl_openib_com_frag_t *frag = NULL;
struct iovec iov; struct iovec iov;
uint32_t iov_count = 1; uint32_t iov_count = 1;
@ -1448,83 +1457,20 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
openib_btl = (mca_btl_openib_module_t*)btl; openib_btl = (mca_btl_openib_module_t*)btl;
#if OPAL_CUDA_GDR_SUPPORT
if(opal_convertor_cuda_need_buffers(convertor) == false && 0 == reserve) {
#else
if(opal_convertor_need_buffers(convertor) == false && 0 == reserve) {
#endif /* OPAL_CUDA_GDR_SUPPORT */
/* GMS bloody HACK! */
if(registration != NULL || max_data > btl->btl_max_send_size) {
frag = alloc_send_user_frag();
if(NULL == frag) {
return NULL;
}
iov.iov_len = max_data;
iov.iov_base = NULL;
opal_convertor_pack(convertor, &iov, &iov_count, &max_data);
*size = max_data;
if(NULL == registration) {
rc = btl->btl_mpool->mpool_register(btl->btl_mpool,
iov.iov_base, max_data, 0, &registration);
if(OPAL_SUCCESS != rc || NULL == registration) {
MCA_BTL_IB_FRAG_RETURN(frag);
return NULL;
}
/* keep track of the registration we did */
to_com_frag(frag)->registration =
(mca_btl_openib_reg_t*)registration;
}
openib_reg = (mca_btl_openib_reg_t*)registration;
frag->sg_entry.length = max_data;
frag->sg_entry.lkey = openib_reg->mr->lkey;
frag->sg_entry.addr = (uint64_t)(uintptr_t)iov.iov_base;
to_base_frag(frag)->base.order = order;
to_base_frag(frag)->base.des_flags = flags;
to_base_frag(frag)->segment.base.seg_len = max_data;
to_base_frag(frag)->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) iov.iov_base;
to_base_frag(frag)->segment.key = frag->sg_entry.lkey;
assert(MCA_BTL_NO_ORDER == order);
BTL_VERBOSE(("frag->sg_entry.lkey = %" PRIu32 " .addr = %" PRIx64,
frag->sg_entry.lkey, frag->sg_entry.addr));
return &to_base_frag(frag)->base;
}
}
assert(MCA_BTL_NO_ORDER == order); assert(MCA_BTL_NO_ORDER == order);
if(max_data + reserve > btl->btl_max_send_size) { if (max_data + reserve > btl->btl_max_send_size) {
max_data = btl->btl_max_send_size - reserve; max_data = btl->btl_max_send_size - reserve;
} }
if (OPAL_UNLIKELY(0 == reserve)) { frag = (mca_btl_openib_com_frag_t *) mca_btl_openib_alloc (btl, endpoint, order,
frag = (mca_btl_openib_com_frag_t *) ib_frag_alloc(openib_btl, max_data, order, flags);
if(NULL == frag)
return NULL;
/* NTH: this frag will be ue used for either a get or put so we need to set the lval to be
consistent with the usage in get and put. the pval will be restored in mca_btl_openib_free */
ptr = to_base_frag(frag)->segment.base.seg_addr.pval;
to_base_frag(frag)->segment.base.seg_addr.lval =
(uint64_t)(uintptr_t) ptr;
} else {
frag =
(mca_btl_openib_com_frag_t *) mca_btl_openib_alloc(btl, endpoint, order,
max_data + reserve, flags); max_data + reserve, flags);
if(NULL == frag) if (NULL == frag) {
return NULL; return NULL;
ptr = to_base_frag(frag)->segment.base.seg_addr.pval;
} }
ptr = to_base_frag(frag)->segment.base.seg_addr.pval;
iov.iov_len = max_data; iov.iov_len = max_data;
iov.iov_base = (IOVBASE_TYPE *) ( (unsigned char*) ptr + reserve ); iov.iov_base = (IOVBASE_TYPE *) ( (unsigned char*) ptr + reserve );
rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data); rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data);
@ -1547,103 +1493,6 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
return &to_base_frag(frag)->base; return &to_base_frag(frag)->base;
} }
/**
* Prepare the dst buffer
*
* @param btl (IN) BTL module
* @param peer (IN) BTL peer addressing
* prepare dest's behavior depends on the following:
* Has a valid memory registration been passed to prepare_src?
* if so we attempt to use the pre-registered user-buffer, if the memory registration
* is to small (only a portion of the user buffer) then we must reregister the user buffer
* Has the user requested the memory to be left pinned?
* if so we insert the memory registration into a memory tree for later lookup, we
* may also remove a previous registration if a MRU (most recently used) list of
* registrations is full, this prevents resources from being exhausted.
*/
mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size,
uint32_t flags)
{
mca_btl_openib_module_t *openib_btl;
mca_btl_openib_component_t *openib_component;
mca_btl_openib_com_frag_t *frag;
mca_btl_openib_reg_t *openib_reg;
uint32_t max_msg_sz;
int rc;
void *buffer;
openib_btl = (mca_btl_openib_module_t*)btl;
openib_component = (mca_btl_openib_component_t*)btl->btl_component;
frag = alloc_recv_user_frag();
if(NULL == frag) {
return NULL;
}
/* max_msg_sz is the maximum message size of the HCA (hw limitation)
set the minimum between local max_msg_sz and the remote */
max_msg_sz = MIN(openib_btl->ib_port_attr.max_msg_sz,
endpoint->endpoint_btl->ib_port_attr.max_msg_sz);
/* check if user has explicitly limited the max message size */
if (openib_component->max_hw_msg_size > 0 &&
max_msg_sz > (size_t)openib_component->max_hw_msg_size) {
max_msg_sz = openib_component->max_hw_msg_size;
}
/* limit the message so to max_msg_sz */
if (*size > (size_t)max_msg_sz) {
*size = (size_t)max_msg_sz;
BTL_VERBOSE(("message size limited to %" PRIsize_t "\n", *size));
}
opal_convertor_get_current_pointer(convertor, &buffer);
if(NULL == registration){
/* we didn't get a memory registration passed in, so we have to
* register the region ourselves
*/
uint32_t mflags = 0;
#if OPAL_CUDA_GDR_SUPPORT
if (convertor->flags & CONVERTOR_CUDA) {
mflags |= MCA_MPOOL_FLAGS_CUDA_GPU_MEM;
}
#endif /* OPAL_CUDA_GDR_SUPPORT */
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, buffer, *size, mflags,
&registration);
if(OPAL_SUCCESS != rc || NULL == registration) {
MCA_BTL_IB_FRAG_RETURN(frag);
return NULL;
}
/* keep track of the registration we did */
frag->registration = (mca_btl_openib_reg_t*)registration;
}
openib_reg = (mca_btl_openib_reg_t*)registration;
frag->sg_entry.length = *size;
frag->sg_entry.lkey = openib_reg->mr->lkey;
frag->sg_entry.addr = (uint64_t)(uintptr_t)buffer;
to_base_frag(frag)->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) buffer;
to_base_frag(frag)->segment.base.seg_len = *size;
to_base_frag(frag)->segment.key = openib_reg->mr->rkey;
to_base_frag(frag)->base.order = order;
to_base_frag(frag)->base.des_flags = flags;
BTL_VERBOSE(("frag->sg_entry.lkey = %" PRIu32 " .addr = %" PRIx64 " "
"rkey = %" PRIu32, frag->sg_entry.lkey, frag->sg_entry.addr,
openib_reg->mr->rkey));
return &to_base_frag(frag)->base;
}
static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl) { static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl) {
mca_btl_openib_module_t* openib_btl; mca_btl_openib_module_t* openib_btl;
mca_btl_openib_endpoint_t* endpoint; mca_btl_openib_endpoint_t* endpoint;
@ -1796,16 +1645,15 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
{ {
mca_btl_openib_module_t *obtl = (mca_btl_openib_module_t*)btl; mca_btl_openib_module_t *obtl = (mca_btl_openib_module_t*)btl;
size_t size = payload_size + header_size; size_t size = payload_size + header_size;
size_t eager_limit;
int qp = frag_size_to_order(obtl, size), int qp = frag_size_to_order(obtl, size),
prio = !(flags & MCA_BTL_DES_FLAGS_PRIORITY), prio = !(flags & MCA_BTL_DES_FLAGS_PRIORITY),
ib_rc; ib_rc;
int32_t cm_return;
bool do_rdma = false; bool do_rdma = false;
ompi_free_list_item_t* item = NULL; ompi_free_list_item_t* item = NULL;
mca_btl_openib_frag_t *frag; mca_btl_openib_frag_t *frag;
mca_btl_openib_header_t *hdr; mca_btl_openib_header_t *hdr;
int send_signaled; int send_signaled;
int rc;
OPAL_THREAD_LOCK(&ep->endpoint_lock); OPAL_THREAD_LOCK(&ep->endpoint_lock);
@ -1827,45 +1675,26 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
/* Allocate WQE */ /* Allocate WQE */
if(OPAL_UNLIKELY(qp_get_wqe(ep, qp) < 0)) { if(OPAL_UNLIKELY(qp_get_wqe(ep, qp) < 0)) {
goto no_credits_or_wqe; goto cant_send_wqe;
}
/* eager rdma or send ? Check eager rdma credits */
/* Note: Maybe we want to implement isend only for eager rdma ?*/
eager_limit = mca_btl_openib_component.eager_limit +
sizeof(mca_btl_openib_header_coalesced_t) +
sizeof(mca_btl_openib_control_header_t);
if(OPAL_LIKELY(size <= eager_limit)) {
if(acquire_eager_rdma_send_credit(ep) == OPAL_SUCCESS) {
do_rdma = true;
}
}
/* if(!do_rdma && acquire_send_credit(ep, frag) != OPAL_SUCCESS) { */
/* Check send credits if it is no rdma */
if(!do_rdma) {
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
if(OPAL_UNLIKELY(OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.sd_credits, -1) < 0)){
OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.sd_credits, 1);
goto no_credits_or_wqe;
}
} else {
if(OPAL_UNLIKELY(OPAL_THREAD_ADD32(&obtl->qps[qp].u.srq_qp.sd_credits, -1) < 0)){
OPAL_THREAD_ADD32(&obtl->qps[qp].u.srq_qp.sd_credits, 1);
goto no_credits_or_wqe;
}
}
} }
/* Allocate fragment */ /* Allocate fragment */
OMPI_FREE_LIST_GET_MT(&obtl->device->qps[qp].send_free, item); OMPI_FREE_LIST_GET_MT(&obtl->device->qps[qp].send_free, item);
if(OPAL_UNLIKELY(NULL == item)) { if(OPAL_UNLIKELY(NULL == item)) {
/* we don't return NULL because maybe later we will try to coalesce */ /* we don't return NULL because maybe later we will try to coalesce */
goto no_frags; goto cant_send_wqe;
} }
frag = to_base_frag(item); frag = to_base_frag(item);
hdr = to_send_frag(item)->hdr; hdr = to_send_frag(item)->hdr;
/* eager rdma or send ? Check eager rdma credits */
/* Note: Maybe we want to implement isend only for eager rdma ?*/
rc = mca_btl_openib_endpoint_credit_acquire (ep, qp, prio, size, &do_rdma,
to_send_frag(frag), false);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
goto cant_send_frag;
}
frag->segment.base.seg_len = size; frag->segment.base.seg_len = size;
frag->base.order = qp; frag->base.order = qp;
frag->base.des_flags = flags; frag->base.des_flags = flags;
@ -1890,29 +1719,6 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
assert(max_data == payload_size); assert(max_data == payload_size);
} }
/* Set all credits */
BTL_OPENIB_GET_CREDITS(ep->eager_rdma_local.credits, hdr->credits);
if(hdr->credits)
hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG;
if(!do_rdma) {
if(BTL_OPENIB_QP_TYPE_PP(qp) && 0 == hdr->credits) {
BTL_OPENIB_GET_CREDITS(ep->qps[qp].u.pp_qp.rd_credits, hdr->credits);
}
} else {
hdr->credits |= (qp << 11);
}
BTL_OPENIB_GET_CREDITS(ep->qps[qp].u.pp_qp.cm_return, cm_return);
/* cm_seen is only 8 bytes, but cm_return is 32 bytes */
if(cm_return > 255) {
hdr->cm_seen = 255;
cm_return -= 255;
OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.cm_return, cm_return);
} else {
hdr->cm_seen = cm_return;
}
#if BTL_OPENIB_FAILOVER_ENABLED #if BTL_OPENIB_FAILOVER_ENABLED
send_signaled = 1; send_signaled = 1;
#else #else
@ -1920,7 +1726,7 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
#endif #endif
ib_rc = post_send(ep, to_send_frag(item), do_rdma, send_signaled); ib_rc = post_send(ep, to_send_frag(item), do_rdma, send_signaled);
if(!ib_rc) { if (!ib_rc) {
if (0 == send_signaled) { if (0 == send_signaled) {
MCA_BTL_IB_FRAG_RETURN(frag); MCA_BTL_IB_FRAG_RETURN(frag);
} }
@ -1931,37 +1737,28 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
} }
#endif #endif
OPAL_THREAD_UNLOCK(&ep->endpoint_lock); OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }
/* Failed to send, do clean up all allocated resources */ /* Failed to send, do clean up all allocated resources */
if(ep->nbo) { if (ep->nbo) {
BTL_OPENIB_HEADER_NTOH(*hdr); BTL_OPENIB_HEADER_NTOH(*hdr);
} }
if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
OPAL_THREAD_ADD32(&ep->eager_rdma_local.credits, mca_btl_openib_endpoint_credit_release (ep, qp, do_rdma, to_send_frag(frag));
BTL_OPENIB_CREDITS(hdr->credits));
} cant_send_frag:
if (!do_rdma && BTL_OPENIB_QP_TYPE_PP(qp)) { MCA_BTL_IB_FRAG_RETURN(frag);
OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.rd_credits, cant_send_wqe:
hdr->credits); qp_put_wqe (ep, qp);
}
no_frags:
if(do_rdma) {
OPAL_THREAD_ADD32(&ep->eager_rdma_remote.tokens, 1);
} else {
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.sd_credits, 1);
} else if BTL_OPENIB_QP_TYPE_SRQ(qp){
OPAL_THREAD_ADD32(&obtl->qps[qp].u.srq_qp.sd_credits, 1);
}
}
no_credits_or_wqe:
qp_put_wqe(ep, qp);
cant_send: cant_send:
OPAL_THREAD_UNLOCK(&ep->endpoint_lock); OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
/* We can not send the data directly, so we just return descriptor */ /* We can not send the data directly, so we just return descriptor */
*descriptor = mca_btl_openib_alloc(btl, ep, order, size, flags); if (NULL != descriptor) {
*descriptor = mca_btl_openib_alloc(btl, ep, order, size, flags);
}
return OPAL_ERR_RESOURCE_BUSY; return OPAL_ERR_RESOURCE_BUSY;
} }
/* /*
@ -1981,11 +1778,19 @@ int mca_btl_openib_send(
openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED); openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED);
if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED) { if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED) {
frag = to_coalesced_frag(des)->send_frag;
/* save coalesced fragment on a main fragment; we will need it after send
* completion to free it and to call upper layer callback */
opal_list_append(&frag->coalesced_frags, (opal_list_item_t*) des);
frag->coalesced_length += to_coalesced_frag(des)->hdr->alloc_size +
sizeof(mca_btl_openib_header_coalesced_t);
to_coalesced_frag(des)->sent = true;
to_coalesced_frag(des)->hdr->tag = tag; to_coalesced_frag(des)->hdr->tag = tag;
to_coalesced_frag(des)->hdr->size = des->des_local->seg_len; to_coalesced_frag(des)->hdr->size = des->des_segments->seg_len;
if(ep->nbo) if(ep->nbo)
BTL_OPENIB_HEADER_COALESCED_HTON(*to_coalesced_frag(des)->hdr); BTL_OPENIB_HEADER_COALESCED_HTON(*to_coalesced_frag(des)->hdr);
frag = to_coalesced_frag(des)->send_frag;
} else { } else {
frag = to_send_frag(des); frag = to_send_frag(des);
to_com_frag(des)->endpoint = ep; to_com_frag(des)->endpoint = ep;
@ -1997,161 +1802,34 @@ int mca_btl_openib_send(
return mca_btl_openib_endpoint_send(ep, frag); return mca_btl_openib_endpoint_send(ep, frag);
} }
/* static mca_btl_base_registration_handle_t *mca_btl_openib_register_mem (mca_btl_base_module_t *btl,
* RDMA WRITE local buffer to remote buffer address. mca_btl_base_endpoint_t *endpoint,
*/ void *base, size_t size, uint32_t flags)
int mca_btl_openib_put( mca_btl_base_module_t* btl,
mca_btl_base_endpoint_t* ep,
mca_btl_base_descriptor_t* descriptor)
{ {
mca_btl_openib_segment_t *src_seg = (mca_btl_openib_segment_t *) descriptor->des_local; mca_btl_openib_reg_t *reg;
mca_btl_openib_segment_t *dst_seg = (mca_btl_openib_segment_t *) descriptor->des_remote; uint32_t mflags = 0;
struct ibv_send_wr* bad_wr; int rc;
mca_btl_openib_out_frag_t* frag = to_out_frag(descriptor);
int qp = descriptor->order;
uint64_t rem_addr = dst_seg->base.seg_addr.lval;
uint32_t rkey = dst_seg->key;
assert(openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_SEND_USER || #if OPAL_CUDA_GDR_SUPPORT
openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_SEND); if (flags & MCA_BTL_REG_FLAG_CUDA_GPU_MEM) {
mflags |= MCA_MPOOL_FLAGS_CUDA_GPU_MEM;
}
#endif /* OPAL_CUDA_GDR_SUPPORT */
descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; rc = btl->btl_mpool->mpool_register (btl->btl_mpool, base, size, mflags,
(mca_mpool_base_registration_t **) &reg);
if(ep->endpoint_state != MCA_BTL_IB_CONNECTED) { if (OPAL_UNLIKELY(OPAL_SUCCESS != rc || NULL == reg)) {
int rc; return NULL;
OPAL_THREAD_LOCK(&ep->endpoint_lock);
rc = check_endpoint_state(ep, descriptor, &ep->pending_put_frags);
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
if(OPAL_ERR_RESOURCE_BUSY == rc)
return OPAL_SUCCESS;
if(OPAL_SUCCESS != rc)
return rc;
} }
if(MCA_BTL_NO_ORDER == qp) return &reg->btl_handle;
qp = mca_btl_openib_component.rdma_qp;
/* check for a send wqe */
if (qp_get_wqe(ep, qp) < 0) {
qp_put_wqe(ep, qp);
OPAL_THREAD_LOCK(&ep->endpoint_lock);
opal_list_append(&ep->pending_put_frags, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
return OPAL_SUCCESS;
}
/* post descriptor */
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
if((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
!= (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
rem_addr = opal_swap_bytes8(rem_addr);
rkey = opal_swap_bytes4(rkey);
}
#endif
frag->sr_desc.wr.rdma.remote_addr = rem_addr;
frag->sr_desc.wr.rdma.rkey = rkey;
to_com_frag(frag)->sg_entry.addr = src_seg->base.seg_addr.lval;
to_com_frag(frag)->sg_entry.length = src_seg->base.seg_len;
to_com_frag(frag)->endpoint = ep;
#if HAVE_XRC
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp))
frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num;
#endif
descriptor->order = qp;
/* Setting opcode on a frag constructor isn't enough since prepare_src
* may return send_frag instead of put_frag */
frag->sr_desc.opcode = IBV_WR_RDMA_WRITE;
frag->sr_desc.send_flags = ib_send_flags(src_seg->base.seg_len, &(ep->qps[qp]), 1);
qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
qp_reset_signal_count(ep, qp);
if(ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr))
return OPAL_ERROR;
return OPAL_SUCCESS;
} }
/* static int mca_btl_openib_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle)
* RDMA READ remote buffer to local buffer address.
*/
int mca_btl_openib_get(mca_btl_base_module_t* btl,
mca_btl_base_endpoint_t* ep,
mca_btl_base_descriptor_t* descriptor)
{ {
mca_btl_openib_segment_t *src_seg = (mca_btl_openib_segment_t *) descriptor->des_remote; mca_btl_openib_reg_t *reg = (mca_btl_openib_reg_t *)((intptr_t) handle - offsetof (mca_btl_openib_reg_t, btl_handle));
mca_btl_openib_segment_t *dst_seg = (mca_btl_openib_segment_t *) descriptor->des_local;
struct ibv_send_wr* bad_wr;
mca_btl_openib_get_frag_t* frag = to_get_frag(descriptor);
int qp = descriptor->order;
uint64_t rem_addr = src_seg->base.seg_addr.lval;
uint32_t rkey = src_seg->key;
assert(openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_RECV_USER); btl->btl_mpool->mpool_deregister (btl->btl_mpool, (mca_mpool_base_registration_t *) reg);
descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
if(ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
int rc;
OPAL_THREAD_LOCK(&ep->endpoint_lock);
rc = check_endpoint_state(ep, descriptor, &ep->pending_get_frags);
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
if(OPAL_ERR_RESOURCE_BUSY == rc)
return OPAL_SUCCESS;
if(OPAL_SUCCESS != rc)
return rc;
}
if(MCA_BTL_NO_ORDER == qp)
qp = mca_btl_openib_component.rdma_qp;
/* check for a send wqe */
if (qp_get_wqe(ep, qp) < 0) {
qp_put_wqe(ep, qp);
OPAL_THREAD_LOCK(&ep->endpoint_lock);
opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
return OPAL_SUCCESS;
}
/* check for a get token */
if(OPAL_THREAD_ADD32(&ep->get_tokens,-1) < 0) {
qp_put_wqe(ep, qp);
OPAL_THREAD_ADD32(&ep->get_tokens,1);
OPAL_THREAD_LOCK(&ep->endpoint_lock);
opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
return OPAL_SUCCESS;
}
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
if((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
!= (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
rem_addr = opal_swap_bytes8(rem_addr);
rkey = opal_swap_bytes4(rkey);
}
#endif
frag->sr_desc.wr.rdma.remote_addr = rem_addr;
frag->sr_desc.wr.rdma.rkey = rkey;
to_com_frag(frag)->sg_entry.addr = dst_seg->base.seg_addr.lval;
to_com_frag(frag)->sg_entry.length = dst_seg->base.seg_len;
to_com_frag(frag)->endpoint = ep;
#if HAVE_XRC
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp))
frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num;
#endif
descriptor->order = qp;
qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
qp_reset_signal_count(ep, qp);
if(ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr))
return OPAL_ERROR;
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }

Просмотреть файл

@ -126,10 +126,7 @@ struct mca_btl_openib_qp_info_t {
(BTL_OPENIB_QP_TYPE(Q) == MCA_BTL_OPENIB_XRC_QP) (BTL_OPENIB_QP_TYPE(Q) == MCA_BTL_OPENIB_XRC_QP)
typedef enum { typedef enum {
BTL_OPENIB_RQ_SOURCE_DEFAULT, BTL_OPENIB_RQ_SOURCE_DEVICE_INI = MCA_BASE_VAR_SOURCE_MAX,
BTL_OPENIB_RQ_SOURCE_MCA,
BTL_OPENIB_RQ_SOURCE_DEVICE_INI,
BTL_OPENIB_RQ_SOURCE_MAX
} btl_openib_receive_queues_source_t; } btl_openib_receive_queues_source_t;
typedef enum { typedef enum {
@ -497,9 +494,15 @@ typedef struct mca_btl_openib_module_t mca_btl_openib_module_t;
extern mca_btl_openib_module_t mca_btl_openib_module; extern mca_btl_openib_module_t mca_btl_openib_module;
struct mca_btl_base_registration_handle_t {
uint32_t rkey;
uint32_t lkey;
};
struct mca_btl_openib_reg_t { struct mca_btl_openib_reg_t {
mca_mpool_base_registration_t base; mca_mpool_base_registration_t base;
struct ibv_mr *mr; struct ibv_mr *mr;
mca_btl_base_registration_handle_t btl_handle;
}; };
typedef struct mca_btl_openib_reg_t mca_btl_openib_reg_t; typedef struct mca_btl_openib_reg_t mca_btl_openib_reg_t;
@ -612,32 +615,182 @@ extern int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t** descriptor mca_btl_base_descriptor_t** descriptor
); );
/** /* forward decaration for internal put/get */
* PML->BTL Initiate a put of the specified size. struct mca_btl_openib_put_frag_t;
* struct mca_btl_openib_get_frag_t;
* @param btl (IN) BTL instance
* @param btl_peer (IN) BTL peer addressing
* @param descriptor (IN) Descriptor of data to be transmitted.
*/
extern int mca_btl_openib_put(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* btl_peer,
struct mca_btl_base_descriptor_t* descriptor
);
/** /**
* PML->BTL Initiate a get of the specified size. * @brief Schedule a put fragment with the HCA (internal)
* *
* @param btl (IN) BTL instance * @param btl (IN) BTL instance
* @param btl_base_peer (IN) BTL peer addressing * @param ep (IN) BTL endpoint
* @param descriptor (IN) Descriptor of data to be transmitted. * @param frag (IN) Fragment prepared by mca_btl_openib_put
*
* If the fragment can not be scheduled due to resource limitations then
* the fragment will be put on the pending put fragment list and retried
* when another get/put fragment has completed.
*/ */
extern int mca_btl_openib_get( int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
struct mca_btl_base_module_t* btl, struct mca_btl_openib_put_frag_t *frag);
struct mca_btl_base_endpoint_t* btl_peer,
struct mca_btl_base_descriptor_t* descriptor
);
/**
* @brief Schedule an RDMA write with the HCA
*
* @param btl (IN) BTL instance
* @param ep (IN) BTL endpoint
* @param local_address (IN) Source address
* @param remote_address (IN) Destination address
* @param local_handle (IN) Registration handle for region containing the region {local_address, size}
* @param remote_handle (IN) Registration handle for region containing the region {remote_address, size}
* @param size (IN) Number of bytes to write
* @param flags (IN) Transfer flags
* @param order (IN) Ordering
* @param cbfunc (IN) Function to call on completion
* @param cbcontext (IN) Context for completion callback
* @param cbdata (IN) Data for completion callback
*
* @return OPAL_ERR_BAD_PARAM if a bad parameter was passed
* @return OPAL_SUCCCESS if the operation was successfully scheduled
*
* This function will attempt to schedule a put operation with the HCA.
*/
int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
/**
* @brief Schedule a get fragment with the HCA (internal)
*
* @param btl (IN) BTL instance
* @param ep (IN) BTL endpoint
* @param qp (IN) ID of queue pair to schedule the get on
* @param frag (IN) Fragment prepared by mca_btl_openib_get
*
* If the fragment can not be scheduled due to resource limitations then
* the fragment will be put on the pending get fragment list and retried
* when another get/put fragment has completed.
*/
int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
struct mca_btl_openib_get_frag_t *frag);
/**
* @brief Schedule an RDMA read with the HCA
*
* @param btl (IN) BTL instance
* @param ep (IN) BTL endpoint
* @param local_address (IN) Destination address
* @param remote_address (IN) Source address
* @param local_handle (IN) Registration handle for region containing the region {local_address, size}
* @param remote_handle (IN) Registration handle for region containing the region {remote_address, size}
* @param size (IN) Number of bytes to read
* @param flags (IN) Transfer flags
* @param order (IN) Ordering
* @param cbfunc (IN) Function to call on completion
* @param cbcontext (IN) Context for completion callback
* @param cbdata (IN) Data for completion callback
*
* @return OPAL_ERR_BAD_PARAM if a bad parameter was passed
* @return OPAL_SUCCCESS if the operation was successfully scheduled
*
* This function will attempt to schedule a get operation with the HCA.
*/
int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
/**
* Initiate an asynchronous fetching atomic operation.
* Completion Semantics: if this function returns a 1 then the operation
* is complete. a return of OPAL_SUCCESS indicates
* the atomic operation has been queued with the
* network.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param local_address (OUT) Local address to store the result in
* @param remote_address (IN) Remote address perfom operation on to (registered remotely)
* @param local_handle (IN) Local registration handle for region containing
* (local_address, local_address + 8)
* @param remote_handle (IN) Remote registration handle for region containing
* (remote_address, remote_address + 8)
* @param op (IN) Operation to perform
* @param operand (IN) Operand for the operation
* @param flags (IN) Flags for this put operation
* @param order (IN) Ordering
* @param cbfunc (IN) Function to call on completion (if queued)
* @param cbcontext (IN) Context for the callback
* @param cbdata (IN) Data for callback
*
* @retval OPAL_SUCCESS The operation was successfully queued
* @retval 1 The operation is complete
* @retval OPAL_ERROR The operation was NOT successfully queued
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
* operation. Try again later
* @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
* alignment restrictions or the operation {op} is not supported
* by the hardware.
*
* After the operation is complete the remote address specified by {remote_address} and
* {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand.
* {local_address} will be updated with the previous value stored in {remote_address}.
* The btl will guarantee consistency of atomic operations performed via the btl. Note,
* however, that not all btls will provide consistency between btl atomic operations and
* cpu atomics.
*/
int mca_btl_openib_atomic_fop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address,
struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
int64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata);
/**
* Initiate an asynchronous compare and swap operation.
* Completion Semantics: if this function returns a 1 then the operation
* is complete. a return of OPAL_SUCCESS indicates
* the atomic operation has been queued with the
* network.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param local_address (OUT) Local address to store the result in
* @param remote_address (IN) Remote address perfom operation on to (registered remotely)
* @param local_handle (IN) Local registration handle for region containing
* (local_address, local_address + 8)
* @param remote_handle (IN) Remote registration handle for region containing
* (remote_address, remote_address + 8)
* @param compare (IN) Operand for the operation
* @param value (IN) Value to store on success
* @param flags (IN) Flags for this put operation
* @param order (IN) Ordering
* @param cbfunc (IN) Function to call on completion (if queued)
* @param cbcontext (IN) Context for the callback
* @param cbdata (IN) Data for callback
*
* @retval OPAL_SUCCESS The operation was successfully queued
* @retval 1 The operation is complete
* @retval OPAL_ERROR The operation was NOT successfully queued
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
* operation. Try again later
* @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
* alignment restrictions or the operation {op} is not supported
* by the hardware.
*
* After the operation is complete the remote address specified by {remote_address} and
* {remote_handle} will be updated with {value} if *remote_address == compare.
* {local_address} will be updated with the previous value stored in {remote_address}.
* The btl will guarantee consistency of atomic operations performed via the btl. Note,
* however, that not all btls will provide consistency between btl atomic operations and
* cpu atomics.
*/
int mca_btl_openib_atomic_cswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address,
struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, int64_t compare,
int64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata);
/** /**
* Allocate a descriptor. * Allocate a descriptor.
@ -674,7 +827,6 @@ extern int mca_btl_openib_free(
mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
struct mca_btl_base_module_t* btl, struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* peer, struct mca_btl_base_endpoint_t* peer,
mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor, struct opal_convertor_t* convertor,
uint8_t order, uint8_t order,
size_t reserve, size_t reserve,
@ -682,22 +834,6 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
uint32_t flags uint32_t flags
); );
/**
* Allocate a descriptor initialized for RDMA write.
*
* @param btl (IN) BTL module
* @param peer (IN) BTL peer addressing
*/
extern mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* peer,
mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size,
uint32_t flags);
extern void mca_btl_openib_frag_progress_pending_put_get( extern void mca_btl_openib_frag_progress_pending_put_get(
struct mca_btl_base_endpoint_t*, const int); struct mca_btl_base_endpoint_t*, const int);

135
opal/mca/btl/openib/btl_openib_atomic.c Обычный файл
Просмотреть файл

@ -0,0 +1,135 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_openib.h"
#include "btl_openib_endpoint.h"
#include "btl_openib_xrc.h"
#if HAVE_DECL_IBV_ATOMIC_HCA
static int mca_btl_openib_atomic_internal (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, enum ibv_wr_opcode opcode,
int64_t operand, int operand2, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata)
{
mca_btl_openib_get_frag_t* frag = NULL;
int qp = order;
int rc;
frag = to_get_frag(alloc_recv_user_frag());
if (OPAL_UNLIKELY(NULL == frag)) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
if (MCA_BTL_NO_ORDER == qp) {
qp = mca_btl_openib_component.rdma_qp;
}
/* set base descriptor flags */
to_base_frag(frag)->base.order = qp;
/* free this descriptor when the operation is complete */
to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
/* set up scatter-gather entry */
to_com_frag(frag)->sg_entry.length = 8;
to_com_frag(frag)->sg_entry.lkey = local_handle->lkey;
to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t) local_address;
to_com_frag(frag)->endpoint = endpoint;
/* set up rdma callback */
frag->cb.func = cbfunc;
frag->cb.context = cbcontext;
frag->cb.data = cbdata;
frag->cb.local_handle = local_handle;
/* set up descriptor */
frag->sr_desc.wr.atomic.remote_addr = remote_address;
frag->sr_desc.opcode = opcode;
frag->sr_desc.wr.atomic.compare_add = operand;
frag->sr_desc.wr.atomic.swap = operand2;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
if((endpoint->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
!= (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
frag->sr_desc.wr.atomic.rkey = opal_swap_bytes4 (remote_handle->rkey);
} else
#endif
{
frag->sr_desc.wr.atomic.rkey = remote_handle->rkey;
}
#if HAVE_XRC
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
frag->sr_desc.xrc_remote_srq_num=endpoint->rem_info.rem_srqs[qp].rem_srq_num;
}
#endif
if (endpoint->endpoint_state != MCA_BTL_IB_CONNECTED) {
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
rc = check_endpoint_state(endpoint, &to_base_frag(frag)->base, &endpoint->pending_get_frags);
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
if (OPAL_ERR_RESOURCE_BUSY == rc) {
return OPAL_SUCCESS;
}
if (OPAL_SUCCESS != rc) {
MCA_BTL_IB_FRAG_RETURN (frag);
return rc;
}
}
rc = mca_btl_openib_get_internal (btl, endpoint, frag);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
rc = OPAL_SUCCESS;
OPAL_THREAD_SCOPED_LOCK(&endpoint->endpoint_lock,
opal_list_append(&endpoint->pending_get_frags, (opal_list_item_t*)frag));
} else {
MCA_BTL_IB_FRAG_RETURN (frag);
}
}
return rc;
}
int mca_btl_openib_atomic_fop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address,
struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
int64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata)
{
if (OPAL_UNLIKELY(MCA_BTL_ATOMIC_ADD != op)) {
return OPAL_ERR_NOT_SUPPORTED;
}
return mca_btl_openib_atomic_internal (btl, endpoint, local_address, remote_address, local_handle,
remote_handle, IBV_WR_ATOMIC_FETCH_AND_ADD, operand, 0,
flags, order, cbfunc, cbcontext, cbdata);
}
int mca_btl_openib_atomic_cswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address,
struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, int64_t compare,
int64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata)
{
return mca_btl_openib_atomic_internal (btl, endpoint, local_address, remote_address, local_handle,
remote_handle, IBV_WR_ATOMIC_CMP_AND_SWP, compare, value,
flags, order, cbfunc, cbcontext, cbdata);
}
#endif

Просмотреть файл

@ -471,7 +471,7 @@ static void btl_openib_control(mca_btl_base_module_t* btl,
mca_btl_openib_header_coalesced_t *clsc_hdr = mca_btl_openib_header_coalesced_t *clsc_hdr =
(mca_btl_openib_header_coalesced_t*)(ctl_hdr + 1); (mca_btl_openib_header_coalesced_t*)(ctl_hdr + 1);
mca_btl_active_message_callback_t* reg; mca_btl_active_message_callback_t* reg;
size_t len = des->des_local->seg_len - sizeof(*ctl_hdr); size_t len = des->des_segments->seg_len - sizeof(*ctl_hdr);
switch (ctl_hdr->type) { switch (ctl_hdr->type) {
case MCA_BTL_OPENIB_CONTROL_CREDITS: case MCA_BTL_OPENIB_CONTROL_CREDITS:
@ -522,8 +522,8 @@ static void btl_openib_control(mca_btl_base_module_t* btl,
skip = (sizeof(*clsc_hdr) + clsc_hdr->alloc_size - pad); skip = (sizeof(*clsc_hdr) + clsc_hdr->alloc_size - pad);
tmp_des.des_local = &tmp_seg; tmp_des.des_segments = &tmp_seg;
tmp_des.des_local_count = 1; tmp_des.des_segment_count = 1;
tmp_seg.seg_addr.pval = clsc_hdr + 1; tmp_seg.seg_addr.pval = clsc_hdr + 1;
tmp_seg.seg_len = clsc_hdr->size; tmp_seg.seg_len = clsc_hdr->size;
@ -583,6 +583,10 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size,
enum ibv_access_flags access_flag = (enum ibv_access_flags) (IBV_ACCESS_LOCAL_WRITE | enum ibv_access_flags access_flag = (enum ibv_access_flags) (IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ); IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
#if HAVE_DECL_IBV_ATOMIC_HCA
access_flag |= IBV_ACCESS_REMOTE_ATOMIC;
#endif
if (device->mem_reg_max && if (device->mem_reg_max &&
device->mem_reg_max < (device->mem_reg_active + size)) { device->mem_reg_max < (device->mem_reg_active + size)) {
return OPAL_ERR_OUT_OF_RESOURCE; return OPAL_ERR_OUT_OF_RESOURCE;
@ -605,6 +609,9 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size,
return OPAL_ERR_OUT_OF_RESOURCE; return OPAL_ERR_OUT_OF_RESOURCE;
} }
openib_reg->btl_handle.lkey = openib_reg->mr->lkey;
openib_reg->btl_handle.rkey = openib_reg->mr->rkey;
OPAL_OUTPUT_VERBOSE((30, mca_btl_openib_component.memory_registration_verbose, OPAL_OUTPUT_VERBOSE((30, mca_btl_openib_component.memory_registration_verbose,
"openib_reg_mr: base=%p, bound=%p, size=%d, flags=0x%x", reg->base, reg->bound, "openib_reg_mr: base=%p, bound=%p, size=%d, flags=0x%x", reg->base, reg->bound,
(int) (reg->bound - reg->base + 1), reg->flags)); (int) (reg->bound - reg->base + 1), reg->flags));
@ -804,7 +811,30 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbfunc = btl_openib_control; mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbfunc = btl_openib_control;
mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbdata = NULL; mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbdata = NULL;
openib_btl->super.btl_seg_size = sizeof (mca_btl_openib_segment_t); if (openib_btl->super.btl_get_limit > openib_btl->ib_port_attr.max_msg_sz) {
openib_btl->super.btl_get_limit = openib_btl->ib_port_attr.max_msg_sz;
}
openib_btl->super.btl_get_alignment = 0;
if (openib_btl->super.btl_put_limit > openib_btl->ib_port_attr.max_msg_sz) {
openib_btl->super.btl_put_limit = openib_btl->ib_port_attr.max_msg_sz;
}
#if HAVE_DECL_IBV_ATOMIC_HCA
if (openib_btl->device->ib_dev_attr.atomic_cap == IBV_ATOMIC_NONE) {
openib_btl->super.btl_flags &= ~MCA_BTL_FLAGS_ATOMIC_FOPS;
openib_btl->super.btl_atomic_flags = 0;
openib_btl->super.btl_atomic_fop = NULL;
openib_btl->super.btl_atomic_cswap = NULL;
} else if (IBV_ATOMIC_GLOB == openib_btl->device->ib_dev_attr.atomic_cap) {
openib_btl->super.btl_flags |= MCA_BTL_ATOMIC_SUPPORTS_GLOB;
}
#endif
openib_btl->super.btl_put_alignment = 0;
openib_btl->super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
/* Check bandwidth configured for this device */ /* Check bandwidth configured for this device */
sprintf(param, "bandwidth_%s", ibv_get_device_name(device->ib_dev)); sprintf(param, "bandwidth_%s", ibv_get_device_name(device->ib_dev));
@ -1960,9 +1990,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
} }
/* If the MCA param was specified, skip all the checks */ /* If the MCA param was specified, skip all the checks */
if ( MCA_BASE_VAR_SOURCE_COMMAND_LINE || if (MCA_BASE_VAR_SOURCE_DEFAULT != mca_btl_openib_component.receive_queues_source) {
MCA_BASE_VAR_SOURCE_ENV ==
mca_btl_openib_component.receive_queues_source) {
goto good; goto good;
} }
@ -1980,7 +2008,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
mca_btl_openib_component.receive_queues = mca_btl_openib_component.receive_queues =
strdup(values.receive_queues); strdup(values.receive_queues);
mca_btl_openib_component.receive_queues_source = mca_btl_openib_component.receive_queues_source =
MCA_BASE_VAR_SOURCE_FILE; BTL_OPENIB_RQ_SOURCE_DEVICE_INI;
} }
} }
@ -2881,17 +2909,20 @@ void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep,
size_t i, len = opal_list_get_size(&ep->pending_get_frags); size_t i, len = opal_list_get_size(&ep->pending_get_frags);
int rc; int rc;
for(i = 0; i < len && ep->qps[qp].qp->sd_wqe > 0 && ep->get_tokens > 0; i++) for(i = 0; i < len && ep->qps[qp].qp->sd_wqe > 0 && ep->get_tokens > 0; i++) {
{
OPAL_THREAD_LOCK(&ep->endpoint_lock); OPAL_THREAD_LOCK(&ep->endpoint_lock);
frag = opal_list_remove_first(&(ep->pending_get_frags)); frag = opal_list_remove_first(&(ep->pending_get_frags));
OPAL_THREAD_UNLOCK(&ep->endpoint_lock); OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
if(NULL == frag) if (NULL == frag)
break; break;
rc = mca_btl_openib_get((mca_btl_base_module_t *)openib_btl, ep, rc = mca_btl_openib_get_internal ((mca_btl_base_module_t *)openib_btl, ep,
&to_base_frag(frag)->base); to_get_frag(frag));
if(OPAL_ERR_OUT_OF_RESOURCE == rc) if (OPAL_ERR_OUT_OF_RESOURCE == rc) {
OPAL_THREAD_LOCK(&ep->endpoint_lock);
opal_list_prepend (&ep->pending_get_frags, frag);
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
break; break;
}
} }
len = opal_list_get_size(&ep->pending_put_frags); len = opal_list_get_size(&ep->pending_put_frags);
@ -2899,12 +2930,16 @@ void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep,
OPAL_THREAD_LOCK(&ep->endpoint_lock); OPAL_THREAD_LOCK(&ep->endpoint_lock);
frag = opal_list_remove_first(&(ep->pending_put_frags)); frag = opal_list_remove_first(&(ep->pending_put_frags));
OPAL_THREAD_UNLOCK(&ep->endpoint_lock); OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
if(NULL == frag) if (NULL == frag)
break; break;
rc = mca_btl_openib_put((mca_btl_base_module_t*)openib_btl, ep, rc = mca_btl_openib_put_internal ((mca_btl_base_module_t*)openib_btl, ep,
&to_base_frag(frag)->base); to_put_frag(frag));
if(OPAL_ERR_OUT_OF_RESOURCE == rc) if (OPAL_ERR_OUT_OF_RESOURCE == rc) {
OPAL_THREAD_LOCK(&ep->endpoint_lock);
opal_list_prepend (&ep->pending_put_frags, frag);
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
break; break;
}
} }
} }
@ -2925,7 +2960,7 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
/* advance the segment address past the header and subtract from the /* advance the segment address past the header and subtract from the
* length.*/ * length.*/
des->des_local->seg_len = byte_len - sizeof(mca_btl_openib_header_t); des->des_segments->seg_len = byte_len - sizeof(mca_btl_openib_header_t);
if(OPAL_LIKELY(!(is_credit_msg = is_credit_message(frag)))) { if(OPAL_LIKELY(!(is_credit_msg = is_credit_message(frag)))) {
/* call registered callback */ /* call registered callback */
@ -2960,7 +2995,7 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
} }
} else { } else {
mca_btl_openib_rdma_credits_header_t *chdr = mca_btl_openib_rdma_credits_header_t *chdr =
(mca_btl_openib_rdma_credits_header_t *) des->des_local->seg_addr.pval; (mca_btl_openib_rdma_credits_header_t *) des->des_segments->seg_addr.pval;
if(ep->nbo) { if(ep->nbo) {
BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*chdr); BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*chdr);
} }
@ -3266,11 +3301,27 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
/* Handle work completions */ /* Handle work completions */
switch(wc->opcode) { switch(wc->opcode) {
case IBV_WC_RDMA_READ: case IBV_WC_RDMA_READ:
OPAL_OUTPUT((-1, "Got WC: RDMA_READ")); case IBV_WC_COMP_SWAP:
OPAL_THREAD_ADD32(&endpoint->get_tokens, 1); case IBV_WC_FETCH_ADD:
/* fall through */ OPAL_OUTPUT((-1, "Got WC: RDMA_READ or RDMA_WRITE"));
OPAL_THREAD_ADD32(&endpoint->get_tokens, 1);
mca_btl_openib_get_frag_t *get_frag = to_get_frag(des);
get_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr,
get_frag->cb.local_handle, get_frag->cb.context, get_frag->cb.data,
OPAL_SUCCESS);
case IBV_WC_RDMA_WRITE: case IBV_WC_RDMA_WRITE:
if (MCA_BTL_OPENIB_FRAG_SEND_USER == openib_frag_type(des)) {
mca_btl_openib_put_frag_t *put_frag = to_put_frag(des);
put_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr,
put_frag->cb.local_handle, put_frag->cb.context, put_frag->cb.data,
OPAL_SUCCESS);
put_frag->cb.func = NULL;
}
/* fall through */
case IBV_WC_SEND: case IBV_WC_SEND:
OPAL_OUTPUT((-1, "Got WC: RDMA_WRITE or SEND")); OPAL_OUTPUT((-1, "Got WC: RDMA_WRITE or SEND"));
if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) { if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
@ -3299,7 +3350,7 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
/* Process a completed send/put/get */ /* Process a completed send/put/get */
btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
des->des_cbfunc(&openib_btl->super, endpoint, des,OPAL_SUCCESS); des->des_cbfunc(&openib_btl->super, endpoint, des, OPAL_SUCCESS);
} }
if( btl_ownership ) { if( btl_ownership ) {
mca_btl_openib_free(&openib_btl->super, des); mca_btl_openib_free(&openib_btl->super, des);

Просмотреть файл

@ -1,4 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
@ -11,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2006-2009 Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2006-2009 Mellanox Technologies, Inc. All rights reserved.
@ -51,7 +51,7 @@
static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint); static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint);
static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint); static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint);
static inline int acruire_wqe(mca_btl_openib_endpoint_t *ep, static inline int acquire_wqe(mca_btl_openib_endpoint_t *ep,
mca_btl_openib_send_frag_t *frag) mca_btl_openib_send_frag_t *frag)
{ {
int qp = to_base_frag(frag)->base.order; int qp = to_base_frag(frag)->base.order;
@ -67,91 +67,34 @@ static inline int acruire_wqe(mca_btl_openib_endpoint_t *ep,
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }
static int acquire_send_credit(mca_btl_openib_endpoint_t *endpoint,
mca_btl_openib_send_frag_t *frag)
{
mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
int qp = to_base_frag(frag)->base.order;
int prio = !(to_base_frag(frag)->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY);
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
if(OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, -1) < 0) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
opal_list_append(&endpoint->qps[qp].no_credits_pending_frags[prio],
(opal_list_item_t *)frag);
return OPAL_ERR_OUT_OF_RESOURCE;
}
} else {
if(OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, -1) < 0)
{
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
OPAL_THREAD_LOCK(&openib_btl->ib_lock);
opal_list_append(&openib_btl->qps[qp].u.srq_qp.pending_frags[prio],
(opal_list_item_t *)frag);
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
return OPAL_ERR_OUT_OF_RESOURCE;
}
}
return OPAL_SUCCESS;
}
/* this function is called with endpoint->endpoint_lock held */ /* this function is called with endpoint->endpoint_lock held */
int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint, int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint,
mca_btl_openib_send_frag_t *frag) mca_btl_openib_send_frag_t *frag)
{ {
int prio = !(to_base_frag(frag)->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY);
mca_btl_openib_header_t *hdr = frag->hdr; mca_btl_openib_header_t *hdr = frag->hdr;
mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base; mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
int qp, ib_rc; int qp, ib_rc, rc;
int32_t cm_return;
bool do_rdma = false; bool do_rdma = false;
size_t eager_limit; size_t size;
if(OPAL_LIKELY(des->order == MCA_BTL_NO_ORDER)) if(OPAL_LIKELY(des->order == MCA_BTL_NO_ORDER))
des->order = frag->qp_idx; des->order = frag->qp_idx;
qp = des->order; qp = des->order;
if(acruire_wqe(endpoint, frag) != OPAL_SUCCESS) if(acquire_wqe(endpoint, frag) != OPAL_SUCCESS)
return OPAL_ERR_RESOURCE_BUSY; return OPAL_ERR_RESOURCE_BUSY;
eager_limit = mca_btl_openib_component.eager_limit + size = des->des_segments->seg_len + frag->coalesced_length;
sizeof(mca_btl_openib_header_coalesced_t) +
sizeof(mca_btl_openib_control_header_t);
if(des->des_local->seg_len + frag->coalesced_length <= eager_limit &&
(des->des_flags & MCA_BTL_DES_FLAGS_PRIORITY)) {
/* High priority frag. Try to send over eager RDMA */
if(acquire_eager_rdma_send_credit(endpoint) == OPAL_SUCCESS)
do_rdma = true;
}
if(!do_rdma && acquire_send_credit(endpoint, frag) != OPAL_SUCCESS) { rc = mca_btl_openib_endpoint_credit_acquire (endpoint, qp, prio, size,
&do_rdma, frag, true);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
qp_put_wqe(endpoint, qp); qp_put_wqe(endpoint, qp);
return OPAL_ERR_RESOURCE_BUSY; return OPAL_ERR_RESOURCE_BUSY;
} }
BTL_OPENIB_GET_CREDITS(endpoint->eager_rdma_local.credits, hdr->credits);
if(hdr->credits)
hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG;
if(!do_rdma) {
if(BTL_OPENIB_QP_TYPE_PP(qp) && 0 == hdr->credits) {
BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits);
}
} else {
hdr->credits |= (qp << 11);
}
BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
/* cm_seen is only 8 bytes, but cm_return is 32 bytes */
if(cm_return > 255) {
hdr->cm_seen = 255;
cm_return -= 255;
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
} else {
hdr->cm_seen = cm_return;
}
qp_reset_signal_count(endpoint, qp); qp_reset_signal_count(endpoint, qp);
ib_rc = post_send(endpoint, frag, do_rdma, 1); ib_rc = post_send(endpoint, frag, do_rdma, 1);
@ -161,27 +104,12 @@ int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint,
if(endpoint->nbo) if(endpoint->nbo)
BTL_OPENIB_HEADER_NTOH(*hdr); BTL_OPENIB_HEADER_NTOH(*hdr);
if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) { mca_btl_openib_endpoint_credit_release (endpoint, qp, do_rdma, frag);
OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
BTL_OPENIB_CREDITS(hdr->credits));
}
qp_put_wqe(endpoint, qp); qp_put_wqe(endpoint, qp);
if(do_rdma) { BTL_ERROR(("error posting send request error %d: %s. size = %lu\n",
OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1); ib_rc, strerror(ib_rc), size));
} else {
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits,
hdr->credits);
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
} else if BTL_OPENIB_QP_TYPE_SRQ(qp){
mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
}
}
BTL_ERROR(("error posting send request error %d: %s\n",
ib_rc, strerror(ib_rc)));
return OPAL_ERROR; return OPAL_ERROR;
} }
@ -690,8 +618,8 @@ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
/* We need to post this one */ /* We need to post this one */
if (OPAL_ERROR == mca_btl_openib_endpoint_post_send(endpoint, frag)) { if (OPAL_ERROR == mca_btl_openib_endpoint_post_send(endpoint, frag)) {
BTL_ERROR(("Error posting send")); BTL_ERROR(("Error posting send"));
} }
} }
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
@ -10,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights * Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved. * Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved.
@ -610,6 +611,101 @@ static inline int post_send(mca_btl_openib_endpoint_t *ep,
return ibv_post_send(ep->qps[qp].qp->lcl_qp, sr_desc, &bad_wr); return ibv_post_send(ep->qps[qp].qp->lcl_qp, sr_desc, &bad_wr);
} }
/* called with the endpoint lock held */
/**
 * Acquire the send credits needed to post a fragment on the given queue pair.
 *
 * Called with the endpoint lock held (see the comment preceding this
 * function in the file).
 *
 * @param endpoint    destination endpoint
 * @param qp          queue pair index the fragment will be posted on
 * @param prio        non-zero selects the high-priority pending-frag list;
 *                    also gates the eager-RDMA fast path
 * @param size        total payload size (segment + coalesced data)
 * @param do_rdma     [out] set true iff an eager-RDMA send credit was taken
 * @param frag        fragment being sent; its header is updated with
 *                    piggybacked credit information
 * @param queue_frag  if true, queue the fragment on the appropriate
 *                    pending list when credits are exhausted
 *
 * @return OPAL_SUCCESS when credits were acquired, or
 *         OPAL_ERR_OUT_OF_RESOURCE when none are available (the fragment
 *         has been queued if queue_frag was true).
 */
static inline int mca_btl_openib_endpoint_credit_acquire (struct mca_btl_base_endpoint_t *endpoint, int qp,
                                                          int prio, size_t size, bool *do_rdma,
                                                          mca_btl_openib_send_frag_t *frag, bool queue_frag)
{
    mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
    mca_btl_openib_header_t *hdr = frag->hdr;
    size_t eager_limit;
    int32_t cm_return;

    /* eager-RDMA slots are sized to hold the eager limit plus the coalesced
     * and control headers, so anything under this bound may go that route */
    eager_limit = mca_btl_openib_component.eager_limit +
        sizeof(mca_btl_openib_header_coalesced_t) +
        sizeof(mca_btl_openib_control_header_t);

    if (!(prio && size < eager_limit && acquire_eager_rdma_send_credit(endpoint) == OPAL_SUCCESS)) {
        /* no eager-RDMA credit; fall back to a normal send credit */
        *do_rdma = false;

        if (BTL_OPENIB_QP_TYPE_PP(qp)) {
            /* per-peer QP: credits are tracked per endpoint.  The add/undo
             * pattern makes the decrement-and-test atomic. */
            if (OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, -1) < 0) {
                OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
                if (queue_frag) {
                    opal_list_append(&endpoint->qps[qp].no_credits_pending_frags[prio],
                                     (opal_list_item_t *)frag);
                }
                return OPAL_ERR_OUT_OF_RESOURCE;
            }
        } else {
            /* shared-receive-queue QP: credits are tracked per module, so the
             * pending list is protected by the module lock */
            if(OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, -1) < 0) {
                OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
                if (queue_frag) {
                    OPAL_THREAD_LOCK(&openib_btl->ib_lock);
                    opal_list_append(&openib_btl->qps[qp].u.srq_qp.pending_frags[prio],
                                     (opal_list_item_t *)frag);
                    OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
                }
                return OPAL_ERR_OUT_OF_RESOURCE;
            }
        }
    } else {
        /* High priority frag. Try to send over eager RDMA */
        *do_rdma = true;
    }

    /* Set all credits */
    /* piggyback any locally-accumulated eager-RDMA credits on the header */
    BTL_OPENIB_GET_CREDITS(endpoint->eager_rdma_local.credits, hdr->credits);
    if (hdr->credits) {
        hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG;
    }

    if (!*do_rdma) {
        if (BTL_OPENIB_QP_TYPE_PP(qp) && 0 == hdr->credits) {
            /* no rdma credits to return; return receive credits instead */
            BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits);
        }
    } else {
        /* eager-RDMA path encodes the qp index in the upper credit bits */
        hdr->credits |= (qp << 11);
    }

    BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    /* cm_seen is only 8 bytes, but cm_return is 32 bytes */
    if(cm_return > 255) {
        hdr->cm_seen = 255;
        cm_return -= 255;
        /* put the remainder back to be returned on a later send */
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    } else {
        hdr->cm_seen = cm_return;
    }

    return OPAL_SUCCESS;
}
/* called with the endpoint lock held. */
/**
 * Undo the credit updates made by mca_btl_openib_endpoint_credit_acquire()
 * after a failed post. Called with the endpoint lock held.
 *
 * @param endpoint  endpoint the fragment was destined for
 * @param qp        queue pair index the fragment was ordered on
 * @param do_rdma   true if an eager-RDMA token had been consumed
 * @param frag      fragment whose header carries the piggybacked credits
 */
static inline void mca_btl_openib_endpoint_credit_release (struct mca_btl_base_endpoint_t *endpoint, int qp,
                                                           bool do_rdma, mca_btl_openib_send_frag_t *frag)
{
    mca_btl_openib_header_t *header = frag->hdr;

    /* return any eager-RDMA credits that were piggybacked on the header */
    if (BTL_OPENIB_IS_RDMA_CREDITS(header->credits)) {
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
                          BTL_OPENIB_CREDITS(header->credits));
    }

    if (do_rdma) {
        /* give back the eager-RDMA send token */
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
        return;
    }

    if (BTL_OPENIB_QP_TYPE_PP(qp)) {
        /* per-peer QP: restore the receive credits taken from the header
         * and the send credit taken at acquire time */
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, header->credits);
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
    } else if (BTL_OPENIB_QP_TYPE_SRQ(qp)) {
        /* shared-receive-queue QP: the send credit lives on the module */
        mca_btl_openib_module_t *module = endpoint->endpoint_btl;
        OPAL_THREAD_ADD32(&module->qps[qp].u.srq_qp.sd_credits, 1);
    }
}
END_C_DECLS END_C_DECLS
#endif #endif

Просмотреть файл

@ -153,8 +153,8 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
if (NULL != btlname) free(btlname); if (NULL != btlname) free(btlname);
/* Since we believe we have done a send, read or write, then the /* Since we believe we have done a send, read or write, then the
* des_local fields should have valid data. */ * des_segments fields should have valid data. */
assert(des->des_local != NULL); assert(des->des_segments != NULL);
/* If the endpoint is not yet in the MCA_BTL_IB_CLOSED state, then /* If the endpoint is not yet in the MCA_BTL_IB_CLOSED state, then
* change the status. Since this connection was mapped out in the * change the status. Since this connection was mapped out in the

Просмотреть файл

@ -68,8 +68,8 @@ static void out_constructor(mca_btl_openib_out_frag_t *frag)
{ {
mca_btl_openib_frag_t *base_frag = to_base_frag(frag); mca_btl_openib_frag_t *base_frag = to_base_frag(frag);
base_frag->base.des_local = &base_frag->segment.base; base_frag->base.des_segments = &base_frag->segment.base;
base_frag->base.des_local_count = 1; base_frag->base.des_segment_count = 1;
frag->sr_desc.wr_id = (uint64_t)(uintptr_t)frag; frag->sr_desc.wr_id = (uint64_t)(uintptr_t)frag;
frag->sr_desc.sg_list = &to_com_frag(frag)->sg_entry; frag->sr_desc.sg_list = &to_com_frag(frag)->sg_entry;
@ -83,8 +83,8 @@ static void in_constructor(mca_btl_openib_in_frag_t *frag)
{ {
mca_btl_openib_frag_t *base_frag = to_base_frag(frag); mca_btl_openib_frag_t *base_frag = to_base_frag(frag);
base_frag->base.des_local = &base_frag->segment.base; base_frag->base.des_segments = &base_frag->segment.base;
base_frag->base.des_local_count = 1; base_frag->base.des_segment_count = 1;
} }
static void send_constructor(mca_btl_openib_send_frag_t *frag) static void send_constructor(mca_btl_openib_send_frag_t *frag)
@ -134,6 +134,7 @@ static void put_constructor(mca_btl_openib_put_frag_t *frag)
{ {
to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_SEND_USER; to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_SEND_USER;
to_out_frag(frag)->sr_desc.opcode = IBV_WR_RDMA_WRITE; to_out_frag(frag)->sr_desc.opcode = IBV_WR_RDMA_WRITE;
frag->cb.func = NULL;
} }
static void get_constructor(mca_btl_openib_get_frag_t *frag) static void get_constructor(mca_btl_openib_get_frag_t *frag)
@ -154,8 +155,8 @@ static void coalesced_constructor(mca_btl_openib_coalesced_frag_t *frag)
base_frag->type = MCA_BTL_OPENIB_FRAG_COALESCED; base_frag->type = MCA_BTL_OPENIB_FRAG_COALESCED;
base_frag->base.des_local = &base_frag->segment.base; base_frag->base.des_segments = &base_frag->segment.base;
base_frag->base.des_local_count = 1; base_frag->base.des_segment_count = 1;
} }
OBJ_CLASS_INSTANCE( OBJ_CLASS_INSTANCE(

Просмотреть файл

@ -349,7 +349,15 @@ OBJ_CLASS_DECLARATION(mca_btl_openib_recv_frag_t);
#define to_recv_frag(f) ((mca_btl_openib_recv_frag_t*)(f)) #define to_recv_frag(f) ((mca_btl_openib_recv_frag_t*)(f))
typedef struct mca_btl_openib_out_frag_t mca_btl_openib_put_frag_t; typedef struct mca_btl_openib_put_frag_t {
mca_btl_openib_out_frag_t super;
struct {
mca_btl_base_rdma_completion_fn_t func;
mca_btl_base_registration_handle_t *local_handle;
void *context;
void *data;
} cb;
} mca_btl_openib_put_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_put_frag_t); OBJ_CLASS_DECLARATION(mca_btl_openib_put_frag_t);
#define to_put_frag(f) ((mca_btl_openib_put_frag_t*)(f)) #define to_put_frag(f) ((mca_btl_openib_put_frag_t*)(f))
@ -357,6 +365,12 @@ OBJ_CLASS_DECLARATION(mca_btl_openib_put_frag_t);
typedef struct mca_btl_openib_get_frag_t { typedef struct mca_btl_openib_get_frag_t {
mca_btl_openib_in_frag_t super; mca_btl_openib_in_frag_t super;
struct ibv_send_wr sr_desc; struct ibv_send_wr sr_desc;
struct {
mca_btl_base_rdma_completion_fn_t func;
mca_btl_base_registration_handle_t *local_handle;
void *context;
void *data;
} cb;
} mca_btl_openib_get_frag_t; } mca_btl_openib_get_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_get_frag_t); OBJ_CLASS_DECLARATION(mca_btl_openib_get_frag_t);
@ -371,6 +385,7 @@ typedef struct mca_btl_openib_coalesced_frag_t {
mca_btl_openib_frag_t super; mca_btl_openib_frag_t super;
mca_btl_openib_send_frag_t *send_frag; mca_btl_openib_send_frag_t *send_frag;
mca_btl_openib_header_coalesced_t *hdr; mca_btl_openib_header_coalesced_t *hdr;
bool sent;
} mca_btl_openib_coalesced_frag_t; } mca_btl_openib_coalesced_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_coalesced_frag_t); OBJ_CLASS_DECLARATION(mca_btl_openib_coalesced_frag_t);

159
opal/mca/btl/openib/btl_openib_get.c Обычный файл
Просмотреть файл

@ -0,0 +1,159 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_openib.h"
#include "btl_openib_frag.h"
#include "btl_openib_endpoint.h"
#include "btl_openib_xrc.h"
/*
* RDMA READ remote buffer to local buffer address.
*/
/**
 * Initiate an RDMA READ from a remote buffer into a local buffer.
 *
 * Allocates and fills in a get fragment, then either posts it immediately,
 * queues it until the endpoint connects, or queues it until send resources
 * become available. The fragment is released by the completion path
 * (MCA_BTL_DES_FLAGS_BTL_OWNERSHIP) — presumably when the work completion
 * fires; the completion handler is not visible here.
 *
 * @param btl            openib BTL module
 * @param ep             destination endpoint
 * @param local_address  local destination buffer
 * @param remote_address remote source address
 * @param local_handle   registration covering local_address (supplies lkey)
 * @param remote_handle  registration covering remote_address (supplies rkey)
 * @param size           number of bytes to read (must be <= btl_get_limit)
 * @param flags          unused here
 * @param order          requested ordering QP, or MCA_BTL_NO_ORDER
 * @param cbfunc/cbcontext/cbdata  completion callback and its arguments
 *
 * @return OPAL_SUCCESS (posted or queued), OPAL_ERR_BAD_PARAM if size
 *         exceeds the get limit, OPAL_ERR_OUT_OF_RESOURCE if no fragment
 *         is available, or an error from the connection check / post.
 */
int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address,
                        uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                        mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
                        int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    mca_btl_openib_get_frag_t* frag = NULL;
    int qp = order;
    int rc;

    if (OPAL_UNLIKELY(size > btl->btl_get_limit)) {
        return OPAL_ERR_BAD_PARAM;
    }

    frag = to_get_frag(alloc_recv_user_frag());
    if (OPAL_UNLIKELY(NULL == frag)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* no ordering requested: use the component's designated RDMA qp */
    if (MCA_BTL_NO_ORDER == qp) {
        qp = mca_btl_openib_component.rdma_qp;
    }

    /* set base descriptor flags */
    to_base_frag(frag)->base.order = qp;
    /* free this descriptor when the operation is complete */
    to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;

    /* set up scatter-gather entry */
    to_com_frag(frag)->sg_entry.length = size;
    to_com_frag(frag)->sg_entry.lkey = local_handle->lkey;
    to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t) local_address;
    to_com_frag(frag)->endpoint = ep;

    /* set up rdma callback */
    frag->cb.func = cbfunc;
    frag->cb.context = cbcontext;
    frag->cb.data = cbdata;
    frag->cb.local_handle = local_handle;

    /* set up descriptor */
    frag->sr_desc.wr.rdma.remote_addr = remote_address;
    /* the opcode may have been changed by an atomic operation */
    frag->sr_desc.opcode = IBV_WR_RDMA_READ;

#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    /* the peer sent its rkey in its native byte order; swap when the
     * endianness of the two processes differs */
    if((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
       != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
        frag->sr_desc.wr.rdma.rkey = opal_swap_bytes4 (remote_handle->rkey);
    } else
#endif
    {
        frag->sr_desc.wr.rdma.rkey = remote_handle->rkey;
    }

#if HAVE_XRC
    if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
        frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num;
    }
#endif

    if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
        OPAL_THREAD_LOCK(&ep->endpoint_lock);
        /* re-checks the state under the lock; may queue the fragment on
         * pending_get_frags until the connection completes */
        rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_get_frags);
        OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
        if (OPAL_ERR_RESOURCE_BUSY == rc) {
            /* descriptor was queued pending connection */
            return OPAL_SUCCESS;
        }
        if (OPAL_SUCCESS != rc) {
            MCA_BTL_IB_FRAG_RETURN (frag);
            return rc;
        }
    }

    rc = mca_btl_openib_get_internal (btl, ep, frag);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
            /* out of wqes or get tokens: queue for later and report success */
            rc = OPAL_SUCCESS;
            OPAL_THREAD_LOCK(&ep->endpoint_lock);
            opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag);
            OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
        } else {
            MCA_BTL_IB_FRAG_RETURN (frag);
        }
    }

    return rc;
}
/**
 * Post an already-prepared get fragment to the hardware.
 *
 * Takes one send work-queue entry and one get token for the fragment's
 * queue pair; both are returned on any failure.
 *
 * @return OPAL_SUCCESS, OPAL_ERR_OUT_OF_RESOURCE when no wqe/token is
 *         available (caller may queue and retry), or OPAL_ERROR if the
 *         verbs post fails.
 */
int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
                                 mca_btl_openib_get_frag_t *frag)
{
    int order = to_base_frag(frag)->base.order;
    struct ibv_send_wr *bad_wr;

    /* reserve a send work-queue entry; undo the reservation on failure */
    if (qp_get_wqe(ep, order) < 0) {
        qp_put_wqe(ep, order);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* reserve a get token (bounds outstanding RDMA reads) */
    if (OPAL_THREAD_ADD32(&ep->get_tokens,-1) < 0) {
        qp_put_wqe(ep, order);
        OPAL_THREAD_ADD32(&ep->get_tokens,1);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    qp_inflight_wqe_to_frag(ep, order, to_com_frag(frag));
    qp_reset_signal_count(ep, order);

    /* hand the work request to the hardware */
    if (ibv_post_send(ep->qps[order].qp->lcl_qp, &frag->sr_desc, &bad_wr)) {
        /* post failed: give back both the wqe and the token */
        qp_put_wqe(ep, order);
        OPAL_THREAD_ADD32(&ep->get_tokens,1);
        return OPAL_ERROR;
    }

    return OPAL_SUCCESS;
}

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
@ -11,7 +12,7 @@
* All rights reserved. * All rights reserved.
* Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights * Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
@ -567,10 +568,16 @@ int btl_openib_register_mca_params(void)
mca_btl_openib_module.super.btl_rdma_pipeline_frag_size = 1024 * 1024; mca_btl_openib_module.super.btl_rdma_pipeline_frag_size = 1024 * 1024;
mca_btl_openib_module.super.btl_min_rdma_pipeline_size = 256 * 1024; mca_btl_openib_module.super.btl_min_rdma_pipeline_size = 256 * 1024;
mca_btl_openib_module.super.btl_flags = MCA_BTL_FLAGS_RDMA | mca_btl_openib_module.super.btl_flags = MCA_BTL_FLAGS_RDMA |
MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA; MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA;
#if BTL_OPENIB_FAILOVER_ENABLED #if BTL_OPENIB_FAILOVER_ENABLED
mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_FAILOVER_SUPPORT; mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_FAILOVER_SUPPORT;
#endif #endif
#if HAVE_DECL_IBV_ATOMIC_HCA
mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS;
mca_btl_openib_module.super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_CSWAP;
#endif
/* Default to bandwidth auto-detection */ /* Default to bandwidth auto-detection */
mca_btl_openib_module.super.btl_bandwidth = 0; mca_btl_openib_module.super.btl_bandwidth = 0;
mca_btl_openib_module.super.btl_latency = 4; mca_btl_openib_module.super.btl_latency = 4;

152
opal/mca/btl/openib/btl_openib_put.c Обычный файл
Просмотреть файл

@ -0,0 +1,152 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_openib.h"
#include "btl_openib_frag.h"
#include "btl_openib_endpoint.h"
#include "btl_openib_xrc.h"
/*
* RDMA WRITE local buffer to remote buffer address.
*/
/**
 * Initiate an RDMA WRITE from a local buffer to a remote buffer.
 *
 * Allocates and fills in a put fragment, then either posts it immediately,
 * queues it until the endpoint connects, or queues it until send resources
 * become available. The fragment is released by the completion path
 * (MCA_BTL_DES_FLAGS_BTL_OWNERSHIP).
 *
 * @param btl            openib BTL module
 * @param ep             destination endpoint
 * @param local_address  local source buffer
 * @param remote_address remote destination address
 * @param local_handle   registration covering local_address (supplies lkey)
 * @param remote_handle  registration covering remote_address (supplies rkey)
 * @param size           number of bytes to write (must be <= btl_put_limit)
 * @param flags          unused here
 * @param order          requested ordering QP, or MCA_BTL_NO_ORDER
 * @param cbfunc/cbcontext/cbdata  completion callback and its arguments
 *
 * @return OPAL_SUCCESS (posted or queued), OPAL_ERR_BAD_PARAM if size
 *         exceeds the put limit, OPAL_ERR_OUT_OF_RESOURCE if no fragment
 *         is available, or an error from the connection check / post.
 */
int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address,
                        uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                        mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
                        int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    mca_btl_openib_put_frag_t *frag = NULL;
    int rc, qp = order;

    if (OPAL_UNLIKELY(size > btl->btl_put_limit)) {
        return OPAL_ERR_BAD_PARAM;
    }

    frag = to_put_frag(alloc_send_user_frag ());
    if (OPAL_UNLIKELY(NULL == frag)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* no ordering requested: use the component's designated RDMA qp */
    if (MCA_BTL_NO_ORDER == qp) {
        qp = mca_btl_openib_component.rdma_qp;
    }

    /* set base descriptor flags */
    to_base_frag(frag)->base.order = qp;
    /* free this descriptor when the operation is complete */
    to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;

    /* set up scatter-gather entry */
    to_com_frag(frag)->sg_entry.length = size;
    to_com_frag(frag)->sg_entry.lkey = local_handle->lkey;
    to_com_frag(frag)->sg_entry.addr = (uint64_t)(intptr_t) local_address;
    to_com_frag(frag)->endpoint = ep;

    /* set up rdma callback */
    frag->cb.func = cbfunc;
    frag->cb.context = cbcontext;
    frag->cb.data = cbdata;
    frag->cb.local_handle = local_handle;

    /* post descriptor */
    to_out_frag(frag)->sr_desc.opcode = IBV_WR_RDMA_WRITE;
    to_out_frag(frag)->sr_desc.send_flags = ib_send_flags(size, &(ep->qps[qp]), 1);
    to_out_frag(frag)->sr_desc.wr.rdma.remote_addr = remote_address;

#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    /* the peer sent its rkey in its native byte order; swap when the
     * endianness of the two processes differs */
    if ((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
        != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
        to_out_frag(frag)->sr_desc.wr.rdma.rkey = opal_swap_bytes4(remote_handle->rkey);
    } else
#endif
    {
        to_out_frag(frag)->sr_desc.wr.rdma.rkey = remote_handle->rkey;
    }

#if HAVE_XRC
    if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp))
        to_out_frag(frag)->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
#endif

    if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
        OPAL_THREAD_LOCK(&ep->endpoint_lock);
        /* re-checks the state under the lock; may queue the fragment on
         * pending_put_frags until the connection completes */
        rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_put_frags);
        OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
        if (OPAL_ERR_RESOURCE_BUSY == rc) {
            /* descriptor was queued pending connection */
            return OPAL_SUCCESS;
        }
        if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
            MCA_BTL_IB_FRAG_RETURN (frag);
            return rc;
        }
    }

    rc = mca_btl_openib_put_internal (btl, ep, frag);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
            rc = OPAL_SUCCESS;
            /* queue the fragment for when resources are available */
            OPAL_THREAD_LOCK(&ep->endpoint_lock);
            opal_list_append(&ep->pending_put_frags, (opal_list_item_t*)frag);
            OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
        } else {
            MCA_BTL_IB_FRAG_RETURN (frag);
        }
    }

    return rc;
}
/**
 * Post an already-prepared put fragment to the hardware.
 *
 * Takes one send work-queue entry for the fragment's queue pair; the entry
 * is returned on any failure. Mirrors mca_btl_openib_get_internal() except
 * that puts do not consume a token.
 *
 * Fixes vs. previous revision: removed the stray double semicolon on the
 * error return and the dead store to `rc` (the verbs error code was
 * captured but never used).
 *
 * @return OPAL_SUCCESS, OPAL_ERR_OUT_OF_RESOURCE when no wqe is available
 *         (caller may queue and retry), or OPAL_ERROR if the verbs post
 *         fails.
 */
int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
                                 mca_btl_openib_put_frag_t *frag)
{
    int qp = to_base_frag(frag)->base.order;
    struct ibv_send_wr *bad_wr;

    /* check for a send wqe; undo the reservation on failure */
    if (qp_get_wqe(ep, qp) < 0) {
        qp_put_wqe(ep, qp);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
    qp_reset_signal_count(ep, qp);

    /* hand the work request to the hardware */
    if (0 != ibv_post_send(ep->qps[qp].qp->lcl_qp, &to_out_frag(frag)->sr_desc, &bad_wr)) {
        qp_put_wqe(ep, qp);
        return OPAL_ERROR;
    }

    return OPAL_SUCCESS;
}

Просмотреть файл

@ -126,7 +126,6 @@ AC_DEFUN([MCA_opal_btl_openib_CONFIG],[
[enable openib BTL failover]) [enable openib BTL failover])
AM_CONDITIONAL([MCA_btl_openib_enable_failover], [test "x$btl_openib_failover_enabled" = "x1"]) AM_CONDITIONAL([MCA_btl_openib_enable_failover], [test "x$btl_openib_failover_enabled" = "x1"])
# Check for __malloc_hook availability # Check for __malloc_hook availability
AC_ARG_ENABLE(btl-openib-malloc-alignment, AC_ARG_ENABLE(btl-openib-malloc-alignment,
AC_HELP_STRING([--enable-btl-openib-malloc-alignment], [Enable support for allocated memory alignment. Default: enabled if supported, disabled otherwise.])) AC_HELP_STRING([--enable-btl-openib-malloc-alignment], [Enable support for allocated memory alignment. Default: enabled if supported, disabled otherwise.]))

Просмотреть файл

@ -321,16 +321,17 @@ static int udcm_send_request (mca_btl_base_endpoint_t *lcl_ep,
static void udcm_send_timeout (evutil_socket_t fd, short event, void *arg); static void udcm_send_timeout (evutil_socket_t fd, short event, void *arg);
static int udcm_finish_connection (mca_btl_openib_endpoint_t *lcl_ep); static int udcm_finish_connection (mca_btl_openib_endpoint_t *lcl_ep);
static int udcm_rc_qps_to_rts(mca_btl_openib_endpoint_t *lcl_ep);
/* XRC support */ /* XRC support */
#if HAVE_XRC #if HAVE_XRC
static int udcm_xrc_start_connect (opal_btl_openib_connect_base_module_t *cpc, static int udcm_xrc_start_connect (opal_btl_openib_connect_base_module_t *cpc,
mca_btl_base_endpoint_t *lcl_ep); mca_btl_base_endpoint_t *lcl_ep);
static int udcm_xrc_restart_connect (mca_btl_base_endpoint_t *lcl_ep); static int udcm_xrc_restart_connect (mca_btl_base_endpoint_t *lcl_ep);
static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr); static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn);
static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep); static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep);
static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep); static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep);
static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr); static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn);
static int udcm_xrc_send_request (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_endpoint_t *rem_ep, static int udcm_xrc_send_request (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_endpoint_t *rem_ep,
uint8_t msg_type); uint8_t msg_type);
static int udcm_xrc_send_xresponse (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_endpoint_t *rem_ep, static int udcm_xrc_send_xresponse (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_endpoint_t *rem_ep,
@ -507,6 +508,93 @@ static int udcm_component_finalize(void)
/* mark: udcm module */ /* mark: udcm module */
#if HAVE_XRC
/**
 * Set up a loopback (self) connection for an XRC endpoint.
 *
 * Connects the local XRC receive QP to itself, posts receives, then
 * creates/connects the matching receive and send sides using the local
 * QP number and PSN as the "remote" values. On success the endpoint is
 * marked connected and the standard connection-completion path is run.
 * The step order matters: receives must be posted before the send side
 * is brought up. All steps run under the per-endpoint cpc lock.
 *
 * @param lcl_ep  endpoint whose peer is the local process
 * @return OPAL_SUCCESS or the error from the first failing step
 */
static int udcm_endpoint_init_self_xrc (struct mca_btl_base_endpoint_t *lcl_ep)
{
    udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep);
    int rc;
    opal_mutex_lock (&udep->udep_lock);
    do {
        rc = udcm_xrc_recv_qp_connect (lcl_ep);
        if (OPAL_SUCCESS != rc) {
            BTL_VERBOSE(("error connecting loopback XRC receive queue pair"));
            break;
        }
        /* loopback: the "remote" receive qp is our own qp 0 */
        lcl_ep->xrc_recv_qp_num = lcl_ep->qps[0].qp->lcl_qp->qp_num;
        rc = mca_btl_openib_endpoint_post_recvs (lcl_ep);
        if (OPAL_SUCCESS != rc) {
            BTL_VERBOSE(("error posting receives for loopback queue pair"));
            break;
        }
        /* pass our own qp number/psn as the remote side's values */
        rc = udcm_xrc_recv_qp_create (lcl_ep, lcl_ep->qps[0].qp->lcl_qp->qp_num,
                                      lcl_ep->qps[0].qp->lcl_psn);
        if (OPAL_SUCCESS != rc) {
            BTL_VERBOSE(("error creating loopback XRC receive queue pair"));
            break;
        }
        rc = udcm_xrc_send_qp_connect (lcl_ep, lcl_ep->qps[0].qp->lcl_qp->qp_num,
                                       lcl_ep->qps[0].qp->lcl_psn);
        if (OPAL_SUCCESS != rc) {
            BTL_VERBOSE(("error creating loopback XRC send queue pair"));
            break;
        }
        lcl_ep->endpoint_state = MCA_BTL_IB_CONNECTED;
        /* run the normal connection-completion path */
        rc = udcm_finish_connection (lcl_ep);
    } while (0);
    opal_mutex_unlock (&udep->udep_lock);
    return rc;
}
#endif
/**
 * Set up a loopback (self) connection for a regular RC endpoint.
 *
 * Creates the endpoint's queue pairs, records its own QP numbers/PSNs as
 * the "remote" connection info, moves the QPs to RTS, marks the endpoint
 * connected and runs the standard connection-completion path. All steps
 * run under the per-endpoint cpc lock.
 *
 * Fix vs. previous revision: the success path used to `return
 * OPAL_SUCCESS;` from inside the do/while, returning with udep->udep_lock
 * still held and discarding the result of udcm_finish_connection(). It
 * now falls through to the unlock and returns rc, matching
 * udcm_endpoint_init_self_xrc().
 *
 * @param lcl_ep  endpoint whose peer is the local process
 * @return OPAL_SUCCESS or the error from the first failing step
 */
static int udcm_endpoint_init_self (struct mca_btl_base_endpoint_t *lcl_ep)
{
    udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep);
    int rc;

    opal_mutex_lock (&udep->udep_lock);

    do {
        if (OPAL_SUCCESS != (rc = udcm_endpoint_init_data (lcl_ep))) {
            BTL_VERBOSE(("error initializing loopback endpoint cpc data"));
            break;
        }

        if (OPAL_SUCCESS != (rc = udcm_rc_qp_create_all (lcl_ep))) {
            BTL_VERBOSE(("error initializing loopback endpoint qps"));
            break;
        }

        /* save queue pair info: loopback, so the "remote" qp numbers and
         * psns are our own */
        lcl_ep->rem_info.rem_index = lcl_ep->index;

        for (int i = 0 ; i < mca_btl_openib_component.num_qps ; ++i) {
            lcl_ep->rem_info.rem_qps[i].rem_psn = lcl_ep->qps[i].qp->lcl_psn;
            lcl_ep->rem_info.rem_qps[i].rem_qp_num = lcl_ep->qps[i].qp->lcl_qp->qp_num;
        }

        if (OPAL_SUCCESS != (rc = udcm_rc_qps_to_rts (lcl_ep))) {
            BTL_VERBOSE(("error moving loopback endpoint qps to RTS"));
            break;
        }

        lcl_ep->endpoint_state = MCA_BTL_IB_CONNECTED;

        /* run the normal connection-completion path; fall through so the
         * lock is released and the result propagated */
        rc = udcm_finish_connection (lcl_ep);
    } while (0);

    opal_mutex_unlock (&udep->udep_lock);

    return rc;
}
static int udcm_endpoint_init (struct mca_btl_base_endpoint_t *lcl_ep) static int udcm_endpoint_init (struct mca_btl_base_endpoint_t *lcl_ep)
{ {
udcm_endpoint_t *udep = lcl_ep->endpoint_local_cpc_data = udcm_endpoint_t *udep = lcl_ep->endpoint_local_cpc_data =
@ -518,6 +606,16 @@ static int udcm_endpoint_init (struct mca_btl_base_endpoint_t *lcl_ep)
OBJ_CONSTRUCT(&udep->udep_lock, opal_mutex_t); OBJ_CONSTRUCT(&udep->udep_lock, opal_mutex_t);
if (lcl_ep->endpoint_proc->proc_opal == opal_proc_local_get ()) {
/* go ahead and try to create a loopback queue pair */
#if HAVE_XRC
if (mca_btl_openib_component.num_xrc_qps > 0) {
return udcm_endpoint_init_self_xrc (lcl_ep);
} else
#endif
return udcm_endpoint_init_self (lcl_ep);
}
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }
@ -1069,6 +1167,9 @@ static inline int udcm_rc_qp_to_init (struct ibv_qp *qp,
attr.pkey_index = btl->pkey_index; attr.pkey_index = btl->pkey_index;
attr.port_num = btl->port_num; attr.port_num = btl->port_num;
attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
#if HAVE_DECL_IBV_ATOMIC_HCA
attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC;
#endif
attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT |
IBV_QP_ACCESS_FLAGS; IBV_QP_ACCESS_FLAGS;
@ -2307,7 +2408,7 @@ static int udcm_xrc_restart_connect (mca_btl_base_endpoint_t *lcl_ep)
/* mark: xrc send qp */ /* mark: xrc send qp */
/* Send qp connect */ /* Send qp connect */
static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr) static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn)
{ {
mca_btl_openib_module_t *openib_btl = lcl_ep->endpoint_btl; mca_btl_openib_module_t *openib_btl = lcl_ep->endpoint_btl;
struct ibv_qp_attr attr; struct ibv_qp_attr attr;
@ -2316,7 +2417,7 @@ static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg
int ret; int ret;
BTL_VERBOSE(("Connecting send qp: %p, remote qp: %d", (void *)lcl_ep->qps[0].qp->lcl_qp, BTL_VERBOSE(("Connecting send qp: %p, remote qp: %d", (void *)lcl_ep->qps[0].qp->lcl_qp,
msg_hdr->data.xres.rem_qp_num)); rem_qp_num));
assert(NULL != lcl_ep->qps); assert(NULL != lcl_ep->qps);
qp = lcl_ep->qps[0].qp->lcl_qp; qp = lcl_ep->qps[0].qp->lcl_qp;
psn = lcl_ep->qps[0].qp->lcl_psn; psn = lcl_ep->qps[0].qp->lcl_psn;
@ -2326,8 +2427,8 @@ static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg
attr.qp_state = IBV_QPS_RTR; attr.qp_state = IBV_QPS_RTR;
attr.path_mtu = (openib_btl->device->mtu < lcl_ep->rem_info.rem_mtu) ? attr.path_mtu = (openib_btl->device->mtu < lcl_ep->rem_info.rem_mtu) ?
openib_btl->device->mtu : lcl_ep->rem_info.rem_mtu; openib_btl->device->mtu : lcl_ep->rem_info.rem_mtu;
attr.dest_qp_num = msg_hdr->data.xres.rem_qp_num; attr.dest_qp_num = rem_qp_num;
attr.rq_psn = msg_hdr->data.xres.rem_psn; attr.rq_psn = rem_psn;
attr.max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops; attr.max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops;
attr.min_rnr_timer = mca_btl_openib_component.ib_min_rnr_timer; attr.min_rnr_timer = mca_btl_openib_component.ib_min_rnr_timer;
attr.ah_attr.is_global = 0; attr.ah_attr.is_global = 0;
@ -2460,6 +2561,9 @@ static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep)
attr.pkey_index = openib_btl->pkey_index; attr.pkey_index = openib_btl->pkey_index;
attr.port_num = openib_btl->port_num; attr.port_num = openib_btl->port_num;
attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
#if HAVE_DECL_IBV_ATOMIC_HCA
attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC;
#endif
ret = ibv_modify_qp(*qp, &attr, ret = ibv_modify_qp(*qp, &attr,
IBV_QP_STATE | IBV_QP_STATE |
IBV_QP_PKEY_INDEX | IBV_QP_PKEY_INDEX |
@ -2501,7 +2605,7 @@ static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep)
} }
/* Recv qp create */ /* Recv qp create */
static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr) static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn)
{ {
mca_btl_openib_module_t* openib_btl = lcl_ep->endpoint_btl; mca_btl_openib_module_t* openib_btl = lcl_ep->endpoint_btl;
struct ibv_qp_init_attr qp_init_attr; struct ibv_qp_init_attr qp_init_attr;
@ -2525,6 +2629,11 @@ static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_
attr.pkey_index = openib_btl->pkey_index; attr.pkey_index = openib_btl->pkey_index;
attr.port_num = openib_btl->port_num; attr.port_num = openib_btl->port_num;
attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
#if HAVE_DECL_IBV_ATOMIC_HCA
attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC;
#endif
ret = ibv_modify_xrc_rcv_qp(openib_btl->device->xrc_domain, ret = ibv_modify_xrc_rcv_qp(openib_btl->device->xrc_domain,
lcl_ep->xrc_recv_qp_num, &attr, lcl_ep->xrc_recv_qp_num, &attr,
IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_STATE | IBV_QP_PKEY_INDEX |
@ -2540,8 +2649,8 @@ static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_
attr.qp_state = IBV_QPS_RTR; attr.qp_state = IBV_QPS_RTR;
attr.path_mtu = (openib_btl->device->mtu < lcl_ep->rem_info.rem_mtu) ? attr.path_mtu = (openib_btl->device->mtu < lcl_ep->rem_info.rem_mtu) ?
openib_btl->device->mtu : lcl_ep->rem_info.rem_mtu; openib_btl->device->mtu : lcl_ep->rem_info.rem_mtu;
attr.dest_qp_num = msg_hdr->data.xreq.rem_qp_num; attr.dest_qp_num = rem_qp_num;
attr.rq_psn = msg_hdr->data.xreq.rem_psn; attr.rq_psn = rem_psn;
attr.max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops; attr.max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops;
attr.min_rnr_timer = mca_btl_openib_component.ib_min_rnr_timer; attr.min_rnr_timer = mca_btl_openib_component.ib_min_rnr_timer;
attr.ah_attr.is_global = 0; attr.ah_attr.is_global = 0;
@ -2715,7 +2824,7 @@ static int udcm_xrc_handle_xconnect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg
response_type = UDCM_MESSAGE_XRESPONSE; response_type = UDCM_MESSAGE_XRESPONSE;
rc = udcm_xrc_recv_qp_create (lcl_ep, msg_hdr); rc = udcm_xrc_recv_qp_create (lcl_ep, msg_hdr->data.xreq.rem_qp_num, msg_hdr->data.xreq.rem_psn);
if (OPAL_SUCCESS != rc) { if (OPAL_SUCCESS != rc) {
break; break;
} }
@ -2761,7 +2870,7 @@ static int udcm_xrc_handle_xresponse (mca_btl_openib_endpoint_t *lcl_ep, udcm_ms
udep->recv_resp = true; udep->recv_resp = true;
rc = udcm_xrc_send_qp_connect (lcl_ep, msg_hdr); rc = udcm_xrc_send_qp_connect (lcl_ep, msg_hdr->data.xres.rem_qp_num, msg_hdr->data.xres.rem_psn);
if (OPAL_SUCCESS != rc) { if (OPAL_SUCCESS != rc) {
mca_btl_openib_endpoint_invoke_error (lcl_ep); mca_btl_openib_endpoint_invoke_error (lcl_ep);
} }

Просмотреть файл

@ -183,7 +183,7 @@ mca_btl_portals4_alloc(struct mca_btl_base_module_t* btl_base,
} }
frag->md_h = PTL_INVALID_HANDLE; frag->md_h = PTL_INVALID_HANDLE;
frag->base.des_local_count = 1; frag->base.des_segment_count = 1;
frag->base.des_flags = flags | MCA_BTL_DES_SEND_ALWAYS_CALLBACK; frag->base.des_flags = flags | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
frag->base.order = MCA_BTL_NO_ORDER; frag->base.order = MCA_BTL_NO_ORDER;
@ -272,7 +272,7 @@ mca_btl_portals4_prepare_src(struct mca_btl_base_module_t* btl_base,
} }
frag->segments[0].base.seg_len = max_data + reserve; frag->segments[0].base.seg_len = max_data + reserve;
frag->base.des_local_count = 1; frag->base.des_segment_count = 1;
} else { } else {
/* no need to pack - rdma operation out of user's buffer */ /* no need to pack - rdma operation out of user's buffer */
@ -302,7 +302,7 @@ mca_btl_portals4_prepare_src(struct mca_btl_base_module_t* btl_base,
frag->segments[0].base.seg_len = max_data; frag->segments[0].base.seg_len = max_data;
frag->segments[0].base.seg_addr.pval = iov.iov_base; frag->segments[0].base.seg_addr.pval = iov.iov_base;
frag->segments[0].key = OPAL_THREAD_ADD64(&(portals4_btl->portals_rdma_key), 1); frag->segments[0].key = OPAL_THREAD_ADD64(&(portals4_btl->portals_rdma_key), 1);
frag->base.des_local_count = 1; frag->base.des_segment_count = 1;
/* either a put or get. figure out which later */ /* either a put or get. figure out which later */
OPAL_OUTPUT_VERBOSE((90, opal_btl_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((90, opal_btl_base_framework.framework_output,
@ -348,7 +348,7 @@ mca_btl_portals4_prepare_src(struct mca_btl_base_module_t* btl_base,
(void *)frag, frag->me_h, me.start, me.length, (void *)frag, frag->me_h, me.start, me.length,
me.match_id.phys.nid, me.match_id.phys.pid, me.match_bits)); me.match_id.phys.nid, me.match_id.phys.pid, me.match_bits));
} }
frag->base.des_local = &frag->segments[0].base; frag->base.des_segments = &frag->segments[0].base;
frag->base.des_remote = NULL; frag->base.des_remote = NULL;
frag->base.des_remote_count = 0; frag->base.des_remote_count = 0;
frag->base.des_flags = flags | MCA_BTL_DES_SEND_ALWAYS_CALLBACK; frag->base.des_flags = flags | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
@ -390,8 +390,8 @@ mca_btl_portals4_prepare_dst(struct mca_btl_base_module_t* btl_base,
frag->segments[0].key = OPAL_THREAD_ADD64(&(portals4_btl->portals_rdma_key), 1); frag->segments[0].key = OPAL_THREAD_ADD64(&(portals4_btl->portals_rdma_key), 1);
frag->base.des_remote = NULL; frag->base.des_remote = NULL;
frag->base.des_remote_count = 0; frag->base.des_remote_count = 0;
frag->base.des_local = &frag->segments[0].base; frag->base.des_segments = &frag->segments[0].base;
frag->base.des_local_count = 1; frag->base.des_segment_count = 1;
frag->base.des_flags = flags | MCA_BTL_DES_SEND_ALWAYS_CALLBACK; frag->base.des_flags = flags | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
frag->base.order = MCA_BTL_NO_ORDER; frag->base.order = MCA_BTL_NO_ORDER;
frag->md_h = PTL_INVALID_HANDLE; frag->md_h = PTL_INVALID_HANDLE;

Просмотреть файл

@ -725,11 +725,11 @@ mca_btl_portals4_component_progress(void)
frag = ev.user_ptr; frag = ev.user_ptr;
tag = (unsigned char) (ev.hdr_data); tag = (unsigned char) (ev.hdr_data);
frag->base.des_local = seg; frag->base.des_segments = seg;
seg[0].seg_addr.pval = ev.start; seg[0].seg_addr.pval = ev.start;
seg[0].seg_len = ev.mlength; seg[0].seg_len = ev.mlength;
frag->base.des_local_count = 1; frag->base.des_segment_count = 1;
reg = mca_btl_base_active_message_trigger + tag; reg = mca_btl_base_active_message_trigger + tag;
OPAL_OUTPUT_VERBOSE((50, opal_btl_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((50, opal_btl_base_framework.framework_output,

Просмотреть файл

@ -26,8 +26,8 @@ static void
mca_btl_portals4_frag_common_send_constructor(mca_btl_portals4_frag_t* frag) mca_btl_portals4_frag_common_send_constructor(mca_btl_portals4_frag_t* frag)
{ {
frag->base.des_flags = 0; frag->base.des_flags = 0;
frag->base.des_local = &frag->segments[0].base; frag->base.des_segments = &frag->segments[0].base;
frag->base.des_local_count = 2; frag->base.des_segment_count = 2;
frag->segments[0].base.seg_addr.pval = frag + 1; frag->segments[0].base.seg_addr.pval = frag + 1;
frag->segments[0].base.seg_len = frag->size; frag->segments[0].base.seg_len = frag->size;

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2014 Research Organization for Information Science * Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
@ -197,29 +197,21 @@ int mca_btl_scif_sendi (struct mca_btl_base_module_t *btl,
* Initiate a get operation. * Initiate a get operation.
* *
* location: btl_scif_get.c * location: btl_scif_get.c
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/ */
int int mca_btl_scif_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
mca_btl_scif_get (struct mca_btl_base_module_t *btl, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_endpoint_t *endpoint, mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
struct mca_btl_base_descriptor_t *des); int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
/** /**
* Initiate a put operation. * Initiate a put operation.
* *
* location: btl_scif_put.c * location: btl_scif_put.c
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/ */
int int mca_btl_scif_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
mca_btl_scif_put (struct mca_btl_base_module_t *btl, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_endpoint_t *endpoint, mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
struct mca_btl_base_descriptor_t *des); int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
mca_btl_base_descriptor_t * mca_btl_base_descriptor_t *
mca_btl_scif_alloc(struct mca_btl_base_module_t *btl, mca_btl_scif_alloc(struct mca_btl_base_module_t *btl,
@ -228,9 +220,25 @@ mca_btl_scif_alloc(struct mca_btl_base_module_t *btl,
int mca_btl_scif_progress_send_wait_list (struct mca_btl_base_endpoint_t *endpoint); int mca_btl_scif_progress_send_wait_list (struct mca_btl_base_endpoint_t *endpoint);
struct mca_btl_scif_reg_t;
struct mca_btl_base_registration_handle_t {
/** scif offset */
off_t scif_offset;
/** base address of this scif region */
uintptr_t scif_base;
};
struct mca_btl_scif_registration_handle_t {
mca_btl_base_registration_handle_t btl_handle;
struct mca_btl_scif_reg_t *reg;
};
typedef struct mca_btl_scif_registration_handle_t mca_btl_scif_registration_handle_t;
typedef struct mca_btl_scif_reg_t { typedef struct mca_btl_scif_reg_t {
mca_mpool_base_registration_t base; mca_mpool_base_registration_t base;
off_t *registrations; /** per-endpoint btl handles for this registration */
mca_btl_scif_registration_handle_t *handles;
} mca_btl_scif_reg_t; } mca_btl_scif_reg_t;
/* Global structures */ /* Global structures */

Просмотреть файл

@ -165,14 +165,14 @@ static int scif_dereg_mem (void *reg_data, mca_mpool_base_registration_t *reg)
/* register the fragment with all connected endpoints */ /* register the fragment with all connected endpoints */
for (i = 0 ; i < (int) mca_btl_scif_module.endpoint_count ; ++i) { for (i = 0 ; i < (int) mca_btl_scif_module.endpoint_count ; ++i) {
if ((off_t)-1 != scif_reg->registrations[i] && if ((off_t)-1 != scif_reg->handles[i].btl_handle.scif_offset &&
MCA_BTL_SCIF_EP_STATE_CONNECTED == mca_btl_scif_module.endpoints[i].state) { MCA_BTL_SCIF_EP_STATE_CONNECTED == mca_btl_scif_module.endpoints[i].state) {
(void) scif_unregister(mca_btl_scif_module.endpoints[i].scif_epd, (void) scif_unregister(mca_btl_scif_module.endpoints[i].scif_epd,
scif_reg->registrations[i], size); scif_reg->handles[i].btl_handle.scif_offset, size);
} }
} }
free (scif_reg->registrations); free (scif_reg->handles);
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }
@ -184,17 +184,22 @@ static int scif_reg_mem (void *reg_data, void *base, size_t size,
int rc = OPAL_SUCCESS; int rc = OPAL_SUCCESS;
unsigned int i; unsigned int i;
scif_reg->registrations = calloc (mca_btl_scif_module.endpoint_count, scif_reg->handles = calloc (mca_btl_scif_module.endpoint_count, sizeof (scif_reg->handles[0]));
sizeof (off_t));
memset (scif_reg->registrations, -1, mca_btl_scif_module.endpoint_count * sizeof (off_t)); /* intialize all scif offsets to -1 and initialize the pointer back to the mpool registration */
for (i = 0 ; i < mca_btl_scif_module.endpoint_count ; ++i) {
scif_reg->handles[i].btl_handle.scif_offset = -1;
scif_reg->handles[i].btl_handle.scif_base = (intptr_t) base;
scif_reg->handles[i].reg = scif_reg;
}
/* register the pointer with all connected endpoints */ /* register the pointer with all connected endpoints */
for (i = 0 ; i < mca_btl_scif_module.endpoint_count ; ++i) { for (i = 0 ; i < mca_btl_scif_module.endpoint_count ; ++i) {
if (MCA_BTL_SCIF_EP_STATE_CONNECTED == mca_btl_scif_module.endpoints[i].state) { if (MCA_BTL_SCIF_EP_STATE_CONNECTED == mca_btl_scif_module.endpoints[i].state) {
scif_reg->registrations[i] = scif_register(mca_btl_scif_module.endpoints[i].scif_epd, scif_reg->handles[i].btl_handle.scif_offset = scif_register (mca_btl_scif_module.endpoints[i].scif_epd,
base, size, 0, SCIF_PROT_READ | base, size, 0, SCIF_PROT_READ |
SCIF_PROT_WRITE, 0); SCIF_PROT_WRITE, 0);
if (SCIF_REGISTER_FAILED == scif_reg->registrations[i]) { if (SCIF_REGISTER_FAILED == scif_reg->handles[i].btl_handle.scif_offset) {
/* cleanup */ /* cleanup */
scif_dereg_mem (reg_data, reg); scif_dereg_mem (reg_data, reg);
rc = OPAL_ERR_OUT_OF_RESOURCE; rc = OPAL_ERR_OUT_OF_RESOURCE;

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2014 Research Organization for Information Science * Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
@ -171,7 +171,7 @@ static int btl_scif_component_register(void)
mca_btl_scif_module.super.btl_flags = MCA_BTL_FLAGS_SEND | mca_btl_scif_module.super.btl_flags = MCA_BTL_FLAGS_SEND |
MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE; MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE;
mca_btl_scif_module.super.btl_seg_size = sizeof (mca_btl_scif_segment_t); mca_btl_scif_module.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
mca_btl_scif_module.super.btl_bandwidth = 50000; /* Mbs */ mca_btl_scif_module.super.btl_bandwidth = 50000; /* Mbs */
mca_btl_scif_module.super.btl_latency = 2; /* Microsecs */ mca_btl_scif_module.super.btl_latency = 2; /* Microsecs */
@ -329,11 +329,11 @@ static int mca_btl_scif_progress_recvs (mca_btl_base_endpoint_t *ep)
* the fragment without introducing another copy here. this * the fragment without introducing another copy here. this
* limitation has not appeared to cause any performance * limitation has not appeared to cause any performance
* problems. */ * problems. */
frag.base.des_local_count = 1; frag.base.des_segment_count = 1;
frag.segments[0].base.seg_len = hdr->size; frag.segments[0].seg_len = hdr->size;
frag.segments[0].base.seg_addr.pval = (void *) (hdr + 1); frag.segments[0].seg_addr.pval = (void *) (hdr + 1);
frag.base.des_local = &frag.segments[0].base; frag.base.des_segments = frag.segments;
/* call the registered callback function */ /* call the registered callback function */
reg->cbfunc(&mca_btl_scif_module.super, hdr->tag, &frag.base, reg->cbdata); reg->cbfunc(&mca_btl_scif_module.super, hdr->tag, &frag.base, reg->cbdata);

Просмотреть файл

@ -15,13 +15,13 @@
static inline void mca_btl_scif_base_frag_constructor (mca_btl_scif_base_frag_t *frag) static inline void mca_btl_scif_base_frag_constructor (mca_btl_scif_base_frag_t *frag)
{ {
memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base)); memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base));
frag->segments[0].base.seg_addr.pval = frag->base.super.ptr; frag->segments[0].seg_addr.pval = frag->base.super.ptr;
} }
static inline void mca_btl_scif_eager_frag_constructor (mca_btl_scif_base_frag_t *frag) static inline void mca_btl_scif_eager_frag_constructor (mca_btl_scif_base_frag_t *frag)
{ {
memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base)); memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base));
frag->segments[0].base.seg_addr.pval = frag->base.super.ptr; frag->segments[0].seg_addr.pval = frag->base.super.ptr;
} }
OBJ_CLASS_INSTANCE(mca_btl_scif_eager_frag_t, mca_btl_base_descriptor_t, OBJ_CLASS_INSTANCE(mca_btl_scif_eager_frag_t, mca_btl_base_descriptor_t,

Просмотреть файл

@ -15,16 +15,6 @@
#include "btl_scif.h" #include "btl_scif.h"
#include "btl_scif_endpoint.h" #include "btl_scif_endpoint.h"
typedef struct mca_btl_scif_segment_t {
mca_btl_base_segment_t base;
/* scif offset */
off_t scif_offset;
/* original pointer */
uint64_t orig_ptr;
} mca_btl_scif_segment_t;
typedef struct mca_btl_scif_frag_hdr_t { typedef struct mca_btl_scif_frag_hdr_t {
#if defined(SCIF_USE_SEQ) #if defined(SCIF_USE_SEQ)
uint32_t seq; uint32_t seq;
@ -41,7 +31,7 @@ typedef void (*frag_cb_t) (struct mca_btl_scif_base_frag_t *, int);
typedef struct mca_btl_scif_base_frag_t { typedef struct mca_btl_scif_base_frag_t {
mca_btl_base_descriptor_t base; mca_btl_base_descriptor_t base;
mca_btl_scif_frag_hdr_t hdr; mca_btl_scif_frag_hdr_t hdr;
mca_btl_scif_segment_t segments[2]; mca_btl_base_segment_t segments[2];
mca_btl_base_endpoint_t *endpoint; mca_btl_base_endpoint_t *endpoint;
mca_btl_scif_reg_t *registration; mca_btl_scif_reg_t *registration;
ompi_free_list_t *my_list; ompi_free_list_t *my_list;
@ -78,9 +68,9 @@ static inline int mca_btl_scif_frag_return (mca_btl_scif_base_frag_t *frag)
frag->registration = NULL; frag->registration = NULL;
} }
frag->segments[0].base.seg_addr.pval = frag->base.super.ptr; frag->segments[0].seg_addr.pval = frag->base.super.ptr;
frag->segments[0].base.seg_len = 0; frag->segments[0].seg_len = 0;
frag->segments[1].base.seg_len = 0; frag->segments[1].seg_len = 0;
OMPI_FREE_LIST_RETURN_MT(frag->my_list, (ompi_free_list_item_t *) frag); OMPI_FREE_LIST_RETURN_MT(frag->my_list, (ompi_free_list_item_t *) frag);

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2014 Research Organization for Information Science * Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
@ -20,18 +20,13 @@
/** /**
* Initiate a get operation. * Initiate a get operation.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/ */
int mca_btl_scif_get (struct mca_btl_base_module_t *btl, int mca_btl_scif_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
struct mca_btl_base_endpoint_t *endpoint, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_descriptor_t *des) { mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
mca_btl_scif_segment_t *src = (mca_btl_scif_segment_t *) des->des_remote; int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
mca_btl_scif_segment_t *dst = (mca_btl_scif_segment_t *) des->des_local; {
size_t len = lmin (src->base.seg_len, dst->base.seg_len); int rc, mark, scif_flags = 0;
int rc, mark, flags = 0;
off_t roffset, loffset; off_t roffset, loffset;
#if defined(SCIF_TIMING) #if defined(SCIF_TIMING)
struct timespec ts; struct timespec ts;
@ -41,30 +36,27 @@ int mca_btl_scif_get (struct mca_btl_base_module_t *btl,
mca_btl_scif_component.get_count++; mca_btl_scif_component.get_count++;
#endif #endif
BTL_VERBOSE(("Using DMA Get for frag %p from offset %lu", (void *) des, BTL_VERBOSE(("Using DMA Get from remote address %" PRIx64 " to local address %p",
(unsigned long) src->scif_offset)); remote_address, local_address));
roffset = src->scif_offset + (off_t)(src->orig_ptr - src->base.seg_addr.lval); roffset = remote_handle->scif_offset + (off_t)(remote_address - remote_handle->scif_base);
loffset = dst->scif_offset + (off_t)(dst->orig_ptr - dst->base.seg_addr.lval); loffset = local_handle->scif_offset + (off_t)((intptr_t)local_address - local_handle->scif_base);
if (mca_btl_scif_component.rma_use_cpu) { if (mca_btl_scif_component.rma_use_cpu) {
flags = SCIF_RMA_USECPU; scif_flags = SCIF_RMA_USECPU;
} }
if (mca_btl_scif_component.rma_sync) { if (mca_btl_scif_component.rma_sync) {
flags |= SCIF_RMA_SYNC; scif_flags |= SCIF_RMA_SYNC;
} }
/* start the read */ /* start the read */
rc = scif_readfrom (endpoint->scif_epd, loffset, len, roffset, flags); rc = scif_readfrom (endpoint->scif_epd, loffset, size, roffset, scif_flags);
if (OPAL_UNLIKELY(-1 == rc)) { if (OPAL_UNLIKELY(-1 == rc)) {
return OPAL_ERROR; return OPAL_ERROR;
} }
/* always call the callback function */ if (!(scif_flags & SCIF_RMA_SYNC)) {
des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
if (!(flags & SCIF_RMA_SYNC)) {
/* according to the scif documentation is is better to use a fence rather /* according to the scif documentation is is better to use a fence rather
* than using the SCIF_RMA_SYNC flag with scif_readfrom */ * than using the SCIF_RMA_SYNC flag with scif_readfrom */
scif_fence_mark (endpoint->scif_epd, SCIF_FENCE_INIT_SELF, &mark); scif_fence_mark (endpoint->scif_epd, SCIF_FENCE_INIT_SELF, &mark);
@ -76,8 +68,8 @@ int mca_btl_scif_get (struct mca_btl_base_module_t *btl,
mca_btl_scif_component.get_time_max, ts); mca_btl_scif_component.get_time_max, ts);
#endif #endif
/* since we completed the fence the RMA operation is complete */ /* always call the callback function */
mca_btl_scif_frag_complete ((mca_btl_scif_base_frag_t *) des, OPAL_SUCCESS); cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }

Просмотреть файл

@ -24,17 +24,14 @@ mca_btl_scif_free (struct mca_btl_base_module_t *btl,
static int static int
mca_btl_scif_module_finalize (struct mca_btl_base_module_t* btl); mca_btl_scif_module_finalize (struct mca_btl_base_module_t* btl);
static mca_btl_base_descriptor_t * static mca_btl_base_registration_handle_t *mca_btl_scif_register_mem (struct mca_btl_base_module_t *btl,
mca_btl_scif_prepare_dst (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
mca_btl_base_endpoint_t *endpoint, void *base, size_t size, uint32_t flags);
mca_mpool_base_registration_t *registration, static int mca_btl_scif_deregister_mem (struct mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle);
opal_convertor_t *convertor, uint8_t order,
size_t reserve, size_t *size, uint32_t flags);
static struct mca_btl_base_descriptor_t * static struct mca_btl_base_descriptor_t *
mca_btl_scif_prepare_src (struct mca_btl_base_module_t *btl, mca_btl_scif_prepare_src (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint, struct mca_btl_base_endpoint_t *endpoint,
mca_mpool_base_registration_t *registration,
struct opal_convertor_t *convertor, struct opal_convertor_t *convertor,
uint8_t order, size_t reserve, size_t *size, uint8_t order, size_t reserve, size_t *size,
uint32_t flags); uint32_t flags);
@ -48,11 +45,12 @@ mca_btl_scif_module_t mca_btl_scif_module = {
.btl_alloc = mca_btl_scif_alloc, .btl_alloc = mca_btl_scif_alloc,
.btl_free = mca_btl_scif_free, .btl_free = mca_btl_scif_free,
.btl_prepare_src = mca_btl_scif_prepare_src, .btl_prepare_src = mca_btl_scif_prepare_src,
.btl_prepare_dst = mca_btl_scif_prepare_dst,
.btl_send = mca_btl_scif_send, .btl_send = mca_btl_scif_send,
.btl_sendi = mca_btl_scif_sendi, .btl_sendi = mca_btl_scif_sendi,
.btl_put = mca_btl_scif_put, .btl_put = mca_btl_scif_put,
.btl_get = mca_btl_scif_get, .btl_get = mca_btl_scif_get,
.btl_register_mem = mca_btl_scif_register_mem,
.btl_deregister_mem = mca_btl_scif_deregister_mem,
} }
}; };
@ -163,10 +161,10 @@ mca_btl_scif_alloc(struct mca_btl_base_module_t *btl,
frag->base.des_flags = flags; frag->base.des_flags = flags;
frag->base.order = order; frag->base.order = order;
frag->base.des_local = &frag->segments[0].base; frag->base.des_segments = frag->segments;
frag->base.des_local_count = 1; frag->base.des_segment_count = 1;
frag->segments[0].base.seg_len = size; frag->segments[0].seg_len = size;
return &frag->base; return &frag->base;
} }
@ -178,16 +176,19 @@ mca_btl_scif_free (struct mca_btl_base_module_t *btl,
return mca_btl_scif_frag_return ((mca_btl_scif_base_frag_t *) des); return mca_btl_scif_frag_return ((mca_btl_scif_base_frag_t *) des);
} }
static inline mca_btl_base_descriptor_t *mca_btl_scif_prepare_dma (struct mca_btl_base_module_t *btl, static mca_btl_base_registration_handle_t *mca_btl_scif_register_mem (struct mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint, mca_btl_base_endpoint_t *endpoint,
void *data_ptr, size_t size, void *base, size_t size, uint32_t flags)
mca_mpool_base_registration_t *registration,
uint8_t order, uint32_t flags)
{ {
mca_btl_scif_base_frag_t *frag;
mca_btl_scif_reg_t *scif_reg; mca_btl_scif_reg_t *scif_reg;
int rc; int rc;
if (MCA_BTL_ENDPOINT_ANY == endpoint) {
/* it probably isn't possible to support registering memory to use with any endpoint so
* return NULL */
return NULL;
}
if (OPAL_LIKELY(MCA_BTL_SCIF_EP_STATE_CONNECTED != endpoint->state)) { if (OPAL_LIKELY(MCA_BTL_SCIF_EP_STATE_CONNECTED != endpoint->state)) {
/* the endpoint needs to be connected before the fragment can be /* the endpoint needs to be connected before the fragment can be
* registered. */ * registered. */
@ -198,67 +199,36 @@ static inline mca_btl_base_descriptor_t *mca_btl_scif_prepare_dma (struct mca_bt
} }
} }
(void) MCA_BTL_SCIF_FRAG_ALLOC_DMA(endpoint, frag); rc = btl->btl_mpool->mpool_register(btl->btl_mpool, base, size, 0,
if (OPAL_UNLIKELY(NULL == frag)) { (mca_mpool_base_registration_t **) &scif_reg);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
return NULL; return NULL;
} }
if (NULL == registration) {
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, data_ptr, size, 0,
(mca_mpool_base_registration_t **) &registration);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
mca_btl_scif_frag_return (frag);
return NULL;
}
frag->registration = (mca_btl_scif_reg_t *) registration;
}
scif_reg = (mca_btl_scif_reg_t *) registration;
/* register the memory location with this peer if it isn't already */ /* register the memory location with this peer if it isn't already */
if ((off_t) -1 == scif_reg->registrations[endpoint->id]) { if ((off_t) -1 == scif_reg->handles[endpoint->id].btl_handle.scif_offset) {
size_t seg_size = (size_t)((uintptr_t) registration->bound - (uintptr_t) registration->base) + 1; size_t seg_size = (size_t)((uintptr_t) scif_reg->base.bound - (uintptr_t) scif_reg->base.base) + 1;
scif_reg->registrations[endpoint->id] = scif_register (endpoint->scif_epd, registration->base,
seg_size, 0, SCIF_PROT_READ | /* NTH: until we determine a way to pass permissions to the mpool just make all segments
SCIF_PROT_WRITE, 0); * read/write */
scif_reg->handles[endpoint->id].btl_handle.scif_offset =
scif_register (endpoint->scif_epd, scif_reg->base.base, seg_size, 0, SCIF_PROT_READ |
SCIF_PROT_WRITE, 0);
BTL_VERBOSE(("registered fragment for scif DMA transaction. offset = %lu", BTL_VERBOSE(("registered fragment for scif DMA transaction. offset = %lu",
(unsigned long) scif_reg->registrations[endpoint->id])); (unsigned long) scif_reg->handles[endpoint->id].btl_handle.scif_offset));
} }
if (OPAL_UNLIKELY((off_t) -1 == scif_reg->registrations[endpoint->id])) { return &scif_reg->handles[endpoint->id].btl_handle;
mca_btl_scif_frag_return (frag);
return NULL;
}
frag->segments[0].base.seg_addr.lval = (uint64_t)(uintptr_t) data_ptr;
frag->segments[0].base.seg_len = size;
frag->segments[0].scif_offset = scif_reg->registrations[endpoint->id] +
(off_t) ((ptrdiff_t) data_ptr - (ptrdiff_t) registration->base);
/* save the original pointer so the offset can be adjusted if needed (this is
* required for osc/rdma) */
frag->segments[0].orig_ptr = (uint64_t)(uintptr_t) data_ptr;
frag->base.order = order;
frag->base.des_flags = flags;
frag->base.des_local = &frag->segments->base;
frag->base.des_local_count = 1;
return &frag->base;
} }
static inline mca_btl_base_descriptor_t *mca_btl_scif_prepare_dma_conv (struct mca_btl_base_module_t *btl, static int mca_btl_scif_deregister_mem (struct mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle)
mca_btl_base_endpoint_t *endpoint,
mca_mpool_base_registration_t *registration,
struct opal_convertor_t *convertor,
uint8_t order, size_t *size,
uint32_t flags)
{ {
void *data_ptr; mca_btl_scif_registration_handle_t *scif_handle = (mca_btl_scif_registration_handle_t *) handle;
mca_btl_scif_reg_t *scif_reg = scif_handle->reg;
opal_convertor_get_current_pointer (convertor, &data_ptr); btl->btl_mpool->mpool_deregister (btl->btl_mpool, &scif_reg->base);
return mca_btl_scif_prepare_dma (btl, endpoint, data_ptr, *size, registration, order, flags); return OPAL_SUCCESS;
} }
static inline struct mca_btl_base_descriptor_t * static inline struct mca_btl_base_descriptor_t *
@ -286,10 +256,10 @@ mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl,
return NULL; return NULL;
} }
frag->segments[0].base.seg_len = reserve; frag->segments[0].seg_len = reserve;
frag->segments[1].base.seg_addr.pval = data_ptr; frag->segments[1].seg_addr.pval = data_ptr;
frag->segments[1].base.seg_len = *size; frag->segments[1].seg_len = *size;
frag->base.des_local_count = 2; frag->base.des_segment_count = 2;
} else { } else {
/* buffered send */ /* buffered send */
(void) MCA_BTL_SCIF_FRAG_ALLOC_EAGER(endpoint, frag); (void) MCA_BTL_SCIF_FRAG_ALLOC_EAGER(endpoint, frag);
@ -299,7 +269,7 @@ mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl,
if (*size) { if (*size) {
iov.iov_len = *size; iov.iov_len = *size;
iov.iov_base = (IOVBASE_TYPE *) ((uintptr_t) frag->segments[0].base.seg_addr.pval + reserve); iov.iov_base = (IOVBASE_TYPE *) ((uintptr_t) frag->segments[0].seg_addr.pval + reserve);
rc = opal_convertor_pack (convertor, &iov, &iov_count, &max_size); rc = opal_convertor_pack (convertor, &iov, &iov_count, &max_size);
if (OPAL_UNLIKELY(rc < 0)) { if (OPAL_UNLIKELY(rc < 0)) {
@ -309,37 +279,22 @@ mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl,
*size = max_size; *size = max_size;
} }
frag->segments[0].base.seg_len = reserve + *size; frag->segments[0].seg_len = reserve + *size;
frag->base.des_local_count = 1; frag->base.des_segment_count = 1;
} }
frag->base.des_local = &frag->segments->base; frag->base.des_segments = frag->segments;
frag->base.order = order; frag->base.order = order;
frag->base.des_flags = flags; frag->base.des_flags = flags;
return &frag->base; return &frag->base;
} }
static mca_btl_base_descriptor_t *mca_btl_scif_prepare_src (struct mca_btl_base_module_t *btl, static mca_btl_base_descriptor_t *mca_btl_scif_prepare_src (struct mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint, mca_btl_base_endpoint_t *endpoint,
mca_mpool_base_registration_t *registration,
struct opal_convertor_t *convertor, struct opal_convertor_t *convertor,
uint8_t order, size_t reserve, size_t *size, uint8_t order, size_t reserve, size_t *size,
uint32_t flags) uint32_t flags)
{ {
if (OPAL_LIKELY(reserve)) { return mca_btl_scif_prepare_src_send (btl, endpoint, convertor, order, reserve, size, flags);
return mca_btl_scif_prepare_src_send (btl, endpoint, convertor,
order, reserve, size, flags);
} else {
return mca_btl_scif_prepare_dma_conv (btl, endpoint, registration, convertor, order, size, flags);
}
}
static mca_btl_base_descriptor_t *mca_btl_scif_prepare_dst (mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
mca_mpool_base_registration_t *registration,
opal_convertor_t *convertor, uint8_t order,
size_t reserve, size_t *size, uint32_t flags)
{
return mca_btl_scif_prepare_dma_conv (btl, endpoint, registration, convertor, order, size, flags);
} }

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
@ -16,63 +16,57 @@
/** /**
* Initiate a put operation. * Initiate a put operation.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/ */
int mca_btl_scif_put (struct mca_btl_base_module_t *btl, int mca_btl_scif_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
struct mca_btl_base_endpoint_t *endpoint, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_descriptor_t *des) { mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
mca_btl_scif_segment_t *src = (mca_btl_scif_segment_t *) des->des_local; int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
mca_btl_scif_segment_t *dst = (mca_btl_scif_segment_t *) des->des_remote; {
size_t len = lmin (src->base.seg_len, dst->base.seg_len); int rc, mark, scif_flags = 0;
int rc, mark, flags = 0;
off_t roffset, loffset; off_t roffset, loffset;
#if defined(SCIF_TIMING) #if defined(SCIF_TIMING)
struct timespec ts; struct timespec ts;
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
mca_btl_scif_component.put_count++; mca_btl_scif_component.get_count++;
#endif #endif
BTL_VERBOSE(("Using DMA Put for frag %p", (void *) des)); BTL_VERBOSE(("Using DMA Put from local address %p to remote address %" PRIx64,
local_address, remote_address));
roffset = dst->scif_offset + (off_t)(dst->orig_ptr - dst->base.seg_addr.lval); roffset = remote_handle->scif_offset + (off_t)(remote_address - remote_handle->scif_base);
loffset = src->scif_offset + (off_t)(src->orig_ptr - src->base.seg_addr.lval); loffset = local_handle->scif_offset + (off_t)((intptr_t) local_address - local_handle->scif_base);
if (mca_btl_scif_component.rma_use_cpu) { if (mca_btl_scif_component.rma_use_cpu) {
flags = SCIF_RMA_USECPU; scif_flags = SCIF_RMA_USECPU;
} }
if (mca_btl_scif_component.rma_sync) { if (mca_btl_scif_component.rma_sync) {
flags |= SCIF_RMA_SYNC; scif_flags |= SCIF_RMA_SYNC;
} }
/* start the write */ /* start the write */
rc = scif_writeto (endpoint->scif_epd, loffset, len, roffset, flags); rc = scif_writeto (endpoint->scif_epd, loffset, size, roffset, scif_flags);
rc = scif_readfrom (endpoint->scif_epd, loffset, size, roffset, scif_flags);
if (OPAL_UNLIKELY(-1 == rc)) { if (OPAL_UNLIKELY(-1 == rc)) {
return OPAL_ERROR; return OPAL_ERROR;
} }
/* always call the callback function */ if (!(scif_flags & SCIF_RMA_SYNC)) {
des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; /* according to the scif documentation is is better to use a fence rather
* than using the SCIF_RMA_SYNC flag with scif_readfrom */
/* according to the scif documentation is is better to use a fence rather
* than using the SCIF_RMA_SYNC flag with scif_writeto */
if (!(flags & SCIF_RMA_SYNC)) {
scif_fence_mark (endpoint->scif_epd, SCIF_FENCE_INIT_SELF, &mark); scif_fence_mark (endpoint->scif_epd, SCIF_FENCE_INIT_SELF, &mark);
scif_fence_wait (endpoint->scif_epd, mark); scif_fence_wait (endpoint->scif_epd, mark);
} }
#if defined(SCIF_TIMING) #if defined(SCIF_TIMING)
SCIF_UPDATE_TIMER(mca_btl_scif_component.put_time, SCIF_UPDATE_TIMER(mca_btl_scif_component.get_time,
mca_btl_scif_component.put_time_max, ts); mca_btl_scif_component.get_time_max, ts);
#endif #endif
/* since we completed the fence the RMA operation is complete */ /* always call the callback function */
mca_btl_scif_frag_complete ((mca_btl_scif_base_frag_t *) des, OPAL_SUCCESS); cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }

Просмотреть файл

@ -118,22 +118,22 @@ static int mca_btl_scif_send_frag (struct mca_btl_base_endpoint_t *endpoint,
unsigned char * restrict dst; unsigned char * restrict dst;
BTL_VERBOSE(("btl/scif sending descriptor %p from %d -> %d. length = %" PRIu64, (void *) frag, BTL_VERBOSE(("btl/scif sending descriptor %p from %d -> %d. length = %" PRIu64, (void *) frag,
OPAL_PROC_MY_NAME.vpid, endpoint->peer_proc->proc_name.vpid, frag->segments[0].base.seg_len)); opal_process_name_vpid(OPAL_PROC_MY_NAME), opal_process_name_vpid(endpoint->peer_proc->proc_name), frag->segments[0].seg_len));
if (OPAL_LIKELY(OPAL_SUCCESS == mca_btl_scif_send_get_buffer (endpoint, size, &dst))) { if (OPAL_LIKELY(OPAL_SUCCESS == mca_btl_scif_send_get_buffer (endpoint, size, &dst))) {
unsigned char * restrict data = (unsigned char * restrict) frag->segments[0].base.seg_addr.pval; unsigned char * restrict data = (unsigned char * restrict) frag->segments[0].seg_addr.pval;
#if defined(SCIF_TIMING) #if defined(SCIF_TIMING)
struct timespec ts; struct timespec ts;
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
#endif #endif
memcpy (dst + sizeof (frag->hdr), data, frag->segments[0].base.seg_len); memcpy (dst + sizeof (frag->hdr), data, frag->segments[0].seg_len);
if (frag->segments[1].base.seg_len) { if (frag->segments[1].seg_len) {
memcpy (dst + sizeof (frag->hdr) + frag->segments[0].base.seg_len, memcpy (dst + sizeof (frag->hdr) + frag->segments[0].seg_len,
frag->segments[1].base.seg_addr.pval, frag->segments[1].seg_addr.pval,
frag->segments[1].base.seg_len); frag->segments[1].seg_len);
} }
#if defined(SCIF_USE_SEQ) #if defined(SCIF_USE_SEQ)
@ -165,7 +165,7 @@ int mca_btl_scif_send (struct mca_btl_base_module_t *btl,
mca_btl_base_tag_t tag) mca_btl_base_tag_t tag)
{ {
mca_btl_scif_base_frag_t *frag = (mca_btl_scif_base_frag_t *) descriptor; mca_btl_scif_base_frag_t *frag = (mca_btl_scif_base_frag_t *) descriptor;
size_t size = frag->segments[0].base.seg_len + frag->segments[1].base.seg_len; size_t size = frag->segments[0].seg_len + frag->segments[1].seg_len;
int rc; int rc;
frag->hdr.tag = tag; frag->hdr.tag = tag;
@ -223,7 +223,9 @@ int mca_btl_scif_sendi (struct mca_btl_base_module_t *btl,
rc = mca_btl_scif_send_get_buffer (endpoint, length, &base); rc = mca_btl_scif_send_get_buffer (endpoint, length, &base);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
*descriptor = NULL; if (NULL != descriptor) {
*descriptor = NULL;
}
return OPAL_ERR_OUT_OF_RESOURCE; return OPAL_ERR_OUT_OF_RESOURCE;
} }

Просмотреть файл

@ -38,13 +38,15 @@
#include "btl_self_frag.h" #include "btl_self_frag.h"
#include "opal/util/proc.h" #include "opal/util/proc.h"
static int mca_btl_self_put (struct mca_btl_base_module_t* btl, static int mca_btl_self_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
struct mca_btl_base_endpoint_t* endpoint, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_descriptor_t* des); mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
static int mca_btl_self_get (struct mca_btl_base_module_t* btl, static int mca_btl_self_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
struct mca_btl_base_endpoint_t* endpoint, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_descriptor_t* des); mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
mca_btl_base_module_t mca_btl_self = { mca_btl_base_module_t mca_btl_self = {
.btl_component = &mca_btl_self_component.super, .btl_component = &mca_btl_self_component.super,
@ -54,7 +56,6 @@ mca_btl_base_module_t mca_btl_self = {
.btl_alloc = mca_btl_self_alloc, .btl_alloc = mca_btl_self_alloc,
.btl_free = mca_btl_self_free, .btl_free = mca_btl_self_free,
.btl_prepare_src = mca_btl_self_prepare_src, .btl_prepare_src = mca_btl_self_prepare_src,
.btl_prepare_dst = mca_btl_self_prepare_dst,
.btl_send = mca_btl_self_send, .btl_send = mca_btl_self_send,
.btl_put = mca_btl_self_put, .btl_put = mca_btl_self_put,
.btl_get = mca_btl_self_get, .btl_get = mca_btl_self_get,
@ -135,8 +136,8 @@ mca_btl_base_descriptor_t* mca_btl_self_alloc(
frag->segment.seg_len = size; frag->segment.seg_len = size;
frag->base.des_flags = flags; frag->base.des_flags = flags;
frag->base.des_local = &(frag->segment); frag->base.des_segments = &(frag->segment);
frag->base.des_local_count = 1; frag->base.des_segment_count = 1;
return (mca_btl_base_descriptor_t*)frag; return (mca_btl_base_descriptor_t*)frag;
} }
@ -151,10 +152,8 @@ int mca_btl_self_free( struct mca_btl_base_module_t* btl,
{ {
mca_btl_self_frag_t* frag = (mca_btl_self_frag_t*)des; mca_btl_self_frag_t* frag = (mca_btl_self_frag_t*)des;
frag->base.des_local = NULL; frag->base.des_segments = NULL;
frag->base.des_local_count = 0; frag->base.des_segment_count = 0;
frag->base.des_remote = NULL;
frag->base.des_remote_count = 0;
if(frag->size == mca_btl_self.btl_eager_limit) { if(frag->size == mca_btl_self.btl_eager_limit) {
MCA_BTL_SELF_FRAG_RETURN_EAGER(frag); MCA_BTL_SELF_FRAG_RETURN_EAGER(frag);
@ -175,7 +174,6 @@ int mca_btl_self_free( struct mca_btl_base_module_t* btl,
struct mca_btl_base_descriptor_t* struct mca_btl_base_descriptor_t*
mca_btl_self_prepare_src( struct mca_btl_base_module_t* btl, mca_btl_self_prepare_src( struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint, struct mca_btl_base_endpoint_t* endpoint,
mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor, struct opal_convertor_t* convertor,
uint8_t order, uint8_t order,
size_t reserve, size_t reserve,
@ -231,44 +229,11 @@ mca_btl_self_prepare_src( struct mca_btl_base_module_t* btl,
*size = max_data; *size = max_data;
} }
frag->base.des_flags = flags; frag->base.des_flags = flags;
frag->base.des_local = &frag->segment; frag->base.des_segments = &frag->segment;
frag->base.des_local_count = 1; frag->base.des_segment_count = 1;
return &frag->base; return &frag->base;
} }
/**
* Prepare data for receive.
*/
struct mca_btl_base_descriptor_t*
mca_btl_self_prepare_dst( struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size,
uint32_t flags )
{
mca_btl_self_frag_t* frag;
size_t max_data = *size;
void *ptr;
MCA_BTL_SELF_FRAG_ALLOC_RDMA(frag);
if(OPAL_UNLIKELY(NULL == frag)) {
return NULL;
}
/* setup descriptor to point directly to user buffer */
opal_convertor_get_current_pointer( convertor, &ptr );
frag->segment.seg_addr.lval = (uint64_t)(uintptr_t) ptr;
frag->segment.seg_len = reserve + max_data;
frag->base.des_local = &frag->segment;
frag->base.des_local_count = 1;
frag->base.des_flags = flags;
return &frag->base;
}
/** /**
* Initiate a send to the peer. * Initiate a send to the peer.
@ -285,12 +250,6 @@ int mca_btl_self_send( struct mca_btl_base_module_t* btl,
mca_btl_active_message_callback_t* reg; mca_btl_active_message_callback_t* reg;
int btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); int btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
/**
* We have to set the dst before the call to the function and reset them
* after.
*/
des->des_remote = des->des_local;
des->des_remote_count = des->des_local_count;
/* upcall */ /* upcall */
reg = mca_btl_base_active_message_trigger + tag; reg = mca_btl_base_active_message_trigger + tag;
reg->cbfunc( btl, tag, des, reg->cbdata ); reg->cbfunc( btl, tag, des, reg->cbdata );
@ -305,100 +264,29 @@ int mca_btl_self_send( struct mca_btl_base_module_t* btl,
return 1; return 1;
} }
/**
* Initiate a put to the peer.
*
* @param btl (IN) BTL module
* @param peer (IN) BTL peer addressing
*/
static int mca_btl_self_rdma( struct mca_btl_base_module_t* btl, static int mca_btl_self_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
struct mca_btl_base_endpoint_t* endpoint, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_descriptor_t* des, mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
mca_btl_base_segment_t* src, size_t src_cnt, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
mca_btl_base_segment_t* dst, size_t dst_cnt)
{ {
unsigned char* src_addr = (unsigned char *)(uintptr_t) src->seg_addr.lval; memcpy ((void *)(intptr_t) remote_address, local_address, size);
size_t src_len = src->seg_len;
unsigned char* dst_addr = (unsigned char *)(uintptr_t) dst->seg_addr.lval;
size_t dst_len = dst->seg_len;
int btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
while(src_len && dst_len) { cbfunc (btl, endpoint, local_address, NULL, cbcontext, cbdata, OPAL_SUCCESS);
if(src_len == dst_len) {
memcpy(dst_addr, src_addr, src_len);
/* advance src */
if(--src_cnt != 0) {
src++;
src_addr = (unsigned char*)src->seg_addr.pval;
src_len = src->seg_len;
} else {
src_len = 0;
}
/* advance dst */
if(--dst_cnt != 0) {
dst++;
dst_addr = (unsigned char*)dst->seg_addr.pval;
dst_len = dst->seg_len;
} else {
dst_len = 0;
}
} else {
size_t bytes = src_len < dst_len ? src_len : dst_len;
memcpy(dst_addr, src_addr, bytes);
/* advance src */
src_len -= bytes;
if(src_len == 0) {
if(--src_cnt != 0) {
src++;
src_addr = (unsigned char*)src->seg_addr.pval;
src_len = src->seg_len;
}
} else {
src_addr += bytes;
}
/* advance dst */
dst_len -= bytes;
if(dst_len == 0) {
if(--dst_cnt != 0) {
dst++;
dst_addr = (unsigned char*)src->seg_addr.pval;
dst_len = src->seg_len;
}
} else {
dst_addr += bytes;
}
}
}
/* rdma completion */
des->des_cbfunc( btl, endpoint, des, OPAL_SUCCESS );
if( btl_ownership ) {
mca_btl_self_free( btl, des );
}
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }
static int mca_btl_self_put (struct mca_btl_base_module_t* btl, static int mca_btl_self_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
struct mca_btl_base_endpoint_t* endpoint, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_descriptor_t* des) mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{ {
return mca_btl_self_rdma (btl, endpoint, des, des->des_local, des->des_local_count, memcpy (local_address, (void *)(intptr_t) remote_address, size);
des->des_remote, des->des_remote_count);
}
static int mca_btl_self_get (struct mca_btl_base_module_t *btl, cbfunc (btl, endpoint, local_address, NULL, cbcontext, cbdata, OPAL_SUCCESS);
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des) return OPAL_SUCCESS;
{
return mca_btl_self_rdma (btl, endpoint, des, des->des_remote, des->des_remote_count,
des->des_local, des->des_local_count);
} }
int mca_btl_self_ft_event(int state) { int mca_btl_self_ft_event(int state) {

Просмотреть файл

@ -165,24 +165,6 @@ int mca_btl_self_free(
struct mca_btl_base_descriptor_t* mca_btl_self_prepare_src( struct mca_btl_base_descriptor_t* mca_btl_self_prepare_src(
struct mca_btl_base_module_t* btl, struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint, struct mca_btl_base_endpoint_t* endpoint,
struct mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size,
uint32_t flags
);
/**
* Prepare data for RDMA
*
* @param btl (IN) BTL module
* @param peer (IN) BTL peer addressing
*/
struct mca_btl_base_descriptor_t* mca_btl_self_prepare_dst(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor, struct opal_convertor_t* convertor,
uint8_t order, uint8_t order,
size_t reserve, size_t reserve,

Просмотреть файл

@ -99,7 +99,6 @@ static int mca_btl_self_component_register(void)
mca_btl_self.btl_rdma_pipeline_frag_size = INT_MAX; mca_btl_self.btl_rdma_pipeline_frag_size = INT_MAX;
mca_btl_self.btl_min_rdma_pipeline_size = 0; mca_btl_self.btl_min_rdma_pipeline_size = 0;
mca_btl_self.btl_flags = MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND_INPLACE; mca_btl_self.btl_flags = MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND_INPLACE;
mca_btl_self.btl_seg_size = sizeof (mca_btl_base_segment_t);
mca_btl_self.btl_bandwidth = 100; mca_btl_self.btl_bandwidth = 100;
mca_btl_self.btl_latency = 0; mca_btl_self.btl_latency = 0;
mca_btl_base_param_register(&mca_btl_self_component.super.btl_version, mca_btl_base_param_register(&mca_btl_self_component.super.btl_version,

Просмотреть файл

@ -23,8 +23,8 @@ static inline void mca_btl_self_frag_constructor(mca_btl_self_frag_t* frag)
{ {
frag->segment.seg_addr.pval = frag+1; frag->segment.seg_addr.pval = frag+1;
frag->segment.seg_len = (uint32_t)frag->size; frag->segment.seg_len = (uint32_t)frag->size;
frag->base.des_local = &frag->segment; frag->base.des_segments = &frag->segment;
frag->base.des_local_count = 1; frag->base.des_segment_count = 1;
frag->base.des_flags = 0; frag->base.des_flags = 0;
} }

Просмотреть файл

@ -57,6 +57,9 @@
#include "opal/mca/mpool/base/base.h" #include "opal/mca/mpool/base/base.h"
#include "opal/mca/mpool/sm/mpool_sm.h" #include "opal/mca/mpool/sm/mpool_sm.h"
#include "opal/align.h"
#include "opal/util/sys_limits.h"
#if OPAL_ENABLE_FT_CR == 1 #if OPAL_ENABLE_FT_CR == 1
#include "opal/util/basename.h" #include "opal/util/basename.h"
#include "opal/mca/crs/base/base.h" #include "opal/mca/crs/base/base.h"
@ -81,9 +84,6 @@ mca_btl_sm_t mca_btl_sm = {
.btl_alloc = mca_btl_sm_alloc, .btl_alloc = mca_btl_sm_alloc,
.btl_free = mca_btl_sm_free, .btl_free = mca_btl_sm_free,
.btl_prepare_src = mca_btl_sm_prepare_src, .btl_prepare_src = mca_btl_sm_prepare_src,
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
.btl_prepare_dst = mca_btl_sm_prepare_dst,
#endif /* OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA */
.btl_send = mca_btl_sm_send, .btl_send = mca_btl_sm_send,
.btl_sendi = mca_btl_sm_sendi, .btl_sendi = mca_btl_sm_sendi,
.btl_dump = mca_btl_sm_dump, .btl_dump = mca_btl_sm_dump,
@ -743,7 +743,6 @@ extern int mca_btl_sm_free(
struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src( struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src(
struct mca_btl_base_module_t* btl, struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint, struct mca_btl_base_endpoint_t* endpoint,
mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor, struct opal_convertor_t* convertor,
uint8_t order, uint8_t order,
size_t reserve, size_t reserve,
@ -828,11 +827,9 @@ struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src(
} }
#endif /* OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA */ #endif /* OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA */
frag->base.des_local = &(frag->segment.base); frag->base.des_segments = &(frag->segment.base);
frag->base.des_local_count = 1; frag->base.des_segment_count = 1;
frag->base.order = MCA_BTL_NO_ORDER; frag->base.order = MCA_BTL_NO_ORDER;
frag->base.des_remote = NULL;
frag->base.des_remote_count = 0;
frag->base.des_flags = flags; frag->base.des_flags = flags;
*size = max_data; *size = max_data;
return &frag->base; return &frag->base;
@ -950,9 +947,12 @@ int mca_btl_sm_sendi( struct mca_btl_base_module_t* btl,
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }
/* presumably, this code path will never get executed */ if (NULL != descriptor) {
*descriptor = mca_btl_sm_alloc( btl, endpoint, order, /* presumably, this code path will never get executed */
payload_size + header_size, flags); *descriptor = mca_btl_sm_alloc( btl, endpoint, order,
payload_size + header_size, flags);
}
return OPAL_ERR_RESOURCE_BUSY; return OPAL_ERR_RESOURCE_BUSY;
} }
@ -1001,51 +1001,87 @@ int mca_btl_sm_send( struct mca_btl_base_module_t* btl,
} }
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA #if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_dst( mca_btl_base_registration_handle_t *mca_btl_sm_register_mem (struct mca_btl_base_module_t* btl,
struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint,
struct mca_btl_base_endpoint_t* endpoint, void *base, size_t size, uint32_t flags)
struct mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size,
uint32_t flags)
{ {
void *ptr; mca_btl_sm_registration_handle_t *handle;
mca_btl_sm_frag_t* frag; mca_btl_sm_t *sm_btl = (mca_btl_sm_t *) btl;
ompi_free_list_item_t *item = NULL;
MCA_BTL_SM_FRAG_ALLOC_USER(frag); OMPI_FREE_LIST_GET_MT(&mca_btl_sm_component.registration_handles, item);
if(OPAL_UNLIKELY(NULL == frag)) { if (OPAL_UNLIKELY(NULL == item)) {
return NULL; return NULL;
} }
frag->segment.base.seg_len = *size; handle = (mca_btl_sm_registration_handle_t *) item;
opal_convertor_get_current_pointer( convertor, &ptr );
frag->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) ptr; #if OPAL_BTL_SM_HAVE_KNEM
if (OPAL_LIKELY(mca_btl_sm_component.use_knem)) {
frag->base.des_remote = NULL; struct knem_cmd_create_region knem_cr;
frag->base.des_remote_count = 0; struct knem_cmd_param_iovec knem_iov;
frag->base.des_local = (mca_btl_base_segment_t*)&frag->segment;
frag->base.des_local_count = 1; knem_iov.base = (uintptr_t)base & ~(opal_getpagesize() - 1);
frag->base.des_flags = flags; knem_iov.len = OPAL_ALIGN(size + ((intptr_t) base - knem_iov.base), opal_getpagesize(), intptr_t);
return &frag->base; knem_cr.iovec_array = (uintptr_t)&knem_iov;
knem_cr.iovec_nr = 1;
knem_cr.flags = 0;
knem_cr.protection = 0;
if (flags & MCA_BTL_REG_FLAG_REMOTE_READ) {
knem_cr.protection |= PROT_READ;
}
if (flags & MCA_BTL_REG_FLAG_REMOTE_WRITE) {
knem_cr.protection |= PROT_WRITE;
}
if (OPAL_UNLIKELY(ioctl(sm_btl->knem_fd, KNEM_CMD_CREATE_REGION, &knem_cr) < 0)) {
OMPI_FREE_LIST_RETURN_MT(&mca_btl_sm_component.registration_handles, item);
return NULL;
}
handle->btl_handle.data.knem.cookie = knem_cr.cookie;
handle->btl_handle.data.knem.base_addr = knem_iov.base;
} else
#endif
{
/* the pid could be included in a modex but this will work until btl/sm is
* deleted */
handle->btl_handle.data.pid = getpid ();
}
/* return the public part of the handle */
return &handle->btl_handle;
} }
int mca_btl_sm_deregister_mem (struct mca_btl_base_module_t* btl, mca_btl_base_registration_handle_t *handle)
{
mca_btl_sm_registration_handle_t *sm_handle =
(mca_btl_sm_registration_handle_t *)((intptr_t) handle - offsetof (mca_btl_sm_registration_handle_t, btl_handle));
mca_btl_sm_t* sm_btl = (mca_btl_sm_t*) btl;
#if OPAL_BTL_SM_HAVE_KNEM
if (OPAL_LIKELY(mca_btl_sm_component.use_knem)) {
(void) ioctl(sm_btl->knem_fd, KNEM_CMD_DESTROY_REGION, &handle->data.knem.cookie);
}
#endif
OMPI_FREE_LIST_RETURN_MT(&mca_btl_sm_component.registration_handles, &sm_handle->super);
return OPAL_SUCCESS;
}
#endif /* OPAL_BTL_SM_HAVE_KNEM */
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
/** /**
* Initiate an synchronous get. * Initiate an synchronous get.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/ */
int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl, int mca_btl_sm_get_sync (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
struct mca_btl_base_endpoint_t* endpoint, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_descriptor_t* des) mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{ {
int btl_ownership;
mca_btl_sm_frag_t* frag = (mca_btl_sm_frag_t*)des;
mca_btl_sm_segment_t *src = (mca_btl_sm_segment_t*)des->des_remote;
mca_btl_sm_segment_t *dst = (mca_btl_sm_segment_t*)des->des_local;
#if OPAL_BTL_SM_HAVE_KNEM #if OPAL_BTL_SM_HAVE_KNEM
mca_btl_sm_t* sm_btl = (mca_btl_sm_t*) btl; mca_btl_sm_t* sm_btl = (mca_btl_sm_t*) btl;
if (OPAL_LIKELY(mca_btl_sm_component.use_knem)) { if (OPAL_LIKELY(mca_btl_sm_component.use_knem)) {
@ -1054,12 +1090,12 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl,
/* Fill in the ioctl data fields. There's no async completion, so /* Fill in the ioctl data fields. There's no async completion, so
we don't need to worry about getting a slot, etc. */ we don't need to worry about getting a slot, etc. */
recv_iovec.base = (uintptr_t) dst->base.seg_addr.lval; recv_iovec.base = (uintptr_t) local_address;
recv_iovec.len = dst->base.seg_len; recv_iovec.len = size;
icopy.local_iovec_array = (uintptr_t)&recv_iovec; icopy.local_iovec_array = (uintptr_t)&recv_iovec;
icopy.local_iovec_nr = 1; icopy.local_iovec_nr = 1;
icopy.remote_cookie = src->key; icopy.remote_cookie = remote_handle->data.knem.cookie;
icopy.remote_offset = 0; icopy.remote_offset = remote_address - remote_handle->data.knem.base_addr;
icopy.write = 0; icopy.write = 0;
/* Use the DMA flag if knem supports it *and* the segment length /* Use the DMA flag if knem supports it *and* the segment length
@ -1067,7 +1103,7 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl,
value is 0 (i.e., the MCA param was set to 0), the segment size value is 0 (i.e., the MCA param was set to 0), the segment size
will never be larger than it, so DMA will never be used. */ will never be larger than it, so DMA will never be used. */
icopy.flags = 0; icopy.flags = 0;
if (mca_btl_sm_component.knem_dma_min <= dst->base.seg_len) { if (mca_btl_sm_component.knem_dma_min <= size) {
icopy.flags = mca_btl_sm_component.knem_dma_flag; icopy.flags = mca_btl_sm_component.knem_dma_flag;
} }
/* synchronous flags only, no need to specify icopy.async_status_index */ /* synchronous flags only, no need to specify icopy.async_status_index */
@ -1085,27 +1121,19 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl,
#if OPAL_BTL_SM_HAVE_CMA #if OPAL_BTL_SM_HAVE_CMA
if (OPAL_LIKELY(mca_btl_sm_component.use_cma)) { if (OPAL_LIKELY(mca_btl_sm_component.use_cma)) {
char *remote_address, *local_address;
int remote_length, local_length;
struct iovec local, remote; struct iovec local, remote;
pid_t remote_pid; pid_t remote_pid;
int val; int val;
remote_address = (char *)(uintptr_t) src->base.seg_addr.lval; remote_pid = remote_handle->data.pid;
remote_length = src->base.seg_len; remote.iov_base = (void *) (intptr_t) remote_address;
remote.iov_len = size;
local_address = (char *)(uintptr_t) dst->base.seg_addr.lval;
local_length = dst->base.seg_len;
remote_pid = src->key;
remote.iov_base = remote_address;
remote.iov_len = remote_length;
local.iov_base = local_address; local.iov_base = local_address;
local.iov_len = local_length; local.iov_len = size;
val = process_vm_readv(remote_pid, &local, 1, &remote, 1, 0); val = process_vm_readv(remote_pid, &local, 1, &remote, 1, 0);
if (val != local_length) { if (val != size) {
if (val<0) { if (val<0) {
opal_output(0, "mca_btl_sm_get_sync: process_vm_readv failed: %i", opal_output(0, "mca_btl_sm_get_sync: process_vm_readv failed: %i",
errno); errno);
@ -1119,15 +1147,7 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl,
} }
#endif /* OPAL_BTL_SM_HAVE_CMA */ #endif /* OPAL_BTL_SM_HAVE_CMA */
btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);
if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) {
frag->base.des_cbfunc(&mca_btl_sm.super,
frag->endpoint, &frag->base,
OPAL_SUCCESS);
}
if (btl_ownership) {
MCA_BTL_SM_FRAG_RETURN(frag);
}
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }
@ -1139,34 +1159,42 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl,
/** /**
* Initiate an asynchronous get. * Initiate an asynchronous get.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/ */
int mca_btl_sm_get_async(struct mca_btl_base_module_t* btl, int mca_btl_sm_get_async (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
struct mca_btl_base_endpoint_t* endpoint, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_descriptor_t* des) mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{ {
int btl_ownership;
mca_btl_sm_t* sm_btl = (mca_btl_sm_t*) btl; mca_btl_sm_t* sm_btl = (mca_btl_sm_t*) btl;
mca_btl_sm_frag_t* frag = (mca_btl_sm_frag_t*)des; mca_btl_sm_frag_t* frag;
mca_btl_sm_segment_t *src = (mca_btl_sm_segment_t*)des->des_remote;
mca_btl_sm_segment_t *dst = (mca_btl_sm_segment_t*)des->des_local;
struct knem_cmd_inline_copy icopy; struct knem_cmd_inline_copy icopy;
struct knem_cmd_param_iovec recv_iovec; struct knem_cmd_param_iovec recv_iovec;
/* If we have no knem slots available, return /* If we have no knem slots available, fall back to synchronous */
TEMP_OUT_OF_RESOURCE */
if (sm_btl->knem_status_num_used >= if (sm_btl->knem_status_num_used >=
mca_btl_sm_component.knem_max_simultaneous) { mca_btl_sm_component.knem_max_simultaneous) {
return OPAL_ERR_TEMP_OUT_OF_RESOURCE; return mca_btl_sm_get_sync (btl, endpoint, local_address, remote_address, local_handle,
remote_handle, size, flags, order, cbfunc, cbcontext, cbdata);
} }
/* allocate a fragment to keep track of this transaction */
MCA_BTL_SM_FRAG_ALLOC_USER(frag);
if (OPAL_UNLIKELY(NULL == frag)) {
return mca_btl_sm_get_sync (btl, endpoint, local_address, remote_address, local_handle,
remote_handle, size, flags, order, cbfunc, cbcontext, cbdata);
}
/* fill in callback data */
frag->cb.func = cbfunc;
frag->cb.context = cbcontext;
frag->cb.data = cbdata;
frag->cb.local_address = local_address;
frag->cb.local_handle = local_handle;
/* We have a slot, so fill in the data fields. Bump the /* We have a slot, so fill in the data fields. Bump the
first_avail and num_used counters. */ first_avail and num_used counters. */
recv_iovec.base = (uintptr_t) dst->base.seg_addr.lval; recv_iovec.base = (uintptr_t) local_address;
recv_iovec.len = dst->base.seg_len; recv_iovec.len = size;
icopy.local_iovec_array = (uintptr_t)&recv_iovec; icopy.local_iovec_array = (uintptr_t)&recv_iovec;
icopy.local_iovec_nr = 1; icopy.local_iovec_nr = 1;
icopy.write = 0; icopy.write = 0;
@ -1176,13 +1204,13 @@ int mca_btl_sm_get_async(struct mca_btl_base_module_t* btl,
sm_btl->knem_status_first_avail = 0; sm_btl->knem_status_first_avail = 0;
} }
++sm_btl->knem_status_num_used; ++sm_btl->knem_status_num_used;
icopy.remote_cookie = src->key; icopy.remote_cookie = remote_handle->data.knem.cookie;
icopy.remote_offset = 0; icopy.remote_offset = remote_address - remote_handle->data.knem.base_addr;
/* Use the DMA flag if knem supports it *and* the segment length /* Use the DMA flag if knem supports it *and* the segment length
is greater than the cutoff */ is greater than the cutoff */
icopy.flags = KNEM_FLAG_ASYNCDMACOMPLETE; icopy.flags = KNEM_FLAG_ASYNCDMACOMPLETE;
if (mca_btl_sm_component.knem_dma_min <= dst->base.seg_len) { if (mca_btl_sm_component.knem_dma_min <= size) {
icopy.flags = mca_btl_sm_component.knem_dma_flag; icopy.flags = mca_btl_sm_component.knem_dma_flag;
} }
@ -1190,19 +1218,11 @@ int mca_btl_sm_get_async(struct mca_btl_base_module_t* btl,
if (OPAL_LIKELY(0 == ioctl(sm_btl->knem_fd, if (OPAL_LIKELY(0 == ioctl(sm_btl->knem_fd,
KNEM_CMD_INLINE_COPY, &icopy))) { KNEM_CMD_INLINE_COPY, &icopy))) {
if (icopy.current_status != KNEM_STATUS_PENDING) { if (icopy.current_status != KNEM_STATUS_PENDING) {
MCA_BTL_SM_FRAG_RETURN(frag);
/* request completed synchronously */ /* request completed synchronously */
/* FIXME: what if icopy.current_status == KNEM_STATUS_FAILED? */ /* FIXME: what if icopy.current_status == KNEM_STATUS_FAILED? */
cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);
btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) {
frag->base.des_cbfunc(&mca_btl_sm.super,
frag->endpoint, &frag->base,
OPAL_SUCCESS);
}
if (btl_ownership) {
MCA_BTL_SM_FRAG_RETURN(frag);
}
--sm_btl->knem_status_num_used; --sm_btl->knem_status_num_used;
++sm_btl->knem_status_first_used; ++sm_btl->knem_status_first_used;

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
@ -11,7 +12,7 @@
* All rights reserved. * All rights reserved.
* Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved.
* Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2013 Los Alamos National Security, LLC. * Copyright (c) 2010-2014 Los Alamos National Security, LLC.
* All rights reserved. * All rights reserved.
* Copyright (c) 2010-2012 IBM Corporation. All rights reserved. * Copyright (c) 2010-2012 IBM Corporation. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
@ -182,6 +183,10 @@ struct mca_btl_sm_component_t {
#if OPAL_BTL_SM_HAVE_KNEM #if OPAL_BTL_SM_HAVE_KNEM
/* Knem capabilities info */ /* Knem capabilities info */
struct knem_cmd_info knem_info; struct knem_cmd_info knem_info;
#endif
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
/** registration handles to hold knem cookies */
ompi_free_list_t registration_handles;
#endif /* OPAL_BTL_SM_HAVE_KNEM */ #endif /* OPAL_BTL_SM_HAVE_KNEM */
/** MCA: should we be using knem or not? neg=try but continue if /** MCA: should we be using knem or not? neg=try but continue if
@ -461,7 +466,6 @@ extern int mca_btl_sm_free(
struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src( struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src(
struct mca_btl_base_module_t* btl, struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint, struct mca_btl_base_endpoint_t* endpoint,
mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor, struct opal_convertor_t* convertor,
uint8_t order, uint8_t order,
size_t reserve, size_t reserve,
@ -504,30 +508,20 @@ extern int mca_btl_sm_send(
/* /*
* Synchronous knem/cma get * Synchronous knem/cma get
*/ */
extern int mca_btl_sm_get_sync( int mca_btl_sm_get_sync (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
struct mca_btl_base_module_t* btl, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_endpoint_t* endpoint, mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
struct mca_btl_base_descriptor_t* des ); int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
extern struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_dst(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size,
uint32_t flags);
#endif /* OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA */ #endif /* OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA */
#if OPAL_BTL_SM_HAVE_KNEM #if OPAL_BTL_SM_HAVE_KNEM
/* /*
* Asynchronous knem get * Asynchronous knem get
*/ */
extern int mca_btl_sm_get_async( int mca_btl_sm_get_async (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
struct mca_btl_base_module_t* btl, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_endpoint_t* endpoint, mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
struct mca_btl_base_descriptor_t* des ); int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
#endif /* OPAL_BTL_SM_HAVE_KNEM */ #endif /* OPAL_BTL_SM_HAVE_KNEM */
@ -558,6 +552,31 @@ void mca_btl_sm_component_event_thread(opal_object_t*);
#define MCA_BTL_SM_SIGNAL_PEER(peer) #define MCA_BTL_SM_SIGNAL_PEER(peer)
#endif #endif
#if OPAL_BTL_SM_HAVE_KNEM | OPAL_BTL_SM_HAVE_CMA
struct mca_btl_base_registration_handle_t {
union {
struct {
uint64_t cookie;
intptr_t base_addr;
} knem;
pid_t pid;
} data;
};
struct mca_btl_sm_registration_handle_t {
ompi_free_list_item_t super;
mca_btl_base_registration_handle_t btl_handle;
};
typedef struct mca_btl_sm_registration_handle_t mca_btl_sm_registration_handle_t;
mca_btl_base_registration_handle_t *mca_btl_sm_register_mem (struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
void *base, size_t size, uint32_t flags);
int mca_btl_sm_deregister_mem (struct mca_btl_base_module_t* btl, mca_btl_base_registration_handle_t *handle);
#endif
END_C_DECLS END_C_DECLS
#endif #endif

Просмотреть файл

@ -67,6 +67,10 @@
#include "opal/mca/common/cuda/common_cuda.h" #include "opal/mca/common/cuda/common_cuda.h"
#endif /* OPAL_CUDA_SUPPORT */ #endif /* OPAL_CUDA_SUPPORT */
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
static OBJ_CLASS_INSTANCE(mca_btl_sm_registration_handle_t, ompi_free_list_item_t, NULL, NULL);
#endif
static int mca_btl_sm_component_open(void); static int mca_btl_sm_component_open(void);
static int mca_btl_sm_component_close(void); static int mca_btl_sm_component_close(void);
static int sm_register(void); static int sm_register(void);
@ -251,10 +255,13 @@ static int sm_register(void)
mca_btl_sm.super.btl_rdma_pipeline_frag_size = 64*1024; mca_btl_sm.super.btl_rdma_pipeline_frag_size = 64*1024;
mca_btl_sm.super.btl_min_rdma_pipeline_size = 64*1024; mca_btl_sm.super.btl_min_rdma_pipeline_size = 64*1024;
mca_btl_sm.super.btl_flags = MCA_BTL_FLAGS_SEND; mca_btl_sm.super.btl_flags = MCA_BTL_FLAGS_SEND;
mca_btl_sm.super.btl_seg_size = sizeof (mca_btl_sm_segment_t);
mca_btl_sm.super.btl_bandwidth = 9000; /* Mbs */ mca_btl_sm.super.btl_bandwidth = 9000; /* Mbs */
mca_btl_sm.super.btl_latency = 1; /* Microsecs */ mca_btl_sm.super.btl_latency = 1; /* Microsecs */
#if OPAL_BTL_SM_HAVE_KNEM
mca_btl_sm.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
#endif
/* Call the BTL based to register its MCA params */ /* Call the BTL based to register its MCA params */
mca_btl_base_param_register(&mca_btl_sm_component.super.btl_version, mca_btl_base_param_register(&mca_btl_sm_component.super.btl_version,
&mca_btl_sm.super); &mca_btl_sm.super);
@ -295,6 +302,11 @@ static int mca_btl_sm_component_open(void)
OBJ_CONSTRUCT(&mca_btl_sm_component.pending_send_fl, opal_free_list_t); OBJ_CONSTRUCT(&mca_btl_sm_component.pending_send_fl, opal_free_list_t);
mca_btl_sm_component.sm_seg = NULL; mca_btl_sm_component.sm_seg = NULL;
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
OBJ_CONSTRUCT(&mca_btl_sm_component.registration_handles, ompi_free_list_t);
#endif
#if OPAL_BTL_SM_HAVE_KNEM #if OPAL_BTL_SM_HAVE_KNEM
mca_btl_sm.knem_fd = -1; mca_btl_sm.knem_fd = -1;
mca_btl_sm.knem_status_array = NULL; mca_btl_sm.knem_status_array = NULL;
@ -332,6 +344,10 @@ static int mca_btl_sm_component_close(void)
} }
#endif /* OPAL_BTL_SM_HAVE_KNEM */ #endif /* OPAL_BTL_SM_HAVE_KNEM */
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
OBJ_DESTRUCT(&mca_btl_sm_component.registration_handles);
#endif
OBJ_DESTRUCT(&mca_btl_sm_component.sm_lock); OBJ_DESTRUCT(&mca_btl_sm_component.sm_lock);
/** /**
* We don't have to destroy the fragment lists. They are allocated * We don't have to destroy the fragment lists. They are allocated
@ -904,6 +920,9 @@ mca_btl_sm_component_init(int *num_btls,
} else { } else {
mca_btl_sm.super.btl_get = mca_btl_sm_get_sync; mca_btl_sm.super.btl_get = mca_btl_sm_get_sync;
} }
mca_btl_sm.super.btl_register_mem = mca_btl_sm_register_mem;
mca_btl_sm.super.btl_deregister_mem = mca_btl_sm_deregister_mem;
} }
#else #else
/* If the user explicitly asked for knem and we can't provide it, /* If the user explicitly asked for knem and we can't provide it,
@ -918,6 +937,8 @@ mca_btl_sm_component_init(int *num_btls,
/* Will only ever have either cma or knem enabled at runtime /* Will only ever have either cma or knem enabled at runtime
so no problems with accidentally overwriting this set earlier */ so no problems with accidentally overwriting this set earlier */
mca_btl_sm.super.btl_get = mca_btl_sm_get_sync; mca_btl_sm.super.btl_get = mca_btl_sm_get_sync;
mca_btl_sm.super.btl_register_mem = mca_btl_sm_register_mem;
mca_btl_sm.super.btl_deregister_mem = mca_btl_sm_deregister_mem;
} }
#else #else
/* If the user explicitly asked for CMA and we can't provide itm /* If the user explicitly asked for CMA and we can't provide itm
@ -931,6 +952,21 @@ mca_btl_sm_component_init(int *num_btls,
} }
#endif /* OPAL_BTL_SM_HAVE_CMA */ #endif /* OPAL_BTL_SM_HAVE_CMA */
#if OPAL_BTL_SM_HAVE_KNEM | OPAL_BTL_SM_HAVE_CMA
if (mca_btl_sm_component.use_cma || mca_btl_sm_component.use_knem) {
rc = ompi_free_list_init_new (&mca_btl_sm_component.registration_handles,
sizeof (mca_btl_sm_registration_handle_t),
8, OBJ_CLASS(mca_btl_sm_registration_handle_t),
0, 0, mca_btl_sm_component.sm_free_list_num,
mca_btl_sm_component.sm_free_list_max,
mca_btl_sm_component.sm_free_list_inc, NULL);
if (OPAL_SUCCESS != rc) {
free (btls);
return NULL;
}
}
#endif
return btls; return btls;
no_knem: no_knem:
@ -963,6 +999,7 @@ mca_btl_sm_component_init(int *num_btls,
/* disable get when not using knem or cma */ /* disable get when not using knem or cma */
mca_btl_sm.super.btl_get = NULL; mca_btl_sm.super.btl_get = NULL;
mca_btl_sm.super.btl_flags &= ~MCA_BTL_FLAGS_GET; mca_btl_sm.super.btl_flags &= ~MCA_BTL_FLAGS_GET;
mca_btl_sm_component.use_knem = 0;
} }
/* Otherwise, use_knem was 0 (and we didn't get here) or use_knem /* Otherwise, use_knem was 0 (and we didn't get here) or use_knem
@ -1090,8 +1127,8 @@ int mca_btl_sm_component_progress(void)
reg = mca_btl_base_active_message_trigger + hdr->tag; reg = mca_btl_base_active_message_trigger + hdr->tag;
seg.seg_addr.pval = ((char *)hdr) + sizeof(mca_btl_sm_hdr_t); seg.seg_addr.pval = ((char *)hdr) + sizeof(mca_btl_sm_hdr_t);
seg.seg_len = hdr->len; seg.seg_len = hdr->len;
Frag.base.des_local_count = 1; Frag.base.des_segment_count = 1;
Frag.base.des_local = &seg; Frag.base.des_segments = &seg;
reg->cbfunc(&mca_btl_sm.super, hdr->tag, &(Frag.base), reg->cbfunc(&mca_btl_sm.super, hdr->tag, &(Frag.base),
reg->cbdata); reg->cbdata);
/* return the fragment */ /* return the fragment */
@ -1176,22 +1213,14 @@ int mca_btl_sm_component_progress(void)
mca_btl_sm.knem_status_array[mca_btl_sm.knem_status_first_used]) { mca_btl_sm.knem_status_array[mca_btl_sm.knem_status_first_used]) {
if (KNEM_STATUS_SUCCESS == if (KNEM_STATUS_SUCCESS ==
mca_btl_sm.knem_status_array[mca_btl_sm.knem_status_first_used]) { mca_btl_sm.knem_status_array[mca_btl_sm.knem_status_first_used]) {
int btl_ownership;
/* Handle the completed fragment */ /* Handle the completed fragment */
frag = frag =
mca_btl_sm.knem_frag_array[mca_btl_sm.knem_status_first_used]; mca_btl_sm.knem_frag_array[mca_btl_sm.knem_status_first_used];
btl_ownership = (frag->base.des_flags & frag->cb.func (&mca_btl_sm.super, frag->endpoint,
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); frag->cb.local_address, frag->cb.local_handle,
if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->cb.context, frag->cb.data, OPAL_SUCCESS);
frag->base.des_flags)) { MCA_BTL_SM_FRAG_RETURN(frag);
frag->base.des_cbfunc(&mca_btl_sm.super,
frag->endpoint, &frag->base,
OPAL_SUCCESS);
}
if (btl_ownership) {
MCA_BTL_SM_FRAG_RETURN(frag);
}
/* Bump counters, loop around the circular buffer if /* Bump counters, loop around the circular buffer if
necessary */ necessary */

Просмотреть файл

@ -10,6 +10,8 @@
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow

Просмотреть файл

@ -31,8 +31,8 @@ static inline void mca_btl_sm_frag_common_constructor(mca_btl_sm_frag_t* frag)
frag->hdr->my_smp_rank = mca_btl_sm_component.my_smp_rank; frag->hdr->my_smp_rank = mca_btl_sm_component.my_smp_rank;
} }
frag->segment.base.seg_len = frag->size; frag->segment.base.seg_len = frag->size;
frag->base.des_local = &frag->segment.base; frag->base.des_segments = &frag->segment.base;
frag->base.des_local_count = 1; frag->base.des_segment_count = 1;
frag->base.des_flags = 0; frag->base.des_flags = 0;
} }

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
@ -11,6 +12,8 @@
* All rights reserved. * All rights reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -64,6 +67,16 @@ struct mca_btl_sm_frag_t {
/* pointer written to the FIFO, this is the base of the shared memory region */ /* pointer written to the FIFO, this is the base of the shared memory region */
mca_btl_sm_hdr_t *hdr; mca_btl_sm_hdr_t *hdr;
ompi_free_list_t* my_list; ompi_free_list_t* my_list;
#if OPAL_BTL_SM_HAVE_KNEM
/* rdma callback data. required for async get */
struct {
mca_btl_base_rdma_completion_fn_t func;
void *local_address;
struct mca_btl_base_registration_handle_t *local_handle;
void *context;
void *data;
} cb;
#endif
}; };
typedef struct mca_btl_sm_frag_t mca_btl_sm_frag_t; typedef struct mca_btl_sm_frag_t mca_btl_sm_frag_t;
typedef struct mca_btl_sm_frag_t mca_btl_sm_frag1_t; typedef struct mca_btl_sm_frag_t mca_btl_sm_frag1_t;

Просмотреть файл

@ -832,8 +832,8 @@ struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src(
} }
#endif /* OPAL_CUDA_SUPPORT */ #endif /* OPAL_CUDA_SUPPORT */
frag->base.des_local = &(frag->segment.base); frag->base.des_segments = &(frag->segment.base);
frag->base.des_local_count = 1; frag->base.des_segment_count = 1;
frag->base.order = MCA_BTL_NO_ORDER; frag->base.order = MCA_BTL_NO_ORDER;
frag->base.des_remote = NULL; frag->base.des_remote = NULL;
frag->base.des_remote_count = 0; frag->base.des_remote_count = 0;
@ -1045,8 +1045,8 @@ struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_dst(
frag->base.des_remote = NULL; frag->base.des_remote = NULL;
frag->base.des_remote_count = 0; frag->base.des_remote_count = 0;
frag->base.des_local = &frag->segment.base; frag->base.des_segments = &frag->segment.base;
frag->base.des_local_count = 1; frag->base.des_segment_count = 1;
frag->base.des_flags = flags; frag->base.des_flags = flags;
return &frag->base; return &frag->base;
} }
@ -1059,7 +1059,7 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t* btl,
struct mca_btl_base_descriptor_t* descriptor) struct mca_btl_base_descriptor_t* descriptor)
{ {
mca_btl_smcuda_segment_t *src_seg = (mca_btl_smcuda_segment_t *) descriptor->des_remote; mca_btl_smcuda_segment_t *src_seg = (mca_btl_smcuda_segment_t *) descriptor->des_remote;
mca_btl_smcuda_segment_t *dst_seg = (mca_btl_smcuda_segment_t *) descriptor->des_local; mca_btl_smcuda_segment_t *dst_seg = (mca_btl_smcuda_segment_t *) descriptor->des_segments;
mca_mpool_common_cuda_reg_t rget_reg; mca_mpool_common_cuda_reg_t rget_reg;
mca_mpool_common_cuda_reg_t *reg_ptr = &rget_reg; mca_mpool_common_cuda_reg_t *reg_ptr = &rget_reg;
int btl_ownership; int btl_ownership;

Просмотреть файл

@ -691,7 +691,7 @@ static void btl_smcuda_control(mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t *endpoint; struct mca_btl_base_endpoint_t *endpoint;
mca_btl_smcuda_t *smcuda_btl = (mca_btl_smcuda_t *)btl; mca_btl_smcuda_t *smcuda_btl = (mca_btl_smcuda_t *)btl;
mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des;
mca_btl_base_segment_t* segments = des->des_local; mca_btl_base_segment_t* segments = des->des_segments;
/* Use the rank of the peer that sent the data to get to the endpoint /* Use the rank of the peer that sent the data to get to the endpoint
* structure. This is needed for PML callback. */ * structure. This is needed for PML callback. */
@ -1065,8 +1065,8 @@ int mca_btl_smcuda_component_progress(void)
reg = mca_btl_base_active_message_trigger + hdr->tag; reg = mca_btl_base_active_message_trigger + hdr->tag;
seg.seg_addr.pval = ((char *)hdr) + sizeof(mca_btl_smcuda_hdr_t); seg.seg_addr.pval = ((char *)hdr) + sizeof(mca_btl_smcuda_hdr_t);
seg.seg_len = hdr->len; seg.seg_len = hdr->len;
Frag.base.des_local_count = 1; Frag.base.des_segment_count = 1;
Frag.base.des_local = &seg; Frag.base.des_segments = &seg;
#if OPAL_CUDA_SUPPORT #if OPAL_CUDA_SUPPORT
Frag.hdr = hdr; /* needed for peer rank in control messages */ Frag.hdr = hdr; /* needed for peer rank in control messages */
#endif /* OPAL_CUDA_SUPPORT */ #endif /* OPAL_CUDA_SUPPORT */

Просмотреть файл

@ -32,8 +32,8 @@ static inline void mca_btl_smcuda_frag_common_constructor(mca_btl_smcuda_frag_t*
frag->hdr->my_smp_rank = mca_btl_smcuda_component.my_smp_rank; frag->hdr->my_smp_rank = mca_btl_smcuda_component.my_smp_rank;
} }
frag->segment.base.seg_len = frag->size; frag->segment.base.seg_len = frag->size;
frag->base.des_local = &frag->segment.base; frag->base.des_segments = &frag->segment.base;
frag->base.des_local_count = 1; frag->base.des_segment_count = 1;
frag->base.des_flags = 0; frag->base.des_flags = 0;
#if OPAL_CUDA_SUPPORT #if OPAL_CUDA_SUPPORT
frag->registration = NULL; frag->registration = NULL;

Просмотреть файл

@ -42,7 +42,6 @@ mca_btl_tcp_module_t mca_btl_tcp_module = {
.btl_alloc = mca_btl_tcp_alloc, .btl_alloc = mca_btl_tcp_alloc,
.btl_free = mca_btl_tcp_free, .btl_free = mca_btl_tcp_free,
.btl_prepare_src = mca_btl_tcp_prepare_src, .btl_prepare_src = mca_btl_tcp_prepare_src,
.btl_prepare_dst = mca_btl_tcp_prepare_dst,
.btl_send = mca_btl_tcp_send, .btl_send = mca_btl_tcp_send,
.btl_put = mca_btl_tcp_put, .btl_put = mca_btl_tcp_put,
.btl_dump = mca_btl_base_dump, .btl_dump = mca_btl_base_dump,
@ -170,8 +169,8 @@ mca_btl_base_descriptor_t* mca_btl_tcp_alloc(
frag->segments[0].seg_len = size; frag->segments[0].seg_len = size;
frag->segments[0].seg_addr.pval = frag+1; frag->segments[0].seg_addr.pval = frag+1;
frag->base.des_local = frag->segments; frag->base.des_segments = frag->segments;
frag->base.des_local_count = 1; frag->base.des_segment_count = 1;
frag->base.des_flags = flags; frag->base.des_flags = flags;
frag->base.order = MCA_BTL_NO_ORDER; frag->base.order = MCA_BTL_NO_ORDER;
frag->btl = (mca_btl_tcp_module_t*)btl; frag->btl = (mca_btl_tcp_module_t*)btl;
@ -202,7 +201,6 @@ int mca_btl_tcp_free(
mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src( mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src(
struct mca_btl_base_module_t* btl, struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint, struct mca_btl_base_endpoint_t* endpoint,
struct mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor, struct opal_convertor_t* convertor,
uint8_t order, uint8_t order,
size_t reserve, size_t reserve,
@ -238,7 +236,7 @@ mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src(
frag->segments[0].seg_addr.pval = (frag + 1); frag->segments[0].seg_addr.pval = (frag + 1);
frag->segments[0].seg_len = reserve; frag->segments[0].seg_len = reserve;
frag->base.des_local_count = 1; frag->base.des_segment_count = 1;
if(opal_convertor_need_buffers(convertor)) { if(opal_convertor_need_buffers(convertor)) {
if (max_data + reserve > frag->size) { if (max_data + reserve > frag->size) {
@ -268,66 +266,16 @@ mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src(
frag->segments[1].seg_addr.pval = iov.iov_base; frag->segments[1].seg_addr.pval = iov.iov_base;
frag->segments[1].seg_len = max_data; frag->segments[1].seg_len = max_data;
frag->base.des_local_count = 2; frag->base.des_segment_count = 2;
} }
frag->base.des_local = frag->segments; frag->base.des_segments = frag->segments;
frag->base.des_remote = NULL;
frag->base.des_remote_count = 0;
frag->base.des_flags = flags; frag->base.des_flags = flags;
frag->base.order = MCA_BTL_NO_ORDER; frag->base.order = MCA_BTL_NO_ORDER;
*size = max_data; *size = max_data;
return &frag->base; return &frag->base;
} }
/**
* Prepare a descriptor for send/rdma using the supplied
* convertor. If the convertor references data that is contigous,
* the descriptor may simply point to the user buffer. Otherwise,
* this routine is responsible for allocating buffer space and
* packing if required.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL peer addressing
* @param convertor (IN) Data type convertor
* @param reserve (IN) Additional bytes requested by upper layer to precede user data
* @param size (IN/OUT) Number of bytes to prepare (IN), number of bytes actually prepared (OUT)
*/
mca_btl_base_descriptor_t* mca_btl_tcp_prepare_dst(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size,
uint32_t flags)
{
mca_btl_tcp_frag_t* frag;
if( OPAL_UNLIKELY((*size) > UINT32_MAX) ) { /* limit the size to what we support */
*size = (size_t)UINT32_MAX;
}
MCA_BTL_TCP_FRAG_ALLOC_USER(frag);
if( OPAL_UNLIKELY(NULL == frag) ) {
return NULL;
}
frag->segments->seg_len = *size;
opal_convertor_get_current_pointer( convertor, (void**)&(frag->segments->seg_addr.pval) );
frag->base.des_remote = NULL;
frag->base.des_remote_count = 0;
frag->base.des_local = frag->segments;
frag->base.des_local_count = 1;
frag->base.des_flags = flags;
frag->base.order = MCA_BTL_NO_ORDER;
return &frag->base;
}
/** /**
* Initiate an asynchronous send. * Initiate an asynchronous send.
* *
@ -355,7 +303,7 @@ int mca_btl_tcp_send( struct mca_btl_base_module_t* btl,
frag->iov[0].iov_base = (IOVBASE_TYPE*)&frag->hdr; frag->iov[0].iov_base = (IOVBASE_TYPE*)&frag->hdr;
frag->iov[0].iov_len = sizeof(frag->hdr); frag->iov[0].iov_len = sizeof(frag->hdr);
frag->hdr.size = 0; frag->hdr.size = 0;
for( i = 0; i < (int)frag->base.des_local_count; i++) { for( i = 0; i < (int)frag->base.des_segment_count; i++) {
frag->hdr.size += frag->segments[i].seg_len; frag->hdr.size += frag->segments[i].seg_len;
frag->iov[i+1].iov_len = frag->segments[i].seg_len; frag->iov[i+1].iov_len = frag->segments[i].seg_len;
frag->iov[i+1].iov_base = (IOVBASE_TYPE*)frag->segments[i].seg_addr.pval; frag->iov[i+1].iov_base = (IOVBASE_TYPE*)frag->segments[i].seg_addr.pval;
@ -368,23 +316,55 @@ int mca_btl_tcp_send( struct mca_btl_base_module_t* btl,
return mca_btl_tcp_endpoint_send(endpoint,frag); return mca_btl_tcp_endpoint_send(endpoint,frag);
} }
static void fake_rdma_complete (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
mca_btl_base_descriptor_t *desc, int rc)
{
mca_btl_tcp_frag_t *frag = (mca_btl_tcp_frag_t *) desc;
frag->cb.func (btl, endpoint, frag->segments[0].seg_addr.pval, NULL, frag->cb.context, frag->cb.data,
rc);
}
/** /**
* Initiate an asynchronous put. * Initiate an asynchronous put.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/ */
int mca_btl_tcp_put( mca_btl_base_module_t* btl, int mca_btl_tcp_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
mca_btl_base_endpoint_t* endpoint, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_descriptor_t* descriptor ) mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{ {
mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*) btl; mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*) btl;
mca_btl_tcp_frag_t* frag = (mca_btl_tcp_frag_t*)descriptor; mca_btl_tcp_frag_t *frag = NULL;
int i; int i;
MCA_BTL_TCP_FRAG_ALLOC_USER(frag);
if( OPAL_UNLIKELY(NULL == frag) ) {
return OPAL_ERR_OUT_OF_RESOURCE;;
}
frag->endpoint = endpoint;
frag->segments->seg_len = size;
frag->segments->seg_addr.pval = local_address;
frag->base.des_segments = frag->segments;
frag->base.des_segment_count = 1;
frag->base.order = MCA_BTL_NO_ORDER;
frag->segments[0].seg_addr.pval = local_address;
frag->segments[0].seg_len = size;
frag->segments[1].seg_addr.lval = remote_address;
frag->segments[1].seg_len = size;
frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
frag->base.des_cbfunc = fake_rdma_complete;
frag->cb.func = cbfunc;
frag->cb.data = cbdata;
frag->cb.context = cbcontext;
frag->btl = tcp_btl; frag->btl = tcp_btl;
frag->endpoint = endpoint; frag->endpoint = endpoint;
frag->rc = 0; frag->rc = 0;
@ -394,9 +374,9 @@ int mca_btl_tcp_put( mca_btl_base_module_t* btl,
frag->iov_ptr = frag->iov; frag->iov_ptr = frag->iov;
frag->iov[0].iov_base = (IOVBASE_TYPE*)&frag->hdr; frag->iov[0].iov_base = (IOVBASE_TYPE*)&frag->hdr;
frag->iov[0].iov_len = sizeof(frag->hdr); frag->iov[0].iov_len = sizeof(frag->hdr);
frag->iov[1].iov_base = (IOVBASE_TYPE*)frag->base.des_remote; frag->iov[1].iov_base = (IOVBASE_TYPE*) (frag->segments + 1);
frag->iov[1].iov_len = frag->base.des_remote_count * sizeof(mca_btl_base_segment_t); frag->iov[1].iov_len = sizeof(mca_btl_base_segment_t);
for( i = 0; i < (int)frag->base.des_local_count; i++ ) { for( i = 0; i < (int)frag->base.des_segment_count; i++ ) {
frag->hdr.size += frag->segments[i].seg_len; frag->hdr.size += frag->segments[i].seg_len;
frag->iov[i+2].iov_len = frag->segments[i].seg_len; frag->iov[i+2].iov_len = frag->segments[i].seg_len;
frag->iov[i+2].iov_base = (IOVBASE_TYPE*)frag->segments[i].seg_addr.pval; frag->iov[i+2].iov_base = (IOVBASE_TYPE*)frag->segments[i].seg_addr.pval;
@ -404,7 +384,7 @@ int mca_btl_tcp_put( mca_btl_base_module_t* btl,
} }
frag->hdr.base.tag = MCA_BTL_TAG_BTL; frag->hdr.base.tag = MCA_BTL_TAG_BTL;
frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_PUT; frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_PUT;
frag->hdr.count = frag->base.des_remote_count; frag->hdr.count = 1;
if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr); if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr);
return ((i = mca_btl_tcp_endpoint_send(endpoint,frag)) >= 0 ? OPAL_SUCCESS : i); return ((i = mca_btl_tcp_endpoint_send(endpoint,frag)) >= 0 ? OPAL_SUCCESS : i);
} }
@ -412,22 +392,46 @@ int mca_btl_tcp_put( mca_btl_base_module_t* btl,
/** /**
* Initiate an asynchronous get. * Initiate an asynchronous get.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*
*/ */
int mca_btl_tcp_get( int mca_btl_tcp_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
mca_btl_base_module_t* btl, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_endpoint_t* endpoint, mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
mca_btl_base_descriptor_t* descriptor) int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{ {
mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*) btl; mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*) btl;
mca_btl_tcp_frag_t* frag = (mca_btl_tcp_frag_t*)descriptor; mca_btl_tcp_frag_t* frag = NULL;
int rc; int rc;
MCA_BTL_TCP_FRAG_ALLOC_USER(frag);
if( OPAL_UNLIKELY(NULL == frag) ) {
return OPAL_ERR_OUT_OF_RESOURCE;;
}
frag->endpoint = endpoint;
frag->segments->seg_len = size;
frag->segments->seg_addr.pval = local_address;
frag->base.des_segments = frag->segments;
frag->base.des_segment_count = 1;
frag->base.order = MCA_BTL_NO_ORDER;
frag->segments[0].seg_addr.pval = local_address;
frag->segments[0].seg_len = size;
frag->segments[1].seg_addr.lval = remote_address;
frag->segments[1].seg_len = size;
/* call the rdma callback through the descriptor callback. this is
* tcp so the extra latency is not an issue */
frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
frag->base.des_cbfunc = fake_rdma_complete;
frag->cb.func = cbfunc;
frag->cb.data = cbdata;
frag->cb.context = cbcontext;
frag->btl = tcp_btl; frag->btl = tcp_btl;
frag->endpoint = endpoint; frag->endpoint = endpoint;
frag->rc = 0; frag->rc = 0;
@ -437,11 +441,11 @@ int mca_btl_tcp_get(
frag->iov_ptr = frag->iov; frag->iov_ptr = frag->iov;
frag->iov[0].iov_base = (IOVBASE_TYPE*)&frag->hdr; frag->iov[0].iov_base = (IOVBASE_TYPE*)&frag->hdr;
frag->iov[0].iov_len = sizeof(frag->hdr); frag->iov[0].iov_len = sizeof(frag->hdr);
frag->iov[1].iov_base = (IOVBASE_TYPE*)frag->base.des_remote; frag->iov[1].iov_base = (IOVBASE_TYPE*) &frag->segments[1];
frag->iov[1].iov_len = frag->base.des_remote_count * sizeof(mca_btl_base_segment_t); frag->iov[1].iov_len = sizeof(mca_btl_base_segment_t);
frag->hdr.base.tag = MCA_BTL_TAG_BTL; frag->hdr.base.tag = MCA_BTL_TAG_BTL;
frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_GET; frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_GET;
frag->hdr.count = frag->base.des_remote_count; frag->hdr.count = 1;
if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr); if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr);
return ((rc = mca_btl_tcp_endpoint_send(endpoint,frag)) >= 0 ? OPAL_SUCCESS : rc); return ((rc = mca_btl_tcp_endpoint_send(endpoint,frag)) >= 0 ? OPAL_SUCCESS : rc);
} }

Просмотреть файл

@ -217,32 +217,22 @@ extern int mca_btl_tcp_send(
/** /**
* Initiate an asynchronous put. * Initiate an asynchronous put.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/ */
extern int mca_btl_tcp_put( int mca_btl_tcp_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
struct mca_btl_base_module_t* btl, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_endpoint_t* btl_peer, mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
struct mca_btl_base_descriptor_t* decriptor int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
);
/** /**
* Initiate an asynchronous get. * Initiate an asynchronous get.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/ */
extern int mca_btl_tcp_get( int mca_btl_tcp_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
struct mca_btl_base_module_t* btl, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_endpoint_t* btl_peer, mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
struct mca_btl_base_descriptor_t* decriptor int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
);
/** /**
* Allocate a descriptor with a segment of the requested size. * Allocate a descriptor with a segment of the requested size.
@ -290,7 +280,6 @@ extern int mca_btl_tcp_free(
mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src( mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src(
struct mca_btl_base_module_t* btl, struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* peer, struct mca_btl_base_endpoint_t* peer,
struct mca_mpool_base_registration_t*,
struct opal_convertor_t* convertor, struct opal_convertor_t* convertor,
uint8_t order, uint8_t order,
size_t reserve, size_t reserve,
@ -298,16 +287,6 @@ mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src(
uint32_t flags uint32_t flags
); );
extern mca_btl_base_descriptor_t* mca_btl_tcp_prepare_dst(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* peer,
struct mca_mpool_base_registration_t*,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size,
uint32_t flags);
/** /**
* Fault Tolerance Event Notification Function * Fault Tolerance Event Notification Function

Просмотреть файл

@ -287,7 +287,7 @@ static int mca_btl_tcp_component_register(void)
MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_NEED_CSUM |
MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_ACK |
MCA_BTL_FLAGS_HETEROGENEOUS_RDMA; MCA_BTL_FLAGS_HETEROGENEOUS_RDMA;
mca_btl_tcp_module.super.btl_seg_size = sizeof (mca_btl_base_segment_t);
mca_btl_tcp_module.super.btl_bandwidth = 100; mca_btl_tcp_module.super.btl_bandwidth = 100;
mca_btl_tcp_module.super.btl_latency = 100; mca_btl_tcp_module.super.btl_latency = 100;

Просмотреть файл

@ -58,6 +58,12 @@ struct mca_btl_tcp_frag_t {
size_t size; size_t size;
int rc; int rc;
ompi_free_list_t* my_list; ompi_free_list_t* my_list;
/* fake rdma completion */
struct {
mca_btl_base_rdma_completion_fn_t func;
void *data;
void *context;
} cb;
}; };
typedef struct mca_btl_tcp_frag_t mca_btl_tcp_frag_t; typedef struct mca_btl_tcp_frag_t mca_btl_tcp_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_tcp_frag_t); OBJ_CLASS_DECLARATION(mca_btl_tcp_frag_t);
@ -116,10 +122,8 @@ do { \
frag->iov_cnt = 1; \ frag->iov_cnt = 1; \
frag->iov_idx = 0; \ frag->iov_idx = 0; \
frag->iov_ptr = frag->iov; \ frag->iov_ptr = frag->iov; \
frag->base.des_remote = NULL; \ frag->base.des_segments = frag->segments; \
frag->base.des_remote_count = 0; \ frag->base.des_segment_count = 1; \
frag->base.des_local = frag->segments; \
frag->base.des_local_count = 1; \
} while(0) } while(0)

Просмотреть файл

@ -270,8 +270,8 @@ mca_btl_base_descriptor_t* mca_btl_template_prepare_src(
frag->segment.seg_len = max_data + reserve; frag->segment.seg_len = max_data + reserve;
} }
frag->base.des_local = &frag->segment; frag->base.des_segments = &frag->segment;
frag->base.des_local_count = 1; frag->base.des_segment_count = 1;
frag->base.des_flags = 0; frag->base.des_flags = 0;
return &frag->base; return &frag->base;
} }
@ -311,8 +311,8 @@ mca_btl_base_descriptor_t* mca_btl_template_prepare_dst(
frag->segment.seg_len = *size; frag->segment.seg_len = *size;
opal_convertor_get_current_pointer( convertor, (void**)&(frag->segment.seg_addr.pval) ); opal_convertor_get_current_pointer( convertor, (void**)&(frag->segment.seg_addr.pval) );
frag->base.des_local = &frag->segment; frag->base.des_segments = &frag->segment;
frag->base.des_local_count = 1; frag->base.des_segment_count = 1;
frag->base.des_flags = 0; frag->base.des_flags = 0;
return &frag->base; return &frag->base;
} }

Просмотреть файл

@ -38,7 +38,8 @@ ugni_SOURCES = \
btl_ugni.h \ btl_ugni.h \
btl_ugni_smsg.h \ btl_ugni_smsg.h \
btl_ugni_smsg.c \ btl_ugni_smsg.c \
btl_ugni_prepare.h btl_ugni_prepare.h \
btl_ugni_atomic.c
mcacomponentdir = $(opallibdir) mcacomponentdir = $(opallibdir)
mcacomponent_LTLIBRARIES = $(component_install) mcacomponent_LTLIBRARIES = $(component_install)

Просмотреть файл

@ -33,6 +33,7 @@
#include "opal/mca/btl/base/btl_base_error.h" #include "opal/mca/btl/base/btl_base_error.h"
#include "opal/class/opal_hash_table.h" #include "opal/class/opal_hash_table.h"
#include "opal/class/ompi_free_list.h" #include "opal/class/ompi_free_list.h"
#include "opal/class/opal_free_list.h"
#include "opal/mca/common/ugni/common_ugni.h" #include "opal/mca/common/ugni/common_ugni.h"
#include <errno.h> #include <errno.h>
@ -80,6 +81,11 @@ typedef struct mca_btl_ugni_module_t {
opal_mutex_t eager_get_pending_lock; opal_mutex_t eager_get_pending_lock;
opal_list_t eager_get_pending; opal_list_t eager_get_pending;
opal_mutex_t pending_descriptors_lock;
opal_list_t pending_descriptors;
ompi_free_list_t post_descriptors;
mca_mpool_base_module_t *smsg_mpool; mca_mpool_base_module_t *smsg_mpool;
ompi_free_list_t smsg_mboxes; ompi_free_list_t smsg_mboxes;
@ -143,8 +149,6 @@ typedef struct mca_btl_ugni_component_t {
/* After this message size switch to BTE protocols */ /* After this message size switch to BTE protocols */
size_t ugni_fma_limit; size_t ugni_fma_limit;
/* Switch to put when trying to GET at or above this size */
size_t ugni_get_limit;
/* Switch to get when sending above this size */ /* Switch to get when sending above this size */
size_t ugni_smsg_limit; size_t ugni_smsg_limit;
@ -260,33 +264,31 @@ mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl,
uint32_t flags, mca_btl_base_tag_t tag, uint32_t flags, mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t **descriptor); mca_btl_base_descriptor_t **descriptor);
/** int mca_btl_ugni_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
* Initiate a get operation. uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
* mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
* location: btl_ugni_get.c int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/
int
mca_btl_ugni_get (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des);
/** int mca_btl_ugni_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
* Initiate a put operation. uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
* mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
* location: btl_ugni_put.c int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
*
* @param btl (IN) BTL module int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
* @param endpoint (IN) BTL addressing information uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
* @param descriptor (IN) Description of the data to be transferred mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
*/ mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
int
mca_btl_ugni_put (struct mca_btl_base_module_t *btl, int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_descriptor_t *des); mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata);
int mca_btl_ugni_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value,
int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
int mca_btl_ugni_progress_send_wait_list (struct mca_btl_base_endpoint_t *endpoint); int mca_btl_ugni_progress_send_wait_list (struct mca_btl_base_endpoint_t *endpoint);
@ -295,9 +297,14 @@ mca_btl_ugni_alloc(struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint, struct mca_btl_base_endpoint_t *endpoint,
uint8_t order, size_t size, uint32_t flags); uint8_t order, size_t size, uint32_t flags);
struct mca_btl_base_registration_handle_t {
/** uGNI memory handle */
gni_mem_handle_t gni_handle;
};
typedef struct mca_btl_ugni_reg_t { typedef struct mca_btl_ugni_reg_t {
mca_mpool_base_registration_t base; mca_mpool_base_registration_t base;
gni_mem_handle_t memory_hdl; mca_btl_base_registration_handle_t handle;
} mca_btl_ugni_reg_t; } mca_btl_ugni_reg_t;
/* Global structures */ /* Global structures */

Просмотреть файл

@ -34,7 +34,6 @@ int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t **peers, struct mca_btl_base_endpoint_t **peers,
opal_bitmap_t *reachable) { opal_bitmap_t *reachable) {
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl; mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
opal_proc_t *my_proc = opal_proc_local_get();
size_t i; size_t i;
int rc; int rc;
@ -66,11 +65,8 @@ int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl,
if (OPAL_PROC_ON_LOCAL_NODE(ompi_proc->proc_flags)) { if (OPAL_PROC_ON_LOCAL_NODE(ompi_proc->proc_flags)) {
ugni_module->nlocal_procs++; ugni_module->nlocal_procs++;
/* Do not use uGNI to communicate with local procs unless we are adding more ranks. /* ugni is allowed on local processes to provide support for network
* Change this when sm and vader are updated to handle additional add procs. */ * atomic operations */
if (!ugni_module->initialized || my_proc == ompi_proc) {
continue;
}
} }
/* Create and Init endpoints */ /* Create and Init endpoints */
@ -188,7 +184,7 @@ static int ugni_reg_rdma_mem (void *reg_data, void *base, size_t size,
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base, rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base,
size, NULL, GNI_MEM_READWRITE | GNI_MEM_RELAXED_PI_ORDERING, size, NULL, GNI_MEM_READWRITE | GNI_MEM_RELAXED_PI_ORDERING,
-1, &(ugni_reg->memory_hdl)); -1, &(ugni_reg->handle.gni_handle));
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
@ -211,7 +207,7 @@ static int ugni_reg_smsg_mem (void *reg_data, void *base, size_t size,
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base, rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base,
size, ugni_module->smsg_remote_cq, GNI_MEM_READWRITE, -1, size, ugni_module->smsg_remote_cq, GNI_MEM_READWRITE, -1,
&(ugni_reg->memory_hdl)); &(ugni_reg->handle.gni_handle));
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
return opal_common_rc_ugni_to_opal (rc); return opal_common_rc_ugni_to_opal (rc);
} }
@ -224,7 +220,7 @@ ugni_dereg_mem (void *reg_data, mca_mpool_base_registration_t *reg)
gni_return_t rc; gni_return_t rc;
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
rc = GNI_MemDeregister (ugni_module->device->dev_handle, &ugni_reg->memory_hdl); rc = GNI_MemDeregister (ugni_module->device->dev_handle, &ugni_reg->handle.gni_handle);
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
if (GNI_RC_SUCCESS != rc) { if (GNI_RC_SUCCESS != rc) {
return OPAL_ERROR; return OPAL_ERROR;
@ -401,6 +397,15 @@ mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module)
return rc; return rc;
} }
rc = ompi_free_list_init_new (&ugni_module->post_descriptors,
sizeof (mca_btl_ugni_post_descriptor_t),
8, OBJ_CLASS(mca_btl_ugni_post_descriptor_t),
0, 0, 0, -1, 256, NULL);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
BTL_ERROR(("error creating post descriptor free list"));
return rc;
}
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }

135
opal/mca/btl/ugni/btl_ugni_atomic.c Обычный файл
Просмотреть файл

@ -0,0 +1,135 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_ugni_rdma.h"
static gni_fma_cmd_type_t famo_cmds[] = {
[MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC_FADD,
[MCA_BTL_ATOMIC_AND] = GNI_FMA_ATOMIC_FAND,
[MCA_BTL_ATOMIC_OR] = GNI_FMA_ATOMIC_FOR,
[MCA_BTL_ATOMIC_XOR] = GNI_FMA_ATOMIC_FXOR,
};
static gni_fma_cmd_type_t amo_cmds[] = {
[MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC_ADD,
[MCA_BTL_ATOMIC_AND] = GNI_FMA_ATOMIC_AND,
[MCA_BTL_ATOMIC_OR] = GNI_FMA_ATOMIC_OR,
[MCA_BTL_ATOMIC_XOR] = GNI_FMA_ATOMIC_XOR,
};
int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
gni_mem_handle_t dummy = {0, 0};
mca_btl_ugni_post_descriptor_t *post_desc;
int rc;
rc = mca_btl_ugni_check_endpoint_state_rdma (endpoint);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
return rc;
}
mca_btl_ugni_alloc_post_descriptor (endpoint, NULL, cbfunc, cbcontext, cbdata, &post_desc);
if (OPAL_UNLIKELY(NULL == post_desc)) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
init_gni_post_desc (&post_desc->desc, order, GNI_POST_AMO, 0, dummy, remote_address,
remote_handle->gni_handle, 8, 0);
post_desc->desc.base.amo_cmd = amo_cmds[op];
post_desc->desc.base.first_operand = operand;
OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock);
rc = GNI_PostFma (endpoint->rdma_ep_handle, &post_desc->desc.base);
OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock);
if (GNI_RC_SUCCESS != rc) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
return OPAL_SUCCESS;
}
int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata)
{
mca_btl_ugni_post_descriptor_t *post_desc;
int rc;
rc = mca_btl_ugni_check_endpoint_state_rdma (endpoint);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
return rc;
}
mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata, &post_desc);
if (OPAL_UNLIKELY(NULL == post_desc)) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
init_gni_post_desc (&post_desc->desc, order, GNI_POST_AMO, (intptr_t) local_address, local_handle->gni_handle,
remote_address, remote_handle->gni_handle, 8, 0);
post_desc->desc.base.amo_cmd = famo_cmds[op];
post_desc->desc.base.first_operand = operand;
OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock);
rc = GNI_PostFma (endpoint->rdma_ep_handle, &post_desc->desc.base);
OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock);
if (GNI_RC_SUCCESS != rc) {
mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc);
return OPAL_ERR_OUT_OF_RESOURCE;
}
return OPAL_SUCCESS;
}
int mca_btl_ugni_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
mca_btl_ugni_post_descriptor_t *post_desc;
int rc;
rc = mca_btl_ugni_check_endpoint_state_rdma (endpoint);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
return rc;
}
mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata, &post_desc);
if (OPAL_UNLIKELY(NULL == post_desc)) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
init_gni_post_desc (&post_desc->desc, order, GNI_POST_AMO, (intptr_t) local_address, local_handle->gni_handle,
remote_address, remote_handle->gni_handle, 8, 0);
post_desc->desc.base.amo_cmd = GNI_FMA_ATOMIC_CSWAP;
post_desc->desc.base.first_operand = compare;
post_desc->desc.base.second_operand = value;
OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock);
rc = GNI_PostFma (endpoint->rdma_ep_handle, &post_desc->desc.base);
OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock);
if (GNI_RC_SUCCESS != rc) {
mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc);
return OPAL_ERR_OUT_OF_RESOURCE;
}
return OPAL_SUCCESS;
}

Просмотреть файл

@ -52,6 +52,7 @@ static int
btl_ugni_component_register(void) btl_ugni_component_register(void)
{ {
mca_base_var_enum_t *new_enum; mca_base_var_enum_t *new_enum;
gni_nic_device_t device_type;
int rc; int rc;
(void) mca_base_var_group_component_register(&mca_btl_ugni_component.super.btl_version, (void) mca_base_var_group_component_register(&mca_btl_ugni_component.super.btl_version,
@ -139,15 +140,6 @@ btl_ugni_component_register(void)
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_fma_limit); &mca_btl_ugni_component.ugni_fma_limit);
mca_btl_ugni_component.ugni_get_limit = 1 * 1024 * 1024;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"get_limit", "Maximum size message that "
"will be sent using a get protocol "
"(default 1M)", MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_get_limit);
mca_btl_ugni_component.rdma_max_retries = 16; mca_btl_ugni_component.rdma_max_retries = 16;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"rdma_max_retries", NULL, MCA_BASE_VAR_TYPE_INT, "rdma_max_retries", NULL, MCA_BASE_VAR_TYPE_INT,
@ -212,13 +204,28 @@ btl_ugni_component_register(void)
mca_btl_ugni_module.super.btl_max_send_size = 8 * 1024; mca_btl_ugni_module.super.btl_max_send_size = 8 * 1024;
mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = 8 * 1024; mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = 8 * 1024;
mca_btl_ugni_module.super.btl_get_limit = 1 * 1024 * 1024;
/* determine if there are get alignment restrictions */
GNI_GetDeviceType (&device_type);
if (GNI_DEVICE_GEMINI == device_type) {
mca_btl_ugni_module.super.btl_get_alignment = 4;
} else {
mca_btl_ugni_module.super.btl_get_alignment = 0;
}
/* threshold for put */ /* threshold for put */
mca_btl_ugni_module.super.btl_min_rdma_pipeline_size = 8 * 1024; mca_btl_ugni_module.super.btl_min_rdma_pipeline_size = 8 * 1024;
mca_btl_ugni_module.super.btl_flags = MCA_BTL_FLAGS_SEND | mca_btl_ugni_module.super.btl_flags = MCA_BTL_FLAGS_SEND |
MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE; MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_ATOMIC_OPS |
MCA_BTL_FLAGS_ATOMIC_FOPS;
mca_btl_ugni_module.super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD |
MCA_BTL_ATOMIC_SUPPORTS_AND | MCA_BTL_ATOMIC_SUPPORTS_OR | MCA_BTL_ATOMIC_SUPPORTS_XOR |
MCA_BTL_ATOMIC_SUPPORTS_CSWAP;
mca_btl_ugni_module.super.btl_seg_size = sizeof (mca_btl_ugni_segment_t); mca_btl_ugni_module.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
mca_btl_ugni_module.super.btl_bandwidth = 40000; /* Mbs */ mca_btl_ugni_module.super.btl_bandwidth = 40000; /* Mbs */
mca_btl_ugni_module.super.btl_latency = 2; /* Microsecs */ mca_btl_ugni_module.super.btl_latency = 2; /* Microsecs */
@ -425,89 +432,107 @@ mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module)
return count; return count;
} }
static inline int #if OPAL_ENABLE_DEBUG
mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, int which_cq) static inline void btl_ugni_dump_post_desc (mca_btl_ugni_post_descriptor_t *desc)
{ {
opal_common_ugni_post_desc_t *desc;
mca_btl_ugni_base_frag_t *frag;
gni_cq_entry_t event_data = 0;
uint32_t recoverable = 1;
gni_return_t rc;
gni_cq_handle_t the_cq;
the_cq = (which_cq == 0) ? ugni_module->rdma_local_cq : ugni_module->rdma_local_irq_cq; fprintf (stderr, "desc->desc.base.post_id = %" PRIx64 "\n", desc->desc.base.post_id);
fprintf (stderr, "desc->desc.base.status = %" PRIx64 "\n", desc->desc.base.status);
fprintf (stderr, "desc->desc.base.cq_mode_complete = %hu\n", desc->desc.base.cq_mode_complete);
fprintf (stderr, "desc->desc.base.type = %d\n", desc->desc.base.type);
fprintf (stderr, "desc->desc.base.cq_mode = %hu\n", desc->desc.base.cq_mode);
fprintf (stderr, "desc->desc.base.dlvr_mode = %hu\n", desc->desc.base.dlvr_mode);
fprintf (stderr, "desc->desc.base.local_addr = %" PRIx64 "\n", desc->desc.base.local_addr);
fprintf (stderr, "desc->desc.base.local_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->desc.base.local_mem_hndl.qword1,
desc->desc.base.local_mem_hndl.qword2);
fprintf (stderr, "desc->desc.base.remote_addr = %" PRIx64 "\n", desc->desc.base.remote_addr);
fprintf (stderr, "desc->desc.base.remote_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->desc.base.remote_mem_hndl.qword1,
desc->desc.base.remote_mem_hndl.qword2);
fprintf (stderr, "desc->desc.base.length = %" PRIu64 "\n", desc->desc.base.length);
fprintf (stderr, "desc->desc.base.rdma_mode = %hu\n", desc->desc.base.rdma_mode);
fprintf (stderr, "desc->desc.base.amo_cmd = %d\n", desc->desc.base.amo_cmd);
}
#endif
static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, int which_cq)
{
mca_btl_ugni_post_descriptor_t *post_desc = NULL;
gni_cq_entry_t event_data = 0;
gni_post_descriptor_t *desc;
uint32_t recoverable = 1;
gni_return_t grc;
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
rc = GNI_CqGetEvent (the_cq, &event_data); grc = GNI_CqGetEvent (ugni_module->rdma_local_cq, &event_data);
if (GNI_RC_NOT_DONE == rc) { if (GNI_RC_NOT_DONE == grc) {
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
return 0; return 0;
} }
if (OPAL_UNLIKELY((GNI_RC_SUCCESS != rc && !event_data) || GNI_CQ_OVERRUN(event_data))) { if (OPAL_UNLIKELY((GNI_RC_SUCCESS != grc && !event_data) || GNI_CQ_OVERRUN(event_data))) {
/* TODO -- need to handle overrun -- how do we do this without an event? /* TODO -- need to handle overrun -- how do we do this without an event?
will the event eventually come back? Ask Cray */ will the event eventually come back? Ask Cray */
BTL_ERROR(("unhandled post error! ugni rc = %d %s", rc,gni_err_str[rc])); BTL_ERROR(("unhandled post error! ugni rc = %d %s", grc, gni_err_str[grc]));
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
return opal_common_rc_ugni_to_opal (rc);
return opal_common_rc_ugni_to_opal (grc);
} }
rc = GNI_GetCompleted (the_cq, event_data, (gni_post_descriptor_t **) &desc); grc = GNI_GetCompleted (ugni_module->rdma_local_cq, event_data, &desc);
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc && GNI_RC_TRANSACTION_ERROR != rc)) { if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc && GNI_RC_TRANSACTION_ERROR != grc)) {
BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[rc])); BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[grc]));
return opal_common_rc_ugni_to_opal (rc); return opal_common_rc_ugni_to_opal (grc);
} }
frag = MCA_BTL_UGNI_DESC_TO_FRAG(desc); post_desc = MCA_BTL_UGNI_DESC_TO_PDESC(desc);
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc || !GNI_CQ_STATUS_OK(event_data))) {
char buffer[1024];
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc || !GNI_CQ_STATUS_OK(event_data))) {
(void) GNI_CqErrorRecoverable (event_data, &recoverable); (void) GNI_CqErrorRecoverable (event_data, &recoverable);
GNI_CqErrorStr(event_data,buffer,sizeof(buffer));
if (OPAL_UNLIKELY(++frag->post_desc.tries >= mca_btl_ugni_component.rdma_max_retries || if (OPAL_UNLIKELY(++post_desc->desc.tries >= mca_btl_ugni_component.rdma_max_retries ||
!recoverable)) { !recoverable)) {
char char_buffer[1024];
GNI_CqErrorStr (event_data, char_buffer, 1024);
/* give up */ /* give up */
BTL_ERROR(("giving up on frag %p type %d CQE error %s", (void *) frag, frag->post_desc.base.type, buffer)); BTL_ERROR(("giving up on desciptor %p, recoverable %d: %s", (void *) post_desc,
mca_btl_ugni_frag_complete (frag, OPAL_ERROR); recoverable, char_buffer));
#if OPAL_ENABLE_DEBUG
btl_ugni_dump_post_desc (post_desc);
#endif
mca_btl_ugni_post_desc_complete (ugni_module, post_desc, OPAL_ERROR);
return OPAL_ERROR; return OPAL_ERROR;
} }
/* repost transaction */ mca_btl_ugni_repost (ugni_module, post_desc);
mca_btl_ugni_repost (frag);
return 0; return 0;
} }
BTL_VERBOSE(("RDMA/FMA complete for frag %p", (void *) frag)); mca_btl_ugni_post_desc_complete (ugni_module, post_desc, opal_common_rc_ugni_to_opal (grc));
mca_btl_ugni_frag_complete (frag, opal_common_rc_ugni_to_opal (rc));
return 1; return 1;
} }
static inline int static inline int
mca_btl_ugni_retry_failed (mca_btl_ugni_module_t *ugni_module) mca_btl_ugni_post_pending (mca_btl_ugni_module_t *ugni_module)
{ {
int count = opal_list_get_size (&ugni_module->failed_frags); int count = opal_list_get_size (&ugni_module->pending_descriptors);
int i; int i;
for (i = 0 ; i < count ; ++i) { for (i = 0 ; i < count ; ++i) {
OPAL_THREAD_LOCK(&ugni_module->failed_frags_lock); OPAL_THREAD_LOCK(&ugni_module->pending_descriptors_lock);
mca_btl_ugni_base_frag_t *frag = mca_btl_ugni_post_descriptor_t *post_desc =
(mca_btl_ugni_base_frag_t *) opal_list_remove_first (&ugni_module->failed_frags); (mca_btl_ugni_post_descriptor_t *) opal_list_remove_first (&ugni_module->pending_descriptors);
OPAL_THREAD_UNLOCK(&ugni_module->failed_frags_lock); OPAL_THREAD_UNLOCK(&ugni_module->pending_descriptors_lock);
if (NULL == frag) {
if (OPAL_SUCCESS != mca_btl_ugni_repost (ugni_module, post_desc)) {
break; break;
} }
mca_btl_ugni_repost (frag);
} }
return count; return i;
} }
static inline int static inline int
@ -557,7 +582,6 @@ static int mca_btl_ugni_component_progress (void)
for (i = 0 ; i < mca_btl_ugni_component.ugni_num_btls ; ++i) { for (i = 0 ; i < mca_btl_ugni_component.ugni_num_btls ; ++i) {
ugni_module = mca_btl_ugni_component.modules + i; ugni_module = mca_btl_ugni_component.modules + i;
mca_btl_ugni_retry_failed (ugni_module);
mca_btl_ugni_progress_wait_list (ugni_module); mca_btl_ugni_progress_wait_list (ugni_module);
count += mca_btl_ugni_progress_datagram (ugni_module); count += mca_btl_ugni_progress_datagram (ugni_module);
@ -565,6 +589,8 @@ static int mca_btl_ugni_component_progress (void)
count += mca_btl_ugni_progress_remote_smsg (ugni_module); count += mca_btl_ugni_progress_remote_smsg (ugni_module);
count += mca_btl_ugni_progress_rdma (ugni_module, 0); count += mca_btl_ugni_progress_rdma (ugni_module, 0);
/* post pending after progressing rdma */
mca_btl_ugni_post_pending (ugni_module);
} }
return count; return count;

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше