diff --git a/config/opal_check_openfabrics.m4 b/config/opal_check_openfabrics.m4 index 78563847ff..9a2b4cfcc2 100644 --- a/config/opal_check_openfabrics.m4 +++ b/config/opal_check_openfabrics.m4 @@ -11,7 +11,7 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2006-2015 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2006-2011 Los Alamos National Security, LLC. All rights +# Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights # reserved. # Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. # Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved. @@ -140,7 +140,7 @@ AC_DEFUN([OPAL_CHECK_OPENFABRICS],[ # If we have the openib stuff available, find out what we've got AS_IF([test "$ompi_check_openib_happy" = "yes"], - [AC_CHECK_DECLS([IBV_EVENT_CLIENT_REREGISTER, IBV_ACCESS_SO], [], [], + [AC_CHECK_DECLS([IBV_EVENT_CLIENT_REREGISTER, IBV_ACCESS_SO, IBV_ATOMIC_HCA], [], [], [#include ]) AC_CHECK_FUNCS([ibv_get_device_list ibv_resize_cq]) diff --git a/ompi/mca/bml/base/bml_base_btl.c b/ompi/mca/bml/base/bml_base_btl.c index 1e1d90561b..d5fd5d6b59 100644 --- a/ompi/mca/bml/base/bml_base_btl.c +++ b/ompi/mca/bml/base/bml_base_btl.c @@ -91,7 +91,7 @@ static void mca_bml_base_completion( { mca_bml_base_context_t* ctx = (mca_bml_base_context_t*) des->des_cbdata; /* restore original state */ - ((unsigned char*)des->des_local[0].seg_addr.pval)[ctx->index] ^= ~0; + ((unsigned char*)des->des_segments[0].seg_addr.pval)[ctx->index] ^= ~0; des->des_cbdata = ctx->cbdata; des->des_cbfunc = ctx->cbfunc; free(ctx); @@ -121,11 +121,11 @@ int mca_bml_base_send( mca_bml_base_btl_t* bml_btl, malloc(sizeof(mca_bml_base_context_t)); if(NULL != ctx) { opal_output(0, "%s:%d: corrupting data\n", __FILE__, __LINE__); - ctx->index = (size_t) ((des->des_local[0].seg_len * + ctx->index = (size_t) ((des->des_segments[0].seg_len * opal_rand(&mca_bml_base_rand_buff) * 1.0) / 
(UINT32_MAX + 1.0)); ctx->cbfunc = des->des_cbfunc; ctx->cbdata = des->des_cbdata; - ((unsigned char*)des->des_local[0].seg_addr.pval)[ctx->index] ^= ~0; + ((unsigned char*)des->des_segments[0].seg_addr.pval)[ctx->index] ^= ~0; des->des_cbdata = ctx; des->des_cbfunc = mca_bml_base_completion; } diff --git a/ompi/mca/bml/bml.h b/ompi/mca/bml/bml.h index 718e6de5d1..125cd7fb95 100644 --- a/ompi/mca/bml/bml.h +++ b/ompi/mca/bml/bml.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology @@ -10,7 +11,7 @@ * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -307,27 +308,30 @@ static inline int mca_bml_base_sendi( mca_bml_base_btl_t* bml_btl, payload_size, order, flags, tag, descriptor); } -static inline int mca_bml_base_put( mca_bml_base_btl_t* bml_btl, - mca_btl_base_descriptor_t* des) +static inline int mca_bml_base_put( mca_bml_base_btl_t* bml_btl, void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, size_t size, + int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbdata) { mca_btl_base_module_t* btl = bml_btl->btl; - des->des_context = (void*) bml_btl; - return btl->btl_put( btl, bml_btl->btl_endpoint, des ); + return btl->btl_put( btl, bml_btl->btl_endpoint, local_address, remote_address, local_handle, + remote_handle, size, flags, order, cbfunc, (void *) bml_btl, cbdata); } -static inline int mca_bml_base_get( mca_bml_base_btl_t* bml_btl, - mca_btl_base_descriptor_t* des) +static inline int mca_bml_base_get( 
mca_bml_base_btl_t* bml_btl, void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, size_t size, + int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbdata) { mca_btl_base_module_t* btl = bml_btl->btl; - des->des_context = (void*) bml_btl; - return btl->btl_get( btl, bml_btl->btl_endpoint, des ); + return btl->btl_get( btl, bml_btl->btl_endpoint, local_address, remote_address, local_handle, + remote_handle, size, flags, order, cbfunc, (void *) bml_btl, cbdata); } static inline void mca_bml_base_prepare_src(mca_bml_base_btl_t* bml_btl, - mca_mpool_base_registration_t* reg, struct opal_convertor_t* conv, uint8_t order, size_t reserve, @@ -337,29 +341,27 @@ static inline void mca_bml_base_prepare_src(mca_bml_base_btl_t* bml_btl, { mca_btl_base_module_t* btl = bml_btl->btl; - *des = btl->btl_prepare_src( btl, bml_btl->btl_endpoint, reg, conv, + *des = btl->btl_prepare_src( btl, bml_btl->btl_endpoint, conv, order, reserve, size, flags ); if( OPAL_LIKELY((*des) != NULL) ) { (*des)->des_context = (void*) bml_btl; } } -static inline void mca_bml_base_prepare_dst(mca_bml_base_btl_t* bml_btl, - mca_mpool_base_registration_t* reg, - struct opal_convertor_t* conv, - uint8_t order, - size_t reserve, - size_t *size, - uint32_t flags, - mca_btl_base_descriptor_t** des) -{ +static inline void mca_bml_base_register_mem (mca_bml_base_btl_t* bml_btl, void *base, + size_t size, uint32_t flags, + mca_btl_base_registration_handle_t **handle) +{ mca_btl_base_module_t* btl = bml_btl->btl; - *des = btl->btl_prepare_dst( btl, bml_btl->btl_endpoint, reg, conv, - order, reserve, size, flags ); - if( OPAL_LIKELY((*des) != NULL) ) { - (*des)->des_context = (void*) bml_btl; - } + *handle = btl->btl_register_mem (btl, bml_btl->btl_endpoint, base, size, flags); +} + +static inline void mca_bml_base_deregister_mem (mca_bml_base_btl_t* bml_btl, 
mca_btl_base_registration_handle_t *handle) +{ + mca_btl_base_module_t* btl = bml_btl->btl; + + btl->btl_deregister_mem (btl, handle); } /* diff --git a/ompi/mca/bml/r2/bml_r2.c b/ompi/mca/bml/r2/bml_r2.c index ddb59ffd7c..57661e016b 100644 --- a/ompi/mca/bml/r2/bml_r2.c +++ b/ompi/mca/bml/r2/bml_r2.c @@ -17,6 +17,8 @@ * Copyright (c) 2014 NVIDIA Corporation. All rights reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -86,9 +88,7 @@ static int mca_bml_r2_add_btls( void ) return OMPI_ERR_OUT_OF_RESOURCE; } - for(selected_btl = (mca_btl_base_selected_module_t*)opal_list_get_first(btls); - selected_btl != (mca_btl_base_selected_module_t*)opal_list_get_end(btls); - selected_btl = (mca_btl_base_selected_module_t*)opal_list_get_next(selected_btl)) { + OPAL_LIST_FOREACH(selected_btl, btls, mca_btl_base_selected_module_t) { mca_btl_base_module_t *btl = selected_btl->btl_module; mca_bml_r2.btl_modules[mca_bml_r2.num_btl_modules++] = btl; for (i = 0; NULL != btl_names_argv && NULL != btl_names_argv[i]; ++i) { @@ -127,6 +127,23 @@ static int btl_bandwidth_compare(const void *v1, const void *v2) return b2->btl->btl_bandwidth - b1->btl->btl_bandwidth; } +static void mca_bml_r2_calculate_bandwidth_latency (mca_bml_base_btl_array_t *btl_array, double *total_bandwidth, uint32_t *latency) +{ + const size_t array_length = mca_bml_base_btl_array_get_size (btl_array); + + *latency = UINT_MAX; + *total_bandwidth = 0.; + + for (size_t i = 0 ; i < array_length ; ++i) { + mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_get_index (btl_array, i); + mca_btl_base_module_t *btl = bml_btl->btl; + *total_bandwidth += btl->btl_bandwidth; + if (btl->btl_latency < *latency) { + *latency = btl->btl_latency; + } + } +} + /* * For each proc setup a datastructure that indicates the 
BTLs * that can be used to reach the destination. @@ -189,6 +206,7 @@ static int mca_bml_r2_add_procs( size_t nprocs, for(p_index = 0; p_index < mca_bml_r2.num_btl_modules; p_index++) { mca_btl_base_module_t* btl = mca_bml_r2.btl_modules[p_index]; int btl_inuse = 0; + int btl_flags; /* if the r2 can reach the destination proc it sets the * corresponding bit (proc index) in the reachable bitmap @@ -212,7 +230,7 @@ static int mca_bml_r2_add_procs( size_t nprocs, ompi_proc_t *proc = new_procs[p]; mca_bml_base_endpoint_t * bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; - mca_bml_base_btl_t* bml_btl; + mca_bml_base_btl_t* bml_btl = NULL; size_t size; if(NULL == bml_endpoint) { @@ -236,12 +254,35 @@ static int mca_bml_r2_add_procs( size_t nprocs, bml_endpoint->btl_flags_or = 0; } + btl_flags = btl->btl_flags; + if( (btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put) ) { + opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for" + " the %s BTL without any PUT function attached. Discard the flag !", + btl->btl_component->btl_version.mca_component_name); /* bml_btl is still NULL here, so name the BTL through btl */ + btl_flags ^= MCA_BTL_FLAGS_PUT; + } + if( (btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get) ) { + opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for" + " the %s BTL without any GET function attached. Discard the flag !", + btl->btl_component->btl_version.mca_component_name); + btl_flags ^= MCA_BTL_FLAGS_GET; + } + + if( (btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0 ) { + /** + * If no protocol specified, we have 2 choices: we ignore the BTL + * as we don't know which protocol to use, or we suppose that all + * BTLs support the send protocol. 
+ */ + btl_flags |= MCA_BTL_FLAGS_SEND; + } + /* dont allow an additional BTL with a lower exclusivity ranking */ size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); if(size > 0) { bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, size-1); - /* skip this btl if the exclusivity is less than the previous */ - if(bml_btl->btl->btl_exclusivity > btl->btl_exclusivity) { + /* skip this btl if the exclusivity is less than the previous only if the btl does not provide full rdma (for one-sided) */ + if(bml_btl->btl->btl_exclusivity > btl->btl_exclusivity && ((btl_flags & MCA_BTL_FLAGS_RDMA) != MCA_BTL_FLAGS_RDMA)) { btl->btl_del_procs(btl, 1, (opal_proc_t**)&proc, &btl_endpoints[p]); opal_output_verbose(20, opal_btl_base_framework.framework_output, "mca: bml: Not using %s btl to %s on node %s " @@ -261,39 +302,44 @@ static int mca_bml_r2_add_procs( size_t nprocs, proc->super.proc_hostname); /* cache the endpoint on the proc */ - bml_btl = mca_bml_base_btl_array_insert(&bml_endpoint->btl_send); - bml_btl->btl = btl; - bml_btl->btl_endpoint = btl_endpoints[p]; - bml_btl->btl_weight = 0; - bml_btl->btl_flags = btl->btl_flags; - if( (bml_btl->btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put) ) { - opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for" - " the %s BTL without any PUT function attached. Discard the flag !", - bml_btl->btl->btl_component->btl_version.mca_component_name); - bml_btl->btl_flags ^= MCA_BTL_FLAGS_PUT; - } - if( (bml_btl->btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get) ) { - opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for" - " the %s BTL without any GET function attached. 
Discard the flag !", - bml_btl->btl->btl_component->btl_version.mca_component_name); - bml_btl->btl_flags ^= MCA_BTL_FLAGS_GET; - } - if( (bml_btl->btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0 ) { + if (NULL == bml_btl || (bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity)) { + bml_btl = mca_bml_base_btl_array_insert(&bml_endpoint->btl_send); + bml_btl->btl = btl; + bml_btl->btl_endpoint = btl_endpoints[p]; + bml_btl->btl_weight = 0; + bml_btl->btl_flags = btl_flags; + /** - * If no protocol specified, we have 2 choices: we ignore the BTL - * as we don't know which protocl to use, or we suppose that all - * BTLs support the send protocol. + * calculate the bitwise OR of the btl flags */ - bml_btl->btl_flags |= MCA_BTL_FLAGS_SEND; + bml_endpoint->btl_flags_or |= bml_btl->btl_flags; } - /** - * calculate the bitwise OR of the btl flags - */ - bml_endpoint->btl_flags_or |= bml_btl->btl_flags; + + /* always add rdma endpoints */ + if ((btl_flags & MCA_BTL_FLAGS_RDMA) && + !((proc->super.proc_arch != ompi_proc_local_proc->super.proc_arch) && + (0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) { + mca_bml_base_btl_t *bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma); + + bml_btl_rdma->btl = btl; + bml_btl_rdma->btl_endpoint = btl_endpoints[p]; + bml_btl_rdma->btl_weight = 0; + bml_btl_rdma->btl_flags = btl_flags; + + if (bml_endpoint->btl_pipeline_send_length < btl->btl_rdma_pipeline_send_length) { + bml_endpoint->btl_pipeline_send_length = btl->btl_rdma_pipeline_send_length; + } + + if (bml_endpoint->btl_send_limit < btl->btl_min_rdma_pipeline_size) { + bml_endpoint->btl_send_limit = btl->btl_min_rdma_pipeline_size; + } + } + /* This BTL is in use, allow the progress registration */ btl_inuse++; } } + if(btl_inuse > 0 && NULL != btl->btl_component->btl_progress) { size_t p; bool found = false; @@ -319,9 +365,8 @@ static int mca_bml_r2_add_procs( size_t nprocs, mca_bml_base_endpoint_t* bml_endpoint = 
(mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; double total_bandwidth = 0; - uint32_t latency = 0xffffffff; - size_t n_index; - size_t n_size; + uint32_t latency; + size_t n_send, n_rdma; /* skip over procs w/ no btl's registered */ if(NULL == bml_endpoint) { @@ -335,28 +380,22 @@ static int mca_bml_r2_add_procs( size_t nprocs, * weighting. Once the left over is smaller than this number we will * start using the weight to compute the correct amount. */ - n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); - + n_send = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); + n_rdma = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma); + /* sort BTLs in descending order according to bandwidth value */ - qsort(bml_endpoint->btl_send.bml_btls, n_size, + qsort(bml_endpoint->btl_send.bml_btls, n_send, sizeof(mca_bml_base_btl_t), btl_bandwidth_compare); bml_endpoint->btl_rdma_index = 0; - for(n_index = 0; n_index < n_size; n_index++) { - mca_bml_base_btl_t* bml_btl = - mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index); - mca_btl_base_module_t* btl = bml_btl->btl; - total_bandwidth += bml_btl->btl->btl_bandwidth; - if(btl->btl_latency < latency) { - latency = btl->btl_latency; - } - } - + + mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_send, &total_bandwidth, &latency); + /* (1) set the weight of each btl as a percentage of overall bandwidth * (2) copy all btl instances at the highest priority ranking into the * list of btls used for first fragments */ - for(n_index = 0; n_index < n_size; n_index++) { + for (size_t n_index = 0 ; n_index < n_send ; ++n_index) { mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index); mca_btl_base_module_t *btl = bml_btl->btl; @@ -365,7 +404,7 @@ static int mca_bml_r2_add_procs( size_t nprocs, if(btl->btl_bandwidth > 0) { bml_btl->btl_weight = (float)(btl->btl_bandwidth / total_bandwidth); } else { - 
bml_btl->btl_weight = (float)(1.0 / n_size); + bml_btl->btl_weight = (float)(1.0 / n_send); } /* check to see if this r2 is already in the array of r2s @@ -380,21 +419,24 @@ static int mca_bml_r2_add_procs( size_t nprocs, /* set endpoint max send size as min of available btls */ if(bml_endpoint->btl_max_send_size > btl->btl_max_send_size) bml_endpoint->btl_max_send_size = btl->btl_max_send_size; + } - /* check flags - is rdma prefered */ - if ((btl->btl_flags & (MCA_BTL_FLAGS_PUT|MCA_BTL_FLAGS_GET)) && - !((proc->super.proc_arch != ompi_proc_local_proc->super.proc_arch) && - (0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) { - mca_bml_base_btl_t* bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma); - mca_btl_base_module_t* btl_rdma = bml_btl->btl; + /* sort BTLs in descending order according to bandwidth value */ + qsort(bml_endpoint->btl_rdma.bml_btls, n_rdma, + sizeof(mca_bml_base_btl_t), btl_bandwidth_compare); - *bml_btl_rdma = *bml_btl; - if(bml_endpoint->btl_pipeline_send_length < btl_rdma->btl_rdma_pipeline_send_length) { - bml_endpoint->btl_pipeline_send_length = btl_rdma->btl_rdma_pipeline_send_length; - } - if(bml_endpoint->btl_send_limit < btl_rdma->btl_min_rdma_pipeline_size) { - bml_endpoint->btl_send_limit = btl_rdma->btl_min_rdma_pipeline_size; - } + mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_rdma, &total_bandwidth, &latency); + + /* set rdma btl weights */ + for (size_t n_index = 0 ; n_index < n_rdma ; ++n_index) { + mca_bml_base_btl_t *bml_btl = + mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma, n_index); + + /* compute weighting factor for this r2 */ + if (bml_btl->btl->btl_bandwidth > 0.0) { + bml_btl->btl_weight = (float)(bml_btl->btl->btl_bandwidth / total_bandwidth); + } else { + bml_btl->btl_weight = (float)(1.0 / n_rdma); } } } diff --git a/ompi/mca/pml/bfo/.opal_ignore b/ompi/mca/pml/bfo/.opal_ignore new file mode 100644 index 0000000000..e69de29bb2 diff --git 
a/ompi/mca/pml/ob1/pml_ob1.c b/ompi/mca/pml/ob1/pml_ob1.c index d82daee29a..eac927c489 100644 --- a/ompi/mca/pml/ob1/pml_ob1.c +++ b/ompi/mca/pml/ob1/pml_ob1.c @@ -14,7 +14,7 @@ * Copyright (c) 2006-2008 University of Houston. All rights reserved. * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ @@ -500,17 +500,17 @@ static void mca_pml_ob1_dump_hdr(mca_pml_ob1_hdr_t* hdr) case MCA_PML_OB1_HDR_TYPE_RGET: type = "RGET"; snprintf( header, 128, "ctx %5d src %d tag %d seq %d msg_length %" PRIu64 - "seg_cnt %d hdr_des %" PRIu64, + "frag %" PRIu64 " src_ptr %" PRIu64, hdr->hdr_rndv.hdr_match.hdr_ctx, hdr->hdr_rndv.hdr_match.hdr_src, hdr->hdr_rndv.hdr_match.hdr_tag, hdr->hdr_rndv.hdr_match.hdr_seq, - hdr->hdr_rndv.hdr_msg_length, - hdr->hdr_rget.hdr_seg_cnt, hdr->hdr_rget.hdr_des.lval); + hdr->hdr_rndv.hdr_msg_length, hdr->hdr_rget.hdr_frag.lval, + hdr->hdr_rget.hdr_src_ptr); break; case MCA_PML_OB1_HDR_TYPE_ACK: type = "ACK"; - snprintf( header, 128, "src_req %p dst_req %p offset %" PRIu64, + snprintf( header, 128, "src_req %p dst_req %p offset %" PRIu64 " size %" PRIu64, hdr->hdr_ack.hdr_src_req.pval, hdr->hdr_ack.hdr_dst_req.pval, - hdr->hdr_ack.hdr_send_offset); + hdr->hdr_ack.hdr_send_offset, hdr->hdr_ack.hdr_send_size); break; case MCA_PML_OB1_HDR_TYPE_FRAG: type = "FRAG"; @@ -520,10 +520,11 @@ static void mca_pml_ob1_dump_hdr(mca_pml_ob1_hdr_t* hdr) break; case MCA_PML_OB1_HDR_TYPE_PUT: type = "PUT"; - snprintf( header, 128, "seg_cnt %d dst_req %p src_des %p recv_req %p offset %" PRIu64 " [%p %" PRIu64 "]", - hdr->hdr_rdma.hdr_seg_cnt, hdr->hdr_rdma.hdr_req.pval, hdr->hdr_rdma.hdr_des.pval, + snprintf( header, 128, "dst_req %p src_frag 
%p recv_req %p offset %" PRIu64 + " dst_ptr %" PRIu64 " dst_size %" PRIu64, + hdr->hdr_rdma.hdr_req.pval, hdr->hdr_rdma.hdr_frag.pval, hdr->hdr_rdma.hdr_recv_req.pval, hdr->hdr_rdma.hdr_rdma_offset, - hdr->hdr_rdma.hdr_segs[0].seg_addr.pval, hdr->hdr_rdma.hdr_segs[0].seg_len); + hdr->hdr_rdma.hdr_dst_ptr, hdr->hdr_rdma.hdr_dst_size); break; case MCA_PML_OB1_HDR_TYPE_FIN: type = "FIN"; @@ -638,37 +639,32 @@ static void mca_pml_ob1_fin_completion( mca_btl_base_module_t* btl, */ int mca_pml_ob1_send_fin( ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl, - opal_ptr_t hdr_des, + opal_ptr_t hdr_frag, + uint64_t rdma_size, uint8_t order, - uint32_t status ) + int status ) { mca_btl_base_descriptor_t* fin; - mca_pml_ob1_fin_hdr_t* hdr; int rc; mca_bml_base_alloc(bml_btl, &fin, order, sizeof(mca_pml_ob1_fin_hdr_t), MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_FLAGS_SIGNAL); if(NULL == fin) { - MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status); + MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status); return OMPI_ERR_OUT_OF_RESOURCE; } fin->des_cbfunc = mca_pml_ob1_fin_completion; fin->des_cbdata = NULL; /* fill in header */ - hdr = (mca_pml_ob1_fin_hdr_t*)fin->des_local->seg_addr.pval; - hdr->hdr_common.hdr_flags = 0; - hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN; - hdr->hdr_des = hdr_des; - hdr->hdr_fail = status; + mca_pml_ob1_fin_hdr_prepare ((mca_pml_ob1_fin_hdr_t *) fin->des_segments->seg_addr.pval, + 0, hdr_frag.lval, status ? 
status : (int64_t) rdma_size); - ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FIN, proc); + ob1_hdr_hton((mca_pml_ob1_hdr_t *) fin->des_segments->seg_addr.pval, MCA_PML_OB1_HDR_TYPE_FIN, proc); /* queue request */ - rc = mca_bml_base_send( bml_btl, - fin, - MCA_PML_OB1_HDR_TYPE_FIN ); + rc = mca_bml_base_send( bml_btl, fin, MCA_PML_OB1_HDR_TYPE_FIN ); if( OPAL_LIKELY( rc >= 0 ) ) { if( OPAL_LIKELY( 1 == rc ) ) { MCA_PML_OB1_PROGRESS_PENDING(bml_btl); @@ -676,7 +672,7 @@ int mca_pml_ob1_send_fin( ompi_proc_t* proc, return OMPI_SUCCESS; } mca_bml_base_free(bml_btl, fin); - MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status); + MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status); return OMPI_ERR_OUT_OF_RESOURCE; } @@ -717,6 +713,7 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl) pckt->hdr.hdr_ack.hdr_src_req.lval, pckt->hdr.hdr_ack.hdr_dst_req.pval, pckt->hdr.hdr_ack.hdr_send_offset, + pckt->hdr.hdr_ack.hdr_send_size, pckt->hdr.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA); if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) { OPAL_THREAD_LOCK(&mca_pml_ob1.lock); @@ -728,9 +725,10 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl) break; case MCA_PML_OB1_HDR_TYPE_FIN: rc = mca_pml_ob1_send_fin(pckt->proc, send_dst, - pckt->hdr.hdr_fin.hdr_des, + pckt->hdr.hdr_fin.hdr_frag, + pckt->hdr.hdr_fin.hdr_size, pckt->order, - pckt->hdr.hdr_fin.hdr_fail); + pckt->status); if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) { return; } diff --git a/ompi/mca/pml/ob1/pml_ob1.h b/ompi/mca/pml/ob1/pml_ob1.h index e059d7586d..924d3258f8 100644 --- a/ompi/mca/pml/ob1/pml_ob1.h +++ b/ompi/mca/pml/ob1/pml_ob1.h @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ * @@ -216,6 +216,7 @@ struct mca_pml_ob1_pckt_pending_t { mca_pml_ob1_hdr_t hdr; struct mca_bml_base_btl_t *bml_btl; uint8_t order; + int status; }; typedef struct mca_pml_ob1_pckt_pending_t mca_pml_ob1_pckt_pending_t; OBJ_CLASS_DECLARATION(mca_pml_ob1_pckt_pending_t); @@ -234,17 +235,17 @@ do { \ (ompi_free_list_item_t*)pckt); \ } while(0) -#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D, B, O, S) \ +#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D, Sz, B, O, S) \ do { \ mca_pml_ob1_pckt_pending_t *_pckt; \ \ MCA_PML_OB1_PCKT_PENDING_ALLOC(_pckt); \ - _pckt->hdr.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN; \ - _pckt->hdr.hdr_fin.hdr_des = (D); \ - _pckt->hdr.hdr_fin.hdr_fail = (S); \ + mca_pml_ob1_fin_hdr_prepare (&_pckt->hdr.hdr_fin, 0, \ + (D).lval, (Sz)); \ _pckt->proc = (P); \ _pckt->bml_btl = (B); \ _pckt->order = (O); \ + _pckt->status = (S); \ OPAL_THREAD_LOCK(&mca_pml_ob1.lock); \ opal_list_append(&mca_pml_ob1.pckt_pending, \ (opal_list_item_t*)_pckt); \ @@ -253,7 +254,7 @@ do { \ int mca_pml_ob1_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl, - opal_ptr_t hdr_des, uint8_t order, uint32_t status); + opal_ptr_t hdr_frag, uint64_t size, uint8_t order, int status); /* This function tries to resend FIN/ACK packets from pckt_pending queue. 
* Packets are added to the queue when sending of FIN or ACK is failed due to @@ -283,20 +284,6 @@ void mca_pml_ob1_process_pending_rdma(void); /* * Compute the total number of bytes on supplied descriptor */ -static inline size_t -mca_pml_ob1_compute_segment_length(size_t seg_size, void *segments, - size_t count, size_t hdrlen) -{ - size_t i, length = 0; - mca_btl_base_segment_t *segment = (mca_btl_base_segment_t*)segments; - - for (i = 0; i < count ; ++i) { - length += segment->seg_len; - segment = (mca_btl_base_segment_t *)((char *)segment + seg_size); - } - return (length - hdrlen); -} - static inline size_t mca_pml_ob1_compute_segment_length_base(mca_btl_base_segment_t *segments, size_t count, size_t hdrlen) @@ -338,7 +325,7 @@ mca_pml_ob1_compute_segment_length_remote (size_t seg_size, void *segments, /* represent BTL chosen for sending request */ struct mca_pml_ob1_com_btl_t { mca_bml_base_btl_t *bml_btl; - struct mca_mpool_base_registration_t* btl_reg; + struct mca_btl_base_registration_handle_t *btl_reg; size_t length; }; typedef struct mca_pml_ob1_com_btl_t mca_pml_ob1_com_btl_t; diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 3574bb390d..29462932f1 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -12,6 +13,8 @@ * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -123,19 +126,20 @@ size_t mca_pml_ob1_rdma_cuda_btls( mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n); if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) { - mca_mpool_base_registration_t* reg = NULL; - mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool; + mca_btl_base_registration_handle_t *handle = NULL; - if( NULL != btl_mpool ) { + if( NULL != bml_btl->btl->btl_register_mem ) { /* register the memory */ - btl_mpool->mpool_register(btl_mpool, base, size, MCA_MPOOL_FLAGS_CUDA_GPU_MEM, ®); + handle = bml_btl->btl->btl_register_mem (bml_btl->btl, bml_btl->btl_endpoint, + base, size, MCA_BTL_REG_FLAG_CUDA_GPU_MEM | + MCA_BTL_REG_FLAG_REMOTE_READ); } - if(NULL == reg) + if(NULL == handle) continue; rdma_btls[num_btls_used].bml_btl = bml_btl; - rdma_btls[num_btls_used].btl_reg = reg; + rdma_btls[num_btls_used].btl_reg = handle; weight_total += bml_btl->btl_weight; num_btls_used++; } diff --git a/ompi/mca/pml/ob1/pml_ob1_hdr.h b/ompi/mca/pml/ob1/pml_ob1_hdr.h index 71e52ae608..e53f4afd90 100644 --- a/ompi/mca/pml/ob1/pml_ob1_hdr.h +++ b/ompi/mca/pml/ob1/pml_ob1_hdr.h @@ -11,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2009 IBM Corporation. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ * @@ -64,6 +64,13 @@ struct mca_pml_ob1_common_hdr_t { }; typedef struct mca_pml_ob1_common_hdr_t mca_pml_ob1_common_hdr_t; +static inline void mca_pml_ob1_common_hdr_prepare (mca_pml_ob1_common_hdr_t *hdr, uint8_t hdr_type, + uint8_t hdr_flags) +{ + hdr->hdr_type = hdr_type; + hdr->hdr_flags = hdr_flags; +} + #define MCA_PML_OB1_COMMON_HDR_NTOH(h) #define MCA_PML_OB1_COMMON_HDR_HTON(h) @@ -89,15 +96,19 @@ struct mca_pml_ob1_match_hdr_t { typedef struct mca_pml_ob1_match_hdr_t mca_pml_ob1_match_hdr_t; +static inline void mca_pml_ob1_match_hdr_prepare (mca_pml_ob1_match_hdr_t *hdr, uint8_t hdr_type, uint8_t hdr_flags, + uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq) +{ + mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, hdr_type, hdr_flags); + hdr->hdr_ctx = hdr_ctx; + hdr->hdr_src = hdr_src; + hdr->hdr_tag = hdr_tag; + hdr->hdr_seq = hdr_seq; #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG -#define MCA_PML_OB1_MATCH_HDR_FILL(h) \ -do { \ - (h).hdr_padding[0] = 0; \ - (h).hdr_padding[1] = 0; \ -} while(0) -#else -#define MCA_PML_OB1_MATCH_HDR_FILL(h) -#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */ + hdr->hdr_padding[0] = 0; + hdr->hdr_padding[1] = 0; +#endif +} #define MCA_PML_OB1_MATCH_HDR_NTOH(h) \ do { \ @@ -111,7 +122,6 @@ do { \ #define MCA_PML_OB1_MATCH_HDR_HTON(h) \ do { \ MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \ - MCA_PML_OB1_MATCH_HDR_FILL(h); \ (h).hdr_ctx = htons((h).hdr_ctx); \ (h).hdr_src = htonl((h).hdr_src); \ (h).hdr_tag = htonl((h).hdr_tag); \ @@ -130,12 +140,14 @@ struct mca_pml_ob1_rendezvous_hdr_t { }; typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t; -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG -#define MCA_PML_OB1_RNDV_HDR_FILL(h) \ - MCA_PML_OB1_MATCH_HDR_FILL((h).hdr_match) -#else -#define MCA_PML_OB1_RNDV_HDR_FILL(h) -#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */ +static inline void 
mca_pml_ob1_rendezvous_hdr_prepare (mca_pml_ob1_rendezvous_hdr_t *hdr, uint8_t hdr_type, uint8_t hdr_flags, + uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq, + uint64_t hdr_msg_length, void *hdr_src_req) +{ + mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, hdr_type, hdr_flags, hdr_ctx, hdr_src, hdr_tag, hdr_seq); + hdr->hdr_msg_length = hdr_msg_length; + hdr->hdr_src_req.pval = hdr_src_req; +} /* Note that hdr_src_req is not put in network byte order because it is never processed by the receiver, other than being copied into @@ -149,7 +161,6 @@ typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t; #define MCA_PML_OB1_RNDV_HDR_HTON(h) \ do { \ MCA_PML_OB1_MATCH_HDR_HTON((h).hdr_match); \ - MCA_PML_OB1_RNDV_HDR_FILL(h); \ (h).hdr_msg_length = hton64((h).hdr_msg_length); \ } while (0) @@ -158,38 +169,47 @@ typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t; */ struct mca_pml_ob1_rget_hdr_t { mca_pml_ob1_rendezvous_hdr_t hdr_rndv; - uint32_t hdr_seg_cnt; /**< number of segments for rdma */ #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT uint8_t hdr_padding[4]; #endif - opal_ptr_t hdr_des; /**< source descriptor */ + opal_ptr_t hdr_frag; /**< source fragment (for fin) */ + uint64_t hdr_src_ptr; /**< source pointer */ + /* btl registration handle data follows */ }; typedef struct mca_pml_ob1_rget_hdr_t mca_pml_ob1_rget_hdr_t; +static inline void mca_pml_ob1_rget_hdr_prepare (mca_pml_ob1_rget_hdr_t *hdr, uint8_t hdr_flags, + uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq, + uint64_t hdr_msg_length, void *hdr_src_req, void *hdr_frag, + void *hdr_src_ptr, void *local_handle, size_t local_handle_size) +{ + mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RGET, hdr_flags, + hdr_ctx, hdr_src, hdr_tag, hdr_seq, hdr_msg_length, hdr_src_req); #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG -#define MCA_PML_OB1_RGET_HDR_FILL(h) \ -do { \ - 
MCA_PML_OB1_RNDV_HDR_FILL((h).hdr_rndv); \ - (h).hdr_padding[0] = 0; \ - (h).hdr_padding[1] = 0; \ - (h).hdr_padding[2] = 0; \ - (h).hdr_padding[3] = 0; \ -} while(0) -#else -#define MCA_PML_OB1_RGET_HDR_FILL(h) -#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */ + hdr->hdr_padding[0] = 0; + hdr->hdr_padding[1] = 0; + hdr->hdr_padding[2] = 0; + hdr->hdr_padding[3] = 0; +#endif + hdr->hdr_frag.pval = hdr_frag; + hdr->hdr_src_ptr = (uint64_t)(intptr_t) hdr_src_ptr; -#define MCA_PML_OB1_RGET_HDR_NTOH(h) \ - do { \ - MCA_PML_OB1_RNDV_HDR_NTOH((h).hdr_rndv); \ - (h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \ + /* copy registration handle */ + memcpy (hdr + 1, local_handle, local_handle_size); +} + +#define MCA_PML_OB1_RGET_HDR_NTOH(h) \ + do { \ + MCA_PML_OB1_RNDV_HDR_NTOH((h).hdr_rndv); \ + (h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \ + (h).hdr_src_ptr = ntoh64((h).hdr_src_ptr); \ } while (0) -#define MCA_PML_OB1_RGET_HDR_HTON(h) \ - do { \ - MCA_PML_OB1_RNDV_HDR_HTON((h).hdr_rndv); \ - MCA_PML_OB1_RGET_HDR_FILL(h); \ - (h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \ +#define MCA_PML_OB1_RGET_HDR_HTON(h) \ + do { \ + MCA_PML_OB1_RNDV_HDR_HTON((h).hdr_rndv); \ + (h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \ + (h).hdr_src_ptr = hton64((h).hdr_src_ptr); \ } while (0) /** @@ -206,19 +226,23 @@ struct mca_pml_ob1_frag_hdr_t { }; typedef struct mca_pml_ob1_frag_hdr_t mca_pml_ob1_frag_hdr_t; +static inline void mca_pml_ob1_frag_hdr_prepare (mca_pml_ob1_frag_hdr_t *hdr, uint8_t hdr_flags, + uint64_t hdr_frag_offset, void *hdr_src_req, + uint64_t hdr_dst_req) +{ + mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_FRAG, hdr_flags); #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG -#define MCA_PML_OB1_FRAG_HDR_FILL(h) \ -do { \ - (h).hdr_padding[0] = 0; \ - (h).hdr_padding[1] = 0; \ - (h).hdr_padding[2] = 0; \ - (h).hdr_padding[3] = 0; \ - (h).hdr_padding[4] = 0; \ - (h).hdr_padding[5] = 0; \ -} while(0) -#else -#define 
MCA_PML_OB1_FRAG_HDR_FILL(h) -#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */ + hdr->hdr_padding[0] = 0; + hdr->hdr_padding[1] = 0; + hdr->hdr_padding[2] = 0; + hdr->hdr_padding[3] = 0; + hdr->hdr_padding[4] = 0; + hdr->hdr_padding[5] = 0; +#endif + hdr->hdr_frag_offset = hdr_frag_offset; + hdr->hdr_src_req.pval = hdr_src_req; + hdr->hdr_dst_req.lval = hdr_dst_req; +} #define MCA_PML_OB1_FRAG_HDR_NTOH(h) \ do { \ @@ -229,7 +253,6 @@ do { \ #define MCA_PML_OB1_FRAG_HDR_HTON(h) \ do { \ MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \ - MCA_PML_OB1_FRAG_HDR_FILL(h); \ (h).hdr_frag_offset = hton64((h).hdr_frag_offset); \ } while (0) @@ -245,38 +268,45 @@ struct mca_pml_ob1_ack_hdr_t { opal_ptr_t hdr_src_req; /**< source request */ opal_ptr_t hdr_dst_req; /**< matched receive request */ uint64_t hdr_send_offset; /**< starting point of copy in/out */ + uint64_t hdr_send_size; /**< number of bytes requested (0: all remaining) */ }; typedef struct mca_pml_ob1_ack_hdr_t mca_pml_ob1_ack_hdr_t; +static inline void mca_pml_ob1_ack_hdr_prepare (mca_pml_ob1_ack_hdr_t *hdr, uint8_t hdr_flags, + uint64_t hdr_src_req, void *hdr_dst_req, + uint64_t hdr_send_offset, uint64_t hdr_send_size) +{ + mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_ACK, hdr_flags); #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG -#define MCA_PML_OB1_ACK_HDR_FILL(h) \ -do { \ - (h).hdr_padding[0] = 0; \ - (h).hdr_padding[1] = 0; \ - (h).hdr_padding[2] = 0; \ - (h).hdr_padding[3] = 0; \ - (h).hdr_padding[4] = 0; \ - (h).hdr_padding[5] = 0; \ -} while (0) -#else -#define MCA_PML_OB1_ACK_HDR_FILL(h) -#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */ + hdr->hdr_padding[0] = 0; + hdr->hdr_padding[1] = 0; + hdr->hdr_padding[2] = 0; + hdr->hdr_padding[3] = 0; + hdr->hdr_padding[4] = 0; + hdr->hdr_padding[5] = 0; +#endif + hdr->hdr_src_req.lval = hdr_src_req; + hdr->hdr_dst_req.pval = hdr_dst_req; + hdr->hdr_send_offset = hdr_send_offset; + 
hdr->hdr_send_size = hdr_send_size; +} /* Note that the request headers are not put in NBO because the src_req is already in receiver's byte order and the dst_req is not used by the receiver for anything other than backpointers in return headers */ -#define MCA_PML_OB1_ACK_HDR_NTOH(h) \ - do { \ - MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \ +#define MCA_PML_OB1_ACK_HDR_NTOH(h) \ + do { \ + MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \ (h).hdr_send_offset = ntoh64((h).hdr_send_offset); \ + (h).hdr_send_size = ntoh64((h).hdr_send_size); \ } while (0) -#define MCA_PML_OB1_ACK_HDR_HTON(h) \ - do { \ - MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \ - MCA_PML_OB1_ACK_HDR_FILL(h); \ +#define MCA_PML_OB1_ACK_HDR_HTON(h) \ + do { \ + MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \ (h).hdr_send_offset = hton64((h).hdr_send_offset); \ + (h).hdr_send_size = hton64((h).hdr_send_size); \ } while (0) /** @@ -288,38 +318,55 @@ struct mca_pml_ob1_rdma_hdr_t { #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT uint8_t hdr_padding[2]; /** two to pad out the hdr to a 4 byte alignment. 
hdr_req will then be 8 byte aligned after 4 for hdr_seg_cnt */ #endif - uint32_t hdr_seg_cnt; /**< number of segments for rdma */ + /* TODO: add real support for multiple destination segments */ opal_ptr_t hdr_req; /**< destination request */ - opal_ptr_t hdr_des; /**< source descriptor */ + opal_ptr_t hdr_frag; /**< receiver fragment */ opal_ptr_t hdr_recv_req; /**< receive request (NTH: needed for put fallback on send) */ - uint64_t hdr_rdma_offset; /**< current offset into user buffer */ - mca_btl_base_segment_t hdr_segs[1]; /**< list of segments for rdma */ + uint64_t hdr_rdma_offset; /**< current offset into user buffer */ + uint64_t hdr_dst_ptr; /**< destination address */ + uint64_t hdr_dst_size; /**< destination size */ + /* registration data follows */ }; typedef struct mca_pml_ob1_rdma_hdr_t mca_pml_ob1_rdma_hdr_t; +static inline void mca_pml_ob1_rdma_hdr_prepare (mca_pml_ob1_rdma_hdr_t *hdr, uint8_t hdr_flags, + uint64_t hdr_req, void *hdr_frag, void *hdr_recv_req, + uint64_t hdr_rdma_offset, void *hdr_dst_ptr, + uint64_t hdr_dst_size, void *local_handle, + size_t local_handle_size) +{ + mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_PUT, hdr_flags); #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG -#define MCA_PML_OB1_RDMA_HDR_FILL(h) \ -do { \ - (h).hdr_padding[0] = 0; \ - (h).hdr_padding[1] = 0; \ -} while(0) -#else -#define MCA_PML_OB1_RDMA_HDR_FILL(h) -#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */ + hdr->hdr_padding[0] = 0; + hdr->hdr_padding[1] = 0; +#endif + hdr->hdr_req.lval = hdr_req; + hdr->hdr_frag.pval = hdr_frag; + hdr->hdr_recv_req.pval = hdr_recv_req; + hdr->hdr_rdma_offset = hdr_rdma_offset; + hdr->hdr_dst_ptr = (uint64_t)(intptr_t) hdr_dst_ptr; + hdr->hdr_dst_size = hdr_dst_size; -#define MCA_PML_OB1_RDMA_HDR_NTOH(h) \ - do { \ - MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \ - (h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \ + /* copy segments */ + memcpy (hdr + 1, local_handle, 
local_handle_size); +} + +#define MCA_PML_OB1_RDMA_HDR_NTOH(h) \ + do { \ + MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \ + (h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \ (h).hdr_rdma_offset = ntoh64((h).hdr_rdma_offset); \ + (h).hdr_dst_ptr = ntoh64((h).hdr_dst_ptr); \ + (h).hdr_dst_size = ntoh64((h).hdr_dst_size); \ } while (0) -#define MCA_PML_OB1_RDMA_HDR_HTON(h) \ - do { \ - MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \ - MCA_PML_OB1_RDMA_HDR_FILL(h); \ - (h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \ +#define MCA_PML_OB1_RDMA_HDR_HTON(h) \ + do { \ + MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \ + (h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \ (h).hdr_rdma_offset = hton64((h).hdr_rdma_offset); \ + (h).hdr_dst_ptr = hton64((h).hdr_dst_ptr); \ + (h).hdr_dst_size = hton64((h).hdr_dst_size); \ } while (0) /** @@ -331,31 +378,34 @@ struct mca_pml_ob1_fin_hdr_t { #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT uint8_t hdr_padding[2]; #endif - uint32_t hdr_fail; /**< RDMA operation failed */ - opal_ptr_t hdr_des; /**< completed descriptor */ + int64_t hdr_size; /**< number of bytes completed (positive), error code (negative) */ + opal_ptr_t hdr_frag; /**< completed RDMA fragment */ }; typedef struct mca_pml_ob1_fin_hdr_t mca_pml_ob1_fin_hdr_t; +static inline void mca_pml_ob1_fin_hdr_prepare (mca_pml_ob1_fin_hdr_t *hdr, uint8_t hdr_flags, + uint64_t hdr_frag, int64_t hdr_size) +{ + mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_FIN, hdr_flags); #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG -#define MCA_PML_OB1_FIN_HDR_FILL(h) \ -do { \ - (h).hdr_padding[0] = 0; \ - (h).hdr_padding[1] = 0; \ -} while (0) -#else -#define MCA_PML_OB1_FIN_HDR_FILL(h) -#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */ + hdr->hdr_padding[0] = 0; + hdr->hdr_padding[1] = 0; +#endif + hdr->hdr_frag.lval = hdr_frag; + hdr->hdr_size = hdr_size; +} -#define MCA_PML_OB1_FIN_HDR_NTOH(h) \ - do { \ +#define MCA_PML_OB1_FIN_HDR_NTOH(h) \ + do { \ 
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \ + (h).hdr_size = ntoh64((h).hdr_size); \ } while (0) -#define MCA_PML_OB1_FIN_HDR_HTON(h) \ - do { \ +#define MCA_PML_OB1_FIN_HDR_HTON(h) \ + do { \ MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \ - MCA_PML_OB1_FIN_HDR_FILL(h); \ - } while (0) + (h).hdr_size = hton64((h).hdr_size); \ + } while (0) /** * Union of defined hdr types. diff --git a/ompi/mca/pml/ob1/pml_ob1_isend.c b/ompi/mca/pml/ob1/pml_ob1_isend.c index 852cf5fad5..157cddd730 100644 --- a/ompi/mca/pml/ob1/pml_ob1_isend.c +++ b/ompi/mca/pml/ob1/pml_ob1_isend.c @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science @@ -68,7 +68,6 @@ static inline int mca_pml_ob1_send_inline (void *buf, size_t count, ompi_proc_t *dst_proc, mca_bml_base_endpoint_t* endpoint, ompi_communicator_t * comm) { - mca_btl_base_descriptor_t *des = NULL; mca_pml_ob1_match_hdr_t match; mca_bml_base_btl_t *bml_btl; opal_convertor_t convertor; @@ -98,28 +97,21 @@ static inline int mca_pml_ob1_send_inline (void *buf, size_t count, size = 0; } - match.hdr_common.hdr_flags = 0; - match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH; - match.hdr_ctx = comm->c_contextid; - match.hdr_src = comm->c_my_rank; - match.hdr_tag = tag; - match.hdr_seq = seqn; + mca_pml_ob1_match_hdr_prepare (&match, MCA_PML_OB1_HDR_TYPE_MATCH, 0, + comm->c_contextid, comm->c_my_rank, + tag, seqn); ob1_hdr_hton(&match, MCA_PML_OB1_HDR_TYPE_MATCH, dst_proc); /* try to send immediately */ rc = mca_bml_base_sendi (bml_btl, &convertor, &match, OMPI_PML_OB1_MATCH_HDR_LEN, size, MCA_BTL_NO_ORDER, MCA_BTL_DES_FLAGS_PRIORITY | 
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, - MCA_PML_OB1_HDR_TYPE_MATCH, &des); + MCA_PML_OB1_HDR_TYPE_MATCH, NULL); if (count > 0) { opal_convertor_cleanup (&convertor); } if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - if (des) { - mca_bml_base_free (bml_btl, des); - } - return rc; } @@ -224,7 +216,7 @@ int mca_pml_ob1_send(void *buf, OBJ_CONSTRUCT(sendreq, mca_pml_ob1_send_request_t); sendreq->req_send.req_base.req_proc = dst_proc; - sendreq->src_des = NULL; + sendreq->rdma_frag = NULL; MCA_PML_OB1_SEND_REQUEST_INIT(sendreq, buf, diff --git a/ompi/mca/pml/ob1/pml_ob1_rdma.c b/ompi/mca/pml/ob1/pml_ob1_rdma.c index e1afda0689..c2c9bbbe89 100644 --- a/ompi/mca/pml/ob1/pml_ob1_rdma.c +++ b/ompi/mca/pml/ob1/pml_ob1_rdma.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -9,6 +10,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -27,11 +30,6 @@ #include "pml_ob1.h" #include "pml_ob1_rdma.h" -/* Use this registration if no registration needed for a BTL instead of NULL. - * This will help other code to distinguish case when memory is not registered - * from case when registration is not needed */ -static mca_mpool_base_registration_t pml_ob1_dummy_reg; - /* * Check to see if memory is registered or can be registered. Build a * set of registrations on the request. 
@@ -45,7 +43,7 @@ size_t mca_pml_ob1_rdma_btls( { int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma); double weight_total = 0; - int num_btls_used = 0, n; + int num_btls_used = 0; /* shortcut when there are no rdma capable btls */ if(num_btls == 0) { @@ -53,29 +51,33 @@ size_t mca_pml_ob1_rdma_btls( } /* check to see if memory is registered */ - for(n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request; - n++) { + for (int n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request; n++) { mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma, - (bml_endpoint->btl_rdma_index + n) % num_btls); - mca_mpool_base_registration_t* reg = &pml_ob1_dummy_reg; - mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool; + (bml_endpoint->btl_rdma_index + n) % num_btls); + mca_btl_base_registration_handle_t *reg_handle = NULL; + mca_btl_base_module_t *btl = bml_btl->btl; - if( NULL != btl_mpool ) { - if(!mca_pml_ob1.leave_pinned) { - /* look through existing registrations */ - btl_mpool->mpool_find(btl_mpool, base, size, ®); - } else { - /* register the memory */ - btl_mpool->mpool_register(btl_mpool, base, size, 0, ®); + if (btl->btl_register_mem) { + /* do not use the RDMA protocol with this btl if 1) leave pinned is disabled, + * 2) the btl supports put, and 3) the fragment is larger than the minimum + * pipeline size specified by the BTL */ + if (!mca_pml_ob1.leave_pinned && (btl->btl_flags & MCA_BTL_FLAGS_PUT) && + size > btl->btl_min_rdma_pipeline_size) { + continue; } - if(NULL == reg) + /* try to register the memory region with the btl */ + reg_handle = btl->btl_register_mem (btl, bml_btl->btl_endpoint, base, + size, MCA_BTL_REG_FLAG_REMOTE_READ); + if (NULL == reg_handle) { + /* btl requires registration but the registration failed */ continue; - } + } + } /* else no registration is needed with this btl */ rdma_btls[num_btls_used].bml_btl = bml_btl; - 
rdma_btls[num_btls_used].btl_reg = reg; + rdma_btls[num_btls_used].btl_reg = reg_handle; weight_total += bml_btl->btl_weight; num_btls_used++; } @@ -83,7 +85,7 @@ size_t mca_pml_ob1_rdma_btls( /* if we don't use leave_pinned and all BTLs that already have this memory * registered amount to less then half of available bandwidth - fall back to * pipeline protocol */ - if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5)) + if (0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5)) return 0; mca_pml_ob1_calc_weighted_length(rdma_btls, num_btls_used, size, @@ -103,10 +105,7 @@ size_t mca_pml_ob1_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint, for(i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; i++) { rdma_btls[i].bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma); - if(NULL != rdma_btls[i].bml_btl->btl->btl_mpool) - rdma_btls[i].btl_reg = NULL; - else - rdma_btls[i].btl_reg = &pml_ob1_dummy_reg; + rdma_btls[i].btl_reg = NULL; weight_total += rdma_btls[i].bml_btl->btl_weight; } diff --git a/ompi/mca/pml/ob1/pml_ob1_rdmafrag.c b/ompi/mca/pml/ob1/pml_ob1_rdmafrag.c index c814141a8e..cc13628eb4 100644 --- a/ompi/mca/pml/ob1/pml_ob1_rdmafrag.c +++ b/ompi/mca/pml/ob1/pml_ob1_rdmafrag.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -9,6 +10,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -21,9 +24,13 @@ #include "pml_ob1.h" #include "pml_ob1_rdmafrag.h" +static void mca_pml_ob1_rdma_frag_constructor (mca_pml_ob1_rdma_frag_t *frag) +{ + frag->local_handle = NULL; +} OBJ_CLASS_INSTANCE( mca_pml_ob1_rdma_frag_t, ompi_free_list_item_t, - NULL, + mca_pml_ob1_rdma_frag_constructor, NULL); diff --git a/ompi/mca/pml/ob1/pml_ob1_rdmafrag.h b/ompi/mca/pml/ob1/pml_ob1_rdmafrag.h index 287daed022..132c962833 100644 --- a/ompi/mca/pml/ob1/pml_ob1_rdmafrag.h +++ b/ompi/mca/pml/ob1/pml_ob1_rdmafrag.h @@ -10,6 +10,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -32,38 +34,52 @@ typedef enum { MCA_PML_OB1_RDMA_GET } mca_pml_ob1_rdma_state_t; +struct mca_pml_ob1_rdma_frag_t; + +typedef void (*mca_pml_ob1_rdma_frag_callback_t)(struct mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_length); + +/** + * Used to keep track of local and remote RDMA operations. 
+ */ struct mca_pml_ob1_rdma_frag_t { ompi_free_list_item_t super; - mca_bml_base_btl_t* rdma_bml; + mca_bml_base_btl_t *rdma_bml; mca_pml_ob1_hdr_t rdma_hdr; mca_pml_ob1_rdma_state_t rdma_state; size_t rdma_length; - uint8_t rdma_segs[MCA_BTL_SEG_MAX_SIZE * MCA_BTL_DES_MAX_SEGMENTS]; void *rdma_req; - struct mca_bml_base_endpoint_t* rdma_ep; - opal_convertor_t convertor; - mca_mpool_base_registration_t* reg; uint32_t retries; + mca_pml_ob1_rdma_frag_callback_t cbfunc; + + uint64_t rdma_offset; + void *local_address; + mca_btl_base_registration_handle_t *local_handle; + + uint64_t remote_address; + uint8_t remote_handle[MCA_BTL_REG_HANDLE_MAX_SIZE]; }; typedef struct mca_pml_ob1_rdma_frag_t mca_pml_ob1_rdma_frag_t; OBJ_CLASS_DECLARATION(mca_pml_ob1_rdma_frag_t); -#define MCA_PML_OB1_RDMA_FRAG_ALLOC(frag) \ -do { \ - ompi_free_list_item_t* item; \ +#define MCA_PML_OB1_RDMA_FRAG_ALLOC(frag) \ + do { \ + ompi_free_list_item_t* item; \ OMPI_FREE_LIST_WAIT_MT(&mca_pml_ob1.rdma_frags, item); \ - frag = (mca_pml_ob1_rdma_frag_t*)item; \ -} while(0) - -#define MCA_PML_OB1_RDMA_FRAG_RETURN(frag) \ -do { \ - /* return fragment */ \ - OMPI_FREE_LIST_RETURN_MT(&mca_pml_ob1.rdma_frags, \ - (ompi_free_list_item_t*)frag); \ + frag = (mca_pml_ob1_rdma_frag_t*)item; \ } while(0) +#define MCA_PML_OB1_RDMA_FRAG_RETURN(frag) \ + do { \ + /* return fragment */ \ + if (frag->local_handle) { \ + mca_bml_base_deregister_mem (frag->rdma_bml, frag->local_handle); \ + frag->local_handle = NULL; \ + } \ + OMPI_FREE_LIST_RETURN_MT(&mca_pml_ob1.rdma_frags, \ + (ompi_free_list_item_t*)frag); \ + } while (0) END_C_DECLS diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c index 337496dc7b..e37e1e7144 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c @@ -13,7 +13,7 @@ * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2006-2008 University of Houston. All rights reserved. 
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -108,13 +108,13 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl, mca_btl_base_descriptor_t* des, void* cbdata ) { - mca_btl_base_segment_t* segments = des->des_local; + mca_btl_base_segment_t* segments = des->des_segments; mca_pml_ob1_match_hdr_t* hdr = (mca_pml_ob1_match_hdr_t*)segments->seg_addr.pval; ompi_communicator_t *comm_ptr; mca_pml_ob1_recv_request_t *match = NULL; mca_pml_ob1_comm_t *comm; mca_pml_ob1_comm_proc_t *proc; - size_t num_segments = des->des_local_count; + size_t num_segments = des->des_segment_count; size_t bytes_received = 0; assert(num_segments <= MCA_BTL_DES_MAX_SEGMENTS); @@ -256,7 +256,7 @@ void mca_pml_ob1_recv_frag_callback_rndv(mca_btl_base_module_t* btl, mca_btl_base_descriptor_t* des, void* cbdata ) { - mca_btl_base_segment_t* segments = des->des_local; + mca_btl_base_segment_t* segments = des->des_segments; mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval; if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) { @@ -264,7 +264,7 @@ void mca_pml_ob1_recv_frag_callback_rndv(mca_btl_base_module_t* btl, } ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_RNDV); mca_pml_ob1_recv_frag_match(btl, &hdr->hdr_match, segments, - des->des_local_count, MCA_PML_OB1_HDR_TYPE_RNDV); + des->des_segment_count, MCA_PML_OB1_HDR_TYPE_RNDV); return; } @@ -273,7 +273,7 @@ void mca_pml_ob1_recv_frag_callback_rget(mca_btl_base_module_t* btl, mca_btl_base_descriptor_t* des, void* cbdata ) { - mca_btl_base_segment_t* segments = des->des_local; + mca_btl_base_segment_t* segments = des->des_segments; mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval; if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) { @@ -281,7 +281,7 @@ 
void mca_pml_ob1_recv_frag_callback_rget(mca_btl_base_module_t* btl, } ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_RGET); mca_pml_ob1_recv_frag_match(btl, &hdr->hdr_match, segments, - des->des_local_count, MCA_PML_OB1_HDR_TYPE_RGET); + des->des_segment_count, MCA_PML_OB1_HDR_TYPE_RGET); return; } @@ -292,9 +292,10 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl, mca_btl_base_descriptor_t* des, void* cbdata ) { - mca_btl_base_segment_t* segments = des->des_local; + mca_btl_base_segment_t* segments = des->des_segments; mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval; mca_pml_ob1_send_request_t* sendreq; + size_t size; if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) { return; @@ -307,19 +308,25 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl, /* if the request should be delivered entirely by copy in/out * then throttle sends */ if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA) { - if (NULL != sendreq->src_des) { - /* release registered memory */ - mca_bml_base_free (sendreq->req_rdma[0].bml_btl, sendreq->src_des); - sendreq->src_des = NULL; + if (NULL != sendreq->rdma_frag) { + if (NULL != sendreq->rdma_frag->local_handle) { + mca_bml_base_deregister_mem (sendreq->req_rdma[0].bml_btl, sendreq->rdma_frag->local_handle); + sendreq->rdma_frag->local_handle = NULL; + } + MCA_PML_OB1_RDMA_FRAG_RETURN(sendreq->rdma_frag); + sendreq->rdma_frag = NULL; } sendreq->req_throttle_sends = true; } - - mca_pml_ob1_send_request_copy_in_out(sendreq, - hdr->hdr_ack.hdr_send_offset, - sendreq->req_send.req_bytes_packed - - hdr->hdr_ack.hdr_send_offset); + + if (hdr->hdr_ack.hdr_send_size) { + size = hdr->hdr_ack.hdr_send_size; + } else { + size = sendreq->req_send.req_bytes_packed - hdr->hdr_ack.hdr_send_offset; + } + + mca_pml_ob1_send_request_copy_in_out(sendreq, hdr->hdr_ack.hdr_send_offset, size); if (sendreq->req_state != 0) { /* Typical receipt of an ACK message causes req_state to be 
@@ -355,13 +362,14 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata ) { - mca_btl_base_segment_t* segments = des->des_local; + mca_btl_base_segment_t* segments = des->des_segments; mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval; mca_pml_ob1_recv_request_t* recvreq; if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) { return; } + ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FRAG); recvreq = (mca_pml_ob1_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval; #if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */ @@ -372,7 +380,7 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl, assert(btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV); /* This will trigger the opal_convertor_pack to start asynchronous copy. */ - mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,des->des_local_count,des); + mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,des->des_segment_count,des); /* Let BTL know that it CANNOT free the frag */ des->des_flags |= MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC; @@ -380,7 +388,8 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl, return; } #endif /* OPAL_CUDA_SUPPORT */ - mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_local_count); + + mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_segment_count); return; } @@ -390,7 +399,7 @@ void mca_pml_ob1_recv_frag_callback_put(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata ) { - mca_btl_base_segment_t* segments = des->des_local; + mca_btl_base_segment_t* segments = des->des_segments; mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval; mca_pml_ob1_send_request_t* sendreq; @@ -410,20 +419,17 @@ void mca_pml_ob1_recv_frag_callback_fin(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata ) { - 
mca_btl_base_segment_t* segments = des->des_local; - mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval; - mca_btl_base_descriptor_t* rdma; + mca_btl_base_segment_t* segments = des->des_segments; + mca_pml_ob1_fin_hdr_t* hdr = (mca_pml_ob1_fin_hdr_t *) segments->seg_addr.pval; + mca_pml_ob1_rdma_frag_t *frag; - if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) { + if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_fin_hdr_t)) ) { return; } - + ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FIN); - rdma = (mca_btl_base_descriptor_t*)hdr->hdr_fin.hdr_des.pval; - rdma->des_cbfunc(btl, NULL, rdma, - hdr->hdr_fin.hdr_fail ? OMPI_ERROR : OMPI_SUCCESS); - - return; + frag = (mca_pml_ob1_rdma_frag_t *) hdr->hdr_frag.pval; + frag->cbfunc (frag, hdr->hdr_size); } @@ -699,7 +705,7 @@ out_of_order_match: OPAL_THREAD_UNLOCK(&comm->matching_lock); if(OPAL_LIKELY(match)) { - switch(type) { + switch(type) { case MCA_PML_OB1_HDR_TYPE_MATCH: mca_pml_ob1_recv_request_progress_match(match, btl, segments, num_segments); break; diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index a8206af0ca..92f068df9b 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -13,7 +13,7 @@ * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. * Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2012 FUJITSU LIMITED. All rights reserved. 
* Copyright (c) 2014 Research Organization for Information Science @@ -150,12 +150,17 @@ static void mca_pml_ob1_recv_request_construct(mca_pml_ob1_recv_request_t* reque request->req_recv.req_base.req_ompi.req_free = mca_pml_ob1_recv_request_free; request->req_recv.req_base.req_ompi.req_cancel = mca_pml_ob1_recv_request_cancel; request->req_rdma_cnt = 0; + request->local_handle = NULL; OBJ_CONSTRUCT(&request->lock, opal_mutex_t); } static void mca_pml_ob1_recv_request_destruct(mca_pml_ob1_recv_request_t* request) { OBJ_DESTRUCT(&request->lock); + if (OPAL_UNLIKELY(request->local_handle)) { + mca_bml_base_deregister_mem (request->rdma_bml, request->local_handle); + request->local_handle = NULL; + } } OBJ_CLASS_INSTANCE( @@ -183,31 +188,27 @@ static void mca_pml_ob1_recv_ctl_completion( mca_btl_base_module_t* btl, * Put operation has completed remotely - update request status */ -static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* des, - int status ) +static void mca_pml_ob1_put_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_size) { - mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; - mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)des->des_cbdata; - size_t bytes_received = 0; + mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req; + mca_bml_base_btl_t *bml_btl = frag->rdma_bml; - if( OPAL_LIKELY(status == OMPI_SUCCESS) ) { - bytes_received = mca_pml_ob1_compute_segment_length (btl->btl_seg_size, - (void *) des->des_local, - des->des_local_count, 0); - } OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,-1); - mca_bml_base_free(bml_btl, des); + MCA_PML_OB1_RDMA_FRAG_RETURN(frag); - /* check completion status */ - OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received); - if(recv_request_pml_complete_check(recvreq) == false && + if (OPAL_LIKELY(0 < rdma_size)) { + assert ((uint64_t) 
rdma_size == frag->rdma_length); + + /* check completion status */ + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, (size_t) rdma_size); + if (recv_request_pml_complete_check(recvreq) == false && recvreq->req_rdma_offset < recvreq->req_send_offset) { - /* schedule additional rdma operations */ - mca_pml_ob1_recv_request_schedule(recvreq, bml_btl); + /* schedule additional rdma operations */ + mca_pml_ob1_recv_request_schedule(recvreq, bml_btl); + } } + MCA_PML_OB1_PROGRESS_PENDING(bml_btl); } @@ -218,7 +219,7 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl, int mca_pml_ob1_recv_request_ack_send_btl( ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset, - bool nordma) + uint64_t size, bool nordma) { mca_btl_base_descriptor_t* des; mca_pml_ob1_ack_hdr_t* ack; @@ -234,12 +235,9 @@ int mca_pml_ob1_recv_request_ack_send_btl( } /* fill out header */ - ack = (mca_pml_ob1_ack_hdr_t*)des->des_local->seg_addr.pval; - ack->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_ACK; - ack->hdr_common.hdr_flags = nordma ? MCA_PML_OB1_HDR_FLAGS_NORDMA : 0; - ack->hdr_src_req.lval = hdr_src_req; - ack->hdr_dst_req.pval = hdr_dst_req; - ack->hdr_send_offset = hdr_send_offset; + ack = (mca_pml_ob1_ack_hdr_t*)des->des_segments->seg_addr.pval; + mca_pml_ob1_ack_hdr_prepare (ack, nordma ? 
MCA_PML_OB1_HDR_FLAGS_NORDMA : 0, + hdr_src_req, hdr_dst_req, hdr_send_offset, size); ob1_hdr_hton(ack, MCA_PML_OB1_HDR_TYPE_ACK, proc); @@ -313,63 +311,99 @@ static int mca_pml_ob1_recv_request_ack( if(recvreq->req_send_offset == hdr->hdr_msg_length) return OMPI_SUCCESS; } + /* let know to shedule function there is no need to put ACK flag */ recvreq->req_ack_sent = true; return mca_pml_ob1_recv_request_ack_send(proc, hdr->hdr_src_req.lval, - recvreq, recvreq->req_send_offset, + recvreq, recvreq->req_send_offset, 0, recvreq->req_send_offset == bytes_received); } +static int mca_pml_ob1_recv_request_put_frag (mca_pml_ob1_rdma_frag_t *frag); + +static int mca_pml_ob1_recv_request_get_frag_failed (mca_pml_ob1_rdma_frag_t *frag, int rc) +{ + mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req; + ompi_proc_t *proc = (ompi_proc_t *) recvreq->req_recv.req_base.req_proc; + + if (OMPI_ERR_NOT_AVAILABLE == rc) { + /* get isn't supported for this transfer. tell peer to fallback on put */ + rc = mca_pml_ob1_recv_request_put_frag (frag); + if (OMPI_ERR_OUT_OF_RESOURCE == rc) { + OPAL_THREAD_LOCK(&mca_pml_ob1.lock); + opal_list_append (&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); + + return OMPI_SUCCESS; + } + } + + if (++frag->retries < mca_pml_ob1.rdma_retries_limit && + OMPI_ERR_OUT_OF_RESOURCE == rc) { + OPAL_THREAD_LOCK(&mca_pml_ob1.lock); + opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); + + return OMPI_SUCCESS; + } + + /* tell peer to fall back on send for this region */ + rc = mca_pml_ob1_recv_request_ack_send(proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval, + recvreq, frag->rdma_offset, frag->rdma_length, false); + MCA_PML_OB1_RDMA_FRAG_RETURN(frag); + return rc; +} + /** * Return resources used by the RDMA */ -static void mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - 
struct mca_btl_base_descriptor_t* des, - int status ) +static void mca_pml_ob1_rget_completion (mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep, + void *local_address, mca_btl_base_registration_handle_t *local_handle, + void *context, void *cbdata, int status) { - mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; - mca_pml_ob1_rdma_frag_t* frag = (mca_pml_ob1_rdma_frag_t*)des->des_cbdata; - mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)frag->rdma_req; + mca_bml_base_btl_t *bml_btl = (mca_bml_base_btl_t *) context; + mca_pml_ob1_rdma_frag_t *frag = (mca_pml_ob1_rdma_frag_t *) cbdata; + mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req; /* check completion status */ - if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { - /* TSW - FIX */ - OMPI_ERROR_LOG(status); - ompi_rte_abort(-1, NULL); + if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) { + status = mca_pml_ob1_recv_request_get_frag_failed (frag, status); + if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) { + /* TSW - FIX */ + OMPI_ERROR_LOG(status); + ompi_rte_abort(-1, NULL); + } + } else { + /* is receive request complete */ + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length); + /* TODO: re-add order */ + mca_pml_ob1_send_fin (recvreq->req_recv.req_base.req_proc, + bml_btl, frag->rdma_hdr.hdr_rget.hdr_frag, + frag->rdma_length, 0, 0); + + recv_request_pml_complete_check(recvreq); + + MCA_PML_OB1_RDMA_FRAG_RETURN(frag); } - /* is receive request complete */ - OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length); - if (recvreq->req_recv.req_bytes_packed <= recvreq->req_bytes_received) { - mca_pml_ob1_send_fin(recvreq->req_recv.req_base.req_proc, - bml_btl, - frag->rdma_hdr.hdr_rget.hdr_des, - des->order, 0); - } - - recv_request_pml_complete_check(recvreq); - - MCA_PML_OB1_RDMA_FRAG_RETURN(frag); - MCA_PML_OB1_PROGRESS_PENDING(bml_btl); } -static int mca_pml_ob1_init_get_fallback 
(mca_pml_ob1_rdma_frag_t *frag, - mca_btl_base_descriptor_t *dst) { +static int mca_pml_ob1_recv_request_put_frag (mca_pml_ob1_rdma_frag_t *frag) +{ mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req; mca_bml_base_btl_t *bml_btl = frag->rdma_bml; mca_btl_base_descriptor_t *ctl; mca_pml_ob1_rdma_hdr_t *hdr; - size_t seg_size; + size_t reg_size; int rc; - seg_size = bml_btl->btl->btl_seg_size * dst->des_local_count; + reg_size = bml_btl->btl->btl_registration_handle_size; /* prepare a descriptor for rdma control message */ - mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof (mca_pml_ob1_rdma_hdr_t) + seg_size, + mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof (mca_pml_ob1_rdma_hdr_t) + reg_size, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK | MCA_BTL_DES_FLAGS_SIGNAL); if (OPAL_UNLIKELY(NULL == ctl)) { @@ -378,26 +412,19 @@ static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag, ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion; /* fill in rdma header */ - hdr = (mca_pml_ob1_rdma_hdr_t *) ctl->des_local->seg_addr.pval; - hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT; - hdr->hdr_common.hdr_flags = - (!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0; + hdr = (mca_pml_ob1_rdma_hdr_t *) ctl->des_segments->seg_addr.pval; + mca_pml_ob1_rdma_hdr_prepare (hdr, (!recvreq->req_ack_sent) ? 
MCA_PML_OB1_HDR_TYPE_ACK : 0, + recvreq->remote_req_send.lval, frag, recvreq, frag->rdma_offset, + frag->local_address, frag->rdma_length, frag->local_handle, + reg_size); - hdr->hdr_req = frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req; - hdr->hdr_rdma_offset = recvreq->req_rdma_offset; - hdr->hdr_des.pval = dst; - hdr->hdr_recv_req.pval = recvreq; + frag->cbfunc = mca_pml_ob1_put_completion; - hdr->hdr_seg_cnt = dst->des_local_count; + recvreq->req_ack_sent = true; - /* copy segments */ - memcpy (hdr + 1, dst->des_local, seg_size); - - dst->des_cbfunc = mca_pml_ob1_put_completion; - dst->des_cbdata = recvreq; - - if (!recvreq->req_ack_sent) - recvreq->req_ack_sent = true; + PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE, + &(recvreq->req_recv.req_base), size, + PERUSE_RECV); /* send rdma request to peer */ rc = mca_bml_base_send (bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT); @@ -412,71 +439,38 @@ static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag, /* * */ -int mca_pml_ob1_recv_request_get_frag( mca_pml_ob1_rdma_frag_t* frag ) +int mca_pml_ob1_recv_request_get_frag (mca_pml_ob1_rdma_frag_t *frag) { - mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)frag->rdma_req; - mca_bml_base_btl_t* bml_btl = frag->rdma_bml; - mca_btl_base_descriptor_t* descriptor; - size_t save_size = frag->rdma_length; + mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req; + mca_btl_base_registration_handle_t *local_handle = NULL; + mca_bml_base_btl_t *bml_btl = frag->rdma_bml; int rc; /* prepare descriptor */ - mca_bml_base_prepare_dst( bml_btl, - NULL, - &recvreq->req_recv.req_base.req_convertor, - MCA_BTL_NO_ORDER, - 0, - &frag->rdma_length, - MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK | - MCA_BTL_DES_FLAGS_GET, - &descriptor ); - if( OPAL_UNLIKELY(NULL == descriptor) ) { - if (frag->retries < mca_pml_ob1.rdma_retries_limit) { - frag->rdma_length = save_size; - 
OPAL_THREAD_LOCK(&mca_pml_ob1.lock); - opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag); - OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); - return OMPI_ERR_OUT_OF_RESOURCE; - } else { - ompi_proc_t *proc = (ompi_proc_t *) recvreq->req_recv.req_base.req_proc; - - /* tell peer to fall back on send */ - recvreq->req_send_offset = 0; - rc = mca_pml_ob1_recv_request_ack_send(proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval, - recvreq, recvreq->req_send_offset, true); - MCA_PML_OB1_RDMA_FRAG_RETURN(frag); - return rc; + if (bml_btl->btl->btl_register_mem && !frag->local_handle && !recvreq->local_handle) { + mca_bml_base_register_mem (bml_btl, frag->local_address, frag->rdma_length, MCA_BTL_REG_FLAG_LOCAL_WRITE | + MCA_BTL_REG_FLAG_REMOTE_WRITE, &frag->local_handle); + if (OPAL_UNLIKELY(NULL == frag->local_handle)) { + return mca_pml_ob1_recv_request_get_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE); } } - descriptor->des_remote = (mca_btl_base_segment_t *) frag->rdma_segs; - descriptor->des_remote_count = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt; - descriptor->des_cbfunc = mca_pml_ob1_rget_completion; - descriptor->des_cbdata = frag; + if (frag->local_handle) { + local_handle = frag->local_handle; + } else if (recvreq->local_handle) { + local_handle = recvreq->local_handle; + } PERUSE_TRACE_COMM_OMPI_EVENT(PERUSE_COMM_REQ_XFER_CONTINUE, - &(recvreq->req_recv.req_base), + &(((mca_pml_ob1_recv_request_t *) frag->rdma_req)->req_recv.req_base), frag->rdma_length, PERUSE_RECV); /* queue up get request */ - rc = mca_bml_base_get(bml_btl,descriptor); + rc = mca_bml_base_get (bml_btl, frag->local_address, frag->remote_address, local_handle, + (mca_btl_base_registration_handle_t *) frag->remote_handle, frag->rdma_length, + 0, MCA_BTL_NO_ORDER, mca_pml_ob1_rget_completion, frag); if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { - if (OPAL_UNLIKELY(OMPI_ERR_NOT_AVAILABLE == rc)) { - /* get isn't supported for this transfer. 
tell peer to fallback on put */ - rc = mca_pml_ob1_init_get_fallback (frag, descriptor); - } - - if(OMPI_ERR_OUT_OF_RESOURCE == rc) { - mca_bml_base_free(bml_btl, descriptor); - OPAL_THREAD_LOCK(&mca_pml_ob1.lock); - opal_list_append(&mca_pml_ob1.rdma_pending, - (opal_list_item_t*)frag); - OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); - return OMPI_ERR_OUT_OF_RESOURCE; - } else if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - OMPI_ERROR_LOG(rc); - ompi_rte_abort(-1, NULL); - } + return mca_pml_ob1_recv_request_get_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE); } return OMPI_SUCCESS; @@ -502,6 +496,7 @@ void mca_pml_ob1_recv_request_progress_frag( mca_pml_ob1_recv_request_t* recvreq bytes_received = mca_pml_ob1_compute_segment_length_base (segments, num_segments, sizeof(mca_pml_ob1_frag_hdr_t)); data_offset = hdr->hdr_frag.hdr_frag_offset; + /* * Make user buffer accessible(defined) before unpacking. */ @@ -573,7 +568,7 @@ void mca_pml_ob1_recv_request_frag_copy_start( mca_pml_ob1_recv_request_t* recvr /* Store the receive request in unused context pointer. */ des->des_context = (void *)recvreq; /* Store the amount of bytes in unused remote count value */ - des->des_remote_count = bytes_delivered; + des->des_segment_count = bytes_delivered; /* Then record an event that will get triggered by a PML progress call which * checks the stream events. If we get an error, abort. Should get message * from CUDA code about what went wrong. */ @@ -598,7 +593,7 @@ void mca_pml_ob1_recv_request_frag_copy_finished( mca_btl_base_module_t* btl, int status ) { mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)des->des_context; - size_t bytes_received = des->des_remote_count; + size_t bytes_received = des->des_segment_count; OPAL_OUTPUT((-1, "frag_copy_finished (delivered=%d), frag=%p", (int)bytes_received, (void *)des)); /* Call into the BTL so it can free the descriptor. 
At this point, it is @@ -629,7 +624,6 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq mca_pml_ob1_rget_hdr_t* hdr = (mca_pml_ob1_rget_hdr_t*)segments->seg_addr.pval; mca_bml_base_endpoint_t* bml_endpoint = NULL; size_t bytes_remaining, prev_sent, offset; - mca_btl_base_segment_t *r_segments; mca_pml_ob1_rdma_frag_t *frag; mca_bml_base_btl_t *rdma_bml; int rc; @@ -637,6 +631,7 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq prev_sent = offset = 0; bytes_remaining = hdr->hdr_rndv.hdr_msg_length; recvreq->req_recv.req_bytes_packed = hdr->hdr_rndv.hdr_msg_length; + recvreq->req_send_offset = 0; MCA_PML_OB1_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_rndv.hdr_match); @@ -680,8 +675,28 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq ompi_rte_abort(-1, NULL); } - bytes_remaining = mca_pml_ob1_compute_segment_length_remote (btl->btl_seg_size, (void *)(hdr + 1), - hdr->hdr_seg_cnt, recvreq->req_recv.req_base.req_proc); + bytes_remaining = hdr->hdr_rndv.hdr_msg_length; + + /* save the request for put fallback */ + recvreq->remote_req_send = hdr->hdr_rndv.hdr_src_req; + recvreq->rdma_bml = rdma_bml; + + /* try to register the entire buffer */ + if (rdma_bml->btl->btl_register_mem) { + void *data_ptr; + + offset = 0; + + OPAL_THREAD_LOCK(&recvreq->lock); + opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, &offset); + opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &data_ptr); + OPAL_THREAD_UNLOCK(&recvreq->lock); + + mca_bml_base_register_mem (rdma_bml, data_ptr, bytes_remaining, MCA_BTL_REG_FLAG_LOCAL_WRITE | + MCA_BTL_REG_FLAG_REMOTE_WRITE, &recvreq->local_handle); + /* It is not an error if the memory region can not be registered here. The registration will + * be attempted again for each get fragment. */ + } /* The while loop adds a fragmentation mechanism. 
The variable bytes_remaining holds the num * of bytes left to be send. In each iteration we send the max possible bytes supported @@ -690,7 +705,12 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq * the next iteration with the updated size. * Also - In each iteration we update the location in the buffer to be used for writing * the message ,and the location to read from. This is done using the offset variable that - * accumulates the number of bytes that were sent so far. */ + * accumulates the number of bytes that were sent so far. + * + * NTH: This fragmentation may go away if we change the btls to require them to handle + * get fragmentation internally. This is a reasonable solution since some btls do not + * need any fragmentation (sm, vader, self, etc). Remove this loop if this ends up + * being the case. */ while (bytes_remaining > 0) { /* allocate/initialize a fragment */ MCA_PML_OB1_RDMA_FRAG_ALLOC(frag); @@ -700,29 +720,31 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq ompi_rte_abort(-1, NULL); } - assert (btl->btl_seg_size * hdr->hdr_seg_cnt <= sizeof (frag->rdma_segs)); + memcpy (frag->remote_handle, hdr + 1, btl->btl_registration_handle_size); - memcpy (frag->rdma_segs, hdr + 1, btl->btl_seg_size * hdr->hdr_seg_cnt); - - /* update the read location -- NTH: note this will only work if there is exactly one - segment. 
TODO -- make this work with multiple segments */ - r_segments = (mca_btl_base_segment_t *) frag->rdma_segs; - r_segments->seg_addr.lval += offset; + /* update the read location */ + frag->remote_address = hdr->hdr_src_ptr + offset; /* updating the write location */ OPAL_THREAD_LOCK(&recvreq->lock); opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, &offset); + opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &frag->local_address); OPAL_THREAD_UNLOCK(&recvreq->lock); frag->rdma_bml = rdma_bml; frag->rdma_hdr.hdr_rget = *hdr; - frag->retries = 0; - frag->rdma_req = recvreq; - frag->rdma_ep = bml_endpoint; - frag->rdma_state = MCA_PML_OB1_RDMA_GET; - frag->reg = NULL; - frag->rdma_length = bytes_remaining; + frag->retries = 0; + frag->rdma_req = recvreq; + frag->rdma_state = MCA_PML_OB1_RDMA_GET; + frag->local_handle = NULL; + frag->rdma_offset = offset; + + if (bytes_remaining > rdma_bml->btl->btl_get_limit) { + frag->rdma_length = rdma_bml->btl->btl_get_limit; + } else { + frag->rdma_length = bytes_remaining; + } /* NTH: TODO -- handle error conditions gracefully */ rc = mca_pml_ob1_recv_request_get_frag(frag); @@ -921,13 +943,11 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq, while(bytes_remaining > 0 && recvreq->req_pipeline_depth < mca_pml_ob1.recv_pipeline_depth) { - size_t size, seg_size; - mca_pml_ob1_rdma_hdr_t* hdr; - mca_btl_base_descriptor_t* dst; - mca_btl_base_descriptor_t* ctl; - mca_mpool_base_registration_t * reg = NULL; - mca_btl_base_module_t* btl; + mca_pml_ob1_rdma_frag_t *frag = NULL; + mca_btl_base_module_t *btl; int rc, rdma_idx; + void *data_ptr; + size_t size; if(prev_bytes_remaining == bytes_remaining) { if(++num_fail == num_tries) { @@ -948,86 +968,62 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq, do { rdma_idx = recvreq->req_rdma_idx; bml_btl = recvreq->req_rdma[rdma_idx].bml_btl; - reg = 
recvreq->req_rdma[rdma_idx].btl_reg; size = recvreq->req_rdma[rdma_idx].length; if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt) recvreq->req_rdma_idx = 0; } while(!size); btl = bml_btl->btl; - /* makes sure that we don't exceed BTL max rdma size - * if memory is not pinned already */ - if( (NULL == reg) && (btl->btl_rdma_pipeline_frag_size != 0) && - (size > btl->btl_rdma_pipeline_frag_size)) { + /* NTH: This conditional used to check if there was a registration in + * recvreq->req_rdma[rdma_idx].btl_reg. If once existed it was due to + * the btl not needed registration (equivalent to btl->btl_register_mem + * != NULL. This new check is equivalent. Note: I feel this protocol + * needs work to better improve resource usage when running with a + * leave pinned protocol. */ + if (btl->btl_register_mem && (btl->btl_rdma_pipeline_frag_size != 0) && + (size > btl->btl_rdma_pipeline_frag_size)) { size = btl->btl_rdma_pipeline_frag_size; } - /* take lock to protect converter against concurrent access + MCA_PML_OB1_RDMA_FRAG_ALLOC(frag); + if (OPAL_UNLIKELY(NULL == frag)) { + continue; + } + + /* take lock to protect convertor against concurrent access * from unpack */ OPAL_THREAD_LOCK(&recvreq->lock); - opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, - &recvreq->req_rdma_offset ); - - /* prepare a descriptor for RDMA */ - mca_bml_base_prepare_dst(bml_btl, reg, - &recvreq->req_recv.req_base.req_convertor, - MCA_BTL_NO_ORDER, 0, &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | - MCA_BTL_DES_FLAGS_PUT, &dst); + opal_convertor_set_position (&recvreq->req_recv.req_base.req_convertor, + &recvreq->req_rdma_offset); + opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &data_ptr); OPAL_THREAD_UNLOCK(&recvreq->lock); - if(OPAL_UNLIKELY(dst == NULL)) { - continue; + if (btl->btl_register_mem) { + mca_bml_base_register_mem (bml_btl, data_ptr, size, MCA_BTL_REG_FLAG_REMOTE_WRITE, + &frag->local_handle); + if (OPAL_UNLIKELY(NULL == 
frag->local_handle)) { + MCA_PML_OB1_RDMA_FRAG_RETURN(frag); + continue; + } } - dst->des_cbfunc = mca_pml_ob1_put_completion; - dst->des_cbdata = recvreq; + /* fill in the minimum information needed to handle the fin message */ + frag->cbfunc = mca_pml_ob1_put_completion; + frag->rdma_length = size; + frag->rdma_req = recvreq; + frag->rdma_bml = bml_btl; + frag->local_address = data_ptr; + frag->rdma_offset = recvreq->req_rdma_offset; - seg_size = btl->btl_seg_size * dst->des_local_count; - - /* prepare a descriptor for rdma control message */ - mca_bml_base_alloc(bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_rdma_hdr_t) + seg_size, - MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | - MCA_BTL_DES_SEND_ALWAYS_CALLBACK | MCA_BTL_DES_FLAGS_SIGNAL); - - if( OPAL_UNLIKELY(NULL == ctl) ) { - mca_bml_base_free(bml_btl,dst); - continue; - } - ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion; - - /* fill in rdma header */ - hdr = (mca_pml_ob1_rdma_hdr_t*)ctl->des_local->seg_addr.pval; - hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT; - hdr->hdr_common.hdr_flags = - (!recvreq->req_ack_sent) ? 
MCA_PML_OB1_HDR_TYPE_ACK : 0; - hdr->hdr_req = recvreq->remote_req_send; - hdr->hdr_des.pval = dst; - hdr->hdr_recv_req.pval = recvreq; - hdr->hdr_rdma_offset = recvreq->req_rdma_offset; - hdr->hdr_seg_cnt = dst->des_local_count; - - /* copy segments */ - memmove (hdr + 1, dst->des_local, seg_size); - - if(!recvreq->req_ack_sent) - recvreq->req_ack_sent = true; - ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_PUT, recvreq->req_recv.req_base.req_proc); - - PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE, - &(recvreq->req_recv.req_base), size, - PERUSE_RECV); - - /* send rdma request to peer */ - rc = mca_bml_base_send(bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT); - if( OPAL_LIKELY( rc >= 0 ) ) { + rc = mca_pml_ob1_recv_request_put_frag (frag); + if (OPAL_LIKELY(OMPI_SUCCESS == rc)) { /* update request state */ recvreq->req_rdma_offset += size; OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth, 1); recvreq->req_rdma[rdma_idx].length -= size; bytes_remaining -= size; } else { - mca_bml_base_free(bml_btl,ctl); - mca_bml_base_free(bml_btl,dst); + MCA_PML_OB1_RDMA_FRAG_RETURN(frag); } } diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h index e49d5b6013..4340ade3f3 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -10,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
@@ -52,6 +53,8 @@ struct mca_pml_ob1_recv_request_t { bool req_ack_sent; /**< whether ack was sent to the sender */ bool req_match_received; /**< Prevent request to be completed prematurely */ opal_mutex_t lock; + mca_bml_base_btl_t *rdma_bml; + mca_btl_base_registration_handle_t *local_handle; mca_pml_ob1_com_btl_t req_rdma[1]; }; typedef struct mca_pml_ob1_recv_request_t mca_pml_ob1_recv_request_t; @@ -131,8 +134,12 @@ do { \ #define MCA_PML_OB1_RECV_REQUEST_RETURN(recvreq) \ { \ MCA_PML_BASE_RECV_REQUEST_FINI(&(recvreq)->req_recv); \ - OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_recv_requests, \ - (ompi_free_list_item_t*)(recvreq)); \ + if ((recvreq)->local_handle) { \ + mca_bml_base_deregister_mem ((recvreq)->rdma_bml, (recvreq)->local_handle); \ + (recvreq)->local_handle = NULL; \ + } \ + OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_recv_requests, \ + (ompi_free_list_item_t*)(recvreq)); \ } /** @@ -154,9 +161,11 @@ recv_request_pml_complete(mca_pml_ob1_recv_request_t *recvreq) } for(i = 0; i < recvreq->req_rdma_cnt; i++) { - mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[i].btl_reg; - if( NULL != btl_reg && btl_reg->mpool != NULL) { - btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg ); + struct mca_btl_base_registration_handle_t *handle = recvreq->req_rdma[i].btl_reg; + mca_bml_base_btl_t *bml_btl = recvreq->req_rdma[i].bml_btl; + + if (NULL != handle) { + mca_bml_base_deregister_mem (bml_btl, handle); } } recvreq->req_rdma_cnt = 0; @@ -178,6 +187,10 @@ recv_request_pml_complete(mca_pml_ob1_recv_request_t *recvreq) recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR = MPI_ERR_TRUNCATE; } + if (OPAL_UNLIKELY(recvreq->local_handle)) { + mca_bml_base_deregister_mem (recvreq->rdma_bml, recvreq->local_handle); + recvreq->local_handle = NULL; + } MCA_PML_OB1_RECV_REQUEST_MPI_COMPLETE(recvreq); } OPAL_THREAD_UNLOCK(&ompi_request_lock); @@ -387,7 +400,7 @@ static inline void mca_pml_ob1_recv_request_schedule( 
(void)mca_pml_ob1_recv_request_schedule_exclusive(req, start_bml_btl); } -#define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O) \ +#define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O, Sz) \ do { \ mca_pml_ob1_pckt_pending_t *_pckt; \ \ @@ -396,6 +409,7 @@ static inline void mca_pml_ob1_recv_request_schedule( _pckt->hdr.hdr_ack.hdr_src_req.lval = (S); \ _pckt->hdr.hdr_ack.hdr_dst_req.pval = (D); \ _pckt->hdr.hdr_ack.hdr_send_offset = (O); \ + _pckt->hdr.hdr_ack.hdr_send_size = (Sz); \ _pckt->proc = (P); \ _pckt->bml_btl = NULL; \ OPAL_THREAD_LOCK(&mca_pml_ob1.lock); \ @@ -406,11 +420,11 @@ static inline void mca_pml_ob1_recv_request_schedule( int mca_pml_ob1_recv_request_ack_send_btl(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req, - uint64_t hdr_rdma_offset, bool nordma); + uint64_t hdr_rdma_offset, uint64_t size, bool nordma); static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc, uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset, - bool nordma) + uint64_t size, bool nordma) { size_t i; mca_bml_base_btl_t* bml_btl; @@ -420,12 +434,12 @@ static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc, for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) { bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); if(mca_pml_ob1_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req, - hdr_dst_req, hdr_send_offset, nordma) == OMPI_SUCCESS) + hdr_dst_req, hdr_send_offset, size, nordma) == OMPI_SUCCESS) return OMPI_SUCCESS; } MCA_PML_OB1_ADD_ACK_TO_PENDING(proc, hdr_src_req, hdr_dst_req, - hdr_send_offset); + hdr_send_offset, size); return OMPI_ERR_OUT_OF_RESOURCE; } diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index 86d7dc0dce..b9c6dedc1b 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -13,7 +13,7 @@ * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. 
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -137,6 +137,7 @@ static void mca_pml_ob1_send_request_construct(mca_pml_ob1_send_request_t* req) req->req_send.req_base.req_ompi.req_cancel = mca_pml_ob1_send_request_cancel; req->req_rdma_cnt = 0; req->req_throttle_sends = false; + req->rdma_frag = NULL; OBJ_CONSTRUCT(&req->req_send_ranges, opal_list_t); OBJ_CONSTRUCT(&req->req_send_range_lock, opal_mutex_t); } @@ -145,6 +146,10 @@ static void mca_pml_ob1_send_request_destruct(mca_pml_ob1_send_request_t* req) { OBJ_DESTRUCT(&req->req_send_ranges); OBJ_DESTRUCT(&req->req_send_range_lock); + if (req->rdma_frag) { + MCA_PML_OB1_RDMA_FRAG_RETURN(req->rdma_frag); + req->rdma_frag = NULL; + } } OBJ_CLASS_INSTANCE( mca_pml_ob1_send_request_t, @@ -236,10 +241,9 @@ mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl, * happens in one thread, the increase of the req_bytes_delivered does not * have to be atomic. 
*/ - req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size, - (void *) des->des_local, - des->des_local_count, - sizeof(mca_pml_ob1_rendezvous_hdr_t)); + req_bytes_delivered = mca_pml_ob1_compute_segment_length_base ((void *) des->des_segments, + des->des_segment_count, + sizeof(mca_pml_ob1_rendezvous_hdr_t)); mca_pml_ob1_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered ); } @@ -250,27 +254,18 @@ mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl, */ static void -mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* des, - int status ) +mca_pml_ob1_rget_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_length) { - mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)des->des_cbdata; - mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; - size_t req_bytes_delivered; + mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req; + mca_bml_base_btl_t *bml_btl = frag->rdma_bml; /* count bytes of user data actually delivered and check for request completion */ - if (OPAL_LIKELY(OMPI_SUCCESS == status)) { - req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size, - (void *) des->des_local, - des->des_local_count, 0); - OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered); + if (OPAL_LIKELY(0 < rdma_length)) { + OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, (size_t) rdma_length); } - sendreq->src_des = NULL; send_request_pml_complete_check(sendreq); - /* free the descriptor */ - mca_bml_base_free(bml_btl, des); + MCA_PML_OB1_PROGRESS_PENDING(bml_btl); } @@ -314,10 +309,9 @@ mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl, } /* count bytes of user data actually delivered */ - req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size, - (void *) des->des_local, - des->des_local_count, - sizeof(mca_pml_ob1_frag_hdr_t)); + 
req_bytes_delivered = mca_pml_ob1_compute_segment_length_base ((void *) des->des_segments, + des->des_segment_count, + sizeof(mca_pml_ob1_frag_hdr_t)); OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, -1); OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered); @@ -389,7 +383,7 @@ int mca_pml_ob1_send_request_start_buffered( if( OPAL_UNLIKELY(NULL == des) ) { return OMPI_ERR_OUT_OF_RESOURCE; } - segment = des->des_local; + segment = des->des_segments; /* pack the data into the BTL supplied buffer */ iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval + @@ -408,17 +402,14 @@ int mca_pml_ob1_send_request_start_buffered( /* build rendezvous header */ hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; - hdr->hdr_common.hdr_flags = 0; - hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV; - hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; - hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; - hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; - hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; - hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; - hdr->hdr_rndv.hdr_src_req.pval = sendreq; + mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RNDV, 0, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank, + sendreq->req_send.req_base.req_tag, + (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_send.req_bytes_packed, sendreq); - ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, - sendreq->req_send.req_base.req_proc); + ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc); /* update lengths */ segment->seg_len = sizeof(mca_pml_ob1_rendezvous_hdr_t) + max_data; @@ -491,15 +482,13 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq, if(NULL != bml_btl->btl->btl_sendi) { mca_pml_ob1_match_hdr_t match; - 
match.hdr_common.hdr_flags = 0; - match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH; - match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; - match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; - match.hdr_tag = sendreq->req_send.req_base.req_tag; - match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + mca_pml_ob1_match_hdr_prepare (&match, MCA_PML_OB1_HDR_TYPE_MATCH, 0, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank, + sendreq->req_send.req_base.req_tag, + (uint16_t)sendreq->req_send.req_base.req_sequence); - ob1_hdr_hton(&match, MCA_PML_OB1_HDR_TYPE_MATCH, - sendreq->req_send.req_base.req_proc); + ob1_hdr_hton (&match, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc); /* try to send immediately */ rc = mca_bml_base_sendi( bml_btl, &sendreq->req_send.req_base.req_convertor, @@ -532,7 +521,7 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq, return OMPI_ERR_OUT_OF_RESOURCE; } - segment = des->des_local; + segment = des->des_segments; if(size > 0) { /* pack the data into the supplied buffer */ @@ -566,15 +555,13 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq, /* build match header */ hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; - hdr->hdr_common.hdr_flags = 0; - hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH; - hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; - hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; - hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; - hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, MCA_PML_OB1_HDR_TYPE_MATCH, 0, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank, + sendreq->req_send.req_base.req_tag, + (uint16_t)sendreq->req_send.req_base.req_sequence); - 
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, - sendreq->req_send.req_base.req_proc); + ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc); /* update lengths */ segment->seg_len = OMPI_PML_OB1_MATCH_HDR_LEN + max_data; @@ -618,7 +605,6 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq, /* prepare descriptor */ mca_bml_base_prepare_src( bml_btl, - NULL, &sendreq->req_send.req_base.req_convertor, MCA_BTL_NO_ORDER, OMPI_PML_OB1_MATCH_HDR_LEN, @@ -628,19 +614,17 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq, if( OPAL_UNLIKELY(NULL == des) ) { return OMPI_ERR_OUT_OF_RESOURCE; } - segment = des->des_local; + segment = des->des_segments; /* build match header */ hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; - hdr->hdr_common.hdr_flags = 0; - hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH; - hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; - hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; - hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; - hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, MCA_PML_OB1_HDR_TYPE_MATCH, 0, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank, + sendreq->req_send.req_base.req_tag, + (uint16_t)sendreq->req_send.req_base.req_sequence); - ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, - sendreq->req_send.req_base.req_proc); + ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc); /* short message */ des->des_cbfunc = mca_pml_ob1_match_completion_free; @@ -674,80 +658,68 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq, * one RDMA capable BTLs). This way round robin distribution of RDMA * operation is achieved. 
*/ - - mca_btl_base_descriptor_t *des, *src = NULL; + mca_btl_base_registration_handle_t *local_handle; + mca_btl_base_descriptor_t *des; + mca_pml_ob1_rdma_frag_t *frag; mca_pml_ob1_rget_hdr_t *hdr; - size_t seg_size; + size_t reg_size; + void *data_ptr; int rc; - sendreq->src_des = NULL; - bml_btl = sendreq->req_rdma[0].bml_btl; if (!(bml_btl->btl_flags & (MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_CUDA_GET))) { + sendreq->rdma_frag = NULL; /* This BTL does not support get. Use rendezvous to start the RDMA operation using put instead. */ return mca_pml_ob1_send_request_start_rndv (sendreq, bml_btl, 0, MCA_PML_OB1_HDR_FLAGS_CONTIG | MCA_PML_OB1_HDR_FLAGS_PIN); } - MEMCHECKER( - memchecker_call(&opal_memchecker_base_mem_defined, - sendreq->req_send.req_base.req_addr, - sendreq->req_send.req_base.req_count, - sendreq->req_send.req_base.req_datatype); - ); - /* prepare source descriptor/segment(s) */ - /* PML owns this descriptor and will free it in */ - /* mca_pml_ob1_rget_completion */ - mca_bml_base_prepare_src( bml_btl, sendreq->req_rdma[0].btl_reg, - &sendreq->req_send.req_base.req_convertor, - MCA_BTL_NO_ORDER, 0, &size, MCA_BTL_DES_FLAGS_GET | - MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, &src ); - MEMCHECKER( - memchecker_call(&opal_memchecker_base_mem_noaccess, - sendreq->req_send.req_base.req_addr, - sendreq->req_send.req_base.req_count, - sendreq->req_send.req_base.req_datatype); - ); - if( OPAL_UNLIKELY(NULL == src) ) { - return OMPI_ERR_OUT_OF_RESOURCE; + /* at this time ob1 does not support non-contiguous gets. 
the convertor represents a + * contiguous block of memory */ + opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr); + + local_handle = sendreq->req_rdma[0].btl_reg; + + /* allocate an rdma fragment to keep track of the request size for use in the fin message */ + MCA_PML_OB1_RDMA_FRAG_ALLOC(frag); + if (OPAL_UNLIKELY(NULL == frag)) { + return OPAL_ERR_OUT_OF_RESOURCE; } - - src->des_cbfunc = mca_pml_ob1_rget_completion; - src->des_cbdata = sendreq; - sendreq->src_des = src; + /* fill in necessary fragment data */ + frag->rdma_req = sendreq; + frag->rdma_bml = bml_btl; + frag->rdma_length = size; + frag->cbfunc = mca_pml_ob1_rget_completion; + /* do not store the local handle in the fragment. it will be released by mca_pml_ob1_free_rdma_resources */ - seg_size = bml_btl->btl->btl_seg_size * src->des_local_count; + /* save the fragment for get->put fallback */ + sendreq->rdma_frag = frag; + + reg_size = bml_btl->btl->btl_registration_handle_size; /* allocate space for get hdr + segment list */ - mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof (*hdr) + seg_size, + mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof (*hdr) + reg_size, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_FLAGS_SIGNAL); if( OPAL_UNLIKELY(NULL == des) ) { /* NTH: no need to reset the converter here. 
it will be reset before it is retried */ - mca_bml_base_free(bml_btl, src); return OMPI_ERR_OUT_OF_RESOURCE; } /* build match header */ - hdr = (mca_pml_ob1_rget_hdr_t *) des->des_local->seg_addr.pval; - - hdr->hdr_rndv.hdr_match.hdr_common.hdr_flags = MCA_PML_OB1_HDR_FLAGS_CONTIG|MCA_PML_OB1_HDR_FLAGS_PIN; - hdr->hdr_rndv.hdr_match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RGET; - hdr->hdr_rndv.hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; - hdr->hdr_rndv.hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; - hdr->hdr_rndv.hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; - hdr->hdr_rndv.hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; - hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; - hdr->hdr_rndv.hdr_src_req.pval = sendreq; - hdr->hdr_des.pval = src; - hdr->hdr_seg_cnt = src->des_local_count; + hdr = (mca_pml_ob1_rget_hdr_t *) des->des_segments->seg_addr.pval; + /* TODO -- Add support for multiple segments for get */ + mca_pml_ob1_rget_hdr_prepare (hdr, MCA_PML_OB1_HDR_FLAGS_CONTIG | MCA_PML_OB1_HDR_FLAGS_PIN, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank, + sendreq->req_send.req_base.req_tag, + (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_send.req_bytes_packed, sendreq, + frag, data_ptr, local_handle, reg_size); ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RGET, sendreq->req_send.req_base.req_proc); - /* copy segment data */ - memcpy (hdr + 1, src->des_local, seg_size); - des->des_cbfunc = mca_pml_ob1_send_ctl_completion; des->des_cbdata = sendreq; @@ -765,12 +737,6 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq, rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RGET); if (OPAL_UNLIKELY(rc < 0)) { mca_bml_base_free(bml_btl, des); - - if (sendreq->src_des) { - mca_bml_base_free (bml_btl, sendreq->src_des); - sendreq->src_des = NULL; - } - return rc; } @@ -808,7 
+774,6 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq, sendreq->req_send.req_base.req_datatype); ); mca_bml_base_prepare_src( bml_btl, - NULL, &sendreq->req_send.req_base.req_convertor, MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_rendezvous_hdr_t), @@ -827,21 +792,19 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq, if( OPAL_UNLIKELY(NULL == des) ) { return OMPI_ERR_OUT_OF_RESOURCE; } - segment = des->des_local; + segment = des->des_segments; /* build hdr */ hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; - hdr->hdr_common.hdr_flags = flags | MCA_PML_OB1_HDR_FLAGS_SIGNAL; - hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV; - hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; - hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; - hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; - hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; - hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; - hdr->hdr_rndv.hdr_src_req.pval = sendreq; + mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RNDV, flags | + MCA_PML_OB1_HDR_FLAGS_SIGNAL, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank, + sendreq->req_send.req_base.req_tag, + (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_send.req_bytes_packed, sendreq); - ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, - sendreq->req_send.req_base.req_proc); + ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc); /* first fragment of a long message */ des->des_cbdata = sendreq; @@ -1022,13 +985,10 @@ cannot_pack: sendreq->req_send.req_base.req_count, sendreq->req_send.req_base.req_datatype); ); - mca_bml_base_prepare_src(bml_btl, NULL, - &sendreq->req_send.req_base.req_convertor, - MCA_BTL_NO_ORDER, - sizeof(mca_pml_ob1_frag_hdr_t), + mca_bml_base_prepare_src(bml_btl, 
&sendreq->req_send.req_base.req_convertor, + MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_frag_hdr_t), &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK | - MCA_BTL_DES_FLAGS_SIGNAL, - &des); + MCA_BTL_DES_FLAGS_SIGNAL, &des); MEMCHECKER( memchecker_call(&opal_memchecker_base_mem_noaccess, sendreq->req_send.req_base.req_addr, @@ -1051,12 +1011,9 @@ cannot_pack: des->des_cbdata = sendreq; /* setup header */ - hdr = (mca_pml_ob1_frag_hdr_t*)des->des_local->seg_addr.pval; - hdr->hdr_common.hdr_flags = 0; - hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FRAG; - hdr->hdr_frag_offset = range->range_send_offset; - hdr->hdr_src_req.pval = sendreq; - hdr->hdr_dst_req = sendreq->req_recv; + hdr = (mca_pml_ob1_frag_hdr_t*)des->des_segments->seg_addr.pval; + mca_pml_ob1_frag_hdr_prepare (hdr, 0, range->range_send_offset, sendreq, + sendreq->req_recv.lval); ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FRAG, sendreq->req_send.req_base.req_proc); @@ -1113,38 +1070,66 @@ cannot_pack: } +/** + * A put fragment could not be started. Queue the fragment to be retried later or + * fall back on send/recv. 
+ */ +static void mca_pml_ob1_send_request_put_frag_failed (mca_pml_ob1_rdma_frag_t *frag, int rc) +{ + mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req; + mca_bml_base_btl_t *bml_btl = frag->rdma_bml; + + if (++frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) { + /* queue the frag for later if there was a resource error */ + OPAL_THREAD_LOCK(&mca_pml_ob1.lock); + opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); + } else { + /* tell receiver to deregister memory */ + mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl, + frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER, + OPAL_ERR_TEMP_OUT_OF_RESOURCE); + + /* send fragment by copy in/out */ + mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, + frag->rdma_length); + /* if a pointer to a receive request is not set it means that + * ACK was not yet received. 
Don't schedule sends before ACK */ + if (NULL != sendreq->req_recv.pval) + mca_pml_ob1_send_request_schedule (sendreq); + } +} + /** * An RDMA put operation has completed: * (1) Update request status and if required set completed - * (2) Send FIN control message to the destination + * (2) Send FIN control message to the destination */ -static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* des, - int status ) +static void mca_pml_ob1_put_completion (mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep, + void *local_address, mca_btl_base_registration_handle_t *local_handle, + void *context, void *cbdata, int status) { - mca_pml_ob1_rdma_frag_t* frag = (mca_pml_ob1_rdma_frag_t*)des->des_cbdata; - mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)frag->rdma_req; - mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context; + mca_pml_ob1_rdma_frag_t *frag = (mca_pml_ob1_rdma_frag_t *) cbdata; + mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req; + mca_bml_base_btl_t *bml_btl = (mca_bml_base_btl_t *) context; /* check completion status */ - if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { - /* TSW - FIX */ - OMPI_ERROR_LOG(status); - ompi_rte_abort(-1, NULL); + if( OPAL_UNLIKELY(OMPI_SUCCESS == status) ) { + /* TODO -- readd ordering */ + mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl, + frag->rdma_hdr.hdr_rdma.hdr_frag, frag->rdma_length, + 0, 0); + + /* check for request completion */ + OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length); + + send_request_pml_complete_check(sendreq); + } else { + /* try to fall back on send/recv */ + mca_pml_ob1_send_request_put_frag_failed (frag, status); } - mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc, - bml_btl, - frag->rdma_hdr.hdr_rdma.hdr_des, - des->order, 0); - - /* check for request completion */ - 
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length); - - send_request_pml_complete_check(sendreq); - MCA_PML_OB1_RDMA_FRAG_RETURN(frag); MCA_PML_OB1_PROGRESS_PENDING(bml_btl); @@ -1152,81 +1137,45 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl, int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag ) { - mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)frag->rdma_req; - mca_mpool_base_registration_t *reg = NULL; + mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req; + mca_btl_base_registration_handle_t *local_handle = NULL; mca_bml_base_btl_t *bml_btl = frag->rdma_bml; - mca_btl_base_descriptor_t *des; - size_t save_size = frag->rdma_length; int rc; - if (OPAL_LIKELY(NULL == sendreq->src_des)) { - /* setup descriptor */ - mca_bml_base_prepare_src( bml_btl, - reg, - &frag->convertor, - MCA_BTL_NO_ORDER, - 0, - &frag->rdma_length, - MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | - MCA_BTL_DES_FLAGS_PUT, - &des ); - - if( OPAL_UNLIKELY(NULL == des) ) { - if(frag->retries < mca_pml_ob1.rdma_retries_limit) { - size_t offset = (size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset; - frag->rdma_length = save_size; - opal_convertor_set_position(&frag->convertor, &offset); - OPAL_THREAD_LOCK(&mca_pml_ob1.lock); - opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag); - OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); - } else { - mca_pml_ob1_send_request_t *sendreq = - (mca_pml_ob1_send_request_t*)frag->rdma_req; + if (bml_btl->btl->btl_register_mem && NULL == frag->local_handle) { + /* Check if the segment is already registered */ + for (size_t i = 0 ; i < sendreq->req_rdma_cnt ; ++i) { + if (sendreq->req_rdma[i].bml_btl == frag->rdma_bml) { + /* do not copy the handle to the fragment to avoid deregistring it twice */ + local_handle = sendreq->req_rdma[i].btl_reg; + break; + } + } - /* tell receiver to unregister memory */ - 
mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc, - bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des, - MCA_BTL_NO_ORDER, 1); + if (NULL == frag->local_handle) { + /* Not already registered. Register the region with the BTL. */ + mca_bml_base_register_mem (bml_btl, frag->local_address, frag->rdma_length, 0, + &frag->local_handle); - /* send fragment by copy in/out */ - mca_pml_ob1_send_request_copy_in_out(sendreq, - frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, frag->rdma_length); - /* if a pointer to a receive request is not set it means that - * ACK was not yet received. Don't schedule sends before ACK */ - if(NULL != sendreq->req_recv.pval) - mca_pml_ob1_send_request_schedule(sendreq); + if (OPAL_UNLIKELY(NULL == frag->local_handle)) { + mca_pml_ob1_send_request_put_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE); + + return OMPI_ERR_OUT_OF_RESOURCE; } - return OMPI_ERR_OUT_OF_RESOURCE; + local_handle = frag->local_handle; } - } else { - /* already have a source descriptor */ - des = sendreq->src_des; - sendreq->src_des = NULL; } - des->des_remote = (mca_btl_base_segment_t *) frag->rdma_segs; - des->des_remote_count = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt; - des->des_cbfunc = mca_pml_ob1_put_completion; - des->des_cbdata = frag; - PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE, &(((mca_pml_ob1_send_request_t*)frag->rdma_req)->req_send.req_base), save_size, PERUSE_SEND ); - rc = mca_bml_base_put(bml_btl, des); + rc = mca_bml_base_put (bml_btl, frag->local_address, frag->remote_address, local_handle, + (mca_btl_base_registration_handle_t *) frag->remote_handle, frag->rdma_length, + 0, MCA_BTL_NO_ORDER, mca_pml_ob1_put_completion, frag); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - mca_bml_base_free(bml_btl, des); - frag->rdma_length = save_size; - if(OMPI_ERR_OUT_OF_RESOURCE == rc) { - OPAL_THREAD_LOCK(&mca_pml_ob1.lock); - opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag); - OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); - return 
OMPI_ERR_OUT_OF_RESOURCE; - } else { - /* TSW - FIX */ - OMPI_ERROR_LOG(rc); - ompi_rte_abort(-1, NULL); - } + mca_pml_ob1_send_request_put_frag_failed (frag, rc); + return rc; } return OMPI_SUCCESS; @@ -1240,12 +1189,11 @@ int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag ) */ void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq, - mca_btl_base_module_t* btl, + mca_btl_base_module_t* btl, mca_pml_ob1_rdma_hdr_t* hdr ) { mca_bml_base_endpoint_t *bml_endpoint = sendreq->req_endpoint; mca_pml_ob1_rdma_frag_t* frag; - size_t i, size = 0; if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_TYPE_ACK) { OPAL_THREAD_ADD32(&sendreq->req_state, -1); @@ -1253,61 +1201,36 @@ void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq, sendreq->req_recv.pval = hdr->hdr_recv_req.pval; - MCA_PML_OB1_RDMA_FRAG_ALLOC(frag); + if (NULL == sendreq->rdma_frag) { + MCA_PML_OB1_RDMA_FRAG_ALLOC(frag); - if( OPAL_UNLIKELY(NULL == frag) ) { - /* TSW - FIX */ - OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE); - ompi_rte_abort(-1, NULL); - } - - assert (btl->btl_seg_size * hdr->hdr_seg_cnt <= sizeof (frag->rdma_segs)); - - /* setup fragment */ - memcpy (frag->rdma_segs, hdr + 1, btl->btl_seg_size * hdr->hdr_seg_cnt); - - for( i = 0; i < hdr->hdr_seg_cnt; i++ ) { - mca_btl_base_segment_t *seg = (mca_btl_base_segment_t *) ((uintptr_t)(frag->rdma_segs) + i * btl->btl_seg_size); - -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if ((sendreq->req_send.req_base.req_proc->super.proc_arch & OPAL_ARCH_ISBIGENDIAN) != - (ompi_proc_local()->super.proc_arch & OPAL_ARCH_ISBIGENDIAN)) { - size += opal_swap_bytes4(seg->seg_len); - } else -#endif - { - size += seg->seg_len; + if( OPAL_UNLIKELY(NULL == frag) ) { + /* TSW - FIX */ + OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE); + ompi_rte_abort(-1, NULL); } + } else { + /* rget fallback on put */ + frag = sendreq->rdma_frag; + sendreq->rdma_frag = NULL; + sendreq->req_state = 0; } + /* copy registration data */ + memcpy 
(frag->remote_handle, hdr + 1, btl->btl_registration_handle_size); + frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl); frag->rdma_hdr.hdr_rdma = *hdr; frag->rdma_req = sendreq; - frag->rdma_ep = bml_endpoint; - frag->rdma_length = size; + frag->rdma_length = hdr->hdr_dst_size; frag->rdma_state = MCA_PML_OB1_RDMA_PUT; - frag->reg = NULL; + frag->remote_address = hdr->hdr_dst_ptr; frag->retries = 0; - if (OPAL_UNLIKELY(NULL != sendreq->src_des)) { - /* get fallback path */ - sendreq->req_state = 0; - } - - /* lookup the corresponding registration */ - for(i=0; ireq_rdma_cnt; i++) { - if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) { - frag->reg = sendreq->req_rdma[i].btl_reg; - break; - } - } - - /* RDMA writes may proceed in parallel to send and to each other, so - * create clone of the convertor for each RDMA fragment - */ - size = hdr->hdr_rdma_offset; - opal_convertor_clone_with_position(&sendreq->req_send.req_base.req_convertor, - &frag->convertor, 0, &size); + /* Get the address of the current offset. Note: at this time ob1 CAN NOT handle + * non-contiguous RDMA. If that changes this code will be wrong. */ + opal_convertor_get_offset_pointer (&sendreq->req_send.req_base.req_convertor, + hdr->hdr_rdma_offset, &frag->local_address); mca_pml_ob1_send_request_put_frag(frag); } diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h index ff9a85381f..d78845538f 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2011-2012 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ * @@ -54,7 +54,7 @@ struct mca_pml_ob1_send_request_t { mca_pml_ob1_send_pending_t req_pending; opal_mutex_t req_send_range_lock; opal_list_t req_send_ranges; - mca_btl_base_descriptor_t *src_des; + mca_pml_ob1_rdma_frag_t *rdma_frag; mca_pml_ob1_com_btl_t req_rdma[1]; }; typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t; @@ -124,10 +124,9 @@ get_request_from_send_pending(mca_pml_ob1_send_pending_t *type) ompi_free_list_item_t* item; \ \ if( OPAL_LIKELY(NULL != proc) ) { \ - OMPI_FREE_LIST_WAIT_MT(&mca_pml_base_send_requests, item); \ + OMPI_FREE_LIST_WAIT_MT(&mca_pml_base_send_requests, item); \ sendreq = (mca_pml_ob1_send_request_t*)item; \ sendreq->req_send.req_base.req_proc = proc; \ - sendreq->src_des = NULL; \ } \ } @@ -163,15 +162,18 @@ get_request_from_send_pending(mca_pml_ob1_send_pending_t *type) assert( 0 == _position ); \ } -static inline void mca_pml_ob1_free_rdma_resources(mca_pml_ob1_send_request_t* sendreq) +static inline void mca_pml_ob1_free_rdma_resources (mca_pml_ob1_send_request_t* sendreq) { size_t r; /* return mpool resources */ for(r = 0; r < sendreq->req_rdma_cnt; r++) { - mca_mpool_base_registration_t* reg = sendreq->req_rdma[r].btl_reg; - if( NULL != reg && reg->mpool != NULL ) { - reg->mpool->mpool_deregister(reg->mpool, reg); + struct mca_btl_base_registration_handle_t *handle = sendreq->req_rdma[r].btl_reg; + mca_bml_base_btl_t *bml_btl = sendreq->req_rdma[r].bml_btl; + + if (NULL != handle) { + mca_bml_base_deregister_mem (bml_btl, handle); + sendreq->req_rdma[r].btl_reg = NULL; } } sendreq->req_rdma_cnt = 0; @@ -218,10 +220,14 @@ do { #define MCA_PML_OB1_SEND_REQUEST_RETURN(sendreq) \ do { \ - /* Let the base handle the reference counts */ \ - MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send)); \ - OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_send_requests, \ - (ompi_free_list_item_t*)sendreq); \ + /* Let the base handle the reference counts */ \ + 
MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send)); \ + if (sendreq->rdma_frag) { \ + MCA_PML_OB1_RDMA_FRAG_RETURN (sendreq->rdma_frag); \ + sendreq->rdma_frag = NULL; \ + } \ + OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_send_requests, \ + (ompi_free_list_item_t*)sendreq); \ } while(0) diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index 878fe724c0..0629b69aa2 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -1,4 +1,4 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology @@ -219,6 +219,14 @@ static inline void opal_convertor_get_current_pointer( const opal_convertor_t* p *position = (void*)base; } +static inline void opal_convertor_get_offset_pointer( const opal_convertor_t* pConv, + size_t offset, void** position ) +{ + unsigned char* base = pConv->pBaseBuf + offset + pConv->pDesc->true_lb; + *position = (void*)base; +} + + /* * */ diff --git a/opal/mca/btl/base/btl_base_frame.c b/opal/mca/btl/base/btl_base_frame.c index 19d0508ca1..9e5262f2e7 100644 --- a/opal/mca/btl/base/btl_base_frame.c +++ b/opal/mca/btl/base/btl_base_frame.c @@ -36,10 +36,8 @@ mca_btl_active_message_callback_t mca_btl_base_active_message_trigger[MCA_BTL_TA static void mca_btl_base_descriptor_constructor(mca_btl_base_descriptor_t* des) { - des->des_local = NULL; - des->des_local_count = 0; - des->des_remote = NULL; - des->des_remote_count = 0; + des->des_segments = NULL; + des->des_segment_count = 0; des->des_cbfunc = NULL; des->des_cbdata = NULL; des->des_flags = 0; diff --git a/opal/mca/btl/base/btl_base_mca.c b/opal/mca/btl/base/btl_base_mca.c index 81dc876620..5c14a32aaf 100644 --- a/opal/mca/btl/base/btl_base_mca.c +++ b/opal/mca/btl/base/btl_base_mca.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 
2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -45,13 +46,15 @@ int mca_btl_base_param_register(mca_base_component_t *version, MCA_BASE_VAR_SCOPE_READONLY, &module->btl_exclusivity); - asprintf(&msg, "BTL bit flags (general flags: SEND=%d, PUT=%d, GET=%d, SEND_INPLACE=%d, RDMA_MATCHED=%d, HETEROGENEOUS_RDMA=%d; flags only used by the \"dr\" PML (ignored by others): ACK=%d, CHECKSUM=%d, RDMA_COMPLETION=%d; flags only used by the \"bfo\" PML (ignored by others): FAILOVER_SUPPORT=%d)", + asprintf(&msg, "BTL bit flags (general flags: SEND=%d, PUT=%d, GET=%d, SEND_INPLACE=%d, HETEROGENEOUS_RDMA=%d, " + "ATOMIC_OPS=%d; flags only used by the \"dr\" PML (ignored by others): ACK=%d, CHECKSUM=%d, " + "RDMA_COMPLETION=%d; flags only used by the \"bfo\" PML (ignored by others): FAILOVER_SUPPORT=%d)", MCA_BTL_FLAGS_SEND, MCA_BTL_FLAGS_PUT, MCA_BTL_FLAGS_GET, MCA_BTL_FLAGS_SEND_INPLACE, - MCA_BTL_FLAGS_RDMA_MATCHED, MCA_BTL_FLAGS_HETEROGENEOUS_RDMA, + MCA_BTL_FLAGS_ATOMIC_OPS, MCA_BTL_FLAGS_NEED_ACK, MCA_BTL_FLAGS_NEED_CSUM, MCA_BTL_FLAGS_RDMA_COMPLETION, @@ -63,6 +66,14 @@ int mca_btl_base_param_register(mca_base_component_t *version, &module->btl_flags); free(msg); + asprintf (&msg, "BTL atomic bit flags (general flags: ADD=%d, AND=%d, OR=%d, XOR=%d", + MCA_BTL_ATOMIC_SUPPORTS_ADD, MCA_BTL_ATOMIC_SUPPORTS_AND, MCA_BTL_ATOMIC_SUPPORTS_OR, + MCA_BTL_ATOMIC_SUPPORTS_XOR); + (void) mca_base_component_var_register(version, "atomic_flags", msg, MCA_BASE_VAR_TYPE_UNSIGNED_INT, + NULL, 0, MCA_BASE_VAR_FLAG_DEFAULT_ONLY, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_CONSTANT, &module->btl_atomic_flags); + free(msg); + (void) mca_base_component_var_register(version, "rndv_eager_limit", "Size (in bytes, including header) of \"phase 1\" fragment sent for all large messages (must be >= 0 and <= eager_limit)", MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_4, @@ -74,6 +85,39 @@ int mca_btl_base_param_register(mca_base_component_t *version, 
OPAL_INFO_LVL_4, MCA_BASE_VAR_SCOPE_READONLY, &module->btl_eager_limit); + + if ((module->btl_flags & MCA_BTL_FLAGS_GET) && module->btl_get) { + if (0 == module->btl_get_limit) { + module->btl_get_limit = SIZE_MAX; + } + + (void) mca_base_component_var_register(version, "get_limit", "Maximum size (in bytes) for btl get", + MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, &module->btl_get_limit); + + /* Allow the user to set the alignment. The BTL should double-check the alignment in its open + * function. */ + (void) mca_base_component_var_register(version, "get_alignment", "Alignment required for btl get", + MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_CONSTANT, &module->btl_get_alignment); + } + + if ((module->btl_flags & MCA_BTL_FLAGS_PUT) && module->btl_put) { + if (0 == module->btl_put_limit) { + module->btl_put_limit = SIZE_MAX; + } + (void) mca_base_component_var_register(version, "put_limit", "Maximum size (in bytes) for btl put", + MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, &module->btl_put_limit); + + /* Allow the user to set the alignment. The BTL should double-check the alignment in its open + * function. 
*/ + (void) mca_base_component_var_register(version, "put_alignment", "Alignment required for btl put", + MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_CONSTANT, &module->btl_put_alignment); + } + + #if OPAL_CUDA_GDR_SUPPORT /* If no CUDA RDMA support, zero them out */ if (!(MCA_BTL_FLAGS_CUDA_GET & module->btl_flags)) { @@ -149,5 +193,17 @@ int mca_btl_base_param_verify(mca_btl_base_module_t *module) module->btl_flags &= ~MCA_BTL_FLAGS_GET; } + if (0 == module->btl_atomic_flags) { + module->btl_flags &= ~MCA_BTL_FLAGS_ATOMIC_OPS; + } + + if (0 == module->btl_get_limit) { + module->btl_get_limit = SIZE_MAX; + } + + if (0 == module->btl_put_limit) { + module->btl_put_limit = SIZE_MAX; + } + return OPAL_SUCCESS; } diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index 5f65be994e..bdd267c6f5 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h @@ -6,18 +6,19 @@ * Copyright (c) 2004-2008 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights - * reserved. + * Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights + * reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. 
* $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ /** @@ -75,8 +76,8 @@ * TCP 0 Selected based on network reachability * TCP 0 Selected based on network reachability * - * When mca_btl_base_add_proc_fn_t() is called on a BTL module, the BTL - * will populate an OUT variable with mca_btl_base_endpoint_t pointers. + * When mca_btl_base_add_proc_fn_t() is called on a BTL module, the BTL + * will populate an OUT variable with mca_btl_base_endpoint_t pointers. * Each pointer is treated as an opaque handle by the upper layer and is * returned to the BTL on subsequent data transfer calls to the * corresponding destination process. The actual contents of the @@ -132,8 +133,25 @@ struct mca_btl_base_module_t; struct mca_btl_base_endpoint_t; struct mca_btl_base_descriptor_t; struct mca_mpool_base_resources_t; -struct opal_proc_t; +struct opal_proc_t; +/** + * Opaque registration handle for executing RDMA and atomic + * operations on a memory region. + * + * This data inside this handle is appropriate for passing + * to remote peers to execute RDMA and atomic operations. The + * size needed to send the registration handle can be + * obtained from the btl via the btl_registration_handle_size + * member. If this size is 0 then no registration data is + * needed to execute RDMA or atomic operations. + */ +struct mca_btl_base_registration_handle_t; +typedef struct mca_btl_base_registration_handle_t mca_btl_base_registration_handle_t; + + +/* Wildcard endpoint for use in the register_mem function */ +#define MCA_BTL_ENDPOINT_ANY (struct mca_btl_base_endpoint_t *) -1 /* send/recv operations require tag matching */ typedef uint8_t mca_btl_base_tag_t; @@ -173,6 +191,9 @@ typedef uint8_t mca_btl_base_tag_t; #define MCA_BTL_FLAGS_SEND 0x0001 #define MCA_BTL_FLAGS_PUT 0x0002 #define MCA_BTL_FLAGS_GET 0x0004 +/* btls that set the MCA_BTL_FLAGS_RDMA will always get added to the BML + * rdma_btls list. 
This allows the updated one-sided component to + * use btls that are not otherwise used for send/recv. */ #define MCA_BTL_FLAGS_RDMA (MCA_BTL_FLAGS_GET|MCA_BTL_FLAGS_PUT) /* btl can send directly from user buffer w/out registration */ @@ -182,8 +203,7 @@ typedef uint8_t mca_btl_base_tag_t; #define MCA_BTL_FLAGS_NEED_ACK 0x0010 #define MCA_BTL_FLAGS_NEED_CSUM 0x0020 -/** RDMA put/get calls must have a matching prepare_{src,dst} call - on the target with the same base (and possibly bound). */ +/** deprecated (BTL 3.0) */ #define MCA_BTL_FLAGS_RDMA_MATCHED 0x0040 /* btl needs local rdma completion */ @@ -209,6 +229,12 @@ typedef uint8_t mca_btl_base_tag_t; */ #define MCA_BTL_FLAGS_SIGNALED 0x4000 + +/** The BTL supports network atomic operations */ +#define MCA_BTL_FLAGS_ATOMIC_OPS 0x08000 +/** The BTL supports fetching network atomic operations */ +#define MCA_BTL_FLAGS_ATOMIC_FOPS 0x10000 + /* Default exclusivity levels */ #define MCA_BTL_EXCLUSIVITY_HIGH (64*1024) /* internal loopback */ #define MCA_BTL_EXCLUSIVITY_DEFAULT 1024 /* GM/IB/etc. */ @@ -219,11 +245,67 @@ typedef uint8_t mca_btl_base_tag_t; #define MCA_BTL_ERROR_FLAGS_NONFATAL 0x2 #define MCA_BTL_ERROR_FLAGS_ADD_CUDA_IPC 0x4 +/** registration flags */ +enum { + /** Allow local write on the registered region. If a region is registered + * with this flag the registration can be used as the local handle for a + * btl_get operation. */ + MCA_BTL_REG_FLAG_LOCAL_WRITE = 0x00000001, + /** Allow remote read on the registered region. If a region is registered + * with this flag the registration can be used as the remote handle for a + * btl_get operation. */ + MCA_BTL_REG_FLAG_REMOTE_READ = 0x00000002, + /** Allow remote write on the registered region. If a region is registered + * with this flag the registration can be used as the remote handle for a + * btl_put operation. */ + MCA_BTL_REG_FLAG_REMOTE_WRITE = 0x00000004, + /** Allow remote atomic operations on the registered region. 
If a region is + * registered with this flag the registration can be used as the remote + * handle for a btl_atomic_op or btl_atomic_fop operation. */ + MCA_BTL_REG_FLAG_REMOTE_ATOMIC = 0x00000008, + /** Allow any btl operation on the registered region. If a region is registered + * with this flag the registration can be used as the local or remote handle for + * any btl operation. */ + MCA_BTL_REG_FLAG_ACCESS_ANY = 0x0000000f, +#if OPAL_CUDA_GDR_SUPPORT + /** Region is in GPU memory */ + MCA_BTL_REG_FLAG_CUDA_GPU_MEM = 0x00010000, +#endif +}; + +/** supported atomic operations */ +enum { + /** The btl supports atomic add */ + MCA_BTL_ATOMIC_SUPPORTS_ADD = 0x00000001, + /** The btl supports atomic bitwise and */ + MCA_BTL_ATOMIC_SUPPORTS_AND = 0x00000200, + /** The btl supports atomic bitwise or */ + MCA_BTL_ATOMIC_SUPPORTS_OR = 0x00000400, + /** The btl supports atomic bitwise exclusive or */ + MCA_BTL_ATOMIC_SUPPORTS_XOR = 0x00000800, + /** The btl supports atomic compare-and-swap */ + MCA_BTL_ATOMIC_SUPPORTS_CSWAP = 0x10000000, + /** The btl guarantees global atomicity (can mix btl atomics with cpu atomics) */ + MCA_BTL_ATOMIC_SUPPORTS_GLOB = 0x20000000, +}; + +enum mca_btl_base_atomic_op_t { + /** Atomic add: (*remote_address) = (*remote_address) + operand */ + MCA_BTL_ATOMIC_ADD = 0x0001, + /** Atomic and: (*remote_address) = (*remote_address) & operand */ + MCA_BTL_ATOMIC_AND = 0x0011, + /** Atomic or: (*remote_address) = (*remote_address) | operand */ + MCA_BTL_ATOMIC_OR = 0x0012, + /** Atomic xor: (*remote_address) = (*remote_address) ^ operand */ + MCA_BTL_ATOMIC_XOR = 0x0014, +}; +typedef enum mca_btl_base_atomic_op_t mca_btl_base_atomic_op_t; + /** * Asynchronous callback function on completion of an operation. - * Completion Semantics: The descriptor can be reused or returned to the + * Completion Semantics: The descriptor can be reused or returned to the * BTL via mca_btl_base_module_free_fn_t. 
The operation has been queued to - * the network device or will otherwise make asynchronous progress without + * the network device or will otherwise make asynchronous progress without * subsequent calls to btl_progress. * * @param[IN] module the BTL module @@ -237,8 +319,34 @@ typedef void (*mca_btl_base_completion_fn_t)( struct mca_btl_base_descriptor_t* descriptor, int status); + /** - * Describes a region/segment of memory that is addressable + * Asynchronous callback function on completion of an rdma or atomic operation. + * Completion Semantics: The rdma or atomic memory operation has completed + * remotely (i.e., is remotely visible) and the caller is free to deregister + * the local_handle or modify the memory in local_address. + * + * @param[IN] module the BTL module + * @param[IN] endpoint the BTL endpoint + * @param[IN] local_address local address for the operation (if any) + * @param[IN] local_handle local handle associated with the local_address + * @param[IN] context callback context supplied to the rdma/atomic operation + * @param[IN] cbdata callback data supplied to the rdma/atomic operation + * @param[IN] status status of the operation + * + */ +typedef void (*mca_btl_base_rdma_completion_fn_t)( + struct mca_btl_base_module_t* module, + struct mca_btl_base_endpoint_t* endpoint, + void *local_address, + struct mca_btl_base_registration_handle_t *local_handle, + void *context, + void *cbdata, + int status); + + +/** + * Describes a region/segment of memory that is addressable * by an BTL.
* * Note: In many cases the alloc and prepare methods of BTLs @@ -256,38 +364,37 @@ typedef void (*mca_btl_base_completion_fn_t)( struct mca_btl_base_segment_t { /** Address of the memory */ - opal_ptr_t seg_addr; + opal_ptr_t seg_addr; /** Length in bytes */ uint64_t seg_len; }; typedef struct mca_btl_base_segment_t mca_btl_base_segment_t; + /** * A descriptor that holds the parameters to a send/put/get * operation along w/ a callback routine that is called on * completion of the request. * Note: receive callbacks will store the incomming data segments in - * des_local + * des_segments */ struct mca_btl_base_descriptor_t { - ompi_free_list_item_t super; - mca_btl_base_segment_t *des_local; /**< local segments */ - size_t des_local_count; /**< number of local segments */ - mca_btl_base_segment_t *des_remote; /**< remote segments */ - size_t des_remote_count; /**< number of destination segments */ - mca_btl_base_completion_fn_t des_cbfunc; /**< local callback function */ + ompi_free_list_item_t super; + mca_btl_base_segment_t *des_segments; /**< local segments */ + size_t des_segment_count; /**< number of local segments */ + mca_btl_base_completion_fn_t des_cbfunc; /**< local callback function */ void* des_cbdata; /**< opaque callback data */ void* des_context; /**< more opaque callback data */ uint32_t des_flags; /**< hints to BTL */ - /** order value, this is only - valid in the local completion callback - and may be used in subsequent calls to - btl_alloc, btl_prepare_src/dst to request - a descriptor that will be ordered w.r.t. + /** order value, this is only + valid in the local completion callback + and may be used in subsequent calls to + btl_alloc, btl_prepare_src to request + a descriptor that will be ordered w.r.t. 
this descriptor */ - uint8_t order; + uint8_t order; }; typedef struct mca_btl_base_descriptor_t mca_btl_base_descriptor_t; @@ -329,13 +436,18 @@ OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_base_descriptor_t); */ #define MCA_BTL_SEG_MAX_SIZE 256 -/* - * BTL base header, stores the tag at a minimum - */ -struct mca_btl_base_header_t{ - mca_btl_base_tag_t tag; -}; -typedef struct mca_btl_base_header_t mca_btl_base_header_t; +/** + * Maximum size of a BTL registration handle in bytes + */ +#define MCA_BTL_REG_HANDLE_MAX_SIZE 256 + +/* + * BTL base header, stores the tag at a minimum + */ +struct mca_btl_base_header_t{ + mca_btl_base_tag_t tag; +}; +typedef struct mca_btl_base_header_t mca_btl_base_header_t; #define MCA_BTL_BASE_HEADER_HTON(hdr) #define MCA_BTL_BASE_HEADER_NTOH(hdr) @@ -359,19 +471,19 @@ typedef struct mca_btl_base_header_t mca_btl_base_header_t; * indicates whether multiple threads may invoke this component * simultaneously or not. * - * @return Array of pointers to BTL modules, or NULL if the transport + * @return Array of pointers to BTL modules, or NULL if the transport * is not available. * * During component initialization, the BTL component should discover * the physical devices that are available for the given transport, - * and create a BTL module to represent each device. Any addressing - * information required by peers to reach the device should be published - * during this function via the modex_send() interface. + * and create a BTL module to represent each device. Any addressing + * information required by peers to reach the device should be published + * during this function via the modex_send() interface. 
* */ typedef struct mca_btl_base_module_t** (*mca_btl_base_component_init_fn_t)( - int *num_btls, + int *num_btls, bool enable_progress_threads, bool enable_mpi_threads ); @@ -380,8 +492,8 @@ typedef struct mca_btl_base_module_t** (*mca_btl_base_component_init_fn_t)( * MCA->BTL Called to progress outstanding requests for * non-threaded polling environments. * - * @return Count of "completions", a metric of - * how many items where completed in the call + * @return Count of "completions", a metric of + * how many items where completed in the call * to progress. */ @@ -390,22 +502,22 @@ typedef int (*mca_btl_base_component_progress_fn_t)(void); /** * Callback function that is called asynchronously on receipt - * of data by the transport layer. - * Note that the the mca_btl_base_descriptor_t is only valid within the - * completion function, this implies that all data payload in the - * mca_btl_base_descriptor_t must be copied out within this callback or + * of data by the transport layer. + * Note that the the mca_btl_base_descriptor_t is only valid within the + * completion function, this implies that all data payload in the + * mca_btl_base_descriptor_t must be copied out within this callback or * forfeited back to the BTL. - * Note also that descriptor segments (des_local) must be base + * Note also that descriptor segments (des_segments) must be base * segments for all callbacks. 
- * + * * @param[IN] btl BTL module - * @param[IN] tag The active message receive callback tag value - * @param[IN] descriptor The BTL descriptor (contains the receive payload) + * @param[IN] tag The active message receive callback tag value + * @param[IN] descriptor The BTL descriptor (contains the receive payload) * @param[IN] cbdata Opaque callback data */ typedef void (*mca_btl_base_module_recv_cb_fn_t)( - struct mca_btl_base_module_t* btl, + struct mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* descriptor, void* cbdata @@ -424,26 +536,22 @@ mca_btl_active_message_callback_t mca_btl_base_active_message_trigger[MCA_BTL_TA * and component open/close/init functions. */ -struct mca_btl_base_component_2_0_0_t { +struct mca_btl_base_component_3_0_0_t { mca_base_component_t btl_version; mca_base_component_data_t btl_data; mca_btl_base_component_init_fn_t btl_init; mca_btl_base_component_progress_fn_t btl_progress; }; -typedef struct mca_btl_base_component_2_0_0_t mca_btl_base_component_2_0_0_t; -typedef struct mca_btl_base_component_2_0_0_t mca_btl_base_component_t; +typedef struct mca_btl_base_component_3_0_0_t mca_btl_base_component_3_0_0_t; +typedef struct mca_btl_base_component_3_0_0_t mca_btl_base_component_t; -/* add the 1_0_0_t typedef for source compatibility - * we can do this safely because 1_0_0 components are the same as - * 1_0_1 components, the difference is in the btl module. - * Fortunately the only difference in the module is an additional interface - * function added to 1_0_1. We can therefore safely treat an older module just - * just like the new one so long as we check the component version - * prior to invoking the new interface function. +/* add the 2_0_0_t typedef for source compatibility + * we can do this safely because 2_0_0 components are the same as + * 3_0_0 components, the difference is in the btl module. 
+ * Unfortunately 2_0_0 modules are not compatible with BTL 3_0_0 and + * can not be used with the new interface. */ -typedef struct mca_btl_base_component_2_0_0_t mca_btl_base_component_1_0_1_t; -typedef struct mca_btl_base_component_2_0_0_t mca_btl_base_component_1_0_0_t; - +typedef struct mca_btl_base_component_3_0_0_t mca_btl_base_component_2_0_0_t; /* @@ -451,24 +559,24 @@ typedef struct mca_btl_base_component_2_0_0_t mca_btl_base_component_1_0_0_t; */ /** - * MCA->BTL Clean up any resources held by BTL module + * MCA->BTL Clean up any resources held by BTL module * before the module is unloaded. - * + * * @param btl (IN) BTL module. * @return OPAL_SUCCESS or error status on failure. * - * Prior to unloading a BTL module, the MCA framework will call - * the BTL finalize method of the module. Any resources held by + * Prior to unloading a BTL module, the MCA framework will call + * the BTL finalize method of the module. Any resources held by * the BTL should be released and if required the memory corresponding * to the BTL module freed. - * + * */ typedef int (*mca_btl_base_module_finalize_fn_t)( struct mca_btl_base_module_t* btl ); - + /** - * BML->BTL notification of change in the process list. + * BML->BTL notification of change in the process list. * * @param btl (IN) BTL module * @param nprocs (IN) Number of processes @@ -477,24 +585,24 @@ typedef int (*mca_btl_base_module_finalize_fn_t)( * @param reachable (OUT) Bitmask indicating set of peer processes that are reachable by this BTL. * @return OPAL_SUCCESS or error status on failure. * - * The mca_btl_base_module_add_procs_fn_t() is called by the BML to + * The mca_btl_base_module_add_procs_fn_t() is called by the BML to * determine the set of BTLs that should be used to reach each process. * Any addressing information exported by the peer via the modex_send() - * function should be available during this call via the corresponding - * modex_recv() function. 
The BTL may utilize this information to - * determine reachability of each peer process. + * function should be available during this call via the corresponding + * modex_recv() function. The BTL may utilize this information to + * determine reachability of each peer process. * - * For each process that is reachable by the BTL, the bit corresponding to the index - * into the proc array (nprocs) should be set in the reachable bitmask. The BTL + * For each process that is reachable by the BTL, the bit corresponding to the index + * into the proc array (nprocs) should be set in the reachable bitmask. The BTL * will return an array of pointers to a data structure defined * by the BTL that is then returned to the BTL on subsequent calls to the BTL data - * transfer functions (e.g btl_send). This may be used by the BTL to cache any addressing + * transfer functions (e.g btl_send). This may be used by the BTL to cache any addressing * or connection information (e.g. TCP socket, IB queue pair). */ typedef int (*mca_btl_base_module_add_procs_fn_t)( - struct mca_btl_base_module_t* btl, + struct mca_btl_base_module_t* btl, size_t nprocs, - struct opal_proc_t** procs, + struct opal_proc_t** procs, struct mca_btl_base_endpoint_t** endpoints, struct opal_bitmap_t* reachable ); @@ -513,9 +621,9 @@ typedef int (*mca_btl_base_module_add_procs_fn_t)( * resources associated with the peer. */ typedef int (*mca_btl_base_module_del_procs_fn_t)( - struct mca_btl_base_module_t* btl, + struct mca_btl_base_module_t* btl, size_t nprocs, - struct opal_proc_t** procs, + struct opal_proc_t** procs, struct mca_btl_base_endpoint_t** peer ); @@ -524,17 +632,17 @@ typedef int (*mca_btl_base_module_del_procs_fn_t)( * of a fragment. 
* * @param[IN] btl BTL module - * @param[IN] tag tag value of this callback + * @param[IN] tag tag value of this callback * (specified on subsequent send operations) * @param[IN] cbfunc The callback function - * @param[IN] cbdata Opaque callback data - * + * @param[IN] cbdata Opaque callback data + * * @return OPAL_SUCCESS The callback was registered successfully * @return OPAL_ERROR The callback was NOT registered successfully * */ typedef int (*mca_btl_base_module_register_fn_t)( - struct mca_btl_base_module_t* btl, + struct mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_module_recv_cb_fn_t cbfunc, void* cbdata @@ -543,10 +651,10 @@ typedef int (*mca_btl_base_module_register_fn_t)( /** * Callback function that is called asynchronously on receipt - * of an error from the transport layer + * of an error from the transport layer * * @param[IN] btl BTL module - * @param[IN] flags type of error + * @param[IN] flags type of error * @param[IN] errproc process that had an error * @param[IN] btlinfo descriptive string from the BTL */ @@ -571,21 +679,21 @@ typedef void (*mca_btl_base_module_error_cb_fn_t)( * */ typedef int (*mca_btl_base_module_register_error_fn_t)( - struct mca_btl_base_module_t* btl, + struct mca_btl_base_module_t* btl, mca_btl_base_module_error_cb_fn_t cbfunc ); /** - * Allocate a descriptor with a segment of the requested size. + * Allocate a descriptor with a segment of the requested size. * Note that the BTL layer may choose to return a smaller size * if it cannot support the request. The order tag value ensures that - * operations on the descriptor that is allocated will be - * ordered w.r.t. a previous operation on a particular descriptor. - * Ordering is only guaranteed if the previous descriptor had its - * local completion callback function called and the order tag of + * operations on the descriptor that is allocated will be + * ordered w.r.t. a previous operation on a particular descriptor. 
+ * Ordering is only guaranteed if the previous descriptor had its + * local completion callback function called and the order tag of * that descriptor is only valid upon the local completion callback function. - * + * * * @param btl (IN) BTL module * @param size (IN) Request segment size. @@ -602,9 +710,9 @@ typedef mca_btl_base_descriptor_t* (*mca_btl_base_module_alloc_fn_t)( /** * Return a descriptor allocated from this BTL via alloc/prepare. - * A descriptor can only be deallocated after its local completion + * A descriptor can only be deallocated after its local completion * callback function has called for all send/put/get operations. - * + * * @param btl (IN) BTL module * @param segment (IN) Descriptor allocated from the BTL */ @@ -615,23 +723,16 @@ typedef int (*mca_btl_base_module_free_fn_t)( /** - * Prepare a descriptor for send/put/get using the supplied - * convertor. If the convertor references data that is contiguous, - * the descriptor may simply point to the user buffer. Otherwise, - * this routine is responsible for allocating buffer space and - * packing if required. + * Prepare a descriptor for send using the supplied convertor. If the convertor + * references data that is contiguous, the descriptor may simply point to the + * user buffer. Otherwise, this routine is responsible for allocating buffer + * space and packing if required. * - * The descriptor returned can be used in multiple concurrent operations - * (send/put/get) unless the BTL has the MCA_BTL_FLAGS_RDMA_MATCHED flag set - * in which case a corresponding prepare call must accompany the put/get call - * in addition, the address and length that is put/get must match the address - * and length which is prepared. - * - * The order tag value ensures that operations on the + * The order tag value ensures that operations on the * descriptor that is prepared will be ordered w.r.t. a previous - * operation on a particular descriptor. 
Ordering is only guaranteed if - * the previous descriptor had its local completion callback function - * called and the order tag of that descriptor is only valid upon the local + * operation on a particular descriptor. Ordering is only guaranteed if + * the previous descriptor had its local completion callback function + * called and the order tag of that descriptor is only valid upon the local * completion callback function. * * @param btl (IN) BTL module @@ -647,7 +748,6 @@ typedef int (*mca_btl_base_module_free_fn_t)( typedef struct mca_btl_base_descriptor_t* (*mca_btl_base_module_prepare_fn_t)( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - mca_mpool_base_registration_t* registration, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, @@ -655,22 +755,67 @@ typedef struct mca_btl_base_descriptor_t* (*mca_btl_base_module_prepare_fn_t)( uint32_t flags ); +/** + * @brief Register a memory region for put/get/atomic operations. + * + * @param btl (IN) BTL module + * @param endpoint(IN) BTL addressing information (or NULL for all endpoints) + * @param base (IN) Pointer to start of region + * @param size (IN) Size of region + * @param flags (IN) Flags including access permissions + * + * @returns a memory registration handle valid for both local and remote operations + * @returns NULL if the region could not be registered + * + * This function registers the specified region with the hardware for use with + * the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop + * functions. Care should be taken to not hold an excessive number of registrations + * as they may use limited system/NIC resources. + * + * Ownership of the memory pointed to by the returned (struct + * mca_btl_base_registration_handle_t*) is passed to the caller. The + * BTL module cannot free or reuse the handle until it is returned via + * the mca_btl_base_module_deregister_mem_fn_t function. 
+ */ +typedef struct mca_btl_base_registration_handle_t *(*mca_btl_base_module_register_mem_fn_t)( + struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base, + size_t size, uint32_t flags); + +/** + * @brief Deregister a memory region + * + * @param btl (IN) BTL module region was registered with + * @param handle (IN) BTL registration handle to deregister + * + * This function deregisters the memory region associated with the specified handle. Care + * should be taken to not perform any RDMA or atomic operation on this memory region + * after it is deregistered. It is erroneous to specify a memory handle associated with + * a remote node. + * + * The handle passed in will be a value previously returned by the + * mca_btl_base_module_register_mem_fn_t function. Ownership of the + * memory pointed to by handle passes to the BTL module; this function + * is now allowed to free the memory, return it to a freelist, etc. + */ +typedef int (*mca_btl_base_module_deregister_mem_fn_t)( + struct mca_btl_base_module_t* btl, struct mca_btl_base_registration_handle_t *handle); + /** * Initiate an asynchronous send. * Completion Semantics: the descriptor has been queued for a send operation - * the BTL now controls the descriptor until local + * the BTL now controls the descriptor until local * completion callback is made on the descriptor - * + * * All BTLs allow multiple concurrent asynchronous send operations on a descriptor * * @param btl (IN) BTL module * @param endpoint (IN) BTL addressing information * @param descriptor (IN) Description of the data to be transfered * @param tag (IN) The tag value used to notify the peer.
- * - * @retval OPAL_SUCCESS The descriptor was successfully queued for a send - * @retval OPAL_ERROR The descriptor was NOT successfully queued for a send - * @retval OPAL_ERR_UNREACH The endpoint is not reachable + * + * @retval OPAL_SUCCESS The descriptor was successfully queued for a send + * @retval OPAL_ERROR The descriptor was NOT successfully queued for a send + * @retval OPAL_ERR_UNREACH The endpoint is not reachable */ typedef int (*mca_btl_base_module_send_fn_t)( struct mca_btl_base_module_t* btl, @@ -680,12 +825,12 @@ typedef int (*mca_btl_base_module_send_fn_t)( ); /** - * Initiate an immediate blocking send. - * Completion Semantics: the BTL will make a best effort - * to send the header and "size" bytes from the datatype using the convertor. - * The header is guaranteed to be delivered entirely in the first segment. - * Should the BTL be unable to deliver the data due to resource constraints - * the BTL will return a descriptor (via the OUT param) + * Initiate an immediate blocking send. + * Completion Semantics: the BTL will make a best effort + * to send the header and "size" bytes from the datatype using the convertor. + * The header is guaranteed to be delivered entirely in the first segment. + * Should the BTL be unable to deliver the data due to resource constraints + * the BTL will return a descriptor (via the OUT param) * of size "payload_size + header_size". * * @param btl (IN) BTL module @@ -698,13 +843,13 @@ typedef int (*mca_btl_base_module_send_fn_t)( * @param flags (IN) Flags. * @param tag (IN) The tag value used to notify the peer. * @param descriptor (OUT) The descriptor to be returned unable to be sent immediately - - * @retval OPAL_SUCCESS The send was successfully queued - * @retval OPAL_ERROR The send failed - * @retval OPAL_ERR_UNREACH The endpoint is not reachable - * @retval OPAL_ERR_RESOURCE_BUSY The BTL is busy a descriptor will be returned - * (via the OUT param) if descriptors are available - + * (may be NULL). 
+ * + * @retval OPAL_SUCCESS The send was successfully queued + * @retval OPAL_ERROR The send failed + * @retval OPAL_ERR_UNREACH The endpoint is not reachable + * @retval OPAL_ERR_RESOURCE_BUSY The BTL is busy a descriptor will be returned + * (via the OUT param) if descriptors are available */ typedef int (*mca_btl_base_module_sendi_fn_t)( @@ -721,59 +866,211 @@ typedef int (*mca_btl_base_module_sendi_fn_t)( ); /** - * Initiate an asynchronous put. - * Completion Semantics: the descriptor has been queued for a put operation - * the BTL now controls the descriptor until local - * completion callback is made on the descriptor + * Initiate an asynchronous put. + * Completion Semantics: if this function returns a 1 then the operation + * is complete. a return of OPAL_SUCCESS indicates + * the put operation has been queued with the + * network. the local_handle can not be deregistered + * until all outstanding operations on that handle + * have been completed. + * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param local_address (IN) Local address to put from (registered) + * @param remote_address (IN) Remote address to put to (registered remotely) + * @param local_handle (IN) Registration handle for region containing + * (local_address, local_address + size) + * @param remote_handle (IN) Remote registration handle for region containing + * (remote_address, remote_address + size) + * @param size (IN) Number of bytes to put + * @param flags (IN) Flags for this put operation + * @param order (IN) Ordering + * @param cbfunc (IN) Function to call on completion (if queued) + * @param cbcontext (IN) Context for the callback + * @param cbdata (IN) Data for callback * - * BTLs that do not have the MCA_BTL_FLAGS_RDMA_MATCHED flag set - * allow multiple concurrent put operations on the same descriptor. 
- * BTLs that do have the MCA_BTL_FLAGS_RDMA_MATCHED flag set require - * a corresponding prepare_src/dst call for each put operation and - * therefore prohibit multiple concurrent put operations. - * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred - * * @retval OPAL_SUCCESS The descriptor was successfully queued for a put * @retval OPAL_ERROR The descriptor was NOT successfully queued for a put + * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put + * operation. Try again later + * @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or + * alignment restrictions. */ - -typedef int (*mca_btl_base_module_put_fn_t)( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_btl_base_descriptor_t* descriptor -); +typedef int (*mca_btl_base_module_put_fn_t) (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); /** * Initiate an asynchronous get. + * Completion Semantics: if this function returns a 1 then the operation + * is complete. a return of OPAL_SUCCESS indicates + * the get operation has been queued with the + * network. the local_handle can not be deregistered + * until all outstanding operations on that handle + * have been completed. * - * Completion Semantics: the descriptor has been queued for a get operation - * the BTL now controls the descriptor until local - * completion callback is made on the descriptor - * - * BTLs that do not have the MCA_BTL_FLAGS_RDMA_MATCHED flag set - * allow multiple concurrent get operations on the same descriptor. 
- * BTLs that do have the MCA_BTL_FLAGS_RDMA_MATCHED flag set require - * a corresponding prepare_src/dst call for each get operation and - * therefore prohibit multiple concurrent get operations. - * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred - * - * @retval OPAL_SUCCESS The descriptor was successfully queued for a get - * @retval OPAL_ERROR The descriptor was NOT successfully queued for a get + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param local_address (IN) Local address to get into (registered) + * @param remote_address (IN) Remote address to get from (registered remotely) + * @param local_handle (IN) Registration handle for region containing + * (local_address, local_address + size) + * @param remote_handle (IN) Remote registration handle for region containing + * (remote_address, remote_address + size) + * @param size (IN) Number of bytes to get + * @param flags (IN) Flags for this get operation + * @param order (IN) Ordering + * @param cbfunc (IN) Function to call on completion (if queued) + * @param cbcontext (IN) Context for the callback + * @param cbdata (IN) Data for callback * + * @retval OPAL_SUCCESS The descriptor was successfully queued for a get + * @retval OPAL_ERROR The descriptor was NOT successfully queued for a get + * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the get + * operation. Try again later + * @retval OPAL_ERR_NOT_AVAILABLE Get can not be performed due to size or + * alignment restrictions.
*/ +typedef int (*mca_btl_base_module_get_fn_t) (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); -typedef int (*mca_btl_base_module_get_fn_t)( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_btl_base_descriptor_t* descriptor -); +/** + * Initiate an asynchronous atomic operation. + * Completion Semantics: if this function returns a 1 then the operation + * is complete. a return of OPAL_SUCCESS indicates + * the atomic operation has been queued with the + * network. + * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param remote_address (IN) Remote address to perform the operation on (registered remotely) + * @param remote_handle (IN) Remote registration handle for region containing + * (remote_address, remote_address + 8) + * @param op (IN) Operation to perform + * @param operand (IN) Operand for the operation + * @param flags (IN) Flags for this atomic operation + * @param order (IN) Ordering + * @param cbfunc (IN) Function to call on completion (if queued) + * @param cbcontext (IN) Context for the callback + * @param cbdata (IN) Data for callback + * + * @retval OPAL_SUCCESS The operation was successfully queued + * @retval 1 The operation is complete + * @retval OPAL_ERROR The operation was NOT successfully queued + * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic + * operation. Try again later + * @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to + * alignment restrictions or the operation {op} is not supported + * by the hardware.
+ * + * After the operation is complete the remote address specified by {remote_address} and + * {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand. + * The btl will guarantee consistency of atomic operations performed via the btl. Note, + * however, that not all btls will provide consistency between btl atomic operations and + * cpu or other btl atomics. + */ +typedef int (*mca_btl_base_module_atomic_op64_fn_t) (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op, + uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata); +/** + * Initiate an asynchronous fetching atomic operation. + * Completion Semantics: if this function returns a 1 then the operation + * is complete. a return of OPAL_SUCCESS indicates + * the atomic operation has been queued with the + * network. 
+ * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param local_address (OUT) Local address to store the result in + * @param remote_address (IN) Remote address to perform the operation on (registered remotely) + * @param local_handle (IN) Local registration handle for region containing + * (local_address, local_address + 8) + * @param remote_handle (IN) Remote registration handle for region containing + * (remote_address, remote_address + 8) + * @param op (IN) Operation to perform + * @param operand (IN) Operand for the operation + * @param flags (IN) Flags for this atomic operation + * @param order (IN) Ordering + * @param cbfunc (IN) Function to call on completion (if queued) + * @param cbcontext (IN) Context for the callback + * @param cbdata (IN) Data for callback + * + * @retval OPAL_SUCCESS The operation was successfully queued + * @retval 1 The operation is complete + * @retval OPAL_ERROR The operation was NOT successfully queued + * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic + * operation. Try again later + * @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to + * alignment restrictions or the operation {op} is not supported + * by the hardware. + * + * After the operation is complete the remote address specified by {remote_address} and + * {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand. + * {local_address} will be updated with the previous value stored in {remote_address}. + * The btl will guarantee consistency of atomic operations performed via the btl. Note, + * however, that not all btls will provide consistency between btl atomic operations and + * cpu or other btl atomics.
+ */ +typedef int (*mca_btl_base_module_atomic_fop64_fn_t) (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op, + uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata); + +/** + * Initiate an asynchronous compare and swap operation. + * Completion Semantics: if this function returns a 1 then the operation + * is complete. a return of OPAL_SUCCESS indicates + * the atomic operation has been queued with the + * network. + * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param local_address (OUT) Local address to store the result in + * @param remote_address (IN) Remote address to perform the operation on (registered remotely) + * @param local_handle (IN) Local registration handle for region containing + * (local_address, local_address + 8) + * @param remote_handle (IN) Remote registration handle for region containing + * (remote_address, remote_address + 8) + * @param compare (IN) Value to compare the remote value against + * @param value (IN) Value to store on success + * @param flags (IN) Flags for this atomic operation + * @param order (IN) Ordering + * @param cbfunc (IN) Function to call on completion (if queued) + * @param cbcontext (IN) Context for the callback + * @param cbdata (IN) Data for callback + * + * @retval OPAL_SUCCESS The operation was successfully queued + * @retval 1 The operation is complete + * @retval OPAL_ERROR The operation was NOT successfully queued + * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic + * operation. Try again later + * @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to + * alignment restrictions or compare-and-swap is not supported + * by the hardware.
+ * + * After the operation is complete the remote address specified by {remote_address} and + * {remote_handle} will be updated with {value} if *remote_address == compare. + * {local_address} will be updated with the previous value stored in {remote_address}. + * The btl will guarantee consistency of atomic operations performed via the btl. Note, + * however, that not all btls will provide consistency between btl atomic operations and + * cpu atomics. + */ +typedef int (*mca_btl_base_module_atomic_cswap64_fn_t) (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, + uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata); /** * Diagnostic dump of btl state. @@ -813,7 +1110,14 @@ struct mca_btl_base_module_t { uint32_t btl_latency; /**< relative ranking of latency used to prioritize btls */ uint32_t btl_bandwidth; /**< bandwidth (Mbytes/sec) supported by each endpoint */ uint32_t btl_flags; /**< flags (put/get...) 
*/ - size_t btl_seg_size; /**< size of a btl segment */ + uint32_t btl_atomic_flags; /**< atomic operations supported (add, and, xor, etc) */ + size_t btl_registration_handle_size; /**< size of the BTLs registration handles */ + + /* One-sided limitations (0 for no alignment, SIZE_MAX for no limit ) */ + size_t btl_get_limit; /**< maximum size supported by the btl_get function */ + size_t btl_get_alignment; /**< minimum alignment/size needed by btl_get (power of 2) */ + size_t btl_put_limit; /**< maximum size supported by the btl_put function */ + size_t btl_put_alignment; /**< minimum alignment/size needed by btl_put (power of 2) */ /* BTL function table */ mca_btl_base_module_add_procs_fn_t btl_add_procs; @@ -824,16 +1128,24 @@ struct mca_btl_base_module_t { mca_btl_base_module_alloc_fn_t btl_alloc; mca_btl_base_module_free_fn_t btl_free; mca_btl_base_module_prepare_fn_t btl_prepare_src; - mca_btl_base_module_prepare_fn_t btl_prepare_dst; mca_btl_base_module_send_fn_t btl_send; mca_btl_base_module_sendi_fn_t btl_sendi; mca_btl_base_module_put_fn_t btl_put; mca_btl_base_module_get_fn_t btl_get; - mca_btl_base_module_dump_fn_t btl_dump; - - /** the mpool associated with this btl (optional) */ - mca_mpool_base_module_t* btl_mpool; - /** register a default error handler */ + mca_btl_base_module_dump_fn_t btl_dump; + + /* atomic operations */ + mca_btl_base_module_atomic_op64_fn_t btl_atomic_op; + mca_btl_base_module_atomic_fop64_fn_t btl_atomic_fop; + mca_btl_base_module_atomic_cswap64_fn_t btl_atomic_cswap; + + /* new memory registration functions */ + mca_btl_base_module_register_mem_fn_t btl_register_mem; /**< memory registration function (NULL if not needed) */ + mca_btl_base_module_deregister_mem_fn_t btl_deregister_mem; /**< memory deregistration function (NULL if not needed) */ + + /** the mpool associated with this btl (optional) */ + mca_mpool_base_module_t* btl_mpool; + /** register a default error handler */ mca_btl_base_module_register_error_fn_t 
btl_register_error; /** fault tolerant even notification */ mca_btl_base_module_ft_event_fn_t btl_ft_event; diff --git a/opal/mca/btl/openib/Makefile.am b/opal/mca/btl/openib/Makefile.am index dfd65ac4c1..8ec0d4398f 100644 --- a/opal/mca/btl/openib/Makefile.am +++ b/opal/mca/btl/openib/Makefile.am @@ -59,6 +59,9 @@ sources = \ btl_openib_fd.c \ btl_openib_ip.h \ btl_openib_ip.c \ + btl_openib_put.c \ + btl_openib_get.c \ + btl_openib_atomic.c \ connect/base.h \ connect/btl_openib_connect_base.c \ connect/btl_openib_connect_empty.c \ diff --git a/opal/mca/btl/openib/btl_openib.c b/opal/mca/btl/openib/btl_openib.c index 3d2ed4e78d..338cef48e3 100644 --- a/opal/mca/btl/openib/btl_openib.c +++ b/opal/mca/btl/openib/btl_openib.c @@ -92,6 +92,11 @@ #define MIN(a,b) ((a)<(b)?(a):(b)) #endif +static mca_btl_base_registration_handle_t *mca_btl_openib_register_mem (mca_btl_base_module_t *btl, + mca_btl_base_endpoint_t *endpoint, + void *base, size_t size, uint32_t flags); +static int mca_btl_openib_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle); + mca_btl_openib_module_t mca_btl_openib_module = { .super = { .btl_component = &mca_btl_openib_component.super, @@ -102,14 +107,19 @@ mca_btl_openib_module_t mca_btl_openib_module = { .btl_alloc = mca_btl_openib_alloc, .btl_free = mca_btl_openib_free, .btl_prepare_src = mca_btl_openib_prepare_src, - .btl_prepare_dst = mca_btl_openib_prepare_dst, .btl_send = mca_btl_openib_send, .btl_sendi = mca_btl_openib_sendi, /* send immediate */ .btl_put = mca_btl_openib_put, .btl_get = mca_btl_openib_get, .btl_dump = mca_btl_base_dump, .btl_register_error = mca_btl_openib_register_error_cb, /* error call back registration */ - .btl_ft_event = mca_btl_openib_ft_event + .btl_ft_event = mca_btl_openib_ft_event, + .btl_register_mem = mca_btl_openib_register_mem, + .btl_deregister_mem = mca_btl_openib_deregister_mem, +#if HAVE_DECL_IBV_ATOMIC_HCA + .btl_atomic_fop = mca_btl_openib_atomic_fop, + 
.btl_atomic_cswap = mca_btl_openib_atomic_cswap, +#endif } }; @@ -854,6 +864,12 @@ int mca_btl_openib_add_procs( return rc; } + rc = mca_btl_openib_size_queues(openib_btl, nprocs); + if (OPAL_SUCCESS != rc) { + BTL_ERROR(("error creating cqs")); + return rc; + } + for (i = 0, local_procs = 0 ; i < (int) nprocs; i++) { struct opal_proc_t* proc = procs[i]; mca_btl_openib_proc_t* ib_proc; @@ -865,11 +881,6 @@ int mca_btl_openib_add_procs( local_procs ++; } - /* OOB, XOOB, and RDMACM do not support SELF comunication, so - * mark the prco as unreachable by openib btl */ - if (0 == opal_compare_proc(OPAL_PROC_MY_NAME, proc->proc_name)) { - continue; - } #if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE) /* Most current iWARP adapters (June 2008) cannot handle talking to other processes on the same host (!) -- so mark @@ -1036,7 +1047,7 @@ int mca_btl_openib_add_procs( openib_btl->local_procs += local_procs; openib_btl->device->mem_reg_max /= openib_btl->local_procs; - return mca_btl_openib_size_queues(openib_btl, nprocs); + return OPAL_SUCCESS; } /* @@ -1275,18 +1286,6 @@ int mca_btl_openib_free( struct mca_btl_base_module_t* btl, mca_btl_base_descriptor_t* des) { - /* is this fragment pointing at user memory? 
*/ - if(MCA_BTL_OPENIB_FRAG_SEND_USER == openib_frag_type(des) || - MCA_BTL_OPENIB_FRAG_RECV_USER == openib_frag_type(des)) { - mca_btl_openib_com_frag_t* frag = to_com_frag(des); - - if(frag->registration != NULL) { - btl->btl_mpool->mpool_deregister(btl->btl_mpool, - (mca_mpool_base_registration_t*)frag->registration); - frag->registration = NULL; - } - } - /* reset those field on free so we will not have to do it on alloc */ to_base_frag(des)->base.des_flags = 0; switch(openib_frag_type(des)) { @@ -1302,12 +1301,6 @@ int mca_btl_openib_free( to_send_frag(des)->hdr + 1; assert(!opal_list_get_size(&to_send_frag(des)->coalesced_frags)); /* fall through */ - case MCA_BTL_OPENIB_FRAG_RECV: - case MCA_BTL_OPENIB_FRAG_RECV_USER: - case MCA_BTL_OPENIB_FRAG_SEND_USER: - to_base_frag(des)->base.des_remote = NULL; - to_base_frag(des)->base.des_remote_count = 0; - break; default: break; } @@ -1351,15 +1344,12 @@ int mca_btl_openib_free( mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - mca_mpool_base_registration_t* registration, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, size_t* size, uint32_t flags) { - mca_btl_openib_module_t *openib_btl; - mca_btl_openib_reg_t *openib_reg; mca_btl_openib_com_frag_t *frag = NULL; struct iovec iov; uint32_t iov_count = 1; @@ -1367,85 +1357,20 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( void *ptr; int rc; - openib_btl = (mca_btl_openib_module_t*)btl; - -#if OPAL_CUDA_GDR_SUPPORT - if(opal_convertor_cuda_need_buffers(convertor) == false && 0 == reserve) { -#else - if(opal_convertor_need_buffers(convertor) == false && 0 == reserve) { -#endif /* OPAL_CUDA_GDR_SUPPORT */ - /* GMS bloody HACK! 
*/ - if(registration != NULL || max_data > btl->btl_max_send_size) { - frag = alloc_send_user_frag(); - if(NULL == frag) { - return NULL; - } - - iov.iov_len = max_data; - iov.iov_base = NULL; - - opal_convertor_pack(convertor, &iov, &iov_count, &max_data); - - *size = max_data; - - if(NULL == registration) { - rc = btl->btl_mpool->mpool_register(btl->btl_mpool, - iov.iov_base, max_data, 0, &registration); - if(OPAL_SUCCESS != rc || NULL == registration) { - MCA_BTL_IB_FRAG_RETURN(frag); - return NULL; - } - /* keep track of the registration we did */ - to_com_frag(frag)->registration = - (mca_btl_openib_reg_t*)registration; - } - openib_reg = (mca_btl_openib_reg_t*)registration; - - frag->sg_entry.length = max_data; - frag->sg_entry.lkey = openib_reg->mr->lkey; - frag->sg_entry.addr = (uint64_t)(uintptr_t)iov.iov_base; - - to_base_frag(frag)->base.order = order; - to_base_frag(frag)->base.des_flags = flags; - to_base_frag(frag)->segment.base.seg_len = max_data; - to_base_frag(frag)->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) iov.iov_base; - to_base_frag(frag)->segment.key = frag->sg_entry.lkey; - - assert(MCA_BTL_NO_ORDER == order); - - BTL_VERBOSE(("frag->sg_entry.lkey = %" PRIu32 " .addr = %" PRIx64, - frag->sg_entry.lkey, frag->sg_entry.addr)); - - return &to_base_frag(frag)->base; - } - } - assert(MCA_BTL_NO_ORDER == order); - if(max_data + reserve > btl->btl_max_send_size) { + if (max_data + reserve > btl->btl_max_send_size) { max_data = btl->btl_max_send_size - reserve; } - if (OPAL_UNLIKELY(0 == reserve)) { - frag = (mca_btl_openib_com_frag_t *) ib_frag_alloc(openib_btl, max_data, order, flags); - if(NULL == frag) - return NULL; - - /* NTH: this frag will be ue used for either a get or put so we need to set the lval to be - consistent with the usage in get and put.
the pval will be restored in mca_btl_openib_free */ - ptr = to_base_frag(frag)->segment.base.seg_addr.pval; - to_base_frag(frag)->segment.base.seg_addr.lval = - (uint64_t)(uintptr_t) ptr; - } else { - frag = - (mca_btl_openib_com_frag_t *) mca_btl_openib_alloc(btl, endpoint, order, + frag = (mca_btl_openib_com_frag_t *) mca_btl_openib_alloc (btl, endpoint, order, max_data + reserve, flags); - if(NULL == frag) - return NULL; - - ptr = to_base_frag(frag)->segment.base.seg_addr.pval; + if (NULL == frag) { + return NULL; } + ptr = to_base_frag(frag)->segment.base.seg_addr.pval; + iov.iov_len = max_data; iov.iov_base = (IOVBASE_TYPE *) ( (unsigned char*) ptr + reserve ); rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data); @@ -1468,103 +1393,6 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( return &to_base_frag(frag)->base; } -/** - * Prepare the dst buffer - * - * @param btl (IN) BTL module - * @param peer (IN) BTL peer addressing - * prepare dest's behavior depends on the following: - * Has a valid memory registration been passed to prepare_src? - * if so we attempt to use the pre-registered user-buffer, if the memory registration - * is to small (only a portion of the user buffer) then we must reregister the user buffer - * Has the user requested the memory to be left pinned? - * if so we insert the memory registration into a memory tree for later lookup, we - * may also remove a previous registration if a MRU (most recently used) list of - * registrations is full, this prevents resources from being exhausted. 
- */ -mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags) -{ - mca_btl_openib_module_t *openib_btl; - mca_btl_openib_component_t *openib_component; - mca_btl_openib_com_frag_t *frag; - mca_btl_openib_reg_t *openib_reg; - uint32_t max_msg_sz; - int rc; - void *buffer; - - openib_btl = (mca_btl_openib_module_t*)btl; - openib_component = (mca_btl_openib_component_t*)btl->btl_component; - - frag = alloc_recv_user_frag(); - if(NULL == frag) { - return NULL; - } - - /* max_msg_sz is the maximum message size of the HCA (hw limitation) - set the minimum between local max_msg_sz and the remote */ - max_msg_sz = MIN(openib_btl->ib_port_attr.max_msg_sz, - endpoint->endpoint_btl->ib_port_attr.max_msg_sz); - - /* check if user has explicitly limited the max message size */ - if (openib_component->max_hw_msg_size > 0 && - max_msg_sz > (size_t)openib_component->max_hw_msg_size) { - max_msg_sz = openib_component->max_hw_msg_size; - } - - /* limit the message so to max_msg_sz */ - if (*size > (size_t)max_msg_sz) { - *size = (size_t)max_msg_sz; - BTL_VERBOSE(("message size limited to %" PRIsize_t "\n", *size)); - } - - opal_convertor_get_current_pointer(convertor, &buffer); - - if(NULL == registration){ - /* we didn't get a memory registration passed in, so we have to - * register the region ourselves - */ - uint32_t mflags = 0; -#if OPAL_CUDA_GDR_SUPPORT - if (convertor->flags & CONVERTOR_CUDA) { - mflags |= MCA_MPOOL_FLAGS_CUDA_GPU_MEM; - } -#endif /* OPAL_CUDA_GDR_SUPPORT */ - rc = btl->btl_mpool->mpool_register(btl->btl_mpool, buffer, *size, mflags, - &registration); - if(OPAL_SUCCESS != rc || NULL == registration) { - MCA_BTL_IB_FRAG_RETURN(frag); - return NULL; - } - /* keep track of the registration we did */ - frag->registration =
(mca_btl_openib_reg_t*)registration; - } - openib_reg = (mca_btl_openib_reg_t*)registration; - - frag->sg_entry.length = *size; - frag->sg_entry.lkey = openib_reg->mr->lkey; - frag->sg_entry.addr = (uint64_t)(uintptr_t)buffer; - - to_base_frag(frag)->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) buffer; - to_base_frag(frag)->segment.base.seg_len = *size; - to_base_frag(frag)->segment.key = openib_reg->mr->rkey; - to_base_frag(frag)->base.order = order; - to_base_frag(frag)->base.des_flags = flags; - - BTL_VERBOSE(("frag->sg_entry.lkey = %" PRIu32 " .addr = %" PRIx64 " " - "rkey = %" PRIu32, frag->sg_entry.lkey, frag->sg_entry.addr, - openib_reg->mr->rkey)); - - return &to_base_frag(frag)->base; -} - static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl) { mca_btl_openib_module_t* openib_btl; mca_btl_openib_endpoint_t* endpoint; @@ -1825,7 +1653,10 @@ cant_send_wqe: cant_send: OPAL_THREAD_UNLOCK(&ep->endpoint_lock); /* We can not send the data directly, so we just return descriptor */ - *descriptor = mca_btl_openib_alloc(btl, ep, order, size, flags); + if (NULL != descriptor) { + *descriptor = mca_btl_openib_alloc(btl, ep, order, size, flags); + } + return OPAL_ERR_RESOURCE_BUSY; } /* @@ -1855,7 +1686,7 @@ int mca_btl_openib_send( to_coalesced_frag(des)->sent = true; to_coalesced_frag(des)->hdr->tag = tag; - to_coalesced_frag(des)->hdr->size = des->des_local->seg_len; + to_coalesced_frag(des)->hdr->size = des->des_segments->seg_len; if(ep->nbo) BTL_OPENIB_HEADER_COALESCED_HTON(*to_coalesced_frag(des)->hdr); } else { @@ -1869,171 +1700,34 @@ int mca_btl_openib_send( return mca_btl_openib_endpoint_send(ep, frag); } -/* - * RDMA WRITE local buffer to remote buffer address. 
- */ - -int mca_btl_openib_put( mca_btl_base_module_t* btl, - mca_btl_base_endpoint_t* ep, - mca_btl_base_descriptor_t* descriptor) +static mca_btl_base_registration_handle_t *mca_btl_openib_register_mem (mca_btl_base_module_t *btl, + mca_btl_base_endpoint_t *endpoint, + void *base, size_t size, uint32_t flags) { - mca_btl_openib_segment_t *src_seg = (mca_btl_openib_segment_t *) descriptor->des_local; - mca_btl_openib_segment_t *dst_seg = (mca_btl_openib_segment_t *) descriptor->des_remote; - struct ibv_send_wr* bad_wr; - mca_btl_openib_out_frag_t* frag = to_out_frag(descriptor); - int qp = descriptor->order; - uint64_t rem_addr = dst_seg->base.seg_addr.lval; - uint32_t rkey = dst_seg->key; + mca_btl_openib_reg_t *reg; + uint32_t mflags = 0; + int rc; - assert(openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_SEND_USER || - openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_SEND); +#if OPAL_CUDA_GDR_SUPPORT + if (flags & MCA_BTL_REG_FLAG_CUDA_GPU_MEM) { + mflags |= MCA_MPOOL_FLAGS_CUDA_GPU_MEM; + } +#endif /* OPAL_CUDA_GDR_SUPPORT */ - descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - - if(ep->endpoint_state != MCA_BTL_IB_CONNECTED) { - int rc; - OPAL_THREAD_LOCK(&ep->endpoint_lock); - rc = check_endpoint_state(ep, descriptor, &ep->pending_put_frags); - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if(OPAL_ERR_RESOURCE_BUSY == rc) - return OPAL_SUCCESS; - if(OPAL_SUCCESS != rc) - return rc; + rc = btl->btl_mpool->mpool_register (btl->btl_mpool, base, size, mflags, + (mca_mpool_base_registration_t **) &reg); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc || NULL == reg)) { + return NULL; } - if(MCA_BTL_NO_ORDER == qp) - qp = mca_btl_openib_component.rdma_qp; - - /* check for a send wqe */ - if (qp_get_wqe(ep, qp) < 0) { - qp_put_wqe(ep, qp); - OPAL_THREAD_LOCK(&ep->endpoint_lock); - opal_list_append(&ep->pending_put_frags, (opal_list_item_t*)frag); - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - return OPAL_SUCCESS; - } - /* post descriptor */ -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
- if((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN) - != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) { - rem_addr = opal_swap_bytes8(rem_addr); - rkey = opal_swap_bytes4(rkey); - } -#endif - frag->sr_desc.wr.rdma.remote_addr = rem_addr; - frag->sr_desc.wr.rdma.rkey = rkey; - - to_com_frag(frag)->sg_entry.addr = src_seg->base.seg_addr.lval; - to_com_frag(frag)->sg_entry.length = src_seg->base.seg_len; - to_com_frag(frag)->endpoint = ep; -#if HAVE_XRC - if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - frag->sr_desc.qp_type.xrc.remote_srqn=ep->rem_info.rem_srqs[qp].rem_srq_num; -#else - frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num; -#endif -#endif - - descriptor->order = qp; - /* Setting opcode on a frag constructor isn't enough since prepare_src - * may return send_frag instead of put_frag */ - frag->sr_desc.opcode = IBV_WR_RDMA_WRITE; - frag->sr_desc.send_flags = ib_send_flags(descriptor->des_local->seg_len, &(ep->qps[qp]), 1); - qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag)); - qp_reset_signal_count(ep, qp); - - qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag)); - qp_reset_signal_count(ep, qp); - - if(ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr)) - return OPAL_ERROR; - - return OPAL_SUCCESS; + return &reg->btl_handle; } -/* - * RDMA READ remote buffer to local buffer address.
- */ - -int mca_btl_openib_get(mca_btl_base_module_t* btl, - mca_btl_base_endpoint_t* ep, - mca_btl_base_descriptor_t* descriptor) +static int mca_btl_openib_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle) { - mca_btl_openib_segment_t *src_seg = (mca_btl_openib_segment_t *) descriptor->des_remote; - mca_btl_openib_segment_t *dst_seg = (mca_btl_openib_segment_t *) descriptor->des_local; - struct ibv_send_wr* bad_wr; - mca_btl_openib_get_frag_t* frag = to_get_frag(descriptor); - int qp = descriptor->order; - uint64_t rem_addr = src_seg->base.seg_addr.lval; - uint32_t rkey = src_seg->key; + mca_btl_openib_reg_t *reg = (mca_btl_openib_reg_t *)((intptr_t) handle - offsetof (mca_btl_openib_reg_t, btl_handle)); - assert(openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_RECV_USER); - - descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - - if(ep->endpoint_state != MCA_BTL_IB_CONNECTED) { - int rc; - OPAL_THREAD_LOCK(&ep->endpoint_lock); - rc = check_endpoint_state(ep, descriptor, &ep->pending_get_frags); - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if(OPAL_ERR_RESOURCE_BUSY == rc) - return OPAL_SUCCESS; - if(OPAL_SUCCESS != rc) - return rc; - } - - if(MCA_BTL_NO_ORDER == qp) - qp = mca_btl_openib_component.rdma_qp; - - /* check for a send wqe */ - if (qp_get_wqe(ep, qp) < 0) { - qp_put_wqe(ep, qp); - OPAL_THREAD_LOCK(&ep->endpoint_lock); - opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag); - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - return OPAL_SUCCESS; - } - - /* check for a get token */ - if(OPAL_THREAD_ADD32(&ep->get_tokens,-1) < 0) { - qp_put_wqe(ep, qp); - OPAL_THREAD_ADD32(&ep->get_tokens,1); - OPAL_THREAD_LOCK(&ep->endpoint_lock); - opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag); - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - return OPAL_SUCCESS; - } - -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN) - != 
(opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) { - rem_addr = opal_swap_bytes8(rem_addr); - rkey = opal_swap_bytes4(rkey); - } -#endif - frag->sr_desc.wr.rdma.remote_addr = rem_addr; - frag->sr_desc.wr.rdma.rkey = rkey; - - to_com_frag(frag)->sg_entry.addr = dst_seg->base.seg_addr.lval; - to_com_frag(frag)->sg_entry.length = dst_seg->base.seg_len; - to_com_frag(frag)->endpoint = ep; - -#if HAVE_XRC - if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - frag->sr_desc.qp_type.xrc.remote_srqn=ep->rem_info.rem_srqs[qp].rem_srq_num; -#else - frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num; -#endif -#endif - descriptor->order = qp; - - qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag)); - qp_reset_signal_count(ep, qp); - - if(ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr)) - return OPAL_ERROR; + btl->btl_mpool->mpool_deregister (btl->btl_mpool, (mca_mpool_base_registration_t *) reg); return OPAL_SUCCESS; } diff --git a/opal/mca/btl/openib/btl_openib.h b/opal/mca/btl/openib/btl_openib.h index 92e3d6a6eb..1282e8bc23 100644 --- a/opal/mca/btl/openib/btl_openib.h +++ b/opal/mca/btl/openib/btl_openib.h @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. 
@@ -148,7 +148,7 @@ typedef struct mca_btl_openib_srq_manager_t { } mca_btl_openib_srq_manager_t; struct mca_btl_openib_component_t { - mca_btl_base_component_2_0_0_t super; /**< base BTL component */ + mca_btl_base_component_3_0_0_t super; /**< base BTL component */ int ib_max_btls; /**< maximum number of devices available to openib component */ @@ -496,9 +496,15 @@ typedef struct mca_btl_openib_module_t mca_btl_openib_module_t; extern mca_btl_openib_module_t mca_btl_openib_module; +struct mca_btl_base_registration_handle_t { + uint32_t rkey; + uint32_t lkey; +}; + struct mca_btl_openib_reg_t { mca_mpool_base_registration_t base; struct ibv_mr *mr; + mca_btl_base_registration_handle_t btl_handle; }; typedef struct mca_btl_openib_reg_t mca_btl_openib_reg_t; @@ -611,32 +617,182 @@ extern int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl, mca_btl_base_descriptor_t** descriptor ); -/** - * PML->BTL Initiate a put of the specified size. - * - * @param btl (IN) BTL instance - * @param btl_peer (IN) BTL peer addressing - * @param descriptor (IN) Descriptor of data to be transmitted. - */ -extern int mca_btl_openib_put( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* btl_peer, - struct mca_btl_base_descriptor_t* descriptor - ); +/* forward decaration for internal put/get */ +struct mca_btl_openib_put_frag_t; +struct mca_btl_openib_get_frag_t; /** - * PML->BTL Initiate a get of the specified size. + * @brief Schedule a put fragment with the HCA (internal) * * @param btl (IN) BTL instance - * @param btl_base_peer (IN) BTL peer addressing - * @param descriptor (IN) Descriptor of data to be transmitted. + * @param ep (IN) BTL endpoint + * @param frag (IN) Fragment prepared by mca_btl_openib_put + * + * If the fragment can not be scheduled due to resource limitations then + * the fragment will be put on the pending put fragment list and retried + * when another get/put fragment has completed. 
*/ -extern int mca_btl_openib_get( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* btl_peer, - struct mca_btl_base_descriptor_t* descriptor - ); +int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, + struct mca_btl_openib_put_frag_t *frag); +/** + * @brief Schedule an RDMA write with the HCA + * + * @param btl (IN) BTL instance + * @param ep (IN) BTL endpoint + * @param local_address (IN) Source address + * @param remote_address (IN) Destination address + * @param local_handle (IN) Registration handle for region containing the region {local_address, size} + * @param remote_handle (IN) Registration handle for region containing the region {remote_address, size} + * @param size (IN) Number of bytes to write + * @param flags (IN) Transfer flags + * @param order (IN) Ordering + * @param cbfunc (IN) Function to call on completion + * @param cbcontext (IN) Context for completion callback + * @param cbdata (IN) Data for completion callback + * + * @return OPAL_ERR_BAD_PARAM if a bad parameter was passed + * @return OPAL_SUCCCESS if the operation was successfully scheduled + * + * This function will attempt to schedule a put operation with the HCA. 
+ */ +int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); + +/** + * @brief Schedule a get fragment with the HCA (internal) + * + * @param btl (IN) BTL instance + * @param ep (IN) BTL endpoint + * @param qp (IN) ID of queue pair to schedule the get on + * @param frag (IN) Fragment prepared by mca_btl_openib_get + * + * If the fragment can not be scheduled due to resource limitations then + * the fragment will be put on the pending get fragment list and retried + * when another get/put fragment has completed. + */ +int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, + struct mca_btl_openib_get_frag_t *frag); + +/** + * @brief Schedule an RDMA read with the HCA + * + * @param btl (IN) BTL instance + * @param ep (IN) BTL endpoint + * @param local_address (IN) Destination address + * @param remote_address (IN) Source address + * @param local_handle (IN) Registration handle for region containing the region {local_address, size} + * @param remote_handle (IN) Registration handle for region containing the region {remote_address, size} + * @param size (IN) Number of bytes to read + * @param flags (IN) Transfer flags + * @param order (IN) Ordering + * @param cbfunc (IN) Function to call on completion + * @param cbcontext (IN) Context for completion callback + * @param cbdata (IN) Data for completion callback + * + * @return OPAL_ERR_BAD_PARAM if a bad parameter was passed + * @return OPAL_SUCCCESS if the operation was successfully scheduled + * + * This function will attempt to schedule a get operation with the HCA. 
+ */ +int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); + +/** + * Initiate an asynchronous fetching atomic operation. + * Completion Semantics: if this function returns a 1 then the operation + * is complete. a return of OPAL_SUCCESS indicates + * the atomic operation has been queued with the + * network. + * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param local_address (OUT) Local address to store the result in + * @param remote_address (IN) Remote address perfom operation on to (registered remotely) + * @param local_handle (IN) Local registration handle for region containing + * (local_address, local_address + 8) + * @param remote_handle (IN) Remote registration handle for region containing + * (remote_address, remote_address + 8) + * @param op (IN) Operation to perform + * @param operand (IN) Operand for the operation + * @param flags (IN) Flags for this put operation + * @param order (IN) Ordering + * @param cbfunc (IN) Function to call on completion (if queued) + * @param cbcontext (IN) Context for the callback + * @param cbdata (IN) Data for callback + * + * @retval OPAL_SUCCESS The operation was successfully queued + * @retval 1 The operation is complete + * @retval OPAL_ERROR The operation was NOT successfully queued + * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic + * operation. Try again later + * @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to + * alignment restrictions or the operation {op} is not supported + * by the hardware. 
+ * + * After the operation is complete the remote address specified by {remote_address} and + * {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand. + * {local_address} will be updated with the previous value stored in {remote_address}. + * The btl will guarantee consistency of atomic operations performed via the btl. Note, + * however, that not all btls will provide consistency between btl atomic operations and + * cpu atomics. + */ +int mca_btl_openib_atomic_fop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op, + uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata); + +/** + * Initiate an asynchronous compare and swap operation. + * Completion Semantics: if this function returns a 1 then the operation + * is complete. a return of OPAL_SUCCESS indicates + * the atomic operation has been queued with the + * network. 
+ * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param local_address (OUT) Local address to store the result in + * @param remote_address (IN) Remote address perfom operation on to (registered remotely) + * @param local_handle (IN) Local registration handle for region containing + * (local_address, local_address + 8) + * @param remote_handle (IN) Remote registration handle for region containing + * (remote_address, remote_address + 8) + * @param compare (IN) Operand for the operation + * @param value (IN) Value to store on success + * @param flags (IN) Flags for this put operation + * @param order (IN) Ordering + * @param cbfunc (IN) Function to call on completion (if queued) + * @param cbcontext (IN) Context for the callback + * @param cbdata (IN) Data for callback + * + * @retval OPAL_SUCCESS The operation was successfully queued + * @retval 1 The operation is complete + * @retval OPAL_ERROR The operation was NOT successfully queued + * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic + * operation. Try again later + * @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to + * alignment restrictions or the operation {op} is not supported + * by the hardware. + * + * After the operation is complete the remote address specified by {remote_address} and + * {remote_handle} will be updated with {value} if *remote_address == compare. + * {local_address} will be updated with the previous value stored in {remote_address}. + * The btl will guarantee consistency of atomic operations performed via the btl. Note, + * however, that not all btls will provide consistency between btl atomic operations and + * cpu atomics. 
+ */ +int mca_btl_openib_atomic_cswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, + uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata); /** * Allocate a descriptor. @@ -673,7 +829,6 @@ extern int mca_btl_openib_free( mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* peer, - mca_mpool_base_registration_t* registration, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, @@ -681,22 +836,6 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( uint32_t flags ); -/** - * Allocate a descriptor initialized for RDMA write. - * - * @param btl (IN) BTL module - * @param peer (IN) BTL peer addressing - */ -extern mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* peer, - mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags); - extern void mca_btl_openib_frag_progress_pending_put_get( struct mca_btl_base_endpoint_t*, const int); diff --git a/opal/mca/btl/openib/btl_openib_atomic.c b/opal/mca/btl/openib/btl_openib_atomic.c new file mode 100644 index 0000000000..3200a9097f --- /dev/null +++ b/opal/mca/btl/openib/btl_openib_atomic.c @@ -0,0 +1,135 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "btl_openib.h" +#include "btl_openib_endpoint.h" +#include "btl_openib_xrc.h" + +#if HAVE_DECL_IBV_ATOMIC_HCA + +static int mca_btl_openib_atomic_internal (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, enum ibv_wr_opcode opcode, + int64_t operand, int64_t operand2, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata) +{ + mca_btl_openib_get_frag_t* frag = NULL; + int qp = order; + int rc; + + frag = to_get_frag(alloc_recv_user_frag()); + if (OPAL_UNLIKELY(NULL == frag)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + if (MCA_BTL_NO_ORDER == qp) { + qp = mca_btl_openib_component.rdma_qp; + } + + /* set base descriptor flags */ + to_base_frag(frag)->base.order = qp; + /* free this descriptor when the operation is complete */ + to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; + + /* set up scatter-gather entry */ + to_com_frag(frag)->sg_entry.length = 8; + to_com_frag(frag)->sg_entry.lkey = local_handle->lkey; + to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t) local_address; + to_com_frag(frag)->endpoint = endpoint; + + /* set up rdma callback */ + frag->cb.func = cbfunc; + frag->cb.context = cbcontext; + frag->cb.data = cbdata; + frag->cb.local_handle = local_handle; + + /* set up descriptor; operand2 carries the 64-bit swap value for compare-and-swap */ + frag->sr_desc.wr.atomic.remote_addr = remote_address; + frag->sr_desc.opcode = opcode; + frag->sr_desc.wr.atomic.compare_add = operand; + frag->sr_desc.wr.atomic.swap = operand2; + +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + if((endpoint->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN) + != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) { + frag->sr_desc.wr.atomic.rkey = opal_swap_bytes4 (remote_handle->rkey); + }
else +#endif + { + frag->sr_desc.wr.atomic.rkey = remote_handle->rkey; + } + +#if HAVE_XRC + if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) { + frag->sr_desc.xrc_remote_srq_num=endpoint->rem_info.rem_srqs[qp].rem_srq_num; + } +#endif + + if (endpoint->endpoint_state != MCA_BTL_IB_CONNECTED) { + OPAL_THREAD_LOCK(&endpoint->endpoint_lock); + rc = check_endpoint_state(endpoint, &to_base_frag(frag)->base, &endpoint->pending_get_frags); + OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); + if (OPAL_ERR_RESOURCE_BUSY == rc) { + return OPAL_SUCCESS; + } + + if (OPAL_SUCCESS != rc) { + MCA_BTL_IB_FRAG_RETURN (frag); + return rc; + } + } + + rc = mca_btl_openib_get_internal (btl, endpoint, frag); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) { + rc = OPAL_SUCCESS; + + OPAL_THREAD_SCOPED_LOCK(&endpoint->endpoint_lock, + opal_list_append(&endpoint->pending_get_frags, (opal_list_item_t*)frag)); + } else { + MCA_BTL_IB_FRAG_RETURN (frag); + } + } + + return rc; +} + +int mca_btl_openib_atomic_fop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op, + uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata) +{ + + if (OPAL_UNLIKELY(MCA_BTL_ATOMIC_ADD != op)) { + return OPAL_ERR_NOT_SUPPORTED; + } + + return mca_btl_openib_atomic_internal (btl, endpoint, local_address, remote_address, local_handle, + remote_handle, IBV_WR_ATOMIC_FETCH_AND_ADD, operand, 0, + flags, order, cbfunc, cbcontext, cbdata); +} + +int mca_btl_openib_atomic_cswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct
mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, + uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata) +{ + return mca_btl_openib_atomic_internal (btl, endpoint, local_address, remote_address, local_handle, + remote_handle, IBV_WR_ATOMIC_CMP_AND_SWP, compare, value, + flags, order, cbfunc, cbcontext, cbdata); +} + +#endif diff --git a/opal/mca/btl/openib/btl_openib_component.c b/opal/mca/btl/openib/btl_openib_component.c index 987fb66632..5a3d75b594 100644 --- a/opal/mca/btl/openib/btl_openib_component.c +++ b/opal/mca/btl/openib/btl_openib_component.c @@ -468,7 +468,7 @@ static void btl_openib_control(mca_btl_base_module_t* btl, mca_btl_openib_header_coalesced_t *clsc_hdr = (mca_btl_openib_header_coalesced_t*)(ctl_hdr + 1); mca_btl_active_message_callback_t* reg; - size_t len = des->des_local->seg_len - sizeof(*ctl_hdr); + size_t len = des->des_segments->seg_len - sizeof(*ctl_hdr); switch (ctl_hdr->type) { case MCA_BTL_OPENIB_CONTROL_CREDITS: @@ -519,8 +519,8 @@ static void btl_openib_control(mca_btl_base_module_t* btl, skip = (sizeof(*clsc_hdr) + clsc_hdr->alloc_size - pad); - tmp_des.des_local = &tmp_seg; - tmp_des.des_local_count = 1; + tmp_des.des_segments = &tmp_seg; + tmp_des.des_segment_count = 1; tmp_seg.seg_addr.pval = clsc_hdr + 1; tmp_seg.seg_len = clsc_hdr->size; @@ -580,6 +580,10 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size, enum ibv_access_flags access_flag = (enum ibv_access_flags) (IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ); +#if HAVE_DECL_IBV_ATOMIC_HCA + access_flag |= IBV_ACCESS_REMOTE_ATOMIC; +#endif + if (device->mem_reg_max && device->mem_reg_max < (device->mem_reg_active + size)) { return OPAL_ERR_OUT_OF_RESOURCE; @@ -602,6 +606,9 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size, return OPAL_ERR_OUT_OF_RESOURCE; } + openib_reg->btl_handle.lkey = openib_reg->mr->lkey; + 
openib_reg->btl_handle.rkey = openib_reg->mr->rkey; + OPAL_OUTPUT_VERBOSE((30, mca_btl_openib_component.memory_registration_verbose, "openib_reg_mr: base=%p, bound=%p, size=%d, flags=0x%x", reg->base, reg->bound, (int) (reg->bound - reg->base + 1), reg->flags)); @@ -799,7 +806,30 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device, mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbfunc = btl_openib_control; mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbdata = NULL; - openib_btl->super.btl_seg_size = sizeof (mca_btl_openib_segment_t); + if (openib_btl->super.btl_get_limit > openib_btl->ib_port_attr.max_msg_sz) { + openib_btl->super.btl_get_limit = openib_btl->ib_port_attr.max_msg_sz; + } + + openib_btl->super.btl_get_alignment = 0; + + if (openib_btl->super.btl_put_limit > openib_btl->ib_port_attr.max_msg_sz) { + openib_btl->super.btl_put_limit = openib_btl->ib_port_attr.max_msg_sz; + } + +#if HAVE_DECL_IBV_ATOMIC_HCA + if (openib_btl->device->ib_dev_attr.atomic_cap == IBV_ATOMIC_NONE) { + openib_btl->super.btl_flags &= ~MCA_BTL_FLAGS_ATOMIC_FOPS; + openib_btl->super.btl_atomic_flags = 0; + openib_btl->super.btl_atomic_fop = NULL; + openib_btl->super.btl_atomic_cswap = NULL; + } else if (IBV_ATOMIC_GLOB == openib_btl->device->ib_dev_attr.atomic_cap) { + openib_btl->super.btl_flags |= MCA_BTL_ATOMIC_SUPPORTS_GLOB; + } +#endif + + openib_btl->super.btl_put_alignment = 0; + + openib_btl->super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t); /* Check bandwidth configured for this device */ sprintf(param, "bandwidth_%s", ibv_get_device_name(device->ib_dev)); @@ -2976,17 +3006,20 @@ void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep, size_t i, len = opal_list_get_size(&ep->pending_get_frags); int rc; - for(i = 0; i < len && ep->qps[qp].qp->sd_wqe > 0 && ep->get_tokens > 0; i++) - { + for(i = 0; i < len && ep->qps[qp].qp->sd_wqe > 0 && ep->get_tokens > 0; i++) { 
OPAL_THREAD_LOCK(&ep->endpoint_lock); frag = opal_list_remove_first(&(ep->pending_get_frags)); OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if(NULL == frag) + if (NULL == frag) break; - rc = mca_btl_openib_get((mca_btl_base_module_t *)openib_btl, ep, - &to_base_frag(frag)->base); - if(OPAL_ERR_OUT_OF_RESOURCE == rc) + rc = mca_btl_openib_get_internal ((mca_btl_base_module_t *)openib_btl, ep, + to_get_frag(frag)); + if (OPAL_ERR_OUT_OF_RESOURCE == rc) { + OPAL_THREAD_LOCK(&ep->endpoint_lock); + opal_list_prepend (&ep->pending_get_frags, frag); + OPAL_THREAD_UNLOCK(&ep->endpoint_lock); break; + } } len = opal_list_get_size(&ep->pending_put_frags); @@ -2994,12 +3027,16 @@ void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep, OPAL_THREAD_LOCK(&ep->endpoint_lock); frag = opal_list_remove_first(&(ep->pending_put_frags)); OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if(NULL == frag) + if (NULL == frag) break; - rc = mca_btl_openib_put((mca_btl_base_module_t*)openib_btl, ep, - &to_base_frag(frag)->base); - if(OPAL_ERR_OUT_OF_RESOURCE == rc) + rc = mca_btl_openib_put_internal ((mca_btl_base_module_t*)openib_btl, ep, + to_put_frag(frag)); + if (OPAL_ERR_OUT_OF_RESOURCE == rc) { + OPAL_THREAD_LOCK(&ep->endpoint_lock); + opal_list_prepend (&ep->pending_put_frags, frag); + OPAL_THREAD_UNLOCK(&ep->endpoint_lock); break; + } } } @@ -3020,7 +3057,7 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl, /* advance the segment address past the header and subtract from the * length.*/ - des->des_local->seg_len = byte_len - sizeof(mca_btl_openib_header_t); + des->des_segments->seg_len = byte_len - sizeof(mca_btl_openib_header_t); if(OPAL_LIKELY(!(is_credit_msg = is_credit_message(frag)))) { /* call registered callback */ @@ -3055,7 +3092,7 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl, } } else { mca_btl_openib_rdma_credits_header_t *chdr = - (mca_btl_openib_rdma_credits_header_t *) des->des_local->seg_addr.pval; 
+ (mca_btl_openib_rdma_credits_header_t *) des->des_segments->seg_addr.pval; if(ep->nbo) { BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*chdr); } @@ -3361,11 +3398,27 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq, /* Handle work completions */ switch(wc->opcode) { case IBV_WC_RDMA_READ: - OPAL_OUTPUT((-1, "Got WC: RDMA_READ")); - OPAL_THREAD_ADD32(&endpoint->get_tokens, 1); - /* fall through */ + case IBV_WC_COMP_SWAP: + case IBV_WC_FETCH_ADD: + OPAL_OUTPUT((-1, "Got WC: RDMA_READ or RDMA_WRITE")); + OPAL_THREAD_ADD32(&endpoint->get_tokens, 1); + + mca_btl_openib_get_frag_t *get_frag = to_get_frag(des); + + get_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr, + get_frag->cb.local_handle, get_frag->cb.context, get_frag->cb.data, + OPAL_SUCCESS); case IBV_WC_RDMA_WRITE: + if (MCA_BTL_OPENIB_FRAG_SEND_USER == openib_frag_type(des)) { + mca_btl_openib_put_frag_t *put_frag = to_put_frag(des); + + put_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr, + put_frag->cb.local_handle, put_frag->cb.context, put_frag->cb.data, + OPAL_SUCCESS); + put_frag->cb.func = NULL; + } + /* fall through */ case IBV_WC_SEND: OPAL_OUTPUT((-1, "Got WC: RDMA_WRITE or SEND")); if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) { @@ -3394,7 +3447,7 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq, /* Process a completed send/put/get */ btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { - des->des_cbfunc(&openib_btl->super, endpoint, des,OPAL_SUCCESS); + des->des_cbfunc(&openib_btl->super, endpoint, des, OPAL_SUCCESS); } if( btl_ownership ) { mca_btl_openib_free(&openib_btl->super, des); diff --git a/opal/mca/btl/openib/btl_openib_endpoint.c b/opal/mca/btl/openib/btl_openib_endpoint.c index c26aed6395..82a648e08d 100644 --- a/opal/mca/btl/openib/btl_openib_endpoint.c +++ 
b/opal/mca/btl/openib/btl_openib_endpoint.c @@ -89,7 +89,7 @@ int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint, if(acquire_wqe(endpoint, frag) != OPAL_SUCCESS) return OPAL_ERR_RESOURCE_BUSY; - size = des->des_local->seg_len + frag->coalesced_length; + size = des->des_segments->seg_len + frag->coalesced_length; rc = mca_btl_openib_endpoint_credit_acquire (endpoint, qp, prio, size, &do_rdma, frag, true); diff --git a/opal/mca/btl/openib/btl_openib_endpoint.h b/opal/mca/btl/openib/btl_openib_endpoint.h index ec751c6bec..41096a6c62 100644 --- a/opal/mca/btl/openib/btl_openib_endpoint.h +++ b/opal/mca/btl/openib/btl_openib_endpoint.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -10,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved. diff --git a/opal/mca/btl/openib/btl_openib_failover.c b/opal/mca/btl/openib/btl_openib_failover.c index 255c617116..d5f9c15874 100644 --- a/opal/mca/btl/openib/btl_openib_failover.c +++ b/opal/mca/btl/openib/btl_openib_failover.c @@ -153,8 +153,8 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl, if (NULL != btlname) free(btlname); /* Since we believe we have done a send, read or write, then the - * des_local fields should have valid data. */ - assert(des->des_local != NULL); + * des_segments fields should have valid data. 
*/ + assert(des->des_segments != NULL); /* If the endpoint is not yet in the MCA_BTL_IB_CLOSED state, then * change the status. Since this connection was mapped out in the diff --git a/opal/mca/btl/openib/btl_openib_frag.c b/opal/mca/btl/openib/btl_openib_frag.c index 2c5fcbdaf4..6768fcd76c 100644 --- a/opal/mca/btl/openib/btl_openib_frag.c +++ b/opal/mca/btl/openib/btl_openib_frag.c @@ -68,8 +68,8 @@ static void out_constructor(mca_btl_openib_out_frag_t *frag) { mca_btl_openib_frag_t *base_frag = to_base_frag(frag); - base_frag->base.des_local = &base_frag->segment.base; - base_frag->base.des_local_count = 1; + base_frag->base.des_segments = &base_frag->segment.base; + base_frag->base.des_segment_count = 1; frag->sr_desc.wr_id = (uint64_t)(uintptr_t)frag; frag->sr_desc.sg_list = &to_com_frag(frag)->sg_entry; @@ -83,8 +83,8 @@ static void in_constructor(mca_btl_openib_in_frag_t *frag) { mca_btl_openib_frag_t *base_frag = to_base_frag(frag); - base_frag->base.des_local = &base_frag->segment.base; - base_frag->base.des_local_count = 1; + base_frag->base.des_segments = &base_frag->segment.base; + base_frag->base.des_segment_count = 1; } static void send_constructor(mca_btl_openib_send_frag_t *frag) @@ -134,6 +134,7 @@ static void put_constructor(mca_btl_openib_put_frag_t *frag) { to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_SEND_USER; to_out_frag(frag)->sr_desc.opcode = IBV_WR_RDMA_WRITE; + frag->cb.func = NULL; } static void get_constructor(mca_btl_openib_get_frag_t *frag) @@ -154,8 +155,8 @@ static void coalesced_constructor(mca_btl_openib_coalesced_frag_t *frag) base_frag->type = MCA_BTL_OPENIB_FRAG_COALESCED; - base_frag->base.des_local = &base_frag->segment.base; - base_frag->base.des_local_count = 1; + base_frag->base.des_segments = &base_frag->segment.base; + base_frag->base.des_segment_count = 1; } OBJ_CLASS_INSTANCE( diff --git a/opal/mca/btl/openib/btl_openib_frag.h b/opal/mca/btl/openib/btl_openib_frag.h index 3369e732bf..0a5ad6bc4b 100644 --- 
a/opal/mca/btl/openib/btl_openib_frag.h +++ b/opal/mca/btl/openib/btl_openib_frag.h @@ -349,7 +349,15 @@ OBJ_CLASS_DECLARATION(mca_btl_openib_recv_frag_t); #define to_recv_frag(f) ((mca_btl_openib_recv_frag_t*)(f)) -typedef struct mca_btl_openib_out_frag_t mca_btl_openib_put_frag_t; +typedef struct mca_btl_openib_put_frag_t { + mca_btl_openib_out_frag_t super; + struct { + mca_btl_base_rdma_completion_fn_t func; + mca_btl_base_registration_handle_t *local_handle; + void *context; + void *data; + } cb; +} mca_btl_openib_put_frag_t; OBJ_CLASS_DECLARATION(mca_btl_openib_put_frag_t); #define to_put_frag(f) ((mca_btl_openib_put_frag_t*)(f)) @@ -357,6 +365,12 @@ OBJ_CLASS_DECLARATION(mca_btl_openib_put_frag_t); typedef struct mca_btl_openib_get_frag_t { mca_btl_openib_in_frag_t super; struct ibv_send_wr sr_desc; + struct { + mca_btl_base_rdma_completion_fn_t func; + mca_btl_base_registration_handle_t *local_handle; + void *context; + void *data; + } cb; } mca_btl_openib_get_frag_t; OBJ_CLASS_DECLARATION(mca_btl_openib_get_frag_t); diff --git a/opal/mca/btl/openib/btl_openib_get.c b/opal/mca/btl/openib/btl_openib_get.c new file mode 100644 index 0000000000..61904eb1b8 --- /dev/null +++ b/opal/mca/btl/openib/btl_openib_get.c @@ -0,0 +1,163 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2013 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. 
+ * Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2006-2007 Voltaire All rights reserved. + * Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2009 IBM Corporation. All rights reserved. + * Copyright (c) 2013-2014 Intel, Inc. All rights reserved + * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2014 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "btl_openib.h" +#include "btl_openib_frag.h" +#include "btl_openib_endpoint.h" +#include "btl_openib_xrc.h" + +/* + * RDMA READ remote buffer to local buffer address. + */ + +int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +{ + mca_btl_openib_get_frag_t* frag = NULL; + int qp = order; + int rc; + + if (OPAL_UNLIKELY(size > btl->btl_get_limit)) { + return OPAL_ERR_BAD_PARAM; + } + + frag = to_get_frag(alloc_recv_user_frag()); + if (OPAL_UNLIKELY(NULL == frag)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + if (MCA_BTL_NO_ORDER == qp) { + qp = mca_btl_openib_component.rdma_qp; + } + + /* set base descriptor flags */ + to_base_frag(frag)->base.order = qp; + /* free this descriptor when the operation is complete */ + to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; + + /* set up scatter-gather entry */ + to_com_frag(frag)->sg_entry.length = size; + to_com_frag(frag)->sg_entry.lkey = local_handle->lkey; + to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t) local_address; + to_com_frag(frag)->endpoint = ep; + + /* set up rdma callback */ + 
frag->cb.func = cbfunc; + frag->cb.context = cbcontext; + frag->cb.data = cbdata; + frag->cb.local_handle = local_handle; + + /* set up descriptor */ + frag->sr_desc.wr.rdma.remote_addr = remote_address; + /* the opcode may have been changed by an atomic operation */ + frag->sr_desc.opcode = IBV_WR_RDMA_READ; + +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + if((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN) + != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) { + frag->sr_desc.wr.rdma.rkey = opal_swap_bytes4 (remote_handle->rkey); + } else +#endif + { + frag->sr_desc.wr.rdma.rkey = remote_handle->rkey; + } + +#if HAVE_XRC + if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) { +#if OPAL_HAVE_CONNECTX_XRC_DOMAINS + frag->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num; +#else + frag->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num; +#endif + } +#endif + + if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) { + OPAL_THREAD_LOCK(&ep->endpoint_lock); + rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_get_frags); + OPAL_THREAD_UNLOCK(&ep->endpoint_lock); + if (OPAL_ERR_RESOURCE_BUSY == rc) { + return OPAL_SUCCESS; + } + + if (OPAL_SUCCESS != rc) { + MCA_BTL_IB_FRAG_RETURN (frag); + return rc; + } + } + + rc = mca_btl_openib_get_internal (btl, ep, frag); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) { + rc = OPAL_SUCCESS; + + OPAL_THREAD_LOCK(&ep->endpoint_lock); + opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&ep->endpoint_lock); + } else { + MCA_BTL_IB_FRAG_RETURN (frag); + } + } + + return rc; +} + +int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, + mca_btl_openib_get_frag_t *frag) +{ + int qp = to_base_frag(frag)->base.order; + struct ibv_send_wr *bad_wr; + + /* check for a send wqe */ + if (qp_get_wqe(ep, qp) < 0) { + qp_put_wqe(ep, qp); + 
return OPAL_ERR_OUT_OF_RESOURCE; + } + + /* check for a get token */ + if (OPAL_THREAD_ADD32(&ep->get_tokens,-1) < 0) { + qp_put_wqe(ep, qp); + OPAL_THREAD_ADD32(&ep->get_tokens,1); + return OPAL_ERR_OUT_OF_RESOURCE; + } + + qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag)); + qp_reset_signal_count(ep, qp); + + if (ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr)) { + qp_put_wqe(ep, qp); + OPAL_THREAD_ADD32(&ep->get_tokens,1); + return OPAL_ERROR; + } + + return OPAL_SUCCESS; +} diff --git a/opal/mca/btl/openib/btl_openib_mca.c b/opal/mca/btl/openib/btl_openib_mca.c index 94f267c9b3..207769fe3f 100644 --- a/opal/mca/btl/openib/btl_openib_mca.c +++ b/opal/mca/btl/openib/btl_openib_mca.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -11,7 +12,7 @@ * All rights reserved. * Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. 
@@ -567,10 +568,16 @@ int btl_openib_register_mca_params(void) mca_btl_openib_module.super.btl_rdma_pipeline_frag_size = 1024 * 1024; mca_btl_openib_module.super.btl_min_rdma_pipeline_size = 256 * 1024; mca_btl_openib_module.super.btl_flags = MCA_BTL_FLAGS_RDMA | - MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA; + MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA; #if BTL_OPENIB_FAILOVER_ENABLED mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_FAILOVER_SUPPORT; #endif + +#if HAVE_DECL_IBV_ATOMIC_HCA + mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS; + mca_btl_openib_module.super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_CSWAP; +#endif + /* Default to bandwidth auto-detection */ mca_btl_openib_module.super.btl_bandwidth = 0; mca_btl_openib_module.super.btl_latency = 4; diff --git a/opal/mca/btl/openib/btl_openib_put.c b/opal/mca/btl/openib/btl_openib_put.c new file mode 100644 index 0000000000..cc2cadcf73 --- /dev/null +++ b/opal/mca/btl/openib/btl_openib_put.c @@ -0,0 +1,160 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2013 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. + * Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2006-2007 Voltaire All rights reserved. 
+ * Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2009 IBM Corporation. All rights reserved. + * Copyright (c) 2013-2014 Intel, Inc. All rights reserved + * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2014 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "btl_openib.h" +#include "btl_openib_frag.h" +#include "btl_openib_endpoint.h" +#include "btl_openib_xrc.h" + +/* + * RDMA WRITE local buffer to remote buffer address. + */ + +int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +{ + mca_btl_openib_put_frag_t *frag = NULL; + int rc, qp = order; + + if (OPAL_UNLIKELY(size > btl->btl_put_limit)) { + return OPAL_ERR_BAD_PARAM; + } + + frag = to_put_frag(alloc_send_user_frag ()); + if (OPAL_UNLIKELY(NULL == frag)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + if (MCA_BTL_NO_ORDER == qp) { + qp = mca_btl_openib_component.rdma_qp; + } + + /* set base descriptor flags */ + to_base_frag(frag)->base.order = qp; + /* free this descriptor when the operation is complete */ + to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; + + /* set up scatter-gather entry */ + to_com_frag(frag)->sg_entry.length = size; + to_com_frag(frag)->sg_entry.lkey = local_handle->lkey; + to_com_frag(frag)->sg_entry.addr = (uint64_t)(intptr_t) local_address; + to_com_frag(frag)->endpoint = ep; + + /* set up rdma callback */ + frag->cb.func = cbfunc; + frag->cb.context = cbcontext; + frag->cb.data = cbdata; + frag->cb.local_handle = local_handle; + + /* post descriptor */ + 
to_out_frag(frag)->sr_desc.opcode = IBV_WR_RDMA_WRITE; + to_out_frag(frag)->sr_desc.send_flags = ib_send_flags(size, &(ep->qps[qp]), 1); + to_out_frag(frag)->sr_desc.wr.rdma.remote_addr = remote_address; + + qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag)); + qp_reset_signal_count(ep, qp); + +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + if ((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN) + != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) { + to_out_frag(frag)->sr_desc.wr.rdma.rkey = opal_swap_bytes4(remote_handle->rkey); + } else +#endif + { + to_out_frag(frag)->sr_desc.wr.rdma.rkey = remote_handle->rkey; + } + +#if HAVE_XRC + if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) { +#if OPAL_HAVE_CONNECTX_XRC_DOMAINS + to_out_frag(frag)->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num; +#else + to_out_frag(frag)->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num; +#endif + } +#endif + + if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) { + OPAL_THREAD_LOCK(&ep->endpoint_lock); + rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_put_frags); + OPAL_THREAD_UNLOCK(&ep->endpoint_lock); + if (OPAL_ERR_RESOURCE_BUSY == rc) { + /* descriptor was queued pending connection */ + return OPAL_SUCCESS; + } + + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + MCA_BTL_IB_FRAG_RETURN (frag); + return rc; + } + } + + rc = mca_btl_openib_put_internal (btl, ep, frag); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) { + rc = OPAL_SUCCESS; + + /* queue the fragment for when resources are available */ + OPAL_THREAD_LOCK(&ep->endpoint_lock); + opal_list_append(&ep->pending_put_frags, (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&ep->endpoint_lock); + } else { + MCA_BTL_IB_FRAG_RETURN (frag); + } + } + + return rc; +} + +int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, + mca_btl_openib_put_frag_t *frag) +{ + int 
qp = to_base_frag(frag)->base.order; + struct ibv_send_wr *bad_wr; + int rc; + + /* check for a send wqe */ + if (qp_get_wqe(ep, qp) < 0) { + qp_put_wqe(ep, qp); + return OPAL_ERR_OUT_OF_RESOURCE; + } + + qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag)); + qp_reset_signal_count(ep, qp); + + if (0 != (rc = ibv_post_send(ep->qps[qp].qp->lcl_qp, &to_out_frag(frag)->sr_desc, &bad_wr))) { + qp_put_wqe(ep, qp); + return OPAL_ERROR;; + } + + return OPAL_SUCCESS; +} diff --git a/opal/mca/btl/openib/configure.m4 b/opal/mca/btl/openib/configure.m4 index 5ca6a1af9c..bbcca40757 100644 --- a/opal/mca/btl/openib/configure.m4 +++ b/opal/mca/btl/openib/configure.m4 @@ -119,7 +119,6 @@ AC_DEFUN([MCA_opal_btl_openib_CONFIG],[ [enable openib BTL failover]) AM_CONDITIONAL([MCA_btl_openib_enable_failover], [test "x$btl_openib_failover_enabled" = "x1"]) - # Check for __malloc_hook availability AC_ARG_ENABLE(btl-openib-malloc-alignment, AC_HELP_STRING([--enable-btl-openib-malloc-alignment], [Enable support for allocated memory alignment. 
Default: enabled if supported, disabled otherwise.])) diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c index 5e9d3874bf..e0b8e80639 100644 --- a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c +++ b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c @@ -322,20 +322,21 @@ static int udcm_send_request (mca_btl_base_endpoint_t *lcl_ep, static void udcm_send_timeout (evutil_socket_t fd, short event, void *arg); static int udcm_finish_connection (mca_btl_openib_endpoint_t *lcl_ep); +static int udcm_rc_qps_to_rts(mca_btl_openib_endpoint_t *lcl_ep); /* XRC support */ #if HAVE_XRC static int udcm_xrc_start_connect (opal_btl_openib_connect_base_module_t *cpc, mca_btl_base_endpoint_t *lcl_ep); static int udcm_xrc_restart_connect (mca_btl_base_endpoint_t *lcl_ep); -static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr); +static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn); static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep); #if OPAL_HAVE_CONNECTX_XRC_DOMAINS static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t qp_num); #else static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep); #endif -static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr); +static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn); static int udcm_xrc_send_request (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_endpoint_t *rem_ep, uint8_t msg_type); static int udcm_xrc_send_xresponse (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_endpoint_t *rem_ep, @@ -512,6 +513,96 @@ static int udcm_component_finalize(void) /* mark: udcm module */ +#if HAVE_XRC +static int udcm_endpoint_init_self_xrc (struct mca_btl_base_endpoint_t *lcl_ep) +{ + udcm_endpoint_t *udep = 
UDCM_ENDPOINT_DATA(lcl_ep); + int rc; + + opal_mutex_lock (&udep->udep_lock); + do { +#if OPAL_HAVE_CONNECTX_XRC_DOMAINS + rc = udcm_xrc_recv_qp_connect (lcl_ep, lcl_ep->qps[0].qp->lcl_qp->qp_num); +#else + lcl_ep->xrc_recv_qp_num = lcl_ep->qps[0].qp->lcl_qp->qp_num; + rc = udcm_xrc_recv_qp_connect (lcl_ep); +#endif + if (OPAL_SUCCESS != rc) { + BTL_VERBOSE(("error connecting loopback XRC receive queue pair")); + break; + } + + rc = mca_btl_openib_endpoint_post_recvs (lcl_ep); + if (OPAL_SUCCESS != rc) { + BTL_VERBOSE(("error posting receives for loopback queue pair")); + break; + } + + rc = udcm_xrc_recv_qp_create (lcl_ep, lcl_ep->qps[0].qp->lcl_qp->qp_num, + lcl_ep->qps[0].qp->lcl_psn); + if (OPAL_SUCCESS != rc) { + BTL_VERBOSE(("error creating loopback XRC receive queue pair")); + break; + } + + rc = udcm_xrc_send_qp_connect (lcl_ep, lcl_ep->qps[0].qp->lcl_qp->qp_num, + lcl_ep->qps[0].qp->lcl_psn); + if (OPAL_SUCCESS != rc) { + BTL_VERBOSE(("error creating loopback XRC send queue pair")); + break; + } + + lcl_ep->endpoint_state = MCA_BTL_IB_CONNECTED; + + rc = udcm_finish_connection (lcl_ep); + } while (0); + opal_mutex_unlock (&udep->udep_lock); + + return rc; +} +#endif + +static int udcm_endpoint_init_self (struct mca_btl_base_endpoint_t *lcl_ep) +{ + udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep); + int rc; + + opal_mutex_lock (&udep->udep_lock); + do { + if (OPAL_SUCCESS != (rc = udcm_endpoint_init_data (lcl_ep))) { + BTL_VERBOSE(("error initializing loopback endpoint cpc data")); + break; + } + + if (OPAL_SUCCESS != (rc = udcm_rc_qp_create_all (lcl_ep))) { + BTL_VERBOSE(("error initializing loopback endpoint qps")); + break; + } + + /* save queue pair info */ + lcl_ep->rem_info.rem_index = lcl_ep->index; + + for (int i = 0 ; i < mca_btl_openib_component.num_qps ; ++i) { + lcl_ep->rem_info.rem_qps[i].rem_psn = lcl_ep->qps[i].qp->lcl_psn; + lcl_ep->rem_info.rem_qps[i].rem_qp_num = lcl_ep->qps[i].qp->lcl_qp->qp_num; + } + + if (OPAL_SUCCESS != (rc =
udcm_rc_qps_to_rts (lcl_ep))) { + BTL_VERBOSE(("error moving loopback endpoint qps to RTS")); + break; + } + + lcl_ep->endpoint_state = MCA_BTL_IB_CONNECTED; + + rc = udcm_finish_connection (lcl_ep); + + /* no early return here: fall through so udep_lock is released; rc carries the udcm_finish_connection result */ + } while (0); + opal_mutex_unlock (&udep->udep_lock); + + return rc; +} + static int udcm_endpoint_init (struct mca_btl_base_endpoint_t *lcl_ep) { udcm_endpoint_t *udep = lcl_ep->endpoint_local_cpc_data = @@ -523,6 +614,16 @@ static int udcm_endpoint_init (struct mca_btl_base_endpoint_t *lcl_ep) OBJ_CONSTRUCT(&udep->udep_lock, opal_mutex_t); + if (lcl_ep->endpoint_proc->proc_opal == opal_proc_local_get ()) { + /* go ahead and try to create a loopback queue pair */ +#if HAVE_XRC + if (mca_btl_openib_component.num_xrc_qps > 0) { + return udcm_endpoint_init_self_xrc (lcl_ep); + } else +#endif + return udcm_endpoint_init_self (lcl_ep); + } + return OPAL_SUCCESS; } @@ -1072,6 +1173,9 @@ static inline int udcm_rc_qp_to_init (struct ibv_qp *qp, attr.pkey_index = btl->pkey_index; attr.port_num = btl->port_num; attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; +#if HAVE_DECL_IBV_ATOMIC_HCA + attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC; +#endif attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS; @@ -2313,7 +2417,7 @@ static int udcm_xrc_restart_connect (mca_btl_base_endpoint_t *lcl_ep) /* mark: xrc send qp */ /* Send qp connect */ -static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr) +static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn) { mca_btl_openib_module_t *openib_btl = lcl_ep->endpoint_btl; struct ibv_qp_attr attr; @@ -2322,7 +2426,7 @@ static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg int ret; BTL_VERBOSE(("Connecting send qp: %p, remote qp: %d", (void *)lcl_ep->qps[0].qp->lcl_qp, - msg_hdr->data.xres.rem_qp_num)); + rem_qp_num)); assert(NULL !=
lcl_ep->qps); qp = lcl_ep->qps[0].qp->lcl_qp; psn = lcl_ep->qps[0].qp->lcl_psn; @@ -2332,8 +2436,8 @@ static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg attr.qp_state = IBV_QPS_RTR; attr.path_mtu = (openib_btl->device->mtu < lcl_ep->rem_info.rem_mtu) ? openib_btl->device->mtu : lcl_ep->rem_info.rem_mtu; - attr.dest_qp_num = msg_hdr->data.xres.rem_qp_num; - attr.rq_psn = msg_hdr->data.xres.rem_psn; + attr.dest_qp_num = rem_qp_num; + attr.rq_psn = rem_psn; attr.max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops; attr.min_rnr_timer = mca_btl_openib_component.ib_min_rnr_timer; attr.ah_attr.is_global = 0; @@ -2481,6 +2585,9 @@ static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep) attr.pkey_index = openib_btl->pkey_index; attr.port_num = openib_btl->port_num; attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; +#if HAVE_DECL_IBV_ATOMIC_HCA + attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC; +#endif ret = ibv_modify_qp(*qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | @@ -2546,7 +2653,7 @@ static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep) } /* Recv qp create */ -static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr) +static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn) { mca_btl_openib_module_t* openib_btl = lcl_ep->endpoint_btl; #if OPAL_HAVE_CONNECTX_XRC_DOMAINS @@ -2588,6 +2695,11 @@ static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_ attr.pkey_index = openib_btl->pkey_index; attr.port_num = openib_btl->port_num; attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; + +#if HAVE_DECL_IBV_ATOMIC_HCA + attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC; +#endif + #if OPAL_HAVE_CONNECTX_XRC_DOMAINS ret = ibv_modify_qp(lcl_ep->xrc_recv_qp, &attr, @@ -2617,8 +2729,8 @@ static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t 
*lcl_ep, udcm_msg_ attr.qp_state = IBV_QPS_RTR; attr.path_mtu = (openib_btl->device->mtu < lcl_ep->rem_info.rem_mtu) ? openib_btl->device->mtu : lcl_ep->rem_info.rem_mtu; - attr.dest_qp_num = msg_hdr->data.xreq.rem_qp_num; - attr.rq_psn = msg_hdr->data.xreq.rem_psn; + attr.dest_qp_num = rem_qp_num; + attr.rq_psn = rem_psn; attr.max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops; attr.min_rnr_timer = mca_btl_openib_component.ib_min_rnr_timer; attr.ah_attr.is_global = 0; @@ -2834,7 +2946,7 @@ static int udcm_xrc_handle_xconnect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg response_type = UDCM_MESSAGE_XRESPONSE; - rc = udcm_xrc_recv_qp_create (lcl_ep, msg_hdr); + rc = udcm_xrc_recv_qp_create (lcl_ep, msg_hdr->data.xreq.rem_qp_num, msg_hdr->data.xreq.rem_psn); if (OPAL_SUCCESS != rc) { break; } @@ -2880,7 +2992,7 @@ static int udcm_xrc_handle_xresponse (mca_btl_openib_endpoint_t *lcl_ep, udcm_ms udep->recv_resp = true; - rc = udcm_xrc_send_qp_connect (lcl_ep, msg_hdr); + rc = udcm_xrc_send_qp_connect (lcl_ep, msg_hdr->data.xres.rem_qp_num, msg_hdr->data.xres.rem_psn); if (OPAL_SUCCESS != rc) { mca_btl_openib_endpoint_invoke_error (lcl_ep); } diff --git a/opal/mca/btl/scif/btl_scif.h b/opal/mca/btl/scif/btl_scif.h index b8d9aabaf5..e55ace0a31 100644 --- a/opal/mca/btl/scif/btl_scif.h +++ b/opal/mca/btl/scif/btl_scif.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
@@ -98,7 +98,7 @@ typedef struct mca_btl_scif_module_t { typedef struct mca_btl_scif_component_t { /* base BTL component */ - mca_btl_base_component_2_0_0_t super; + mca_btl_base_component_3_0_0_t super; /* DMA free list settings */ int scif_free_list_num; @@ -197,29 +197,21 @@ int mca_btl_scif_sendi (struct mca_btl_base_module_t *btl, * Initiate a get operation. * * location: btl_scif_get.c - * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred */ -int -mca_btl_scif_get (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des); +int mca_btl_scif_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); /** * Initiate a put operation. 
* * location: btl_scif_put.c - * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred */ -int -mca_btl_scif_put (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des); +int mca_btl_scif_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); mca_btl_base_descriptor_t * mca_btl_scif_alloc(struct mca_btl_base_module_t *btl, @@ -228,9 +220,25 @@ mca_btl_scif_alloc(struct mca_btl_base_module_t *btl, int mca_btl_scif_progress_send_wait_list (struct mca_btl_base_endpoint_t *endpoint); +struct mca_btl_scif_reg_t; + +struct mca_btl_base_registration_handle_t { + /** scif offset */ + off_t scif_offset; + /** base address of this scif region */ + uintptr_t scif_base; +}; + +struct mca_btl_scif_registration_handle_t { + mca_btl_base_registration_handle_t btl_handle; + struct mca_btl_scif_reg_t *reg; +}; +typedef struct mca_btl_scif_registration_handle_t mca_btl_scif_registration_handle_t; + typedef struct mca_btl_scif_reg_t { mca_mpool_base_registration_t base; - off_t *registrations; + /** per-endpoint btl handles for this registration */ + mca_btl_scif_registration_handle_t *handles; } mca_btl_scif_reg_t; /* Global structures */ diff --git a/opal/mca/btl/scif/btl_scif_add_procs.c b/opal/mca/btl/scif/btl_scif_add_procs.c index 4a6d838102..f801ee5c7a 100644 --- a/opal/mca/btl/scif/btl_scif_add_procs.c +++ b/opal/mca/btl/scif/btl_scif_add_procs.c @@ -165,14 +165,14 @@ static int scif_dereg_mem (void *reg_data, mca_mpool_base_registration_t *reg) /* register the fragment with all connected endpoints */ for (i = 0 ; i < (int) 
mca_btl_scif_module.endpoint_count ; ++i) { - if ((off_t)-1 != scif_reg->registrations[i] && + if ((off_t)-1 != scif_reg->handles[i].btl_handle.scif_offset && MCA_BTL_SCIF_EP_STATE_CONNECTED == mca_btl_scif_module.endpoints[i].state) { (void) scif_unregister(mca_btl_scif_module.endpoints[i].scif_epd, - scif_reg->registrations[i], size); + scif_reg->handles[i].btl_handle.scif_offset, size); } } - free (scif_reg->registrations); + free (scif_reg->handles); return OPAL_SUCCESS; } @@ -184,17 +184,25 @@ static int scif_reg_mem (void *reg_data, void *base, size_t size, int rc = OPAL_SUCCESS; unsigned int i; - scif_reg->registrations = calloc (mca_btl_scif_module.endpoint_count, - sizeof (off_t)); - memset (scif_reg->registrations, -1, mca_btl_scif_module.endpoint_count * sizeof (off_t)); + scif_reg->handles = calloc (mca_btl_scif_module.endpoint_count, sizeof (scif_reg->handles[0])); + if (NULL == scif_reg->handles && mca_btl_scif_module.endpoint_count > 0) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + /* initialize all scif offsets to -1 and initialize the pointer back to the mpool registration */ + for (i = 0 ; i < mca_btl_scif_module.endpoint_count ; ++i) { + scif_reg->handles[i].btl_handle.scif_offset = -1; + scif_reg->handles[i].btl_handle.scif_base = (uintptr_t) base; + scif_reg->handles[i].reg = scif_reg; + } /* register the pointer with all connected endpoints */ for (i = 0 ; i < mca_btl_scif_module.endpoint_count ; ++i) { if (MCA_BTL_SCIF_EP_STATE_CONNECTED == mca_btl_scif_module.endpoints[i].state) { - scif_reg->registrations[i] = scif_register(mca_btl_scif_module.endpoints[i].scif_epd, - base, size, 0, SCIF_PROT_READ | - SCIF_PROT_WRITE, 0); - if (SCIF_REGISTER_FAILED == scif_reg->registrations[i]) { + scif_reg->handles[i].btl_handle.scif_offset = scif_register (mca_btl_scif_module.endpoints[i].scif_epd, + base, size, 0, SCIF_PROT_READ | + SCIF_PROT_WRITE, 0); + if (SCIF_REGISTER_FAILED == scif_reg->handles[i].btl_handle.scif_offset) { /* cleanup */ scif_dereg_mem (reg_data, reg); rc = OPAL_ERR_OUT_OF_RESOURCE; diff --git a/opal/mca/btl/scif/btl_scif_component.c
b/opal/mca/btl/scif/btl_scif_component.c index 61bf2d15c7..36db880445 100644 --- a/opal/mca/btl/scif/btl_scif_component.c +++ b/opal/mca/btl/scif/btl_scif_component.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. @@ -171,7 +171,7 @@ static int btl_scif_component_register(void) mca_btl_scif_module.super.btl_flags = MCA_BTL_FLAGS_SEND | MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE; - mca_btl_scif_module.super.btl_seg_size = sizeof (mca_btl_scif_segment_t); + mca_btl_scif_module.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t); mca_btl_scif_module.super.btl_bandwidth = 50000; /* Mbs */ mca_btl_scif_module.super.btl_latency = 2; /* Microsecs */ @@ -329,11 +329,11 @@ static int mca_btl_scif_progress_recvs (mca_btl_base_endpoint_t *ep) * the fragment without introducing another copy here. this * limitation has not appeared to cause any performance * problems. 
*/ - frag.base.des_local_count = 1; - frag.segments[0].base.seg_len = hdr->size; - frag.segments[0].base.seg_addr.pval = (void *) (hdr + 1); + frag.base.des_segment_count = 1; + frag.segments[0].seg_len = hdr->size; + frag.segments[0].seg_addr.pval = (void *) (hdr + 1); - frag.base.des_local = &frag.segments[0].base; + frag.base.des_segments = frag.segments; /* call the registered callback function */ reg->cbfunc(&mca_btl_scif_module.super, hdr->tag, &frag.base, reg->cbdata); diff --git a/opal/mca/btl/scif/btl_scif_frag.c b/opal/mca/btl/scif/btl_scif_frag.c index 651e88cf51..6a684defb6 100644 --- a/opal/mca/btl/scif/btl_scif_frag.c +++ b/opal/mca/btl/scif/btl_scif_frag.c @@ -15,13 +15,13 @@ static inline void mca_btl_scif_base_frag_constructor (mca_btl_scif_base_frag_t *frag) { memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base)); - frag->segments[0].base.seg_addr.pval = frag->base.super.ptr; + frag->segments[0].seg_addr.pval = frag->base.super.ptr; } static inline void mca_btl_scif_eager_frag_constructor (mca_btl_scif_base_frag_t *frag) { memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base)); - frag->segments[0].base.seg_addr.pval = frag->base.super.ptr; + frag->segments[0].seg_addr.pval = frag->base.super.ptr; } OBJ_CLASS_INSTANCE(mca_btl_scif_eager_frag_t, mca_btl_base_descriptor_t, diff --git a/opal/mca/btl/scif/btl_scif_frag.h b/opal/mca/btl/scif/btl_scif_frag.h index a4ade0c9f4..2f6736a6dc 100644 --- a/opal/mca/btl/scif/btl_scif_frag.h +++ b/opal/mca/btl/scif/btl_scif_frag.h @@ -15,16 +15,6 @@ #include "btl_scif.h" #include "btl_scif_endpoint.h" -typedef struct mca_btl_scif_segment_t { - mca_btl_base_segment_t base; - - /* scif offset */ - off_t scif_offset; - - /* original pointer */ - uint64_t orig_ptr; -} mca_btl_scif_segment_t; - typedef struct mca_btl_scif_frag_hdr_t { #if defined(SCIF_USE_SEQ) uint32_t seq; @@ -41,7 +31,7 @@ typedef void (*frag_cb_t) (struct mca_btl_scif_base_frag_t *, 
int); typedef struct mca_btl_scif_base_frag_t { mca_btl_base_descriptor_t base; mca_btl_scif_frag_hdr_t hdr; - mca_btl_scif_segment_t segments[2]; + mca_btl_base_segment_t segments[2]; mca_btl_base_endpoint_t *endpoint; mca_btl_scif_reg_t *registration; ompi_free_list_t *my_list; @@ -78,9 +68,9 @@ static inline int mca_btl_scif_frag_return (mca_btl_scif_base_frag_t *frag) frag->registration = NULL; } - frag->segments[0].base.seg_addr.pval = frag->base.super.ptr; - frag->segments[0].base.seg_len = 0; - frag->segments[1].base.seg_len = 0; + frag->segments[0].seg_addr.pval = frag->base.super.ptr; + frag->segments[0].seg_len = 0; + frag->segments[1].seg_len = 0; OMPI_FREE_LIST_RETURN_MT(frag->my_list, (ompi_free_list_item_t *) frag); diff --git a/opal/mca/btl/scif/btl_scif_get.c b/opal/mca/btl/scif/btl_scif_get.c index 4f97eadb0e..131352b327 100644 --- a/opal/mca/btl/scif/btl_scif_get.c +++ b/opal/mca/btl/scif/btl_scif_get.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. @@ -20,18 +20,13 @@ /** * Initiate a get operation. 
- * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred */ -int mca_btl_scif_get (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des) { - mca_btl_scif_segment_t *src = (mca_btl_scif_segment_t *) des->des_remote; - mca_btl_scif_segment_t *dst = (mca_btl_scif_segment_t *) des->des_local; - size_t len = lmin (src->base.seg_len, dst->base.seg_len); - int rc, mark, flags = 0; +int mca_btl_scif_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +{ + int rc, mark, scif_flags = 0; off_t roffset, loffset; #if defined(SCIF_TIMING) struct timespec ts; @@ -41,30 +36,27 @@ int mca_btl_scif_get (struct mca_btl_base_module_t *btl, mca_btl_scif_component.get_count++; #endif - BTL_VERBOSE(("Using DMA Get for frag %p from offset %lu", (void *) des, - (unsigned long) src->scif_offset)); + BTL_VERBOSE(("Using DMA Get from remote address %" PRIx64 " to local address %p", + remote_address, local_address)); - roffset = src->scif_offset + (off_t)(src->orig_ptr - src->base.seg_addr.lval); - loffset = dst->scif_offset + (off_t)(dst->orig_ptr - dst->base.seg_addr.lval); + roffset = remote_handle->scif_offset + (off_t)(remote_address - remote_handle->scif_base); + loffset = local_handle->scif_offset + (off_t)((intptr_t)local_address - local_handle->scif_base); if (mca_btl_scif_component.rma_use_cpu) { - flags = SCIF_RMA_USECPU; + scif_flags = SCIF_RMA_USECPU; } if (mca_btl_scif_component.rma_sync) { - flags |= SCIF_RMA_SYNC; + scif_flags |= SCIF_RMA_SYNC; } /* start the read */ - rc = scif_readfrom (endpoint->scif_epd, loffset, len, roffset, 
flags); + rc = scif_readfrom (endpoint->scif_epd, loffset, size, roffset, scif_flags); if (OPAL_UNLIKELY(-1 == rc)) { return OPAL_ERROR; } - /* always call the callback function */ - des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - - if (!(flags & SCIF_RMA_SYNC)) { + if (!(scif_flags & SCIF_RMA_SYNC)) { /* according to the scif documentation is is better to use a fence rather * than using the SCIF_RMA_SYNC flag with scif_readfrom */ scif_fence_mark (endpoint->scif_epd, SCIF_FENCE_INIT_SELF, &mark); @@ -76,8 +68,8 @@ int mca_btl_scif_get (struct mca_btl_base_module_t *btl, mca_btl_scif_component.get_time_max, ts); #endif - /* since we completed the fence the RMA operation is complete */ - mca_btl_scif_frag_complete ((mca_btl_scif_base_frag_t *) des, OPAL_SUCCESS); + /* always call the callback function */ + cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); return OPAL_SUCCESS; } diff --git a/opal/mca/btl/scif/btl_scif_module.c b/opal/mca/btl/scif/btl_scif_module.c index fb36b7ba47..1926efa86c 100644 --- a/opal/mca/btl/scif/btl_scif_module.c +++ b/opal/mca/btl/scif/btl_scif_module.c @@ -24,17 +24,14 @@ mca_btl_scif_free (struct mca_btl_base_module_t *btl, static int mca_btl_scif_module_finalize (struct mca_btl_base_module_t* btl); -static mca_btl_base_descriptor_t * -mca_btl_scif_prepare_dst (mca_btl_base_module_t *btl, - mca_btl_base_endpoint_t *endpoint, - mca_mpool_base_registration_t *registration, - opal_convertor_t *convertor, uint8_t order, - size_t reserve, size_t *size, uint32_t flags); +static mca_btl_base_registration_handle_t *mca_btl_scif_register_mem (struct mca_btl_base_module_t *btl, + mca_btl_base_endpoint_t *endpoint, + void *base, size_t size, uint32_t flags); +static int mca_btl_scif_deregister_mem (struct mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle); static struct mca_btl_base_descriptor_t * mca_btl_scif_prepare_src (struct mca_btl_base_module_t *btl, struct 
mca_btl_base_endpoint_t *endpoint, - mca_mpool_base_registration_t *registration, struct opal_convertor_t *convertor, uint8_t order, size_t reserve, size_t *size, uint32_t flags); @@ -48,11 +45,12 @@ mca_btl_scif_module_t mca_btl_scif_module = { .btl_alloc = mca_btl_scif_alloc, .btl_free = mca_btl_scif_free, .btl_prepare_src = mca_btl_scif_prepare_src, - .btl_prepare_dst = mca_btl_scif_prepare_dst, .btl_send = mca_btl_scif_send, .btl_sendi = mca_btl_scif_sendi, .btl_put = mca_btl_scif_put, .btl_get = mca_btl_scif_get, + .btl_register_mem = mca_btl_scif_register_mem, + .btl_deregister_mem = mca_btl_scif_deregister_mem, } }; @@ -163,10 +161,10 @@ mca_btl_scif_alloc(struct mca_btl_base_module_t *btl, frag->base.des_flags = flags; frag->base.order = order; - frag->base.des_local = &frag->segments[0].base; - frag->base.des_local_count = 1; + frag->base.des_segments = frag->segments; + frag->base.des_segment_count = 1; - frag->segments[0].base.seg_len = size; + frag->segments[0].seg_len = size; return &frag->base; } @@ -178,16 +176,19 @@ mca_btl_scif_free (struct mca_btl_base_module_t *btl, return mca_btl_scif_frag_return ((mca_btl_scif_base_frag_t *) des); } -static inline mca_btl_base_descriptor_t *mca_btl_scif_prepare_dma (struct mca_btl_base_module_t *btl, - mca_btl_base_endpoint_t *endpoint, - void *data_ptr, size_t size, - mca_mpool_base_registration_t *registration, - uint8_t order, uint32_t flags) +static mca_btl_base_registration_handle_t *mca_btl_scif_register_mem (struct mca_btl_base_module_t *btl, + mca_btl_base_endpoint_t *endpoint, + void *base, size_t size, uint32_t flags) { - mca_btl_scif_base_frag_t *frag; mca_btl_scif_reg_t *scif_reg; int rc; + if (MCA_BTL_ENDPOINT_ANY == endpoint) { + /* it probably isn't possible to support registering memory to use with any endpoint so + * return NULL */ + return NULL; + } + if (OPAL_LIKELY(MCA_BTL_SCIF_EP_STATE_CONNECTED != endpoint->state)) { /* the endpoint needs to be connected before the fragment can be * 
registered. */ @@ -198,67 +199,41 @@ static inline mca_btl_base_descriptor_t *mca_btl_scif_prepare_dma (struct mca_bt } } - (void) MCA_BTL_SCIF_FRAG_ALLOC_DMA(endpoint, frag); - if (OPAL_UNLIKELY(NULL == frag)) { + rc = btl->btl_mpool->mpool_register(btl->btl_mpool, base, size, 0, + (mca_mpool_base_registration_t **) &scif_reg); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { return NULL; } - if (NULL == registration) { - rc = btl->btl_mpool->mpool_register(btl->btl_mpool, data_ptr, size, 0, - (mca_mpool_base_registration_t **) &registration); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - mca_btl_scif_frag_return (frag); - return NULL; - } - - frag->registration = (mca_btl_scif_reg_t *) registration; - } - - scif_reg = (mca_btl_scif_reg_t *) registration; - /* register the memory location with this peer if it isn't already */ - if ((off_t) -1 == scif_reg->registrations[endpoint->id]) { - size_t seg_size = (size_t)((uintptr_t) registration->bound - (uintptr_t) registration->base) + 1; - scif_reg->registrations[endpoint->id] = scif_register (endpoint->scif_epd, registration->base, - seg_size, 0, SCIF_PROT_READ | - SCIF_PROT_WRITE, 0); + if ((off_t) -1 == scif_reg->handles[endpoint->id].btl_handle.scif_offset) { + size_t seg_size = (size_t)((uintptr_t) scif_reg->base.bound - (uintptr_t) scif_reg->base.base) + 1; + + /* NTH: until we determine a way to pass permissions to the mpool just make all segments + * read/write */ + scif_reg->handles[endpoint->id].btl_handle.scif_offset = + scif_register (endpoint->scif_epd, scif_reg->base.base, seg_size, 0, SCIF_PROT_READ | + SCIF_PROT_WRITE, 0); BTL_VERBOSE(("registered fragment for scif DMA transaction.
offset = %lu", - (unsigned long) scif_reg->registrations[endpoint->id])); + (unsigned long) scif_reg->handles[endpoint->id].btl_handle.scif_offset)); } - if (OPAL_UNLIKELY((off_t) -1 == scif_reg->registrations[endpoint->id])) { - mca_btl_scif_frag_return (frag); + if (OPAL_UNLIKELY((off_t) -1 == scif_reg->handles[endpoint->id].btl_handle.scif_offset)) { + /* scif_register failed: drop the mpool reference rather than return an unusable handle */ + btl->btl_mpool->mpool_deregister (btl->btl_mpool, &scif_reg->base); return NULL; } - - frag->segments[0].base.seg_addr.lval = (uint64_t)(uintptr_t) data_ptr; - frag->segments[0].base.seg_len = size; - frag->segments[0].scif_offset = scif_reg->registrations[endpoint->id] + - (off_t) ((ptrdiff_t) data_ptr - (ptrdiff_t) registration->base); - /* save the original pointer so the offset can be adjusted if needed (this is - * required for osc/rdma) */ - frag->segments[0].orig_ptr = (uint64_t)(uintptr_t) data_ptr; - frag->base.order = order; - frag->base.des_flags = flags; - - frag->base.des_local = &frag->segments->base; - frag->base.des_local_count = 1; - - return &frag->base; + return &scif_reg->handles[endpoint->id].btl_handle; } -static inline mca_btl_base_descriptor_t *mca_btl_scif_prepare_dma_conv (struct mca_btl_base_module_t *btl, - mca_btl_base_endpoint_t *endpoint, - mca_mpool_base_registration_t *registration, - struct opal_convertor_t *convertor, - uint8_t order, size_t *size, - uint32_t flags) +static int mca_btl_scif_deregister_mem (struct mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle) { - void *data_ptr; + mca_btl_scif_registration_handle_t *scif_handle = (mca_btl_scif_registration_handle_t *) handle; + mca_btl_scif_reg_t *scif_reg = scif_handle->reg; - opal_convertor_get_current_pointer (convertor, &data_ptr); + btl->btl_mpool->mpool_deregister (btl->btl_mpool, &scif_reg->base); - return mca_btl_scif_prepare_dma (btl, endpoint, data_ptr, *size, registration, order, flags); + return OPAL_SUCCESS; } static inline struct mca_btl_base_descriptor_t * @@ -286,10 +256,10 @@ mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl, return NULL; } - frag->segments[0].base.seg_len = reserve; - frag->segments[1].base.seg_addr.pval =
data_ptr; - frag->segments[1].base.seg_len = *size; - frag->base.des_local_count = 2; + frag->segments[0].seg_len = reserve; + frag->segments[1].seg_addr.pval = data_ptr; + frag->segments[1].seg_len = *size; + frag->base.des_segment_count = 2; } else { /* buffered send */ (void) MCA_BTL_SCIF_FRAG_ALLOC_EAGER(endpoint, frag); @@ -299,7 +269,7 @@ mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl, if (*size) { iov.iov_len = *size; - iov.iov_base = (IOVBASE_TYPE *) ((uintptr_t) frag->segments[0].base.seg_addr.pval + reserve); + iov.iov_base = (IOVBASE_TYPE *) ((uintptr_t) frag->segments[0].seg_addr.pval + reserve); rc = opal_convertor_pack (convertor, &iov, &iov_count, &max_size); if (OPAL_UNLIKELY(rc < 0)) { @@ -309,37 +279,22 @@ mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl, *size = max_size; } - frag->segments[0].base.seg_len = reserve + *size; - frag->base.des_local_count = 1; + frag->segments[0].seg_len = reserve + *size; + frag->base.des_segment_count = 1; } - frag->base.des_local = &frag->segments->base; - frag->base.order = order; - frag->base.des_flags = flags; + frag->base.des_segments = frag->segments; + frag->base.order = order; + frag->base.des_flags = flags; return &frag->base; } static mca_btl_base_descriptor_t *mca_btl_scif_prepare_src (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, - mca_mpool_base_registration_t *registration, struct opal_convertor_t *convertor, uint8_t order, size_t reserve, size_t *size, uint32_t flags) { - if (OPAL_LIKELY(reserve)) { - return mca_btl_scif_prepare_src_send (btl, endpoint, convertor, - order, reserve, size, flags); - } else { - return mca_btl_scif_prepare_dma_conv (btl, endpoint, registration, convertor, order, size, flags); - } -} - -static mca_btl_base_descriptor_t *mca_btl_scif_prepare_dst (mca_btl_base_module_t *btl, - mca_btl_base_endpoint_t *endpoint, - mca_mpool_base_registration_t *registration, - opal_convertor_t *convertor, uint8_t order, - size_t 
reserve, size_t *size, uint32_t flags) -{ - return mca_btl_scif_prepare_dma_conv (btl, endpoint, registration, convertor, order, size, flags); + return mca_btl_scif_prepare_src_send (btl, endpoint, convertor, order, reserve, size, flags); } diff --git a/opal/mca/btl/scif/btl_scif_put.c b/opal/mca/btl/scif/btl_scif_put.c index 5f2134f3d9..27355a3e5c 100644 --- a/opal/mca/btl/scif/btl_scif_put.c +++ b/opal/mca/btl/scif/btl_scif_put.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -16,63 +16,58 @@ /** * Initiate a put operation. - * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred + * + * Writes size bytes from local_address to the peer's remote_address; on success cbfunc is invoked before returning. */ -int mca_btl_scif_put (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des) { - mca_btl_scif_segment_t *src = (mca_btl_scif_segment_t *) des->des_local; - mca_btl_scif_segment_t *dst = (mca_btl_scif_segment_t *) des->des_remote; - size_t len = lmin (src->base.seg_len, dst->base.seg_len); - int rc, mark, flags = 0; +int mca_btl_scif_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +{ + int rc, mark, scif_flags = 0; off_t roffset, loffset; #if defined(SCIF_TIMING) struct timespec ts; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); mca_btl_scif_component.put_count++; #endif - BTL_VERBOSE(("Using DMA Put for frag %p", (void *) des)); + BTL_VERBOSE(("Using DMA Put from local address %p to remote address
%" PRIx64, + local_address, remote_address)); - roffset = dst->scif_offset + (off_t)(dst->orig_ptr - dst->base.seg_addr.lval); - loffset = src->scif_offset + (off_t)(src->orig_ptr - src->base.seg_addr.lval); + roffset = remote_handle->scif_offset + (off_t)(remote_address - remote_handle->scif_base); + loffset = local_handle->scif_offset + (off_t)((intptr_t) local_address - local_handle->scif_base); if (mca_btl_scif_component.rma_use_cpu) { - flags = SCIF_RMA_USECPU; + scif_flags = SCIF_RMA_USECPU; } if (mca_btl_scif_component.rma_sync) { - flags |= SCIF_RMA_SYNC; + scif_flags |= SCIF_RMA_SYNC; } /* start the write */ - rc = scif_writeto (endpoint->scif_epd, loffset, len, roffset, flags); + rc = scif_writeto (endpoint->scif_epd, loffset, size, roffset, scif_flags); if (OPAL_UNLIKELY(-1 == rc)) { return OPAL_ERROR; } - /* always call the callback function */ - des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - - /* according to the scif documentation is is better to use a fence rather - * than using the SCIF_RMA_SYNC flag with scif_writeto */ - if (!(flags & SCIF_RMA_SYNC)) { + if (!(scif_flags & SCIF_RMA_SYNC)) { + /* according to the scif documentation it is better to use a fence rather + * than using the SCIF_RMA_SYNC flag with scif_writeto */ scif_fence_mark (endpoint->scif_epd, SCIF_FENCE_INIT_SELF, &mark); scif_fence_wait (endpoint->scif_epd, mark); } #if defined(SCIF_TIMING) SCIF_UPDATE_TIMER(mca_btl_scif_component.put_time, mca_btl_scif_component.put_time_max, ts); #endif - /* since we completed the fence the RMA operation is complete */ - mca_btl_scif_frag_complete ((mca_btl_scif_base_frag_t *) des, OPAL_SUCCESS); + /* always call the callback function */ + cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); return OPAL_SUCCESS; } diff --git
a/opal/mca/btl/scif/btl_scif_send.c b/opal/mca/btl/scif/btl_scif_send.c index 3df0f1cc9c..f9cb3ea21f 100644 --- a/opal/mca/btl/scif/btl_scif_send.c +++ b/opal/mca/btl/scif/btl_scif_send.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. @@ -118,22 +118,22 @@ static int mca_btl_scif_send_frag (struct mca_btl_base_endpoint_t *endpoint, unsigned char * restrict dst; BTL_VERBOSE(("btl/scif sending descriptor %p from %d -> %d. length = %" PRIu64, (void *) frag, - OPAL_PROC_MY_NAME.vpid, endpoint->peer_proc->proc_name.vpid, frag->segments[0].base.seg_len)); + OPAL_PROC_MY_NAME.vpid, endpoint->peer_proc->proc_name.vpid, frag->segments[0].seg_len)); if (OPAL_LIKELY(OPAL_SUCCESS == mca_btl_scif_send_get_buffer (endpoint, size, &dst))) { - unsigned char * restrict data = (unsigned char * restrict) frag->segments[0].base.seg_addr.pval; + unsigned char * restrict data = (unsigned char * restrict) frag->segments[0].seg_addr.pval; #if defined(SCIF_TIMING) struct timespec ts; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); #endif - memcpy (dst + sizeof (frag->hdr), data, frag->segments[0].base.seg_len); + memcpy (dst + sizeof (frag->hdr), data, frag->segments[0].seg_len); - if (frag->segments[1].base.seg_len) { - memcpy (dst + sizeof (frag->hdr) + frag->segments[0].base.seg_len, - frag->segments[1].base.seg_addr.pval, - frag->segments[1].base.seg_len); + if (frag->segments[1].seg_len) { + memcpy (dst + sizeof (frag->hdr) + frag->segments[0].seg_len, + frag->segments[1].seg_addr.pval, + frag->segments[1].seg_len); } #if defined(SCIF_USE_SEQ) @@ -165,7 +165,7 @@ int mca_btl_scif_send (struct mca_btl_base_module_t *btl, mca_btl_base_tag_t tag) { mca_btl_scif_base_frag_t *frag = 
(mca_btl_scif_base_frag_t *) descriptor; - size_t size = frag->segments[0].base.seg_len + frag->segments[1].base.seg_len; + size_t size = frag->segments[0].seg_len + frag->segments[1].seg_len; int rc; frag->hdr.tag = tag; @@ -223,7 +223,9 @@ int mca_btl_scif_sendi (struct mca_btl_base_module_t *btl, rc = mca_btl_scif_send_get_buffer (endpoint, length, &base); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - *descriptor = NULL; + if (NULL != descriptor) { + *descriptor = NULL; + } return OPAL_ERR_OUT_OF_RESOURCE; } diff --git a/opal/mca/btl/self/btl_self.c b/opal/mca/btl/self/btl_self.c index ce5906d138..26f2a88f8e 100644 --- a/opal/mca/btl/self/btl_self.c +++ b/opal/mca/btl/self/btl_self.c @@ -38,13 +38,15 @@ #include "btl_self_frag.h" #include "opal/util/proc.h" -static int mca_btl_self_put (struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_btl_base_descriptor_t* des); +static int mca_btl_self_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); -static int mca_btl_self_get (struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_btl_base_descriptor_t* des); +static int mca_btl_self_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); mca_btl_base_module_t mca_btl_self = { .btl_component = &mca_btl_self_component.super, @@ -54,7 +56,6 @@ mca_btl_base_module_t mca_btl_self = { .btl_alloc = mca_btl_self_alloc, .btl_free = 
mca_btl_self_free, .btl_prepare_src = mca_btl_self_prepare_src, - .btl_prepare_dst = mca_btl_self_prepare_dst, .btl_send = mca_btl_self_send, .btl_put = mca_btl_self_put, .btl_get = mca_btl_self_get, @@ -135,8 +136,8 @@ mca_btl_base_descriptor_t* mca_btl_self_alloc( frag->segment.seg_len = size; frag->base.des_flags = flags; - frag->base.des_local = &(frag->segment); - frag->base.des_local_count = 1; + frag->base.des_segments = &(frag->segment); + frag->base.des_segment_count = 1; return (mca_btl_base_descriptor_t*)frag; } @@ -151,10 +152,8 @@ int mca_btl_self_free( struct mca_btl_base_module_t* btl, { mca_btl_self_frag_t* frag = (mca_btl_self_frag_t*)des; - frag->base.des_local = NULL; - frag->base.des_local_count = 0; - frag->base.des_remote = NULL; - frag->base.des_remote_count = 0; + frag->base.des_segments = NULL; + frag->base.des_segment_count = 0; if(frag->size == mca_btl_self.btl_eager_limit) { MCA_BTL_SELF_FRAG_RETURN_EAGER(frag); @@ -175,7 +174,6 @@ int mca_btl_self_free( struct mca_btl_base_module_t* btl, struct mca_btl_base_descriptor_t* mca_btl_self_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - mca_mpool_base_registration_t* registration, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, @@ -231,44 +229,11 @@ mca_btl_self_prepare_src( struct mca_btl_base_module_t* btl, *size = max_data; } frag->base.des_flags = flags; - frag->base.des_local = &frag->segment; - frag->base.des_local_count = 1; + frag->base.des_segments = &frag->segment; + frag->base.des_segment_count = 1; return &frag->base; } - -/** - * Prepare data for receive. 
- */ -struct mca_btl_base_descriptor_t* -mca_btl_self_prepare_dst( struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags ) -{ - mca_btl_self_frag_t* frag; - size_t max_data = *size; - void *ptr; - - MCA_BTL_SELF_FRAG_ALLOC_RDMA(frag); - if(OPAL_UNLIKELY(NULL == frag)) { - return NULL; - } - - /* setup descriptor to point directly to user buffer */ - opal_convertor_get_current_pointer( convertor, &ptr ); - frag->segment.seg_addr.lval = (uint64_t)(uintptr_t) ptr; - - frag->segment.seg_len = reserve + max_data; - frag->base.des_local = &frag->segment; - frag->base.des_local_count = 1; - frag->base.des_flags = flags; - return &frag->base; -} /** * Initiate a send to the peer. @@ -285,12 +250,6 @@ int mca_btl_self_send( struct mca_btl_base_module_t* btl, mca_btl_active_message_callback_t* reg; int btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); - /** - * We have to set the dst before the call to the function and reset them - * after. - */ - des->des_remote = des->des_local; - des->des_remote_count = des->des_local_count; /* upcall */ reg = mca_btl_base_active_message_trigger + tag; reg->cbfunc( btl, tag, des, reg->cbdata ); @@ -305,100 +264,29 @@ int mca_btl_self_send( struct mca_btl_base_module_t* btl, return 1; } -/** - * Initiate a put to the peer. 
- * - * @param btl (IN) BTL module - * @param peer (IN) BTL peer addressing - */ -static int mca_btl_self_rdma( struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_btl_base_descriptor_t* des, - mca_btl_base_segment_t* src, size_t src_cnt, - mca_btl_base_segment_t* dst, size_t dst_cnt) +static int mca_btl_self_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - unsigned char* src_addr = (unsigned char *)(uintptr_t) src->seg_addr.lval; - size_t src_len = src->seg_len; - unsigned char* dst_addr = (unsigned char *)(uintptr_t) dst->seg_addr.lval; - size_t dst_len = dst->seg_len; - int btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + memcpy ((void *)(intptr_t) remote_address, local_address, size); - while(src_len && dst_len) { + cbfunc (btl, endpoint, local_address, NULL, cbcontext, cbdata, OPAL_SUCCESS); - if(src_len == dst_len) { - memcpy(dst_addr, src_addr, src_len); - - /* advance src */ - if(--src_cnt != 0) { - src++; - src_addr = (unsigned char*)src->seg_addr.pval; - src_len = src->seg_len; - } else { - src_len = 0; - } - - /* advance dst */ - if(--dst_cnt != 0) { - dst++; - dst_addr = (unsigned char*)dst->seg_addr.pval; - dst_len = dst->seg_len; - } else { - dst_len = 0; - } - - } else { - size_t bytes = src_len < dst_len ? 
src_len : dst_len; - memcpy(dst_addr, src_addr, bytes); - - /* advance src */ - src_len -= bytes; - if(src_len == 0) { - if(--src_cnt != 0) { - src++; - src_addr = (unsigned char*)src->seg_addr.pval; - src_len = src->seg_len; - } - } else { - src_addr += bytes; - } - - /* advance dst */ - dst_len -= bytes; - if(dst_len == 0) { - if(--dst_cnt != 0) { - dst++; - dst_addr = (unsigned char*)src->seg_addr.pval; - dst_len = src->seg_len; - } - } else { - dst_addr += bytes; - } - } - } - - /* rdma completion */ - des->des_cbfunc( btl, endpoint, des, OPAL_SUCCESS ); - if( btl_ownership ) { - mca_btl_self_free( btl, des ); - } return OPAL_SUCCESS; } -static int mca_btl_self_put (struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_btl_base_descriptor_t* des) +static int mca_btl_self_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - return mca_btl_self_rdma (btl, endpoint, des, des->des_local, des->des_local_count, - des->des_remote, des->des_remote_count); -} + memcpy (local_address, (void *)(intptr_t) remote_address, size); -static int mca_btl_self_get (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des) -{ - return mca_btl_self_rdma (btl, endpoint, des, des->des_remote, des->des_remote_count, - des->des_local, des->des_local_count); + cbfunc (btl, endpoint, local_address, NULL, cbcontext, cbdata, OPAL_SUCCESS); + + return OPAL_SUCCESS; } int mca_btl_self_ft_event(int state) { diff --git a/opal/mca/btl/self/btl_self.h b/opal/mca/btl/self/btl_self.h index d738e82afb..37316d7b40 100644 --- a/opal/mca/btl/self/btl_self.h +++ b/opal/mca/btl/self/btl_self.h @@ -1,3 +1,4 @@ +/* -*- Mode: 
C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology @@ -9,6 +10,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -40,7 +43,7 @@ BEGIN_C_DECLS * Shared Memory (SELF) BTL module. */ struct mca_btl_self_component_t { - mca_btl_base_component_2_0_0_t super; /**< base BTL component */ + mca_btl_base_component_3_0_0_t super; /**< base BTL component */ int free_list_num; /**< initial size of free lists */ int free_list_max; /**< maximum size of free lists */ int free_list_inc; /**< number of elements to alloc when growing free lists */ @@ -165,24 +168,6 @@ int mca_btl_self_free( struct mca_btl_base_descriptor_t* mca_btl_self_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - struct mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags -); - -/** - * Prepare data for RDMA - * - * @param btl (IN) BTL module - * @param peer (IN) BTL peer addressing - */ -struct mca_btl_base_descriptor_t* mca_btl_self_prepare_dst( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_mpool_base_registration_t* registration, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, diff --git a/opal/mca/btl/self/btl_self_component.c b/opal/mca/btl/self/btl_self_component.c index 7da16b586e..5ddc17365b 100644 --- a/opal/mca/btl/self/btl_self_component.c +++ b/opal/mca/btl/self/btl_self_component.c @@ -99,7 +99,6 @@ static int mca_btl_self_component_register(void) mca_btl_self.btl_rdma_pipeline_frag_size = INT_MAX; mca_btl_self.btl_min_rdma_pipeline_size = 0; 
mca_btl_self.btl_flags = MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND_INPLACE; - mca_btl_self.btl_seg_size = sizeof (mca_btl_base_segment_t); mca_btl_self.btl_bandwidth = 100; mca_btl_self.btl_latency = 0; mca_btl_base_param_register(&mca_btl_self_component.super.btl_version, diff --git a/opal/mca/btl/self/btl_self_frag.c b/opal/mca/btl/self/btl_self_frag.c index 77893d2ad4..95186ac67d 100644 --- a/opal/mca/btl/self/btl_self_frag.c +++ b/opal/mca/btl/self/btl_self_frag.c @@ -23,8 +23,8 @@ static inline void mca_btl_self_frag_constructor(mca_btl_self_frag_t* frag) { frag->segment.seg_addr.pval = frag+1; frag->segment.seg_len = (uint32_t)frag->size; - frag->base.des_local = &frag->segment; - frag->base.des_local_count = 1; + frag->base.des_segments = &frag->segment; + frag->base.des_segment_count = 1; frag->base.des_flags = 0; } diff --git a/opal/mca/btl/sm/btl_sm.c b/opal/mca/btl/sm/btl_sm.c index d72522bf75..a4f70857a6 100644 --- a/opal/mca/btl/sm/btl_sm.c +++ b/opal/mca/btl/sm/btl_sm.c @@ -57,6 +57,9 @@ #include "opal/mca/mpool/base/base.h" #include "opal/mca/mpool/sm/mpool_sm.h" +#include "opal/align.h" +#include "opal/util/sys_limits.h" + #if OPAL_ENABLE_FT_CR == 1 #include "opal/util/basename.h" #include "opal/mca/crs/base/base.h" @@ -81,9 +84,6 @@ mca_btl_sm_t mca_btl_sm = { .btl_alloc = mca_btl_sm_alloc, .btl_free = mca_btl_sm_free, .btl_prepare_src = mca_btl_sm_prepare_src, -#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA - .btl_prepare_dst = mca_btl_sm_prepare_dst, -#endif /* OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA */ .btl_send = mca_btl_sm_send, .btl_sendi = mca_btl_sm_sendi, .btl_dump = mca_btl_sm_dump, @@ -743,7 +743,6 @@ extern int mca_btl_sm_free( struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - mca_mpool_base_registration_t* registration, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, @@ -828,11 +827,9 @@ struct mca_btl_base_descriptor_t* 
mca_btl_sm_prepare_src( } #endif /* OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA */ - frag->base.des_local = &(frag->segment.base); - frag->base.des_local_count = 1; + frag->base.des_segments = &(frag->segment.base); + frag->base.des_segment_count = 1; frag->base.order = MCA_BTL_NO_ORDER; - frag->base.des_remote = NULL; - frag->base.des_remote_count = 0; frag->base.des_flags = flags; *size = max_data; return &frag->base; @@ -950,9 +947,12 @@ int mca_btl_sm_sendi( struct mca_btl_base_module_t* btl, return OPAL_SUCCESS; } - /* presumably, this code path will never get executed */ - *descriptor = mca_btl_sm_alloc( btl, endpoint, order, - payload_size + header_size, flags); + if (NULL != descriptor) { + /* presumably, this code path will never get executed */ + *descriptor = mca_btl_sm_alloc( btl, endpoint, order, + payload_size + header_size, flags); + } + return OPAL_ERR_RESOURCE_BUSY; } @@ -1001,51 +1001,87 @@ int mca_btl_sm_send( struct mca_btl_base_module_t* btl, } #if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA -struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_dst( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags) +mca_btl_base_registration_handle_t *mca_btl_sm_register_mem (struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + void *base, size_t size, uint32_t flags) { - void *ptr; - mca_btl_sm_frag_t* frag; + mca_btl_sm_registration_handle_t *handle; + mca_btl_sm_t *sm_btl = (mca_btl_sm_t *) btl; + ompi_free_list_item_t *item = NULL; - MCA_BTL_SM_FRAG_ALLOC_USER(frag); - if(OPAL_UNLIKELY(NULL == frag)) { + OMPI_FREE_LIST_GET_MT(&mca_btl_sm_component.registration_handles, item); + if (OPAL_UNLIKELY(NULL == item)) { return NULL; } - frag->segment.base.seg_len = *size; - opal_convertor_get_current_pointer( convertor, &ptr ); - 
frag->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) ptr; - - frag->base.des_remote = NULL; - frag->base.des_remote_count = 0; - frag->base.des_local = (mca_btl_base_segment_t*)&frag->segment; - frag->base.des_local_count = 1; - frag->base.des_flags = flags; - return &frag->base; + handle = (mca_btl_sm_registration_handle_t *) item; + +#if OPAL_BTL_SM_HAVE_KNEM + if (OPAL_LIKELY(mca_btl_sm_component.use_knem)) { + struct knem_cmd_create_region knem_cr; + struct knem_cmd_param_iovec knem_iov; + + knem_iov.base = (uintptr_t)base & ~(opal_getpagesize() - 1); + knem_iov.len = OPAL_ALIGN(size + ((intptr_t) base - knem_iov.base), opal_getpagesize(), intptr_t); + knem_cr.iovec_array = (uintptr_t)&knem_iov; + knem_cr.iovec_nr = 1; + knem_cr.flags = 0; + knem_cr.protection = 0; + + if (flags & MCA_BTL_REG_FLAG_REMOTE_READ) { + knem_cr.protection |= PROT_READ; + } + if (flags & MCA_BTL_REG_FLAG_REMOTE_WRITE) { + knem_cr.protection |= PROT_WRITE; + } + + if (OPAL_UNLIKELY(ioctl(sm_btl->knem_fd, KNEM_CMD_CREATE_REGION, &knem_cr) < 0)) { + OMPI_FREE_LIST_RETURN_MT(&mca_btl_sm_component.registration_handles, item); + return NULL; + } + + handle->btl_handle.data.knem.cookie = knem_cr.cookie; + handle->btl_handle.data.knem.base_addr = knem_iov.base; + } else +#endif + { + /* the pid could be included in a modex but this will work until btl/sm is + * deleted */ + handle->btl_handle.data.pid = getpid (); + } + + /* return the public part of the handle */ + return &handle->btl_handle; } +int mca_btl_sm_deregister_mem (struct mca_btl_base_module_t* btl, mca_btl_base_registration_handle_t *handle) +{ + mca_btl_sm_registration_handle_t *sm_handle = + (mca_btl_sm_registration_handle_t *)((intptr_t) handle - offsetof (mca_btl_sm_registration_handle_t, btl_handle)); + mca_btl_sm_t* sm_btl = (mca_btl_sm_t*) btl; + +#if OPAL_BTL_SM_HAVE_KNEM + if (OPAL_LIKELY(mca_btl_sm_component.use_knem)) { + (void) ioctl(sm_btl->knem_fd, KNEM_CMD_DESTROY_REGION, &handle->data.knem.cookie); + } +#endif 
+ + OMPI_FREE_LIST_RETURN_MT(&mca_btl_sm_component.registration_handles, &sm_handle->super); + + return OPAL_SUCCESS; +} +#endif /* OPAL_BTL_SM_HAVE_KNEM */ + +#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA + /** * Initiate an synchronous get. - * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred */ -int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_btl_base_descriptor_t* des) +int mca_btl_sm_get_sync (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - int btl_ownership; - mca_btl_sm_frag_t* frag = (mca_btl_sm_frag_t*)des; - mca_btl_sm_segment_t *src = (mca_btl_sm_segment_t*)des->des_remote; - mca_btl_sm_segment_t *dst = (mca_btl_sm_segment_t*)des->des_local; #if OPAL_BTL_SM_HAVE_KNEM mca_btl_sm_t* sm_btl = (mca_btl_sm_t*) btl; if (OPAL_LIKELY(mca_btl_sm_component.use_knem)) { @@ -1054,12 +1090,12 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl, /* Fill in the ioctl data fields. There's no async completion, so we don't need to worry about getting a slot, etc. 
*/ - recv_iovec.base = (uintptr_t) dst->base.seg_addr.lval; - recv_iovec.len = dst->base.seg_len; + recv_iovec.base = (uintptr_t) local_address; + recv_iovec.len = size; icopy.local_iovec_array = (uintptr_t)&recv_iovec; icopy.local_iovec_nr = 1; - icopy.remote_cookie = src->key; - icopy.remote_offset = 0; + icopy.remote_cookie = remote_handle->data.knem.cookie; + icopy.remote_offset = remote_address - remote_handle->data.knem.base_addr; icopy.write = 0; /* Use the DMA flag if knem supports it *and* the segment length @@ -1067,7 +1103,7 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl, value is 0 (i.e., the MCA param was set to 0), the segment size will never be larger than it, so DMA will never be used. */ icopy.flags = 0; - if (mca_btl_sm_component.knem_dma_min <= dst->base.seg_len) { + if (mca_btl_sm_component.knem_dma_min <= size) { icopy.flags = mca_btl_sm_component.knem_dma_flag; } /* synchronous flags only, no need to specify icopy.async_status_index */ @@ -1085,27 +1121,19 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl, #if OPAL_BTL_SM_HAVE_CMA if (OPAL_LIKELY(mca_btl_sm_component.use_cma)) { - char *remote_address, *local_address; - int remote_length, local_length; struct iovec local, remote; pid_t remote_pid; int val; - remote_address = (char *)(uintptr_t) src->base.seg_addr.lval; - remote_length = src->base.seg_len; - - local_address = (char *)(uintptr_t) dst->base.seg_addr.lval; - local_length = dst->base.seg_len; - - remote_pid = src->key; - remote.iov_base = remote_address; - remote.iov_len = remote_length; + remote_pid = remote_handle->data.pid; + remote.iov_base = (void *) (intptr_t) remote_address; + remote.iov_len = size; local.iov_base = local_address; - local.iov_len = local_length; + local.iov_len = size; val = process_vm_readv(remote_pid, &local, 1, &remote, 1, 0); - if (val != local_length) { + if (val != size) { if (val<0) { opal_output(0, "mca_btl_sm_get_sync: process_vm_readv failed: %i", errno); @@ -1119,15 
+1147,7 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl, } #endif /* OPAL_BTL_SM_HAVE_CMA */ - btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); - if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) { - frag->base.des_cbfunc(&mca_btl_sm.super, - frag->endpoint, &frag->base, - OPAL_SUCCESS); - } - if (btl_ownership) { - MCA_BTL_SM_FRAG_RETURN(frag); - } + cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); return OPAL_SUCCESS; } @@ -1139,34 +1159,42 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl, /** * Initiate an asynchronous get. - * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred */ -int mca_btl_sm_get_async(struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_btl_base_descriptor_t* des) +int mca_btl_sm_get_async (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - int btl_ownership; mca_btl_sm_t* sm_btl = (mca_btl_sm_t*) btl; - mca_btl_sm_frag_t* frag = (mca_btl_sm_frag_t*)des; - mca_btl_sm_segment_t *src = (mca_btl_sm_segment_t*)des->des_remote; - mca_btl_sm_segment_t *dst = (mca_btl_sm_segment_t*)des->des_local; + mca_btl_sm_frag_t* frag; struct knem_cmd_inline_copy icopy; struct knem_cmd_param_iovec recv_iovec; - /* If we have no knem slots available, return - TEMP_OUT_OF_RESOURCE */ + /* If we have no knem slots available, fall back to synchronous */ if (sm_btl->knem_status_num_used >= mca_btl_sm_component.knem_max_simultaneous) { - return OPAL_ERR_TEMP_OUT_OF_RESOURCE; + return mca_btl_sm_get_sync (btl, endpoint, local_address, 
remote_address, local_handle, + remote_handle, size, flags, order, cbfunc, cbcontext, cbdata); } + /* allocate a fragment to keep track of this transaction */ + MCA_BTL_SM_FRAG_ALLOC_USER(frag); + if (OPAL_UNLIKELY(NULL == frag)) { + return mca_btl_sm_get_sync (btl, endpoint, local_address, remote_address, local_handle, + remote_handle, size, flags, order, cbfunc, cbcontext, cbdata); + } + + /* fill in callback data */ + frag->cb.func = cbfunc; + frag->cb.context = cbcontext; + frag->cb.data = cbdata; + frag->cb.local_address = local_address; + frag->cb.local_handle = local_handle; + /* We have a slot, so fill in the data fields. Bump the first_avail and num_used counters. */ - recv_iovec.base = (uintptr_t) dst->base.seg_addr.lval; - recv_iovec.len = dst->base.seg_len; + recv_iovec.base = (uintptr_t) local_address; + recv_iovec.len = size; icopy.local_iovec_array = (uintptr_t)&recv_iovec; icopy.local_iovec_nr = 1; icopy.write = 0; @@ -1176,13 +1204,13 @@ int mca_btl_sm_get_async(struct mca_btl_base_module_t* btl, sm_btl->knem_status_first_avail = 0; } ++sm_btl->knem_status_num_used; - icopy.remote_cookie = src->key; - icopy.remote_offset = 0; + icopy.remote_cookie = remote_handle->data.knem.cookie; + icopy.remote_offset = remote_address - remote_handle->data.knem.base_addr; /* Use the DMA flag if knem supports it *and* the segment length is greater than the cutoff */ icopy.flags = KNEM_FLAG_ASYNCDMACOMPLETE; - if (mca_btl_sm_component.knem_dma_min <= dst->base.seg_len) { + if (mca_btl_sm_component.knem_dma_min <= size) { icopy.flags = mca_btl_sm_component.knem_dma_flag; } @@ -1190,19 +1218,11 @@ int mca_btl_sm_get_async(struct mca_btl_base_module_t* btl, if (OPAL_LIKELY(0 == ioctl(sm_btl->knem_fd, KNEM_CMD_INLINE_COPY, &icopy))) { if (icopy.current_status != KNEM_STATUS_PENDING) { + MCA_BTL_SM_FRAG_RETURN(frag); /* request completed synchronously */ /* FIXME: what if icopy.current_status == KNEM_STATUS_FAILED? 
*/ - - btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); - if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) { - frag->base.des_cbfunc(&mca_btl_sm.super, - frag->endpoint, &frag->base, - OPAL_SUCCESS); - } - if (btl_ownership) { - MCA_BTL_SM_FRAG_RETURN(frag); - } + cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); --sm_btl->knem_status_num_used; ++sm_btl->knem_status_first_used; diff --git a/opal/mca/btl/sm/btl_sm.h b/opal/mca/btl/sm/btl_sm.h index fd7271fb3e..efad491eff 100644 --- a/opal/mca/btl/sm/btl_sm.h +++ b/opal/mca/btl/sm/btl_sm.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology @@ -11,7 +12,7 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2013 Los Alamos National Security, LLC. + * Copyright (c) 2010-2015 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2010-2012 IBM Corporation. All rights reserved. * $COPYRIGHT$ @@ -126,7 +127,7 @@ typedef struct mca_btl_sm_mem_node_t { * Shared Memory (SM) BTL module. 
*/ struct mca_btl_sm_component_t { - mca_btl_base_component_2_0_0_t super; /**< base BTL component */ + mca_btl_base_component_3_0_0_t super; /**< base BTL component */ int sm_free_list_num; /**< initial size of free lists */ int sm_free_list_max; /**< maximum size of free lists */ int sm_free_list_inc; /**< number of elements to alloc when growing free lists */ @@ -182,6 +183,10 @@ struct mca_btl_sm_component_t { #if OPAL_BTL_SM_HAVE_KNEM /* Knem capabilities info */ struct knem_cmd_info knem_info; +#endif +#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA + /** registration handles to hold knem cookies */ + ompi_free_list_t registration_handles; #endif /* OPAL_BTL_SM_HAVE_KNEM */ /** MCA: should we be using knem or not? neg=try but continue if @@ -461,7 +466,6 @@ extern int mca_btl_sm_free( struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - mca_mpool_base_registration_t* registration, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, @@ -504,30 +508,20 @@ extern int mca_btl_sm_send( /* * Synchronous knem/cma get */ -extern int mca_btl_sm_get_sync( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_btl_base_descriptor_t* des ); - -extern struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_dst( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags); +int mca_btl_sm_get_sync (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); #endif /* OPAL_BTL_SM_HAVE_KNEM || 
OPAL_BTL_SM_HAVE_CMA */ #if OPAL_BTL_SM_HAVE_KNEM /* * Asynchronous knem get */ -extern int mca_btl_sm_get_async( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_btl_base_descriptor_t* des ); +int mca_btl_sm_get_async (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); #endif /* OPAL_BTL_SM_HAVE_KNEM */ @@ -558,6 +552,31 @@ void mca_btl_sm_component_event_thread(opal_object_t*); #define MCA_BTL_SM_SIGNAL_PEER(peer) #endif +#if OPAL_BTL_SM_HAVE_KNEM | OPAL_BTL_SM_HAVE_CMA +struct mca_btl_base_registration_handle_t { + union { + struct { + uint64_t cookie; + intptr_t base_addr; + } knem; + pid_t pid; + } data; +}; + +struct mca_btl_sm_registration_handle_t { + ompi_free_list_item_t super; + mca_btl_base_registration_handle_t btl_handle; +}; +typedef struct mca_btl_sm_registration_handle_t mca_btl_sm_registration_handle_t; + +mca_btl_base_registration_handle_t *mca_btl_sm_register_mem (struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + void *base, size_t size, uint32_t flags); + +int mca_btl_sm_deregister_mem (struct mca_btl_base_module_t* btl, mca_btl_base_registration_handle_t *handle); + +#endif + END_C_DECLS #endif diff --git a/opal/mca/btl/sm/btl_sm_component.c b/opal/mca/btl/sm/btl_sm_component.c index 02e65491d1..ad5a2585c0 100644 --- a/opal/mca/btl/sm/btl_sm_component.c +++ b/opal/mca/btl/sm/btl_sm_component.c @@ -67,6 +67,10 @@ #include "opal/mca/common/cuda/common_cuda.h" #endif /* OPAL_CUDA_SUPPORT */ +#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA +static OBJ_CLASS_INSTANCE(mca_btl_sm_registration_handle_t, ompi_free_list_item_t, NULL, NULL); +#endif + static int mca_btl_sm_component_open(void); static 
int mca_btl_sm_component_close(void); static int sm_register(void); @@ -251,10 +255,13 @@ static int sm_register(void) mca_btl_sm.super.btl_rdma_pipeline_frag_size = 64*1024; mca_btl_sm.super.btl_min_rdma_pipeline_size = 64*1024; mca_btl_sm.super.btl_flags = MCA_BTL_FLAGS_SEND; - mca_btl_sm.super.btl_seg_size = sizeof (mca_btl_sm_segment_t); mca_btl_sm.super.btl_bandwidth = 9000; /* Mbs */ mca_btl_sm.super.btl_latency = 1; /* Microsecs */ +#if OPAL_BTL_SM_HAVE_KNEM + mca_btl_sm.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t); +#endif + /* Call the BTL based to register its MCA params */ mca_btl_base_param_register(&mca_btl_sm_component.super.btl_version, &mca_btl_sm.super); @@ -295,6 +302,11 @@ static int mca_btl_sm_component_open(void) OBJ_CONSTRUCT(&mca_btl_sm_component.pending_send_fl, opal_free_list_t); mca_btl_sm_component.sm_seg = NULL; + +#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA + OBJ_CONSTRUCT(&mca_btl_sm_component.registration_handles, ompi_free_list_t); +#endif + #if OPAL_BTL_SM_HAVE_KNEM mca_btl_sm.knem_fd = -1; mca_btl_sm.knem_status_array = NULL; @@ -332,6 +344,10 @@ static int mca_btl_sm_component_close(void) } #endif /* OPAL_BTL_SM_HAVE_KNEM */ +#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA + OBJ_DESTRUCT(&mca_btl_sm_component.registration_handles); +#endif + OBJ_DESTRUCT(&mca_btl_sm_component.sm_lock); /** * We don't have to destroy the fragment lists. 
They are allocated @@ -904,6 +920,9 @@ mca_btl_sm_component_init(int *num_btls, } else { mca_btl_sm.super.btl_get = mca_btl_sm_get_sync; } + + mca_btl_sm.super.btl_register_mem = mca_btl_sm_register_mem; + mca_btl_sm.super.btl_deregister_mem = mca_btl_sm_deregister_mem; } #else /* If the user explicitly asked for knem and we can't provide it, @@ -918,6 +937,8 @@ mca_btl_sm_component_init(int *num_btls, /* Will only ever have either cma or knem enabled at runtime so no problems with accidentally overwriting this set earlier */ mca_btl_sm.super.btl_get = mca_btl_sm_get_sync; + mca_btl_sm.super.btl_register_mem = mca_btl_sm_register_mem; + mca_btl_sm.super.btl_deregister_mem = mca_btl_sm_deregister_mem; } #else /* If the user explicitly asked for CMA and we can't provide itm @@ -931,6 +952,21 @@ mca_btl_sm_component_init(int *num_btls, } #endif /* OPAL_BTL_SM_HAVE_CMA */ +#if OPAL_BTL_SM_HAVE_KNEM | OPAL_BTL_SM_HAVE_CMA + if (mca_btl_sm_component.use_cma || mca_btl_sm_component.use_knem) { + rc = ompi_free_list_init_new (&mca_btl_sm_component.registration_handles, + sizeof (mca_btl_sm_registration_handle_t), + 8, OBJ_CLASS(mca_btl_sm_registration_handle_t), + 0, 0, mca_btl_sm_component.sm_free_list_num, + mca_btl_sm_component.sm_free_list_max, + mca_btl_sm_component.sm_free_list_inc, NULL); + if (OPAL_SUCCESS != rc) { + free (btls); + return NULL; + } + } +#endif + return btls; no_knem: @@ -963,6 +999,7 @@ mca_btl_sm_component_init(int *num_btls, /* disable get when not using knem or cma */ mca_btl_sm.super.btl_get = NULL; mca_btl_sm.super.btl_flags &= ~MCA_BTL_FLAGS_GET; + mca_btl_sm_component.use_knem = 0; } /* Otherwise, use_knem was 0 (and we didn't get here) or use_knem @@ -1090,8 +1127,8 @@ int mca_btl_sm_component_progress(void) reg = mca_btl_base_active_message_trigger + hdr->tag; seg.seg_addr.pval = ((char *)hdr) + sizeof(mca_btl_sm_hdr_t); seg.seg_len = hdr->len; - Frag.base.des_local_count = 1; - Frag.base.des_local = &seg; + Frag.base.des_segment_count = 
1; + Frag.base.des_segments = &seg; reg->cbfunc(&mca_btl_sm.super, hdr->tag, &(Frag.base), reg->cbdata); /* return the fragment */ @@ -1176,22 +1213,14 @@ int mca_btl_sm_component_progress(void) mca_btl_sm.knem_status_array[mca_btl_sm.knem_status_first_used]) { if (KNEM_STATUS_SUCCESS == mca_btl_sm.knem_status_array[mca_btl_sm.knem_status_first_used]) { - int btl_ownership; /* Handle the completed fragment */ frag = mca_btl_sm.knem_frag_array[mca_btl_sm.knem_status_first_used]; - btl_ownership = (frag->base.des_flags & - MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); - if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & - frag->base.des_flags)) { - frag->base.des_cbfunc(&mca_btl_sm.super, - frag->endpoint, &frag->base, - OPAL_SUCCESS); - } - if (btl_ownership) { - MCA_BTL_SM_FRAG_RETURN(frag); - } + frag->cb.func (&mca_btl_sm.super, frag->endpoint, + frag->cb.local_address, frag->cb.local_handle, + frag->cb.context, frag->cb.data, OPAL_SUCCESS); + MCA_BTL_SM_FRAG_RETURN(frag); /* Bump counters, loop around the circular buffer if necessary */ diff --git a/opal/mca/btl/sm/btl_sm_endpoint.h b/opal/mca/btl/sm/btl_sm_endpoint.h index 5e32510a67..04708dc856 100644 --- a/opal/mca/btl/sm/btl_sm_endpoint.h +++ b/opal/mca/btl/sm/btl_sm_endpoint.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. + * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow diff --git a/opal/mca/btl/sm/btl_sm_frag.c b/opal/mca/btl/sm/btl_sm_frag.c index 3360e3fba3..0e84617327 100644 --- a/opal/mca/btl/sm/btl_sm_frag.c +++ b/opal/mca/btl/sm/btl_sm_frag.c @@ -31,8 +31,8 @@ static inline void mca_btl_sm_frag_common_constructor(mca_btl_sm_frag_t* frag) frag->hdr->my_smp_rank = mca_btl_sm_component.my_smp_rank; } frag->segment.base.seg_len = frag->size; - frag->base.des_local = &frag->segment.base; - frag->base.des_local_count = 1; + frag->base.des_segments = &frag->segment.base; + frag->base.des_segment_count = 1; frag->base.des_flags = 0; } diff --git a/opal/mca/btl/sm/btl_sm_frag.h b/opal/mca/btl/sm/btl_sm_frag.h index 3dde48c802..424de6a7fb 100644 --- a/opal/mca/btl/sm/btl_sm_frag.h +++ b/opal/mca/btl/sm/btl_sm_frag.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -11,6 +12,8 @@ * All rights reserved. * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -64,6 +67,16 @@ struct mca_btl_sm_frag_t { /* pointer written to the FIFO, this is the base of the shared memory region */ mca_btl_sm_hdr_t *hdr; ompi_free_list_t* my_list; +#if OPAL_BTL_SM_HAVE_KNEM + /* rdma callback data. 
required for async get */ + struct { + mca_btl_base_rdma_completion_fn_t func; + void *local_address; + struct mca_btl_base_registration_handle_t *local_handle; + void *context; + void *data; + } cb; +#endif }; typedef struct mca_btl_sm_frag_t mca_btl_sm_frag_t; typedef struct mca_btl_sm_frag_t mca_btl_sm_frag1_t; diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 67f019b70b..196f2dfe74 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2014 Los Alamos National Security, LLC. + * Copyright (c) 2010-2015 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2012-2014 NVIDIA Corporation. All rights reserved. * Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved. @@ -71,6 +71,15 @@ #include "btl_smcuda_frag.h" #include "btl_smcuda_fifo.h" +#if OPAL_CUDA_SUPPORT +static struct mca_btl_base_registration_handle_t *mca_btl_smcuda_register_mem ( + struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base, + size_t size, uint32_t flags); + +static int mca_btl_smcuda_deregister_mem (struct mca_btl_base_module_t* btl, + struct mca_btl_base_registration_handle_t *handle); +#endif + mca_btl_smcuda_t mca_btl_smcuda = { .super = { .btl_component = &mca_btl_smcuda_component.super, @@ -80,9 +89,10 @@ mca_btl_smcuda_t mca_btl_smcuda = { .btl_alloc = mca_btl_smcuda_alloc, .btl_free = mca_btl_smcuda_free, .btl_prepare_src = mca_btl_smcuda_prepare_src, -#if OPAL_CUDA_SUPPORT || OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA - .btl_prepare_dst = mca_btl_smcuda_prepare_dst, -#endif /* OPAL_CUDA_SUPPORT || OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA */ +#if OPAL_CUDA_SUPPORT + .btl_register_mem = mca_btl_smcuda_register_mem, + .btl_deregister_mem = 
mca_btl_smcuda_deregister_mem, +#endif /* OPAL_CUDA_SUPPORT */ .btl_send = mca_btl_smcuda_send, .btl_sendi = mca_btl_smcuda_sendi, .btl_dump = mca_btl_smcuda_dump, @@ -741,7 +751,7 @@ extern mca_btl_base_descriptor_t* mca_btl_smcuda_alloc( } if (OPAL_LIKELY(frag != NULL)) { - frag->segment.base.seg_len = size; + frag->segment.seg_len = size; frag->base.des_flags = flags; } return (mca_btl_base_descriptor_t*)frag; @@ -772,7 +782,6 @@ extern int mca_btl_smcuda_free( struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - mca_mpool_base_registration_t* registration, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, @@ -784,68 +793,33 @@ struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src( uint32_t iov_count = 1; size_t max_data = *size; int rc; -#if OPAL_CUDA_SUPPORT - if (0 != reserve) { -#endif /* OPAL_CUDA_SUPPORT */ - if ( reserve + max_data <= mca_btl_smcuda_component.eager_limit ) { - MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag); - } else { - MCA_BTL_SMCUDA_FRAG_ALLOC_MAX(frag); - } - if( OPAL_UNLIKELY(NULL == frag) ) { - return NULL; - } - if( OPAL_UNLIKELY(reserve + max_data > frag->size) ) { - max_data = frag->size - reserve; - } - iov.iov_len = max_data; - iov.iov_base = - (IOVBASE_TYPE*)(((unsigned char*)(frag->segment.base.seg_addr.pval)) + reserve); - - rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); - if( OPAL_UNLIKELY(rc < 0) ) { - MCA_BTL_SMCUDA_FRAG_RETURN(frag); - return NULL; - } - frag->segment.base.seg_len = reserve + max_data; -#if OPAL_CUDA_SUPPORT + if ( reserve + max_data <= mca_btl_smcuda_component.eager_limit ) { + MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag); } else { - /* Normally, we are here because we have a GPU buffer and we are preparing - * to send it. However, we can also be there because we have received a - * PUT message because we are trying to send a host buffer. 
Therefore, - * we need to again check to make sure buffer is GPU. If not, then return - * NULL. We can just check the convertor since we have that. */ - if (!(convertor->flags & CONVERTOR_CUDA)) { - return NULL; - } - - MCA_BTL_SMCUDA_FRAG_ALLOC_USER(frag); - if( OPAL_UNLIKELY(NULL == frag) ) { - return NULL; - } - iov.iov_len = max_data; - iov.iov_base = NULL; - rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data); - if( OPAL_UNLIKELY(rc < 0) ) { - MCA_BTL_SMCUDA_FRAG_RETURN(frag); - return NULL; - } - frag->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) iov.iov_base; - frag->segment.base.seg_len = max_data; - memcpy(frag->segment.key, ((mca_mpool_common_cuda_reg_t *)registration)->memHandle, - sizeof(((mca_mpool_common_cuda_reg_t *)registration)->memHandle) + - sizeof(((mca_mpool_common_cuda_reg_t *)registration)->evtHandle)); - frag->segment.memh_seg_addr.pval = registration->base; - frag->segment.memh_seg_len = registration->bound - registration->base + 1; - + MCA_BTL_SMCUDA_FRAG_ALLOC_MAX(frag); } -#endif /* OPAL_CUDA_SUPPORT */ - frag->base.des_local = &(frag->segment.base); - frag->base.des_local_count = 1; + if( OPAL_UNLIKELY(NULL == frag) ) { + return NULL; + } + + if( OPAL_UNLIKELY(reserve + max_data > frag->size) ) { + max_data = frag->size - reserve; + } + iov.iov_len = max_data; + iov.iov_base = + (IOVBASE_TYPE*)(((unsigned char*)(frag->segment.seg_addr.pval)) + reserve); + + rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + if( OPAL_UNLIKELY(rc < 0) ) { + MCA_BTL_SMCUDA_FRAG_RETURN(frag); + return NULL; + } + + frag->segment.seg_len = reserve + max_data; + frag->base.des_segments = &frag->segment; + frag->base.des_segment_count = 1; frag->base.order = MCA_BTL_NO_ORDER; - frag->base.des_remote = NULL; - frag->base.des_remote_count = 0; frag->base.des_flags = flags; *size = max_data; return &frag->base; @@ -854,8 +828,8 @@ struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src( #if 0 #define 
MCA_BTL_SMCUDA_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(sm_frag) \ do { \ - char* _memory = (char*)(sm_frag)->segment.base.seg_addr.pval + \ - (sm_frag)->segment.base.seg_len; \ + char* _memory = (char*)(sm_frag)->segment.seg_addr.pval + \ + (sm_frag)->segment.seg_len; \ int* _intmem; \ size_t align = (intptr_t)_memory & 0xFUL; \ switch( align & 0x3 ) { \ @@ -926,7 +900,7 @@ int mca_btl_smcuda_sendi( struct mca_btl_base_module_t* btl, } /* fill in fragment fields */ - frag->segment.base.seg_len = length; + frag->segment.seg_len = length; frag->hdr->len = length; assert( 0 == (flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) ); frag->base.des_flags = flags | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; /* why do any flags matter here other than OWNERSHIP? */ @@ -934,7 +908,7 @@ int mca_btl_smcuda_sendi( struct mca_btl_base_module_t* btl, frag->endpoint = endpoint; /* write the match header (with MPI comm/tag/etc. info) */ - memcpy( frag->segment.base.seg_addr.pval, header, header_size ); + memcpy( frag->segment.seg_addr.pval, header, header_size ); /* write the message data if there is any */ /* @@ -945,7 +919,7 @@ int mca_btl_smcuda_sendi( struct mca_btl_base_module_t* btl, struct iovec iov; uint32_t iov_count; /* pack the data into the supplied buffer */ - iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)frag->segment.base.seg_addr.pval + header_size); + iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)frag->segment.seg_addr.pval + header_size); iov.iov_len = max_data = payload_size; iov_count = 1; @@ -1000,7 +974,7 @@ int mca_btl_smcuda_send( struct mca_btl_base_module_t* btl, #endif /* OPAL_CUDA_SUPPORT */ /* available header space */ - frag->hdr->len = frag->segment.base.seg_len; + frag->hdr->len = frag->segment.seg_len; /* type of message, pt-2-pt, one-sided, etc */ frag->hdr->tag = tag; @@ -1024,65 +998,76 @@ int mca_btl_smcuda_send( struct mca_btl_base_module_t* btl, */ return 0; } + #if OPAL_CUDA_SUPPORT -struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_dst( - struct 
mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags) +static struct mca_btl_base_registration_handle_t *mca_btl_smcuda_register_mem ( + struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base, + size_t size, uint32_t flags) { - void *ptr; - mca_btl_smcuda_frag_t* frag; + mca_mpool_common_cuda_reg_t *reg; + int mpool_flags = 0; - /* Only support GPU buffers */ - if (!(convertor->flags & CONVERTOR_CUDA)) { + if (MCA_BTL_REG_FLAG_CUDA_GPU_MEM & flags) { + mpool_flags |= MCA_MPOOL_FLAGS_CUDA_GPU_MEM; + } + + btl->btl_mpool->mpool_register (btl->btl_mpool, base, size, mpool_flags, + (mca_mpool_base_registration_t **) &reg); + if (OPAL_UNLIKELY(NULL == reg)) { return NULL; } - MCA_BTL_SMCUDA_FRAG_ALLOC_USER(frag); - if(OPAL_UNLIKELY(NULL == frag)) { - return NULL; - } - - frag->segment.base.seg_len = *size; - opal_convertor_get_current_pointer( convertor, &ptr ); - frag->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) ptr; - - frag->base.des_remote = NULL; - frag->base.des_remote_count = 0; - frag->base.des_local = &frag->segment.base; - frag->base.des_local_count = 1; - frag->base.des_flags = flags; - return &frag->base; + return (mca_btl_base_registration_handle_t *) &reg->data; } -#endif /* OPAL_CUDA_SUPPORT */ - -#if OPAL_CUDA_SUPPORT -int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* descriptor) +static int mca_btl_smcuda_deregister_mem (struct mca_btl_base_module_t* btl, + struct mca_btl_base_registration_handle_t *handle) +{ + mca_mpool_common_cuda_reg_t *reg = (mca_mpool_common_cuda_reg_t *) + ((intptr_t) handle - offsetof (mca_mpool_common_cuda_reg_t, data)); + + btl->btl_mpool->mpool_deregister (btl->btl_mpool, &reg->base); + + return OPAL_SUCCESS; +} + +int
mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *ep, void *local_address, + uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_smcuda_segment_t *src_seg = (mca_btl_smcuda_segment_t *) descriptor->des_remote; - mca_btl_smcuda_segment_t *dst_seg = (mca_btl_smcuda_segment_t *) descriptor->des_local; mca_mpool_common_cuda_reg_t rget_reg; mca_mpool_common_cuda_reg_t *reg_ptr = &rget_reg; - int btl_ownership; int rc, done; void *remote_memory_address; size_t offset; - mca_btl_smcuda_frag_t* frag = (mca_btl_smcuda_frag_t*)descriptor; + mca_btl_smcuda_frag_t *frag; + + /* NTH: copied from old prepare_dst function */ + MCA_BTL_SMCUDA_FRAG_ALLOC_USER(frag); + if(OPAL_UNLIKELY(NULL == frag)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + /* shove all the info needed for completion callbacks into the fragment */ + frag->segment.seg_len = size; + frag->segment.seg_addr.pval = local_address; + frag->base.des_segments = &frag->segment; + frag->base.des_segment_count = 1; + frag->base.des_flags = flags; + frag->base.des_cbfunc = (mca_btl_base_completion_fn_t) cbfunc; + frag->base.des_cbdata = cbdata; + frag->base.des_context = cbcontext; + frag->local_handle = local_handle; /* Set to 0 for debugging since it is a list item but I am not * intializing it properly and it is annoying to see all the * garbage in the debugger. */ memset(&rget_reg, 0, sizeof(rget_reg)); - memcpy(&rget_reg.memHandle, src_seg->key, sizeof(src_seg->key)); + memcpy(&rget_reg.data.memHandle, remote_handle->reg_data.memHandle, + sizeof(remote_handle->reg_data.memHandle)); /* Open the memory handle to the remote memory. If it is cached, then * we just retrieve it from cache and avoid a call to open the handle. 
That @@ -1091,8 +1076,8 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t* btl, * remote memory which may lie somewhere in the middle. This is taken care of * a few lines down. Note that we hand in the peer rank just for debugging * support. */ - rc = ep->mpool->mpool_register(ep->mpool, src_seg->memh_seg_addr.pval, - src_seg->memh_seg_len, ep->peer_smp_rank, + rc = ep->mpool->mpool_register(ep->mpool, remote_handle->reg_data.memh_seg_addr.pval, + remote_handle->reg_data.memh_seg_len, ep->peer_smp_rank, (mca_mpool_base_registration_t **)&reg_ptr); if (OPAL_SUCCESS != rc) { @@ -1107,7 +1092,7 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t* btl, * not equal the address that was used to retrieve the block. * Therefore, compute the offset and add it to the address of the * memory handle. */ - offset = (unsigned char *)src_seg->base.seg_addr.lval - reg_ptr->base.base; + offset = (size_t) ((intptr_t) remote_address - (intptr_t) reg_ptr->base.base); remote_memory_address = (unsigned char *)reg_ptr->base.alloc_base + offset; if (0 != offset) { opal_output(-1, "OFFSET=%d", (int)offset); @@ -1120,8 +1105,7 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t* btl, * rget_reg, not reg_ptr, as we do not cache the event. */ mca_common_wait_stream_synchronize(&rget_reg); - rc = mca_common_cuda_memcpy((void *)(uintptr_t) dst_seg->base.seg_addr.lval, - remote_memory_address, dst_seg->base.seg_len, + rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size, "mca_btl_smcuda_get", (mca_btl_base_descriptor_t *)frag, &done); if (OPAL_SUCCESS != rc) { @@ -1133,17 +1117,8 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t* btl, } if (OPAL_UNLIKELY(1 == done)) { - /* This should only be true when experimenting with synchronous copies.
*/ - btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); - if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) { - frag->base.des_cbfunc(&mca_btl_smcuda.super, - frag->endpoint, &frag->base, - OPAL_SUCCESS); - } - - if (btl_ownership) { - mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag); - } + cbfunc (btl, ep, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); + mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag); } return OPAL_SUCCESS; @@ -1208,7 +1183,7 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t* b frag->endpoint = endpoint; ctrlhdr.ctag = IPC_REQ; ctrlhdr.cudev = mydevnum; - memcpy(frag->segment.base.seg_addr.pval, &ctrlhdr, sizeof(struct ctrlhdr_st)); + memcpy(frag->segment.seg_addr.pval, &ctrlhdr, sizeof(struct ctrlhdr_st)); MCA_BTL_SMCUDA_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(frag); /* write the fragment pointer to the FIFO */ diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 7327177e75..edbe681efc 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology @@ -11,7 +12,7 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010 Los Alamos National Security, LLC. + * Copyright (c) 2010-2015 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved. 
* $COPYRIGHT$ @@ -438,7 +439,6 @@ extern int mca_btl_smcuda_free( struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - mca_mpool_base_registration_t* registration, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, @@ -481,19 +481,11 @@ extern int mca_btl_smcuda_send( /** * Remote get using device memory. */ -extern int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* descriptor); - -extern struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_dst( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags); +int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *ep, void *local_address, + uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); /* CUDA IPC control message tags */ enum ipcCtrlMsg { diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index c5e6fd0677..3f3d0f42ae 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -180,7 +180,7 @@ static int smcuda_register(void) mca_btl_smcuda.super.btl_rdma_pipeline_frag_size = 64*1024; mca_btl_smcuda.super.btl_min_rdma_pipeline_size = 64*1024; mca_btl_smcuda.super.btl_flags = MCA_BTL_FLAGS_SEND; - mca_btl_smcuda.super.btl_seg_size = sizeof (mca_btl_smcuda_segment_t); + mca_btl_smcuda.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t); mca_btl_smcuda.super.btl_bandwidth = 9000; /* 
Mbs */ mca_btl_smcuda.super.btl_latency = 1; /* Microsecs */ @@ -655,7 +655,7 @@ static void mca_btl_smcuda_send_cuda_ipc_ack(struct mca_btl_base_module_t* btl, frag->hdr->tag = MCA_BTL_TAG_SMCUDA; frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; frag->endpoint = endpoint; - memcpy(frag->segment.base.seg_addr.pval, &ctrlhdr, sizeof(struct ctrlhdr_st)); + memcpy(frag->segment.seg_addr.pval, &ctrlhdr, sizeof(struct ctrlhdr_st)); /* write the fragment pointer to the FIFO */ /* @@ -691,7 +691,7 @@ static void btl_smcuda_control(mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint; mca_btl_smcuda_t *smcuda_btl = (mca_btl_smcuda_t *)btl; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; - mca_btl_base_segment_t* segments = des->des_local; + mca_btl_base_segment_t* segments = des->des_segments; /* Use the rank of the peer that sent the data to get to the endpoint * structure. This is needed for PML callback. */ @@ -1065,8 +1065,8 @@ int mca_btl_smcuda_component_progress(void) reg = mca_btl_base_active_message_trigger + hdr->tag; seg.seg_addr.pval = ((char *)hdr) + sizeof(mca_btl_smcuda_hdr_t); seg.seg_len = hdr->len; - Frag.base.des_local_count = 1; - Frag.base.des_local = &seg; + Frag.base.des_segment_count = 1; + Frag.base.des_segments = &seg; #if OPAL_CUDA_SUPPORT Frag.hdr = hdr; /* needed for peer rank in control messages */ #endif /* OPAL_CUDA_SUPPORT */ @@ -1134,20 +1134,16 @@ int mca_btl_smcuda_component_progress(void) * completed. If so, issue the PML callbacks on the fragments. 
*/ while (1 == progress_one_cuda_ipc_event((mca_btl_base_descriptor_t **)&frag)) { - int btl_ownership; - btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); - if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) { - frag->base.des_cbfunc(&mca_btl_smcuda.super, - frag->endpoint, &frag->base, - OPAL_SUCCESS); - } + mca_btl_base_rdma_completion_fn_t cbfunc = (mca_btl_base_rdma_completion_fn_t) frag->base.des_cbfunc; - if (btl_ownership) { - if(frag->registration != NULL) { - frag->endpoint->mpool->mpool_deregister(frag->endpoint->mpool, - (mca_mpool_base_registration_t*)frag->registration); - frag->registration = NULL; - } + cbfunc (&mca_btl_smcuda.super, frag->endpoint, frag->segment.seg_addr.pval, + frag->local_handle, frag->base.des_context, frag->base.des_cbdata, + OPAL_SUCCESS); + + if(frag->registration != NULL) { + frag->endpoint->mpool->mpool_deregister(frag->endpoint->mpool, + (mca_mpool_base_registration_t*)frag->registration); + frag->registration = NULL; MCA_BTL_SMCUDA_FRAG_RETURN(frag); } nevents++; diff --git a/opal/mca/btl/smcuda/btl_smcuda_frag.c b/opal/mca/btl/smcuda/btl_smcuda_frag.c index 79b08ada6d..3f28e7cdba 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_frag.c +++ b/opal/mca/btl/smcuda/btl_smcuda_frag.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -11,6 +12,8 @@ * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -27,13 +30,13 @@ static inline void mca_btl_smcuda_frag_common_constructor(mca_btl_smcuda_frag_t* if(frag->hdr != NULL) { frag->hdr->frag = (mca_btl_smcuda_frag_t*)((uintptr_t)frag | MCA_BTL_SMCUDA_FRAG_ACK); - frag->segment.base.seg_addr.pval = ((char*)frag->hdr) + + frag->segment.seg_addr.pval = ((char*)frag->hdr) + sizeof(mca_btl_smcuda_hdr_t); frag->hdr->my_smp_rank = mca_btl_smcuda_component.my_smp_rank; } - frag->segment.base.seg_len = frag->size; - frag->base.des_local = &frag->segment.base; - frag->base.des_local_count = 1; + frag->segment.seg_len = frag->size; + frag->base.des_segments = &frag->segment; + frag->base.des_segment_count = 1; frag->base.des_flags = 0; #if OPAL_CUDA_SUPPORT frag->registration = NULL; diff --git a/opal/mca/btl/smcuda/btl_smcuda_frag.h b/opal/mca/btl/smcuda/btl_smcuda_frag.h index 54a52d6197..7ed50df079 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_frag.h +++ b/opal/mca/btl/smcuda/btl_smcuda_frag.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -12,6 +13,8 @@ * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -27,6 +30,9 @@ #include "opal_config.h" #include "btl_smcuda.h" +#if OPAL_CUDA_SUPPORT +#include "opal/mca/common/cuda/common_cuda.h" +#endif #define MCA_BTL_SMCUDA_FRAG_TYPE_MASK ((uintptr_t)0x3) #define MCA_BTL_SMCUDA_FRAG_SEND ((uintptr_t)0x0) @@ -46,6 +52,12 @@ struct mca_btl_smcuda_hdr_t { }; typedef struct mca_btl_smcuda_hdr_t mca_btl_smcuda_hdr_t; +#if OPAL_CUDA_SUPPORT +struct mca_btl_base_registration_handle_t { + mca_mpool_common_cuda_reg_data_t reg_data; +}; +#endif + struct mca_btl_smcuda_segment_t { mca_btl_base_segment_t base; #if OPAL_CUDA_SUPPORT @@ -63,10 +75,11 @@ typedef struct mca_btl_smcuda_segment_t mca_btl_smcuda_segment_t; */ struct mca_btl_smcuda_frag_t { mca_btl_base_descriptor_t base; - mca_btl_smcuda_segment_t segment; + mca_btl_base_segment_t segment; struct mca_btl_base_endpoint_t *endpoint; #if OPAL_CUDA_SUPPORT struct mca_mpool_base_registration_t *registration; + struct mca_btl_base_registration_handle_t *local_handle; #endif /* OPAL_CUDA_SUPPORT */ size_t size; /* pointer written to the FIFO, this is the base of the shared memory region */ diff --git a/opal/mca/btl/tcp/btl_tcp.c b/opal/mca/btl/tcp/btl_tcp.c index 0b41afa7b5..9541451619 100644 --- a/opal/mca/btl/tcp/btl_tcp.c +++ b/opal/mca/btl/tcp/btl_tcp.c @@ -42,7 +42,6 @@ mca_btl_tcp_module_t mca_btl_tcp_module = { .btl_alloc = mca_btl_tcp_alloc, .btl_free = mca_btl_tcp_free, .btl_prepare_src = mca_btl_tcp_prepare_src, - .btl_prepare_dst = mca_btl_tcp_prepare_dst, .btl_send = mca_btl_tcp_send, .btl_put = mca_btl_tcp_put, .btl_dump = mca_btl_base_dump, @@ -170,8 +169,8 @@ mca_btl_base_descriptor_t* mca_btl_tcp_alloc( frag->segments[0].seg_len = size; frag->segments[0].seg_addr.pval = frag+1; - frag->base.des_local = frag->segments; - frag->base.des_local_count = 1; + frag->base.des_segments = frag->segments; + frag->base.des_segment_count = 1; frag->base.des_flags = flags; frag->base.order = MCA_BTL_NO_ORDER; frag->btl = 
(mca_btl_tcp_module_t*)btl; @@ -202,7 +201,6 @@ int mca_btl_tcp_free( mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - struct mca_mpool_base_registration_t* registration, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, @@ -238,7 +236,7 @@ mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src( frag->segments[0].seg_addr.pval = (frag + 1); frag->segments[0].seg_len = reserve; - frag->base.des_local_count = 1; + frag->base.des_segment_count = 1; if(opal_convertor_need_buffers(convertor)) { if (max_data + reserve > frag->size) { @@ -268,66 +266,16 @@ mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src( frag->segments[1].seg_addr.pval = iov.iov_base; frag->segments[1].seg_len = max_data; - frag->base.des_local_count = 2; + frag->base.des_segment_count = 2; } - frag->base.des_local = frag->segments; - frag->base.des_remote = NULL; - frag->base.des_remote_count = 0; + frag->base.des_segments = frag->segments; frag->base.des_flags = flags; frag->base.order = MCA_BTL_NO_ORDER; *size = max_data; return &frag->base; } - -/** - * Prepare a descriptor for send/rdma using the supplied - * convertor. If the convertor references data that is contigous, - * the descriptor may simply point to the user buffer. Otherwise, - * this routine is responsible for allocating buffer space and - * packing if required. 
- * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL peer addressing - * @param convertor (IN) Data type convertor - * @param reserve (IN) Additional bytes requested by upper layer to precede user data - * @param size (IN/OUT) Number of bytes to prepare (IN), number of bytes actually prepared (OUT) - */ - -mca_btl_base_descriptor_t* mca_btl_tcp_prepare_dst( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags) -{ - mca_btl_tcp_frag_t* frag; - - if( OPAL_UNLIKELY((*size) > UINT32_MAX) ) { /* limit the size to what we support */ - *size = (size_t)UINT32_MAX; - } - MCA_BTL_TCP_FRAG_ALLOC_USER(frag); - if( OPAL_UNLIKELY(NULL == frag) ) { - return NULL; - } - - frag->segments->seg_len = *size; - opal_convertor_get_current_pointer( convertor, (void**)&(frag->segments->seg_addr.pval) ); - - frag->base.des_remote = NULL; - frag->base.des_remote_count = 0; - frag->base.des_local = frag->segments; - frag->base.des_local_count = 1; - frag->base.des_flags = flags; - frag->base.order = MCA_BTL_NO_ORDER; - return &frag->base; -} - - /** * Initiate an asynchronous send. 
* @@ -355,7 +303,7 @@ int mca_btl_tcp_send( struct mca_btl_base_module_t* btl, frag->iov[0].iov_base = (IOVBASE_TYPE*)&frag->hdr; frag->iov[0].iov_len = sizeof(frag->hdr); frag->hdr.size = 0; - for( i = 0; i < (int)frag->base.des_local_count; i++) { + for( i = 0; i < (int)frag->base.des_segment_count; i++) { frag->hdr.size += frag->segments[i].seg_len; frag->iov[i+1].iov_len = frag->segments[i].seg_len; frag->iov[i+1].iov_base = (IOVBASE_TYPE*)frag->segments[i].seg_addr.pval; @@ -368,23 +316,55 @@ int mca_btl_tcp_send( struct mca_btl_base_module_t* btl, return mca_btl_tcp_endpoint_send(endpoint,frag); } +static void fake_rdma_complete (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, + mca_btl_base_descriptor_t *desc, int rc) +{ + mca_btl_tcp_frag_t *frag = (mca_btl_tcp_frag_t *) desc; + + frag->cb.func (btl, endpoint, frag->segments[0].seg_addr.pval, NULL, frag->cb.context, frag->cb.data, + rc); +} /** * Initiate an asynchronous put. - * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred */ -int mca_btl_tcp_put( mca_btl_base_module_t* btl, - mca_btl_base_endpoint_t* endpoint, - mca_btl_base_descriptor_t* descriptor ) +int mca_btl_tcp_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*) btl; - mca_btl_tcp_frag_t* frag = (mca_btl_tcp_frag_t*)descriptor; + mca_btl_tcp_frag_t *frag = NULL; int i; + MCA_BTL_TCP_FRAG_ALLOC_USER(frag); + if( OPAL_UNLIKELY(NULL == frag) ) { + return OPAL_ERR_OUT_OF_RESOURCE;; + } + + frag->endpoint = endpoint; + + frag->segments->seg_len = size; + frag->segments->seg_addr.pval = 
local_address; + + frag->base.des_segments = frag->segments; + frag->base.des_segment_count = 1; + frag->base.order = MCA_BTL_NO_ORDER; + + frag->segments[0].seg_addr.pval = local_address; + frag->segments[0].seg_len = size; + + frag->segments[1].seg_addr.lval = remote_address; + frag->segments[1].seg_len = size; + + frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK; + frag->base.des_cbfunc = fake_rdma_complete; + + frag->cb.func = cbfunc; + frag->cb.data = cbdata; + frag->cb.context = cbcontext; + frag->btl = tcp_btl; frag->endpoint = endpoint; frag->rc = 0; @@ -394,9 +374,9 @@ int mca_btl_tcp_put( mca_btl_base_module_t* btl, frag->iov_ptr = frag->iov; frag->iov[0].iov_base = (IOVBASE_TYPE*)&frag->hdr; frag->iov[0].iov_len = sizeof(frag->hdr); - frag->iov[1].iov_base = (IOVBASE_TYPE*)frag->base.des_remote; - frag->iov[1].iov_len = frag->base.des_remote_count * sizeof(mca_btl_base_segment_t); - for( i = 0; i < (int)frag->base.des_local_count; i++ ) { + frag->iov[1].iov_base = (IOVBASE_TYPE*) (frag->segments + 1); + frag->iov[1].iov_len = sizeof(mca_btl_base_segment_t); + for( i = 0; i < (int)frag->base.des_segment_count; i++ ) { frag->hdr.size += frag->segments[i].seg_len; frag->iov[i+2].iov_len = frag->segments[i].seg_len; frag->iov[i+2].iov_base = (IOVBASE_TYPE*)frag->segments[i].seg_addr.pval; @@ -404,7 +384,7 @@ int mca_btl_tcp_put( mca_btl_base_module_t* btl, } frag->hdr.base.tag = MCA_BTL_TAG_BTL; frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_PUT; - frag->hdr.count = frag->base.des_remote_count; + frag->hdr.count = 1; if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr); return ((i = mca_btl_tcp_endpoint_send(endpoint,frag)) >= 0 ? OPAL_SUCCESS : i); } @@ -412,22 +392,46 @@ int mca_btl_tcp_put( mca_btl_base_module_t* btl, /** * Initiate an asynchronous get. 
- * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred - * */ -int mca_btl_tcp_get( - mca_btl_base_module_t* btl, - mca_btl_base_endpoint_t* endpoint, - mca_btl_base_descriptor_t* descriptor) +int mca_btl_tcp_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*) btl; - mca_btl_tcp_frag_t* frag = (mca_btl_tcp_frag_t*)descriptor; + mca_btl_tcp_frag_t* frag = NULL; int rc; + MCA_BTL_TCP_FRAG_ALLOC_USER(frag); + if( OPAL_UNLIKELY(NULL == frag) ) { + return OPAL_ERR_OUT_OF_RESOURCE;; + } + + frag->endpoint = endpoint; + + frag->segments->seg_len = size; + frag->segments->seg_addr.pval = local_address; + + frag->base.des_segments = frag->segments; + frag->base.des_segment_count = 1; + frag->base.order = MCA_BTL_NO_ORDER; + + frag->segments[0].seg_addr.pval = local_address; + frag->segments[0].seg_len = size; + + frag->segments[1].seg_addr.lval = remote_address; + frag->segments[1].seg_len = size; + + /* call the rdma callback through the descriptor callback. 
this is + * tcp so the extra latency is not an issue */ + frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK; + frag->base.des_cbfunc = fake_rdma_complete; + + frag->cb.func = cbfunc; + frag->cb.data = cbdata; + frag->cb.context = cbcontext; + frag->btl = tcp_btl; frag->endpoint = endpoint; frag->rc = 0; @@ -437,11 +441,11 @@ int mca_btl_tcp_get( frag->iov_ptr = frag->iov; frag->iov[0].iov_base = (IOVBASE_TYPE*)&frag->hdr; frag->iov[0].iov_len = sizeof(frag->hdr); - frag->iov[1].iov_base = (IOVBASE_TYPE*)frag->base.des_remote; - frag->iov[1].iov_len = frag->base.des_remote_count * sizeof(mca_btl_base_segment_t); + frag->iov[1].iov_base = (IOVBASE_TYPE*) &frag->segments[1]; + frag->iov[1].iov_len = sizeof(mca_btl_base_segment_t); frag->hdr.base.tag = MCA_BTL_TAG_BTL; frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_GET; - frag->hdr.count = frag->base.des_remote_count; + frag->hdr.count = 1; if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr); return ((rc = mca_btl_tcp_endpoint_send(endpoint,frag)) >= 0 ? OPAL_SUCCESS : rc); } diff --git a/opal/mca/btl/tcp/btl_tcp.h b/opal/mca/btl/tcp/btl_tcp.h index 70553ae70d..7b45e08ed0 100644 --- a/opal/mca/btl/tcp/btl_tcp.h +++ b/opal/mca/btl/tcp/btl_tcp.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology @@ -12,6 +13,8 @@ * Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -52,7 +55,7 @@ BEGIN_C_DECLS */ struct mca_btl_tcp_component_t { - mca_btl_base_component_2_0_0_t super; /**< base BTL component */ + mca_btl_base_component_3_0_0_t super; /**< base BTL component */ uint32_t tcp_addr_count; /**< total number of addresses */ uint32_t tcp_num_btls; /**< number of interfaces available to the TCP component */ unsigned int tcp_num_links; /**< number of logical links per physical device */ @@ -217,32 +220,22 @@ extern int mca_btl_tcp_send( /** * Initiate an asynchronous put. - * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred */ -extern int mca_btl_tcp_put( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* btl_peer, - struct mca_btl_base_descriptor_t* decriptor -); +int mca_btl_tcp_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); /** * Initiate an asynchronous get. 
- * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred */ -extern int mca_btl_tcp_get( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* btl_peer, - struct mca_btl_base_descriptor_t* decriptor -); +int mca_btl_tcp_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); /** * Allocate a descriptor with a segment of the requested size. @@ -290,7 +283,6 @@ extern int mca_btl_tcp_free( mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* peer, - struct mca_mpool_base_registration_t*, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, @@ -298,16 +290,6 @@ mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src( uint32_t flags ); -extern mca_btl_base_descriptor_t* mca_btl_tcp_prepare_dst( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* peer, - struct mca_mpool_base_registration_t*, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags); - /** * Fault Tolerance Event Notification Function diff --git a/opal/mca/btl/tcp/btl_tcp_component.c b/opal/mca/btl/tcp/btl_tcp_component.c index 187cb56ad5..8610f98726 100644 --- a/opal/mca/btl/tcp/btl_tcp_component.c +++ b/opal/mca/btl/tcp/btl_tcp_component.c @@ -270,7 +270,7 @@ static int mca_btl_tcp_component_register(void) MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA; - mca_btl_tcp_module.super.btl_seg_size = sizeof (mca_btl_base_segment_t); + mca_btl_tcp_module.super.btl_bandwidth = 100; mca_btl_tcp_module.super.btl_latency = 100; diff --git 
a/opal/mca/btl/tcp/btl_tcp_frag.h b/opal/mca/btl/tcp/btl_tcp_frag.h index 2fa173d690..ab7637b4d7 100644 --- a/opal/mca/btl/tcp/btl_tcp_frag.h +++ b/opal/mca/btl/tcp/btl_tcp_frag.h @@ -58,6 +58,12 @@ struct mca_btl_tcp_frag_t { size_t size; int rc; ompi_free_list_t* my_list; + /* fake rdma completion */ + struct { + mca_btl_base_rdma_completion_fn_t func; + void *data; + void *context; + } cb; }; typedef struct mca_btl_tcp_frag_t mca_btl_tcp_frag_t; OBJ_CLASS_DECLARATION(mca_btl_tcp_frag_t); @@ -116,10 +122,8 @@ do { \ frag->iov_cnt = 1; \ frag->iov_idx = 0; \ frag->iov_ptr = frag->iov; \ - frag->base.des_remote = NULL; \ - frag->base.des_remote_count = 0; \ - frag->base.des_local = frag->segments; \ - frag->base.des_local_count = 1; \ + frag->base.des_segments = frag->segments; \ + frag->base.des_segment_count = 1; \ } while(0) diff --git a/opal/mca/btl/template/btl_template.c b/opal/mca/btl/template/btl_template.c index 7ad3777173..8387c6e239 100644 --- a/opal/mca/btl/template/btl_template.c +++ b/opal/mca/btl/template/btl_template.c @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ * @@ -43,9 +43,11 @@ mca_btl_template_module_t mca_btl_template_module = { .btl_alloc = mca_btl_template_alloc, .btl_free = mca_btl_template_free, .btl_prepare_src = mca_btl_template_prepare_src, - .btl_prepare_dst = mca_btl_template_prepare_dst, .btl_send = mca_btl_template_send, .btl_put = mca_btl_template_put, + .btl_get = mca_btl_template_get, + .btl_register_mem = mca_btl_template_register_mem, + .btl_deregister_mem = mca_btl_template_deregister_mem, .btl_ft_event = mca_btl_template_ft_event } }; @@ -206,7 +208,6 @@ int mca_btl_template_free( mca_btl_base_descriptor_t* mca_btl_template_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - struct mca_mpool_base_registration_t* registration, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, @@ -270,49 +271,8 @@ mca_btl_base_descriptor_t* mca_btl_template_prepare_src( frag->segment.seg_len = max_data + reserve; } - frag->base.des_local = &frag->segment; - frag->base.des_local_count = 1; - frag->base.des_flags = 0; - return &frag->base; -} - - -/** - * Prepare a descriptor for send/rdma using the supplied - * convertor. If the convertor references data that is contigous, - * the descriptor may simply point to the user buffer. Otherwise, - * this routine is responsible for allocating buffer space and - * packing if required. 
- * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL peer addressing - * @param convertor (IN) Data type convertor - * @param reserve (IN) Additional bytes requested by upper layer to precede user data - * @param size (IN/OUT) Number of bytes to prepare (IN), number of bytes actually prepared (OUT) - */ - -mca_btl_base_descriptor_t* mca_btl_template_prepare_dst( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags) -{ - mca_btl_template_frag_t* frag; - - MCA_BTL_TEMPLATE_FRAG_ALLOC_USER(btl, frag); - if(OPAL_UNLIKELY(NULL == frag)) { - return NULL; - } - - frag->segment.seg_len = *size; - opal_convertor_get_current_pointer( convertor, (void**)&(frag->segment.seg_addr.pval) ); - - frag->base.des_local = &frag->segment; - frag->base.des_local_count = 1; + frag->base.des_segments = &frag->segment; + frag->base.des_segment_count = 1; frag->base.des_flags = 0; return &frag->base; } @@ -350,14 +310,13 @@ int mca_btl_template_send( * @param descriptor (IN) Description of the data to be transferred */ -int mca_btl_template_put( - mca_btl_base_module_t* btl, - mca_btl_base_endpoint_t* endpoint, - mca_btl_base_descriptor_t* descriptor) +int mca_btl_template_put (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { /* mca_btl_template_module_t* template_btl = (mca_btl_template_module_t*) btl; */ - mca_btl_template_frag_t* frag = (mca_btl_template_frag_t*) descriptor; - frag->endpoint = endpoint; /* TODO */ return OPAL_ERR_NOT_IMPLEMENTED; } @@ -372,18 +331,64 @@ int 
mca_btl_template_put( * */ -int mca_btl_template_get( - mca_btl_base_module_t* btl, - mca_btl_base_endpoint_t* endpoint, - mca_btl_base_descriptor_t* descriptor) +int mca_btl_template_get (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { /* mca_btl_template_module_t* template_btl = (mca_btl_template_module_t*) btl; */ - mca_btl_template_frag_t* frag = (mca_btl_template_frag_t*) descriptor; - frag->endpoint = endpoint; /* TODO */ return OPAL_ERR_NOT_IMPLEMENTED; } +/** + * @brief Register a memory region for put/get/atomic operations. + * + * @param btl (IN) BTL module + * @param endpoint(IN) BTL addressing information (or NULL for all endpoints) + * @param base (IN) Pointer to start of region + * @param size (IN) Size of region + * @param flags (IN) Flags indicating what operation will be performed. Valid + * values are MCA_BTL_DES_FLAGS_PUT, MCA_BTL_DES_FLAGS_GET, + * and MCA_BTL_DES_FLAGS_ATOMIC + * + * @returns a memory registration handle valid for both local and remote operations + * @returns NULL if the region could not be registered + * + * This function registers the specified region with the hardware for use with + * the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop + * functions. Care should be taken to not hold an excessive number of registrations + * as they may use limited system/NIC resources. 
+ */ +struct mca_btl_base_registration_handle_t *mca_btl_template_register_mem ( + struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base, + size_t size, uint32_t flags) +{ + /* mca_btl_template_module_t* template_btl = (mca_btl_template_module_t*) btl; */ + /* TODO */ + return NULL; +} + +/** + * @brief Deregister a memory region + * + * @param btl (IN) BTL module region was registered with + * @param handle (IN) BTL registration handle to deregister + * + * This function deregisters the memory region associated with the specified handle. Care + * should be taken to not perform any RDMA or atomic operation on this memory region + * after it is deregistered. It is erroneous to specify a memory handle associated with + * a remote node. + */ +int mca_btl_template_deregister_mem (struct mca_btl_base_module_t* btl, + struct mca_btl_base_registration_handle_t *handle) +{ + /* mca_btl_template_module_t* template_btl = (mca_btl_template_module_t*) btl; */ + /* TODO */ + return OPAL_ERR_NOT_IMPLEMENTED; +} + /* * Cleanup/release module resources. diff --git a/opal/mca/btl/template/btl_template.h b/opal/mca/btl/template/btl_template.h index c7f6a1929c..cfe7231230 100644 --- a/opal/mca/btl/template/btl_template.h +++ b/opal/mca/btl/template/btl_template.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology @@ -9,6 +10,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -41,7 +44,7 @@ BEGIN_C_DECLS */ struct mca_btl_template_component_t { - mca_btl_base_component_2_0_0_t super; /**< base BTL component */ + mca_btl_base_component_3_0_0_t super; /**< base BTL component */ uint32_t template_num_btls; /**< number of hcas available to the TEMPLATE component */ @@ -187,32 +190,114 @@ extern int mca_btl_template_send( /** * Initiate an asynchronous put. + * Completion Semantics: if this function returns a 1 then the operation + * is complete. a return of OPAL_SUCCESS indicates + * the put operation has been queued with the + * network. the local_handle can not be deregistered + * until all outstanding operations on that handle + * have been completed. * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param local_address (IN) Local address to put from (registered) + * @param remote_address (IN) Remote address to put to (registered remotely) + * @param local_handle (IN) Registration handle for region containing + * (local_address, local_address + size) + * @param remote_handle (IN) Remote registration handle for region containing + * (remote_address, remote_address + size) + * @param size (IN) Number of bytes to put + * @param flags (IN) Flags for this put operation + * @param order (IN) Ordering + * @param cbfunc (IN) Function to call on completion (if queued) + * @param cbcontext (IN) Context for the callback + * @param cbdata (IN) Data for callback + * + * @retval OPAL_SUCCESS The descriptor was successfully queued for a put + * @retval OPAL_ERROR The descriptor was NOT successfully queued for a put + * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put + * operation. 
Try again later + * @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or + * alignment restrictions. */ - -extern int mca_btl_template_put( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* btl_peer, - struct mca_btl_base_descriptor_t* decriptor -); - +int mca_btl_template_put (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); /** * Initiate an asynchronous get. + * Completion Semantics: if this function returns a 1 then the operation + * is complete. a return of OPAL_SUCCESS indicates + * the get operation has been queued with the + * network. the local_handle can not be deregistered + * until all outstanding operations on that handle + * have been completed. 
+ * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param local_address (IN) Local address to put from (registered) + * @param remote_address (IN) Remote address to put to (registered remotely) + * @param local_handle (IN) Registration handle for region containing + * (local_address, local_address + size) + * @param remote_handle (IN) Remote registration handle for region containing + * (remote_address, remote_address + size) + * @param size (IN) Number of bytes to put + * @param flags (IN) Flags for this put operation + * @param order (IN) Ordering + * @param cbfunc (IN) Function to call on completion (if queued) + * @param cbcontext (IN) Context for the callback + * @param cbdata (IN) Data for callback + * + * @retval OPAL_SUCCESS The descriptor was successfully queued for a put + * @retval OPAL_ERROR The descriptor was NOT successfully queued for a put + * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put + * operation. Try again later + * @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or + * alignment restrictions. + */ +int mca_btl_template_get (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); + +/** + * @brief Register a memory region for put/get/atomic operations. * * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred + * @param endpoint(IN) BTL addressing information (or NULL for all endpoints) + * @param base (IN) Pointer to start of region + * @param size (IN) Size of region + * @param flags (IN) Flags indicating what operation will be performed. 
Valid + * values are MCA_BTL_DES_FLAGS_PUT, MCA_BTL_DES_FLAGS_GET, + * and MCA_BTL_DES_FLAGS_ATOMIC + * + * @returns a memory registration handle valid for both local and remote operations + * @returns NULL if the region could not be registered + * + * This function registers the specified region with the hardware for use with + * the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop + * functions. Care should be taken to not hold an excessive number of registrations + * as they may use limited system/NIC resources. */ - -extern int mca_btl_template_get( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* btl_peer, - struct mca_btl_base_descriptor_t* decriptor -); +struct mca_btl_base_registration_handle_t *mca_btl_template_register_mem ( + struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base, + size_t size, uint32_t flags); + +/** + * @brief Deregister a memory region + * + * @param btl (IN) BTL module region was registered with + * @param handle (IN) BTL registration handle to deregister + * + * This function deregisters the memory region associated with the specified handle. Care + * should be taken to not perform any RDMA or atomic operation on this memory region + * after it is deregistered. It is erroneous to specify a memory handle associated with + * a remote node. 
+ */ +int mca_btl_template_deregister_mem (struct mca_btl_base_module_t* btl, + struct mca_btl_base_registration_handle_t *handle); /** * Register a callback function that is called on receipt @@ -275,7 +360,6 @@ extern int mca_btl_template_free( mca_btl_base_descriptor_t* mca_btl_template_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* peer, - struct mca_mpool_base_registration_t*, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, @@ -283,16 +367,6 @@ mca_btl_base_descriptor_t* mca_btl_template_prepare_src( uint32_t flags ); -extern mca_btl_base_descriptor_t* mca_btl_template_prepare_dst( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* peer, - struct mca_mpool_base_registration_t*, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags); - /** * Fault Tolerance Event Notification Function * @param state Checkpoint Stae diff --git a/opal/mca/btl/ugni/Makefile.am b/opal/mca/btl/ugni/Makefile.am index 7304f1baeb..cff4f734a4 100644 --- a/opal/mca/btl/ugni/Makefile.am +++ b/opal/mca/btl/ugni/Makefile.am @@ -39,7 +39,8 @@ ugni_SOURCES = \ btl_ugni_smsg.h \ btl_ugni_smsg.c \ btl_ugni_progress_thread.c \ - btl_ugni_prepare.h + btl_ugni_prepare.h \ + btl_ugni_atomic.c mcacomponentdir = $(opallibdir) mcacomponent_LTLIBRARIES = $(component_install) diff --git a/opal/mca/btl/ugni/btl_ugni.h b/opal/mca/btl/ugni/btl_ugni.h index 01b16d99d7..977fcf7f77 100644 --- a/opal/mca/btl/ugni/btl_ugni.h +++ b/opal/mca/btl/ugni/btl_ugni.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. 
* Copyright (c) 2014 Research Organization for Information Science @@ -33,6 +33,7 @@ #include "opal/mca/btl/base/btl_base_error.h" #include "opal/class/opal_hash_table.h" #include "opal/class/ompi_free_list.h" +#include "opal/class/opal_free_list.h" #include "opal/mca/common/ugni/common_ugni.h" #include @@ -80,11 +81,16 @@ typedef struct mca_btl_ugni_module_t { opal_mutex_t eager_get_pending_lock; opal_list_t eager_get_pending; + opal_mutex_t pending_descriptors_lock; + opal_list_t pending_descriptors; + + ompi_free_list_t post_descriptors; + mca_mpool_base_module_t *smsg_mpool; ompi_free_list_t smsg_mboxes; gni_ep_handle_t wildcard_ep; - gni_ep_handle_t local_ep; + struct mca_btl_base_endpoint_t *local_ep; struct mca_btl_ugni_endpoint_attr_t wc_remote_attr, wc_local_attr; @@ -126,7 +132,7 @@ typedef struct mca_btl_ugni_module_t { typedef struct mca_btl_ugni_component_t { /* base BTL component */ - mca_btl_base_component_2_0_0_t super; + mca_btl_base_component_3_0_0_t super; /* maximum supported btls. hardcoded to 1 for now */ uint32_t ugni_max_btls; @@ -143,8 +149,6 @@ typedef struct mca_btl_ugni_component_t { /* After this message size switch to BTE protocols */ size_t ugni_fma_limit; - /* Switch to put when trying to GET at or above this size */ - size_t ugni_get_limit; /* Switch to get when sending above this size */ size_t ugni_smsg_limit; @@ -267,33 +271,31 @@ mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl, uint32_t flags, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t **descriptor); -/** - * Initiate a get operation. 
- * - * location: btl_ugni_get.c - * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred - */ -int -mca_btl_ugni_get (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des); +int mca_btl_ugni_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); -/** - * Initiate a put operation. - * - * location: btl_ugni_put.c - * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred - */ -int -mca_btl_ugni_put (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des); +int mca_btl_ugni_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); + +int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle, + mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); + +int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t 
*remote_handle, mca_btl_base_atomic_op_t op, + uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata); + +int mca_btl_ugni_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, + int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); int mca_btl_ugni_progress_send_wait_list (struct mca_btl_base_endpoint_t *endpoint); @@ -302,9 +304,14 @@ mca_btl_ugni_alloc(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, uint8_t order, size_t size, uint32_t flags); +struct mca_btl_base_registration_handle_t { + /** uGNI memory handle */ + gni_mem_handle_t gni_handle; +}; + typedef struct mca_btl_ugni_reg_t { mca_mpool_base_registration_t base; - gni_mem_handle_t memory_hdl; + mca_btl_base_registration_handle_t handle; } mca_btl_ugni_reg_t; /* Global structures */ @@ -321,5 +328,7 @@ static inline uint64_t mca_btl_ugni_proc_name_to_id (opal_process_name_t name) { int mca_btl_ugni_spawn_progress_thread(struct mca_btl_base_module_t* btl); int mca_btl_ugni_kill_progress_thread(void); +/** Number of times the progress thread has woken up */ +extern unsigned int mca_btl_ugni_progress_thread_wakeups; #endif diff --git a/opal/mca/btl/ugni/btl_ugni_add_procs.c b/opal/mca/btl/ugni/btl_ugni_add_procs.c index c810bd34d6..bc5a184dc6 100644 --- a/opal/mca/btl/ugni/btl_ugni_add_procs.c +++ b/opal/mca/btl/ugni/btl_ugni_add_procs.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2014 Intel, Inc. 
All rights reserved. @@ -34,7 +34,6 @@ int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t **peers, opal_bitmap_t *reachable) { mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl; - opal_proc_t *my_proc = opal_proc_local_get(); size_t i; int rc; void *mmap_start_addr; @@ -61,26 +60,28 @@ int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl, } for (i = 0 ; i < nprocs ; ++i) { - struct opal_proc_t *ompi_proc = procs[i]; - uint64_t proc_id = mca_btl_ugni_proc_name_to_id(ompi_proc->proc_name); + struct opal_proc_t *opal_proc = procs[i]; + uint64_t proc_id = mca_btl_ugni_proc_name_to_id(opal_proc->proc_name); - if (OPAL_PROC_ON_LOCAL_NODE(ompi_proc->proc_flags)) { + if (OPAL_PROC_ON_LOCAL_NODE(opal_proc->proc_flags)) { ugni_module->nlocal_procs++; - /* Do not use uGNI to communicate with local procs unless we are adding more ranks. - * Change this when sm and vader are updated to handle additional add procs. */ - if (!ugni_module->initialized || my_proc == ompi_proc) { - continue; - } + /* ugni is allowed on local processes to provide support for network + * atomic operations */ } /* Create and Init endpoints */ - rc = mca_btl_ugni_init_ep (ugni_module, peers + i, (mca_btl_ugni_module_t *) btl, ompi_proc); + rc = mca_btl_ugni_init_ep (ugni_module, peers + i, (mca_btl_ugni_module_t *) btl, opal_proc); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { BTL_ERROR(("btl/ugni error initializing endpoint")); return rc; } + /* go ahead and connect the local endpoint for RDMA/CQ write */ + if (opal_proc == opal_proc_local_get ()) { + ugni_module->local_ep = peers[i]; + } + /* Add this endpoint to the pointer array. 
*/ BTL_VERBOSE(("initialized uGNI endpoint for proc id: 0x%" PRIx64 " ptr: %p", proc_id, (void *) peers[i])); opal_hash_table_set_value_uint64 (&ugni_module->id_to_endpoint, proc_id, peers[i]); @@ -138,26 +139,6 @@ int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl, BTL_ERROR(("error creating remote SMSG CQ")); return opal_common_rc_ugni_to_opal (rc); } - - OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); - rc = GNI_EpCreate (ugni_module->device->dev_handle, ugni_module->rdma_local_cq, - &ugni_module->local_ep); - OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - BTL_ERROR(("error creating local ugni endpoint")); - return opal_common_rc_ugni_to_opal (rc); - } - - OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); - rc = GNI_EpBind (ugni_module->local_ep, - ugni_module->device->dev_addr, - getpid()); - OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - BTL_ERROR(("error binding local ugni endpoint")); - return opal_common_rc_ugni_to_opal (rc); - } - } rc = mca_btl_ugni_setup_mpools (ugni_module); @@ -222,8 +203,8 @@ int mca_btl_ugni_del_procs (struct mca_btl_base_module_t *btl, } for (i = 0 ; i < nprocs ; ++i) { - struct opal_proc_t *ompi_proc = procs[i]; - uint64_t proc_id = mca_btl_ugni_proc_name_to_id(ompi_proc->proc_name); + struct opal_proc_t *opal_proc = procs[i]; + uint64_t proc_id = mca_btl_ugni_proc_name_to_id(opal_proc->proc_name); mca_btl_base_endpoint_t *ep = NULL; /* lookup this proc in the hash table */ @@ -257,7 +238,7 @@ static int ugni_reg_rdma_mem (void *reg_data, void *base, size_t size, OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base, size, NULL, GNI_MEM_READWRITE | GNI_MEM_RELAXED_PI_ORDERING, - -1, &(ugni_reg->memory_hdl)); + -1, &(ugni_reg->handle.gni_handle)); OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { @@ -280,7 +261,7 
@@ static int ugni_reg_smsg_mem (void *reg_data, void *base, size_t size,
     OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
     rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base,
                           size, ugni_module->smsg_remote_cq, GNI_MEM_READWRITE, -1,
-                          &(ugni_reg->memory_hdl));
+                          &(ugni_reg->handle.gni_handle));
     OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
     return opal_common_rc_ugni_to_opal (rc);
 }
@@ -293,7 +274,7 @@ ugni_dereg_mem (void *reg_data, mca_mpool_base_registration_t *reg)
     gni_return_t rc;
 
     OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
-    rc = GNI_MemDeregister (ugni_module->device->dev_handle, &ugni_reg->memory_hdl);
+    rc = GNI_MemDeregister (ugni_module->device->dev_handle, &ugni_reg->handle.gni_handle);
     OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
     if (GNI_RC_SUCCESS != rc) {
         return OPAL_ERROR;
@@ -470,6 +451,15 @@ mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module)
         return rc;
     }
 
+    rc = ompi_free_list_init_new (&ugni_module->post_descriptors,
+                                  sizeof (mca_btl_ugni_post_descriptor_t),
+                                  8, OBJ_CLASS(mca_btl_ugni_post_descriptor_t),
+                                  0, 0, 0, -1, 256, NULL);
+    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
+        BTL_ERROR(("error creating post descriptor free list"));
+        return rc;
+    }
+
     return OPAL_SUCCESS;
 }
diff --git a/opal/mca/btl/ugni/btl_ugni_atomic.c b/opal/mca/btl/ugni/btl_ugni_atomic.c
new file mode 100644
index 0000000000..981bc759ee
--- /dev/null
+++ b/opal/mca/btl/ugni/btl_ugni_atomic.c
@@ -0,0 +1,161 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2014 Los Alamos National Security, LLC. All rights
+ * reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "btl_ugni_rdma.h"
+
+/* uGNI commands used by mca_btl_ugni_afop, indexed by BTL atomic operation */
+static gni_fma_cmd_type_t famo_cmds[] = {
+    [MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC_FADD,
+    [MCA_BTL_ATOMIC_AND] = GNI_FMA_ATOMIC_FAND,
+    [MCA_BTL_ATOMIC_OR] = GNI_FMA_ATOMIC_FOR,
+    [MCA_BTL_ATOMIC_XOR] = GNI_FMA_ATOMIC_FXOR,
+};
+
+/* uGNI commands used by mca_btl_ugni_aop, indexed by BTL atomic operation */
+static gni_fma_cmd_type_t amo_cmds[] = {
+    [MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC_ADD,
+    [MCA_BTL_ATOMIC_AND] = GNI_FMA_ATOMIC_AND,
+    [MCA_BTL_ATOMIC_OR] = GNI_FMA_ATOMIC_OR,
+    [MCA_BTL_ATOMIC_XOR] = GNI_FMA_ATOMIC_XOR,
+};
+
+/**
+ * Post an atomic operation (amo_cmds[op]) with the given operand on
+ * remote_address. No local buffer is involved: the local address is 0 and
+ * a zeroed memory handle is passed to init_gni_post_desc.
+ *
+ * cbfunc, cbcontext and cbdata are stored in the post descriptor. btl and
+ * flags are not used.
+ *
+ * @return the error from mca_btl_ugni_check_endpoint_state_rdma,
+ *         OPAL_ERR_OUT_OF_RESOURCE if no post descriptor is available or
+ *         GNI_PostFma fails, and OPAL_SUCCESS otherwise.
+ */
+int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
+                      uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
+                      mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
+                      mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
+{
+    gni_mem_handle_t dummy = {0, 0};
+    mca_btl_ugni_post_descriptor_t *post_desc;
+    int rc;
+
+    rc = mca_btl_ugni_check_endpoint_state_rdma (endpoint);
+    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
+        return rc;
+    }
+
+    mca_btl_ugni_alloc_post_descriptor (endpoint, NULL, cbfunc, cbcontext, cbdata, &post_desc);
+    if (OPAL_UNLIKELY(NULL == post_desc)) {
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+    init_gni_post_desc (&post_desc->desc, order, GNI_POST_AMO, 0, dummy, remote_address,
+                        remote_handle->gni_handle, 8, 0);
+    post_desc->desc.base.amo_cmd = amo_cmds[op];
+
+    post_desc->desc.base.first_operand = operand;
+
+    OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock);
+    rc = GNI_PostFma (endpoint->rdma_ep_handle, &post_desc->desc.base);
+    OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock);
+    if (GNI_RC_SUCCESS != rc) {
+        /* the post failed: give the descriptor back to the free list */
+        mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc);
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+    return OPAL_SUCCESS;
+}
+
+/**
+ * Post an atomic operation (famo_cmds[op]) with the given operand on
+ * remote_address, passing local_address and local_handle as the local side.
+ * Return values are the same as for mca_btl_ugni_aop.
+ */
+int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
+                       void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
+                       mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
+                       uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
+                       void *cbcontext, void *cbdata)
+{
+    mca_btl_ugni_post_descriptor_t *post_desc;
+    int rc;
+
+    rc = mca_btl_ugni_check_endpoint_state_rdma (endpoint);
+    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
+        return rc;
+    }
+
+    mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata, &post_desc);
+    if (OPAL_UNLIKELY(NULL == post_desc)) {
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+
+    init_gni_post_desc (&post_desc->desc, order, GNI_POST_AMO, (intptr_t) local_address, local_handle->gni_handle,
+                        remote_address, remote_handle->gni_handle, 8, 0);
+    post_desc->desc.base.amo_cmd = famo_cmds[op];
+
+    post_desc->desc.base.first_operand = operand;
+
+    OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock);
+    rc = GNI_PostFma (endpoint->rdma_ep_handle, &post_desc->desc.base);
+    OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock);
+    if (GNI_RC_SUCCESS != rc) {
+        mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc);
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+    return OPAL_SUCCESS;
+}
+
+/**
+ * Post a GNI_FMA_ATOMIC_CSWAP on remote_address with compare as the first
+ * operand and value as the second, passing local_address and local_handle
+ * as the local side. Return values are the same as for mca_btl_ugni_aop.
+ */
+int mca_btl_ugni_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
+                         void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
+                         mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags,
+                         int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
+{
+    mca_btl_ugni_post_descriptor_t *post_desc;
+    int rc;
+
+    rc = mca_btl_ugni_check_endpoint_state_rdma (endpoint);
+    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
+        return rc;
+    }
+
+    mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata, &post_desc);
+    if (OPAL_UNLIKELY(NULL == post_desc)) {
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+
+    init_gni_post_desc (&post_desc->desc, order, GNI_POST_AMO, (intptr_t) local_address, local_handle->gni_handle,
+                        remote_address, remote_handle->gni_handle, 8, 0);
+    post_desc->desc.base.amo_cmd = GNI_FMA_ATOMIC_CSWAP;
+
+    post_desc->desc.base.first_operand = compare;
+    post_desc->desc.base.second_operand = value;
+
+    OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock);
+    rc = GNI_PostFma (endpoint->rdma_ep_handle, &post_desc->desc.base);
+    OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock);
+    if (GNI_RC_SUCCESS != rc) {
+        mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc);
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+    return OPAL_SUCCESS;
+}
diff --git a/opal/mca/btl/ugni/btl_ugni_component.c b/opal/mca/btl/ugni/btl_ugni_component.c
index f933495942..9338c1fa21 100644
--- a/opal/mca/btl/ugni/btl_ugni_component.c
+++ b/opal/mca/btl/ugni/btl_ugni_component.c
@@ -1,6 +1,6 @@
 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
- * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
  * reserved.
  * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
* $COPYRIGHT$ @@ -18,6 +18,8 @@ #include "opal/memoryhooks/memory.h" #include "opal/runtime/opal_params.h" +#include "opal/mca/base/mca_base_pvar.h" + static int btl_ugni_component_register(void); static int btl_ugni_component_open(void); static int btl_ugni_component_close(void); @@ -52,6 +54,7 @@ static int btl_ugni_component_register(void) { mca_base_var_enum_t *new_enum; + gni_nic_device_t device_type; int rc; (void) mca_base_var_group_component_register(&mca_btl_ugni_component.super.btl_version, @@ -139,15 +142,6 @@ btl_ugni_component_register(void) OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.ugni_fma_limit); - mca_btl_ugni_component.ugni_get_limit = 1 * 1024 * 1024; - (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, - "get_limit", "Maximum size message that " - "will be sent using a get protocol " - "(default 1M)", MCA_BASE_VAR_TYPE_INT, - NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, - OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL, - &mca_btl_ugni_component.ugni_get_limit); - mca_btl_ugni_component.rdma_max_retries = 16; (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, "rdma_max_retries", NULL, MCA_BASE_VAR_TYPE_INT, @@ -199,6 +193,15 @@ btl_ugni_component_register(void) MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.progress_thread_requested); + /* performance variables */ + mca_btl_ugni_progress_thread_wakeups = 0; + (void) mca_base_component_pvar_register(&mca_btl_ugni_component.super.btl_version, + "progress_thread_wakeups", "Number of times the progress thread " + "has been woken", OPAL_INFO_LVL_9, MCA_BASE_PVAR_CLASS_COUNTER, + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, MCA_BASE_VAR_BIND_NO_OBJECT, + MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS, NULL, + NULL, NULL, &mca_btl_ugni_progress_thread_wakeups); + /* btl/ugni can only support only a fixed set of mpools (these mpools have compatible resource * structures) */ rc = mca_base_var_enum_create ("btl_ugni_mpool", 
mpool_values, &new_enum); @@ -222,13 +225,28 @@ btl_ugni_component_register(void) mca_btl_ugni_module.super.btl_max_send_size = 8 * 1024; mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = 8 * 1024; + mca_btl_ugni_module.super.btl_get_limit = 1 * 1024 * 1024; + + /* determine if there are get alignment restrictions */ + GNI_GetDeviceType (&device_type); + + if (GNI_DEVICE_GEMINI == device_type) { + mca_btl_ugni_module.super.btl_get_alignment = 4; + } else { + mca_btl_ugni_module.super.btl_get_alignment = 0; + } + /* threshold for put */ mca_btl_ugni_module.super.btl_min_rdma_pipeline_size = 8 * 1024; mca_btl_ugni_module.super.btl_flags = MCA_BTL_FLAGS_SEND | - MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE; + MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_ATOMIC_OPS | + MCA_BTL_FLAGS_ATOMIC_FOPS; + mca_btl_ugni_module.super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | + MCA_BTL_ATOMIC_SUPPORTS_AND | MCA_BTL_ATOMIC_SUPPORTS_OR | MCA_BTL_ATOMIC_SUPPORTS_XOR | + MCA_BTL_ATOMIC_SUPPORTS_CSWAP; - mca_btl_ugni_module.super.btl_seg_size = sizeof (mca_btl_ugni_segment_t); + mca_btl_ugni_module.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t); mca_btl_ugni_module.super.btl_bandwidth = 40000; /* Mbs */ mca_btl_ugni_module.super.btl_latency = 2; /* Microsecs */ @@ -439,89 +457,110 @@ mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module) return count; } -static inline int -mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, int which_cq) +#if OPAL_ENABLE_DEBUG +static inline void btl_ugni_dump_post_desc (mca_btl_ugni_post_descriptor_t *desc) { - opal_common_ugni_post_desc_t *desc; - mca_btl_ugni_base_frag_t *frag; + + fprintf (stderr, "desc->desc.base.post_id = %" PRIx64 "\n", desc->desc.base.post_id); + fprintf (stderr, "desc->desc.base.status = %" PRIx64 "\n", desc->desc.base.status); + fprintf (stderr, "desc->desc.base.cq_mode_complete = %hu\n", desc->desc.base.cq_mode_complete); + 
fprintf (stderr, "desc->desc.base.type = %d\n", desc->desc.base.type); + fprintf (stderr, "desc->desc.base.cq_mode = %hu\n", desc->desc.base.cq_mode); + fprintf (stderr, "desc->desc.base.dlvr_mode = %hu\n", desc->desc.base.dlvr_mode); + fprintf (stderr, "desc->desc.base.local_addr = %" PRIx64 "\n", desc->desc.base.local_addr); + fprintf (stderr, "desc->desc.base.local_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->desc.base.local_mem_hndl.qword1, + desc->desc.base.local_mem_hndl.qword2); + fprintf (stderr, "desc->desc.base.remote_addr = %" PRIx64 "\n", desc->desc.base.remote_addr); + fprintf (stderr, "desc->desc.base.remote_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->desc.base.remote_mem_hndl.qword1, + desc->desc.base.remote_mem_hndl.qword2); + fprintf (stderr, "desc->desc.base.length = %" PRIu64 "\n", desc->desc.base.length); + fprintf (stderr, "desc->desc.base.rdma_mode = %hu\n", desc->desc.base.rdma_mode); + fprintf (stderr, "desc->desc.base.amo_cmd = %d\n", desc->desc.base.amo_cmd); +} +#endif + +static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, int which_cq) +{ + mca_btl_ugni_post_descriptor_t *post_desc = NULL; gni_cq_entry_t event_data = 0; + gni_post_descriptor_t *desc; uint32_t recoverable = 1; - gni_return_t rc; + gni_return_t grc; gni_cq_handle_t the_cq; the_cq = (which_cq == 0) ? ugni_module->rdma_local_cq : ugni_module->rdma_local_irq_cq; OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); - rc = GNI_CqGetEvent (the_cq, &event_data); - if (GNI_RC_NOT_DONE == rc) { + grc = GNI_CqGetEvent (the_cq, &event_data); + if (GNI_RC_NOT_DONE == grc) { OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); return 0; } - if (OPAL_UNLIKELY((GNI_RC_SUCCESS != rc && !event_data) || GNI_CQ_OVERRUN(event_data))) { + if (OPAL_UNLIKELY((GNI_RC_SUCCESS != grc && !event_data) || GNI_CQ_OVERRUN(event_data))) { /* TODO -- need to handle overrun -- how do we do this without an event? will the event eventually come back? 
Ask Cray */ - BTL_ERROR(("unhandled post error! ugni rc = %d %s", rc,gni_err_str[rc])); + BTL_ERROR(("unhandled post error! ugni rc = %d %s", grc, gni_err_str[grc])); OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); - return opal_common_rc_ugni_to_opal (rc); + + return opal_common_rc_ugni_to_opal (grc); } - rc = GNI_GetCompleted (the_cq, event_data, (gni_post_descriptor_t **) &desc); + grc = GNI_GetCompleted (the_cq, event_data, &desc); OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); - if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc && GNI_RC_TRANSACTION_ERROR != rc)) { - BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[rc])); - return opal_common_rc_ugni_to_opal (rc); + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc && GNI_RC_TRANSACTION_ERROR != grc)) { + BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[grc])); + return opal_common_rc_ugni_to_opal (grc); } - frag = MCA_BTL_UGNI_DESC_TO_FRAG(desc); - - if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc || !GNI_CQ_STATUS_OK(event_data))) { - char buffer[1024]; + post_desc = MCA_BTL_UGNI_DESC_TO_PDESC(desc); + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc || !GNI_CQ_STATUS_OK(event_data))) { (void) GNI_CqErrorRecoverable (event_data, &recoverable); - GNI_CqErrorStr(event_data,buffer,sizeof(buffer)); - if (OPAL_UNLIKELY(++frag->post_desc.tries >= mca_btl_ugni_component.rdma_max_retries || + if (OPAL_UNLIKELY(++post_desc->desc.tries >= mca_btl_ugni_component.rdma_max_retries || !recoverable)) { + char char_buffer[1024]; + GNI_CqErrorStr (event_data, char_buffer, 1024); /* give up */ - BTL_ERROR(("giving up on frag %p type %d CQE error %s", (void *) frag, frag->post_desc.base.type, buffer)); - mca_btl_ugni_frag_complete (frag, OPAL_ERROR); + BTL_ERROR(("giving up on desciptor %p, recoverable %d: %s", (void *) post_desc, + recoverable, char_buffer)); +#if OPAL_ENABLE_DEBUG + btl_ugni_dump_post_desc (post_desc); +#endif + mca_btl_ugni_post_desc_complete (ugni_module, post_desc, OPAL_ERROR); return OPAL_ERROR; } - /* repost transaction */ 
-        mca_btl_ugni_repost (frag);
+        mca_btl_ugni_repost (ugni_module, post_desc);
 
         return 0;
     }
 
-    BTL_VERBOSE(("RDMA/FMA complete for frag %p", (void *) frag));
-
-    mca_btl_ugni_frag_complete (frag, opal_common_rc_ugni_to_opal (rc));
+    mca_btl_ugni_post_desc_complete (ugni_module, post_desc, opal_common_rc_ugni_to_opal (grc));
 
     return 1;
 }
 
 static inline int
-mca_btl_ugni_retry_failed (mca_btl_ugni_module_t *ugni_module)
+mca_btl_ugni_post_pending (mca_btl_ugni_module_t *ugni_module)
 {
-    int count = opal_list_get_size (&ugni_module->failed_frags);
+    int count = opal_list_get_size (&ugni_module->pending_descriptors);
     int i;
 
     for (i = 0 ; i < count ; ++i) {
-        OPAL_THREAD_LOCK(&ugni_module->failed_frags_lock);
-        mca_btl_ugni_base_frag_t *frag =
-            (mca_btl_ugni_base_frag_t *) opal_list_remove_first (&ugni_module->failed_frags);
-        OPAL_THREAD_UNLOCK(&ugni_module->failed_frags_lock);
-        if (NULL == frag) {
+        OPAL_THREAD_LOCK(&ugni_module->pending_descriptors_lock);
+        mca_btl_ugni_post_descriptor_t *post_desc =
+            (mca_btl_ugni_post_descriptor_t *) opal_list_remove_first (&ugni_module->pending_descriptors);
+        OPAL_THREAD_UNLOCK(&ugni_module->pending_descriptors_lock);
+        /* count was read without the lock: stop if nothing was removed or the repost fails */
+        if (NULL == post_desc || OPAL_SUCCESS != mca_btl_ugni_repost (ugni_module, post_desc)) {
             break;
         }
-
-        mca_btl_ugni_repost (frag);
     }
 
-    return count;
+    return i;
 }
 
 static inline int
@@ -571,7 +610,6 @@ static int mca_btl_ugni_component_progress (void)
 
     for (i = 0 ; i < mca_btl_ugni_component.ugni_num_btls ; ++i) {
         ugni_module = mca_btl_ugni_component.modules + i;
-        mca_btl_ugni_retry_failed (ugni_module);
         mca_btl_ugni_progress_wait_list (ugni_module);
 
         count += mca_btl_ugni_progress_datagram (ugni_module);
@@ -581,6 +619,9 @@ static int mca_btl_ugni_component_progress (void)
         if (mca_btl_ugni_component.progress_thread_enabled) {
             count += mca_btl_ugni_progress_rdma (ugni_module, 1);
         }
+
+        /* post pending after progressing rdma */
+        mca_btl_ugni_post_pending (ugni_module);
     }
 
     return count;
diff --git a/opal/mca/btl/ugni/btl_ugni_endpoint.c
b/opal/mca/btl/ugni/btl_ugni_endpoint.c index 7496601944..df2a81bf84 100644 --- a/opal/mca/btl/ugni/btl_ugni_endpoint.c +++ b/opal/mca/btl/ugni/btl_ugni_endpoint.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011-2013 UT-Battelle, LLC. All rights reserved. * $COPYRIGHT$ @@ -10,8 +10,6 @@ * $HEADER$ */ -#include "btl_ugni.h" - #include "btl_ugni_endpoint.h" #include "btl_ugni_smsg.h" @@ -90,10 +88,8 @@ int mca_btl_ugni_ep_disconnect (mca_btl_base_endpoint_t *ep, bool send_disconnec static inline int mca_btl_ugni_ep_connect_start (mca_btl_base_endpoint_t *ep) { int rc; - /* get the modex info for this endpoint and setup a ugni endpoint */ - rc = opal_common_ugni_endpoint_for_proc (ep->btl->device, ep->peer_proc, &ep->common); - if (OPAL_SUCCESS != rc) { - assert (0); + rc = mca_btl_ugni_ep_connect_rdma (ep); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { return rc; } @@ -107,11 +103,6 @@ static inline int mca_btl_ugni_ep_connect_start (mca_btl_base_endpoint_t *ep) { return rc; } - rc = opal_common_ugni_ep_create (ep->common, ep->btl->rdma_local_cq, &ep->rdma_ep_handle); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - return rc; - } - /* build connection data */ rc = mca_btl_ugni_ep_smsg_get_mbox (ep); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { @@ -198,7 +189,7 @@ int mca_btl_ugni_ep_connect_progress (mca_btl_base_endpoint_t *ep) { return OPAL_SUCCESS; } - if (MCA_BTL_UGNI_EP_STATE_INIT == ep->state) { + if (MCA_BTL_UGNI_EP_STATE_RDMA >= ep->state) { rc = mca_btl_ugni_ep_connect_start (ep); if (OPAL_SUCCESS != rc) { return rc; diff --git a/opal/mca/btl/ugni/btl_ugni_endpoint.h b/opal/mca/btl/ugni/btl_ugni_endpoint.h index c83f81ea53..79908471f9 100644 --- a/opal/mca/btl/ugni/btl_ugni_endpoint.h +++ b/opal/mca/btl/ugni/btl_ugni_endpoint.h @@ -1,6 +1,6 @@ /* -*- Mode: 
C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. * $COPYRIGHT$ @@ -17,6 +17,7 @@ enum mca_btl_ugni_endpoint_state_t { MCA_BTL_UGNI_EP_STATE_INIT = 0, + MCA_BTL_UGNI_EP_STATE_RDMA, MCA_BTL_UGNI_EP_STATE_CONNECTING, MCA_BTL_UGNI_EP_STATE_CONNECTED }; @@ -114,6 +115,7 @@ static inline int mca_btl_ugni_check_endpoint_state (mca_btl_ugni_endpoint_t *ep switch (ep->state) { case MCA_BTL_UGNI_EP_STATE_INIT: + case MCA_BTL_UGNI_EP_STATE_RDMA: rc = mca_btl_ugni_ep_connect_progress (ep); if (OPAL_SUCCESS != rc) { break; @@ -130,6 +132,43 @@ static inline int mca_btl_ugni_check_endpoint_state (mca_btl_ugni_endpoint_t *ep return rc; } +static inline int mca_btl_ugni_ep_connect_rdma (mca_btl_base_endpoint_t *ep) { + int rc; + + if (ep->state >= MCA_BTL_UGNI_EP_STATE_RDMA) { + return OPAL_SUCCESS; + } + + /* get the modex info for this endpoint and setup a ugni endpoint */ + rc = opal_common_ugni_endpoint_for_proc (ep->btl->device, ep->peer_proc, &ep->common); + if (OPAL_SUCCESS != rc) { + assert (0); + return rc; + } + + /* bind endpoint to remote address */ + rc = opal_common_ugni_ep_create (ep->common, ep->btl->rdma_local_cq, &ep->rdma_ep_handle); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + return rc; + } + + ep->state = MCA_BTL_UGNI_EP_STATE_RDMA; + + return OPAL_SUCCESS; +} + +static inline int mca_btl_ugni_check_endpoint_state_rdma (mca_btl_base_endpoint_t *ep) { + int rc; + if (OPAL_LIKELY(MCA_BTL_UGNI_EP_STATE_INIT < ep->state)) { + return OPAL_SUCCESS; + } + + opal_mutex_lock (&ep->lock); + rc = mca_btl_ugni_ep_connect_rdma (ep); + opal_mutex_unlock (&ep->lock); + return rc; +} + static inline int mca_btl_ugni_wildcard_ep_post (mca_btl_ugni_module_t *ugni_module) { gni_return_t rc; diff --git a/opal/mca/btl/ugni/btl_ugni_frag.c 
b/opal/mca/btl/ugni/btl_ugni_frag.c index 06eee0e95f..024a42b65c 100644 --- a/opal/mca/btl/ugni/btl_ugni_frag.c +++ b/opal/mca/btl/ugni/btl_ugni_frag.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. * $COPYRIGHT$ @@ -16,7 +16,7 @@ static inline void mca_btl_ugni_base_frag_constructor (mca_btl_ugni_base_frag_t *frag) { memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base)); - frag->segments[0].base.seg_addr.pval = frag->base.super.ptr; + frag->segments[0].seg_addr.pval = frag->base.super.ptr; } static inline void mca_btl_ugni_eager_frag_constructor (mca_btl_ugni_base_frag_t *frag) @@ -26,7 +26,7 @@ static inline void mca_btl_ugni_eager_frag_constructor (mca_btl_ugni_base_frag_t mca_btl_ugni_base_frag_constructor (frag); - frag->segments[0].memory_handle = reg->memory_hdl; + frag->memory_handle = reg->handle; } OBJ_CLASS_INSTANCE(mca_btl_ugni_smsg_frag_t, mca_btl_base_descriptor_t, @@ -38,6 +38,9 @@ OBJ_CLASS_INSTANCE(mca_btl_ugni_rdma_frag_t, mca_btl_base_descriptor_t, OBJ_CLASS_INSTANCE(mca_btl_ugni_eager_frag_t, mca_btl_base_descriptor_t, mca_btl_ugni_eager_frag_constructor, NULL); +OBJ_CLASS_INSTANCE(mca_btl_ugni_post_descriptor_t, ompi_free_list_item_t, + NULL, NULL); + void mca_btl_ugni_frag_init (mca_btl_ugni_base_frag_t *frag, mca_btl_ugni_module_t *ugni_module) { frag->msg_id = opal_pointer_array_add (&ugni_module->pending_smsg_frags_bb, (void *) frag); diff --git a/opal/mca/btl/ugni/btl_ugni_frag.h b/opal/mca/btl/ugni/btl_ugni_frag.h index 4150404bc2..543e71237a 100644 --- a/opal/mca/btl/ugni/btl_ugni_frag.h +++ b/opal/mca/btl/ugni/btl_ugni_frag.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2013 Los Alamos National 
Security, LLC. All rights + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2013 The University of Tennessee and The University @@ -19,13 +19,6 @@ #include "btl_ugni.h" #include "btl_ugni_endpoint.h" -typedef struct mca_btl_ugni_segment_t { - mca_btl_base_segment_t base; - gni_mem_handle_t memory_handle; - uint8_t extra_bytes[3]; - uint8_t extra_byte_count; -} mca_btl_ugni_segment_t; - typedef struct mca_btl_ugni_send_frag_hdr_t { uint32_t lag; } mca_btl_ugni_send_frag_hdr_t; @@ -41,7 +34,9 @@ typedef struct mca_btl_ugni_rdma_frag_hdr_t { typedef struct mca_btl_ugni_eager_frag_hdr_t { mca_btl_ugni_send_frag_hdr_t send; - mca_btl_ugni_segment_t src_seg; + uint32_t size; + uint64_t address; + mca_btl_base_registration_handle_t memory_handle; void *ctx; } mca_btl_ugni_eager_frag_hdr_t; @@ -59,29 +54,28 @@ typedef union mca_btl_ugni_frag_hdr_t { } mca_btl_ugni_frag_hdr_t; enum { - MCA_BTL_UGNI_FRAG_BUFFERED = 1, /* frag data is buffered */ - MCA_BTL_UGNI_FRAG_COMPLETE = 2, /* smsg complete for frag */ - MCA_BTL_UGNI_FRAG_EAGER = 4, /* eager get frag */ - MCA_BTL_UGNI_FRAG_IGNORE = 8, /* ignore local smsg completion */ - MCA_BTL_UGNI_FRAG_SMSG_COMPLETE = 16 /* SMSG has completed for this message */ + MCA_BTL_UGNI_FRAG_BUFFERED = 1, /* frag data is buffered */ + MCA_BTL_UGNI_FRAG_COMPLETE = 2, /* smsg complete for frag */ + MCA_BTL_UGNI_FRAG_EAGER = 4, /* eager get frag */ + MCA_BTL_UGNI_FRAG_IGNORE = 8, /* ignore local smsg completion */ + MCA_BTL_UGNI_FRAG_SMSG_COMPLETE = 16, /* SMSG has completed for this message */ + MCA_BTL_UGNI_FRAG_RESPONSE = 32, }; struct mca_btl_ugni_base_frag_t; -typedef void (*frag_cb_t) (struct mca_btl_ugni_base_frag_t *, int); - typedef struct mca_btl_ugni_base_frag_t { mca_btl_base_descriptor_t base; uint32_t msg_id; uint16_t hdr_size; uint16_t flags; mca_btl_ugni_frag_hdr_t hdr; - mca_btl_ugni_segment_t segments[2]; + 
mca_btl_base_segment_t segments[2]; opal_common_ugni_post_desc_t post_desc; mca_btl_base_endpoint_t *endpoint; mca_btl_ugni_reg_t *registration; ompi_free_list_t *my_list; - frag_cb_t cbfunc; + mca_btl_base_registration_handle_t memory_handle; } mca_btl_ugni_base_frag_t; typedef struct mca_btl_ugni_base_frag_t mca_btl_ugni_smsg_frag_t; @@ -91,6 +85,58 @@ typedef struct mca_btl_ugni_base_frag_t mca_btl_ugni_eager_frag_t; #define MCA_BTL_UGNI_DESC_TO_FRAG(desc) \ ((mca_btl_ugni_base_frag_t *)((uintptr_t) (desc) - offsetof (mca_btl_ugni_base_frag_t, post_desc))) +typedef struct mca_btl_ugni_post_descriptor_t { + ompi_free_list_item_t super; + opal_common_ugni_post_desc_t desc; + mca_btl_base_endpoint_t *endpoint; + mca_btl_base_registration_handle_t *local_handle; + mca_btl_base_rdma_completion_fn_t cbfunc; + void *cbdata; + void *ctx; +} mca_btl_ugni_post_descriptor_t; + +OBJ_CLASS_DECLARATION(mca_btl_ugni_post_descriptor_t); + +#define MCA_BTL_UGNI_DESC_TO_PDESC(desc) \ + ((mca_btl_ugni_post_descriptor_t *)((uintptr_t) (desc) - offsetof (mca_btl_ugni_post_descriptor_t, desc))) + +static inline void mca_btl_ugni_alloc_post_descriptor (mca_btl_base_endpoint_t *endpoint, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata, + mca_btl_ugni_post_descriptor_t **desc) +{ + ompi_free_list_item_t *item = NULL; + + OMPI_FREE_LIST_GET_MT(&endpoint->btl->post_descriptors, item); + *desc = (mca_btl_ugni_post_descriptor_t *) item; + if (NULL != item) { + (*desc)->cbfunc = cbfunc; + (*desc)->ctx = cbcontext; + (*desc)->cbdata = cbdata; + (*desc)->local_handle = local_handle; + (*desc)->endpoint = endpoint; + } +} + +static inline void mca_btl_ugni_return_post_descriptor (mca_btl_ugni_module_t *module, + mca_btl_ugni_post_descriptor_t *desc) +{ + OMPI_FREE_LIST_RETURN_MT(&module->post_descriptors, &desc->super); +} + +static inline void mca_btl_ugni_post_desc_complete (mca_btl_ugni_module_t *module, 
mca_btl_ugni_post_descriptor_t *desc, int rc) +{ + BTL_VERBOSE(("RDMA/FMA/ATOMIC operation complete for post descriptor %p. rc = %d", (void *) desc, rc)); + + if (NULL != desc->cbfunc) { + /* call the user's callback function */ + desc->cbfunc (&module->super, desc->endpoint, (void *)(intptr_t) desc->desc.base.local_addr, + desc->local_handle, desc->ctx, desc->cbdata, rc); + } + + /* the descriptor is no longer needed */ + mca_btl_ugni_return_post_descriptor (module, desc); +} + OBJ_CLASS_DECLARATION(mca_btl_ugni_smsg_frag_t); OBJ_CLASS_DECLARATION(mca_btl_ugni_rdma_frag_t); OBJ_CLASS_DECLARATION(mca_btl_ugni_eager_frag_t); diff --git a/opal/mca/btl/ugni/btl_ugni_get.c b/opal/mca/btl/ugni/btl_ugni_get.c index 7942613376..f244035a56 100644 --- a/opal/mca/btl/ugni/btl_ugni_get.c +++ b/opal/mca/btl/ugni/btl_ugni_get.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. * $COPYRIGHT$ @@ -13,44 +13,31 @@ #include "btl_ugni_rdma.h" #include "btl_ugni_smsg.h" -/** - * Initiate a get operation. 
- *
- * @param btl (IN) BTL module
- * @param endpoint (IN) BTL addressing information
- * @param descriptor (IN) Description of the data to be transferred
- */
-int mca_btl_ugni_get (struct mca_btl_base_module_t *btl,
-                      struct mca_btl_base_endpoint_t *endpoint,
-                      struct mca_btl_base_descriptor_t *des) {
-    mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) des;
-    mca_btl_ugni_segment_t *src_seg = (mca_btl_ugni_segment_t *) des->des_remote;
-    mca_btl_ugni_segment_t *dst_seg = (mca_btl_ugni_segment_t *) des->des_local;
-    size_t size = src_seg->base.seg_len - src_seg->extra_byte_count;
+int mca_btl_ugni_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
+                      uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
+                      mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
+                      int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
+{
     bool check;
 
-    BTL_VERBOSE(("Using RDMA/FMA Get"));
-
-    /* cause endpoint to bind if it isn't already (bind is sufficient for rdma) */
-    (void) mca_btl_ugni_check_endpoint_state(endpoint);
-
     /* Check if the get is aligned/sized on a multiple of 4 */
-    check = !!((des->des_remote->seg_addr.lval | des->des_local->seg_addr.lval | size) & 3);
+    check = !!((remote_address | (uint64_t)(intptr_t) local_address | size) & (mca_btl_ugni_module.super.btl_get_alignment ? mca_btl_ugni_module.super.btl_get_alignment - 1 : 0)); /* alignment 0: no restriction */
 
-    if (OPAL_UNLIKELY(check || size > mca_btl_ugni_component.ugni_get_limit)) {
-        /* switch to put */
+    if (OPAL_UNLIKELY(check || size > mca_btl_ugni_module.super.btl_get_limit)) {
+        BTL_VERBOSE(("RDMA/FMA Get not available due to size or alignment restrictions"));
+
+        /* notify the caller that get is not available */
         return OPAL_ERR_NOT_AVAILABLE;
     }
 
-    if (src_seg->extra_byte_count) {
-        memmove ((char *) dst_seg->base.seg_addr.pval + size, src_seg->extra_bytes, src_seg->extra_byte_count);
-        src_seg->base.seg_len = size;
-        dst_seg->base.seg_len = size;
-    }
+
BTL_VERBOSE(("Using RDMA/FMA Get from local address %p to remote address %" PRIx64, + local_address, remote_address)); - des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; + /* cause endpoint to bind if it isn't already (bind is sufficient for rdma) */ + (void) mca_btl_ugni_check_endpoint_state_rdma (endpoint); - return mca_btl_ugni_post (frag, true, dst_seg, src_seg); + return mca_btl_ugni_post (endpoint, true, size, local_address, remote_address, local_handle, + remote_handle, order, cbfunc, cbcontext, cbdata); } /* eager get */ @@ -60,6 +47,8 @@ static void mca_btl_ugni_callback_eager_get_progress_pending (struct mca_btl_bas mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl; mca_btl_ugni_base_frag_t *pending_frag, *frag = (mca_btl_ugni_base_frag_t *) desc; + memset (&frag->hdr, 0, sizeof (frag->hdr)); + OPAL_THREAD_LOCK(&ugni_module->eager_get_pending_lock); pending_frag = (mca_btl_ugni_base_frag_t *) opal_list_remove_first (&ugni_module->eager_get_pending); OPAL_THREAD_UNLOCK(&ugni_module->eager_get_pending_lock); @@ -68,6 +57,8 @@ static void mca_btl_ugni_callback_eager_get_progress_pending (struct mca_btl_bas /* copy the relevant data out of the pending fragment */ frag->endpoint = pending_frag->endpoint; + assert (frag != pending_frag); + /* start the next eager get using this fragment */ (void) mca_btl_ugni_start_eager_get (frag->endpoint, pending_frag->hdr.eager_ex, frag); @@ -80,39 +71,43 @@ static void mca_btl_ugni_callback_eager_get_progress_pending (struct mca_btl_bas } static void mca_btl_ugni_callback_eager_get (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *desc, int rc) + void *local_address, mca_btl_base_registration_handle_t *local_handle, + void *context, void *cbdata, int status) { mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl; - mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) desc; + mca_btl_ugni_base_frag_t *frag = 
(mca_btl_ugni_base_frag_t *) context;
     uint32_t len = frag->hdr.eager.send.lag & 0x00ffffff;
     uint8_t tag = frag->hdr.eager.send.lag >> 24;
-    size_t payload_len = frag->hdr.eager.src_seg.base.seg_len;
+    size_t payload_len = frag->hdr.eager.size;
     size_t hdr_len = len - payload_len;
     mca_btl_active_message_callback_t *reg;
     mca_btl_base_segment_t segs[2];
     mca_btl_ugni_base_frag_t tmp;
+    int rc;
 
     BTL_VERBOSE(("eager get for rem_ctx %p complete", frag->hdr.eager.ctx));
 
-    tmp.base.des_local = segs;
+    tmp.base.des_segments = segs;
     if (hdr_len) {
-        tmp.base.des_local_count = 2;
+        tmp.base.des_segment_count = 2;
 
         segs[0].seg_addr.pval = frag->hdr.eager_ex.pml_header;
         segs[0].seg_len = hdr_len;
-        segs[1].seg_addr.pval = frag->segments[0].base.seg_addr.pval;
+        segs[1].seg_addr.pval = local_address;
         segs[1].seg_len = payload_len;
     } else {
-        tmp.base.des_local_count = 1;
+        tmp.base.des_segment_count = 1;
 
-        segs[0].seg_addr.pval = frag->segments[0].base.seg_addr.pval;
+        segs[0].seg_addr.pval = local_address;
         segs[0].seg_len = payload_len;
     }
 
     reg = mca_btl_base_active_message_trigger + tag;
     reg->cbfunc(&frag->endpoint->btl->super, tag, &(tmp.base), reg->cbdata);
 
+    /* fill in the response header */
     frag->hdr.rdma.ctx = frag->hdr.eager.ctx;
+    frag->flags = MCA_BTL_UGNI_FRAG_RESPONSE;
 
     /* once complete use this fragment for a pending eager get if any exist */
     frag->base.des_cbfunc = mca_btl_ugni_callback_eager_get_progress_pending;
@@ -122,6 +117,7 @@ static void mca_btl_ugni_callback_eager_get (struct mca_btl_base_module_t *btl,
                              NULL, 0, MCA_BTL_UGNI_TAG_RDMA_COMPLETE);
     if (OPAL_UNLIKELY(0 > rc)) {
         /* queue fragment */
+        OPAL_THREAD_LOCK(&endpoint->lock);
         if (false == endpoint->wait_listed) {
             OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
             opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
@@ -129,50 +125,50 @@ static void mca_btl_ugni_callback_eager_get (struct mca_btl_base_module_t *btl,
endpoint->wait_listed = true; } - OPAL_THREAD_LOCK(&endpoint->lock); opal_list_append (&endpoint->frag_wait_list, (opal_list_item_t *) frag); OPAL_THREAD_UNLOCK(&endpoint->lock); } } -int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep, +int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *endpoint, mca_btl_ugni_eager_ex_frag_hdr_t hdr, mca_btl_ugni_base_frag_t *frag) { - mca_btl_ugni_module_t *ugni_module = ep->btl; + mca_btl_ugni_module_t *ugni_module = endpoint->btl; + size_t size; int rc; BTL_VERBOSE(("starting eager get for remote ctx: %p", hdr.eager.ctx)); do { if (NULL == frag) { - rc = MCA_BTL_UGNI_FRAG_ALLOC_EAGER_RECV(ep, frag); + /* try to allocate a registered buffer */ + rc = MCA_BTL_UGNI_FRAG_ALLOC_EAGER_RECV(endpoint, frag); if (OPAL_UNLIKELY(NULL == frag)) { - (void) MCA_BTL_UGNI_FRAG_ALLOC_RDMA_INT(ep, frag); + /* no registered buffers available. try again later */ + (void) MCA_BTL_UGNI_FRAG_ALLOC_RDMA_INT(endpoint, frag); + + /* not much can be done if a small fragment can not be allocated. abort! 
*/ assert (NULL != frag); frag->hdr.eager_ex = hdr; break; } } - frag->hdr.eager_ex = hdr; frag->flags = 0; - frag->base.des_flags = 0; + frag->hdr.eager_ex = hdr; - frag->segments[1] = hdr.eager.src_seg; - - /* increase size to a multiple of 4 bytes (required for get) */ - frag->segments[0].base.seg_len = frag->segments[1].base.seg_len = - (hdr.eager.src_seg.base.seg_len + 3) & ~3; - - frag->base.des_local = &frag->segments[1].base; + /* increase size to a multiple of 4 bytes (required for get on Gemini) */ + size = (hdr.eager.size + 3) & ~3; /* set up callback for get completion */ frag->base.des_flags = MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - frag->base.des_cbfunc = mca_btl_ugni_callback_eager_get; - rc = mca_btl_ugni_post (frag, GNI_POST_RDMA_GET, frag->segments, frag->segments + 1); + /* start the get */ + rc = mca_btl_ugni_post (endpoint, true, size, frag->base.super.ptr, hdr.eager.address, + &frag->memory_handle, &hdr.eager.memory_handle, + MCA_BTL_NO_ORDER, mca_btl_ugni_callback_eager_get, frag, NULL); if (OPAL_UNLIKELY(OPAL_SUCCESS == rc)) { return OPAL_SUCCESS; } diff --git a/opal/mca/btl/ugni/btl_ugni_module.c b/opal/mca/btl/ugni/btl_ugni_module.c index 0bc4e59a27..9161d653f0 100644 --- a/opal/mca/btl/ugni/btl_ugni_module.c +++ b/opal/mca/btl/ugni/btl_ugni_module.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. 
* Copyright (c) 2014 Research Organization for Information Science @@ -27,35 +27,37 @@ mca_btl_ugni_free (struct mca_btl_base_module_t *btl, static int mca_btl_ugni_module_finalize (struct mca_btl_base_module_t* btl); -static mca_btl_base_descriptor_t * -mca_btl_ugni_prepare_dst (mca_btl_base_module_t *btl, - mca_btl_base_endpoint_t *endpoint, - mca_mpool_base_registration_t *registration, - opal_convertor_t *convertor, uint8_t order, - size_t reserve, size_t *size, uint32_t flags); - static struct mca_btl_base_descriptor_t * mca_btl_ugni_prepare_src (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - mca_mpool_base_registration_t *registration, struct opal_convertor_t *convertor, uint8_t order, size_t reserve, size_t *size, uint32_t flags); +static mca_btl_base_registration_handle_t * +mca_btl_ugni_register_mem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *base, + size_t size, uint32_t flags); + +static int mca_btl_ugni_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle); + mca_btl_ugni_module_t mca_btl_ugni_module = { .super = { - .btl_component = &mca_btl_ugni_component.super, - .btl_add_procs = mca_btl_ugni_add_procs, - .btl_del_procs = mca_btl_ugni_del_procs, - .btl_finalize = mca_btl_ugni_module_finalize, - .btl_alloc = mca_btl_ugni_alloc, - .btl_free = mca_btl_ugni_free, - .btl_prepare_src = mca_btl_ugni_prepare_src, - .btl_prepare_dst = mca_btl_ugni_prepare_dst, - .btl_send = mca_btl_ugni_send, - .btl_sendi = mca_btl_ugni_sendi, - .btl_put = mca_btl_ugni_put, - .btl_get = mca_btl_ugni_get, + .btl_component = &mca_btl_ugni_component.super, + .btl_add_procs = mca_btl_ugni_add_procs, + .btl_del_procs = mca_btl_ugni_del_procs, + .btl_finalize = mca_btl_ugni_module_finalize, + .btl_alloc = mca_btl_ugni_alloc, + .btl_free = mca_btl_ugni_free, + .btl_prepare_src = mca_btl_ugni_prepare_src, + .btl_send = mca_btl_ugni_send, + .btl_sendi = mca_btl_ugni_sendi, + .btl_put = 
mca_btl_ugni_put, + .btl_get = mca_btl_ugni_get, + .btl_register_mem = mca_btl_ugni_register_mem, + .btl_deregister_mem = mca_btl_ugni_deregister_mem, + .btl_atomic_op = mca_btl_ugni_aop, + .btl_atomic_fop = mca_btl_ugni_afop, + .btl_atomic_cswap = mca_btl_ugni_acswap, } }; @@ -92,6 +94,9 @@ mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module, OBJ_CONSTRUCT(&ugni_module->endpoints, opal_pointer_array_t); OBJ_CONSTRUCT(&ugni_module->id_to_endpoint, opal_hash_table_t); OBJ_CONSTRUCT(&ugni_module->smsg_mboxes, ompi_free_list_t); + OBJ_CONSTRUCT(&ugni_module->pending_descriptors, opal_list_t); + OBJ_CONSTRUCT(&ugni_module->eager_get_pending, opal_list_t); + OBJ_CONSTRUCT(&ugni_module->post_descriptors, ompi_free_list_t); ugni_module->device = dev; dev->btl_ctx = (void *) ugni_module; @@ -204,7 +209,6 @@ mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl) OBJ_DESTRUCT(&ugni_module->pending_smsg_frags_bb); OBJ_DESTRUCT(&ugni_module->id_to_endpoint); OBJ_DESTRUCT(&ugni_module->endpoints); - OBJ_DESTRUCT(&ugni_module->failed_frags); OBJ_DESTRUCT(&ugni_module->eager_get_pending); OBJ_DESTRUCT(&ugni_module->eager_get_pending_lock); @@ -250,13 +254,13 @@ mca_btl_ugni_alloc(struct mca_btl_base_module_t *btl, frag->base.des_flags = flags; frag->base.order = order; - frag->base.des_local = &frag->segments[1].base; - frag->base.des_local_count = 1; + frag->base.des_segments = &frag->segments[1]; + frag->base.des_segment_count = 1; - frag->segments[0].base.seg_addr.pval = NULL; - frag->segments[0].base.seg_len = 0; - frag->segments[1].base.seg_addr.pval = frag->base.super.ptr; - frag->segments[1].base.seg_len = size; + frag->segments[0].seg_addr.pval = NULL; + frag->segments[0].seg_len = 0; + frag->segments[1].seg_addr.pval = frag->base.super.ptr; + frag->segments[1].seg_len = size; frag->flags = MCA_BTL_UGNI_FRAG_BUFFERED; if (size > mca_btl_ugni_component.smsg_max_data) { @@ -267,7 +271,7 @@ mca_btl_ugni_alloc(struct mca_btl_base_module_t *btl, registration 
= (mca_btl_ugni_reg_t *) frag->base.super.registration; - frag->segments[1].memory_handle = registration->memory_hdl; + frag->hdr.eager.memory_handle = registration->handle; } else { frag->hdr_size = sizeof (frag->hdr.send); } @@ -285,59 +289,36 @@ mca_btl_ugni_free (struct mca_btl_base_module_t *btl, static struct mca_btl_base_descriptor_t * mca_btl_ugni_prepare_src (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, - mca_mpool_base_registration_t *registration, struct opal_convertor_t *convertor, uint8_t order, size_t reserve, size_t *size, uint32_t flags) { - if (OPAL_LIKELY(reserve)) { - return mca_btl_ugni_prepare_src_send (btl, endpoint, convertor, - order, reserve, size, flags); - } else { - return mca_btl_ugni_prepare_src_rdma (btl, endpoint, registration, - convertor, order, size, flags); - } + return mca_btl_ugni_prepare_src_send (btl, endpoint, convertor, + order, reserve, size, flags); } -static mca_btl_base_descriptor_t * -mca_btl_ugni_prepare_dst (mca_btl_base_module_t *btl, - mca_btl_base_endpoint_t *endpoint, - mca_mpool_base_registration_t *registration, - opal_convertor_t *convertor, uint8_t order, - size_t reserve, size_t *size, uint32_t flags) +static mca_btl_base_registration_handle_t * +mca_btl_ugni_register_mem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *base, + size_t size, uint32_t flags) { - mca_btl_ugni_base_frag_t *frag; - void *data_ptr; + mca_btl_ugni_reg_t *reg; int rc; - opal_convertor_get_current_pointer (convertor, &data_ptr); - - (void) MCA_BTL_UGNI_FRAG_ALLOC_RDMA(endpoint, frag); - if (OPAL_UNLIKELY(NULL == frag)) { + rc = btl->btl_mpool->mpool_register(btl->btl_mpool, base, size, 0, + (mca_mpool_base_registration_t **) ®); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { return NULL; } - /* always need to register the buffer for put/get (even for fma) */ - if (NULL == registration) { - rc = btl->btl_mpool->mpool_register(btl->btl_mpool, - data_ptr, *size, 0, - ®istration); - if 
(OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - mca_btl_ugni_frag_return (frag); - return NULL; - } - - frag->registration = (mca_btl_ugni_reg_t*) registration; - } - - frag->segments[0].memory_handle = ((mca_btl_ugni_reg_t *)registration)->memory_hdl; - frag->segments[0].base.seg_len = *size; - frag->segments[0].base.seg_addr.lval = (uint64_t)(uintptr_t) data_ptr; - - frag->base.des_local = &frag->segments->base; - frag->base.des_local_count = 1; - frag->base.order = order; - frag->base.des_flags = flags; - - return (struct mca_btl_base_descriptor_t *) frag; + return ®->handle; +} + +static int mca_btl_ugni_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle) +{ + mca_btl_ugni_reg_t *reg = + (mca_btl_ugni_reg_t *)((intptr_t) handle - offsetof (mca_btl_ugni_reg_t, handle)); + + (void) btl->btl_mpool->mpool_deregister (btl->btl_mpool, ®->base); + + return OPAL_SUCCESS; } diff --git a/opal/mca/btl/ugni/btl_ugni_prepare.h b/opal/mca/btl/ugni/btl_ugni_prepare.h index b6a9f4b4da..bd46aa227a 100644 --- a/opal/mca/btl/ugni/btl_ugni_prepare.h +++ b/opal/mca/btl/ugni/btl_ugni_prepare.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. 
* $COPYRIGHT$ @@ -35,14 +35,14 @@ mca_btl_ugni_prepare_src_send_nodata (struct mca_btl_base_module_t *btl, frag->hdr_size = reserve + sizeof (frag->hdr.send); - frag->segments[0].base.seg_addr.pval = frag->hdr.send_ex.pml_header; - frag->segments[0].base.seg_len = reserve; + frag->segments[0].seg_addr.pval = frag->hdr.send_ex.pml_header; + frag->segments[0].seg_len = reserve; - frag->segments[1].base.seg_addr.pval = NULL; - frag->segments[1].base.seg_len = 0; + frag->segments[1].seg_addr.pval = NULL; + frag->segments[1].seg_len = 0; - frag->base.des_local = &frag->segments->base; - frag->base.des_local_count = 1; + frag->base.des_segments = frag->segments; + frag->base.des_segment_count = 1; frag->base.order = order; frag->base.des_flags = flags; @@ -84,22 +84,22 @@ mca_btl_ugni_prepare_src_send_inplace (struct mca_btl_base_module_t *btl, frag->flags = MCA_BTL_UGNI_FRAG_EAGER | MCA_BTL_UGNI_FRAG_IGNORE; frag->registration = registration; - frag->segments[1].memory_handle = registration->memory_hdl; + frag->hdr.eager.memory_handle = registration->handle;; frag->hdr_size = reserve + sizeof (frag->hdr.eager); - frag->segments[0].base.seg_addr.pval = frag->hdr.eager_ex.pml_header; + frag->segments[0].seg_addr.pval = frag->hdr.eager_ex.pml_header; } else { frag->hdr_size = reserve + sizeof (frag->hdr.send); - frag->segments[0].base.seg_addr.pval = frag->hdr.send_ex.pml_header; + frag->segments[0].seg_addr.pval = frag->hdr.send_ex.pml_header; } - frag->segments[0].base.seg_len = reserve; + frag->segments[0].seg_len = reserve; - frag->segments[1].base.seg_addr.pval = data_ptr; - frag->segments[1].base.seg_len = *size; + frag->segments[1].seg_addr.pval = data_ptr; + frag->segments[1].seg_len = *size; - frag->base.des_local = &frag->segments->base; - frag->base.des_local_count = 2; + frag->base.des_segments = frag->segments; + frag->base.des_segment_count = 2; frag->base.order = order; frag->base.des_flags = flags; @@ -130,10 +130,9 @@ mca_btl_ugni_prepare_src_send_buffered 
(struct mca_btl_base_module_t *btl, registration = (mca_btl_ugni_reg_t *) frag->base.super.registration; - frag->segments[1].memory_handle = registration->memory_hdl; - + frag->hdr.eager.memory_handle = registration->handle; frag->hdr_size = reserve + sizeof (frag->hdr.eager); - frag->segments[0].base.seg_addr.pval = frag->hdr.eager_ex.pml_header; + frag->segments[0].seg_addr.pval = frag->hdr.eager_ex.pml_header; } else { (void) MCA_BTL_UGNI_FRAG_ALLOC_SMSG(endpoint, frag); if (OPAL_UNLIKELY(NULL == frag)) { @@ -141,7 +140,7 @@ mca_btl_ugni_prepare_src_send_buffered (struct mca_btl_base_module_t *btl, } frag->hdr_size = reserve + sizeof (frag->hdr.send); - frag->segments[0].base.seg_addr.pval = frag->hdr.send_ex.pml_header; + frag->segments[0].seg_addr.pval = frag->hdr.send_ex.pml_header; } frag->flags |= MCA_BTL_UGNI_FRAG_BUFFERED; @@ -155,13 +154,13 @@ mca_btl_ugni_prepare_src_send_buffered (struct mca_btl_base_module_t *btl, return NULL; } - frag->segments[0].base.seg_len = reserve; + frag->segments[0].seg_len = reserve; - frag->segments[1].base.seg_addr.pval = frag->base.super.ptr; - frag->segments[1].base.seg_len = *size; + frag->segments[1].seg_addr.pval = frag->base.super.ptr; + frag->segments[1].seg_len = *size; - frag->base.des_local = &frag->segments->base; - frag->base.des_local_count = 2; + frag->base.des_segments = frag->segments; + frag->base.des_segment_count = 2; frag->base.order = order; frag->base.des_flags = flags; @@ -197,66 +196,4 @@ mca_btl_ugni_prepare_src_send (struct mca_btl_base_module_t *btl, } } -static inline struct mca_btl_base_descriptor_t * -mca_btl_ugni_prepare_src_rdma (struct mca_btl_base_module_t *btl, - mca_btl_base_endpoint_t *endpoint, - mca_mpool_base_registration_t *registration, - struct opal_convertor_t *convertor, - uint8_t order, size_t *size, - uint32_t flags) -{ - mca_btl_ugni_base_frag_t *frag; - void *data_ptr; - int rc; - - opal_convertor_get_current_pointer (convertor, &data_ptr); - - (void) 
MCA_BTL_UGNI_FRAG_ALLOC_RDMA(endpoint, frag); - if (OPAL_UNLIKELY(NULL == frag)) { - return NULL; - } - - /* - * For medium message use FMA protocols and for large message - * use BTE protocols - */ - /* No need to register while using FMA Put (registration is - * non-null in get-- is this always true?) */ - if (*size >= mca_btl_ugni_component.ugni_fma_limit || (flags & MCA_BTL_DES_FLAGS_GET)) { - if (NULL == registration) { - rc = btl->btl_mpool->mpool_register(btl->btl_mpool, data_ptr, *size, 0, - (mca_mpool_base_registration_t **) ®istration); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - mca_btl_ugni_frag_return (frag); - return NULL; - } - - frag->registration = (mca_btl_ugni_reg_t *) registration; - } - - frag->segments[0].memory_handle = ((mca_btl_ugni_reg_t *)registration)->memory_hdl; - } else { - memset ((void *) &frag->segments[0].memory_handle, 0, - sizeof (frag->segments[0].memory_handle)); - } - - if ((flags & MCA_BTL_DES_FLAGS_GET) && (*size & 0x3)) { - memmove (frag->segments[0].extra_bytes, (char *) data_ptr + (*size & ~0x3), - *size & 0x3); - frag->segments[0].extra_byte_count = *size & 0x3; - } else { - frag->segments[0].extra_byte_count = 0; - } - - frag->segments[0].base.seg_addr.lval = (uint64_t)(uintptr_t) data_ptr; - frag->segments[0].base.seg_len = *size; - - frag->base.des_local = &frag->segments->base; - frag->base.des_local_count = 1; - frag->base.order = order; - frag->base.des_flags = flags; - - return &frag->base; -} - #endif diff --git a/opal/mca/btl/ugni/btl_ugni_progress_thread.c b/opal/mca/btl/ugni/btl_ugni_progress_thread.c index e2d35cc4cc..23f6a28614 100644 --- a/opal/mca/btl/ugni/btl_ugni_progress_thread.c +++ b/opal/mca/btl/ugni/btl_ugni_progress_thread.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. 
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. * $COPYRIGHT$ @@ -20,17 +20,13 @@ static pthread_t mca_btl_ugni_progress_thread_id; -static pthread_mutex_t progress_mutex = PTHREAD_MUTEX_INITIALIZER; -static pthread_cond_t progress_cond = PTHREAD_COND_INITIALIZER; static volatile int stop_progress_thread = 0; -static volatile int progress_thread_done = 0; -static int thread_wakeups = 0; +unsigned int mca_btl_ugni_progress_thread_wakeups; static void *mca_btl_ugni_prog_thread_fn(void * data) { - int rc,ret = OPAL_SUCCESS; uint32_t which; gni_return_t status; gni_cq_handle_t cq_vec[2]; @@ -59,36 +55,12 @@ static void *mca_btl_ugni_prog_thread_fn(void * data) if (status == GNI_RC_NOT_DONE) continue; if ((status == GNI_RC_SUCCESS) && (stop_progress_thread == 0)) { - thread_wakeups++; + mca_btl_ugni_progress_thread_wakeups++; opal_progress(); } } - /* Send a signal to the main thread saying we are done */ - rc = pthread_mutex_lock(&progress_mutex); - if (0 != rc) { - BTL_ERROR(("btl/ugni pthread_mutex_lock returned %s ",strerror(rc))); - ret = OPAL_ERROR; - goto fn_exit; - } - - progress_thread_done = 1; - - rc = pthread_mutex_unlock(&progress_mutex); - if (0 != rc) { - BTL_ERROR(("btl/ugni pthread_mutex_unlock returned %s ",strerror(rc))); - ret = OPAL_ERROR; - goto fn_exit; - } - - rc = pthread_cond_signal(&progress_cond); - if (0 != rc) { - BTL_ERROR(("btl/ugni pthread_cond_signal returned %s ",strerror(rc))); - ret = OPAL_ERROR; - } - - fn_exit: - return ret; + return (void *) (intptr_t) OPAL_SUCCESS; } int mca_btl_ugni_spawn_progress_thread(struct mca_btl_base_module_t *btl) @@ -124,9 +96,8 @@ int mca_btl_ugni_spawn_progress_thread(struct mca_btl_base_module_t *btl) int mca_btl_ugni_kill_progress_thread(void) { - int rc, ret=OPAL_SUCCESS; - gni_return_t status; - static mca_btl_ugni_base_frag_t cq_write_frag; + int ret=OPAL_SUCCESS; + void *thread_rc; stop_progress_thread = 1; @@ -134,61 +105,23 @@ int mca_btl_ugni_kill_progress_thread(void) * post a CQ 
to myself to wake my thread up */ - cq_write_frag.post_desc.base.type = GNI_POST_CQWRITE; - cq_write_frag.post_desc.base.cqwrite_value = 0xdead; /* up to 48 bytes here, not used for now */ - cq_write_frag.post_desc.base.cq_mode = GNI_CQMODE_GLOBAL_EVENT; - cq_write_frag.post_desc.base.dlvr_mode = GNI_DLVMODE_IN_ORDER; - cq_write_frag.post_desc.base.src_cq_hndl = mca_btl_ugni_component.modules[0].rdma_local_cq; - cq_write_frag.post_desc.base.remote_mem_hndl = mca_btl_ugni_component.modules[0].device->smsg_irq_mhndl; - cq_write_frag.post_desc.tries = 0; - cq_write_frag.cbfunc = NULL; - OPAL_THREAD_LOCK(&mca_btl_ugni_component.modules[0].device->dev_lock); - status = GNI_PostCqWrite(mca_btl_ugni_component.modules[0].local_ep, - &cq_write_frag.post_desc.base); - OPAL_THREAD_UNLOCK(&mca_btl_ugni_component.modules[0].device->dev_lock); + ret = mca_btl_ugni_post_cqwrite (mca_btl_ugni_component.modules[0].local_ep, + mca_btl_ugni_component.modules[0].rdma_local_cq, + mca_btl_ugni_component.modules[0].device->smsg_irq_mhndl, + 0xdead, NULL, NULL, NULL); /* * TODO: if error returned, need to kill off thread manually */ - if (GNI_RC_SUCCESS != status) { - BTL_ERROR(("GNI_PostCqWrite returned error - %s",gni_err_str[status])); - ret = opal_common_rc_ugni_to_opal(status); + if (OPAL_SUCCESS != ret) { + /* force the thread to exit */ + pthread_cancel (mca_btl_ugni_progress_thread_id); goto fn_exit; } - rc = pthread_mutex_lock(&progress_mutex); - if (0 != rc) { - BTL_ERROR(("btl/ugni pthread_mutex_lock returned %s ",strerror(rc))); - ret = OPAL_ERROR; - goto fn_exit; - } - - while (!progress_thread_done) { - pthread_cond_wait(&progress_cond, &progress_mutex); - if (0 != rc) { - BTL_ERROR(("btl/ugni pthread_cond_wait returned %s ",strerror(rc))); - ret = OPAL_ERROR; - goto fn_exit; - } - } - - rc = pthread_mutex_unlock(&progress_mutex); - if (0 != rc) { - BTL_ERROR(("btl/ugni pthread_mutex_unlock returned %s ",strerror(rc))); - ret = OPAL_ERROR; - goto fn_exit; - } - - /* - * 
destroy the local_ep - */ - - OPAL_THREAD_LOCK(&mca_btl_ugni_component.modules[0].device->dev_lock); - status = GNI_EpDestroy (mca_btl_ugni_component.modules[0].local_ep); - OPAL_THREAD_UNLOCK(&mca_btl_ugni_component.modules[0].device->dev_lock); - if (OPAL_UNLIKELY(GNI_RC_SUCCESS != status)) { - BTL_ERROR(("GNI_EpDestroy returned error - %s", gni_err_str[status])); - ret = opal_common_rc_ugni_to_opal(status); - goto fn_exit; + pthread_join (mca_btl_ugni_progress_thread_id, &thread_rc); + if (0 != (intptr_t) thread_rc) { + BTL_ERROR(("btl/ugni error returned from progress thread: %d", (int) (intptr_t) thread_rc)); + ret = (int)(intptr_t) thread_rc; } fn_exit: diff --git a/opal/mca/btl/ugni/btl_ugni_put.c b/opal/mca/btl/ugni/btl_ugni_put.c index d77396b6f1..2729314e37 100644 --- a/opal/mca/btl/ugni/btl_ugni_put.c +++ b/opal/mca/btl/ugni/btl_ugni_put.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. * $COPYRIGHT$ @@ -14,25 +14,17 @@ #include "btl_ugni_rdma.h" -/** - * Initiate a put operation. 
- * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred - */ -int mca_btl_ugni_put (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des) { - mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) des; - - BTL_VERBOSE(("Using RDMA/FMA Put for frag %p", (void *) des)); +int mca_btl_ugni_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +{ + BTL_VERBOSE(("Using RDMA/FMA Put from local address %p to remote address %" PRIx64, + local_address, remote_address)); /* cause endpoint to bind if it isn't already (bind is sufficient for rdma) */ - (void) mca_btl_ugni_check_endpoint_state(endpoint); + (void) mca_btl_ugni_check_endpoint_state_rdma (endpoint); - des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - - return mca_btl_ugni_post (frag, false, (mca_btl_ugni_segment_t *) des->des_local, - (mca_btl_ugni_segment_t *) des->des_remote); + return mca_btl_ugni_post (endpoint, false, size, local_address, remote_address, local_handle, + remote_handle, order, cbfunc, cbcontext, cbdata); } diff --git a/opal/mca/btl/ugni/btl_ugni_rdma.h b/opal/mca/btl/ugni/btl_ugni_rdma.h index 120d890a09..bcc8a5f33c 100644 --- a/opal/mca/btl/ugni/btl_ugni_rdma.h +++ b/opal/mca/btl/ugni/btl_ugni_rdma.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. 
* $COPYRIGHT$ @@ -20,107 +20,185 @@ int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep, mca_btl_ugni_eager_ex_frag_hdr_t hdr, mca_btl_ugni_base_frag_t *frag); -static inline void init_gni_post_desc (mca_btl_ugni_base_frag_t *frag, - gni_post_type_t op_type, - uint64_t lcl_addr, - gni_mem_handle_t lcl_mdh, - uint64_t rem_addr, - gni_mem_handle_t rem_mdh, - uint64_t bufsize, - gni_cq_handle_t cq_hndl) { - frag->post_desc.base.type = op_type; - frag->post_desc.base.cq_mode = GNI_CQMODE_GLOBAL_EVENT; - frag->post_desc.base.dlvr_mode = GNI_DLVMODE_PERFORMANCE; - frag->post_desc.base.local_addr = (uint64_t) lcl_addr; - frag->post_desc.base.local_mem_hndl = lcl_mdh; - frag->post_desc.base.remote_addr = (uint64_t) rem_addr; - frag->post_desc.base.remote_mem_hndl = rem_mdh; - frag->post_desc.base.length = bufsize; - frag->post_desc.base.rdma_mode = 0; - frag->post_desc.base.rdma_mode = 0; - frag->post_desc.base.src_cq_hndl = cq_hndl; - frag->post_desc.tries = 0; +static inline void init_gni_post_desc (opal_common_ugni_post_desc_t *post_desc, + int order, gni_post_type_t op_type, + uint64_t lcl_addr, + gni_mem_handle_t lcl_mdh, + uint64_t rem_addr, + gni_mem_handle_t rem_mdh, + uint64_t bufsize, + gni_cq_handle_t cq_hndl) { + post_desc->base.type = op_type; + post_desc->base.cq_mode = GNI_CQMODE_GLOBAL_EVENT; + if (MCA_BTL_NO_ORDER == order) { + post_desc->base.dlvr_mode = GNI_DLVMODE_PERFORMANCE; + } else { + post_desc->base.dlvr_mode = GNI_DLVMODE_NO_ADAPT; + } + post_desc->base.local_addr = (uint64_t) lcl_addr; + post_desc->base.local_mem_hndl = lcl_mdh; + post_desc->base.remote_addr = (uint64_t) rem_addr; + post_desc->base.remote_mem_hndl = rem_mdh; + post_desc->base.length = bufsize; + post_desc->base.rdma_mode = 0; + post_desc->base.src_cq_hndl = cq_hndl; + post_desc->tries = 0; } -static inline int mca_btl_ugni_post_fma (mca_btl_ugni_base_frag_t *frag, gni_post_type_t op_type, - mca_btl_ugni_segment_t *lcl_seg, mca_btl_ugni_segment_t *rem_seg) +static 
inline int mca_btl_ugni_post_fma (struct mca_btl_base_endpoint_t *endpoint, gni_post_type_t op_type, + size_t size, void *local_address, uint64_t remote_address, + mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata) { - gni_return_t rc; + mca_btl_ugni_post_descriptor_t *post_desc; + gni_return_t grc; - /* Post descriptor (CQ is ignored for FMA transactions) */ - init_gni_post_desc (frag, op_type, lcl_seg->base.seg_addr.lval, lcl_seg->memory_handle, - rem_seg->base.seg_addr.lval, rem_seg->memory_handle, lcl_seg->base.seg_len, 0); + mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata, &post_desc); + if (OPAL_UNLIKELY(NULL == post_desc)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } - OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock); - rc = GNI_PostFma (frag->endpoint->rdma_ep_handle, &frag->post_desc.base); - OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock); - if (GNI_RC_SUCCESS != rc) { - BTL_VERBOSE(("GNI_PostFma failed with gni rc: %d", rc)); + /* Post descriptor (CQ is ignored for FMA transactions) -- The CQ associated with the endpoint + * is used. 
*/ + init_gni_post_desc (&post_desc->desc, order, op_type, (intptr_t) local_address, local_handle->gni_handle, + remote_address, remote_handle->gni_handle, size, 0); + + OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock); + grc = GNI_PostFma (endpoint->rdma_ep_handle, &post_desc->desc.base); + OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock); + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) { + mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc); + + if (GNI_RC_ALIGNMENT_ERROR == grc) { + BTL_VERBOSE(("GNI_PostFma failed with an alignment error")); + return OPAL_ERR_NOT_AVAILABLE; + } + + BTL_VERBOSE(("GNI_PostFma failed with gni rc: %d", grc)); return OPAL_ERR_OUT_OF_RESOURCE; } return OPAL_SUCCESS; } -static inline int mca_btl_ugni_post_bte (mca_btl_ugni_base_frag_t *frag, gni_post_type_t op_type, - mca_btl_ugni_segment_t *lcl_seg, mca_btl_ugni_segment_t *rem_seg) +static inline int mca_btl_ugni_post_bte (mca_btl_base_endpoint_t *endpoint, gni_post_type_t op_type, + size_t size, void *local_address, uint64_t remote_address, + mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata) { + mca_btl_ugni_post_descriptor_t *post_desc; + gni_cq_handle_t cq_handle = endpoint->btl->rdma_local_cq; gni_return_t status; - /* Post descriptor */ - if (mca_btl_ugni_component.progress_thread_enabled) { - init_gni_post_desc (frag, op_type, lcl_seg->base.seg_addr.lval, lcl_seg->memory_handle, - rem_seg->base.seg_addr.lval, rem_seg->memory_handle, lcl_seg->base.seg_len, - frag->endpoint->btl->rdma_local_irq_cq); - } else { - init_gni_post_desc (frag, op_type, lcl_seg->base.seg_addr.lval, lcl_seg->memory_handle, - rem_seg->base.seg_addr.lval, rem_seg->memory_handle, lcl_seg->base.seg_len, - frag->endpoint->btl->rdma_local_cq); + mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata, &post_desc); + if 
(OPAL_UNLIKELY(NULL == post_desc)) { + return OPAL_ERR_OUT_OF_RESOURCE; } - OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock); - status = GNI_PostRdma (frag->endpoint->rdma_ep_handle, &frag->post_desc.base); - OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock); - if (GNI_RC_SUCCESS != status) { + if (mca_btl_ugni_component.progress_thread_enabled) { + cq_handle = endpoint->btl->rdma_local_irq_cq; + } + + /* Post descriptor */ + init_gni_post_desc (&post_desc->desc, order, op_type, (intptr_t) local_address, local_handle->gni_handle, + remote_address, remote_handle->gni_handle, size, cq_handle); + + OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock); + status = GNI_PostRdma (endpoint->rdma_ep_handle, &post_desc->desc.base); + OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock); + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != status)) { + mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc); + + if (GNI_RC_ALIGNMENT_ERROR == status) { + BTL_VERBOSE(("GNI_PostRdma failed with an alignment error")); + return OPAL_ERR_NOT_AVAILABLE; + } + BTL_VERBOSE(("GNI_PostRdma failed with gni rc: %d", status)); - return opal_common_rc_ugni_to_opal(status); + return OPAL_ERR_OUT_OF_RESOURCE; } return OPAL_SUCCESS; } -static inline int mca_btl_ugni_post (mca_btl_ugni_base_frag_t *frag, bool get, mca_btl_ugni_segment_t *lcl_seg, - mca_btl_ugni_segment_t *rem_seg) { +static inline int mca_btl_ugni_post_cqwrite (mca_btl_base_endpoint_t *endpoint, gni_cq_handle_t cq_handle, + gni_mem_handle_t irq_mhndl, uint64_t value, + mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata) +{ + mca_btl_ugni_post_descriptor_t *post_desc; + gni_return_t grc; + + mca_btl_ugni_alloc_post_descriptor (endpoint, NULL, cbfunc, cbcontext, cbdata, &post_desc); + if (OPAL_UNLIKELY(NULL == post_desc)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + post_desc->desc.base.type = GNI_POST_CQWRITE; + post_desc->desc.base.cqwrite_value = value; /* up to 48 bytes here, not used for now */ 
+ post_desc->desc.base.cq_mode = GNI_CQMODE_GLOBAL_EVENT; + post_desc->desc.base.dlvr_mode = GNI_DLVMODE_IN_ORDER; + post_desc->desc.base.src_cq_hndl = cq_handle; + post_desc->desc.base.remote_mem_hndl = irq_mhndl; + post_desc->desc.tries = 0; + + OPAL_THREAD_LOCK(&endpoint->common->dev->dev_lock); + grc = GNI_PostCqWrite(endpoint->rdma_ep_handle, &post_desc->desc.base); + OPAL_THREAD_UNLOCK(&endpoint->common->dev->dev_lock); + if (GNI_RC_SUCCESS != grc) { /* errors for PostCqWrite treated as non-fatal */ + BTL_VERBOSE(("GNI_PostCqWrite returned error - %s", gni_err_str[grc])); + mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc); + } + + return opal_common_rc_ugni_to_opal (grc); +} + +static inline int mca_btl_ugni_post (mca_btl_base_endpoint_t *endpoint, int get, size_t size, + void *local_address, uint64_t remote_address, + mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata) +{ const gni_post_type_t fma_ops[2] = {GNI_POST_FMA_PUT, GNI_POST_FMA_GET}; const gni_post_type_t rdma_ops[2] = {GNI_POST_RDMA_PUT, GNI_POST_RDMA_GET}; - if (frag->base.des_local->seg_len <= mca_btl_ugni_component.ugni_fma_limit) { - return mca_btl_ugni_post_fma (frag, fma_ops[get], lcl_seg, rem_seg); + if (size <= mca_btl_ugni_component.ugni_fma_limit) { + return mca_btl_ugni_post_fma (endpoint, fma_ops[get], size, local_address, remote_address, + local_handle, remote_handle, order, cbfunc, cbcontext, cbdata); } - return mca_btl_ugni_post_bte (frag, rdma_ops[get], lcl_seg, rem_seg); + return mca_btl_ugni_post_bte (endpoint, rdma_ops[get], size, local_address, remote_address, + local_handle, remote_handle, order, cbfunc, cbcontext, cbdata); } -static inline void mca_btl_ugni_repost (mca_btl_ugni_base_frag_t *frag) { +static inline int mca_btl_ugni_repost (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_post_descriptor_t *post_desc) +{ 
gni_return_t grc; - OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock); - if (GNI_POST_RDMA_PUT == frag->post_desc.base.type || - GNI_POST_RDMA_GET == frag->post_desc.base.type) { - grc = GNI_PostRdma (frag->endpoint->rdma_ep_handle, &frag->post_desc.base); + OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); + if (GNI_POST_RDMA_PUT == post_desc->desc.base.type || + GNI_POST_RDMA_GET == post_desc->desc.base.type) { + grc = GNI_PostRdma (post_desc->endpoint->rdma_ep_handle, &post_desc->desc.base); } else { - grc = GNI_PostFma (frag->endpoint->rdma_ep_handle, &frag->post_desc.base); + grc = GNI_PostFma (post_desc->endpoint->rdma_ep_handle, &post_desc->desc.base); } - OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock); + OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) { /* NTH: Should we even retry these? When this code was written there was no indication * whether an error in post is recoverable. Clobber this code and the associated data * structures if post errors are not recoverable. 
*/ - OPAL_THREAD_LOCK(&frag->endpoint->btl->failed_frags_lock); - opal_list_append (&frag->endpoint->btl->failed_frags, (opal_list_item_t *) frag); - OPAL_THREAD_UNLOCK(&frag->endpoint->btl->failed_frags_lock); + OPAL_THREAD_LOCK(&ugni_module->pending_descriptors_lock); + opal_list_append (&ugni_module->pending_descriptors, (opal_list_item_t *) post_desc); + OPAL_THREAD_UNLOCK(&ugni_module->pending_descriptors_lock); } + + return opal_common_rc_ugni_to_opal (grc); } #endif /* MCA_BTL_UGNI_RDMA_H */ diff --git a/opal/mca/btl/ugni/btl_ugni_send.c b/opal/mca/btl/ugni/btl_ugni_send.c index aa014ebd17..d20881aca0 100644 --- a/opal/mca/btl/ugni/btl_ugni_send.c +++ b/opal/mca/btl/ugni/btl_ugni_send.c @@ -23,7 +23,7 @@ int mca_btl_ugni_send (struct mca_btl_base_module_t *btl, mca_btl_base_tag_t tag) { mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) descriptor; - size_t size = frag->segments[0].base.seg_len + frag->segments[1].base.seg_len; + size_t size = frag->segments[0].seg_len + frag->segments[1].seg_len; mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl; int flags_save = frag->base.des_flags; int rc; @@ -41,7 +41,7 @@ int mca_btl_ugni_send (struct mca_btl_base_module_t *btl, } BTL_VERBOSE(("btl/ugni sending descriptor %p from %d -> %d. 
length = %" PRIu64, (void *)descriptor, - OPAL_PROC_MY_NAME.vpid, endpoint->common->ep_rem_id, frag->segments[0].base.seg_len)); + OPAL_PROC_MY_NAME.vpid, endpoint->common->ep_rem_id, size)); /* temporarily disable ownership and callback flags so we can reliably check the complete flag */ frag->base.des_flags &= ~(MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK); @@ -90,14 +90,13 @@ int mca_btl_ugni_send (struct mca_btl_base_module_t *btl, return rc; } -int -mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct opal_convertor_t *convertor, - void *header, size_t header_size, - size_t payload_size, uint8_t order, - uint32_t flags, mca_btl_base_tag_t tag, - mca_btl_base_descriptor_t **descriptor) +int mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, + void *header, size_t header_size, + size_t payload_size, uint8_t order, + uint32_t flags, mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t **descriptor) { size_t total_size = header_size + payload_size; mca_btl_ugni_base_frag_t *frag = NULL; @@ -118,13 +117,14 @@ mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl, frag = (mca_btl_ugni_base_frag_t *) mca_btl_ugni_prepare_src_send_buffered (btl, endpoint, convertor, order, header_size, &packed_size, flags); } + assert (packed_size == payload_size); if (OPAL_UNLIKELY(NULL == frag)) { break; } frag->hdr.send.lag = (tag << 24) | total_size; - memcpy (frag->segments[0].base.seg_addr.pval, header, header_size); + memcpy (frag->segments[0].seg_addr.pval, header, header_size); rc = mca_btl_ugni_send_frag (endpoint, frag); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { @@ -135,7 +135,9 @@ mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl, return OPAL_SUCCESS; } while (0); - *descriptor = NULL; + if (NULL != descriptor) { + *descriptor = NULL; + } return OPAL_ERR_OUT_OF_RESOURCE; } @@ -151,7 +153,13 @@ int 
mca_btl_ugni_progress_send_wait_list (mca_btl_base_endpoint_t *endpoint) if (NULL == frag) { break; } - rc = mca_btl_ugni_send_frag (endpoint, frag); + if (OPAL_LIKELY(!(frag->flags & MCA_BTL_UGNI_FRAG_RESPONSE))) { + rc = mca_btl_ugni_send_frag (endpoint, frag); + } else { + rc = opal_mca_btl_ugni_smsg_send (frag, &frag->hdr.rdma, sizeof (frag->hdr.rdma), + NULL, 0, MCA_BTL_UGNI_TAG_RDMA_COMPLETE); + } + if (OPAL_UNLIKELY(OPAL_SUCCESS > rc)) { if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) { OPAL_THREAD_LOCK(&endpoint->lock); diff --git a/opal/mca/btl/ugni/btl_ugni_smsg.c b/opal/mca/btl/ugni/btl_ugni_smsg.c index c4bf94f79e..cd857f96f8 100644 --- a/opal/mca/btl/ugni/btl_ugni_smsg.c +++ b/opal/mca/btl/ugni/btl_ugni_smsg.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. * $COPYRIGHT$ @@ -26,7 +26,7 @@ static void mca_btl_ugni_smsg_mbox_construct (mca_btl_ugni_smsg_mbox_t *mbox) { mbox->attr.smsg_attr.mbox_offset = (uintptr_t) mbox->super.ptr - (uintptr_t) base_reg->base; mbox->attr.smsg_attr.msg_buffer = base_reg->base; mbox->attr.smsg_attr.buff_size = mca_btl_ugni_component.smsg_mbox_size; - mbox->attr.smsg_attr.mem_hndl = ugni_reg->memory_hdl; + mbox->attr.smsg_attr.mem_hndl = ugni_reg->handle.gni_handle; mbox->attr.proc_id = mca_btl_ugni_proc_name_to_id (OPAL_PROC_MY_NAME); mbox->attr.rmt_irq_mem_hndl = mca_btl_ugni_component.modules[0].device->smsg_irq_mhndl; } @@ -106,8 +106,8 @@ int mca_btl_ugni_smsg_process (mca_btl_base_endpoint_t *ep) BTL_VERBOSE(("received smsg fragment. 
hdr = {len = %u, tag = %d}", len, tag)); reg = mca_btl_base_active_message_trigger + tag; - frag.base.des_local = &seg; - frag.base.des_local_count = 1; + frag.base.des_segments = &seg; + frag.base.des_segment_count = 1; seg.seg_addr.pval = (void *)((uintptr_t)data_ptr + sizeof (mca_btl_ugni_send_frag_hdr_t)); seg.seg_len = len; diff --git a/opal/mca/btl/ugni/btl_ugni_smsg.h b/opal/mca/btl/ugni/btl_ugni_smsg.h index 4298a5083b..2a191a2eee 100644 --- a/opal/mca/btl/ugni/btl_ugni_smsg.h +++ b/opal/mca/btl/ugni/btl_ugni_smsg.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. * $COPYRIGHT$ @@ -82,22 +82,12 @@ static inline int mca_btl_ugni_progress_local_smsg (mca_btl_ugni_module_t *ugni_ return 1; } -static void mca_btl_ugni_cqwrite_complete (struct mca_btl_ugni_base_frag_t *frag, int rc) -{ - frag->flags |= MCA_BTL_UGNI_FRAG_COMPLETE; - - BTL_VERBOSE(("cqwrite frag complete")); - mca_btl_ugni_frag_return (frag); -} - static inline int opal_mca_btl_ugni_smsg_send (mca_btl_ugni_base_frag_t *frag, void *hdr, size_t hdr_len, void *payload, size_t payload_len, mca_btl_ugni_smsg_tag_t tag) { - int rc; gni_return_t grc; - mca_btl_ugni_base_frag_t *cq_write_frag = NULL; OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock); grc = GNI_SmsgSendWTag (frag->endpoint->smsg_ep_handle, hdr, hdr_len, @@ -110,28 +100,9 @@ static inline int opal_mca_btl_ugni_smsg_send (mca_btl_ugni_base_frag_t *frag, if (mca_btl_ugni_component.progress_thread_enabled) { if (frag->base.des_flags & MCA_BTL_DES_FLAGS_SIGNAL) { - rc = mca_btl_ugni_frag_alloc(frag->endpoint, - &frag->endpoint->btl->rdma_frags, - &cq_write_frag); - if (rc == OPAL_SUCCESS) { - cq_write_frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; - cq_write_frag->registration = 
NULL; - cq_write_frag->endpoint = frag->endpoint; - cq_write_frag->post_desc.base.type = GNI_POST_CQWRITE; - cq_write_frag->post_desc.base.cqwrite_value = 0xdead; /* up to 48 bytes here, not used for now */ - cq_write_frag->post_desc.base.cq_mode = GNI_CQMODE_GLOBAL_EVENT; - cq_write_frag->post_desc.base.dlvr_mode = GNI_DLVMODE_IN_ORDER; - cq_write_frag->post_desc.base.src_cq_hndl = frag->endpoint->btl->rdma_local_cq; - cq_write_frag->post_desc.base.remote_mem_hndl = frag->endpoint->rmt_irq_mem_hndl; - cq_write_frag->post_desc.tries = 0; - cq_write_frag->cbfunc = mca_btl_ugni_cqwrite_complete; - OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock); - grc = GNI_PostCqWrite(frag->endpoint->rdma_ep_handle, &cq_write_frag->post_desc.base); - OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock); - if (grc == GNI_RC_ERROR_RESOURCE) { /* errors for PostCqWrite treated as non-fatal */ - mca_btl_ugni_frag_return (cq_write_frag); - } - } + /* errors for PostCqWrite treated as non-fatal */ + (void) mca_btl_ugni_post_cqwrite (frag->endpoint, frag->endpoint->btl->rdma_local_cq, + frag->endpoint->rmt_irq_mem_hndl, 0xdead, NULL, NULL, NULL); } } @@ -155,12 +126,13 @@ static inline int mca_btl_ugni_send_frag (struct mca_btl_base_endpoint_t *btl_pe mca_btl_ugni_base_frag_t *frag) { if (OPAL_LIKELY(!(frag->flags & MCA_BTL_UGNI_FRAG_EAGER))) { return opal_mca_btl_ugni_smsg_send (frag, &frag->hdr.send, frag->hdr_size, - frag->segments[1].base.seg_addr.pval, - frag->segments[1].base.seg_len, + frag->segments[1].seg_addr.pval, + frag->segments[1].seg_len, MCA_BTL_UGNI_TAG_SEND); } - frag->hdr.eager.src_seg = frag->segments[1]; + frag->hdr.eager.size = frag->segments[1].seg_len; + frag->hdr.eager.address = frag->segments[1].seg_addr.lval; frag->hdr.eager.ctx = (void *) frag; return opal_mca_btl_ugni_smsg_send (frag, &frag->hdr.eager, frag->hdr_size, diff --git a/opal/mca/btl/usnic/README.txt b/opal/mca/btl/usnic/README.txt index 6483372e96..6166ce37f5 100644 --- 
a/opal/mca/btl/usnic/README.txt +++ b/opal/mca/btl/usnic/README.txt @@ -141,9 +141,12 @@ After the checks above are done, the fragment is enqueued to be sent via opal_btl_usnic_endpoint_enqueue_frag() usnic_put() -PML will have filled in destination address in descriptor. This is saved -and the fragment is enqueued for processing. - +Do a fast version of what happens in prepare_src() (can take shortcuts +because we know it will always be a contiguous buffer / no convertor +needed). PML gives us the destination address, which we save on the +fragment (which is the sentinel value that the underlying engine uses +to know that this is a PUT and not a SEND), and the fragment is +enqueued for processing. opal_btl_usnic_endpoint_enqueue_frag() This appends the fragment to the "to be sent" list of the endpoint and @@ -200,8 +203,6 @@ opal_btl_usnic_recv_fast() called fastpath_ok which is set to false every time the fastpath is taken. A call into the regular progress routine will set this flag back to true. - - ====================================== reliability: @@ -233,7 +234,6 @@ rcvr: sender: duplicate ACK triggers immediate retrans if one is not pending for that segment - ====================================== Reordering induced by two queues and piggy-backing: @@ -248,6 +248,42 @@ keep command queue empty enough and also beat out the large sends. send credits limit how many larges can be queued on the sender, but there could be many on the receiver + +====================================== +RDMA emulation + +We emulate the RDMA PUT because it's more efficient than regular send: +it allows the receive to copy directly to the target buffer +(vs. making an intermediate copy out of the bounce buffer). + +It would actually be better to morph this PUT into a GET -- GET would +be slightly more efficient. In short, when the target requests the +actual RDMA data, with PUT, the request has to go up to the PML, which +will then invoke PUT on the source's BTL module. 
With GET, the target +issues the GET, and the source BTL module can reply without needing to +go up the stack to the PML. + +Once we start supporting RDMA in hardware: + +- we need to provide module.btl_register_mem and + module.btl_deregister_mem functions (see openib for an example) +- we need to put something meaningful in + btl_usnic_frag.h:mca_btl_base_registration_handle_t. +- we need to set module.btl_registration_handle_size to sizeof(struct + mca_btl_base_registration_handle_t). +- module.btl_put / module.btl_get will receive the + mca_btl_base_registration_handle_t from the peer as a cookie. + +Also, module.btl_put / module.btl_get do not need to make descriptors +(this was an optimization added in BTL 3.0). They are now called with +enough information to do whatever they need to do. module.btl_put +still makes a descriptor and submits it to the usnic sending engine so +as to utilize a common infrastructure for send and put. + +But it doesn't necessarily have to be that way -- we could optimize +out the use of the descriptors. Have not investigated how easy/hard +that would be. + ====================================== November 2014 / SC 2014 diff --git a/opal/mca/btl/usnic/btl_usnic_ack.c b/opal/mca/btl/usnic/btl_usnic_ack.c index ebb3214546..03492bc548 100644 --- a/opal/mca/btl/usnic/btl_usnic_ack.c +++ b/opal/mca/btl/usnic/btl_usnic_ack.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -147,11 +147,21 @@ opal_btl_usnic_handle_ack( * fragment really needs to be freed, we'll take care of it in a few * lines below. 
*/ - if (frag->sf_ack_bytes_left == bytes_acked && - ((frag->sf_base.uf_remote_seg[0].seg_addr.pval != NULL) || - (frag->sf_base.uf_base.des_flags & - MCA_BTL_DES_SEND_ALWAYS_CALLBACK))) { - OPAL_BTL_USNIC_DO_SEND_FRAG_CB(module, frag, "send completion"); + if (frag->sf_ack_bytes_left == bytes_acked) { +#if BTL_VERSION == 30 + if (frag->sf_base.uf_remote_seg[0].seg_addr.pval != NULL) { + OPAL_BTL_USNIC_DO_PUT_FRAG_CB(module, frag, "put completion"); + } else if (frag->sf_base.uf_base.des_flags & + MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { + OPAL_BTL_USNIC_DO_SEND_FRAG_CB(module, frag, "send completion"); + } +#else + if ((frag->sf_base.uf_remote_seg[0].seg_addr.pval != NULL) || + (frag->sf_base.uf_base.des_flags & + MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) { + OPAL_BTL_USNIC_DO_SEND_FRAG_CB(module, frag, "send completion"); + } +#endif } /* free this segment */ diff --git a/opal/mca/btl/usnic/btl_usnic_ack.h b/opal/mca/btl/usnic/btl_usnic_ack.h index 04b63182b0..0aaf8306d7 100644 --- a/opal/mca/btl/usnic/btl_usnic_ack.h +++ b/opal/mca/btl/usnic/btl_usnic_ack.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -17,12 +17,13 @@ #include "btl_usnic.h" #include "btl_usnic_frag.h" #include "btl_usnic_endpoint.h" +#include "btl_usnic_compat.h" -/* Invoke the descriptor callback for the frag, updating stats and clearing the - * _CALLBACK flag in the process. */ +/* Invoke the descriptor callback for a (non-PUT) send frag, updating + * stats and clearing the _CALLBACK flag in the process. 
*/ #define OPAL_BTL_USNIC_DO_SEND_FRAG_CB(module, send_frag, comment) \ do { \ - MSGDEBUG1_OUT("%s:%d: %s send callback for module=%p frag=%p\n", \ + MSGDEBUG1_OUT("%s:%d: %s SEND callback for module=%p frag=%p\n", \ __func__, __LINE__, \ (comment), (void *)(module), (void *)(send_frag)); \ (send_frag)->sf_base.uf_base.des_cbfunc( \ @@ -34,6 +35,28 @@ ++((module)->stats.pml_send_callbacks); \ } while (0) +#if BTL_VERSION == 30 +/* Invoke the descriptor callback for a send frag that was a PUT, + * updating stats and clearing the _CALLBACK flag in the process. */ +#define OPAL_BTL_USNIC_DO_PUT_FRAG_CB(module, send_frag, comment) \ + do { \ + MSGDEBUG1_OUT("%s:%d: %s PUT callback for module=%p frag=%p\n", \ + __func__, __LINE__, \ + (comment), (void *)(module), (void *)(send_frag)); \ + mca_btl_base_rdma_completion_fn_t func = \ + (mca_btl_base_rdma_completion_fn_t) \ + (send_frag)->sf_base.uf_base.des_cbfunc; \ + func(&(module)->super, \ + (send_frag)->sf_endpoint, \ + (send_frag)->sf_base.uf_local_seg[0].seg_addr.pval, \ + NULL, \ + (send_frag)->sf_base.uf_base.des_context, \ + (send_frag)->sf_base.uf_base.des_cbdata, \ + OPAL_SUCCESS); \ + ++((module)->stats.pml_send_callbacks); \ + } while (0) +#endif + /* * Reap an ACK send that is complete */ diff --git a/opal/mca/btl/usnic/btl_usnic_compat.c b/opal/mca/btl/usnic/btl_usnic_compat.c index 306ecc86e6..13c80550c0 100644 --- a/opal/mca/btl/usnic/btl_usnic_compat.c +++ b/opal/mca/btl/usnic/btl_usnic_compat.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -9,15 +9,20 @@ #if BTL_IN_OPAL #include "opal_config.h" +#include "opal/mca/btl/btl.h" #else #include "ompi_config.h" +#include "ompi/mca/btl/btl.h" #endif #include "opal/mca/mca.h" #include "opal_stdint.h" #include "btl_usnic_compat.h" +#include "btl_usnic_frag.h" #include "btl_usnic_endpoint.h" +#include "btl_usnic_connectivity.h" +#include "btl_usnic_send.h" /************************************************************************/ @@ -114,4 +119,599 @@ const char *usnic_compat_proc_name_print(opal_process_name_t *pname) return OMPI_NAME_PRINT(pname); } +#endif /* OMPI version */ + +/************************************************************************/ + +/* BTL 2.0 and 3.0 compatibilty functions */ + +/*----------------------------------------------------------------------*/ + +/* The following functions are common between BTL 2.0 and 3.0 */ + +/* Responsible for sending "small" frags (reserve + *size <= max_frag_payload) + * in the same manner as btl_prepare_src. Must return a smaller amount than + * requested if the given convertor cannot process the entire (*size). + */ +static inline opal_btl_usnic_send_frag_t * +prepare_src_small( + struct opal_btl_usnic_module_t* module, + struct mca_btl_base_endpoint_t* endpoint, + struct opal_convertor_t* convertor, + uint8_t order, + size_t reserve, + size_t* size, + uint32_t flags) +{ + opal_btl_usnic_send_frag_t *frag; + opal_btl_usnic_small_send_frag_t *sfrag; + size_t payload_len; + + payload_len = *size + reserve; + assert(payload_len <= module->max_frag_payload); /* precondition */ + + sfrag = opal_btl_usnic_small_send_frag_alloc(module); + if (OPAL_UNLIKELY(NULL == sfrag)) { + return NULL; + } + frag = &sfrag->ssf_base; + + /* In the case of a convertor, we will copy the data in now, since that is + * the cheapest way to discover how much we can actually send (since we know + * we will pack it anyway later). 
The alternative is to do all of the + * following: + * 1) clone_with_position(convertor) and see where the new position ends up + * actually being (see opal_btl_usnic_convertor_pack_peek). Otherwise we + * aren't fulfilling our contract w.r.t. (*size). + * 2) Add a bunch of branches checking for different cases, both here and in + * progress_sends + * 3) If we choose to defer the packing, we must clone the convertor because + * the PML owns it and might reuse it for another prepare_src call. + * + * Two convertor clones is likely to be at least as slow as just copying the + * data and might consume a similar amount of memory. Plus we still have to + * pack it later to send it. + * + * The reason we do not copy non-convertor buffer at this point is because + * we might still use INLINE for the send, and in that case we do not want + * to copy the data at all. + */ + if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) { + /* put user data just after end of 1st seg (upper layer header) */ + assert(payload_len <= module->max_frag_payload); + usnic_convertor_pack_simple( + convertor, + (IOVBASE_TYPE*)(intptr_t)(frag->sf_base.uf_local_seg[0].seg_addr.lval + reserve), + *size, + size); + payload_len = reserve + *size; + frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT = 1; + /* PML will copy header into beginning of segment */ + frag->sf_base.uf_local_seg[0].seg_len = payload_len; + } else { + opal_convertor_get_current_pointer(convertor, + &sfrag->ssf_base.sf_base.uf_local_seg[1].seg_addr.pval); + frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT = 2; + frag->sf_base.uf_local_seg[0].seg_len = reserve; + frag->sf_base.uf_local_seg[1].seg_len = *size; + } + + frag->sf_base.uf_base.des_flags = flags; + frag->sf_endpoint = endpoint; + + return frag; +} + +static void * +pack_chunk_seg_chain_with_reserve( + struct opal_btl_usnic_module_t* module, + opal_btl_usnic_large_send_frag_t *lfrag, + size_t reserve_len, + opal_convertor_t *convertor, + size_t max_convertor_bytes, + 
size_t *convertor_bytes_packed) +{ + opal_btl_usnic_chunk_segment_t *seg; + void *ret_ptr = NULL; + int n_segs; + uint8_t *copyptr; + size_t copylen; + size_t seg_space; + size_t max_data; + bool first_pass; + + assert(NULL != lfrag); + assert(NULL != convertor_bytes_packed); + + n_segs = 0; + *convertor_bytes_packed = 0; + + first_pass = true; + while (*convertor_bytes_packed < max_convertor_bytes || + first_pass) { + seg = opal_btl_usnic_chunk_segment_alloc(module); + if (OPAL_UNLIKELY(NULL == seg)) { + BTL_ERROR(("chunk segment allocation error")); + abort(); /* XXX */ + } + ++n_segs; + + seg_space = module->max_chunk_payload; + copyptr = seg->ss_base.us_payload.raw; + + if (first_pass) { + /* logic could accommodate >max, but currently doesn't */ + assert(reserve_len <= module->max_chunk_payload); + ret_ptr = copyptr; + seg_space -= reserve_len; + copyptr += reserve_len; + } + + /* now pack any convertor data */ + if (*convertor_bytes_packed < max_convertor_bytes && seg_space > 0) { + copylen = max_convertor_bytes - *convertor_bytes_packed; + if (copylen > seg_space) { + copylen = seg_space; + } + usnic_convertor_pack_simple(convertor, copyptr, copylen, &max_data); + seg_space -= max_data; + *convertor_bytes_packed += max_data; + + /* If unable to pack any of the remaining bytes, release the + * most recently allocated segment and finish processing. 
+ */ + if (seg_space == module->max_chunk_payload) { + assert(max_data == 0); /* only way this can happen */ + opal_btl_usnic_chunk_segment_return(module, seg); + break; + } + } + + /* bozo checks */ + assert(seg_space >= 0); + assert(seg_space < module->max_chunk_payload); + + /* append segment of data to chain to send */ + seg->ss_parent_frag = &lfrag->lsf_base; + seg->ss_len = module->max_chunk_payload - seg_space; + opal_list_append(&lfrag->lsf_seg_chain, &seg->ss_base.us_list.super); + +#if MSGDEBUG1 + opal_output(0, "%s: appending seg=%p, frag=%p, payload=%zd\n", + __func__, (void *)seg, (void *)lfrag, + (module->max_chunk_payload - seg_space)); #endif + + first_pass = false; + } + + return ret_ptr; +} + +/* Responsible for handling "large" frags (reserve + *size > max_frag_payload) + * in the same manner as btl_prepare_src. Must return a smaller amount than + * requested if the given convertor cannot process the entire (*size). + */ +static opal_btl_usnic_send_frag_t * +prepare_src_large( + struct opal_btl_usnic_module_t* module, + struct mca_btl_base_endpoint_t* endpoint, + struct opal_convertor_t* convertor, + uint8_t order, + size_t reserve, + size_t* size, + uint32_t flags) +{ + opal_btl_usnic_send_frag_t *frag; + opal_btl_usnic_large_send_frag_t *lfrag; + int rc; + + /* Get holder for the msg */ + lfrag = opal_btl_usnic_large_send_frag_alloc(module); + if (OPAL_UNLIKELY(NULL == lfrag)) { + return NULL; + } + frag = &lfrag->lsf_base; + + /* The header location goes in SG[0], payload in SG[1]. If we are using a + * convertor then SG[1].seg_len is accurate but seg_addr is NULL. 
*/ + frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT = 2; + + /* stash header location, PML will write here */ + frag->sf_base.uf_local_seg[0].seg_addr.pval = &lfrag->lsf_ompi_header; + frag->sf_base.uf_local_seg[0].seg_len = reserve; + /* make sure upper header small enough */ + assert(reserve <= sizeof(lfrag->lsf_ompi_header)); + + if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) { + /* threshold == -1 means always pack eagerly */ + if (mca_btl_usnic_component.pack_lazy_threshold >= 0 && + *size >= (size_t)mca_btl_usnic_component.pack_lazy_threshold) { + MSGDEBUG1_OUT("packing frag %p on the fly", (void *)frag); + lfrag->lsf_pack_on_the_fly = true; + + /* tell the PML we will absorb as much as possible while still + * respecting indivisible element boundaries in the convertor */ + *size = opal_btl_usnic_convertor_pack_peek(convertor, *size); + + /* Clone the convertor b/c we (the BTL) don't own it and the PML + * might mutate it after we return from this function. */ + rc = opal_convertor_clone(convertor, &frag->sf_convertor, + /*copy_stack=*/true); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + BTL_ERROR(("unexpected convertor clone error")); + abort(); /* XXX */ + } + } + else { + /* pack everything in the convertor into a chain of segments now, + * leaving space for the PML header in the first segment */ + lfrag->lsf_base.sf_base.uf_local_seg[0].seg_addr.pval = + pack_chunk_seg_chain_with_reserve(module, lfrag, reserve, + convertor, *size, size); + } + + /* We set SG[1] to {NULL,bytes_packed} so that various calculations + * by both PML and this BTL will be correct. For example, the PML adds + * up the bytes in the descriptor segments to determine if an MPI-level + * request is complete or not. 
*/ + frag->sf_base.uf_local_seg[1].seg_addr.pval = NULL; + frag->sf_base.uf_local_seg[1].seg_len = *size; + } else { + /* convertor not needed, just save the payload pointer in SG[1] */ + lfrag->lsf_pack_on_the_fly = true; + opal_convertor_get_current_pointer(convertor, + &frag->sf_base.uf_local_seg[1].seg_addr.pval); + frag->sf_base.uf_local_seg[1].seg_len = *size; + } + + frag->sf_base.uf_base.des_flags = flags; + frag->sf_endpoint = endpoint; + + return frag; +} + +/*----------------------------------------------------------------------*/ + +#if BTL_VERSION == 20 + +/* + * BTL 2.0 version of module.btl_prepare_src. + * + * Note the "user" data the PML wishes to communicate and return a descriptor + * that can be used for send or put. We create a frag (which is also a + * descriptor by virtue of its base class) and populate it with enough + * source information to complete a future send/put. + * + * We will create either a small send frag if < than an MTU, otherwise a large + * send frag. The convertor will be saved for deferred packing if the user + * buffer is noncontiguous. Otherwise it will be saved in one of the + * descriptor's SGEs. + * + * NOTE that the *only* reason this routine is allowed to return a size smaller + * than was requested is if the convertor cannot process the entire amount. + */ +mca_btl_base_descriptor_t* +opal_btl_usnic_prepare_src( + struct mca_btl_base_module_t* base_module, + struct mca_btl_base_endpoint_t* endpoint, + struct mca_mpool_base_registration_t* registration, + struct opal_convertor_t* convertor, + uint8_t order, + size_t reserve, + size_t* size, + uint32_t flags) +{ + opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) base_module; + opal_btl_usnic_send_frag_t *frag; + uint32_t payload_len; +#if MSGDEBUG2 + size_t osize = *size; +#endif + + /* Do we need to check the connectivity? If enabled, we'll check + the connectivity at either first send to peer X or first ACK to + peer X. 
*/ + opal_btl_usnic_check_connectivity(module, endpoint); + + /* + * if total payload len fits in one MTU use small send, else large + */ + payload_len = *size + reserve; + if (payload_len <= module->max_frag_payload) { + frag = prepare_src_small(module, endpoint, convertor, + order, reserve, size, flags); + } else { + frag = prepare_src_large(module, endpoint, convertor, + order, reserve, size, flags); + } + +#if MSGDEBUG2 + opal_output(0, "prep_src: %s %s frag %p, size=%d+%u (was %u), conv=%p\n", + module->fabric_info->fabric_attr->name, + (reserve + *size) <= module->max_frag_payload?"small":"large", + (void *)frag, (int)reserve, (unsigned)*size, (unsigned)osize, + (void *)convertor); +#if MSGDEBUG1 + { + unsigned i; + mca_btl_base_descriptor_t *desc = &frag->sf_base.uf_base; + for (i=0; iUSNIC_SEND_LOCAL_COUNT; ++i) { + opal_output(0, " %d: ptr:%p len:%d\n", i, + (void *)desc->USNIC_SEND_LOCAL[i].seg_addr.pval, + desc->USNIC_SEND_LOCAL[i].seg_len); + } + } +#endif +#endif + + return &frag->sf_base.uf_base; +} + +/* + * BTL 2.0 prepare_dst function (this function does not exist in BTL + * 3.0). 
+ */ +mca_btl_base_descriptor_t* +opal_btl_usnic_prepare_dst( + struct mca_btl_base_module_t* base_module, + struct mca_btl_base_endpoint_t* endpoint, + struct mca_mpool_base_registration_t* registration, + struct opal_convertor_t* convertor, + uint8_t order, + size_t reserve, + size_t* size, + uint32_t flags) +{ + opal_btl_usnic_put_dest_frag_t *pfrag; + opal_btl_usnic_module_t *module; + void *data_ptr; + + module = (opal_btl_usnic_module_t *)base_module; + + /* allocate a fragment for this */ + pfrag = (opal_btl_usnic_put_dest_frag_t *) + opal_btl_usnic_put_dest_frag_alloc(module); + if (NULL == pfrag) { + return NULL; + } + + /* find start of the data */ + opal_convertor_get_current_pointer(convertor, (void **) &data_ptr); + + /* make a seg entry pointing at data_ptr */ + pfrag->uf_remote_seg[0].seg_addr.pval = data_ptr; + pfrag->uf_remote_seg[0].seg_len = *size; + + pfrag->uf_base.order = order; + pfrag->uf_base.des_flags = flags; + +#if MSGDEBUG2 + opal_output(0, "prep_dst size=%d, addr=%p, pfrag=%p\n", (int)*size, + data_ptr, (void *)pfrag); +#endif + + return &pfrag->uf_base; +} + + +/* + * BTL 2.0 version of module.btl_put. + * + * Emulate an RDMA put. We'll send the remote address + * across to the other side so it will know where to put the data + */ +int opal_btl_usnic_put( + struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + struct mca_btl_base_descriptor_t *desc) +{ + int rc; + opal_btl_usnic_send_frag_t *frag; + + frag = (opal_btl_usnic_send_frag_t *)desc; + + opal_btl_usnic_compute_sf_size(frag); + frag->sf_ack_bytes_left = frag->sf_size; + +#if MSGDEBUG2 + opal_output(0, "usnic_put, frag=%p, size=%d\n", (void *)frag, + (int)frag->sf_size); +#if MSGDEBUG1 + { unsigned i; + for (i=0; iUSNIC_PUT_LOCAL_COUNT; ++i) { + opal_output(0, " %d: ptr:%p len:%d%s\n", i, + desc->USNIC_PUT_LOCAL[i].seg_addr.pval, + desc->USNIC_PUT_LOCAL[i].seg_len, + (i==0)?" 
(put local)":""); + } + for (i=0; iUSNIC_PUT_REMOTE_COUNT; ++i) { + opal_output(0, " %d: ptr:%p len:%d%s\n", i, + desc->USNIC_PUT_REMOTE[i].seg_addr.pval, + desc->USNIC_PUT_REMOTE[i].seg_len, + (i==0)?" (put remote)":""); + } + } +#endif +#endif + + /* RFXX copy out address - why does he not use our provided holder? */ + /* JMS What does this mean? ^^ */ + frag->sf_base.uf_remote_seg[0].seg_addr.pval = + desc->USNIC_PUT_REMOTE->seg_addr.pval; + + rc = opal_btl_usnic_finish_put_or_send((opal_btl_usnic_module_t *)btl, + (opal_btl_usnic_endpoint_t *)endpoint, + frag, + /*tag=*/MCA_BTL_NO_ORDER); + + return rc; +} + +/*----------------------------------------------------------------------*/ + +#elif BTL_VERSION == 30 + +/* + * BTL 3.0 prepare_src function. + * + * This function is only used for sending PML fragments (not putting + * or getting fragments). + * + * Note the "user" data the PML wishes to communicate and return a + * descriptor. We create a frag (which is also a descriptor by virtue + * of its base class) and populate it with enough source information + * to complete a future send. + * + * Recall that the usnic BTL's max_send_size is almost certainly + * larger than the MTU (by default, max_send_size is either 25K or + * 150K). Therefore, the PML may give us a fragment up to + * max_send_size in this function. Hence, we make the decision here + * as to whether it's a "small" fragment (i.e., size <= MTU, meaning + * that it fits in a single datagram) or a "large" fragment (i.e., + * size > MTU, meaning that it must be chunked into multiple + * datagrams). + * + * The convertor will be saved for deferred packing if the user buffer + * is noncontiguous. Otherwise, it will be saved in one of the + * descriptor's SGEs. + * + * NOTE that the *only* reason this routine is allowed to return a size smaller + * than was requested is if the convertor cannot process the entire amount. 
+ */ +struct mca_btl_base_descriptor_t * +opal_btl_usnic_prepare_src(struct mca_btl_base_module_t *base_module, + struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, + uint8_t order, + size_t reserve, + size_t *size, + uint32_t flags) +{ + opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) base_module; + opal_btl_usnic_send_frag_t *frag; + size_t payload_len; +#if MSGDEBUG2 + size_t osize = *size; +#endif + + /* Do we need to check the connectivity? If enabled, we'll check + the connectivity at either first send to peer X or first ACK to + peer X. */ + opal_btl_usnic_check_connectivity(module, endpoint); + + /* + * if total payload len fits in one MTU use small send, else large + */ + payload_len = *size + reserve; + if (payload_len <= module->max_frag_payload) { + frag = prepare_src_small(module, endpoint, convertor, + order, reserve, size, flags); + } else { + frag = prepare_src_large(module, endpoint, convertor, + order, reserve, size, flags); + } + +#if MSGDEBUG2 + opal_output(0, "prep_src: %s %s frag %p, size=%d+%u (was %u), conv=%p\n", + module->fabric_info->fabric_attr->name, + (reserve + *size) <= module->max_frag_payload?"small":"large", + (void *)frag, (int)reserve, (unsigned)*size, (unsigned)osize, + (void *)convertor); +#if MSGDEBUG1 + { + unsigned i; + mca_btl_base_descriptor_t *desc = &frag->sf_base.uf_base; + for (i=0; i<desc->USNIC_SEND_LOCAL_COUNT; ++i) { + opal_output(0, " %d: ptr:%p len:%d\n", i, + (void *)desc->USNIC_SEND_LOCAL[i].seg_addr.pval, + desc->USNIC_SEND_LOCAL[i].seg_len); + } + } +#endif +#endif + + return &frag->sf_base.uf_base; +} + +/* + * BTL 3.0 version of module.btl_put. + * + * Emulate an RDMA put. We'll send the remote address across to the + * other side so it will know where to put the data. + * + * Note that this function is only ever called with contiguous + * buffers, so a convertor is not necessary. 
+ */ +int +opal_btl_usnic_put(struct mca_btl_base_module_t *base_module, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, + size_t size, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata) +{ + opal_btl_usnic_send_frag_t *sfrag; + opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) base_module; + + /* At least for the moment, continue to make a descriptor, like we + used to in BTL 2.0 */ + if (size <= module->max_frag_payload) { + /* Small send fragment -- the whole thing fits in one MTU + (i.e., a single chunk) */ + opal_btl_usnic_small_send_frag_t *ssfrag; + ssfrag = opal_btl_usnic_small_send_frag_alloc(module); + if (OPAL_UNLIKELY(NULL == ssfrag)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + sfrag = &ssfrag->ssf_base; + } else { + /* Large send fragment -- need more than one MTU (i.e., + multiple chunks) */ + opal_btl_usnic_large_send_frag_t *lsfrag; + lsfrag = opal_btl_usnic_large_send_frag_alloc(module); + if (OPAL_UNLIKELY(NULL == lsfrag)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + lsfrag->lsf_pack_on_the_fly = true; + + sfrag = &lsfrag->lsf_base; + } + + sfrag->sf_endpoint = endpoint; + sfrag->sf_size = size; + sfrag->sf_ack_bytes_left = size; + + opal_btl_usnic_frag_t *frag; + frag = &sfrag->sf_base; + frag->uf_local_seg[0].seg_len = size; + frag->uf_local_seg[0].seg_addr.pval = local_address; + frag->uf_remote_seg[0].seg_len = size; + frag->uf_remote_seg[0].seg_addr.pval = + (void *)(uintptr_t) remote_address; + + mca_btl_base_descriptor_t *desc; + desc = &frag->uf_base; + desc->des_segment_count = 1; + desc->des_segments = &frag->uf_local_seg[0]; + /* This is really the wrong cbfunc type, but we'll cast it to + the Right type before we use it. So it'll be ok. 
*/ + desc->des_cbfunc = (mca_btl_base_completion_fn_t) cbfunc; + desc->des_cbdata = cbdata; + desc->des_context = cbcontext; + desc->des_flags = flags; + desc->order = order; + + int rc; + rc = opal_btl_usnic_finish_put_or_send(module, + (opal_btl_usnic_endpoint_t *)endpoint, + sfrag, + /*tag=*/MCA_BTL_NO_ORDER); + return rc; +} + +#endif /* BTL_VERSION */ diff --git a/opal/mca/btl/usnic/btl_usnic_compat.h b/opal/mca/btl/usnic/btl_usnic_compat.h index f294041a19..ebc4251ea7 100644 --- a/opal/mca/btl/usnic/btl_usnic_compat.h +++ b/opal/mca/btl/usnic/btl_usnic_compat.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -40,20 +40,22 @@ #endif # define USNIC_BTL_DEFAULT_VERSION(name) MCA_BTL_DEFAULT_VERSION(name) -# define USNIC_SEND_LOCAL des_local -# define USNIC_SEND_LOCAL_COUNT des_local_count -# define USNIC_SEND_REMOTE des_remote -# define USNIC_SEND_REMOTE_COUNT des_remote_count +# define USNIC_SEND_LOCAL des_segments +# define USNIC_SEND_LOCAL_COUNT des_segment_count +# define USNIC_SEND_REMOTE des_segments +# define USNIC_SEND_REMOTE_COUNT des_segment_count -# define USNIC_RECV_LOCAL des_local -# define USNIC_RECV_LOCAL_COUNT des_local_count -# define USNIC_RECV_REMOTE des_remote -# define USNIC_RECV_REMOTE_COUNT des_remote_count +# define USNIC_RECV_LOCAL des_segments +# define USNIC_RECV_LOCAL_COUNT des_segment_count +# define USNIC_RECV_REMOTE des_segments +# define USNIC_RECV_REMOTE_COUNT des_segment_count -# define USNIC_PUT_LOCAL des_local -# define USNIC_PUT_LOCAL_COUNT des_local_count -# define USNIC_PUT_REMOTE des_remote -# define USNIC_PUT_REMOTE_COUNT des_remote_count +# define USNIC_PUT_LOCAL des_segments +# define USNIC_PUT_LOCAL_COUNT des_segment_count +# define USNIC_PUT_REMOTE des_segments +# define USNIC_PUT_REMOTE_COUNT des_segment_count + +# define BTL_VERSION 30 /* * Performance 
critical; needs to be inline @@ -134,6 +136,8 @@ usnic_compat_proc_name_compare(opal_process_name_t a, # define USNIC_PUT_REMOTE des_dst # define USNIC_PUT_REMOTE_COUNT des_dst_cnt +# define BTL_VERSION 20 + # define USNIC_COMPAT_BASE_VERSION \ MCA_BTL_BASE_VERSION_2_0_0, \ .mca_type_name = "btl", \ @@ -207,4 +211,82 @@ void usnic_compat_modex_recv(int *rc, uint64_t usnic_compat_rte_hash_name(opal_process_name_t *pname); const char *usnic_compat_proc_name_print(opal_process_name_t *pname); +/************************************************************************/ + +/* BTL 2.0 vs 3.0 compatibility functions (specifically: some BTL API + functions changed signatures between 2.0 and 3.0) */ + +struct mca_btl_base_module_t; +struct mca_btl_base_endpoint_t; + +/* BTL 2.0 (i.e., v1.7/v1.8, but listed separately because these are + really BTL API issues) */ + +#if BTL_VERSION == 20 + +#include "ompi/mca/btl/btl.h" + +/* This function changed signature in BTL 3.0 */ +mca_btl_base_descriptor_t* +opal_btl_usnic_prepare_src( + struct mca_btl_base_module_t* base_module, + struct mca_btl_base_endpoint_t* endpoint, + struct mca_mpool_base_registration_t* registration, + struct opal_convertor_t* convertor, + uint8_t order, + size_t reserve, + size_t* size, + uint32_t flags); + +/* This function no longer exists in BTL 3.0 */ +mca_btl_base_descriptor_t* +opal_btl_usnic_prepare_dst( + struct mca_btl_base_module_t* base_module, + struct mca_btl_base_endpoint_t* endpoint, + struct mca_mpool_base_registration_t* registration, + struct opal_convertor_t* convertor, + uint8_t order, + size_t reserve, + size_t* size, + uint32_t flags); + +/* This function changed signature in BTL 3.0 */ +int +opal_btl_usnic_put( + struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + struct mca_btl_base_descriptor_t *desc); + +/************************************************************************/ + +/* BTL 3.0 (i.e., >=v1.9, but listed separately because these are + really 
BTL API issues) */ + +#elif BTL_VERSION == 30 + +#include "opal/mca/btl/btl.h" + +/* This function changed signature compared to BTL 2.0 */ +struct mca_btl_base_descriptor_t * +opal_btl_usnic_prepare_src(struct mca_btl_base_module_t *base_module, + struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, + uint8_t order, + size_t reserve, + size_t *size, + uint32_t flags); + +/* This function changed signature compared to BTL 2.0 */ +int +opal_btl_usnic_put(struct mca_btl_base_module_t *base_module, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, + size_t size, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata); + +#endif /* BTL_VERSION */ + #endif /* BTL_USNIC_COMPAT_H */ diff --git a/opal/mca/btl/usnic/btl_usnic_frag.c b/opal/mca/btl/usnic/btl_usnic_frag.c index 10d5fecb9e..257adc8ec0 100644 --- a/opal/mca/btl/usnic/btl_usnic_frag.c +++ b/opal/mca/btl/usnic/btl_usnic_frag.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006 Sandia National Laboratories. All rights * reserved. - * Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -122,11 +122,18 @@ recv_seg_constructor( mca_btl_usnic_component.transport_header_len); /* initialize descriptor */ - seg->rs_desc.USNIC_RECV_LOCAL = &seg->rs_segment; - seg->rs_desc.USNIC_RECV_LOCAL_COUNT = 1; + /* JMS Initializing RECV_REMOTE for receive frags is unnecessary + with BTL 3.0. The only reason to keep this here would be for + compatibility with the BTL 2.0 usnic-v1.8 git branch (i.e., + it's harmless to do this assignment first, before the + RECV_LOCAL assignments -- the compiler will likely compile out + this dead code, anyway). 
*/ seg->rs_desc.USNIC_RECV_REMOTE = NULL; seg->rs_desc.USNIC_RECV_REMOTE_COUNT = 0; + seg->rs_desc.USNIC_RECV_LOCAL = &seg->rs_segment; + seg->rs_desc.USNIC_RECV_LOCAL_COUNT = 1; + /* * This pointer is only correct for incoming segments of type * OPAL_BTL_USNIC_PAYLOAD_TYPE_FRAG, but that's the only time @@ -144,12 +151,20 @@ send_frag_constructor(opal_btl_usnic_send_frag_t *frag) /* Fill in source descriptor */ desc = &frag->sf_base.uf_base; + + /* JMS Initializing SEND_REMOTE for send frags is unnecessary + with BTL 3.0. The only reason to keep this here would be for + compatibility with the BTL 2.0 usnic-v1.8 git branch (i.e., + it's harmless to do this assignment first, before the + SEND_LOCAL assignments -- the compiler will likely compile out + this dead code, anyway). */ + desc->USNIC_SEND_REMOTE = frag->sf_base.uf_remote_seg; + desc->USNIC_SEND_REMOTE_COUNT = 0; + desc->USNIC_SEND_LOCAL = frag->sf_base.uf_local_seg; frag->sf_base.uf_local_seg[0].seg_len = 0; frag->sf_base.uf_local_seg[1].seg_len = 0; desc->USNIC_SEND_LOCAL_COUNT = 2; - desc->USNIC_SEND_REMOTE = frag->sf_base.uf_remote_seg; - desc->USNIC_SEND_REMOTE_COUNT = 0; desc->order = MCA_BTL_NO_ORDER; desc->des_flags = 0; diff --git a/opal/mca/btl/usnic/btl_usnic_frag.h b/opal/mca/btl/usnic/btl_usnic_frag.h index 100edbdd06..69d79ae3cb 100644 --- a/opal/mca/btl/usnic/btl_usnic_frag.h +++ b/opal/mca/btl/usnic/btl_usnic_frag.h @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006 Sandia National Laboratories. All rights * reserved. - * Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -85,9 +85,28 @@ usnic_seg_type_str(opal_btl_usnic_seg_type_t t) } +/* + * usnic registration handle (passed over the network to peers as a + * cookie). 
+ * + * Currently, this struct is meaningless (but it must be defined / + * exist) because we are emulating RDMA and do not have + * btl_register_mem and btl_deregister_mem functions (and we set + * module.btl_registration_handle_size to 0, not sizeof(struct + * mca_btl_base_registration_handle_t)). + */ +struct mca_btl_base_registration_handle_t { + /* Maybe we'll need fields like this */ + uint32_t lkey; + uint32_t rkey; +}; + +/* + * usnic local registration + */ typedef struct opal_btl_usnic_reg_t { mca_mpool_base_registration_t base; - struct fid_mr *mr; + struct fid_mr *ur_mr; } opal_btl_usnic_reg_t; @@ -145,7 +164,7 @@ typedef struct { /** * Descriptor for a common segment. This is exactly one packet and may - * be send or receive + * be sent or received. */ typedef struct opal_btl_usnic_segment_t { ompi_free_list_item_t us_list; @@ -221,7 +240,7 @@ typedef struct opal_btl_usnic_frag_t { /* fragment descriptor type */ opal_btl_usnic_frag_type_t uf_type; - /* utility segments */ + /* utility segments (just seg_addr/seg_len) */ mca_btl_base_segment_t uf_local_seg[2]; mca_btl_base_segment_t uf_remote_seg[1]; @@ -568,6 +587,31 @@ opal_btl_usnic_ack_segment_return( OMPI_FREE_LIST_RETURN_MT(&(module->ack_segs), &(ack->ss_base.us_list)); } +/* Compute and set the proper value for sfrag->sf_size. This must not be used + * during usnic_alloc, since the PML might change the segment size after + * usnic_alloc returns. */ +static inline void +opal_btl_usnic_compute_sf_size(opal_btl_usnic_send_frag_t *sfrag) +{ + opal_btl_usnic_frag_t *frag; + + frag = &sfrag->sf_base; + + /* JMS This can be a put or a send, and the buffers are different... 
*/ +#if 0 + assert(frag->uf_base.USNIC_SEND_LOCAL_COUNT > 0); + assert(frag->uf_base.USNIC_SEND_LOCAL_COUNT <= 2); + + /* belt and suspenders: second len should be zero if only one SGE */ + assert(2 == frag->uf_base.USNIC_SEND_LOCAL_COUNT || + 0 == frag->uf_local_seg[1].seg_len); +#endif + + sfrag->sf_size = 0; + sfrag->sf_size += frag->uf_local_seg[0].seg_len; + sfrag->sf_size += frag->uf_local_seg[1].seg_len; +} + END_C_DECLS #endif diff --git a/opal/mca/btl/usnic/btl_usnic_module.c b/opal/mca/btl/usnic/btl_usnic_module.c index 8a94dd1887..d3520bc2eb 100644 --- a/opal/mca/btl/usnic/btl_usnic_module.c +++ b/opal/mca/btl/usnic/btl_usnic_module.c @@ -68,30 +68,6 @@ static void finalize_one_channel(opal_btl_usnic_module_t *module, struct opal_btl_usnic_channel_t *channel); -/* Compute and set the proper value for sfrag->sf_size. This must not be used - * during usnic_alloc, since the PML might change the segment size after - * usnic_alloc returns. */ -static inline void compute_sf_size(opal_btl_usnic_send_frag_t *sfrag) -{ - opal_btl_usnic_frag_t *frag; - - frag = &sfrag->sf_base; - - /* JMS This can be a put or a send, and the buffers are different... */ -#if 0 - assert(frag->uf_base.USNIC_SEND_LOCAL_COUNT > 0); - assert(frag->uf_base.USNIC_SEND_LOCAL_COUNT <= 2); - - /* belt and suspenders: second len should be zero if only one SGE */ - assert(2 == frag->uf_base.USNIC_SEND_LOCAL_COUNT || - 0 == frag->uf_local_seg[1].seg_len); -#endif - - sfrag->sf_size = 0; - sfrag->sf_size += frag->uf_local_seg[0].seg_len; - sfrag->sf_size += frag->uf_local_seg[1].seg_len; -} - /* * Loop over all procs sent to us in add_procs and see if we want to * add a proc/endpoint for them. @@ -644,98 +620,6 @@ static int usnic_free(struct mca_btl_base_module_t* btl, return OPAL_SUCCESS; } -/* - * Notes from george: - * - * - BTL ALLOC: allocating control messages or eager frags if BTL - does not have INPLACE flag. To be clear: max it will ever alloc - is eager_limit. 
THEREFORE: eager_limit is the max that ALLOC - must always be able to alloc. - --> Contraction in the btl.h documentation. - * - * - BTL PREPARE SRC: max_send_size frags go through here. Can return - a smaller size than was asked for. - * - * - BTL PREPARE DEST: not used if you don't have PUT/GET - * - * - BTL SEND: will be used after ALLOC / PREPARE - */ - -/* Responsible for handling "small" frags (reserve + *size <= max_frag_payload) - * in the same manner as btl_prepare_src. Must return a smaller amount than - * requested if the given convertor cannot process the entire (*size). - */ -static inline -opal_btl_usnic_send_frag_t * -prepare_src_small( - struct opal_btl_usnic_module_t* module, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags) -{ - opal_btl_usnic_send_frag_t *frag; - opal_btl_usnic_small_send_frag_t *sfrag; - size_t payload_len; - - payload_len = *size + reserve; - assert(payload_len <= module->max_frag_payload); /* precondition */ - - sfrag = opal_btl_usnic_small_send_frag_alloc(module); - if (OPAL_UNLIKELY(NULL == sfrag)) { - return NULL; - } - frag = &sfrag->ssf_base; - - /* In the case of a convertor, we will copy the data in now, since that is - * the cheapest way to discover how much we can actually send (since we know - * we will pack it anyway later). The alternative is to do all of the - * following: - * 1) clone_with_position(convertor) and see where the new position ends up - * actually being (see opal_btl_usnic_convertor_pack_peek). Otherwise we - * aren't fulfilling our contract w.r.t. (*size). - * 2) Add a bunch of branches checking for different cases, both here and in - * progress_sends - * 3) If we choose to defer the packing, we must clone the convertor because - * the PML owns it and might reuse it for another prepare_src call. 
- * - * Two convertor clones is likely to be at least as slow as just copying the - * data and might consume a similar amount of memory. Plus we still have to - * pack it later to send it. - * - * The reason we do not copy non-convertor buffer at this point is because - * we might still use INLINE for the send, and in that case we do not want - * to copy the data at all. - */ - if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) { - /* put user data just after end of 1st seg (upper layer header) */ - assert(payload_len <= module->max_frag_payload); - usnic_convertor_pack_simple( - convertor, - (IOVBASE_TYPE*)(intptr_t)(frag->sf_base.uf_local_seg[0].seg_addr.lval + reserve), - *size, - size); - payload_len = reserve + *size; - frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT = 1; - /* PML will copy header into beginning of segment */ - frag->sf_base.uf_local_seg[0].seg_len = payload_len; - } else { - opal_convertor_get_current_pointer(convertor, - &sfrag->ssf_base.sf_base.uf_local_seg[1].seg_addr.pval); - frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT = 2; - frag->sf_base.uf_local_seg[0].seg_len = reserve; - frag->sf_base.uf_local_seg[1].seg_len = *size; - } - - frag->sf_base.uf_base.des_flags = flags; - frag->sf_endpoint = endpoint; - - return frag; -} - /* Packs data from the given large send frag into single new segment and * returns a pointer to it. 
The packed data comes first from SG[0] (PML * header) and then second from either SG[1] (if seg_addr is non-NULL) or from @@ -827,345 +711,6 @@ pack_chunk_seg_from_frag( return seg; } -static -void * -pack_chunk_seg_chain_with_reserve( - struct opal_btl_usnic_module_t* module, - opal_btl_usnic_large_send_frag_t *lfrag, - size_t reserve_len, - opal_convertor_t *convertor, - size_t max_convertor_bytes, - size_t *convertor_bytes_packed) -{ - opal_btl_usnic_chunk_segment_t *seg; - void *ret_ptr = NULL; - int n_segs; - uint8_t *copyptr; - size_t copylen; - size_t seg_space; - size_t max_data; - bool first_pass; - - assert(NULL != lfrag); - assert(NULL != convertor_bytes_packed); - - n_segs = 0; - *convertor_bytes_packed = 0; - - first_pass = true; - while (*convertor_bytes_packed < max_convertor_bytes || - first_pass) { - seg = opal_btl_usnic_chunk_segment_alloc(module); - if (OPAL_UNLIKELY(NULL == seg)) { - BTL_ERROR(("chunk segment allocation error")); - abort(); /* XXX */ - } - ++n_segs; - - seg_space = module->max_chunk_payload; - copyptr = seg->ss_base.us_payload.raw; - - if (first_pass && reserve_len > 0) { - /* logic could accommodate >max, but currently doesn't */ - assert(reserve_len <= module->max_chunk_payload); - ret_ptr = copyptr; - seg_space -= reserve_len; - copyptr += reserve_len; - } - - /* now pack any convertor data */ - if (*convertor_bytes_packed < max_convertor_bytes && seg_space > 0) { - copylen = max_convertor_bytes - *convertor_bytes_packed; - if (copylen > seg_space) { - copylen = seg_space; - } - usnic_convertor_pack_simple(convertor, copyptr, copylen, &max_data); - seg_space -= max_data; - *convertor_bytes_packed += max_data; - - /* If unable to pack any of the remaining bytes, release the - * most recently allocated segment and finish processing. 
- */ - if (seg_space == module->max_chunk_payload) { - assert(max_data == 0); /* only way this can happen */ - opal_btl_usnic_chunk_segment_return(module, seg); - break; - } - } - - /* bozo checks */ - assert(seg_space >= 0); - assert(seg_space < module->max_chunk_payload); - - /* append segment of data to chain to send */ - seg->ss_parent_frag = &lfrag->lsf_base; - seg->ss_len = module->max_chunk_payload - seg_space; - opal_list_append(&lfrag->lsf_seg_chain, &seg->ss_base.us_list.super); - -#if MSGDEBUG1 - opal_output(0, "%s: appending seg=%p, frag=%p, payload=%zd\n", - __func__, (void *)seg, (void *)lfrag, - (module->max_chunk_payload - seg_space)); -#endif - - first_pass = false; - } - - return ret_ptr; -} - -/* Responsible for handling "large" frags (reserve + *size > max_frag_payload) - * in the same manner as btl_prepare_src. Must return a smaller amount than - * requested if the given convertor cannot process the entire (*size). - */ -static -opal_btl_usnic_send_frag_t * -prepare_src_large( - struct opal_btl_usnic_module_t* module, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags) -{ - opal_btl_usnic_send_frag_t *frag; - opal_btl_usnic_large_send_frag_t *lfrag; - int rc; - - /* Get holder for the msg */ - lfrag = opal_btl_usnic_large_send_frag_alloc(module); - if (OPAL_UNLIKELY(NULL == lfrag)) { - return NULL; - } - frag = &lfrag->lsf_base; - - /* The header location goes in SG[0], payload in SG[1]. If we are using a - * convertor then SG[1].seg_len is accurate but seg_addr is NULL. 
*/ - frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT = 2; - - /* stash header location, PML will write here */ - frag->sf_base.uf_local_seg[0].seg_addr.pval = &lfrag->lsf_ompi_header; - frag->sf_base.uf_local_seg[0].seg_len = reserve; - /* make sure upper header small enough */ - assert(reserve <= sizeof(lfrag->lsf_ompi_header)); - - if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) { - /* threshold == -1 means always pack eagerly */ - if (mca_btl_usnic_component.pack_lazy_threshold >= 0 && - *size >= (size_t)mca_btl_usnic_component.pack_lazy_threshold) { - MSGDEBUG1_OUT("packing frag %p on the fly", (void *)frag); - lfrag->lsf_pack_on_the_fly = true; - - /* tell the PML we will absorb as much as possible while still - * respecting indivisible element boundaries in the convertor */ - *size = opal_btl_usnic_convertor_pack_peek(convertor, *size); - - /* Clone the convertor b/c we (the BTL) don't own it and the PML - * might mutate it after we return from this function. */ - rc = opal_convertor_clone(convertor, &frag->sf_convertor, - /*copy_stack=*/true); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - BTL_ERROR(("unexpected convertor clone error")); - abort(); /* XXX */ - } - } - else { - /* pack everything in the convertor into a chain of segments now, - * leaving space for the PML header in the first segment */ - lfrag->lsf_base.sf_base.uf_local_seg[0].seg_addr.pval = - pack_chunk_seg_chain_with_reserve(module, lfrag, reserve, - convertor, *size, size); - } - - /* We set SG[1] to {NULL,bytes_packed} so that various calculations - * by both PML and this BTL will be correct. For example, the PML adds - * up the bytes in the descriptor segments to determine if an MPI-level - * request is complete or not. 
*/ - frag->sf_base.uf_local_seg[1].seg_addr.pval = NULL; - frag->sf_base.uf_local_seg[1].seg_len = *size; - } else { - /* convertor not needed, just save the payload pointer in SG[1] */ - lfrag->lsf_pack_on_the_fly = true; - opal_convertor_get_current_pointer(convertor, - &frag->sf_base.uf_local_seg[1].seg_addr.pval); - frag->sf_base.uf_local_seg[1].seg_len = *size; - } - - frag->sf_base.uf_base.des_flags = flags; - frag->sf_endpoint = endpoint; - - return frag; -} - - -/** - * Note the "user" data the PML wishes to communicate and return a descriptor - * that can be used for send or put. We create a frag (which is also a - * descriptor by virtue of its base class) and populate it with enough - * source information to complete a future send/put. - * - * We will create either a small send frag if < than an MTU, otherwise a large - * send frag. The convertor will be saved for deferred packing if the user - * buffer is noncontiguous. Otherwise it will be saved in one of the - * descriptor's SGEs. - * - * NOTE that the *only* reason this routine is allowed to return a size smaller - * than was requested is if the convertor cannot process the entire amount. - */ -static mca_btl_base_descriptor_t* -usnic_prepare_src( - struct mca_btl_base_module_t* base_module, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags) -{ - opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) base_module; - opal_btl_usnic_send_frag_t *frag; - uint32_t payload_len; -#if MSGDEBUG2 - size_t osize = *size; -#endif - - /* Do we need to check the connectivity? If enabled, we'll check - the connectivity at either first send to peer X or first ACK to - peer X. 
*/ - opal_btl_usnic_check_connectivity(module, endpoint); - - /* - * if total payload len fits in one MTU use small send, else large - */ - payload_len = *size + reserve; - if (payload_len <= module->max_frag_payload) { - frag = prepare_src_small(module, endpoint, registration, convertor, - order, reserve, size, flags); - } else { - frag = prepare_src_large(module, endpoint, registration, convertor, - order, reserve, size, flags); - } - -#if MSGDEBUG2 - opal_output(0, "prep_src: %s %s frag %p, size=%d+%u (was %u), conv=%p\n", - module->fabric_info->fabric_attr->name, - (reserve + *size) <= module->max_frag_payload?"small":"large", - (void *)frag, (int)reserve, (unsigned)*size, (unsigned)osize, - (void *)convertor); -#if MSGDEBUG1 - { - unsigned i; - mca_btl_base_descriptor_t *desc = &frag->sf_base.uf_base; - for (i=0; i<desc->USNIC_SEND_LOCAL_COUNT; ++i) { - opal_output(0, " %d: ptr:%p len:%d\n", i, - (void *)desc->USNIC_SEND_LOCAL[i].seg_addr.pval, - desc->USNIC_SEND_LOCAL[i].seg_len); - } - } -#endif -#endif - - return &frag->sf_base.uf_base; -} - -static mca_btl_base_descriptor_t* -usnic_prepare_dst( - struct mca_btl_base_module_t* base_module, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags) -{ - opal_btl_usnic_put_dest_frag_t *pfrag; - opal_btl_usnic_module_t *module; - void *data_ptr; - - module = (opal_btl_usnic_module_t *)base_module; - - /* allocate a fragment for this */ - pfrag = (opal_btl_usnic_put_dest_frag_t *) - opal_btl_usnic_put_dest_frag_alloc(module); - if (NULL == pfrag) { - return NULL; - } - - /* find start of the data */ - opal_convertor_get_current_pointer(convertor, (void **) &data_ptr); - - /* make a seg entry pointing at data_ptr */ - pfrag->uf_remote_seg[0].seg_addr.pval = data_ptr; - pfrag->uf_remote_seg[0].seg_len = *size; - - pfrag->uf_base.order = order; - pfrag->uf_base.des_flags = 
flags; - -#if MSGDEBUG2 - opal_output(0, "prep_dst size=%d, addr=%p, pfrag=%p\n", (int)*size, - data_ptr, (void *)pfrag); -#endif - - return &pfrag->uf_base; -} - - -/* - * Emulate an RDMA put. We'll send the remote address - * across to the other side so it will know where to put the data - */ -static int -usnic_put( - struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *desc) -{ - int rc; - opal_btl_usnic_send_frag_t *frag; - - frag = (opal_btl_usnic_send_frag_t *)desc; - - compute_sf_size(frag); - frag->sf_ack_bytes_left = frag->sf_size; - -#if MSGDEBUG2 - opal_output(0, "usnic_put, frag=%p, size=%d\n", (void *)frag, - (int)frag->sf_size); -#if MSGDEBUG1 - { unsigned i; - for (i=0; i<desc->USNIC_PUT_LOCAL_COUNT; ++i) { - opal_output(0, " %d: ptr:%p len:%d%s\n", i, - desc->USNIC_PUT_LOCAL[i].seg_addr.pval, - desc->USNIC_PUT_LOCAL[i].seg_len, - (i==0)?" (put local)":""); - } - for (i=0; i<desc->USNIC_PUT_REMOTE_COUNT; ++i) { - opal_output(0, " %d: ptr:%p len:%d%s\n", i, - desc->USNIC_PUT_REMOTE[i].seg_addr.pval, - desc->USNIC_PUT_REMOTE[i].seg_len, - (i==0)?" (put remote)":""); - } - } -#endif -#endif - - /* RFXX copy out address - why does he not use our provided holder? */ - /* JMS What does this mean? 
^^ */ - frag->sf_base.uf_remote_seg[0].seg_addr.pval = - desc->USNIC_PUT_REMOTE->seg_addr.pval; - - rc = opal_btl_usnic_finish_put_or_send((opal_btl_usnic_module_t *)btl, - (opal_btl_usnic_endpoint_t *)endpoint, - frag, - /*tag=*/MCA_BTL_NO_ORDER); - - return rc; -} - static int usnic_finalize(struct mca_btl_base_module_t* btl) { opal_btl_usnic_module_t* module = (opal_btl_usnic_module_t*)btl; @@ -1569,7 +1114,7 @@ usnic_send( assert(frag->sf_endpoint == endpoint); frag->sf_base.uf_remote_seg[0].seg_addr.pval = NULL; /* not a PUT */ - compute_sf_size(frag); + opal_btl_usnic_compute_sf_size(frag); frag->sf_ack_bytes_left = frag->sf_size; #if MSGDEBUG2 @@ -1698,13 +1243,13 @@ static int usnic_sendi(struct mca_btl_base_module_t* btl, * RDMA Memory Pool (de)register callbacks */ static int usnic_reg_mr(void* reg_data, void* base, size_t size, - mca_mpool_base_registration_t* reg) + mca_mpool_base_registration_t* reg) { opal_btl_usnic_module_t* mod = (opal_btl_usnic_module_t*)reg_data; - opal_btl_usnic_reg_t* ud_reg = (opal_btl_usnic_reg_t*)reg; + opal_btl_usnic_reg_t* ur = (opal_btl_usnic_reg_t*)reg; int rc; - rc = fi_mr_reg(mod->domain, base, size, 0, 0, 0, 0, &ud_reg->mr, NULL); + rc = fi_mr_reg(mod->domain, base, size, 0, 0, 0, 0, &ur->ur_mr, NULL); if (0 != rc) { return OPAL_ERR_OUT_OF_RESOURCE; } @@ -1712,21 +1257,20 @@ static int usnic_reg_mr(void* reg_data, void* base, size_t size, return OPAL_SUCCESS; } - static int usnic_dereg_mr(void* reg_data, - mca_mpool_base_registration_t* reg) + mca_mpool_base_registration_t* reg) { - opal_btl_usnic_reg_t* ud_reg = (opal_btl_usnic_reg_t*)reg; + opal_btl_usnic_reg_t* ur = (opal_btl_usnic_reg_t*)reg; - if (ud_reg->mr != NULL) { - if (0 != fi_close(&ud_reg->mr->fid)) { + if (ur->ur_mr != NULL) { + if (0 != fi_close(&ur->ur_mr->fid)) { opal_output(0, "%s: error unpinning USD memory mr=%p: %s\n", - __func__, (void*) ud_reg->mr, strerror(errno)); + __func__, (void*) ur->ur_mr, strerror(errno)); return OPAL_ERROR; } } - 
ud_reg->mr = NULL; + ur->ur_mr = NULL; return OPAL_SUCCESS; } @@ -2243,7 +1787,13 @@ static void init_pml_values(opal_btl_usnic_module_t *module) /* Since we emulate PUT, max_send_size can be same as eager_limit */ - module->super.btl_max_send_size = module->super.btl_eager_limit; + module->super.btl_max_send_size = + module->super.btl_eager_limit; + +#if BTL_VERSION == 30 + module->super.btl_put_limit = + module->super.btl_eager_limit; +#endif } static void init_senders(opal_btl_usnic_module_t *module) @@ -2626,22 +2176,45 @@ static int usnic_ft_event(int state) opal_btl_usnic_module_t opal_btl_usnic_module_template = { .super = { .btl_component = &mca_btl_usnic_component.super, + +#if BTL_VERSION == 20 + .btl_prepare_dst = opal_btl_usnic_prepare_dst, + .btl_seg_size = sizeof(mca_btl_base_segment_t), +#elif BTL_VERSION == 30 + .btl_atomic_flags = 0, + .btl_registration_handle_size = 0, + + .btl_get_limit = 0, + .btl_get_alignment = 0, + .btl_put_limit = 0, + .btl_put_alignment = 0, + + .btl_atomic_op = NULL, + .btl_atomic_fop = NULL, + .btl_atomic_cswap = NULL, +#endif + .btl_exclusivity = MCA_BTL_EXCLUSIVITY_DEFAULT, .btl_flags = MCA_BTL_FLAGS_SEND | MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND_INPLACE, - .btl_seg_size = sizeof(mca_btl_base_segment_t), + .btl_add_procs = usnic_add_procs, .btl_del_procs = usnic_del_procs, + .btl_register = NULL, .btl_finalize = usnic_finalize, + .btl_alloc = usnic_alloc, .btl_free = usnic_free, - .btl_prepare_src = usnic_prepare_src, - .btl_prepare_dst = usnic_prepare_dst, + .btl_prepare_src = opal_btl_usnic_prepare_src, .btl_send = usnic_send, - .btl_put = usnic_put, + .btl_sendi = NULL, + .btl_put = opal_btl_usnic_put, + .btl_get = NULL, .btl_dump = mca_btl_base_dump, + + .btl_mpool = NULL, .btl_register_error = usnic_register_pml_err_cb, .btl_ft_event = usnic_ft_event } diff --git a/opal/mca/btl/usnic/btl_usnic_recv.h b/opal/mca/btl/usnic/btl_usnic_recv.h index 7990975d8d..227c5e62fd 100644 --- a/opal/mca/btl/usnic/btl_usnic_recv.h 
+++ b/opal/mca/btl/usnic/btl_usnic_recv.h @@ -277,6 +277,10 @@ opal_btl_usnic_recv_fast(opal_btl_usnic_module_t *module, opal_output(0, "fast recv %d bytes:\n", bseg->us_btl_header->payload_len + sizeof(opal_btl_usnic_btl_header_t)); opal_btl_usnic_dump_hex(bseg->us_btl_header, bseg->us_btl_header->payload_len + sizeof(opal_btl_usnic_btl_header_t)); #endif + /* If this is a short incoming message (i.e., the message is + wholly contained in this one message -- it is not chunked + across multiple messages), and it's not a PUT from the sender, + then just handle it here. */ if (endpoint != NULL && !endpoint->endpoint_exiting && (OPAL_BTL_USNIC_PAYLOAD_TYPE_FRAG == bseg->us_btl_header->payload_type) && @@ -311,8 +315,10 @@ opal_btl_usnic_dump_hex(bseg->us_btl_header, bseg->us_btl_header->payload_len + drop: channel->chan_deferred_recv = seg; + } - } else { + /* Otherwise, handle all the other cases the "normal" way */ + else { opal_btl_usnic_recv_call(module, seg, channel); } } @@ -382,6 +388,10 @@ opal_btl_usnic_recv(opal_btl_usnic_module_t *module, endpoint = lookup_sender(module, bseg); seg->rs_endpoint = endpoint; + /* If this is a short incoming message (i.e., the message is + wholly contained in this one message -- it is not chunked + across multiple messages), and it's not a PUT from the sender, + then just handle it here. 
*/ if (endpoint != NULL && !endpoint->endpoint_exiting && (OPAL_BTL_USNIC_PAYLOAD_TYPE_FRAG == bseg->us_btl_header->payload_type) && @@ -408,7 +418,10 @@ opal_btl_usnic_recv(opal_btl_usnic_module_t *module, reg->cbfunc(&module->super, bseg->us_btl_header->tag, &seg->rs_desc, reg->cbdata); - } else { + } + + /* Otherwise, handle all the other cases the "normal" way */ + else { opal_btl_usnic_recv_call(module, seg, channel); } } diff --git a/opal/mca/btl/usnic/btl_usnic_send.c b/opal/mca/btl/usnic/btl_usnic_send.c index 02e3a2b753..aff68722aa 100644 --- a/opal/mca/btl/usnic/btl_usnic_send.c +++ b/opal/mca/btl/usnic/btl_usnic_send.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006 Sandia National Laboratories. All rights * reserved. - * Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ @@ -106,7 +106,8 @@ opal_btl_usnic_chunk_send_complete(opal_btl_usnic_module_t *module, * This routine lives in this file to help prevent automatic inlining by the * compiler. * - * The "tag" only applies to sends. */ + * The "tag" only applies to sends. + */ int opal_btl_usnic_finish_put_or_send( opal_btl_usnic_module_t *module, diff --git a/opal/mca/btl/vader/btl_vader.h b/opal/mca/btl/vader/btl_vader.h index fb9f14c5a1..8bc7f529f0 100644 --- a/opal/mca/btl/vader/btl_vader.h +++ b/opal/mca/btl/vader/btl_vader.h @@ -12,8 +12,8 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2014 Los Alamos National Security, LLC. - * All rights reserved. + * Copyright (c) 2010-2015 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -98,7 +98,7 @@ enum { * Shared Memory (VADER) BTL module. 
*/ struct mca_btl_vader_component_t { - mca_btl_base_component_2_0_0_t super; /**< base BTL component */ + mca_btl_base_component_3_0_0_t super; /**< base BTL component */ int vader_free_list_num; /**< initial size of free lists */ int vader_free_list_max; /**< maximum size of free lists */ int vader_free_list_inc; /**< number of elements to alloc when growing free lists */ @@ -115,7 +115,6 @@ struct mca_btl_vader_component_t { ompi_free_list_t vader_frags_eager; /**< free list of vader send frags */ ompi_free_list_t vader_frags_max_send; /**< free list of vader max send frags (large fragments) */ ompi_free_list_t vader_frags_user; /**< free list of small inline frags */ - ompi_free_list_t vader_frags_rdma; /**< free list of vader put/get frags (single-copy) */ unsigned int fbox_threshold; /**< number of sends required before we setup a send fast box for a peer */ unsigned int fbox_max; /**< maximum number of send fast boxes to allocate */ @@ -208,21 +207,24 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl, * @param descriptor (IN) Description of the data to be transferred */ #if OPAL_BTL_VADER_HAVE_XPMEM -int mca_btl_vader_put_xpmem (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des); +int mca_btl_vader_put_xpmem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); #endif #if OPAL_BTL_VADER_HAVE_CMA -int mca_btl_vader_put_cma (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des); +int mca_btl_vader_put_cma (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t 
*local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); #endif #if OPAL_BTL_VADER_HAVE_KNEM -int mca_btl_vader_put_knem (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des); +int mca_btl_vader_put_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); #endif /** @@ -233,21 +235,24 @@ int mca_btl_vader_put_knem (struct mca_btl_base_module_t *btl, * @param descriptor (IN) Description of the data to be transferred */ #if OPAL_BTL_VADER_HAVE_XPMEM -int mca_btl_vader_get_xpmem (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des); +int mca_btl_vader_get_xpmem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); #endif #if OPAL_BTL_VADER_HAVE_CMA -int mca_btl_vader_get_cma (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des); +int mca_btl_vader_get_cma (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); #endif #if OPAL_BTL_VADER_HAVE_KNEM -int 
mca_btl_vader_get_knem (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des); +int mca_btl_vader_get_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); #endif /** @@ -260,6 +265,7 @@ mca_btl_base_descriptor_t* mca_btl_vader_alloc (struct mca_btl_base_module_t* bt struct mca_btl_base_endpoint_t* endpoint, uint8_t order, size_t size, uint32_t flags); + END_C_DECLS #endif diff --git a/opal/mca/btl/vader/btl_vader_component.c b/opal/mca/btl/vader/btl_vader_component.c index 5979d91f08..5f55dc914b 100644 --- a/opal/mca/btl/vader/btl_vader_component.c +++ b/opal/mca/btl/vader/btl_vader_component.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2014 Los Alamos National Security, LLC. + * Copyright (c) 2010-2015 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2011 NVIDIA Corporation. All rights reserved. * Copyright (c) 2014 Intel, Inc. All rights reserved. 
@@ -227,12 +227,12 @@ static int mca_btl_vader_component_register (void) mca_btl_vader.super.btl_eager_limit = 32 * 1024; mca_btl_vader.super.btl_rndv_eager_limit = mca_btl_vader.super.btl_eager_limit; mca_btl_vader.super.btl_max_send_size = mca_btl_vader.super.btl_eager_limit; - mca_btl_vader.super.btl_min_rdma_pipeline_size = mca_btl_vader.super.btl_eager_limit; + mca_btl_vader.super.btl_min_rdma_pipeline_size = INT_MAX; } else { mca_btl_vader.super.btl_eager_limit = 4 * 1024; mca_btl_vader.super.btl_rndv_eager_limit = 32 * 1024; mca_btl_vader.super.btl_max_send_size = 32 * 1024; - mca_btl_vader.super.btl_min_rdma_pipeline_size = 32 * 1024; + mca_btl_vader.super.btl_min_rdma_pipeline_size = INT_MAX; } mca_btl_vader.super.btl_rdma_pipeline_send_length = mca_btl_vader.super.btl_eager_limit; @@ -251,7 +251,6 @@ static int mca_btl_vader_component_register (void) mca_btl_vader.super.btl_bandwidth = 10000; /* Mbs */ } - mca_btl_vader.super.btl_seg_size = sizeof (mca_btl_vader_segment_t); mca_btl_vader.super.btl_latency = 1; /* Microsecs */ /* Call the BTL based to register its MCA params */ @@ -272,7 +271,6 @@ static int mca_btl_vader_component_open(void) OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_eager, ompi_free_list_t); OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_user, ompi_free_list_t); OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_max_send, ompi_free_list_t); - OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_rdma, ompi_free_list_t); OBJ_CONSTRUCT(&mca_btl_vader_component.lock, opal_mutex_t); OBJ_CONSTRUCT(&mca_btl_vader_component.pending_endpoints, opal_list_t); OBJ_CONSTRUCT(&mca_btl_vader_component.pending_fragments, opal_list_t); @@ -293,7 +291,6 @@ static int mca_btl_vader_component_close(void) OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_eager); OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_user); OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_max_send); - OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_rdma); 
OBJ_DESTRUCT(&mca_btl_vader_component.lock); OBJ_DESTRUCT(&mca_btl_vader_component.pending_endpoints); OBJ_DESTRUCT(&mca_btl_vader_component.pending_fragments); @@ -349,12 +346,11 @@ static void mca_btl_vader_select_next_single_copy_mechanism (void) static void mca_btl_vader_check_single_copy (void) { int initial_mechanism = mca_btl_vader_component.single_copy_mechanism; - int rc; #if OPAL_BTL_VADER_HAVE_XPMEM if (MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism) { /* try to create an xpmem segment for the entire address space */ - rc = mca_btl_vader_xpmem_init (); + int rc = mca_btl_vader_xpmem_init (); if (OPAL_SUCCESS != rc) { if (MCA_BTL_VADER_XPMEM == initial_mechanism) { opal_show_help("help-btl-vader.txt", "xpmem-make-failed", @@ -414,7 +410,7 @@ static void mca_btl_vader_check_single_copy (void) #if OPAL_BTL_VADER_HAVE_KNEM if (MCA_BTL_VADER_KNEM == mca_btl_vader_component.single_copy_mechanism) { /* mca_btl_vader_knem_init will set the appropriate get/put functions */ - rc = mca_btl_vader_knem_init (); + int rc = mca_btl_vader_knem_init (); if (OPAL_SUCCESS != rc) { if (MCA_BTL_VADER_KNEM == initial_mechanism) { opal_show_help("help-btl-vader.txt", "knem requested but not available", @@ -559,7 +555,7 @@ failed: void mca_btl_vader_poll_handle_frag (mca_btl_vader_hdr_t *hdr, struct mca_btl_base_endpoint_t *endpoint) { mca_btl_base_segment_t segments[2]; - mca_btl_base_descriptor_t frag = {.des_local = segments, .des_local_count = 1}; + mca_btl_base_descriptor_t frag = {.des_segments = segments, .des_segment_count = 1}; const mca_btl_active_message_callback_t *reg; if (hdr->flags & MCA_BTL_VADER_FLAG_COMPLETE) { @@ -579,7 +575,7 @@ void mca_btl_vader_poll_handle_frag (mca_btl_vader_hdr_t *hdr, struct mca_btl_ba &segments[1].seg_addr.pval); segments[1].seg_len = hdr->sc_iov.iov_len; - frag.des_local_count = 2; + frag.des_segment_count = 2; /* recv upcall */ reg->cbfunc(&mca_btl_vader.super, hdr->tag, &frag, reg->cbdata); diff --git 
a/opal/mca/btl/vader/btl_vader_fbox.h b/opal/mca/btl/vader/btl_vader_fbox.h index 43b2fac6b5..f48ea91616 100644 --- a/opal/mca/btl/vader/btl_vader_fbox.h +++ b/opal/mca/btl/vader/btl_vader_fbox.h @@ -204,7 +204,7 @@ static inline bool mca_btl_vader_check_fboxes (void) /* the 0xff tag indicates we should skip the rest of the buffer */ if (OPAL_LIKELY((0xfe & hdr.data.tag) != 0xfe)) { mca_btl_base_segment_t segment; - mca_btl_base_descriptor_t desc = {.des_local = &segment, .des_local_count = 1}; + mca_btl_base_descriptor_t desc = {.des_segments = &segment, .des_segment_count = 1}; const mca_btl_active_message_callback_t *reg = mca_btl_base_active_message_trigger + hdr.data.tag; diff --git a/opal/mca/btl/vader/btl_vader_frag.c b/opal/mca/btl/vader/btl_vader_frag.c index 6cad4e5b63..b39f5fb3c3 100644 --- a/opal/mca/btl/vader/btl_vader_frag.c +++ b/opal/mca/btl/vader/btl_vader_frag.c @@ -31,11 +31,11 @@ static inline void mca_btl_vader_frag_constructor (mca_btl_vader_frag_t *frag) if(frag->hdr != NULL) { frag->hdr->frag = frag; frag->hdr->flags = 0; - frag->segments[0].base.seg_addr.pval = (char *)(frag->hdr + 1); + frag->segments[0].seg_addr.pval = (char *)(frag->hdr + 1); } - frag->base.des_local = &frag->segments->base; - frag->base.des_local_count = 1; + frag->base.des_segments = frag->segments; + frag->base.des_segment_count = 1; frag->fbox = NULL; } @@ -65,8 +65,6 @@ void mca_btl_vader_frag_init (ompi_free_list_item_t *item, void *ctx) frag->my_list = &mca_btl_vader_component.vader_frags_eager; } else if (mca_btl_vader.super.btl_max_send_size == data_size) { frag->my_list = &mca_btl_vader_component.vader_frags_max_send; - } else { - frag->my_list = &mca_btl_vader_component.vader_frags_rdma; } if (data_size) { diff --git a/opal/mca/btl/vader/btl_vader_frag.h b/opal/mca/btl/vader/btl_vader_frag.h index fee0bdb565..2c6e5c9091 100644 --- a/opal/mca/btl/vader/btl_vader_frag.h +++ b/opal/mca/btl/vader/btl_vader_frag.h @@ -12,7 +12,7 @@ * All rights reserved. 
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -57,15 +57,6 @@ struct mca_btl_vader_hdr_t { }; typedef struct mca_btl_vader_hdr_t mca_btl_vader_hdr_t; -struct mca_btl_vader_segment_t { - mca_btl_base_segment_t base; -#if OPAL_BTL_VADER_HAVE_KNEM - uint64_t cookie; - intptr_t registered_base; -#endif -}; -typedef struct mca_btl_vader_segment_t mca_btl_vader_segment_t; - /** * shared memory send fragment derived type. */ @@ -73,7 +64,7 @@ struct mca_btl_vader_frag_t { /** base object */ mca_btl_base_descriptor_t base; /** storage for segment data (max 2) */ - mca_btl_vader_segment_t segments[2]; + mca_btl_base_segment_t segments[2]; /** endpoint this fragment is active on */ struct mca_btl_base_endpoint_t *endpoint; /** fast box in use (or NULL) */ @@ -82,9 +73,6 @@ struct mca_btl_vader_frag_t { mca_btl_vader_hdr_t *hdr; /** free list this fragment was allocated within */ ompi_free_list_t *my_list; -#if OPAL_BTL_VADER_HAVE_KNEM - uint64_t cookie; -#endif }; typedef struct mca_btl_vader_frag_t mca_btl_vader_frag_t; @@ -108,37 +96,16 @@ static inline int mca_btl_vader_frag_alloc (mca_btl_vader_frag_t **frag, ompi_fr return OPAL_SUCCESS; } -static inline int mca_btl_vader_frag_alloc_rdma (mca_btl_vader_frag_t **frag, ompi_free_list_t *list, - struct mca_btl_base_endpoint_t *endpoint) { - ompi_free_list_item_t *item; - - OMPI_FREE_LIST_GET_MT(list, item); - *frag = (mca_btl_vader_frag_t *) item; - if (OPAL_LIKELY(NULL != item)) { - (*frag)->endpoint = endpoint; - } - - return OPAL_SUCCESS; -} - static inline void mca_btl_vader_frag_return (mca_btl_vader_frag_t *frag) { if (frag->hdr) { frag->hdr->flags = 0; } - frag->segments[0].base.seg_addr.pval = (char *)(frag->hdr + 1); - frag->base.des_local_count = 1; + 
frag->segments[0].seg_addr.pval = (char *)(frag->hdr + 1); + frag->base.des_segment_count = 1; frag->fbox = NULL; -#if OPAL_BTL_VADER_HAVE_KNEM - if (frag->cookie) { - /* NTH: explicity ignore the return code. Don't care about this cookie anymore anyway. */ - (void) ioctl(mca_btl_vader.knem_fd, KNEM_CMD_DESTROY_REGION, &frag->cookie); - frag->cookie = 0; - } -#endif - OMPI_FREE_LIST_RETURN_MT(frag->my_list, (ompi_free_list_item_t *)frag); } @@ -153,9 +120,6 @@ OBJ_CLASS_DECLARATION(mca_btl_vader_frag_t); #define MCA_BTL_VADER_FRAG_ALLOC_USER(frag, endpoint) \ mca_btl_vader_frag_alloc (&(frag), &mca_btl_vader_component.vader_frags_user, endpoint) -#define MCA_BTL_VADER_FRAG_ALLOC_RDMA(frag, endpoint) \ - mca_btl_vader_frag_alloc_rdma (&(frag), &mca_btl_vader_component.vader_frags_rdma, endpoint) - #define MCA_BTL_VADER_FRAG_RETURN(frag) mca_btl_vader_frag_return(frag) diff --git a/opal/mca/btl/vader/btl_vader_get.c b/opal/mca/btl/vader/btl_vader_get.c index a0ce15e312..ce8d7b89d8 100644 --- a/opal/mca/btl/vader/btl_vader_get.c +++ b/opal/mca/btl/vader/btl_vader_get.c @@ -33,47 +33,42 @@ * @param descriptor (IN) Description of the data to be transferred */ #if OPAL_BTL_VADER_HAVE_XPMEM -int mca_btl_vader_get_xpmem (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des) +int mca_btl_vader_get_xpmem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) des; - mca_btl_base_segment_t *src = des->des_remote; - mca_btl_base_segment_t *dst = des->des_local; - const size_t size = min(dst->seg_len, src->seg_len); mca_mpool_base_registration_t *reg; void *rem_ptr; - reg = 
vader_get_registation (endpoint, src->seg_addr.pval, src->seg_len, 0, &rem_ptr); + /* silence warning about unused arguments */ + (void) local_handle; + (void) remote_handle; + + reg = vader_get_registation (endpoint, (void *)(intptr_t) remote_address, size, 0, &rem_ptr); if (OPAL_UNLIKELY(NULL == rem_ptr)) { return OPAL_ERROR; } - vader_memmove (dst->seg_addr.pval, rem_ptr, size); + vader_memmove (local_address, rem_ptr, size); vader_return_registration (reg, endpoint); /* always call the callback function */ - frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - - frag->endpoint = endpoint; - mca_btl_vader_frag_complete (frag); + cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); return OPAL_SUCCESS; } #endif #if OPAL_BTL_VADER_HAVE_CMA -int mca_btl_vader_get_cma (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des) +int mca_btl_vader_get_cma (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) des; - mca_btl_base_segment_t *src = des->des_remote; - mca_btl_base_segment_t *dst = des->des_local; - const size_t size = min(dst->seg_len, src->seg_len); - struct iovec src_iov = {.iov_base = src->seg_addr.pval, .iov_len = size}; - struct iovec dst_iov = {.iov_base = dst->seg_addr.pval, .iov_len = size}; + struct iovec src_iov = {.iov_base = (void *)(intptr_t) remote_address, .iov_len = size}; + struct iovec dst_iov = {.iov_base = local_address, .iov_len = size}; ssize_t ret; ret = process_vm_readv (endpoint->segment_data.other.seg_ds->seg_cpid, &dst_iov, 1, &src_iov, 1, 0); @@ -83,36 +78,29 @@ int mca_btl_vader_get_cma (struct 
mca_btl_base_module_t *btl, } /* always call the callback function */ - frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - - frag->endpoint = endpoint; - mca_btl_vader_frag_complete (frag); + cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); return OPAL_SUCCESS; } #endif #if OPAL_BTL_VADER_HAVE_KNEM -int mca_btl_vader_get_knem (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des) +int mca_btl_vader_get_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) des; - mca_btl_vader_segment_t *src = (mca_btl_vader_segment_t *) des->des_remote; - mca_btl_vader_segment_t *dst = (mca_btl_vader_segment_t *) des->des_local; - const size_t size = min(dst->base.seg_len, src->base.seg_len); - intptr_t offset = src->base.seg_addr.lval - src->registered_base; struct knem_cmd_param_iovec recv_iovec; struct knem_cmd_inline_copy icopy; /* Fill in the ioctl data fields. There's no async completion, so we don't need to worry about getting a slot, etc. */ - recv_iovec.base = (uintptr_t) dst->base.seg_addr.lval; + recv_iovec.base = (uintptr_t) local_address; recv_iovec.len = size; icopy.local_iovec_array = (uintptr_t) &recv_iovec; icopy.local_iovec_nr = 1; - icopy.remote_cookie = src->cookie; - icopy.remote_offset = offset; + icopy.remote_cookie = remote_handle->cookie; + icopy.remote_offset = remote_address - remote_handle->base_addr; icopy.write = 0; icopy.flags = 0; @@ -120,7 +108,7 @@ int mca_btl_vader_get_knem (struct mca_btl_base_module_t *btl, * is greater than the cutoff. 
Not that if DMA is not supported * or the user specified 0 for knem_dma_min the knem_dma_min was * set to UINT_MAX in mca_btl_vader_knem_init. */ - if (mca_btl_vader_component.knem_dma_min <= dst->base.seg_len) { + if (mca_btl_vader_component.knem_dma_min <= size) { icopy.flags = KNEM_FLAG_DMA; } /* synchronous flags only, no need to specify icopy.async_status_index */ @@ -136,10 +124,7 @@ int mca_btl_vader_get_knem (struct mca_btl_base_module_t *btl, } /* always call the callback function */ - frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - - frag->endpoint = endpoint; - mca_btl_vader_frag_complete (frag); + cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); return OPAL_SUCCESS; } diff --git a/opal/mca/btl/vader/btl_vader_knem.c b/opal/mca/btl/vader/btl_vader_knem.c index e776ebf9e1..157dc04ae2 100644 --- a/opal/mca/btl/vader/btl_vader_knem.c +++ b/opal/mca/btl/vader/btl_vader_knem.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ * @@ -19,9 +19,83 @@ #include #include "opal/util/show_help.h" +#include "opal/mca/mpool/grdma/mpool_grdma.h" + +OBJ_CLASS_INSTANCE(mca_btl_vader_registration_handle_t, mca_mpool_base_registration_t, NULL, NULL); + +static int mca_btl_vader_knem_reg (void *reg_data, void *base, size_t size, + mca_mpool_base_registration_t *reg) +{ + mca_btl_vader_registration_handle_t *knem_reg = (mca_btl_vader_registration_handle_t *) reg; + struct knem_cmd_create_region knem_cr; + struct knem_cmd_param_iovec knem_iov; + + knem_iov.base = (uintptr_t) base; + knem_iov.len = size; + + knem_cr.iovec_array = (uintptr_t) &knem_iov; + knem_cr.iovec_nr = 1; + /* TODO -- set proper access flags when the protection is passed down */ + knem_cr.protection = PROT_READ | PROT_WRITE; + + /* Vader will explicitly destroy this cookie */ + knem_cr.flags = 0; + if (OPAL_UNLIKELY(ioctl(mca_btl_vader.knem_fd, KNEM_CMD_CREATE_REGION, &knem_cr) < 0)) { + return OPAL_ERROR; + } + + knem_reg->btl_handle.cookie = knem_cr.cookie; + knem_reg->btl_handle.base_addr = (intptr_t) base; + + return OPAL_SUCCESS; +} + +static int mca_btl_vader_knem_dereg (void *reg_data, mca_mpool_base_registration_t *reg) +{ + mca_btl_vader_registration_handle_t *knem_reg = (mca_btl_vader_registration_handle_t *) reg; + + /* NTH: explicity ignore the return code. Don't care about this cookie anymore anyway. 
*/ + (void) ioctl(mca_btl_vader.knem_fd, KNEM_CMD_DESTROY_REGION, &knem_reg->btl_handle.cookie); + + return OPAL_SUCCESS; +} + +static mca_btl_base_registration_handle_t * +mca_btl_vader_register_mem_knem (struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t *endpoint, + void *base, size_t size, uint32_t flags) +{ + mca_btl_vader_registration_handle_t *reg = NULL; + int rc; + + rc = btl->btl_mpool->mpool_register (btl->btl_mpool, base, size, 0, + (mca_mpool_base_registration_t **) &reg); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + return NULL; + } + + return &reg->btl_handle; +} + +static int +mca_btl_vader_deregister_mem_knem (struct mca_btl_base_module_t *btl, struct mca_btl_base_registration_handle_t *handle) +{ + mca_btl_vader_registration_handle_t *reg = + (mca_btl_vader_registration_handle_t *)((intptr_t) handle - offsetof (mca_btl_vader_registration_handle_t, btl_handle)); + + btl->btl_mpool->mpool_deregister (btl->btl_mpool, &reg->base); + + return OPAL_SUCCESS; +} int mca_btl_vader_knem_init (void) { + mca_mpool_base_resources_t mpool_resources = { + .pool_name = "vader", .reg_data = NULL, + .sizeof_reg = sizeof (mca_btl_vader_registration_handle_t), + .register_mem = mca_btl_vader_knem_reg, + .deregister_mem = mca_btl_vader_knem_dereg + }; struct knem_cmd_info knem_info; int rc; @@ -74,6 +148,17 @@ int mca_btl_vader_knem_init (void) mca_btl_vader.super.btl_get = mca_btl_vader_get_knem; mca_btl_vader.super.btl_put = mca_btl_vader_put_knem; + /* knem requires registration */ + mca_btl_vader.super.btl_register_mem = mca_btl_vader_register_mem_knem; + mca_btl_vader.super.btl_deregister_mem = mca_btl_vader_deregister_mem_knem; + mca_btl_vader.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t); + + mca_btl_vader.super.btl_mpool = mca_mpool_base_module_create ("grdma", NULL, + &mpool_resources); + if (NULL == mca_btl_vader.super.btl_mpool) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + return OPAL_SUCCESS; } while (0); @@ -89,6 
+174,11 @@ int mca_btl_vader_knem_fini (void) mca_btl_vader.knem_fd = -1; } + if (mca_btl_vader.super.btl_mpool) { + (void) mca_mpool_base_module_destroy (mca_btl_vader.super.btl_mpool); + mca_btl_vader.super.btl_mpool = NULL; + } + return OPAL_SUCCESS; } diff --git a/opal/mca/btl/vader/btl_vader_knem.h b/opal/mca/btl/vader/btl_vader_knem.h index 1d6fa2d164..8d3b840209 100644 --- a/opal/mca/btl/vader/btl_vader_knem.h +++ b/opal/mca/btl/vader/btl_vader_knem.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -17,6 +17,18 @@ #include #include +/* At this time only knem requires a registration of "RDMA" buffers */ +struct mca_btl_base_registration_handle_t { + uint64_t cookie; + intptr_t base_addr; +}; + +struct mca_btl_vader_registration_handle_t { + mca_mpool_base_registration_t base; + mca_btl_base_registration_handle_t btl_handle; +}; +typedef struct mca_btl_vader_registration_handle_t mca_btl_vader_registration_handle_t; + int mca_btl_vader_knem_init (void); int mca_btl_vader_knem_fini (void); int mca_btl_vader_knem_progress (void); diff --git a/opal/mca/btl/vader/btl_vader_module.c b/opal/mca/btl/vader/btl_vader_module.c index 3ab684c963..2f30cd3071 100644 --- a/opal/mca/btl/vader/btl_vader_module.c +++ b/opal/mca/btl/vader/btl_vader_module.c @@ -48,7 +48,6 @@ static int vader_free (struct mca_btl_base_module_t* btl, mca_btl_base_descripto static struct mca_btl_base_descriptor_t *vader_prepare_src ( struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - mca_mpool_base_registration_t *registration, struct opal_convertor_t *convertor, uint8_t order, size_t reserve, @@ -56,16 +55,6 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src ( uint32_t flags ); -static struct mca_btl_base_descriptor_t *vader_prepare_dst ( - struct 
mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_mpool_base_registration_t *registration, - struct opal_convertor_t *convertor, - uint8_t order, - size_t reserve, - size_t *size, - uint32_t flags); - static int vader_add_procs(struct mca_btl_base_module_t* btl, size_t nprocs, struct opal_proc_t **procs, struct mca_btl_base_endpoint_t** peers, @@ -82,7 +71,6 @@ mca_btl_vader_t mca_btl_vader = { .btl_alloc = mca_btl_vader_alloc, .btl_free = vader_free, .btl_prepare_src = vader_prepare_src, - .btl_prepare_dst = vader_prepare_dst, .btl_send = mca_btl_vader_send, .btl_sendi = mca_btl_vader_sendi, .btl_dump = mca_btl_base_dump, @@ -108,21 +96,6 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n) component->segment_offset = MCA_BTL_VADER_FIFO_SIZE; /* initialize fragment descriptor free lists */ - /* initialize free list for single copy (get, put) */ - if (MCA_BTL_VADER_NONE != mca_btl_vader_component.single_copy_mechanism) { - rc = ompi_free_list_init_ex_new (&component->vader_frags_rdma, - sizeof(mca_btl_vader_frag_t), 8, - OBJ_CLASS(mca_btl_vader_frag_t), - 0, opal_cache_line_size, - component->vader_free_list_num, - component->vader_free_list_max, - component->vader_free_list_inc, - NULL, mca_btl_vader_frag_init, (void *) 0); - if (OPAL_SUCCESS != rc) { - return rc; - } - } - /* initialize free list for small send and inline fragments */ rc = ompi_free_list_init_ex_new(&component->vader_frags_user, sizeof(mca_btl_vader_frag_t), @@ -418,7 +391,7 @@ mca_btl_base_descriptor_t *mca_btl_vader_alloc(struct mca_btl_base_module_t *btl } if (OPAL_LIKELY(frag != NULL)) { - frag->segments[0].base.seg_len = size; + frag->segments[0].seg_len = size; frag->base.des_flags = flags; frag->base.order = order; @@ -440,56 +413,6 @@ static int vader_free (struct mca_btl_base_module_t *btl, mca_btl_base_descripto return OPAL_SUCCESS; } -struct mca_btl_base_descriptor_t *vader_prepare_dst(struct mca_btl_base_module_t *btl, - struct 
mca_btl_base_endpoint_t *endpoint, - struct mca_mpool_base_registration_t *registration, - struct opal_convertor_t *convertor, - uint8_t order, size_t reserve, size_t *size, - uint32_t flags) -{ - mca_btl_vader_frag_t *frag; - void *data_ptr; - - (void) MCA_BTL_VADER_FRAG_ALLOC_RDMA(frag, endpoint); - if (OPAL_UNLIKELY(NULL == frag)) { - return NULL; - } - - opal_convertor_get_current_pointer (convertor, &data_ptr); - - frag->segments[0].base.seg_addr.lval = (uint64_t)(uintptr_t) data_ptr; - frag->segments[0].base.seg_len = *size; - -#if OPAL_BTL_VADER_HAVE_KNEM - if (MCA_BTL_VADER_KNEM == mca_btl_vader_component.single_copy_mechanism) { - struct knem_cmd_create_region knem_cr; - struct knem_cmd_param_iovec knem_iov; - - knem_iov.base = (uintptr_t) data_ptr; - knem_iov.len = *size; - - knem_cr.iovec_array = (uintptr_t) &knem_iov; - knem_cr.iovec_nr = 1; - knem_cr.protection = PROT_WRITE; - /* Vader will explicitly destroy this cookie */ - knem_cr.flags = 0; - if (OPAL_UNLIKELY(ioctl(mca_btl_vader.knem_fd, KNEM_CMD_CREATE_REGION, &knem_cr) < 0)) { - MCA_BTL_VADER_FRAG_RETURN(frag); - return NULL; - } - - frag->segments[0].cookie = knem_cr.cookie; - frag->segments[0].registered_base = (intptr_t) data_ptr; - frag->cookie = knem_cr.cookie; - } -#endif /* OPAL_BTL_SM_HAVE_KNEM */ - - frag->base.order = order; - frag->base.des_flags = flags; - - return &frag->base; -} - /** * Pack data * @@ -497,7 +420,6 @@ struct mca_btl_base_descriptor_t *vader_prepare_dst(struct mca_btl_base_module_t */ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - mca_mpool_base_registration_t *registration, struct opal_convertor_t *convertor, uint8_t order, size_t reserve, size_t *size, uint32_t flags) @@ -510,118 +432,84 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_ opal_convertor_get_current_pointer (convertor, &data_ptr); - if (OPAL_LIKELY(reserve)) { - /* in place 
send fragment */ - if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) { - uint32_t iov_count = 1; - struct iovec iov; + /* in place send fragment */ + if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) { + uint32_t iov_count = 1; + struct iovec iov; - /* non-contiguous data requires the convertor */ - if (MCA_BTL_VADER_XPMEM != mca_btl_vader_component.single_copy_mechanism && - total_size > mca_btl_vader.super.btl_eager_limit) { - (void) MCA_BTL_VADER_FRAG_ALLOC_MAX(frag, endpoint); - } else - (void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag, endpoint); + /* non-contiguous data requires the convertor */ + if (MCA_BTL_VADER_XPMEM != mca_btl_vader_component.single_copy_mechanism && + total_size > mca_btl_vader.super.btl_eager_limit) { + (void) MCA_BTL_VADER_FRAG_ALLOC_MAX(frag, endpoint); + } else + (void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag, endpoint); - if (OPAL_UNLIKELY(NULL == frag)) { - return NULL; - } - - iov.iov_len = *size; - iov.iov_base = - (IOVBASE_TYPE *)(((uintptr_t)(frag->segments[0].base.seg_addr.pval)) + - reserve); - - rc = opal_convertor_pack (convertor, &iov, &iov_count, size); - if (OPAL_UNLIKELY(rc < 0)) { - MCA_BTL_VADER_FRAG_RETURN(frag); - return NULL; - } - - frag->segments[0].base.seg_len = *size + reserve; - } else { - if (MCA_BTL_VADER_XPMEM != mca_btl_vader_component.single_copy_mechanism) { - if (OPAL_LIKELY(total_size <= mca_btl_vader.super.btl_eager_limit)) { - (void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag, endpoint); - } else { - (void) MCA_BTL_VADER_FRAG_ALLOC_MAX(frag, endpoint); - } - } else - (void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag, endpoint); - - if (OPAL_UNLIKELY(NULL == frag)) { - return NULL; - } - -#if OPAL_BTL_VADER_HAVE_XPMEM - /* use xpmem to send this segment if it is above the max inline send size */ - if (OPAL_UNLIKELY(MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism && - total_size > (size_t) mca_btl_vader_component.max_inline_send)) { - /* single copy send */ - frag->hdr->flags = 
MCA_BTL_VADER_FLAG_SINGLE_COPY; - - /* set up single copy io vector */ - frag->hdr->sc_iov.iov_base = data_ptr; - frag->hdr->sc_iov.iov_len = *size; - - frag->segments[0].base.seg_len = reserve; - frag->segments[1].base.seg_len = *size; - frag->segments[1].base.seg_addr.pval = data_ptr; - frag->base.des_local_count = 2; - } else { -#endif - - /* inline send */ - if (OPAL_LIKELY(MCA_BTL_DES_FLAGS_BTL_OWNERSHIP & flags)) { - /* try to reserve a fast box for this transfer only if the - * fragment does not belong to the caller */ - fbox = mca_btl_vader_reserve_fbox (endpoint, total_size); - if (OPAL_LIKELY(fbox)) { - frag->segments[0].base.seg_addr.pval = fbox; - } - - frag->fbox = fbox; - } - - /* NTH: the covertor adds some latency so we bypass it here */ - memcpy ((void *)((uintptr_t)frag->segments[0].base.seg_addr.pval + reserve), data_ptr, *size); - frag->segments[0].base.seg_len = total_size; -#if OPAL_BTL_VADER_HAVE_XPMEM - } -#endif - } - } else { - /* put/get fragment */ - (void) MCA_BTL_VADER_FRAG_ALLOC_RDMA(frag, endpoint); if (OPAL_UNLIKELY(NULL == frag)) { return NULL; } - frag->segments[0].base.seg_addr.lval = (uint64_t)(uintptr_t) data_ptr; - frag->segments[0].base.seg_len = total_size; -#if OPAL_BTL_VADER_HAVE_KNEM - if (MCA_BTL_VADER_KNEM == mca_btl_vader_component.single_copy_mechanism) { - struct knem_cmd_create_region knem_cr; - struct knem_cmd_param_iovec knem_iov; + iov.iov_len = *size; + iov.iov_base = + (IOVBASE_TYPE *)(((uintptr_t)(frag->segments[0].seg_addr.pval)) + + reserve); - knem_iov.base = (uintptr_t) data_ptr; - knem_iov.len = total_size; + rc = opal_convertor_pack (convertor, &iov, &iov_count, size); + if (OPAL_UNLIKELY(rc < 0)) { + MCA_BTL_VADER_FRAG_RETURN(frag); + return NULL; + } - knem_cr.iovec_array = (uintptr_t) &knem_iov; - knem_cr.iovec_nr = 1; - knem_cr.protection = PROT_READ | PROT_WRITE; - /* Vader will explicitly destroy this cookie */ - knem_cr.flags = 0; - if (OPAL_UNLIKELY(ioctl(mca_btl_vader.knem_fd, 
KNEM_CMD_CREATE_REGION, &knem_cr) < 0)) { - MCA_BTL_VADER_FRAG_RETURN(frag); - return NULL; + frag->segments[0].seg_len = *size + reserve; + } else { + if (MCA_BTL_VADER_XPMEM != mca_btl_vader_component.single_copy_mechanism) { + if (OPAL_LIKELY(total_size <= mca_btl_vader.super.btl_eager_limit)) { + (void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag, endpoint); + } else { + (void) MCA_BTL_VADER_FRAG_ALLOC_MAX(frag, endpoint); + } + } else + (void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag, endpoint); + + if (OPAL_UNLIKELY(NULL == frag)) { + return NULL; + } + +#if OPAL_BTL_VADER_HAVE_XPMEM + /* use xpmem to send this segment if it is above the max inline send size */ + if (OPAL_UNLIKELY(MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism && + total_size > (size_t) mca_btl_vader_component.max_inline_send)) { + /* single copy send */ + frag->hdr->flags = MCA_BTL_VADER_FLAG_SINGLE_COPY; + + /* set up single copy io vector */ + frag->hdr->sc_iov.iov_base = data_ptr; + frag->hdr->sc_iov.iov_len = *size; + + frag->segments[0].seg_len = reserve; + frag->segments[1].seg_len = *size; + frag->segments[1].seg_addr.pval = data_ptr; + frag->base.des_segment_count = 2; + } else { +#endif + + /* inline send */ + if (OPAL_LIKELY(MCA_BTL_DES_FLAGS_BTL_OWNERSHIP & flags)) { + /* try to reserve a fast box for this transfer only if the + * fragment does not belong to the caller */ + fbox = mca_btl_vader_reserve_fbox (endpoint, total_size); + if (OPAL_LIKELY(fbox)) { + frag->segments[0].seg_addr.pval = fbox; + } + + frag->fbox = fbox; } - frag->segments[0].cookie = knem_cr.cookie; - frag->segments[0].registered_base = (intptr_t) data_ptr; - frag->cookie = knem_cr.cookie; + /* NTH: the covertor adds some latency so we bypass it here */ + memcpy ((void *)((uintptr_t)frag->segments[0].seg_addr.pval + reserve), data_ptr, *size); + frag->segments[0].seg_len = total_size; +#if OPAL_BTL_VADER_HAVE_XPMEM } -#endif /* OPAL_BTL_SM_HAVE_KNEM */ +#endif } frag->base.order = order; diff --git 
a/opal/mca/btl/vader/btl_vader_put.c b/opal/mca/btl/vader/btl_vader_put.c index 49c3ddabb3..3107f420b3 100644 --- a/opal/mca/btl/vader/btl_vader_put.c +++ b/opal/mca/btl/vader/btl_vader_put.c @@ -35,47 +35,38 @@ * @param descriptor (IN) Description of the data to be transferred */ #if OPAL_BTL_VADER_HAVE_XPMEM -int mca_btl_vader_put_xpmem (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des) +int mca_btl_vader_put_xpmem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) des; - mca_btl_base_segment_t *src = des->des_local; - mca_btl_base_segment_t *dst = des->des_remote; - const size_t size = min(dst->seg_len, src->seg_len); mca_mpool_base_registration_t *reg; void *rem_ptr; - reg = vader_get_registation (endpoint, dst->seg_addr.pval, dst->seg_len, 0, &rem_ptr); + reg = vader_get_registation (endpoint, (void *)(intptr_t) remote_address, size, 0, &rem_ptr); if (OPAL_UNLIKELY(NULL == reg)) { return OPAL_ERROR; } - vader_memmove (rem_ptr, src->seg_addr.pval, size); + vader_memmove (rem_ptr, local_address, size); vader_return_registration (reg, endpoint); /* always call the callback function */ - frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - - frag->endpoint = endpoint; - mca_btl_vader_frag_complete (frag); + cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); return OPAL_SUCCESS; } #endif #if OPAL_BTL_VADER_HAVE_CMA -int mca_btl_vader_put_cma (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des) +int mca_btl_vader_put_cma (mca_btl_base_module_t *btl, 
mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) des; - mca_btl_base_segment_t *src = des->des_local; - mca_btl_base_segment_t *dst = des->des_remote; - const size_t size = min(dst->seg_len, src->seg_len); - struct iovec src_iov = {.iov_base = src->seg_addr.pval, .iov_len = size}; - struct iovec dst_iov = {.iov_base = dst->seg_addr.pval, .iov_len = size}; + struct iovec src_iov = {.iov_base = local_address, .iov_len = size}; + struct iovec dst_iov = {.iov_base = (void *)(intptr_t) remote_address, .iov_len = size}; ssize_t ret; ret = process_vm_writev (endpoint->segment_data.other.seg_ds->seg_cpid, &src_iov, 1, &dst_iov, 1, 0); @@ -85,36 +76,29 @@ int mca_btl_vader_put_cma (struct mca_btl_base_module_t *btl, } /* always call the callback function */ - frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - - frag->endpoint = endpoint; - mca_btl_vader_frag_complete (frag); + cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); return OPAL_SUCCESS; } #endif #if OPAL_BTL_VADER_HAVE_KNEM -int mca_btl_vader_put_knem (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des) +int mca_btl_vader_put_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) des; - mca_btl_vader_segment_t *src = (mca_btl_vader_segment_t *) des->des_local; - 
mca_btl_vader_segment_t *dst = (mca_btl_vader_segment_t *) des->des_remote; - const size_t size = min(dst->base.seg_len, src->base.seg_len); - intptr_t offset = dst->base.seg_addr.lval - dst->registered_base; struct knem_cmd_param_iovec send_iovec; struct knem_cmd_inline_copy icopy; /* Fill in the ioctl data fields. There's no async completion, so we don't need to worry about getting a slot, etc. */ - send_iovec.base = (uintptr_t) src->base.seg_addr.lval; + send_iovec.base = (uintptr_t) local_address; send_iovec.len = size; icopy.local_iovec_array = (uintptr_t) &send_iovec; icopy.local_iovec_nr = 1; - icopy.remote_cookie = dst->cookie; - icopy.remote_offset = offset; + icopy.remote_cookie = remote_handle->cookie; + icopy.remote_offset = remote_address - remote_handle->base_addr; icopy.write = 1; icopy.flags = 0; @@ -122,7 +106,7 @@ int mca_btl_vader_put_knem (struct mca_btl_base_module_t *btl, * is greater than the cutoff. Not that if DMA is not supported * or the user specified 0 for knem_dma_min the knem_dma_min was * set to UINT_MAX in mca_btl_vader_knem_init. 
*/ - if (mca_btl_vader_component.knem_dma_min <= dst->base.seg_len) { + if (mca_btl_vader_component.knem_dma_min <= size) { icopy.flags = KNEM_FLAG_DMA; } /* synchronous flags only, no need to specify icopy.async_status_index */ @@ -138,10 +122,7 @@ int mca_btl_vader_put_knem (struct mca_btl_base_module_t *btl, } /* always call the callback function */ - frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - - frag->endpoint = endpoint; - mca_btl_vader_frag_complete (frag); + cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); return OPAL_SUCCESS; } diff --git a/opal/mca/btl/vader/btl_vader_send.c b/opal/mca/btl/vader/btl_vader_send.c index 5182f40ec6..59a10c366a 100644 --- a/opal/mca/btl/vader/btl_vader_send.c +++ b/opal/mca/btl/vader/btl_vader_send.c @@ -40,7 +40,7 @@ int mca_btl_vader_send (struct mca_btl_base_module_t *btl, mca_btl_base_tag_t tag) { mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) descriptor; - const size_t total_size = frag->segments[0].base.seg_len; + const size_t total_size = frag->segments[0].seg_len; if (OPAL_LIKELY(frag->fbox)) { mca_btl_vader_fbox_send (frag->fbox, tag); diff --git a/opal/mca/btl/vader/btl_vader_sendi.c b/opal/mca/btl/vader/btl_vader_sendi.c index 877105192a..be9768d53c 100644 --- a/opal/mca/btl/vader/btl_vader_sendi.c +++ b/opal/mca/btl/vader/btl_vader_sendi.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2014 Los Alamos National Security, LLC. All rights + * Copyright (c) 2010-2015 Los Alamos National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ * @@ -49,7 +49,10 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl, /* don't attempt sendi if there are pending fragments on the endpoint */ if (OPAL_UNLIKELY(opal_list_get_size (&endpoint->pending_frags))) { - *descriptor = NULL; + if (descriptor) { + *descriptor = NULL; + } + return OPAL_ERR_OUT_OF_RESOURCE; } @@ -68,7 +71,9 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl, frag = (mca_btl_vader_frag_t *) mca_btl_vader_alloc (btl, endpoint, order, length, flags | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); if (OPAL_UNLIKELY(NULL == frag)) { - *descriptor = NULL; + if (descriptor) { + *descriptor = NULL; + } return OPAL_ERR_OUT_OF_RESOURCE; } @@ -78,7 +83,7 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl, frag->hdr->tag = tag; /* write the match header (with MPI comm/tag/etc. info) */ - memcpy (frag->segments[0].base.seg_addr.pval, header, header_size); + memcpy (frag->segments[0].seg_addr.pval, header, header_size); /* write the message data if there is any */ /* we can't use single-copy semantics here since as caller will consider the send @@ -88,7 +93,7 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl, struct iovec iov; /* pack the data into the supplied buffer */ - iov.iov_base = (IOVBASE_TYPE *)((uintptr_t)frag->segments[0].base.seg_addr.pval + header_size); + iov.iov_base = (IOVBASE_TYPE *)((uintptr_t)frag->segments[0].seg_addr.pval + header_size); iov.iov_len = length = payload_size; (void) opal_convertor_pack (convertor, &iov, &iov_count, &length); @@ -98,7 +103,9 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl, /* write the fragment pointer to peer's the FIFO. 
the progress function will return the fragment */ if (!vader_fifo_write_ep (frag->hdr, endpoint)) { - *descriptor = &frag->base; + if (descriptor) { + *descriptor = &frag->base; + } return OPAL_ERR_OUT_OF_RESOURCE; } diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index 0198743e2b..241a8d4b27 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -230,7 +230,7 @@ int mca_common_cuda_stage_one_init(void) opal_lt_dladvise advise; int retval, i, j; int advise_support = 1; - char *cudalibs[] = {"libcuda.so.1", NULL}; + char *cudalibs[] = {"libcuda.so.1", "libcuda.dylib", NULL}; char *searchpaths[] = {"", "/usr/lib64", NULL}; char **errmsgs = NULL; char *errmsg = NULL; @@ -1027,7 +1027,9 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne /* Store all the information in the registration */ cuda_reg->base.base = (void *)pbase; cuda_reg->base.bound = (unsigned char *)pbase + psize - 1; - memcpy(&cuda_reg->memHandle, &memHandle, sizeof(memHandle)); + memcpy(&cuda_reg->data.memHandle, &memHandle, sizeof(memHandle)); + cuda_reg->data.memh_seg_addr.pval = (void *) pbase; + cuda_reg->data.memh_seg_len = psize; #if OPAL_CUDA_SYNC_MEMOPS /* With CUDA 6.0, we can set an attribute on the memory pointer that will @@ -1051,7 +1053,7 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne * Note that this needs to be the NULL stream to make since it is * unknown what stream any copies into the device memory were done * with. 
*/ - result = cuFunc.cuEventRecord((CUevent)cuda_reg->event, 0); + result = cuFunc.cuEventRecord((CUevent)cuda_reg->data.event, 0); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", true, result, base); @@ -1068,7 +1070,7 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne */ int cuda_ungetmemhandle(void *reg_data, mca_mpool_base_registration_t *reg) { - CUDA_DUMP_EVTHANDLE((100, ((mca_mpool_common_cuda_reg_t *)reg)->evtHandle, "cuda_ungetmemhandle")); + CUDA_DUMP_EVTHANDLE((100, ((mca_mpool_common_cuda_reg_t *)reg)->data.evtHandle, "cuda_ungetmemhandle")); opal_output_verbose(10, mca_common_cuda_output, "CUDA: cuda_ungetmemhandle (no-op): base=%p", reg->base); @@ -1089,7 +1091,7 @@ int cuda_openmemhandle(void *base, size_t size, mca_mpool_base_registration_t *n mca_mpool_common_cuda_reg_t *cuda_newreg = (mca_mpool_common_cuda_reg_t*)newreg; /* Need to copy into memory handle for call into CUDA library. */ - memcpy(&memHandle, cuda_newreg->memHandle, sizeof(memHandle)); + memcpy(&memHandle, cuda_newreg->data.memHandle, sizeof(memHandle)); CUDA_DUMP_MEMHANDLE((100, &memHandle, "Before call to cuIpcOpenMemHandle")); /* Open the memory handle and store it into the registration structure. 
*/ @@ -1137,7 +1139,7 @@ int cuda_closememhandle(void *reg_data, mca_mpool_base_registration_t *reg) opal_output_verbose(10, mca_common_cuda_output, "CUDA: cuIpcCloseMemHandle passed: base=%p", cuda_reg->base.alloc_base); - CUDA_DUMP_MEMHANDLE((100, cuda_reg->memHandle, "cuIpcCloseMemHandle")); + CUDA_DUMP_MEMHANDLE((100, cuda_reg->data.memHandle, "cuIpcCloseMemHandle")); } return OPAL_SUCCESS; @@ -1189,7 +1191,7 @@ void mca_common_wait_stream_synchronize(mca_mpool_common_cuda_reg_t *rget_reg) CUevent event; CUresult result; - memcpy(&evtHandle, rget_reg->evtHandle, sizeof(evtHandle)); + memcpy(&evtHandle, rget_reg->data.evtHandle, sizeof(evtHandle)); CUDA_DUMP_EVTHANDLE((100, &evtHandle, "stream_synchronize")); result = cuFunc.cuIpcOpenEventHandle(&event, evtHandle); @@ -1613,7 +1615,7 @@ int mca_common_cuda_memhandle_matches(mca_mpool_common_cuda_reg_t *new_reg, mca_mpool_common_cuda_reg_t *old_reg) { - if (0 == memcmp(new_reg->memHandle, old_reg->memHandle, sizeof(new_reg->memHandle))) { + if (0 == memcmp(new_reg->data.memHandle, old_reg->data.memHandle, sizeof(new_reg->data.memHandle))) { return 1; } else { return 0; diff --git a/opal/mca/common/cuda/common_cuda.h b/opal/mca/common/cuda/common_cuda.h index 1bf00caefb..61dfa5351b 100644 --- a/opal/mca/common/cuda/common_cuda.h +++ b/opal/mca/common/cuda/common_cuda.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology @@ -10,6 +11,8 @@ * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. * Copyright (c) 2011 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -24,11 +27,19 @@ #define MEMHANDLE_SIZE 8 #define EVTHANDLE_SIZE 8 -struct mca_mpool_common_cuda_reg_t { - mca_mpool_base_registration_t base; + +struct mca_mpool_common_cuda_reg_data_t { uint64_t memHandle[MEMHANDLE_SIZE]; uint64_t evtHandle[EVTHANDLE_SIZE]; uint64_t event; + opal_ptr_t memh_seg_addr; + size_t memh_seg_len; +}; +typedef struct mca_mpool_common_cuda_reg_data_t mca_mpool_common_cuda_reg_data_t; + +struct mca_mpool_common_cuda_reg_t { + mca_mpool_base_registration_t base; + mca_mpool_common_cuda_reg_data_t data; }; typedef struct mca_mpool_common_cuda_reg_t mca_mpool_common_cuda_reg_t; extern bool mca_common_cuda_enabled; diff --git a/opal/mca/mpool/rgpusm/mpool_rgpusm_module.c b/opal/mca/mpool/rgpusm/mpool_rgpusm_module.c index 776ccc8c3b..5c470cf329 100644 --- a/opal/mca/mpool/rgpusm/mpool_rgpusm_module.c +++ b/opal/mca/mpool/rgpusm/mpool_rgpusm_module.c @@ -212,7 +212,7 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr, rgpusm_reg->base.flags = flags; /* Copy the memory handle received into the registration */ - memcpy(rgpusm_reg->memHandle, rget_reg->memHandle, sizeof(rget_reg->memHandle)); + memcpy(rgpusm_reg->data.memHandle, rget_reg->data.memHandle, sizeof(rget_reg->data.memHandle)); /* The rget_reg registration is holding the memory handle needed * to register the remote memory. This was received from the remote @@ -325,7 +325,7 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr, rgpusm_reg->base.flags = flags; /* Need the memory handle saved in the registration */ - memcpy(rgpusm_reg->memHandle, rget_reg->memHandle, sizeof(rget_reg->memHandle)); + memcpy(rgpusm_reg->data.memHandle, rget_reg->data.memHandle, sizeof(rget_reg->data.memHandle)); /* Actually register the memory, which opens the memory handle. 
* Need to do this prior to putting in the cache as the base and diff --git a/oshmem/mca/spml/yoda/spml_yoda.c b/oshmem/mca/spml/yoda/spml_yoda.c index ccf55b57d7..e26b88e77b 100644 --- a/oshmem/mca/spml/yoda/spml_yoda.c +++ b/oshmem/mca/spml/yoda/spml_yoda.c @@ -1,8 +1,11 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2013 Mellanox Technologies, Inc. * All rights reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -76,6 +79,10 @@ static int btl_name_to_id(char *btl_name) return YODA_BTL_OPENIB; } else if (0 == strcmp(btl_name, "self")) { return YODA_BTL_SELF; + } else if (0 == strcmp(btl_name, "vader")) { + return YODA_BTL_VADER; + } else if (0 == strcmp(btl_name, "ugni")) { + return YODA_BTL_UGNI; } return YODA_BTL_UNKNOWN; } @@ -91,21 +98,40 @@ static char *btl_type2str(int btl_type) return "openib"; case YODA_BTL_SM: return "sm"; + case YODA_BTL_VADER: + return "vader"; + case YODA_BTL_UGNI: + return "ugni"; } return "bad_btl_type"; } -static inline void calc_nfrags(mca_bml_base_btl_t* bml_btl, - size_t size, - unsigned int *frag_size, - int *nfrags, - int use_send) +static inline void calc_nfrags_put (mca_bml_base_btl_t* bml_btl, + size_t size, + unsigned int *frag_size, + int *nfrags, + int use_send) { if (use_send) { *frag_size = bml_btl->btl->btl_max_send_size - SPML_YODA_SEND_CONTEXT_SIZE; } else { - *frag_size = bml_btl->btl->btl_max_send_size; + *frag_size = bml_btl->btl->btl_put_limit; + } + *nfrags = 1 + (size - 1) / (*frag_size); +} + +static inline void calc_nfrags_get (mca_bml_base_btl_t* bml_btl, + size_t size, + unsigned int *frag_size, + int *nfrags, + int use_send) +{ + if (use_send) { + *frag_size = bml_btl->btl->btl_max_send_size - SPML_YODA_SEND_CONTEXT_SIZE; + } + else { + *frag_size = 
bml_btl->btl->btl_get_limit; } *nfrags = 1 + (size - 1) / (*frag_size); } @@ -152,7 +178,7 @@ static inline void mca_spml_yoda_bml_alloc( mca_bml_base_btl_t* bml_btl, size, flags); - if (OPAL_UNLIKELY(!(*des) || !(*des)->des_local ) && !is_fence_complete) { + if (OPAL_UNLIKELY(!(*des) || !(*des)->des_segments ) && !is_fence_complete) { mca_spml_yoda_fence_internal(mca_spml_yoda.bml_alloc_threshold); is_fence_complete = true; @@ -205,7 +231,7 @@ static void mca_yoda_put_callback(mca_btl_base_module_t* btl, size_t* size; void** l_addr; - size = (size_t *) des->des_local->seg_addr.pval; + size = (size_t *) des->des_segments->seg_addr.pval; l_addr = (void**) ( ((char*)size) + sizeof(*size)); memcpy(*l_addr, ((char*)l_addr) + sizeof(*l_addr), *size); } @@ -231,7 +257,7 @@ static void mca_yoda_get_callback(mca_btl_base_module_t* btl, putreq = NULL; /* Unpack data */ - p = (void **)des->des_local->seg_addr.pval; + p = (void **)des->des_segments->seg_addr.pval; p_src = (void*) p; size = (size_t*)((char*)p_src + sizeof(*p_src) ); @@ -252,18 +278,18 @@ static void mca_yoda_get_callback(mca_btl_base_module_t* btl, MCA_BTL_DES_SEND_ALWAYS_CALLBACK, 1); - if (OPAL_UNLIKELY(!des_loc || !des_loc->des_local)) { + if (OPAL_UNLIKELY(!des_loc || !des_loc->des_segments)) { SPML_ERROR("shmem OOM error need %d bytes", (int)*size); oshmem_shmem_abort(-1); } - spml_yoda_prepare_for_get_response((void*)des_loc->des_local->seg_addr.pval, *size, (void*)*p_src, (void*) *p_dst,(void*)*p_getreq,1); + spml_yoda_prepare_for_get_response((void*)des_loc->des_segments->seg_addr.pval, *size, (void*)*p_src, (void*) *p_dst,(void*)*p_getreq,1); frag->rdma_req = putreq; /* Initialize callback data for put*/ des_loc->des_cbdata = frag; des_loc->des_cbfunc = mca_spml_yoda_put_completion; - des_loc->des_local_count = 1; + des_loc->des_segment_count = 1; OPAL_THREAD_ADD32(&mca_spml_yoda.n_active_puts, 1); @@ -299,7 +325,7 @@ static void mca_yoda_get_response_callback(mca_btl_base_module_t* btl, 
mca_spml_yoda_get_request_t* getreq; /* unpacking data*/ - size = (size_t *) ( ((char*)des->des_local->seg_addr.pval) ); + size = (size_t *) ( ((char*)des->des_segments->seg_addr.pval) ); l_addr = (void**)( ((char*)size) + sizeof(*size)); getreq = (mca_spml_yoda_get_request_t*)*(void**)((char*)l_addr + sizeof(*l_addr) + *size); @@ -339,8 +365,7 @@ int mca_spml_yoda_deregister(sshmem_mkey_t *mkeys) yoda_context->btl_src_descriptor = NULL; } if (yoda_context->registration) { - ybtl->btl->btl_mpool->mpool_deregister(ybtl->btl->btl_mpool, - yoda_context->registration); + ybtl->btl->btl_deregister_mem (ybtl->btl, yoda_context->registration); } } @@ -355,16 +380,9 @@ sshmem_mkey_t *mca_spml_yoda_register(void* addr, int *count) { int i; - mca_btl_base_descriptor_t* des = NULL; - const opal_datatype_t *datatype = &opal_datatype_wchar; - opal_convertor_t convertor; sshmem_mkey_t *mkeys; struct yoda_btl *ybtl; - oshmem_proc_t *proc_self; mca_spml_yoda_context_t* yoda_context; - struct iovec iov; - uint32_t iov_count = 1; - SPML_VERBOSE(10, "address %p len %llu", addr, (unsigned long long)size); *count = 0; @@ -375,10 +393,6 @@ sshmem_mkey_t *mca_spml_yoda_register(void* addr, return NULL ; } - proc_self = oshmem_proc_group_find(oshmem_group_all, oshmem_my_proc_id()); - /* create convertor */ - OBJ_CONSTRUCT(&convertor, opal_convertor_t); - mca_bml.bml_register( MCA_SPML_YODA_PUT, mca_yoda_put_callback, NULL ); @@ -402,8 +416,8 @@ sshmem_mkey_t *mca_spml_yoda_register(void* addr, continue; } - /* If we have shared memory just save its id*/ - if (YODA_BTL_SM == ybtl->btl_type + /* If we have shared memory just save its id */ + if ((YODA_BTL_SM == ybtl->btl_type || YODA_BTL_VADER == ybtl->btl_type) && MAP_SEGMENT_SHM_INVALID != (int)shmid) { mkeys[i].u.key = shmid; mkeys[i].va_base = 0; @@ -415,54 +429,23 @@ sshmem_mkey_t *mca_spml_yoda_register(void* addr, yoda_context->registration = NULL; if (ybtl->btl->btl_flags & MCA_BTL_FLAGS_RDMA) { - - /* initialize convertor for 
source descriptor*/ - opal_convertor_copy_and_prepare_for_recv(proc_self->super.proc_convertor, - datatype, - size, - addr, - 0, - &convertor); - - if (NULL != ybtl->btl->btl_mpool && NULL != ybtl->btl->btl_mpool->mpool_register) { - iov.iov_len = size; - iov.iov_base = NULL; - - opal_convertor_pack(&convertor, &iov, &iov_count, &size); - ybtl->btl->btl_mpool->mpool_register(ybtl->btl->btl_mpool, - iov.iov_base, size, 0, &yoda_context->registration); - } - /* initialize convertor for source descriptor*/ - opal_convertor_copy_and_prepare_for_recv(proc_self->super.proc_convertor, - datatype, - size, - addr, - 0, - &convertor); - - /* register source memory */ - des = ybtl->btl->btl_prepare_src(ybtl->btl, - ybtl->bml_btl->btl_endpoint, - yoda_context->registration, - &convertor, - MCA_BTL_NO_ORDER, - 0, - &size, - 0); - if (NULL == des) { - SPML_ERROR("%s: failed to register source memory. ", - btl_type2str(ybtl->btl_type)); - /* FIXME some cleanup might be needed here - * yoda_context->btl_src_descriptor = NULL; - * OBJ_DESTRUCT(&convertor); - * *count = ???; - */ - return NULL; + if (NULL != ybtl->btl->btl_register_mem) { + yoda_context->registration = ybtl->btl->btl_register_mem (ybtl->btl, MCA_BTL_ENDPOINT_ANY, + addr, size, MCA_BTL_REG_FLAG_ACCESS_ANY); + if (NULL == yoda_context->registration) { + SPML_ERROR("%s: failed to register source memory: addr: %p, size: %u", + btl_type2str(ybtl->btl_type), addr, size); + /* FIXME some cleanup might be needed here + * yoda_context->btl_src_descriptor = NULL; + * *count = ???; + */ + return NULL; + } } - yoda_context->btl_src_descriptor = des; - mkeys[i].u.data = des->des_local; - mkeys[i].len = ybtl->btl->btl_seg_size; + yoda_context->btl_src_descriptor = NULL; + mkeys[i].u.data = yoda_context->registration; + mkeys[i].len = yoda_context->registration ? 
ybtl->btl->btl_registration_handle_size : 0; } SPML_VERBOSE(5, @@ -470,7 +453,6 @@ sshmem_mkey_t *mca_spml_yoda_register(void* addr, OSHMEM_PROC_VPID(oshmem_proc_local_proc), btl_type2str(ybtl->btl_type), mkeys[i].va_base, mkeys[i].len, (unsigned long long)mkeys[i].u.key, (unsigned long long)size); } - OBJ_DESTRUCT(&convertor); *count = mca_spml_yoda.n_btls; return mkeys; } @@ -746,7 +728,6 @@ static inline mca_bml_base_btl_t *get_next_btl(int dst, int *btl_id) return bml_btl; } - static inline int mca_spml_yoda_put_internal(void *dst_addr, size_t size, void *src_addr, @@ -769,6 +750,7 @@ static inline int mca_spml_yoda_put_internal(void *dst_addr, int btl_id = 0; struct yoda_btl *ybtl; int put_via_send; + mca_btl_base_registration_handle_t *local_handle = NULL, *remote_handle = NULL; /* If nothing to put its OK.*/ if (0 >= size) { @@ -802,17 +784,22 @@ static inline int mca_spml_yoda_put_internal(void *dst_addr, ybtl = &mca_spml_yoda.btl_type_map[btl_id]; + if (ybtl->btl->btl_register_mem) { + assert (r_mkey->len == ybtl->btl->btl_registration_handle_size); + remote_handle = (mca_btl_base_registration_handle_t *) r_mkey->u.data; + } + /* check if we doing put into shm attached segment and if so * just do memcpy */ - if ((YODA_BTL_SM == ybtl->btl_type) + if ((YODA_BTL_SM == ybtl->btl_type || YODA_BTL_VADER == ybtl->btl_type) && mca_memheap_base_can_local_copy(r_mkey, dst_addr)) { memcpy((void *) (unsigned long) rva, src_addr, size); return OSHMEM_SUCCESS; } /* We support only blocking PUT now => we always need copy for src buffer*/ - calc_nfrags(bml_btl, size, &frag_size, &nfrags, put_via_send); + calc_nfrags_put (bml_btl, size, &frag_size, &nfrags, put_via_send); p_src = (char*) src_addr; p_dst = (char*) (unsigned long) rva; @@ -832,7 +819,7 @@ static inline int mca_spml_yoda_put_internal(void *dst_addr, MCA_BTL_DES_SEND_ALWAYS_CALLBACK, put_via_send); - if (OPAL_UNLIKELY(!des || !des->des_local )) { + if (OPAL_UNLIKELY(!des || !des->des_segments )) { 
SPML_ERROR("src=%p nfrags = %d frag_size=%d", src_addr, nfrags, frag_size); SPML_ERROR("shmem OOM error need %d bytes", ncopied); @@ -844,35 +831,36 @@ static inline int mca_spml_yoda_put_internal(void *dst_addr, } /* copy data to allocated buffer*/ - segment = des->des_local; + segment = des->des_segments; spml_yoda_prepare_for_put((void*)segment->seg_addr.pval, ncopied, (void*)p_src, (void*)p_dst, put_via_send); - /* Preparing destination buffer */ - - assert( NULL != r_mkey->u.data && 0 != r_mkey->len); - - memcpy(&frag->rdma_segs[0].base_seg, - r_mkey->u.data, - r_mkey->len); + if (!put_via_send && ybtl->btl->btl_register_mem) { + local_handle = ybtl->btl->btl_register_mem (ybtl->btl, bml_btl->btl_endpoint, + segment->seg_addr.pval, ncopied, 0); + if (NULL == local_handle) { + /* No free resources, Block on completion here */ + SPML_ERROR("shmem error: OSHMEM_ERR_OUT_OF_RESOURCE"); + oshmem_request_wait_completion(&putreq->req_put.req_base.req_oshmem); + } + } frag->rdma_segs[0].base_seg.seg_addr.lval = (uintptr_t) p_dst; frag->rdma_segs[0].base_seg.seg_len = (put_via_send ? 
ncopied + SPML_YODA_SEND_CONTEXT_SIZE : ncopied); - des->des_remote = &frag->rdma_segs[0].base_seg; - frag->rdma_req = putreq; /* initialize callback data for put*/ des->des_cbdata = frag; des->des_cbfunc = mca_spml_yoda_put_completion; - des->des_remote_count = 1; OPAL_THREAD_ADD32(&mca_spml_yoda.n_active_puts, 1); /* put the data to remote side */ if (!put_via_send) { - rc = mca_bml_base_put(bml_btl, des); + rc = mca_bml_base_put (bml_btl, segment->seg_addr.pval, (uint64_t) (intptr_t) p_dst, + local_handle, remote_handle, ncopied, 0, 0, mca_spml_yoda_put_completion_rdma, + des); } else { rc = mca_bml_base_send(bml_btl, des, MCA_SPML_YODA_PUT); if (1 == rc) @@ -1025,11 +1013,7 @@ int mca_spml_yoda_get(void* src_addr, size_t size, void* dst_addr, int src) struct yoda_btl *ybtl; int btl_id = 0; int get_via_send; - const opal_datatype_t *datatype = &opal_datatype_wchar; - opal_convertor_t convertor; - oshmem_proc_t *proc_self; - size_t prepare_size; - mca_mpool_base_registration_t* registration; + mca_btl_base_registration_handle_t *local_handle, *remote_handle = NULL; mca_spml_yoda_get_request_t* getreq = NULL; /*If nothing to get its OK.*/ @@ -1057,6 +1041,7 @@ int mca_spml_yoda_get(void* src_addr, size_t size, void* dst_addr, int src) src, src_addr); oshmem_shmem_abort(-1); } + #if SPML_YODA_DEBUG == 1 SPML_VERBOSE(100, "get: pe:%d src=%p -> dst: %p sz=%d. 
src_rva=%p, %s", src, src_addr, dst_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey)); @@ -1064,12 +1049,17 @@ int mca_spml_yoda_get(void* src_addr, size_t size, void* dst_addr, int src) ybtl = &mca_spml_yoda.btl_type_map[btl_id]; + if (ybtl->btl->btl_register_mem) { + assert(ybtl->btl->btl_registration_handle_size == r_mkey->len); + remote_handle = (mca_btl_base_registration_handle_t *) r_mkey->u.data; + } + nfrags = 1; /* check if we doing get into shm attached segment and if so * just do memcpy */ - if ((YODA_BTL_SM == ybtl->btl_type) + if ((YODA_BTL_SM == ybtl->btl_type || YODA_BTL_VADER == ybtl->btl_type) && mca_memheap_base_can_local_copy(r_mkey, src_addr)) { memcpy(dst_addr, (void *) rva, size); /* must call progress here to avoid deadlock. Scenarion: @@ -1089,7 +1079,7 @@ int mca_spml_yoda_get(void* src_addr, size_t size, void* dst_addr, int src) */ frag_size = ncopied; if ((NULL == l_mkey) || get_via_send) { - calc_nfrags(bml_btl, size, &frag_size, &nfrags, get_via_send); + calc_nfrags_get (bml_btl, size, &frag_size, &nfrags, get_via_send); } p_src = (char*) (unsigned long) rva; @@ -1110,7 +1100,6 @@ int mca_spml_yoda_get(void* src_addr, size_t size, void* dst_addr, int src) ncopied = i < nfrags - 1 ? 
frag_size :(unsigned) ((char *) dst_addr + size - p_dst); frag->allocated = 0; /* Prepare destination descriptor*/ - assert(0 != r_mkey->len); memcpy(&frag->rdma_segs[0].base_seg, r_mkey->u.data, r_mkey->len); @@ -1130,16 +1119,17 @@ int mca_spml_yoda_get(void* src_addr, size_t size, void* dst_addr, int src) (int)frag_size, MCA_BTL_DES_SEND_ALWAYS_CALLBACK, get_via_send); - if (OPAL_UNLIKELY(!des || !des->des_local)) { + if (OPAL_UNLIKELY(!des || !des->des_segments)) { SPML_ERROR("shmem OOM error need %d bytes", ncopied); SPML_ERROR("src=%p nfrags = %d frag_size=%d", src_addr, nfrags, frag_size); oshmem_shmem_abort(-1); } - segment = des->des_local; + segment = des->des_segments; spml_yoda_prepare_for_get((void*)segment->seg_addr.pval, ncopied, (void*)p_src, oshmem_my_proc_id(), (void*)p_dst, (void*) getreq); des->des_cbfunc = mca_spml_yoda_get_response_completion; + des->des_cbdata = frag; OPAL_THREAD_ADD32(&mca_spml_yoda.n_active_gets, 1); } @@ -1147,36 +1137,24 @@ int mca_spml_yoda_get(void* src_addr, size_t size, void* dst_addr, int src) /* * Register src memory if do GET via GET */ - proc_self = oshmem_proc_group_find(oshmem_group_all, oshmem_my_proc_id()); - OBJ_CONSTRUCT(&convertor, opal_convertor_t); + if (NULL == l_mkey && ybtl->btl->btl_register_mem) { + local_handle = ybtl->btl->btl_register_mem (ybtl->btl, bml_btl->btl_endpoint, p_dst, ncopied, + MCA_BTL_REG_FLAG_LOCAL_WRITE); - prepare_size = ncopied; - opal_convertor_copy_and_prepare_for_recv(proc_self->super.proc_convertor, - datatype, - prepare_size, - p_dst, - 0, - &convertor); + if (NULL == local_handle) { + SPML_ERROR("%s: failed to register destination memory %p.", + btl_type2str(ybtl->btl_type), p_dst); + } - registration = (NULL == l_mkey ? 
NULL : ((mca_spml_yoda_context_t*)l_mkey->spml_context)->registration); - des = ybtl->btl->btl_prepare_dst(ybtl->btl, - bml_btl->btl_endpoint, - registration, - &convertor, - MCA_BTL_NO_ORDER, - 0, - &prepare_size, - 0); - if (NULL == des) { - SPML_ERROR("%s: failed to register destination memory %p.", - btl_type2str(ybtl->btl_type), p_dst); + frag->local_handle = local_handle; + } else { + local_handle = ((mca_spml_yoda_context_t*)l_mkey->spml_context)->registration; + frag->local_handle = NULL; } - OBJ_DESTRUCT(&convertor); + frag->rdma_segs[0].base_seg.seg_addr.lval = (uintptr_t) p_src; getreq->p_dst = (uint64_t*) p_dst; frag->size = ncopied; - des->des_cbfunc = mca_spml_yoda_get_completion; - des->des_remote = &frag->rdma_segs[0].base_seg; OPAL_THREAD_ADD32(&mca_spml_yoda.n_active_gets, 1); } @@ -1189,12 +1167,6 @@ int mca_spml_yoda_get(void* src_addr, size_t size, void* dst_addr, int src) frag->rdma_req = getreq; - /** - * Init remote side descriptor. - */ - des->des_remote_count = 1; - des->des_cbdata = frag; - /** * Do GET operation */ @@ -1203,7 +1175,8 @@ int mca_spml_yoda_get(void* src_addr, size_t size, void* dst_addr, int src) if (1 == rc) rc = OSHMEM_SUCCESS; } else { - rc = mca_bml_base_get(bml_btl, des); + rc = mca_bml_base_get(bml_btl, p_dst, (uint64_t) (intptr_t) p_src, local_handle, + remote_handle, ncopied, 0, 0, mca_spml_yoda_get_completion, frag); } if (OPAL_UNLIKELY(OSHMEM_SUCCESS != rc)) { diff --git a/oshmem/mca/spml/yoda/spml_yoda.h b/oshmem/mca/spml/yoda/spml_yoda.h index 7a21f01a63..deb45654b9 100644 --- a/oshmem/mca/spml/yoda/spml_yoda.h +++ b/oshmem/mca/spml/yoda/spml_yoda.h @@ -1,6 +1,9 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2013 Mellanox Technologies, Inc. * All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -50,6 +53,8 @@ enum { YODA_BTL_SELF = 0, YODA_BTL_SM, YODA_BTL_OPENIB, + YODA_BTL_VADER, + YODA_BTL_UGNI, YODA_BTL_MAX }; @@ -86,7 +91,7 @@ typedef struct mca_spml_yoda_t mca_spml_yoda_module_t; struct mca_spml_yoda_context_t { mca_btl_base_descriptor_t* btl_src_descriptor; - mca_mpool_base_registration_t* registration; + mca_btl_base_registration_handle_t *registration; }; typedef struct mca_spml_yoda_context_t mca_spml_yoda_context_t; diff --git a/oshmem/mca/spml/yoda/spml_yoda_getreq.c b/oshmem/mca/spml/yoda/spml_yoda_getreq.c index 85e495a1c6..1b63038b9a 100644 --- a/oshmem/mca/spml/yoda/spml_yoda_getreq.c +++ b/oshmem/mca/spml/yoda/spml_yoda_getreq.c @@ -1,8 +1,11 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2013 Mellanox Technologies, Inc. * All rights reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -72,29 +75,24 @@ OBJ_CLASS_INSTANCE( mca_spml_yoda_get_request_t, mca_spml_yoda_get_request_construct, mca_spml_yoda_get_request_destruct); -void mca_spml_yoda_get_completion(mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* des, - int status) +void mca_spml_yoda_get_completion (struct mca_btl_base_module_t* module, + struct mca_btl_base_endpoint_t* endpoint, + void *local_address, + struct mca_btl_base_registration_handle_t *local_handle, + void *context, void *cbdata, int status) { mca_spml_yoda_rdma_frag_t* frag = - (mca_spml_yoda_rdma_frag_t*) des->des_cbdata; + (mca_spml_yoda_rdma_frag_t*) cbdata; mca_spml_yoda_get_request_t* getreq = (mca_spml_yoda_get_request_t*) frag->rdma_req; - mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context; + mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) context; /* check completion status */ - if (OPAL_UNLIKELY(OSHMEM_SUCCESS != status)) { + if (OPAL_UNLIKELY(OPAL_SUCCESS != status)) { /* shmem has no way to propagate errors. 
cry&die */ SPML_ERROR("FATAL get completion error"); abort(); } - /* decide if we need to copy buffer */ - if (getreq->p_dst) { - memcpy(getreq->p_dst, - des->des_local->seg_addr.pval, - frag->size); - } if (getreq->parent) { OPAL_THREAD_ADD32(&getreq->parent->active_count, -1); @@ -103,7 +101,9 @@ void mca_spml_yoda_get_completion(mca_btl_base_module_t* btl, oshmem_request_complete(&getreq->req_get.req_base.req_oshmem, 1); oshmem_request_free((oshmem_request_t**) &getreq); - mca_bml_base_free(bml_btl, des); + if (bml_btl->btl->btl_register_mem && frag->local_handle) { + bml_btl->btl->btl_deregister_mem (bml_btl->btl, frag->local_handle); + } OPAL_THREAD_ADD32(&mca_spml_yoda.n_active_gets, -1); } diff --git a/oshmem/mca/spml/yoda/spml_yoda_getreq.h b/oshmem/mca/spml/yoda/spml_yoda_getreq.h index b83aa75669..f32f843063 100644 --- a/oshmem/mca/spml/yoda/spml_yoda_getreq.h +++ b/oshmem/mca/spml/yoda/spml_yoda_getreq.h @@ -1,6 +1,9 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2013 Mellanox Technologies, Inc. * All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -51,10 +54,11 @@ static inline mca_spml_yoda_get_request_t *mca_spml_yoda_getreq_alloc(int dst) return getreq; } -void mca_spml_yoda_get_completion(mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* des, - int status); +void mca_spml_yoda_get_completion (struct mca_btl_base_module_t* module, + struct mca_btl_base_endpoint_t* endpoint, + void *local_address, + struct mca_btl_base_registration_handle_t *local_handle, + void *context, void *cbdata, int status); void mca_spml_yoda_get_response_completion(mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep, diff --git a/oshmem/mca/spml/yoda/spml_yoda_putreq.c b/oshmem/mca/spml/yoda/spml_yoda_putreq.c index e70f34c63d..dea643652e 100644 --- a/oshmem/mca/spml/yoda/spml_yoda_putreq.c +++ b/oshmem/mca/spml/yoda/spml_yoda_putreq.c @@ -91,3 +91,20 @@ void mca_spml_yoda_put_completion(mca_btl_base_module_t* btl, oshmem_request_free((oshmem_request_t**) &putreq); mca_bml_base_free(bml_btl, des); } + +void mca_spml_yoda_put_completion_rdma (struct mca_btl_base_module_t* module, + struct mca_btl_base_endpoint_t* endpoint, + void *local_address, + struct mca_btl_base_registration_handle_t *local_handle, + void *context, void *cbdata, int status) +{ + mca_btl_base_descriptor_t *des = (mca_btl_base_descriptor_t *) cbdata; + mca_bml_base_btl_t *bml_btl = (mca_bml_base_btl_t *) context; + des->des_context = context; + + if (bml_btl->btl->btl_register_mem) { + bml_btl->btl->btl_deregister_mem (bml_btl->btl, local_handle); + } + + des->des_cbfunc (module, endpoint, des, status); +} diff --git a/oshmem/mca/spml/yoda/spml_yoda_putreq.h b/oshmem/mca/spml/yoda/spml_yoda_putreq.h index ea1aabfbbc..5251aa4415 100644 --- a/oshmem/mca/spml/yoda/spml_yoda_putreq.h +++ b/oshmem/mca/spml/yoda/spml_yoda_putreq.h @@ -49,6 +49,12 @@ void mca_spml_yoda_put_completion(mca_btl_base_module_t* btl, struct mca_btl_base_descriptor_t* des, int 
status); +void mca_spml_yoda_put_completion_rdma (struct mca_btl_base_module_t* module, + struct mca_btl_base_endpoint_t* endpoint, + void *local_address, + struct mca_btl_base_registration_handle_t *local_handle, + void *context, void *cbdata, int status); + END_C_DECLS #endif /* OSHMEM_SPML_YODA_PUT_REQUEST_H */ diff --git a/oshmem/mca/spml/yoda/spml_yoda_rdmafrag.h b/oshmem/mca/spml/yoda/spml_yoda_rdmafrag.h index 00ca338d17..398d378125 100644 --- a/oshmem/mca/spml/yoda/spml_yoda_rdmafrag.h +++ b/oshmem/mca/spml/yoda/spml_yoda_rdmafrag.h @@ -32,7 +32,7 @@ typedef union mca_spml_yoda_segment_t { struct mca_spml_yoda_rdma_frag_t { mca_spml_yoda_segment_t rdma_segs[2]; - mca_btl_base_segment_t *btl_seg; /* save pointer to btl allocated descriptor segment */ + mca_btl_base_registration_handle_t *local_handle; void *rdma_req; int allocated; int use_send;