diff --git a/opal/mca/btl/openib/Makefile.am b/opal/mca/btl/openib/Makefile.am index dfd65ac4c1..ccecd7c032 100644 --- a/opal/mca/btl/openib/Makefile.am +++ b/opal/mca/btl/openib/Makefile.am @@ -59,6 +59,8 @@ sources = \ btl_openib_fd.c \ btl_openib_ip.h \ btl_openib_ip.c \ + btl_openib_put.c \ + btl_openib_get.c \ connect/base.h \ connect/btl_openib_connect_base.c \ connect/btl_openib_connect_empty.c \ diff --git a/opal/mca/btl/openib/btl_openib.c b/opal/mca/btl/openib/btl_openib.c index a26f960bda..88eb31f466 100644 --- a/opal/mca/btl/openib/btl_openib.c +++ b/opal/mca/btl/openib/btl_openib.c @@ -1369,18 +1369,6 @@ int mca_btl_openib_free( struct mca_btl_base_module_t* btl, mca_btl_base_descriptor_t* des) { - /* is this fragment pointing at user memory? */ - if(MCA_BTL_OPENIB_FRAG_SEND_USER == openib_frag_type(des) || - MCA_BTL_OPENIB_FRAG_RECV_USER == openib_frag_type(des)) { - mca_btl_openib_com_frag_t* frag = to_com_frag(des); - - if(frag->registration != NULL) { - btl->btl_mpool->mpool_deregister(btl->btl_mpool, - (mca_mpool_base_registration_t*)frag->registration); - frag->registration = NULL; - } - } - /* reset those field on free so we will not have to do it on alloc */ to_base_frag(des)->base.des_flags = 0; switch(openib_frag_type(des)) { @@ -1799,7 +1787,10 @@ no_credits_or_wqe: cant_send: OPAL_THREAD_UNLOCK(&ep->endpoint_lock); /* We can not send the data directly, so we just return descriptor */ - *descriptor = mca_btl_openib_alloc(btl, ep, order, size, flags); + if (NULL != descriptor) { + *descriptor = mca_btl_openib_alloc(btl, ep, order, size, flags); + } + return OPAL_ERR_RESOURCE_BUSY; } /* @@ -1867,230 +1858,6 @@ static int mca_btl_openib_deregister_mem (mca_btl_base_module_t *btl, mca_btl_ba return OPAL_SUCCESS; } -/* - * RDMA WRITE local buffer to remote buffer address. 
- */ - -int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address, - uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) -{ - mca_btl_openib_put_frag_t *frag = NULL; - int rc, qp = order; - - if (OPAL_UNLIKELY(size > btl->btl_put_limit)) { - return OPAL_ERR_BAD_PARAM; - } - - frag = to_put_frag(alloc_send_user_frag ()); - if (OPAL_UNLIKELY(NULL == frag)) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - if (MCA_BTL_NO_ORDER == qp) { - qp = mca_btl_openib_component.rdma_qp; - } - - /* set base descriptor flags */ - to_base_frag(frag)->base.order = order; - /* free this descriptor when the operation is complete */ - to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; - - /* set up scatter-gather entry */ - to_com_frag(frag)->sg_entry.length = size; - to_com_frag(frag)->sg_entry.lkey = local_handle->lkey; - to_com_frag(frag)->sg_entry.addr = (uint64_t)(intptr_t) local_address; - to_com_frag(frag)->endpoint = ep; - - /* set up rdma callback */ - frag->cb.func = cbfunc; - frag->cb.context = cbcontext; - frag->cb.data = cbdata; - frag->cb.local_handle = local_handle; - - /* post descriptor */ - to_out_frag(frag)->sr_desc.opcode = IBV_WR_RDMA_WRITE; - to_out_frag(frag)->sr_desc.send_flags = ib_send_flags(size, &(ep->qps[qp]), 1); - to_out_frag(frag)->sr_desc.wr.rdma.remote_addr = remote_address; - -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if ((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN) - != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) { - to_out_frag(frag)->sr_desc.wr.rdma.rkey = opal_swap_bytes4(remote_handle->rkey); - } else -#endif - { - to_out_frag(frag)->sr_desc.wr.rdma.rkey = remote_handle->rkey; - } - -#if HAVE_XRC - if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) - 
to_out_frag(frag)->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num; -#endif - - if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) { - OPAL_THREAD_LOCK(&ep->endpoint_lock); - rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_put_frags); - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if (OPAL_ERR_RESOURCE_BUSY == rc) { - /* descriptor was queued pending connection */ - return OPAL_SUCCESS; - } - - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - MCA_BTL_IB_FRAG_RETURN (frag); - return rc; - } - } - - rc = mca_btl_openib_put_internal (btl, ep, qp, frag); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - MCA_BTL_IB_FRAG_RETURN (frag); - } - - return rc; -} - -int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, - int qp, mca_btl_openib_put_frag_t *frag) -{ - struct ibv_send_wr *bad_wr; - - /* check for a send wqe */ - if (qp_get_wqe(ep, qp) < 0) { - qp_put_wqe(ep, qp); - OPAL_THREAD_LOCK(&ep->endpoint_lock); - opal_list_append(&ep->pending_put_frags, (opal_list_item_t*)frag); - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - return OPAL_SUCCESS; - } - - qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag)); - qp_reset_signal_count(ep, qp); - - if (ibv_post_send(ep->qps[qp].qp->lcl_qp, &to_out_frag(frag)->sr_desc, &bad_wr)) - return OPAL_ERROR; - - return OPAL_SUCCESS; -} - -/* - * RDMA READ remote buffer to local buffer address. 
- */ - -int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address, - uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) -{ - mca_btl_openib_get_frag_t* frag = NULL; - int qp = order; - int rc; - - if (OPAL_UNLIKELY(size > btl->btl_get_limit)) { - return OPAL_ERR_BAD_PARAM; - } - - frag = to_get_frag(alloc_recv_user_frag()); - if (OPAL_UNLIKELY(NULL == frag)) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - if (MCA_BTL_NO_ORDER == qp) { - qp = mca_btl_openib_component.rdma_qp; - } - - /* set base descriptor flags */ - to_base_frag(frag)->base.order = order; - /* free this descriptor when the operation is complete */ - to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; - - /* set up scatter-gather entry */ - to_com_frag(frag)->sg_entry.length = size; - to_com_frag(frag)->sg_entry.lkey = local_handle->lkey; - to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t) local_address; - to_com_frag(frag)->endpoint = ep; - - /* set up rdma callback */ - frag->cb.func = cbfunc; - frag->cb.context = cbcontext; - frag->cb.data = cbdata; - frag->cb.local_handle = local_handle; - - /* set up descriptor */ - frag->sr_desc.wr.rdma.remote_addr = remote_address; - -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN) - != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) { - frag->sr_desc.wr.rdma.rkey = opal_swap_bytes4 (remote_handle->rkey); - } else -#endif - { - frag->sr_desc.wr.rdma.rkey = remote_handle->rkey; - } - - if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) { - OPAL_THREAD_LOCK(&ep->endpoint_lock); - rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_get_frags); - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if (OPAL_ERR_RESOURCE_BUSY == rc) { 
- return OPAL_SUCCESS; - } - - if (OPAL_SUCCESS != rc) { - MCA_BTL_IB_FRAG_RETURN (frag); - return rc; - } - } - - rc = mca_btl_openib_get_internal (btl, ep, qp, frag); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - MCA_BTL_IB_FRAG_RETURN (frag); - } - - return rc; -} - -int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, - int qp, mca_btl_openib_get_frag_t *frag) -{ - struct ibv_send_wr* bad_wr; - - /* check for a send wqe */ - if (qp_get_wqe(ep, qp) < 0) { - qp_put_wqe(ep, qp); - OPAL_THREAD_LOCK(&ep->endpoint_lock); - opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag); - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - return OPAL_SUCCESS; - } - - /* check for a get token */ - if (OPAL_THREAD_ADD32(&ep->get_tokens,-1) < 0) { - qp_put_wqe(ep, qp); - OPAL_THREAD_ADD32(&ep->get_tokens,1); - OPAL_THREAD_LOCK(&ep->endpoint_lock); - opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag); - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - return OPAL_SUCCESS; - } - -#if HAVE_XRC - if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) - frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num; -#endif - - qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag)); - qp_reset_signal_count(ep, qp); - - if (ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr)) - return OPAL_ERROR; - - return OPAL_SUCCESS; -} - #if OPAL_ENABLE_FT_CR == 0 int mca_btl_openib_ft_event(int state) { return OPAL_SUCCESS; diff --git a/opal/mca/btl/openib/btl_openib.h b/opal/mca/btl/openib/btl_openib.h index 1972301967..77da695f99 100644 --- a/opal/mca/btl/openib/btl_openib.h +++ b/opal/mca/btl/openib/btl_openib.h @@ -627,7 +627,6 @@ struct mca_btl_openib_get_frag_t; * * @param btl (IN) BTL instance * @param ep (IN) BTL endpoint - * @param qp (IN) ID of queue pair to schedule the get on * @param frag (IN) Fragment prepared by mca_btl_openib_put * * If the fragment can not be scheduled due to resource limitations then @@ -635,7 
+634,7 @@ struct mca_btl_openib_get_frag_t; * when another get/put fragment has completed. */ int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, - int qp, struct mca_btl_openib_put_frag_t *frag); + struct mca_btl_openib_put_frag_t *frag); /** * @brief Schedule an RDMA write with the HCA @@ -676,7 +675,7 @@ int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint * when another get/put fragment has completed. */ int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, - int qp, struct mca_btl_openib_get_frag_t *frag); + struct mca_btl_openib_get_frag_t *frag); /** * @brief Schedule an RDMA read with the HCA diff --git a/opal/mca/btl/openib/btl_openib_component.c b/opal/mca/btl/openib/btl_openib_component.c index 73b8227c49..85c1c42d3e 100644 --- a/opal/mca/btl/openib/btl_openib_component.c +++ b/opal/mca/btl/openib/btl_openib_component.c @@ -2903,9 +2903,13 @@ void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep, if (NULL == frag) break; rc = mca_btl_openib_get_internal ((mca_btl_base_module_t *)openib_btl, ep, - qp, to_get_frag(frag)); - if (OPAL_ERR_OUT_OF_RESOURCE == rc) + to_get_frag(frag)); + if (OPAL_ERR_OUT_OF_RESOURCE == rc) { + OPAL_THREAD_LOCK(&ep->endpoint_lock); + opal_list_prepend (&ep->pending_get_frags, frag); + OPAL_THREAD_UNLOCK(&ep->endpoint_lock); break; + } } len = opal_list_get_size(&ep->pending_put_frags); @@ -2916,9 +2920,13 @@ void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep, if (NULL == frag) break; rc = mca_btl_openib_put_internal ((mca_btl_base_module_t*)openib_btl, ep, - qp, to_put_frag(frag)); - if (OPAL_ERR_OUT_OF_RESOURCE == rc) + to_put_frag(frag)); + if (OPAL_ERR_OUT_OF_RESOURCE == rc) { + OPAL_THREAD_LOCK(&ep->endpoint_lock); + opal_list_prepend (&ep->pending_put_frags, frag); + OPAL_THREAD_UNLOCK(&ep->endpoint_lock); break; + } } } @@ -3291,12 +3299,13 @@ static void 
handle_wc(mca_btl_openib_device_t* device, const uint32_t cq, get_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr, get_frag->cb.local_handle, get_frag->cb.context, get_frag->cb.data, OPAL_SUCCESS); - } else { + } else if (MCA_BTL_OPENIB_FRAG_SEND_USER == openib_frag_type(des)) { mca_btl_openib_put_frag_t *put_frag = to_put_frag(des); put_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr, put_frag->cb.local_handle, put_frag->cb.context, put_frag->cb.data, OPAL_SUCCESS); + put_frag->cb.func = NULL; } /* fall through */ case IBV_WC_SEND: diff --git a/opal/mca/btl/openib/btl_openib_endpoint.h b/opal/mca/btl/openib/btl_openib_endpoint.h index 2b2fbf4be3..9dd84dadc6 100644 --- a/opal/mca/btl/openib/btl_openib_endpoint.h +++ b/opal/mca/btl/openib/btl_openib_endpoint.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -10,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved. 
diff --git a/opal/mca/btl/openib/btl_openib_frag.c b/opal/mca/btl/openib/btl_openib_frag.c
index 6d0a5cd9ba..6768fcd76c 100644
--- a/opal/mca/btl/openib/btl_openib_frag.c
+++ b/opal/mca/btl/openib/btl_openib_frag.c
@@ -134,6 +134,7 @@ static void put_constructor(mca_btl_openib_put_frag_t *frag)
 {
     to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_SEND_USER;
     to_out_frag(frag)->sr_desc.opcode = IBV_WR_RDMA_WRITE;
+    frag->cb.func = NULL;
 }
 
 static void get_constructor(mca_btl_openib_get_frag_t *frag)
diff --git a/opal/mca/btl/openib/btl_openib_get.c b/opal/mca/btl/openib/btl_openib_get.c
new file mode 100644
index 0000000000..e187fa9db9
--- /dev/null
+++ b/opal/mca/btl/openib/btl_openib_get.c
@@ -0,0 +1,175 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2013 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
+ *                         reserved.
+ * Copyright (c) 2006-2007 Voltaire All rights reserved.
+ * Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2009 IBM Corporation. All rights reserved.
+ * Copyright (c) 2013-2014 Intel, Inc. All rights reserved
+ * Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
+ * Copyright (c) 2014 Research Organization for Information Science
+ *                         and Technology (RIST). All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "btl_openib.h"
+#include "btl_openib_frag.h"
+#include "btl_openib_endpoint.h"
+
+/*
+ * RDMA READ remote buffer to local buffer address.
+ *
+ * Returns OPAL_ERR_BAD_PARAM when size exceeds btl->btl_get_limit and
+ * OPAL_ERR_OUT_OF_RESOURCE when no fragment can be allocated. If the
+ * endpoint is not connected the fragment is handed to check_endpoint_state()
+ * along with ep->pending_get_frags, and OPAL_ERR_RESOURCE_BUSY from that call
+ * is reported as OPAL_SUCCESS. If mca_btl_openib_get_internal() reports
+ * OPAL_ERR_OUT_OF_RESOURCE the fragment is appended to ep->pending_get_frags
+ * and OPAL_SUCCESS is returned. Any other failure releases the fragment with
+ * MCA_BTL_IB_FRAG_RETURN and passes the error code back to the caller.
+ */
+
+int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address,
+                        uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
+                        mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
+                        int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
+{
+    mca_btl_openib_get_frag_t *frag = NULL;
+    int qp = order;
+    int rc;
+
+    if (OPAL_UNLIKELY(size > btl->btl_get_limit)) {
+        return OPAL_ERR_BAD_PARAM;
+    }
+
+    frag = to_get_frag(alloc_recv_user_frag());
+    if (OPAL_UNLIKELY(NULL == frag)) {
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+    if (MCA_BTL_NO_ORDER == qp) {
+        qp = mca_btl_openib_component.rdma_qp;
+    }
+
+    /* record the selected qp; mca_btl_openib_get_internal() reads it back from base.order */
+    to_base_frag(frag)->base.order = qp;
+    /* free this descriptor when the operation is complete */
+    to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
+
+    /* set up scatter-gather entry */
+    to_com_frag(frag)->sg_entry.length = size;
+    to_com_frag(frag)->sg_entry.lkey = local_handle->lkey;
+    to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t) local_address;
+    to_com_frag(frag)->endpoint = ep;
+
+    /* set up rdma callback */
+    frag->cb.func = cbfunc;
+    frag->cb.context = cbcontext;
+    frag->cb.data = cbdata;
+    frag->cb.local_handle = local_handle;
+
+    /* set up descriptor */
+    frag->sr_desc.wr.rdma.remote_addr = remote_address;
+
+#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
+    if ((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
+        != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
+        frag->sr_desc.wr.rdma.rkey = opal_swap_bytes4 (remote_handle->rkey);
+    } else
+#endif
+    {
+        frag->sr_desc.wr.rdma.rkey = remote_handle->rkey;
+    }
+
+#if HAVE_XRC
+    if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
+        frag->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
+    }
+#endif
+
+    if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
+        OPAL_THREAD_LOCK(&ep->endpoint_lock);
+        rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_get_frags);
+        OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
+        if (OPAL_ERR_RESOURCE_BUSY == rc) {
+            return OPAL_SUCCESS;
+        }
+
+        if (OPAL_SUCCESS != rc) {
+            MCA_BTL_IB_FRAG_RETURN (frag);
+            return rc;
+        }
+    }
+
+    rc = mca_btl_openib_get_internal (btl, ep, frag);
+    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
+        if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
+            rc = OPAL_SUCCESS;
+
+            /* queue the fragment for when resources are available */
+            OPAL_THREAD_LOCK(&ep->endpoint_lock);
+            opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag);
+            OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
+        } else {
+            MCA_BTL_IB_FRAG_RETURN (frag);
+        }
+    }
+
+    return rc;
+}
+
+/*
+ * Post a prepared get fragment on the queue pair recorded in the fragment's
+ * base.order field.
+ *
+ * Returns OPAL_ERR_OUT_OF_RESOURCE, without queueing the fragment, when no
+ * send wqe or get token is available; queueing is left to the caller.
+ * Returns OPAL_ERROR if ibv_post_send() fails (the wqe and get token taken
+ * above are given back first) and OPAL_SUCCESS otherwise.
+ */
+int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
+                                 mca_btl_openib_get_frag_t *frag)
+{
+    int qp = to_base_frag(frag)->base.order;
+    struct ibv_send_wr *bad_wr;
+
+    /* check for a send wqe */
+    if (qp_get_wqe(ep, qp) < 0) {
+        qp_put_wqe(ep, qp);
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+    /* check for a get token */
+    if (OPAL_THREAD_ADD32(&ep->get_tokens,-1) < 0) {
+        qp_put_wqe(ep, qp);
+        OPAL_THREAD_ADD32(&ep->get_tokens,1);
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+    qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
+    qp_reset_signal_count(ep, qp);
+
+    if (ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr)) {
+        qp_put_wqe(ep, qp);
+        OPAL_THREAD_ADD32(&ep->get_tokens,1);
+        return OPAL_ERROR;
+    }
+
+    return OPAL_SUCCESS;
+}
diff --git a/opal/mca/btl/openib/btl_openib_put.c b/opal/mca/btl/openib/btl_openib_put.c
new file mode 100644
index 0000000000..e6f0f6c7ea
--- /dev/null
+++ b/opal/mca/btl/openib/btl_openib_put.c
@@ -0,0 +1,169 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2013 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
+ *                         reserved.
+ * Copyright (c) 2006-2007 Voltaire All rights reserved.
+ * Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2009 IBM Corporation. All rights reserved.
+ * Copyright (c) 2013-2014 Intel, Inc. All rights reserved
+ * Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
+ * Copyright (c) 2014 Research Organization for Information Science
+ *                         and Technology (RIST). All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "btl_openib.h"
+#include "btl_openib_frag.h"
+#include "btl_openib_endpoint.h"
+
+/*
+ * RDMA WRITE local buffer to remote buffer address.
+ *
+ * Returns OPAL_ERR_BAD_PARAM when size exceeds btl->btl_put_limit and
+ * OPAL_ERR_OUT_OF_RESOURCE when no fragment can be allocated. If the
+ * endpoint is not connected the fragment is handed to check_endpoint_state()
+ * along with ep->pending_put_frags, and OPAL_ERR_RESOURCE_BUSY from that call
+ * is reported as OPAL_SUCCESS. If mca_btl_openib_put_internal() reports
+ * OPAL_ERR_OUT_OF_RESOURCE the fragment is appended to ep->pending_put_frags
+ * and OPAL_SUCCESS is returned. Any other failure releases the fragment with
+ * MCA_BTL_IB_FRAG_RETURN and passes the error code back to the caller.
+ */
+
+int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address,
+                        uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
+                        mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
+                        int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
+{
+    mca_btl_openib_put_frag_t *frag = NULL;
+    int rc, qp = order;
+
+    if (OPAL_UNLIKELY(size > btl->btl_put_limit)) {
+        return OPAL_ERR_BAD_PARAM;
+    }
+
+    frag = to_put_frag(alloc_send_user_frag ());
+    if (OPAL_UNLIKELY(NULL == frag)) {
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+    if (MCA_BTL_NO_ORDER == qp) {
+        qp = mca_btl_openib_component.rdma_qp;
+    }
+
+    /* record the selected qp; mca_btl_openib_put_internal() reads it back from base.order */
+    to_base_frag(frag)->base.order = qp;
+    /* free this descriptor when the operation is complete */
+    to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
+
+    /* set up scatter-gather entry (cast via uintptr_t so the address is zero-extended to 64 bits) */
+    to_com_frag(frag)->sg_entry.length = size;
+    to_com_frag(frag)->sg_entry.lkey = local_handle->lkey;
+    to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t) local_address;
+    to_com_frag(frag)->endpoint = ep;
+
+    /* set up rdma callback */
+    frag->cb.func = cbfunc;
+    frag->cb.context = cbcontext;
+    frag->cb.data = cbdata;
+    frag->cb.local_handle = local_handle;
+
+    /* post descriptor */
+    to_out_frag(frag)->sr_desc.opcode = IBV_WR_RDMA_WRITE;
+    to_out_frag(frag)->sr_desc.send_flags = ib_send_flags(size, &(ep->qps[qp]), 1);
+    to_out_frag(frag)->sr_desc.wr.rdma.remote_addr = remote_address;
+
+#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
+    if ((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
+        != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
+        to_out_frag(frag)->sr_desc.wr.rdma.rkey = opal_swap_bytes4(remote_handle->rkey);
+    } else
+#endif
+    {
+        to_out_frag(frag)->sr_desc.wr.rdma.rkey = remote_handle->rkey;
+    }
+
+#if HAVE_XRC
+    if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
+        to_out_frag(frag)->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
+    }
+#endif
+
+    if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
+        OPAL_THREAD_LOCK(&ep->endpoint_lock);
+        rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_put_frags);
+        OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
+        if (OPAL_ERR_RESOURCE_BUSY == rc) {
+            /* descriptor was queued pending connection */
+            return OPAL_SUCCESS;
+        }
+
+        if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
+            MCA_BTL_IB_FRAG_RETURN (frag);
+            return rc;
+        }
+    }
+
+    rc = mca_btl_openib_put_internal (btl, ep, frag);
+    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
+        if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
+            rc = OPAL_SUCCESS;
+
+            /* queue the fragment for when resources are available */
+            OPAL_THREAD_LOCK(&ep->endpoint_lock);
+            opal_list_append(&ep->pending_put_frags, (opal_list_item_t*)frag);
+            OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
+        } else {
+            MCA_BTL_IB_FRAG_RETURN (frag);
+        }
+    }
+
+    return rc;
+}
+
+/*
+ * Post a prepared put fragment on the queue pair recorded in the fragment's
+ * base.order field.
+ *
+ * Returns OPAL_ERR_OUT_OF_RESOURCE, without queueing the fragment, when no
+ * send wqe is available; queueing is left to the caller. Returns OPAL_ERROR
+ * if ibv_post_send() fails (the wqe taken above is given back first) and
+ * OPAL_SUCCESS otherwise.
+ */
+int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
+                                 mca_btl_openib_put_frag_t *frag)
+{
+    int qp = to_base_frag(frag)->base.order;
+    struct ibv_send_wr *bad_wr;
+
+    /* check for a send wqe */
+    if (qp_get_wqe(ep, qp) < 0) {
+        qp_put_wqe(ep, qp);
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+    qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
+    qp_reset_signal_count(ep, qp);
+
+    if (ibv_post_send(ep->qps[qp].qp->lcl_qp, &to_out_frag(frag)->sr_desc, &bad_wr)) {
+        qp_put_wqe(ep, qp);
+        return OPAL_ERROR;
+    }
+
+    return OPAL_SUCCESS;
+}