1
1
openmpi/opal/mca/btl/ofi/btl_ofi_frag.c
Thananon Patinyasakdikul 080115d440 btl/ofi: Added 2 side communication support.
Two-sided communication support is added so that non-tag-matching providers
can take advantage of this BTL together with the OB1 PML. The current state is
"functional" but not optimized for performance.

Two sided support is disabled by default and can be turned on by mca
parameter: "mca_btl_ofi_mode".

Signed-off-by: Thananon Patinyasakdikul <thananon.patinyasakdikul@intel.com>
2018-08-03 12:30:03 -07:00

199 lines
6.6 KiB
C

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* $COPYRIGHT$
* Copyright (c) 2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2018 Intel Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_ofi.h"
#include "btl_ofi_frag.h"
#include "btl_ofi_rdma.h"
#include "btl_ofi_endpoint.h"
/**
 * Constructor for mca_btl_ofi_base_frag_t objects.
 *
 * Clears every byte of the fragment located sizeof(frag->base) bytes past
 * its start, then points the base descriptor at the fragment's own segment
 * array with a segment count of one.
 *
 * @param frag  fragment being constructed
 */
static void mca_btl_ofi_base_frag_constructor (mca_btl_ofi_base_frag_t *frag)
{
    const size_t base_size = sizeof (frag->base);
    unsigned char *tail = (unsigned char *) frag + base_size;

    /* wipe only the part of the object that follows the base-sized prefix */
    memset (tail, 0, sizeof (*frag) - base_size);

    frag->base.des_segment_count = 1;
    frag->base.des_segments = frag->segments;
}
/**
 * Destructor for mca_btl_ofi_base_frag_t objects.
 *
 * Intentionally does nothing: the constructor in this file acquires no
 * resources that would need to be released here.
 *
 * @param frag  fragment being destroyed (unused)
 */
static void mca_btl_ofi_base_frag_destructor (mca_btl_ofi_base_frag_t *frag)
{
    (void) frag;
}
/* Class instance for mca_btl_ofi_base_frag_t. The arguments name the parent
 * class (mca_btl_base_descriptor_t) and the constructor/destructor pair
 * defined earlier in this file. */
OBJ_CLASS_INSTANCE(mca_btl_ofi_base_frag_t,
mca_btl_base_descriptor_t,
mca_btl_ofi_base_frag_constructor,
mca_btl_ofi_base_frag_destructor);
/* Class instance for mca_btl_ofi_frag_completion_t. The parent class is
 * opal_free_list_item_t; NULL is passed for both the constructor and the
 * destructor, so no per-object hooks are registered for this class. */
OBJ_CLASS_INSTANCE(mca_btl_ofi_frag_completion_t,
opal_free_list_item_t,
NULL,
NULL);
/**
 * Take a fragment completion object from the context's free list and
 * initialize it for the given fragment.
 *
 * @param btl      BTL module recorded in the completion
 * @param context  OFI context whose frag_comp_list supplies the object
 * @param frag     fragment this completion refers to
 * @param type     completion type stored in the base completion
 *
 * @return the initialized completion, or NULL when the free list could not
 *         supply an item. Callers must check for NULL before use.
 */
mca_btl_ofi_frag_completion_t *mca_btl_ofi_frag_completion_alloc
                                         (mca_btl_base_module_t *btl,
                                          mca_btl_ofi_context_t *context,
                                          mca_btl_ofi_base_frag_t *frag,
                                          int type)
{
    mca_btl_ofi_frag_completion_t *comp;

    comp = (mca_btl_ofi_frag_completion_t*) opal_free_list_get(&context->frag_comp_list);
    if (OPAL_UNLIKELY(NULL == comp)) {
        /* free list exhausted: report it instead of dereferencing NULL */
        return NULL;
    }

    comp->base.btl = btl;
    comp->base.my_context = context;
    /* remember the owning list so the item can be returned to it later */
    comp->base.my_list = &context->frag_comp_list;
    comp->base.type = type;

    comp->frag = frag;
    /* back-pointer from the embedded comp_ctx to its owning completion */
    comp->comp_ctx.comp = comp;

    return comp;
}
/**
 * Allocate a fragment and set it up as a descriptor for `size` bytes.
 *
 * The fragment comes from the frag_list of the context returned by
 * get_ofi_context(). Its single segment points at the memory immediately
 * following the fragment structure and is given length `size`; the same
 * length is recorded in the fragment header.
 *
 * @param btl       BTL module
 * @param endpoint  endpoint passed on to the fragment allocator
 * @param order     stored in the descriptor's order field
 * @param size      payload length recorded in the segment and the header
 * @param flags     stored in the descriptor's des_flags field
 *
 * @return the descriptor, or NULL if no fragment could be allocated
 */
mca_btl_base_descriptor_t *mca_btl_ofi_alloc(
                                mca_btl_base_module_t *btl,
                                mca_btl_base_endpoint_t *endpoint,
                                uint64_t order, size_t size, uint32_t flags)
{
    mca_btl_ofi_module_t *module = (mca_btl_ofi_module_t*) btl;
    mca_btl_ofi_context_t *ctx = get_ofi_context(module);
    mca_btl_ofi_base_frag_t *frag;

    frag = mca_btl_ofi_frag_alloc(module, &ctx->frag_list, endpoint);
    if (OPAL_UNLIKELY(NULL == frag)) {
        return NULL;
    }

    /* header bookkeeping */
    frag->hdr.len = size;

    /* one segment: the payload sits right behind the fragment structure */
    frag->segments[0].seg_len = size;
    frag->segments[0].seg_addr.pval = frag + 1;

    /* descriptor fields */
    frag->base.order = order;
    frag->base.des_flags = flags;
    frag->base.des_segments = &frag->segments[0];
    frag->base.des_segment_count = 1;

    return (mca_btl_base_descriptor_t*) frag;
}
/**
 * Release a descriptor previously handed out by this BTL.
 *
 * The descriptor is treated as a mca_btl_ofi_base_frag_t and passed to
 * mca_btl_ofi_frag_return().
 *
 * @param btl  BTL module (unused)
 * @param des  descriptor to release
 *
 * @return always OPAL_SUCCESS
 */
int mca_btl_ofi_free (mca_btl_base_module_t *btl, mca_btl_base_descriptor_t *des)
{
    mca_btl_ofi_base_frag_t *frag = (mca_btl_ofi_base_frag_t*) des;

    (void) btl;

    /* hand the fragment back to where it came from */
    mca_btl_ofi_frag_return (frag);

    return OPAL_SUCCESS;
}
/**
 * Post a two-sided send of a fragment to a peer.
 *
 * The fragment header is tagged with the active-message tag, a completion
 * object is allocated from the selected context, and header plus payload
 * are handed to fi_send() in a single contiguous buffer.
 *
 * @param btl         BTL module
 * @param endpoint    destination endpoint (supplies the peer address)
 * @param descriptor  fragment to send, as a base descriptor
 * @param tag         active-message tag delivered to the remote side
 *
 * @return OPAL_SUCCESS once the send has been posted;
 *         OPAL_ERR_OUT_OF_RESOURCE if no completion object was available
 *         or fi_send() did not accept the operation.
 */
int mca_btl_ofi_send (mca_btl_base_module_t *btl,
                      mca_btl_base_endpoint_t *endpoint,
                      mca_btl_base_descriptor_t *descriptor,
                      mca_btl_base_tag_t tag)
{
    int rc = 0;
    mca_btl_ofi_context_t *context;
    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*) btl;
    mca_btl_ofi_endpoint_t *ofi_ep = (mca_btl_ofi_endpoint_t*) endpoint;
    mca_btl_ofi_base_frag_t *frag = (mca_btl_ofi_base_frag_t*) descriptor;
    mca_btl_ofi_frag_completion_t *comp;

    frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;

    /* This tag is the active message tag for the remote side */
    frag->hdr.tag = tag;

    /* create completion context */
    context = get_ofi_context(ofi_btl);
    comp = mca_btl_ofi_frag_completion_alloc(btl, context, frag,
                                             MCA_BTL_OFI_TYPE_SEND);
    if (OPAL_UNLIKELY(NULL == comp)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* send the frag. Note that we start sending from BTL header + payload
     * because we need the other side to have this header information. */
    rc = fi_send(context->tx_ctx,
                 &frag->hdr,
                 sizeof(mca_btl_ofi_header_t) + frag->hdr.len,
                 NULL,
                 ofi_ep->peer_addr,
                 &comp->comp_ctx);

    if (OPAL_UNLIKELY(FI_SUCCESS != rc)) {
        /* Nothing was posted, so no completion event will ever hand this
         * object back; return it to its free list here to avoid a leak. */
        opal_free_list_return(&context->frag_comp_list,
                              (opal_free_list_item_t*) comp);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    MCA_BTL_OFI_NUM_SEND_INC(ofi_btl);
    return OPAL_SUCCESS;
}
/**
 * Deliver a received fragment to the registered active-message callback.
 *
 * The fragment's single segment is pointed at the memory right after the
 * fragment structure, with the length taken from the fragment header. The
 * callback registered for the header's tag is invoked, the fragment is then
 * completed with OPAL_SUCCESS, and one receive is reposted on the context.
 * A failed repost is logged and followed by MCA_BTL_OFI_ABORT().
 *
 * @param ofi_btl   OFI BTL module
 * @param endpoint  endpoint (not used by this function)
 * @param context   context on which the receive is reposted
 * @param frag      received fragment
 *
 * @return OPAL_SUCCESS
 */
inline int mca_btl_ofi_recv_frag (mca_btl_ofi_module_t *ofi_btl,
                                  mca_btl_base_endpoint_t *endpoint,
                                  mca_btl_ofi_context_t *context,
                                  mca_btl_ofi_base_frag_t *frag)
{
    mca_btl_active_message_callback_t *handler;
    int status;

    /* describe the payload so the upper layer knows where to find it */
    frag->segments[0].seg_len = frag->hdr.len;
    frag->segments[0].seg_addr.pval = frag + 1;
    frag->base.des_segment_count = 1;
    frag->base.des_segments = frag->segments;

    /* look up and run the handler registered for this tag */
    handler = &mca_btl_base_active_message_trigger[frag->hdr.tag];
    handler->cbfunc (&ofi_btl->super, frag->hdr.tag, &frag->base, handler->cbdata);

    mca_btl_ofi_frag_complete(frag, OPAL_SUCCESS);

    /* put one receive back on the context */
    status = mca_btl_ofi_post_recvs((mca_btl_base_module_t*) ofi_btl, context, 1);
    if (OPAL_SUCCESS != status) {
        /* possibly recoverable, but treated as fatal here */
        BTL_ERROR(("failed reposting receive."));
        MCA_BTL_OFI_ABORT();
    }

    return OPAL_SUCCESS;
}
/**
 * Build a send descriptor holding `reserve` bytes of caller header space
 * followed by data packed from the convertor.
 *
 * A fragment is allocated with `reserve` bytes accounted for, up to *size
 * bytes are packed directly behind that reserved area, and the segment and
 * header lengths are grown by the number of bytes actually packed.
 *
 * NOTE(review): nothing here bounds reserve + *size against the capacity of
 * the fragment buffer; presumably the caller limits *size — confirm.
 *
 * @param btl        BTL module
 * @param endpoint   destination endpoint
 * @param convertor  convertor supplying the data to pack
 * @param order      passed to the allocator (the descriptor is finally
 *                   marked MCA_BTL_NO_ORDER)
 * @param reserve    bytes to leave free at the start of the payload
 * @param size       in: maximum bytes to pack; out: bytes actually packed
 * @param flags      descriptor flags
 *
 * @return the prepared descriptor, or NULL if allocation or packing failed
 *         (in which case *size is left unchanged)
 */
struct mca_btl_base_descriptor_t *mca_btl_ofi_prepare_src (
                                        mca_btl_base_module_t *btl,
                                        mca_btl_base_endpoint_t *endpoint,
                                        opal_convertor_t *convertor,
                                        uint8_t order, size_t reserve,
                                        size_t *size, uint32_t flags)
{
    struct iovec iov;
    size_t length = 0;
    uint32_t iov_count = 1;
    int rc;
    mca_btl_ofi_base_frag_t *frag;

    /* allocate the frag with reserve. */
    frag = (mca_btl_ofi_base_frag_t*) mca_btl_ofi_alloc(btl, endpoint,
                                                        order, reserve, flags);
    if (OPAL_UNLIKELY(NULL == frag)) {
        return NULL;
    }

    /* pack the data after the reserve */
    iov.iov_len = *size;
    iov.iov_base = (IOVBASE_TYPE*)(((unsigned char*)(frag->segments[0].seg_addr.pval)) + reserve);
    rc = opal_convertor_pack(convertor, &iov, &iov_count, &length);
    if (OPAL_UNLIKELY(rc < 0)) {
        /* packing failed: do not hand out a fragment with undefined
         * contents and length; give it back and report failure. */
        mca_btl_ofi_frag_return (frag);
        return NULL;
    }

    /* pass on frag information */
    frag->base.des_segments = frag->segments;
    frag->base.des_flags = flags;
    frag->base.order = MCA_BTL_NO_ORDER;
    frag->segments[0].seg_len += length;
    frag->hdr.len += length;

    *size = length;
    return &frag->base;
}