1
1
Этот коммит содержится в:
Jeff Squyres 2015-02-06 14:59:45 -08:00 коммит произвёл Nathan Hjelm
родитель 0a5fd8e36a
Коммит ad841d7ba3
6 изменённых файлов: 202 добавлений и 152 удалений

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -147,12 +147,14 @@ opal_btl_usnic_handle_ack(
* fragment really needs to be freed, we'll take care of it in a few
* lines below.
*/
if (frag->sf_ack_bytes_left == bytes_acked &&
((frag->sf_base.uf_remote_seg[0].seg_addr.pval != NULL) ||
(frag->sf_base.uf_base.des_flags &
MCA_BTL_DES_SEND_ALWAYS_CALLBACK))) {
if (frag->sf_ack_bytes_left == bytes_acked) {
if (frag->sf_base.uf_remote_seg[0].seg_addr.pval != NULL) {
OPAL_BTL_USNIC_DO_PUT_FRAG_CB(module, frag, "put completion");
} else if (frag->sf_base.uf_base.des_flags &
MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
OPAL_BTL_USNIC_DO_SEND_FRAG_CB(module, frag, "send completion");
}
}
/* free this segment */
sseg->ss_ack_pending = false;

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -18,11 +18,11 @@
#include "btl_usnic_frag.h"
#include "btl_usnic_endpoint.h"
/* Invoke the descriptor callback for the frag, updating stats and clearing the
* _CALLBACK flag in the process. */
/* Invoke the descriptor callback for a (non-PUT) send frag, updating
* stats and clearing the _CALLBACK flag in the process. */
#define OPAL_BTL_USNIC_DO_SEND_FRAG_CB(module, send_frag, comment) \
do { \
MSGDEBUG1_OUT("%s:%d: %s send callback for module=%p frag=%p\n", \
MSGDEBUG1_OUT("%s:%d: %s SEND callback for module=%p frag=%p\n", \
__func__, __LINE__, \
(comment), (void *)(module), (void *)(send_frag)); \
(send_frag)->sf_base.uf_base.des_cbfunc( \
@ -34,6 +34,26 @@
++((module)->stats.pml_send_callbacks); \
} while (0)
/* Invoke the completion callback for a send frag that was a PUT and
 * count it in the module's pml_send_callbacks statistic.
 *
 * The frag's des_cbfunc is cast to mca_btl_base_rdma_completion_fn_t
 * and called with: the module's base BTL (&module->super), the frag's
 * endpoint, the address of local segment 0, NULL for the fourth
 * argument, the descriptor's des_context and des_cbdata, and
 * OPAL_SUCCESS.
 *
 * Notes:
 * - This macro does not modify des_flags; no _CALLBACK flag is
 *   cleared here.
 * - "module" and "send_frag" are expanded more than once, so pass
 *   expressions without side effects.
 * - "comment" is only used in the debug message.
 * - The macro-local variable has a deliberately specific name so that
 *   it cannot capture an identifier used in the caller's arguments.
 */
#define OPAL_BTL_USNIC_DO_PUT_FRAG_CB(module, send_frag, comment) \
    do { \
        MSGDEBUG1_OUT("%s:%d: %s PUT callback for module=%p frag=%p\n", \
                      __func__, __LINE__, \
                      (comment), (void *)(module), (void *)(send_frag)); \
        mca_btl_base_rdma_completion_fn_t usnic_put_cbfunc_ = \
            (mca_btl_base_rdma_completion_fn_t) \
            (send_frag)->sf_base.uf_base.des_cbfunc; \
        usnic_put_cbfunc_(&(module)->super, \
                          (send_frag)->sf_endpoint, \
                          (send_frag)->sf_base.uf_local_seg[0].seg_addr.pval, \
                          NULL, \
                          (send_frag)->sf_base.uf_base.des_context, \
                          (send_frag)->sf_base.uf_base.des_cbdata, \
                          OPAL_SUCCESS); \
        ++((module)->stats.pml_send_callbacks); \
    } while (0)
/*
* Reap an ACK send that is complete
*/

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -40,20 +40,20 @@
#endif
# define USNIC_BTL_DEFAULT_VERSION(name) MCA_BTL_DEFAULT_VERSION(name)
# define USNIC_SEND_LOCAL des_local
# define USNIC_SEND_LOCAL_COUNT des_local_count
# define USNIC_SEND_REMOTE des_remote
# define USNIC_SEND_REMOTE_COUNT des_remote_count
# define USNIC_SEND_LOCAL des_segments
# define USNIC_SEND_LOCAL_COUNT des_segment_count
# define USNIC_SEND_REMOTE des_segments
# define USNIC_SEND_REMOTE_COUNT des_segment_count
# define USNIC_RECV_LOCAL des_local
# define USNIC_RECV_LOCAL_COUNT des_local_count
# define USNIC_RECV_REMOTE des_remote
# define USNIC_RECV_REMOTE_COUNT des_remote_count
# define USNIC_RECV_LOCAL des_segments
# define USNIC_RECV_LOCAL_COUNT des_segment_count
# define USNIC_RECV_REMOTE des_segments
# define USNIC_RECV_REMOTE_COUNT des_segment_count
# define USNIC_PUT_LOCAL des_local
# define USNIC_PUT_LOCAL_COUNT des_local_count
# define USNIC_PUT_REMOTE des_remote
# define USNIC_PUT_REMOTE_COUNT des_remote_count
# define USNIC_PUT_LOCAL des_segments
# define USNIC_PUT_LOCAL_COUNT des_segment_count
# define USNIC_PUT_REMOTE des_segments
/* Must name the descriptor's des_segment_count field, exactly like the
   other *_COUNT macros; "des_segments_count" is not a descriptor field
   and breaks any code that expands this macro. */
# define USNIC_PUT_REMOTE_COUNT des_segment_count
/*
* Performance critical; needs to be inline

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -122,11 +122,18 @@ recv_seg_constructor(
mca_btl_usnic_component.transport_header_len);
/* initialize descriptor */
seg->rs_desc.USNIC_RECV_LOCAL = &seg->rs_segment;
seg->rs_desc.USNIC_RECV_LOCAL_COUNT = 1;
/* JMS Initializing RECV_REMOTE for receive frags is unnecessary
with BTL 3.0. The only reason to keep this here would be for
compatibility with the BTL 2.0 usnic-v1.8 git branch (i.e.,
it's harmless to do this assignment first, before the
RECV_LOCAL assignments -- the compiler will likely compile out
this dead code, anyway). */
seg->rs_desc.USNIC_RECV_REMOTE = NULL;
seg->rs_desc.USNIC_RECV_REMOTE_COUNT = 0;
seg->rs_desc.USNIC_RECV_LOCAL = &seg->rs_segment;
seg->rs_desc.USNIC_RECV_LOCAL_COUNT = 1;
/*
* This pointer is only correct for incoming segments of type
* OPAL_BTL_USNIC_PAYLOAD_TYPE_FRAG, but that's the only time
@ -144,12 +151,20 @@ send_frag_constructor(opal_btl_usnic_send_frag_t *frag)
/* Fill in source descriptor */
desc = &frag->sf_base.uf_base;
/* JMS Initializing SEND_REMOTE for send frags is unnecessary
with BTL 3.0. The only reason to keep this here would be for
compatibility with the BTL 2.0 usnic-v1.8 git branch (i.e.,
it's harmless to do this assignment first, before the
SEND_LOCAL assignments -- the compiler will likely compile out
this dead code, anyway). */
desc->USNIC_SEND_REMOTE = frag->sf_base.uf_remote_seg;
desc->USNIC_SEND_REMOTE_COUNT = 0;
desc->USNIC_SEND_LOCAL = frag->sf_base.uf_local_seg;
frag->sf_base.uf_local_seg[0].seg_len = 0;
frag->sf_base.uf_local_seg[1].seg_len = 0;
desc->USNIC_SEND_LOCAL_COUNT = 2;
desc->USNIC_SEND_REMOTE = frag->sf_base.uf_remote_seg;
desc->USNIC_SEND_REMOTE_COUNT = 0;
desc->order = MCA_BTL_NO_ORDER;
desc->des_flags = 0;

Просмотреть файл

@ -85,6 +85,25 @@ usnic_seg_type_str(opal_btl_usnic_seg_type_t t)
}
/*
 * usnic registration handle (passed over the network to peers as a
 * cookie).
 *
 * Currently, this struct is meaningless (but it must be defined /
 * exist) because we are emulating RDMA and do not have
 * btl_register_mem and btl_deregister_mem functions (and we set
 * module.btl_registration_handle_size to 0, not sizeof(struct
 * mca_btl_base_registration_handle_t)).
 *
 * Because the handle is described as something sent to peers, do not
 * reorder or resize the fields without checking both sides.
 */
struct mca_btl_base_registration_handle_t {
/* Placeholder fields: maybe we'll need fields like this.
   NOTE(review): presumably "local key" / "remote key" in the style of
   RDMA memory registrations -- confirm before relying on them. */
uint32_t lkey;
uint32_t rkey;
};
/*
* usnic local registration
*/
typedef struct opal_btl_usnic_reg_t {
mca_mpool_base_registration_t base;
struct fid_mr *ur_mr;

Просмотреть файл

@ -644,24 +644,7 @@ static int usnic_free(struct mca_btl_base_module_t* btl,
return OPAL_SUCCESS;
}
/*
* Notes from george:
*
* - BTL ALLOC: allocating control messages or eager frags if BTL
does not have INPLACE flag. To be clear: max it will ever alloc
is eager_limit. THEREFORE: eager_limit is the max that ALLOC
must always be able to alloc.
--> Contradiction in the btl.h documentation.
*
* - BTL PREPARE SRC: max_send_size frags go through here. Can return
a smaller size than was asked for.
*
* - BTL PREPARE DEST: not used if you don't have PUT/GET
*
* - BTL SEND: will be used after ALLOC / PREPARE
*/
/* Responsible for handling "small" frags (reserve + *size <= max_frag_payload)
/* Responsible for sending "small" frags (reserve + *size <= max_frag_payload)
* in the same manner as btl_prepare_src. Must return a smaller amount than
* requested if the given convertor cannot process the entire (*size).
*/
@ -670,7 +653,6 @@ opal_btl_usnic_send_frag_t *
prepare_src_small(
struct opal_btl_usnic_module_t* module,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
@ -865,7 +847,7 @@ pack_chunk_seg_chain_with_reserve(
seg_space = module->max_chunk_payload;
copyptr = seg->ss_base.us_payload.raw;
if (first_pass && reserve_len > 0) {
if (first_pass) {
/* logic could accommodate >max, but currently doesn't */
assert(reserve_len <= module->max_chunk_payload);
ret_ptr = copyptr;
@ -923,7 +905,6 @@ opal_btl_usnic_send_frag_t *
prepare_src_large(
struct opal_btl_usnic_module_t* module,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
@ -1000,29 +981,40 @@ prepare_src_large(
}
/**
* Note the "user" data the PML wishes to communicate and return a descriptor
* that can be used for send or put. We create a frag (which is also a
* descriptor by virtue of its base class) and populate it with enough
* source information to complete a future send/put.
/*
* BTL 3.0 prepare_src function.
*
* We will create either a small send frag if < than an MTU, otherwise a large
* send frag. The convertor will be saved for deferred packing if the user
* buffer is noncontiguous. Otherwise it will be saved in one of the
* This function is only used for sending PML fragments (not putting
* or getting fragments).
*
* Note the "user" data the PML wishes to communicate and return a
* descriptor. We create a frag (which is also a descriptor by virtue
* of its base class) and populate it with enough source information
* to complete a future send.
*
* Recall that the usnic BTL's max_send_size is almost certainly
* larger than the MTU (by default, max_send_size is either 25K or
* 150K). Therefore, the PML may give us a fragment up to
* max_send_size in this function. Hence, we make the decision here
* as to whether it's a "small" fragment (i.e., size <= MTU, meaning
* that it fits in a single datagram) or a "large" fragment (i.e.,
* size > MTU, meaning that it must be chunked into multiple
* datagrams).
*
* The convertor will be saved for deferred packing if the user buffer
* is noncontiguous. Otherwise, it will be saved in one of the
* descriptor's SGEs.
*
* NOTE that the *only* reason this routine is allowed to return a size smaller
* than was requested is if the convertor cannot process the entire amount.
*/
static mca_btl_base_descriptor_t*
usnic_prepare_src(
struct mca_btl_base_module_t* base_module,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
static struct mca_btl_base_descriptor_t *
usnic_prepare_src(struct mca_btl_base_module_t *base_module,
struct mca_btl_base_endpoint_t *endpoint,
struct opal_convertor_t *convertor,
uint8_t order,
size_t reserve,
size_t* size,
size_t *size,
uint32_t flags)
{
opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) base_module;
@ -1042,10 +1034,10 @@ usnic_prepare_src(
*/
payload_len = *size + reserve;
if (payload_len <= module->max_frag_payload) {
frag = prepare_src_small(module, endpoint, registration, convertor,
frag = prepare_src_small(module, endpoint, convertor,
order, reserve, size, flags);
} else {
frag = prepare_src_large(module, endpoint, registration, convertor,
frag = prepare_src_large(module, endpoint, convertor,
order, reserve, size, flags);
}
@ -1071,98 +1063,81 @@ usnic_prepare_src(
return &frag->sf_base.uf_base;
}
/*
 * Build a descriptor that describes a destination buffer.
 *
 * Allocates a put-destination frag from the module, records the
 * convertor's current pointer and *size as remote segment 0, and
 * stores "order" and "flags" on the frag's base descriptor.
 *
 * Returns the frag's base descriptor, or NULL if no frag could be
 * allocated.  The endpoint, registration and reserve arguments are
 * not used.
 */
static mca_btl_base_descriptor_t*
usnic_prepare_dst(
    struct mca_btl_base_module_t* base_module,
    struct mca_btl_base_endpoint_t* endpoint,
    struct mca_mpool_base_registration_t* registration,
    struct opal_convertor_t* convertor,
    uint8_t order,
    size_t reserve,
    size_t* size,
    uint32_t flags)
{
    opal_btl_usnic_module_t *usnic_module =
        (opal_btl_usnic_module_t *)base_module;
    opal_btl_usnic_put_dest_frag_t *dest_frag;
    void *dest_addr;

    /* get a put-destination frag; give up if allocation fails */
    dest_frag = (opal_btl_usnic_put_dest_frag_t *)
        opal_btl_usnic_put_dest_frag_alloc(usnic_module);
    if (NULL == dest_frag) {
        return NULL;
    }

    /* ask the convertor where the data currently starts */
    opal_convertor_get_current_pointer(convertor, (void **) &dest_addr);

    /* fill in descriptor bookkeeping, then the segment that points at
       the destination data */
    dest_frag->uf_base.order = order;
    dest_frag->uf_base.des_flags = flags;
    dest_frag->uf_remote_seg[0].seg_len = *size;
    dest_frag->uf_remote_seg[0].seg_addr.pval = dest_addr;

#if MSGDEBUG2
    opal_output(0, "prep_dst size=%d, addr=%p, pfrag=%p\n", (int)*size,
                dest_addr, (void *)dest_frag);
#endif

    return &dest_frag->uf_base;
}
/*
* Emulate an RDMA put. We'll send the remote address
* across to the other side so it will know where to put the data
* Emulate an RDMA put. We'll send the remote address across to the
* other side so it will know where to put the data.
*
* Note that this function is only ever called with contiguous
* buffers, so a convertor is not necessary.
*/
static int
usnic_put(
struct mca_btl_base_module_t *btl,
usnic_put(struct mca_btl_base_module_t *base_module,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *desc)
void *local_address, uint64_t remote_address,
struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle,
size_t size, int flags, int order,
mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata)
{
opal_btl_usnic_send_frag_t *sfrag;
opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) base_module;
/* At least for the moment, continue to make a descriptor, like we
used to in BTL 2.0 */
if (size <= module->max_frag_payload) {
/* Small send fragment -- the whole thing fits in one MTU
(i.e., a single chunk) */
opal_btl_usnic_small_send_frag_t *ssfrag;
ssfrag = opal_btl_usnic_small_send_frag_alloc(module);
if (OPAL_UNLIKELY(NULL == ssfrag)) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
sfrag = &ssfrag->ssf_base;
} else {
/* Large send fragment -- need more than one MTU (i.e.,
multiple chunks) */
opal_btl_usnic_large_send_frag_t *lsfrag;
lsfrag = opal_btl_usnic_large_send_frag_alloc(module);
if (OPAL_UNLIKELY(NULL == lsfrag)) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
lsfrag->lsf_pack_on_the_fly = true;
sfrag = &lsfrag->lsf_base;
}
sfrag->sf_endpoint = endpoint;
sfrag->sf_size = size;
sfrag->sf_ack_bytes_left = size;
opal_btl_usnic_frag_t *frag;
frag = &sfrag->sf_base;
frag->uf_local_seg[0].seg_len = size;
frag->uf_local_seg[0].seg_addr.pval = local_address;
frag->uf_remote_seg[0].seg_len = size;
frag->uf_remote_seg[0].seg_addr.pval =
(void *)(uintptr_t) remote_address;
mca_btl_base_descriptor_t *desc;
desc = &frag->uf_base;
desc->des_segment_count = 1;
desc->des_segments = &frag->uf_local_seg[0];
/* This is really the wrong cbfunc type, but we'll cast it to
the Right type before we use it. So it'll be ok. */
desc->des_cbfunc = (mca_btl_base_completion_fn_t) cbfunc;
desc->des_cbdata = cbdata;
desc->des_context = cbcontext;
desc->des_flags = flags;
desc->order = order;
int rc;
opal_btl_usnic_send_frag_t *frag;
frag = (opal_btl_usnic_send_frag_t *)desc;
compute_sf_size(frag);
frag->sf_ack_bytes_left = frag->sf_size;
#if MSGDEBUG2
opal_output(0, "usnic_put, frag=%p, size=%d\n", (void *)frag,
(int)frag->sf_size);
#if MSGDEBUG1
{ unsigned i;
for (i=0; i<desc->USNIC_PUT_LOCAL_COUNT; ++i) {
opal_output(0, " %d: ptr:%p len:%d%s\n", i,
desc->USNIC_PUT_LOCAL[i].seg_addr.pval,
desc->USNIC_PUT_LOCAL[i].seg_len,
(i==0)?" (put local)":"");
}
for (i=0; i<desc->USNIC_PUT_REMOTE_COUNT; ++i) {
opal_output(0, " %d: ptr:%p len:%d%s\n", i,
desc->USNIC_PUT_REMOTE[i].seg_addr.pval,
desc->USNIC_PUT_REMOTE[i].seg_len,
(i==0)?" (put remote)":"");
}
}
#endif
#endif
/* RFXX copy out address - why does he not use our provided holder? */
/* JMS What does this mean? ^^ */
frag->sf_base.uf_remote_seg[0].seg_addr.pval =
desc->USNIC_PUT_REMOTE->seg_addr.pval;
rc = opal_btl_usnic_finish_put_or_send((opal_btl_usnic_module_t *)btl,
rc = opal_btl_usnic_finish_put_or_send(module,
(opal_btl_usnic_endpoint_t *)endpoint,
frag,
sfrag,
/*tag=*/MCA_BTL_NO_ORDER);
return rc;
}
@ -2242,7 +2217,9 @@ static void init_pml_values(opal_btl_usnic_module_t *module)
/* Since we emulate PUT, max_send_size can be same as
eager_limit */
module->super.btl_max_send_size = module->super.btl_eager_limit;
module->super.btl_max_send_size =
module->super.btl_put_limit =
module->super.btl_eager_limit;
}
static void init_senders(opal_btl_usnic_module_t *module)
@ -2625,22 +2602,39 @@ static int usnic_ft_event(int state)
opal_btl_usnic_module_t opal_btl_usnic_module_template = {
.super = {
.btl_component = &mca_btl_usnic_component.super,
.btl_exclusivity = MCA_BTL_EXCLUSIVITY_DEFAULT,
.btl_flags =
MCA_BTL_FLAGS_SEND |
MCA_BTL_FLAGS_PUT |
MCA_BTL_FLAGS_SEND_INPLACE,
.btl_seg_size = sizeof(mca_btl_base_segment_t),
.btl_atomic_flags = 0,
.btl_registration_handle_size = 0,
.btl_get_limit = 0,
.btl_get_alignment = 0,
.btl_put_limit = 0,
.btl_put_alignment = 0,
.btl_add_procs = usnic_add_procs,
.btl_del_procs = usnic_del_procs,
.btl_register = NULL,
.btl_finalize = usnic_finalize,
.btl_alloc = usnic_alloc,
.btl_free = usnic_free,
.btl_prepare_src = usnic_prepare_src,
.btl_prepare_dst = usnic_prepare_dst,
.btl_send = usnic_send,
.btl_sendi = NULL,
.btl_put = usnic_put,
.btl_get = NULL,
.btl_dump = mca_btl_base_dump,
.btl_atomic_op = NULL,
.btl_atomic_fop = NULL,
.btl_atomic_cswap = NULL,
.btl_mpool = NULL,
.btl_register_error = usnic_register_pml_err_cb,
.btl_ft_event = usnic_ft_event
}