openmpi/opal/mca/btl/usnic/btl_usnic_send.h
Jeff Squyres 984982790a usnic: convert from verbs to libfabric (yay!)
This commit represents the conversion of the usnic BTL from verbs to
libfabric.

For the moment, libfabric is embedded in Open MPI (currently in the
usnic BTL).  This is because the libfabric API is still changing, and
also has not yet been released.  Ultimately, this embedded copy of
libfabric will likely disappear and the usnic BTL will rely on an
external installation of libfabric.

New configure options:

* --with-libfabric: will cause configure to fail if libfabric support
    cannot be built
* --without-libfabric: will prevent libfabric support from being built
* --with-libfabric=DIR: use an external libfabric installation
* --with-libfabric-libdir=LIBDIR: when paired with --with-libfabric=DIR,
    use LIBDIR for the libfabric installation library dir
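
For example, one plausible invocation (with a hypothetical install
prefix) that builds against an external libfabric:

    ./configure --with-libfabric=/opt/libfabric \
                --with-libfabric-libdir=/opt/libfabric/lib64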

The --with-libnl3[-libdir] arguments are now gone.
2014-12-08 11:37:37 -08:00


/*
 * Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef BTL_USNIC_SEND_H
#define BTL_USNIC_SEND_H

#include "btl_usnic.h"
#include "btl_usnic_frag.h"
#include "btl_usnic_ack.h"

#if MSGDEBUG1
#include "btl_usnic_util.h"
#endif

/*
 * Check if conditions are right, and if so, put the endpoint on the
 * list of endpoints that have sends to be done
 */
static inline void
opal_btl_usnic_check_rts(
    opal_btl_usnic_endpoint_t *endpoint)
{
    /*
     * If the endpoint is not already marked ready,
     * and it has packets to send,
     * and it has send credits,
     * and its retransmission window is open,
     * make it ready
     */
    if (!endpoint->endpoint_ready_to_send &&
        !opal_list_is_empty(&endpoint->endpoint_frag_send_queue) &&
        endpoint->endpoint_send_credits > 0 &&
        WINDOW_OPEN(endpoint)) {
        opal_list_append(&endpoint->endpoint_module->endpoints_with_sends,
                         &endpoint->super);
        endpoint->endpoint_ready_to_send = true;
#if MSGDEBUG1
        opal_output(0, "make endpoint %p RTS\n", (void*)endpoint);
    } else {
        /* This else arm exists only when MSGDEBUG1 is enabled */
        opal_output(0, "rts:%d empty:%d cred:%d open:%d\n",
                    endpoint->endpoint_ready_to_send,
                    opal_list_is_empty(&endpoint->endpoint_frag_send_queue),
                    endpoint->endpoint_send_credits,
                    WINDOW_OPEN(endpoint));
#endif
    }
}
/*
 * Common point for posting a segment
 */
static inline void
opal_btl_usnic_post_segment(
    opal_btl_usnic_module_t *module,
    opal_btl_usnic_endpoint_t *endpoint,
    opal_btl_usnic_send_segment_t *sseg)
{
    int ret;

    /* get channel and remote channel */
    opal_btl_usnic_channel_id_t channel_id = sseg->ss_channel;
    opal_btl_usnic_channel_t *channel = &module->mod_channels[channel_id];

#if MSGDEBUG1
    opal_output(0, "post_send: type=%s, ep=%p, remote_addr=%p, addr=%p, len=%"
                PRIsize_t,
                usnic_seg_type_str(sseg->ss_base.us_type),
                (void*) channel->ep,
                (void*) endpoint->endpoint_remote_addrs[channel_id],
                (void*) sseg->ss_ptr,
                sseg->ss_len);
#endif

    /* Send the segment */
    ret = fi_sendto(channel->ep,
                    sseg->ss_ptr,
                    sseg->ss_len,
                    NULL,
                    endpoint->endpoint_remote_addrs[channel_id],
                    sseg);
    if (OPAL_UNLIKELY(0 != ret)) {
        opal_btl_usnic_util_abort("fi_sendto() failed",
                                  __FILE__, __LINE__);
        /* Never returns */
    }
#if 0
    /* Debugging: dump the segment that was just posted */
    opal_output(0, "send len=%d ret=%d\n", sseg->ss_len, ret);
    opal_btl_usnic_dump_hex(sseg->ss_ptr, sseg->ss_len);
#endif
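
    /* Note that sseg was passed as the libfabric operation context
       above: the eventual send completion carries that pointer back,
       which is how the completion handlers identify which segment
       finished. */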

    /* track # of times non-ACKs are posted */
    if (sseg->ss_base.us_type != OPAL_BTL_USNIC_SEG_ACK) {
        ++sseg->ss_send_posted;
        ++sseg->ss_parent_frag->sf_seg_post_cnt;
    }

    /* Stats */
    ++module->stats.num_total_sends;
    ++channel->num_channel_sends;
    --channel->credits;
}
/*
 * Common point for posting an ACK
 */
static inline void
opal_btl_usnic_post_ack(
    opal_btl_usnic_module_t *module,
    opal_btl_usnic_endpoint_t *endpoint,
    opal_btl_usnic_send_segment_t *sseg)
{
    int ret;

    /* get channel and remote channel */
    opal_btl_usnic_channel_id_t channel_id = sseg->ss_channel;
    opal_btl_usnic_channel_t *channel = &module->mod_channels[channel_id];

#if MSGDEBUG1
    opal_output(0, "post_send ACK: type=%s, ep=%p, remote_addr=%p, addr=%p, len=%"
                PRIsize_t,
                usnic_seg_type_str(sseg->ss_base.us_type),
                (void*) channel->ep,
                (void*) endpoint->endpoint_remote_addrs[channel_id],
                (void*) sseg->ss_ptr,
                sseg->ss_len);
#endif
#if 0
    opal_output(0, "ACK send:\n");
    opal_btl_usnic_dump_hex(sseg->ss_ptr, sseg->ss_len);
#endif

    ret = fi_sendto(channel->ep,
                    sseg->ss_ptr,
                    sseg->ss_len,
                    NULL,
                    endpoint->endpoint_remote_addrs[channel_id],
                    sseg);
    if (OPAL_UNLIKELY(0 != ret)) {
        opal_btl_usnic_util_abort("fi_sendto() failed",
                                  __FILE__, __LINE__);
        /* Never returns */
    }

    /* Stats */
    ++module->stats.num_total_sends;
    ++channel->num_channel_sends;
    --channel->credits;
}
/*
 * Post a send to the work queue
 */
static inline void
opal_btl_usnic_endpoint_send_segment(
    opal_btl_usnic_module_t *module,
    opal_btl_usnic_send_segment_t *sseg)
{
    opal_btl_usnic_send_frag_t *frag;
    opal_btl_usnic_endpoint_t *endpoint;
    uint16_t sfi;

    frag = sseg->ss_parent_frag;
    endpoint = frag->sf_endpoint;

    /* Do we have room in the endpoint's sender window?

       Sender window:

                     |-------- WINDOW_SIZE ----------|
                    +---------------------------------+
                    |        next_seq_to_send         |
                    |     somewhere in this range     |
                   ^+---------------------------------+
                   |
                   +-- ack_seq_rcvd: one less than the window left edge

       Assuming that next_seq_to_send is > ack_seq_rcvd (verified
       by assert), then the good condition to send is:

            next_seq_to_send <= ack_seq_rcvd + WINDOW_SIZE

       And therefore the bad condition is:

            next_seq_to_send > ack_seq_rcvd + WINDOW_SIZE
    */
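
    /* For example, with a hypothetical WINDOW_SIZE of 4096: if
       ack_seq_rcvd is 100, sequences 101 through 4196 may be sent; once
       next_seq_to_send reaches 4197, WINDOW_OPEN() is false and no more
       segments may be posted until further ACKs arrive. */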

    assert(SEQ_GT(endpoint->endpoint_next_seq_to_send,
                  endpoint->endpoint_ack_seq_rcvd));
    assert(WINDOW_OPEN(endpoint));

    /* Assign sequence number and increment */
    sseg->ss_base.us_btl_header->pkt_seq =
        endpoint->endpoint_next_seq_to_send++;

    /* Fill in the remote address to indicate PUT or not (NULL means
       this segment is not part of a PUT) */
    sseg->ss_base.us_btl_header->put_addr =
        frag->sf_base.uf_remote_seg[0].seg_addr.pval;

    /* piggy-back an ACK if needed */
    opal_btl_usnic_piggyback_ack(endpoint, sseg);

#if MSGDEBUG1
    {
        char local_ip[32];
        char remote_ip[32];

        opal_btl_usnic_snprintf_ipv4_addr(local_ip, sizeof(local_ip),
                                          module->local_modex.ipv4_addr,
                                          module->local_modex.netmask);
        opal_btl_usnic_snprintf_ipv4_addr(remote_ip, sizeof(remote_ip),
                                          endpoint->endpoint_remote_modex.ipv4_addr,
                                          endpoint->endpoint_remote_modex.netmask);

        opal_output(0, "--> Sending %s: seq: %" UDSEQ ", sender: 0x%016lx from device %s, IP %s, port %u, seg %p, room %d, wc len %u, remote IP %s, port %u",
                    (sseg->ss_parent_frag->sf_base.uf_type == OPAL_BTL_USNIC_FRAG_LARGE_SEND) ?
                        "CHUNK" : "FRAG",
                    sseg->ss_base.us_btl_header->pkt_seq,
                    sseg->ss_base.us_btl_header->sender,
                    endpoint->endpoint_module->fabric_info->fabric_attr->name,
                    local_ip,
                    module->local_modex.ports[sseg->ss_channel],
                    (void*)sseg,
                    sseg->ss_hotel_room,
                    sseg->ss_len,
                    remote_ip,
                    endpoint->endpoint_remote_modex.ports[sseg->ss_channel]);
    }
#endif

    /* do the actual send */
    opal_btl_usnic_post_segment(module, endpoint, sseg);

    /* Track this header by stashing it in an array on the endpoint that
       is the same length as the sender's window (i.e., WINDOW_SIZE).
       To find a unique slot in this array, use (seq % WINDOW_SIZE). */
    sfi = WINDOW_SIZE_MOD(sseg->ss_base.us_btl_header->pkt_seq);
    endpoint->endpoint_sent_segs[sfi] = sseg;
    sseg->ss_ack_pending = true;

    /* bookkeeping */
    --endpoint->endpoint_send_credits;

    /* Stats */
    if (sseg->ss_parent_frag->sf_base.uf_type
        == OPAL_BTL_USNIC_FRAG_LARGE_SEND) {
        ++module->stats.num_chunk_sends;
    } else {
        ++module->stats.num_frag_sends;
    }

    /* If we have room in the sender's window, we also have room in the
       endpoint hotel */
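    /* The hotel tracks sent-but-unACKed segments: if a segment's ACK
       does not arrive before the hotel timeout fires, the occupant is
       evicted and the segment is scheduled for retransmission. */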
    opal_hotel_checkin_with_res(&endpoint->endpoint_hotel, sseg,
                                &sseg->ss_hotel_room);
}
/*
 * This enqueues a fragment send into the system.  A send of a fragment
 * may result in the sending of multiple segments.
 */
static inline int
opal_btl_usnic_endpoint_enqueue_frag(
    opal_btl_usnic_endpoint_t *endpoint,
    opal_btl_usnic_send_frag_t *frag)
{
#if MSGDEBUG1
    opal_output(0, "enq_frag: frag=%p, endpoint=%p, %s, len=%lu\n",
                (void*)frag, (void*)endpoint,
                usnic_frag_type(frag->sf_base.uf_type),
                (long unsigned)frag->sf_base.uf_base.des_src->seg_len);
    if (frag->sf_base.uf_type == OPAL_BTL_USNIC_FRAG_LARGE_SEND) {
        opal_btl_usnic_large_send_frag_t *lfrag;
        lfrag = (opal_btl_usnic_large_send_frag_t *)frag;
        opal_output(0, "  large size=%zd\n", lfrag->lsf_base.sf_size);
    }
#endif

    /* add to tail of in-progress list */
    opal_list_append(&endpoint->endpoint_frag_send_queue,
                     &frag->sf_base.uf_base.super.super);

    /* possibly make this endpoint ready to send again */
    opal_btl_usnic_check_rts(endpoint);

    return OPAL_SUCCESS;
}
static inline void
opal_btl_usnic_release_send_segment(
    opal_btl_usnic_module_t *module,
    opal_btl_usnic_send_frag_t *frag,
    opal_btl_usnic_send_segment_t *sseg)
{
    /* We only return CHUNK segments because they are the only send-style
     * segments that are dynamically allocated.
     */
    if (sseg->ss_base.us_type == OPAL_BTL_USNIC_SEG_CHUNK) {
        opal_btl_usnic_chunk_segment_return(module, sseg);
    }
}
void opal_btl_usnic_frag_complete(opal_btl_usnic_send_frag_t *frag);

void opal_btl_usnic_frag_send_complete(opal_btl_usnic_module_t *module,
                                       opal_btl_usnic_send_segment_t *sseg);

void opal_btl_usnic_chunk_send_complete(opal_btl_usnic_module_t *module,
                                        opal_btl_usnic_send_segment_t *sseg);

int
opal_btl_usnic_finish_put_or_send(
    opal_btl_usnic_module_t *module,
    opal_btl_usnic_endpoint_t *endpoint,
    opal_btl_usnic_send_frag_t *frag,
    mca_btl_base_tag_t tag)
    __opal_attribute_noinline__;

#endif /* BTL_USNIC_SEND_H */