1
1
openmpi/opal/mca/btl/scif/btl_scif_send.c
2015-06-23 20:59:57 -07:00

300 строки
10 KiB
C

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_scif.h"
#include "btl_scif_frag.h"
/*
 * Number of contiguous bytes that can be written at offset (e) of the send
 * ring given the start offset (s). (hbm) is non-zero when the high bits of
 * the start and end values match.
 *
 *  - (s) > (e):                 the space between end and start is free
 *  - (s) == (e) and !(hbm):     evaluates to (s) - (e), i.e. 0 (buffer full)
 *  - otherwise:                 everything from (e) to the end of the segment
 *
 * Every argument is fully parenthesized so arbitrary expressions can be
 * passed. NOTE: (s) and (e) are evaluated more than once; do not pass
 * expressions with side effects.
 */
#define BUFFER_FREE(s,e,hbm) \
    (((s) > (e) || ((s) == (e) && !(hbm))) ? (s) - (e) : (mca_btl_scif_component.segment_size - (e)))
/**
 * Attempt to reserve a contiguous segment from the remote endpoint.
 *
 * The start and end values kept in endpoint->send_buffer use bit 31 as a
 * flag that is toggled whenever the end offset wraps; the low 31 bits are the
 * byte offset into the segment. Comparing the two high bits is what
 * distinguishes an empty buffer from a full one when the offsets are equal.
 *
 * If the cached view does not have enough room the start value is re-read
 * from the peer. If the room left before the end of the segment is still too
 * small, a marker header (tag 0xff) covering that tail is written and the end
 * offset wraps to 64 (presumably the first usable offset of the segment --
 * confirm against the segment layout).
 *
 * @param endpoint  endpoint whose send buffer the space is reserved from
 * @param size      payload size in bytes; the fragment header size is added
 *                  by this function
 * @param dst       (OUT) start of the reserved region (the header location)
 *
 * @returns OPAL_SUCCESS when the region was reserved and send_buffer.end was
 *          advanced (rounded up to a 64 byte boundary);
 *          OPAL_ERR_OUT_OF_RESOURCE when there is not enough room. On failure
 *          send_buffer.start may have been refreshed and the end offset may
 *          already have been wrapped.
 */
static inline int mca_btl_scif_send_get_buffer (mca_btl_base_endpoint_t *endpoint, size_t size, unsigned char * restrict *dst)
{
    /* mask for the wrap flag. built from an unsigned constant because
     * 1 << 31 overflows a signed int, which is undefined behavior */
    const unsigned int high_bit = 1u << 31;
    const unsigned int segment_size = mca_btl_scif_component.segment_size;
    /* the high bit helps determine if the buffer is empty or full */
    bool hbm = (endpoint->send_buffer.start >> 31) == (endpoint->send_buffer.end >> 31);
    unsigned int start = endpoint->send_buffer.start & ~high_bit;
    unsigned int end = endpoint->send_buffer.end & ~high_bit;
    unsigned int buffer_free = BUFFER_FREE(start, end, hbm);
#if defined(SCIF_TIMING)
    struct timespec ts;

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
#endif

    /* need space for the fragment + the header */
    size += sizeof (mca_btl_scif_frag_hdr_t);

    /* check if we need to free up space for this fragment */
    if (OPAL_UNLIKELY(buffer_free < size)) {
        BTL_VERBOSE(("not enough room for a fragment of size %u. in use buffer segment: {start: %x, end: %x, high bit matches: %d}\n",
                     (unsigned) size, start, end, (int) hbm));

        /* read the current start pointer from the remote peer */
        start = endpoint->send_buffer.start = endpoint->send_buffer.startp[0];
        start &= ~high_bit;
        hbm = (endpoint->send_buffer.start >> 31) == (endpoint->send_buffer.end >> 31);
        buffer_free = BUFFER_FREE(start, end, hbm);

        opal_atomic_rmb ();

        /* if this is the end of the buffer. does the fragment fit? */
        if (OPAL_UNLIKELY(buffer_free > 0 && buffer_free < size && start <= end)) {
            mca_btl_scif_frag_hdr_t hdr;
#if defined(SCIF_USE_SEQ)
            uint64_t hdr_word, *slot = (uint64_t *) (endpoint->send_buffer.buffer + end);
#else
            uint32_t hdr_word, *slot = (uint32_t *) (endpoint->send_buffer.buffer + end);
#endif

            /* marker fragment that spans the unusable tail of the segment */
            hdr.size = buffer_free - sizeof (mca_btl_scif_frag_hdr_t);
            hdr.tag = 0xff;
#if defined(SCIF_USE_SEQ)
            hdr.seq = endpoint->seq_next++;
#endif

            /* build the header word with memcpy (reading the struct through
             * an integer pointer violates strict aliasing) and publish it
             * with a single store */
            memcpy (&hdr_word, &hdr, sizeof (hdr_word));
            slot[0] = hdr_word;

            /* wrap around and toggle the high bit */
            end = 64;
            endpoint->send_buffer.end = ((endpoint->send_buffer.end & high_bit) ^ high_bit) | end;
            hbm = (endpoint->send_buffer.start >> 31) == (endpoint->send_buffer.end >> 31);
            buffer_free = BUFFER_FREE(start, end, hbm);
        }

        if (OPAL_UNLIKELY(buffer_free < size)) {
#if defined(SCIF_TIMING)
            SCIF_UPDATE_TIMER(mca_btl_scif_component.aquire_buffer_time, mca_btl_scif_component.aquire_buffer_time_max, ts);
#endif
            return OPAL_ERR_OUT_OF_RESOURCE;
        }
    }

    BTL_VERBOSE(("writing fragment of size %u to offset %u {start: %x, end: %x} of peer's buffer. free = %u",
                 (unsigned int) size, end, start, end, buffer_free));

    *dst = endpoint->send_buffer.buffer + end;

    /* align the buffer on a 64 byte boundary */
    end = (end + size + 63) & ~63;

    if (OPAL_UNLIKELY(segment_size == end)) {
        /* the fragment ends exactly at the end of the segment: wrap now */
        endpoint->send_buffer.end = ((endpoint->send_buffer.end & high_bit) ^ high_bit) | 64;
    } else {
        endpoint->send_buffer.end = (endpoint->send_buffer.end & high_bit) | end;
    }

#if defined(SCIF_TIMING)
    SCIF_UPDATE_TIMER(mca_btl_scif_component.aquire_buffer_time, mca_btl_scif_component.aquire_buffer_time_max, ts);
#endif

    return OPAL_SUCCESS;
}
/*
 * Publish the locally tracked end value of the send buffer by storing it to
 * send_buffer.endp[0].
 *
 * When the peer's port is on the same node a memory barrier precedes the
 * store. Otherwise startp[0] is read first to flush earlier writes, and the
 * cached start value is refreshed from startp[0] after the store.
 */
static void mark_buffer (struct mca_btl_base_endpoint_t *endpoint)
{
    if (endpoint->port_id.node == mca_btl_scif_module.port_id.node) {
        /* same node: order the earlier writes before the new end value */
        MB();
        endpoint->send_buffer.endp[0] = endpoint->send_buffer.end;
    } else {
        /* force the PCIe bus to flush by reading from the remote node */
        volatile uint32_t flush_read = endpoint->send_buffer.startp[0];
        (void) flush_read;

        endpoint->send_buffer.endp[0] = endpoint->send_buffer.end;

        /* pick up the latest start value from the peer */
        endpoint->send_buffer.start = endpoint->send_buffer.startp[0];
    }
}
/**
 * Copy a fragment into the send buffer and complete it.
 *
 * Space for frag->hdr.size payload bytes is reserved with
 * mca_btl_scif_send_get_buffer(). The payload (segment 0, then segment 1 if
 * it is not empty) is copied behind the header slot first; the header itself
 * is written last, followed by a write barrier. The end value is NOT
 * published here; the callers in this file invoke mark_buffer() afterwards.
 *
 * @param endpoint  endpoint to send to
 * @param frag      fragment to send; frag->hdr must already be filled in
 *
 * @returns 1 if the fragment was written (it has then been completed with
 *          OPAL_SUCCESS), OPAL_ERR_OUT_OF_RESOURCE if no buffer space could
 *          be reserved (the fragment is left untouched).
 */
static int mca_btl_scif_send_frag (struct mca_btl_base_endpoint_t *endpoint,
                                   mca_btl_scif_base_frag_t *frag)
{
    size_t size = frag->hdr.size;
    unsigned char * restrict dst;

    BTL_VERBOSE(("btl/scif sending descriptor %p from %d -> %d. length = %" PRIu64, (void *) frag,
                 OPAL_PROC_MY_NAME.vpid, endpoint->peer_proc->proc_name.vpid, frag->segments[0].seg_len));

    if (OPAL_LIKELY(OPAL_SUCCESS == mca_btl_scif_send_get_buffer (endpoint, size, &dst))) {
        unsigned char * restrict data = (unsigned char * restrict) frag->segments[0].seg_addr.pval;
#if defined(SCIF_USE_SEQ)
        uint64_t hdr_word;
#else
        uint32_t hdr_word;
#endif
#if defined(SCIF_TIMING)
        struct timespec ts;

        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
#endif

        /* payload goes right behind the header slot */
        memcpy (dst + sizeof (frag->hdr), data, frag->segments[0].seg_len);

        if (frag->segments[1].seg_len) {
            memcpy (dst + sizeof (frag->hdr) + frag->segments[0].seg_len,
                    frag->segments[1].seg_addr.pval,
                    frag->segments[1].seg_len);
        }

#if defined(SCIF_USE_SEQ)
        frag->hdr.seq = endpoint->seq_next++;
#endif

        /* build the header word with memcpy: reading the header struct
         * through an integer pointer violates strict aliasing and may be a
         * misaligned access */
        memcpy (&hdr_word, &frag->hdr, sizeof (hdr_word));

        /* write the tag to signal the fragment is available */
#if defined(SCIF_USE_SEQ)
        ((uint64_t *) dst)[0] = hdr_word;
#else
        ((uint32_t *) dst)[0] = hdr_word;
#endif

        opal_atomic_wmb ();

#if defined(SCIF_TIMING)
        SCIF_UPDATE_TIMER(mca_btl_scif_component.send_time, mca_btl_scif_component.send_time_max, ts);
#endif

        /* fragment is gone */
        mca_btl_scif_frag_complete (frag, OPAL_SUCCESS);

        return 1;
    }

    return OPAL_ERR_OUT_OF_RESOURCE;
}
/**
 * Send a descriptor to an endpoint, queueing it if it cannot go out now.
 *
 * The fragment header is filled in with the tag and the combined length of
 * both segments. If the endpoint is not connected a connection attempt is
 * made first. The fragment is then written with mca_btl_scif_send_frag() and
 * the new end value is published with mark_buffer().
 *
 * If the endpoint is still not connected, or the fragment could not be
 * written, MCA_BTL_DES_SEND_ALWAYS_CALLBACK is set on the descriptor and it
 * is appended to the endpoint's frag_wait_list.
 *
 * @param btl         btl module (not used by this function)
 * @param endpoint    endpoint to send to
 * @param descriptor  descriptor to send (a mca_btl_scif_base_frag_t)
 * @param tag         tag stored in the fragment header
 *
 * @returns 1 if the fragment was written immediately, OPAL_SUCCESS if it was
 *          queued.
 */
int mca_btl_scif_send (struct mca_btl_base_module_t *btl,
                       struct mca_btl_base_endpoint_t *endpoint,
                       struct mca_btl_base_descriptor_t *descriptor,
                       mca_btl_base_tag_t tag)
{
    mca_btl_scif_base_frag_t *frag = (mca_btl_scif_base_frag_t *) descriptor;
    size_t total_len = frag->segments[0].seg_len + frag->segments[1].seg_len;
    bool connected;

    frag->hdr.tag = tag;
    frag->hdr.size = total_len;

    connected = (MCA_BTL_SCIF_EP_STATE_CONNECTED == endpoint->state);
    if (OPAL_UNLIKELY(!connected)) {
        /* only the resulting endpoint state matters here */
        (void) mca_btl_scif_ep_connect (endpoint);
        connected = (MCA_BTL_SCIF_EP_STATE_CONNECTED == endpoint->state);
    }

    if (OPAL_LIKELY(connected && 1 == mca_btl_scif_send_frag (endpoint, frag))) {
        mark_buffer (endpoint);
        return 1;
    }

    /* the receiver was not ready to handle the fragment. queue up the fragment. */
    descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    opal_list_append (&endpoint->frag_wait_list, (opal_list_item_t *) descriptor);

    return OPAL_SUCCESS;
}
/**
 * Immediate send: pack a header and payload straight into the send buffer
 * without allocating a descriptor.
 *
 * @param btl           btl module (not used by this function)
 * @param endpoint      endpoint to send to
 * @param convertor     convertor used to pack payload_size bytes of payload
 * @param header        match header copied right behind the fragment header
 * @param header_size   size of header in bytes
 * @param payload_size  number of payload bytes to pack (may be 0)
 * @param order         not used by this function
 * @param flags         descriptor flags; only checked by an assertion
 *                      (MCA_BTL_DES_SEND_ALWAYS_CALLBACK must not be set)
 * @param tag           tag stored in the fragment header
 * @param descriptor    (OUT) may be NULL. This function never allocates a
 *                      descriptor; on every failure path *descriptor is set
 *                      to NULL so the caller never sees a stale value.
 *
 * @returns OPAL_SUCCESS if the fragment was written and published,
 *          OPAL_ERR_RESOURCE_BUSY if the endpoint is not connected (yet),
 *          OPAL_ERR_OUT_OF_RESOURCE if no buffer space could be reserved.
 */
int mca_btl_scif_sendi (struct mca_btl_base_module_t *btl,
                        struct mca_btl_base_endpoint_t *endpoint,
                        struct opal_convertor_t *convertor,
                        void *header, size_t header_size,
                        size_t payload_size, uint8_t order,
                        uint32_t flags, mca_btl_base_tag_t tag,
                        mca_btl_base_descriptor_t **descriptor)
{
    size_t length = (header_size + payload_size);
    unsigned char * restrict base;
    mca_btl_scif_frag_hdr_t hdr;
#if defined(SCIF_USE_SEQ)
    uint64_t hdr_word;
#else
    uint32_t hdr_word;
#endif
    size_t max_data;
    int rc;
#if defined(SCIF_TIMING)
    struct timespec ts;
#endif

    assert (length < mca_btl_scif_module.super.btl_eager_limit);
    assert (0 == (flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK));

    if (OPAL_UNLIKELY(MCA_BTL_SCIF_EP_STATE_CONNECTED != endpoint->state)) {
        rc = mca_btl_scif_ep_connect (endpoint);

        if (OPAL_UNLIKELY(MCA_BTL_SCIF_EP_STATE_CONNECTED != endpoint->state)) {
            /* no descriptor is handed back, same as the out-of-resource
             * path below */
            if (NULL != descriptor) {
                *descriptor = NULL;
            }

            return OPAL_ERR_RESOURCE_BUSY;
        }
    }

    rc = mca_btl_scif_send_get_buffer (endpoint, length, &base);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        if (NULL != descriptor) {
            *descriptor = NULL;
        }

        return OPAL_ERR_OUT_OF_RESOURCE;
    }

#if defined(SCIF_TIMING)
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
#endif

    /* fill in the fragment header. it is written to the buffer last */
    hdr.size = length;
    hdr.tag = tag;
#if defined(SCIF_USE_SEQ)
    hdr.seq = endpoint->seq_next++;
#endif

    /* write the match header (with MPI comm/tag/etc. info) */
    memcpy (base + sizeof (hdr), header, header_size);

    if (payload_size) {
        uint32_t iov_count = 1;
        struct iovec iov[1];

        iov[0].iov_base = base + sizeof (hdr) + header_size;
        iov[0].iov_len = payload_size;

        /* move the data */
        opal_convertor_pack (convertor, iov, &iov_count, &max_data);

        assert (max_data == payload_size);
    }

    /* build the header word with memcpy: reading the header struct through
     * an integer pointer violates strict aliasing */
    memcpy (&hdr_word, &hdr, sizeof (hdr_word));

    /* signal the remote side that this fragment is available */
#if defined(SCIF_USE_SEQ)
    ((uint64_t *) base)[0] = hdr_word;
#else
    ((uint32_t *) base)[0] = hdr_word;
#endif

    opal_atomic_wmb ();

    /* publish the new end value */
    mark_buffer (endpoint);

#if defined(SCIF_TIMING)
    SCIF_UPDATE_TIMER(mca_btl_scif_component.sendi_time, mca_btl_scif_component.sendi_time_max, ts);
#endif

    return OPAL_SUCCESS;
}
/**
 * Try to send the fragments queued on an endpoint's frag_wait_list.
 *
 * Fragments are taken from the front of the list and handed to
 * mca_btl_scif_send_frag() until the list is empty or a send fails. A
 * fragment that fails with OPAL_ERR_OUT_OF_RESOURCE is put back at the front
 * of the list; any other error completes the fragment with that error code.
 * In either case processing stops. mark_buffer() is always called before
 * returning.
 *
 * @param endpoint  endpoint whose wait list is processed
 *
 * @returns OPAL_SUCCESS (always)
 */
int mca_btl_scif_progress_send_wait_list (mca_btl_base_endpoint_t *endpoint)
{
    for (;;) {
        mca_btl_scif_base_frag_t *frag =
            (mca_btl_scif_base_frag_t *) opal_list_remove_first (&endpoint->frag_wait_list);
        int status;

        if (NULL == frag) {
            /* nothing left to send */
            break;
        }

        status = mca_btl_scif_send_frag (endpoint, frag);
        if (OPAL_LIKELY(status >= OPAL_SUCCESS)) {
            /* sent (and completed by mca_btl_scif_send_frag). try the next one */
            continue;
        }

        if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == status)) {
            /* still no room. keep the fragment at the head of the list */
            opal_list_prepend (&endpoint->frag_wait_list, (opal_list_item_t *) frag);
        } else {
            mca_btl_scif_frag_complete (frag, status);
        }

        break;
    }

    mark_buffer (endpoint);

    return OPAL_SUCCESS;
}