1
1
openmpi/ompi/mca/btl/usnic/btl_usnic_endpoint.h
Dave Goodell 82db913490 usnic: fix module_recv_buffers perf regression
Cisco v1.6 git commit 913ec6c and upstream trunk r29593 (segfault fix)
introduced a performance regression by inadvertently disabling the
`module_recv_buffers` functionality.  With those changes in place, the
`btl_usnic_recv.c` logic would end up mallocing a buffer that should
have otherwise come from a `module_recv_buffers` pool.  It also resulted
in a small, bounded memory leak (128 buffers at each power-of-two size
interval).

The new version just places the buffer after the free list item with a
flexible array member.  I bumped the pool to allocate all 128 elements
up front because the deferred allocation was modestly impacting IMB
Sendrecv performance at a few sizes.

Reviewed-by: Reese Faucette <rfaucett@cisco.com>

This commit was SVN r29631.

The following SVN revision numbers were found above:
  r29593 --> open-mpi/ompi@1ed9b8ff43
2013-11-07 01:27:31 +00:00

186 строки
6.5 KiB
C

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OMPI_BTL_USNIC_ENDPOINT_H
#define OMPI_BTL_USNIC_ENDPOINT_H
#include <infiniband/verbs.h>
#include "opal/class/opal_list.h"
#include "opal/class/opal_hotel.h"
#include "opal/mca/event/event.h"
#include "btl_usnic.h"
BEGIN_C_DECLS
/*
* Forward declarations to avoid include loops
*/
struct ompi_btl_usnic_module_t;
struct ompi_btl_usnic_send_segment_t;
/*
* Have the window size as a compile-time constant that is a power of
* two so that we can take advantage of fast bit operations.
*/
#define WINDOW_SIZE 4096
#define WINDOW_SIZE_MOD(a) (((a) & (WINDOW_SIZE - 1)))
#define WINDOW_OPEN(E) ((E)->endpoint_next_seq_to_send < \
((E)->endpoint_ack_seq_rcvd + WINDOW_SIZE))
#define WINDOW_EMPTY(E) ((E)->endpoint_ack_seq_rcvd == \
((E)->endpoint_next_seq_to_send-1))
/*
* Returns true when an endpoint has nothing left to send
*/
#define ENDPOINT_DRAINED(E) (WINDOW_EMPTY(E) && \
opal_list_is_empty(&(E)->endpoint_frag_send_queue))
/*
* Channel IDs
*/
typedef enum ompi_btl_usnic_channel_id_t {
USNIC_PRIORITY_CHANNEL,
USNIC_DATA_CHANNEL,
USNIC_NUM_CHANNELS
} ompi_btl_usnic_channel_id_t;
typedef struct ompi_btl_usnic_addr_t {
ompi_btl_usnic_seq_t isn;
uint32_t qp_num[USNIC_NUM_CHANNELS];
union ibv_gid gid;
uint32_t ipv4_addr;
uint32_t cidrmask;
uint8_t mac[6];
int mtu;
} ompi_btl_usnic_addr_t;
struct ompi_btl_usnic_send_segment_t;
struct ompi_btl_usnic_proc_t;
/*
* This is a descriptor for an incoming fragment that is broken
* into chunks. When the first reference to this frag_id is seen,
* memory is allocated for it. When the last byte arrives, the assembled
* fragment is passed to the PML.
*
* The endpoint structure has space for WINDOW_SIZE/2 simultaneous fragments.
* This is the largest number of fragments that can possibly be in-flight
* to us from a particular endpoint because eash chunked fragment will occupy
* at least two segments, and only WINDOW_SIZE segments can be in flight.
* OK, so there is an extremely pathological case where we could see
* (WINDOW_SIZE/2)+1 "in flight" at once, but just dropping that last one
* and waiting for retrans is just fine in this hypothetical hyper-pathological
* case, which is what we'll do.
*/
#define MAX_ACTIVE_FRAGS (WINDOW_SIZE/2)
typedef struct ompi_btl_usnic_rx_frag_info_t {
uint32_t rfi_frag_id; /* ID for this fragment */
uint32_t rfi_frag_size; /* bytes in this fragment */
uint32_t rfi_bytes_left; /* bytes remaining to RX in fragment */
char *rfi_data; /* pointer to assembly area */
int rfi_data_pool; /* if 0, data malloced, else rx buf pool */
ompi_free_list_item_t *rfi_fl_elt; /* free list elemement from buf pool
when rfi_data_pool is nonzero */
} ompi_btl_usnic_rx_frag_info_t;
/**
* An abstraction that represents a connection to a remote process.
* An instance of mca_btl_base_endpoint_t is associated with each
* (btl_usnic_proc_t, btl_usnic_module_t) tuple and address
* information is exchanged at startup. The usnic BTL is
* connectionless, so no connection is ever established.
*/
typedef struct mca_btl_base_endpoint_t {
opal_list_item_t super;
/** BTL module that created this connection */
struct ompi_btl_usnic_module_t *endpoint_module;
/** proc that owns this endpoint */
struct ompi_btl_usnic_proc_t *endpoint_proc;
int endpoint_proc_index; /* index in owning proc's endpoint array */
/** True when proc has been deleted, but still have sends that need ACKs */
bool endpoint_exiting;
/** List item for linking into module "all_endpoints" */
opal_list_item_t endpoint_endpoint_li;
/** List item for linking into "need ack" */
opal_list_item_t endpoint_ack_li;
/** Remote address information */
ompi_btl_usnic_addr_t endpoint_remote_addr;
/** Remote address handle */
struct ibv_ah* endpoint_remote_ah;
/** Send-related data */
bool endpoint_ready_to_send;
opal_list_t endpoint_frag_send_queue;
int32_t endpoint_send_credits;
uint32_t endpoint_next_frag_id;
/** Receive-related data */
struct ompi_btl_usnic_rx_frag_info_t *endpoint_rx_frag_info;
/** OPAL hotel to track outstanding stends */
opal_hotel_t endpoint_hotel;
/** Sliding window parameters for this peer */
/* Values for the current proc to send to this endpoint on the
peer proc */
ompi_btl_usnic_seq_t endpoint_next_seq_to_send; /* n_t */
ompi_btl_usnic_seq_t endpoint_ack_seq_rcvd; /* n_a */
struct ompi_btl_usnic_send_segment_t *endpoint_sent_segs[WINDOW_SIZE];
/* Values for the current proc to receive from this endpoint on
the peer proc */
bool endpoint_ack_needed;
/* When we receive a packet that needs an ACK, set this
* to delay the ACK to allow for piggybacking
*/
uint64_t endpoint_acktime;
ompi_btl_usnic_seq_t endpoint_next_contig_seq_to_recv; /* n_r */
ompi_btl_usnic_seq_t endpoint_highest_seq_rcvd; /* n_s */
bool endpoint_rcvd_segs[WINDOW_SIZE];
uint32_t endpoint_rfstart;
} mca_btl_base_endpoint_t;
typedef mca_btl_base_endpoint_t ompi_btl_usnic_endpoint_t;
OBJ_CLASS_DECLARATION(ompi_btl_usnic_endpoint_t);
/*
* Flush all pending sends and resends from and endpoint
*/
void
ompi_btl_usnic_flush_endpoint(
ompi_btl_usnic_endpoint_t *endpoint);
END_C_DECLS
#endif