1
1
openmpi/ompi/mca/btl/usnic/btl_usnic_endpoint.h
Dave Goodell 1ed9b8ff43 usnic: fix segfault at finalize time
Without this commit, if you run IMB pingpong between two nodes with only
one usnic selected (e.g., via `--mca btl_usnic_if_include usnic_0`) then
the run will seem fine but will segfault at MPI_Finalize time.

This behavior has happened since Cisco v1.6 git commit ec7ddf8, upstream
trunk r29484, and upstream v1.7 r29507.

Root cause was that the free list element was being used as the recv
buffer instead of the data buffer associated with the element.  So the
reassembly code would stomp all over the free list element, which would
cause the destructor to explode when the free list attempted to clean up
all of its elements.  This surprisingly did not cause any other problems
until now.

Reviewed-by: Reese Faucette <rfaucett@cisco.com>

This commit was SVN r29593.

The following SVN revision numbers were found above:
  r29484 --> open-mpi/ompi@a6ed232a10
  r29507 --> open-mpi/ompi@790d269ce8
2013-11-04 22:52:14 +00:00

187 строки
6.6 KiB
C

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OMPI_BTL_USNIC_ENDPOINT_H
#define OMPI_BTL_USNIC_ENDPOINT_H
#include <infiniband/verbs.h>
#include "opal/class/opal_list.h"
#include "opal/class/opal_hotel.h"
#include "opal/mca/event/event.h"
#include "btl_usnic.h"
BEGIN_C_DECLS
/*
* Forward declarations to avoid include loops
*/
struct ompi_btl_usnic_module_t;
struct ompi_btl_usnic_send_segment_t;
/*
 * The sliding-window size is a compile-time constant and must stay a
 * power of two: WINDOW_SIZE_MOD() reduces a value into [0, WINDOW_SIZE)
 * with a single bitwise AND rather than a modulo.
 *
 * Note that the macros taking an endpoint pointer (E) evaluate their
 * argument more than once, so do not pass an expression with side
 * effects.
 */
#define WINDOW_SIZE 4096
#define WINDOW_SIZE_MOD(a) ((a) & (WINDOW_SIZE - 1))

/*
 * True while the next sequence number to send is still below the last
 * ACKed sequence number plus WINDOW_SIZE.
 */
#define WINDOW_OPEN(E) \
    (((E)->endpoint_ack_seq_rcvd + WINDOW_SIZE) > \
     (E)->endpoint_next_seq_to_send)

/*
 * True when the last ACKed sequence number is the one immediately
 * preceding the next sequence number to send.
 */
#define WINDOW_EMPTY(E) \
    (((E)->endpoint_next_seq_to_send - 1) == \
     (E)->endpoint_ack_seq_rcvd)

/*
 * Returns true when an endpoint has nothing left to send: the window
 * is empty and its fragment send queue is empty too.
 */
#define ENDPOINT_DRAINED(E) \
    (WINDOW_EMPTY(E) && \
     opal_list_is_empty(&(E)->endpoint_frag_send_queue))
/*
 * Channel IDs.  USNIC_NUM_CHANNELS is not a channel itself: it is the
 * number of real channels and is used to size per-channel arrays.
 */
typedef enum ompi_btl_usnic_channel_id_t {
    USNIC_PRIORITY_CHANNEL = 0,
    USNIC_DATA_CHANNEL     = 1,
    USNIC_NUM_CHANNELS     = 2
} ompi_btl_usnic_channel_id_t;
/*
 * Addressing information for a usnic peer.  An instance is embedded in
 * the endpoint structure as endpoint_remote_addr.
 */
typedef struct ompi_btl_usnic_addr_t {
/* NOTE(review): presumably the initial sequence number -- confirm */
ompi_btl_usnic_seq_t isn;
/* One entry per channel (indexed by ompi_btl_usnic_channel_id_t);
   presumably the queue pair number of each channel -- confirm */
uint32_t qp_num[USNIC_NUM_CHANNELS];
/* Verbs GID */
union ibv_gid gid;
/* IPv4 address and CIDR mask.
   NOTE(review): byte order and mask encoding are not visible here */
uint32_t ipv4_addr;
uint32_t cidrmask;
/* 6-byte MAC address */
uint8_t mac[6];
/* MTU; NOTE(review): units/what is included not visible here */
int mtu;
} ompi_btl_usnic_addr_t;
/*
 * Forward declarations: only pointers to these types are used below.
 * NOTE(review): ompi_btl_usnic_send_segment_t is already forward-declared
 * near the top of this header, so repeating it here is redundant
 * (harmless, but could be dropped).
 */
struct ompi_btl_usnic_send_segment_t;
struct ompi_btl_usnic_proc_t;
/*
 * This is a descriptor for an incoming fragment that is broken
 * into chunks.  When the first reference to this frag_id is seen,
 * memory is allocated for it.  When the last byte arrives, the assembled
 * fragment is passed to the PML.
 *
 * The endpoint structure has space for WINDOW_SIZE/2 simultaneous fragments.
 * This is the largest number of fragments that can possibly be in-flight
 * to us from a particular endpoint because each chunked fragment will occupy
 * at least two segments, and only WINDOW_SIZE segments can be in flight.
 * OK, so there is an extremely pathological case where we could see
 * (WINDOW_SIZE/2)+1 "in flight" at once, but just dropping that last one
 * and waiting for retransmission is just fine in this hypothetical
 * hyper-pathological case, which is what we'll do.
 */
#define MAX_ACTIVE_FRAGS (WINDOW_SIZE/2)
typedef struct ompi_btl_usnic_rx_frag_info_t {
uint32_t rfi_frag_id; /* ID for this fragment */
uint32_t rfi_frag_size; /* total bytes in this fragment */
uint32_t rfi_bytes_left; /* bytes remaining to RX in fragment */
char *rfi_data; /* pointer to assembly area */
int rfi_data_pool; /* if 0, data malloced, else rx buf pool */
ompi_free_list_item_t *rfi_fl_elt; /* free list element from buf pool
(rfi_fl_elt->ptr==rfi_data) when
rfi_data_pool is nonzero */
} ompi_btl_usnic_rx_frag_info_t;
/**
 * An abstraction that represents a connection to a remote process.
 * An instance of mca_btl_base_endpoint_t is associated with each
 * (btl_usnic_proc_t, btl_usnic_module_t) tuple and address
 * information is exchanged at startup.  The usnic BTL is
 * connectionless, so no connection is ever established.
 *
 * The structure holds both directions of the per-peer sliding window:
 * the send side (n_t, n_a, endpoint_sent_segs) and the receive side
 * (n_r, n_s, endpoint_rcvd_segs).  Both arrays have WINDOW_SIZE slots.
 */
typedef struct mca_btl_base_endpoint_t {
/** Base list item (first member) */
opal_list_item_t super;
/** BTL module that created this connection */
struct ompi_btl_usnic_module_t *endpoint_module;
/** proc that owns this endpoint */
struct ompi_btl_usnic_proc_t *endpoint_proc;
int endpoint_proc_index; /* index in owning proc's endpoint array */
/** True when proc has been deleted, but still have sends that need ACKs */
bool endpoint_exiting;
/** List item for linking into module "all_endpoints" */
opal_list_item_t endpoint_endpoint_li;
/** List item for linking into "need ack" */
opal_list_item_t endpoint_ack_li;
/** Remote address information */
ompi_btl_usnic_addr_t endpoint_remote_addr;
/** Remote address handle */
struct ibv_ah* endpoint_remote_ah;
/** Send-related data */
bool endpoint_ready_to_send;
/* Queue of fragments waiting to be sent; ENDPOINT_DRAINED() checks
   that this list is empty */
opal_list_t endpoint_frag_send_queue;
/* NOTE(review): credit accounting semantics are not visible in this
   header -- confirm against the send path */
int32_t endpoint_send_credits;
/* NOTE(review): presumably the ID to assign to the next outgoing
   fragment -- confirm */
uint32_t endpoint_next_frag_id;
/** Receive-related data */
/* NOTE(review): presumably an array of MAX_ACTIVE_FRAGS descriptors
   allocated elsewhere -- confirm against the endpoint constructor */
struct ompi_btl_usnic_rx_frag_info_t *endpoint_rx_frag_info;
/** OPAL hotel to track outstanding sends */
opal_hotel_t endpoint_hotel;
/** Sliding window parameters for this peer */
/* Values for the current proc to send to this endpoint on the
   peer proc */
ompi_btl_usnic_seq_t endpoint_next_seq_to_send; /* n_t */
ompi_btl_usnic_seq_t endpoint_ack_seq_rcvd; /* n_a */
/* One slot per window position; NOTE(review): presumably indexed by
   WINDOW_SIZE_MOD(sequence number) -- confirm */
struct ompi_btl_usnic_send_segment_t *endpoint_sent_segs[WINDOW_SIZE];
/* Values for the current proc to receive from this endpoint on
   the peer proc */
bool endpoint_ack_needed;
/* When we receive a packet that needs an ACK, set this
 * to delay the ACK to allow for piggybacking
 */
uint64_t endpoint_acktime;
ompi_btl_usnic_seq_t endpoint_next_contig_seq_to_recv; /* n_r */
ompi_btl_usnic_seq_t endpoint_highest_seq_rcvd; /* n_s */
/* One flag per window position; NOTE(review): presumably indexed by
   WINDOW_SIZE_MOD(sequence number) -- confirm */
bool endpoint_rcvd_segs[WINDOW_SIZE];
/* NOTE(review): meaning not visible in this header; presumably a
   start index used with endpoint_rx_frag_info -- confirm */
uint32_t endpoint_rfstart;
} mca_btl_base_endpoint_t;
/* usnic-specific name for the generic BTL endpoint type */
typedef mca_btl_base_endpoint_t ompi_btl_usnic_endpoint_t;
OBJ_CLASS_DECLARATION(ompi_btl_usnic_endpoint_t);
/**
 * Flush all pending sends and resends from an endpoint.
 *
 * @param endpoint  the endpoint whose pending sends/resends are flushed
 */
void ompi_btl_usnic_flush_endpoint(ompi_btl_usnic_endpoint_t *endpoint);
END_C_DECLS
#endif