8b77359cac
We only support running with libfabric v1.3 or greater. So it's safe to remove the legacy/adaptive cq_readerr() behavior. Signed-off-by: Jeff Squyres <jsquyres@cisco.com>
287 строки
8.4 KiB
C
287 строки
8.4 KiB
C
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
/*
|
|
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
|
* reserved.
|
|
* Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved.
|
|
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
|
|
* reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
/**
|
|
* @file
|
|
*/
|
|
#ifndef OPAL_BTL_USNIC_H
|
|
#define OPAL_BTL_USNIC_H
|
|
|
|
#include "opal_config.h"
|
|
#include <sys/types.h>
|
|
|
|
#include "opal_stdint.h"
|
|
#include "opal/util/alfg.h"
|
|
#include "opal/class/opal_hash_table.h"
|
|
#include "opal/class/opal_hash_table.h"
|
|
#include "opal/mca/event/event.h"
|
|
|
|
#if BTL_IN_OPAL
|
|
#include "opal/mca/btl/btl.h"
|
|
#include "opal/mca/btl/base/btl_base_error.h"
|
|
#include "opal/mca/btl/base/base.h"
|
|
#include "opal/mca/rcache/rcache.h"
|
|
|
|
#include "btl_usnic_compat.h"
|
|
|
|
#if RCACHE_VERSION < 30
|
|
#include "opal/mca/mpool/grdma/mpool_grdma.h"
|
|
#endif
|
|
#else
|
|
#include "ompi/mca/btl/btl.h"
|
|
#include "ompi/mca/btl/base/btl_base_error.h"
|
|
#include "ompi/mca/btl/base/base.h"
|
|
#include "ompi/mca/mpool/grdma/mpool_grdma.h"
|
|
#endif
|
|
|
|
BEGIN_C_DECLS
|
|
|
|
/*
|
|
* We're simulating a clock as best we can without resorting to the
|
|
* system. The clock is used to defer ACKs, and ticks will be incremented
|
|
* when progression gets called. It could be incremented by different amounts
|
|
* at other times as needed or as tuning dictates.
|
|
*/
|
|
extern uint64_t opal_btl_usnic_ticks;
|
|
|
|
/* Lock for MPI_THREAD_MULTIPLE support */
|
|
extern opal_recursive_mutex_t btl_usnic_lock;
|
|
|
|
static inline uint64_t
|
|
get_nsec(void)
|
|
{
|
|
return opal_btl_usnic_ticks;
|
|
}
|
|
|
|
/* RNG buffer declaration */
|
|
extern opal_rng_buff_t opal_btl_usnic_rand_buff;
|
|
|
|
/*
 * container_of(ptr, type, member): given a pointer to the field
 * "member" embedded in an object of type "type", compute a pointer to
 * the enclosing object by subtracting the field's byte offset.
 *
 * Only defined here if the including environment has not already
 * provided a container_of macro.  Requires offsetof() to be in scope.
 */
#ifndef container_of
#define container_of(ptr, type, member) \
    ((type *)(((char *)(ptr)) - offsetof(type, member)))
#endif
|
|
|
|
/*
 * max(a, b): yield the larger of the two arguments (b when they compare
 * equal).  Only defined here if no max macro already exists.
 *
 * Each argument appears twice in the expansion, so the selected
 * argument is evaluated twice: do not pass expressions with side
 * effects (e.g., max(i++, j)).
 */
#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif
|
|
|
|
/*
 * Compile-time debug output switches.
 *
 * MSGDEBUG2 is defined in terms of MSGDEBUG1, so turning MSGDEBUG1 on
 * also turns MSGDEBUG2 on.  Defining MSGDEBUG2 before MSGDEBUG1 is fine:
 * the preprocessor only expands MSGDEBUG1 when MSGDEBUG2 is actually
 * used (in the #if below), by which point both are defined.
 */

/* MSGDEBUG2 prints 1 line at each BTL entry point */
#define MSGDEBUG2 (MSGDEBUG1||0)
/* MSGDEBUG1 prints more info about arguments and internal functions */
#define MSGDEBUG1 0

/* output macros to declutter source: each forwards its arguments to
   opal_output(0, ...) when the matching level is enabled, and expands
   to an empty statement otherwise */
#if MSGDEBUG1
#define MSGDEBUG1_OUT(...) opal_output(0, __VA_ARGS__)
#else
#define MSGDEBUG1_OUT(...) do {} while (0)
#endif
#if MSGDEBUG2
#define MSGDEBUG2_OUT(...) opal_output(0, __VA_ARGS__)
#else
#define MSGDEBUG2_OUT(...) do {} while (0)
#endif
|
|
|
|
/*
 * Fault-injection knobs.  Each WANT_* value is a compile-time
 * threshold; when it is >0, the matching FAKE_* macro compares a value
 * drawn from opal_rand() on opal_btl_usnic_rand_buff against that
 * threshold.  When the threshold is 0, the FAKE_* macro is the constant
 * 0 and no random number is drawn.
 */

/* Set to >0 to randomly drop received frags.  The higher the number,
   the more frequent the drops. */
#define WANT_RECV_DROPS 0
/* Set to >0 to randomly fail to send an ACK, mimicking a lost ACK.
   The higher the number, the more frequent the failed-to-send-ACK. */
#define WANT_FAIL_TO_SEND_ACK 0
/* Set to >0 to randomly fail to resend a frag (causing it to be
   requeued to be sent later).  The higher the number, the more frequent
   the failed-to-resend-frag. */
#define WANT_FAIL_TO_RESEND_FRAG 0

#if WANT_RECV_DROPS > 0
#define FAKE_RECV_DROP (opal_rand(&opal_btl_usnic_rand_buff) < WANT_RECV_DROPS)
#else
#define FAKE_RECV_DROP 0
#endif

#if WANT_FAIL_TO_SEND_ACK > 0
#define FAKE_FAIL_TO_SEND_ACK (opal_rand(&opal_btl_usnic_rand_buff) < WANT_FAIL_TO_SEND_ACK)
#else
#define FAKE_FAIL_TO_SEND_ACK 0
#endif

#if WANT_FAIL_TO_RESEND_FRAG > 0
#define FAKE_FAIL_TO_RESEND_FRAG (opal_rand(&opal_btl_usnic_rand_buff) < WANT_FAIL_TO_RESEND_FRAG)
#else
#define FAKE_FAIL_TO_RESEND_FRAG 0
#endif
|
|
|
|
|
|
/**
|
|
* usnic BTL component
|
|
*/
|
|
typedef struct opal_btl_usnic_component_t {
|
|
/** base BTL component */
|
|
mca_btl_base_component_2_0_0_t super;
|
|
|
|
/* in the v1.6 series, sizeof(super) is 256, leading to good alignment for
|
|
* subsequent fastpath fields */
|
|
|
|
/** Maximum number of BTL modules */
|
|
int max_modules;
|
|
/** Number of available/initialized BTL modules */
|
|
int num_modules;
|
|
|
|
/* Cached hashed version of my RTE proc name (to stuff in
|
|
protocol headers) */
|
|
uint64_t my_hashed_rte_name;
|
|
|
|
/** array of possible BTLs (>= num_modules elements) */
|
|
struct opal_btl_usnic_module_t* usnic_all_modules;
|
|
/** array of pointers to active BTLs (num_modules elements) */
|
|
struct opal_btl_usnic_module_t** usnic_active_modules;
|
|
|
|
/** convertor packing threshold */
|
|
int pack_lazy_threshold;
|
|
|
|
/* vvvvvvvvvv non-fastpath fields go below vvvvvvvvvv */
|
|
|
|
/** list of usnic proc structures */
|
|
opal_list_t usnic_procs;
|
|
|
|
#if RCACHE_VERSION == 30
|
|
/** memory pool hints */
|
|
char* usnic_mpool_hints;
|
|
|
|
/** registration cache name */
|
|
char *usnic_rcache_name;
|
|
#else
|
|
/** name of memory pool */
|
|
char* usnic_mpool_name;
|
|
#endif
|
|
|
|
char *if_include;
|
|
char *if_exclude;
|
|
|
|
/** Want stats? */
|
|
bool stats_enabled;
|
|
bool stats_relative;
|
|
int stats_frequency;
|
|
|
|
/** Whether we want to use NUMA distances to choose which usNIC
|
|
devices to use for short messages */
|
|
bool want_numa_device_assignment;
|
|
|
|
/** max send descriptors to post per module */
|
|
int32_t sd_num;
|
|
|
|
/** max receive descriptors per module */
|
|
int32_t rd_num;
|
|
|
|
/** max send/receive desriptors for priority channel */
|
|
int32_t prio_sd_num;
|
|
int32_t prio_rd_num;
|
|
|
|
/** max completion queue entries per module */
|
|
int32_t cq_num;
|
|
|
|
/** max number of entries in AV EQ */
|
|
int32_t av_eq_num;
|
|
|
|
/** retrans characteristics */
|
|
int retrans_timeout;
|
|
|
|
/** transport header length for all usNIC devices on this server
|
|
(it is guaranteed that all usNIC devices on a single server
|
|
will have the same underlying transport, and therefore the
|
|
same transport header length) */
|
|
int transport_header_len;
|
|
uint32_t transport_protocol;
|
|
|
|
/* what UDP port do we want to use? If 0, the system will pick.
|
|
If nonzero, it is used as the base -- the final number will be
|
|
(base+my_local_rank). */
|
|
int udp_port_base;
|
|
|
|
/** disable the "cannot find route" warnings (for network setups
|
|
where this is known/acceptable) */
|
|
bool show_route_failures;
|
|
|
|
/** connectivity verification: ACK timeout, number of retries
|
|
before issue an error/abort the job */
|
|
bool connectivity_enabled;
|
|
int connectivity_ack_timeout;
|
|
int connectivity_num_retries;
|
|
|
|
/** how many short packets have to be received before outputting
|
|
the "received short packets" warning? */
|
|
uint32_t max_short_packets;
|
|
|
|
/* Prefix for the connectivity map filename (map will be output if
|
|
the prefix is non-NULL) */
|
|
char *connectivity_map_prefix;
|
|
|
|
/** Offset into the send buffer where the payload will go. For
|
|
libfabric v1.0.0 / API v1.0, this is 0. For libfabric >=v1.1
|
|
/ API >=v1.1, this is the endpoint.msg_prefix_size (i.e.,
|
|
component.transport_header_len). */
|
|
uint32_t prefix_send_offset;
|
|
|
|
/* OPAL async progress event base */
|
|
opal_event_base_t *opal_evbase;
|
|
} opal_btl_usnic_component_t;
|
|
|
|
OPAL_MODULE_DECLSPEC extern opal_btl_usnic_component_t mca_btl_usnic_component;
|
|
|
|
typedef mca_btl_base_recv_reg_t opal_btl_usnic_recv_reg_t;
|
|
|
|
/**
 * Size for sequence numbers (just to ensure we use the same size
 * everywhere).
 *
 * UDSEQ is the matching printf conversion specifier (without the
 * leading '%'); use it as "%" UDSEQ when printing a sequence number.
 */
typedef uint16_t opal_btl_usnic_seq_t;
#define UDSEQ PRIu16
|
|
|
|
/* sequence number comparison macros that allow for rollover.
 * Relies on the fact that sequence numbers should be relatively close
 * together as compared to (1<<15): the difference is narrowed to
 * int16_t, so two sequence numbers compare correctly only while they
 * are less than half of the 16-bit sequence space apart.
 *
 * SEQ_DIFF(A,B) is the signed distance from B to A; the other macros
 * test its sign.  Each argument is evaluated exactly once.
 *
 * NOTE(review): the narrowing cast of an out-of-range value to int16_t
 * is implementation-defined in ISO C; this assumes the usual
 * two's-complement wraparound.
 */
#define SEQ_DIFF(A,B) ((int16_t)((A)-(B)))
#define SEQ_LT(A,B) (SEQ_DIFF(A,B) < 0)
#define SEQ_LE(A,B) (SEQ_DIFF(A,B) <= 0)
#define SEQ_GT(A,B) (SEQ_DIFF(A,B) > 0)
#define SEQ_GE(A,B) (SEQ_DIFF(A,B) >= 0)
|
|
|
|
/**
 * Register the usnic BTL MCA params
 *
 * @return int status.  NOTE(review): presumably an OPAL_* status code
 * such as OPAL_SUCCESS -- confirm against the definition.
 */
int opal_btl_usnic_component_register(void);

/**
 * Routine which can be called from a debugger to print module, endpoint,
 * fragment, and segment state to standard output.
 */
void opal_btl_usnic_component_debug(void);

/**
 * Called to output the connectivity map
 */
void opal_btl_usnic_connectivity_map(void);
|
|
|
|
END_C_DECLS
|
|
#endif
|