1
1
openmpi/ompi/mca/btl/ofud/btl_ofud.h
Jeff Squyres 0af7ac53f2 Fixes trac:1392, #1400
* add "register" function to mca_base_component_t
   * converted coll:basic and paffinity:linux and paffinity:solaris to
     use this function
   * we'll convert the rest over time (I'll file a ticket once all
     this is committed)
 * add 32 bytes of "reserved" space to the end of mca_base_component_t
   and mca_base_component_data_2_0_0_t to make future upgrades
   [slightly] easier
   * new mca_base_component_t size: 196 bytes
   * new mca_base_component_data_2_0_0_t size: 36 bytes
 * MCA base version bumped to v2.0
   * '''We now refuse to load components that are not MCA v2.0.x'''
 * all MCA frameworks versions bumped to v2.0
 * be a little more explicit about version numbers in the MCA base
   * add big comment in mca.h about versioning philosophy

This commit was SVN r19073.

The following Trac tickets were found above:
  Ticket 1392 --> https://svn.open-mpi.org/trac/ompi/ticket/1392
2008-07-28 22:40:57 +00:00

368 строки
11 KiB
C

/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_BTL_UD_H
#define MCA_BTL_UD_H
/*
 * Number of QPs to stripe sends over. Must remain a power of 2.
 *
 * AWF - This is intentionally NOT an MCA parameter: keeping it a
 * compile-time constant allows fast modular arithmetic with it.
 */
#define MCA_BTL_UD_NUM_QP 4
/* Standard system includes */
#include <sys/types.h>
#include <infiniband/verbs.h>
/* Open MPI includes */
#include "opal/class/opal_hash_table.h"
#include "orte/util/show_help.h"
#include "opal/sys/timer.h"
#include "ompi/class/ompi_free_list.h"
#include "ompi/class/ompi_bitmap.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/btl/base/btl_base_error.h"
#include "ompi/mca/btl/base/base.h"
#include "ompi/mca/mpool/mpool.h"
/* TODO - If I want this to go away, addr_t has to come over here */
#include "btl_ofud_endpoint.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/**
* UD Infiniband (IB) BTL component.
*
* Component-wide state shared by all UD BTL modules. The base component
* structure must stay the first member so the object can be used through
* its `super` type.
*/
struct mca_btl_ud_component_t {
mca_btl_base_component_2_0_0_t super; /**< base BTL component */
uint32_t max_btls; /**< Maximum number of BTL modules */
uint32_t num_btls; /**< Number of available/initialized BTL modules */
/* NOTE(review): the if_* fields are not documented in this header.
 * Presumably they hold interface include/exclude filters, both as raw
 * strings and as parsed string lists -- confirm against the component
 * source before relying on this. */
char* if_include;
char** if_include_list;
char* if_exclude;
char** if_exclude_list;
char** if_list;
struct mca_btl_ud_module_t* ud_btls; /**< array of available BTLs */
opal_list_t ud_procs; /**< list of ib proc structures */
opal_mutex_t ud_lock; /**< lock for accessing component state */
char* ud_mpool_name; /**< name of memory pool */
int32_t sd_num; /**< max send descriptors to post per BTL */
int32_t rd_num; /**< number of receive descriptors per BTL */
/* The receive-descriptor tuning fields below are compiled out. */
#if 0
int32_t rd_num_init; /**< initial recv descriptors to post per BTL */
int32_t rd_num_max;
int32_t rd_num_inc;
#endif
/* NOTE(review): presumably the IB partition-key index, queue key,
 * service level and source path bits used when setting up QPs and
 * address handles -- confirm against the code that consumes them. */
uint32_t ib_pkey_ix;
uint32_t ib_qkey;
uint32_t ib_service_level;
uint32_t ib_src_path_bits;
}; typedef struct mca_btl_ud_component_t mca_btl_ud_component_t;
/* UD BTL component instance; declared here, defined in another file. */
OMPI_MODULE_DECLSPEC extern mca_btl_ud_component_t mca_btl_ofud_component;
/* UD-specific name for the base BTL receive-registration type. */
typedef mca_btl_base_recv_reg_t mca_btl_ud_recv_reg_t;
/**
* Profiling variables
*
* MCA_BTL_UD_ENABLE_PROFILE gates every profiling declaration and macro in
* this header. Both branches below currently set it to 0, so profiling is
* compiled out regardless of OMPI_ENABLE_DEBUG; change a branch to 1 to
* enable it for that build type.
*/
#if OMPI_ENABLE_DEBUG
#define MCA_BTL_UD_ENABLE_PROFILE 0
#else
#define MCA_BTL_UD_ENABLE_PROFILE 0
#endif
#if MCA_BTL_UD_ENABLE_PROFILE
/*
 * Declares the three opal_timer_t fields used to profile one code section:
 * avg_<var>, cnt_<var> and tmp_<var>. Intended for use inside a struct
 * body; the caller supplies the terminating semicolon.
 */
#define MCA_BTL_UD_PROFILE_VAR(var) \
opal_timer_t avg_ ## var; \
opal_timer_t cnt_ ## var; \
opal_timer_t tmp_ ## var
/* One set of profiling fields per profiled section. */
struct mca_btl_ud_profile_t {
MCA_BTL_UD_PROFILE_VAR(post_send);
MCA_BTL_UD_PROFILE_VAR(ibv_post_send);
};
typedef struct mca_btl_ud_profile_t mca_btl_ud_profile_t;
/* Profiling state object; declared here, defined in another file. */
extern mca_btl_ud_profile_t mca_btl_ud_profile;
#endif
/**
* UD/IB BTL Interface
*
* Per-module state. The base module structure must stay the first member
* so the object can be used through its `super` type.
*/
struct mca_btl_ud_module_t {
mca_btl_base_module_t super;
/* NOTE(review): presumably the port number on ib_dev that this module
 * uses -- confirm. */
uint8_t ib_port_num;
struct ibv_device* ib_dev; /* verbs device handle */
struct ibv_context* ib_dev_context; /* verbs device context */
struct ibv_pd* ib_pd; /* verbs protection domain */
struct ibv_cq* ib_cq; /* verbs completion queue */
struct mca_btl_ud_addr_t addr; /**< local address information */
ompi_free_list_t send_frags; /**< send fragments & buffers */
ompi_free_list_t user_frags; /**< user data fragments */
ompi_free_list_t recv_frags; /**< receive fragments & buffers */
opal_list_t pending_frags; /**< list of pending send frags */
opal_mutex_t ud_lock; /**< lock for pending_frags */
size_t ib_inline_max; /**< max size of IB inline send */
/* Disabled field, kept for reference: number of receives currently
 * posted. */
/*int32_t rd_posted;*/
int32_t sd_wqe; /**< available send WQ entries */
/* No lock needed, these are incremented/decremented atomically */
/* Disabled field, kept for reference: hash table for fast lookup of
 * endpoint structures in the recv path; lid:qpnum is the key, the value
 * is mca_btl_ud_endpoint_t*. */
/*opal_hash_table_t* ep_lookup;*/
/* Local QPs and the stripe counter that goes with them. */
struct ibv_qp* ib_qp[MCA_BTL_UD_NUM_QP];
uint32_t ib_qp_next;
/* No lock needed - counters only ever increase by 1 */
}; typedef struct mca_btl_ud_module_t mca_btl_ud_module_t;
/* Forward declaration; the fragment type is defined in another header. */
struct mca_btl_ud_frag_t;
/* UD BTL module object; declared here, defined in another file. */
extern mca_btl_ud_module_t mca_btl_ofud_module;
/**
* Open the component; register UD/IB parameters with the MCA framework
*
* @return Integer status code. NOTE(review): presumably OMPI_SUCCESS on
* success, as for the other functions in this header -- confirm.
*/
extern int mca_btl_ud_component_open(void);
/**
* Any final cleanup before being unloaded.
*
* @return Integer status code. NOTE(review): presumably OMPI_SUCCESS on
* success, as for the other functions in this header -- confirm.
*/
extern int mca_btl_ud_component_close(void);
/**
* IB component initialization.
*
* @param num_btl_modules (OUT)
* Number of BTLs returned in BTL array.
* @param allow_multi_user_threads (IN)
* Flag indicating whether BTL supports user threads (TRUE)
* @param have_hidden_threads (IN)
* Flag indicating whether BTL uses threads (TRUE)
* @return Array of BTL modules; its length is stored in *num_btl_modules.
*
* NOTE(review): both flags were previously documented as (OUT), but they
* are passed by value and so can only be inputs. Confirm their intended
* meaning against the component init function type in ompi/mca/btl/btl.h.
*
* (1) read interface list from verbs and compare against component parameters
* then create a BTL instance for selected interfaces
* (2) publish BTL addressing info
*/
extern mca_btl_base_module_t** mca_btl_ud_component_init(
int *num_btl_modules,
bool allow_multi_user_threads,
bool have_hidden_threads);
/**
* UD/IB component progress.
*
* @return Integer result. NOTE(review): its meaning (status code or count
* of completed events) is not visible in this header -- confirm.
*/
extern int mca_btl_ud_component_progress(void);
/**
* Cleanup any resources held by the BTL.
*
* @param btl (IN) BTL instance.
* @return OMPI_SUCCESS or error status on failure.
*/
extern int mca_btl_ud_finalize(struct mca_btl_base_module_t* btl);
/**
* PML->BTL notification of change in the process list.
*
* @param btl (IN) BTL instance
* @param nprocs (IN) Number of processes
* @param procs (IN) Set of processes
* @param peers (OUT) Set of (optional) peer addressing info.
* @param reachable (IN/OUT) Set of processes that are reachable via this BTL.
* @return OMPI_SUCCESS or error status on failure.
*/
extern int mca_btl_ud_add_procs(struct mca_btl_base_module_t* btl,
size_t nprocs,
struct ompi_proc_t **procs,
struct mca_btl_base_endpoint_t** peers,
ompi_bitmap_t* reachable);
/**
* PML->BTL notification of change in the process list.
*
* @param btl (IN) BTL instance
* @param nprocs (IN) Number of processes.
* @param procs (IN) Set of processes.
* @param peers (IN) Set of peer data structures.
* @return Status indicating if cleanup was successful
*
*/
extern int mca_btl_ud_del_procs(struct mca_btl_base_module_t* btl,
size_t nprocs,
struct ompi_proc_t **procs,
struct mca_btl_base_endpoint_t** peers);
/**
* PML->BTL Initiate a send.
*
* @param btl (IN)
* BTL instance
* @param btl_peer (IN)
* BTL peer addressing
* @param descriptor (IN)
* Descriptor to send. NOTE(review): presumably one obtained from
* mca_btl_ud_alloc() or mca_btl_ud_prepare_src() -- confirm.
* @param tag (IN)
* BTL tag for this send. NOTE(review): presumably delivered to the peer
* to select its receive handler -- confirm.
* @return
* OMPI_SUCCESS if the BTL was able to queue one or more fragments
*
* NOTE(review): the previous comment described parameters (send_request,
* size, flags, request) that are not part of this prototype.
*/
extern int mca_btl_ud_send(struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* btl_peer,
struct mca_btl_base_descriptor_t* descriptor,
mca_btl_base_tag_t tag);
/**
* Allocate a descriptor.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL peer endpoint. NOTE(review): how it is used is
* not visible in this header.
* @param order (IN) NOTE(review): presumably an ordering hint defined by
* the BTL base interface -- confirm.
* @param size (IN) Requested descriptor size.
* @param flags (IN) NOTE(review): presumably descriptor flags defined by
* the BTL base interface -- confirm.
* @return The allocated descriptor. NOTE(review): the failure value
* (presumably NULL) is not visible in this header.
*/
extern mca_btl_base_descriptor_t* mca_btl_ud_alloc(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
uint8_t order,
size_t size,
uint32_t flags);
/**
* Return a segment allocated by this BTL.
*
* @param btl (IN) BTL module
* @param des (IN) Allocated descriptor.
* @return Integer status code. NOTE(review): presumably OMPI_SUCCESS on
* success -- confirm.
*/
extern int mca_btl_ud_free(struct mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des);
/**
* Pack data and return a descriptor that can be
* used for send/put.
*
* @param btl (IN) BTL module
* @param peer (IN) BTL peer addressing
* @param registration (IN) Memory pool registration. NOTE(review): whether
* NULL is accepted is not visible in this header.
* @param convertor (IN) Convertor for the user data. NOTE(review):
* presumably the source the data is packed from -- confirm.
* @param order (IN) NOTE(review): presumably an ordering hint defined by
* the BTL base interface -- confirm.
* @param reserve (IN) NOTE(review): presumably the number of bytes to
* reserve ahead of the packed data -- confirm.
* @param size (IN/OUT) Passed by pointer. NOTE(review): presumably the
* requested size on entry and the packed size on return -- confirm.
* @param flags (IN) NOTE(review): presumably descriptor flags defined by
* the BTL base interface -- confirm.
* @return The prepared descriptor. NOTE(review): the failure value
* (presumably NULL) is not visible in this header.
*/
mca_btl_base_descriptor_t* mca_btl_ud_prepare_src(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* peer,
mca_mpool_base_registration_t* registration,
struct ompi_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size,
uint32_t flags);
/**
* Module initialization entry point.
*
* NOTE(review): this prototype had no documentation; the description is
* inferred from the name and signature -- confirm against the definition.
*
* @param ud_btl (IN/OUT) UD BTL module to operate on.
* @return Integer status code. NOTE(review): presumably OMPI_SUCCESS on
* success -- confirm.
*/
int mca_btl_ud_module_init(mca_btl_ud_module_t* ud_btl);
/**
* Fault Tolerance Event Notification Function
*
* @param state (IN) Checkpoint State
* @return OMPI_SUCCESS or failure status
*/
extern int mca_btl_ud_ft_event(int state);
/*
 * Profiling stuff
 *
 * Each macro takes the name of a profiled section (`var`) and operates on
 * the avg_<var>, cnt_<var> and tmp_<var> fields of mca_btl_ud_profile.
 * When MCA_BTL_UD_ENABLE_PROFILE is 0 all three macros expand to nothing.
 */
#if MCA_BTL_UD_ENABLE_PROFILE
/* Record the current cycle count as the start of section `var`. */
#define MCA_BTL_UD_START_TIME(var) \
    ((mca_btl_ud_profile.tmp_ ## var) = opal_sys_timer_get_cycles())
/* Add the cycles elapsed since MCA_BTL_UD_START_TIME(var) to the running
 * total (stored in avg_<var>) and count one more sample. */
#define MCA_BTL_UD_END_TIME(var) \
    do { \
        mca_btl_ud_profile.avg_ ## var += \
            opal_sys_timer_get_cycles() - mca_btl_ud_profile.tmp_ ## var; \
        mca_btl_ud_profile.cnt_ ## var++; \
    } while(0)
/* Print the mean cycles per sample and the sample count for `var`.
 * A section that was never timed has cnt_<var> == 0; report a mean of 0
 * in that case instead of dividing by zero (undefined behavior).
 * The trailing semicolon is kept so existing call sites keep compiling.
 * NOTE(review): %lu assumes opal_timer_t is unsigned long -- confirm. */
#define MCA_BTL_UD_SHOW_TIME(var) \
    OPAL_OUTPUT((0, " " #var " avg %lu cnt %lu", \
        ((mca_btl_ud_profile.cnt_ ## var) != 0 ? \
            (mca_btl_ud_profile.avg_ ## var) / (mca_btl_ud_profile.cnt_ ## var) : 0), \
        mca_btl_ud_profile.cnt_ ## var));
#else
#define MCA_BTL_UD_START_TIME(var)
#define MCA_BTL_UD_END_TIME(var)
#define MCA_BTL_UD_SHOW_TIME(var)
#endif
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif