openmpi/ompi/mca/bml/bml.h

/*
 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2006 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 * 
 * Additional copyrights may follow
 * 
 * $HEADER$
 */
/**
 * @file
 *
 * BML Management Layer (BML)
 *
 */

#ifndef MCA_BML_H
#define MCA_BML_H

#include "opal/mca/mca.h"
#include "ompi/mca/btl/btl.h"

#include "ompi/mca/bml/base/bml_base_btl.h"
#include "ompi/mca/bml/base/bml_base_endpoint.h" 

#include "ompi/types.h"
#include "ompi/class/ompi_free_list.h"

#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"

#define OMPI_ENABLE_DEBUG_RELIABILITY 0

/*
 * BML types
 */

struct ompi_proc_t; 
struct mca_bml_base_module_t;
struct mca_bml_base_endpoint_t;
struct mca_mpool_base_resources_t;

/*
 * Cached set of information for each btl 
 */

struct mca_bml_base_btl_t {
    int    btl_index;                             /**< index in endpoint array */
    int    btl_flags;                             /**< support for put/get? */
    double btl_weight;                            /**< BTL weight for scheduling */
    size_t btl_eager_limit;                       /**< BTL eager limit */
    size_t btl_rndv_eager_limit;                  /**< BTL rndv eager limit */
    size_t btl_max_send_size;                     /**< BTL min send size */
    size_t btl_rdma_pipeline_send_length;         /**< BTL send length */
    size_t btl_rdma_pipeline_frag_size;           /**< BTL rdma frag size */
    size_t btl_min_rdma_pipeline_size;            /**< BTL min rdma size */
    struct mca_btl_base_module_t *btl;            /**< BTL module */
    struct mca_btl_base_endpoint_t* btl_endpoint; /**< BTL addressing info */
        
    /* BTL function table */
    mca_btl_base_module_alloc_fn_t   btl_alloc;
    mca_btl_base_module_free_fn_t    btl_free;
    mca_btl_base_module_send_fn_t    btl_send;
    mca_btl_base_module_sendi_fn_t   btl_sendi;
    mca_btl_base_module_prepare_fn_t btl_prepare_src;
    mca_btl_base_module_prepare_fn_t btl_prepare_dst;
    mca_btl_base_module_put_fn_t     btl_put;
    mca_btl_base_module_get_fn_t     btl_get;
    mca_btl_base_component_progress_fn_t btl_progress; 

    mca_mpool_base_module_t*         btl_mpool;
};
typedef struct mca_bml_base_btl_t mca_bml_base_btl_t;


/**
 * A dynamically growable array of mca_bml_base_btl_t instances.
 * Maintains an index into the array that is used for round-robin
 * scheduling across contents.
 */
struct mca_bml_base_btl_array_t {
    opal_object_t super;
    size_t arr_size;                     /**< number available */
    size_t arr_reserve;                  /**< size of allocated btl_proc array */
    size_t arr_index;                    /**< last used index*/
    mca_bml_base_btl_t* bml_btls;   /**< array of  bml btl's */
};
typedef struct mca_bml_base_btl_array_t mca_bml_base_btl_array_t;

OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_bml_base_btl_array_t);


/**
 * If required, reallocate (grow) the array to the indicate size.
 * 
 * @param array (IN)
 * @param size (IN)
 */
int mca_bml_base_btl_array_reserve(mca_bml_base_btl_array_t*, size_t);

static inline size_t mca_bml_base_btl_array_get_size(mca_bml_base_btl_array_t* array)
{
    return array->arr_size;
}

/**
 * Grow the array if required, and set the size.
 * 
 * @param array (IN)
 * @param size (IN)
 */
static inline void mca_bml_base_btl_array_set_size(mca_bml_base_btl_array_t* array, size_t size)
{
    if(array->arr_size > array->arr_reserve)
        mca_bml_base_btl_array_reserve(array, size);
    array->arr_size = size;
}

/**
 * Grow the array size by one and return the item at that index.
 * 
 * @param array (IN)
 */
static inline mca_bml_base_btl_t* mca_bml_base_btl_array_insert(mca_bml_base_btl_array_t* array)
{
#if OMPI_ENABLE_DEBUG
    if(array->arr_size >= array->arr_reserve) {
        orte_output(0, "mca_bml_base_btl_array_insert: invalid array index %lu >= %lu", 
                    (unsigned long)array->arr_size, (unsigned long)array->arr_reserve);
        return 0;
    }
#endif
    return &array->bml_btls[array->arr_size++];
}

/** 
 * Remove a btl from a bml_btl 
 *
 * @param array (IN)
 * @param btl (IN)
 */
static inline bool mca_bml_base_btl_array_remove( mca_bml_base_btl_array_t* array, 
                                                  struct mca_btl_base_module_t* btl )
{ 
    size_t i = 0;
    /* find the btl */
    for( i = 0; i < array->arr_size; i++ ) {
        if( array->bml_btls[i].btl == btl ) {
            /* make sure not to go out of bounds */
            for( ; i < array->arr_size-1; i++ ) {
                /* move all btl's back by 1, so the found 
                   btl is "removed" */
                array->bml_btls[i] = array->bml_btls[(i+1)];
            }
            array->arr_size--;
            array->arr_index = 0;
            return true;
        }
    }
    return false;
}


/**
 * Return an array item at the specified index.
 * 
 * @param array (IN)
 * @param item_index (IN)
 */
static inline mca_bml_base_btl_t* mca_bml_base_btl_array_get_index(mca_bml_base_btl_array_t* array, size_t item_index)
{
#if OMPI_ENABLE_DEBUG
    if(item_index >= array->arr_size) {
        orte_output(0, "mca_bml_base_btl_array_get_index: invalid array index %lu >= %lu",
                    (unsigned long)item_index, (unsigned long)array->arr_size);
        return 0;
    }
#endif
    return &array->bml_btls[item_index];
}

/**
 * Return the next LRU index in the array.
 * 
 * @param array (IN)
 *
 * @param index (OUT)
 */
static inline mca_bml_base_btl_t* mca_bml_base_btl_array_get_next(mca_bml_base_btl_array_t* array)
{
#if OMPI_ENABLE_DEBUG
    if(array->arr_size == 0) {
        orte_output(0, "mca_bml_base_btl_array_get_next: invalid array size");
        return 0;
    }
#endif
    if( 1 == array->arr_size ) {
        return &array->bml_btls[0];  /* force the return to avoid a jump */
    } else {
        size_t current_position = array->arr_index;  /* force to always start from zero */
        if( (current_position + 1) == array->arr_size ) {
            array->arr_index = 0;  /* next time serve from the beginning */
        } else {
            array->arr_index = current_position + 1;  /* continue */
        }
        return &array->bml_btls[current_position];
    }
}

/**
 * Locate an element in the array
 * 
 * @param array (IN)
 * @param index (IN)
 */
static inline mca_bml_base_btl_t* mca_bml_base_btl_array_find(
    mca_bml_base_btl_array_t* array, struct mca_btl_base_module_t* btl)
{
    size_t i=0;   
    for(i=0; i<array->arr_size; i++) {
        if(array->bml_btls[i].btl == btl) {
            return &array->bml_btls[i];
        }
    }
    return NULL;
}

/**
 *  Structure associated w/ ompi_proc_t that contains the set
 *  of BTLs used to reach a destination
 */
struct mca_bml_base_endpoint_t {
    opal_list_item_t         super;             /**< base_endpoint is a list item */
    struct ompi_proc_t*      btl_proc;          /**< backpointer to target ompi_proc_t */
    size_t                   btl_pipeline_send_length;   /**< max of pipeline send_length of available BTLs */
    size_t                   btl_send_limit;    /**< max of min rdma pipeline for available rmda btls */
    size_t                   btl_max_send_size; /**< min of max send size for available send btls */
    size_t                   btl_rdma_align;    /**< max of min rdma size for available rmda btls */
    mca_bml_base_btl_array_t btl_eager;         /**< array of btls to use for first fragments */
    mca_bml_base_btl_array_t btl_send;          /**< array of btls to use for remaining fragments */
    size_t                   bml_max_send_length;
    size_t                   bml_max_rdma_length;
    mca_bml_base_btl_array_t btl_rdma;          /**< array of btls that support (prefer) rdma */
    size_t btl_rdma_index;                      /**< index of last used BTL for RDMA */
    uint32_t btl_flags_or;                      /**< the bitwise OR of the btl flags */
    uint32_t btl_flags_and;                     /**< the bitwise AND of the btl flags */
};
typedef struct mca_bml_base_endpoint_t mca_bml_base_endpoint_t;
                              
    
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_bml_base_endpoint_t);

static inline void mca_bml_base_alloc(mca_bml_base_btl_t* bml_btl, mca_btl_base_descriptor_t** des, uint8_t order, size_t size, uint32_t flags) { 
    *des = bml_btl->btl_alloc(bml_btl->btl, bml_btl->btl_endpoint, order, size, flags);
}

static inline void mca_bml_base_free(mca_bml_base_btl_t* bml_btl, mca_btl_base_descriptor_t* des) { 
    bml_btl->btl_free( bml_btl->btl, des );   
    /* The previous function is supposed to release the des object
     * so we should not touch it anymore.
     */
}

#if OMPI_ENABLE_DEBUG_RELIABILITY

int mca_bml_base_send(
    mca_bml_base_btl_t* bml_btl, 
    mca_btl_base_descriptor_t* des, 
    mca_btl_base_tag_t tag); 


#else

static inline int mca_bml_base_send(
    mca_bml_base_btl_t* bml_btl,
    mca_btl_base_descriptor_t* des,
    mca_btl_base_tag_t tag)
{
    int rc;
    des->des_context = (void*) bml_btl;
    rc = bml_btl->btl_send(bml_btl->btl, bml_btl->btl_endpoint, des, tag);
    if(rc == OMPI_ERR_RESOURCE_BUSY)
        rc = OMPI_SUCCESS;

    return rc;
}

#endif

static inline int mca_bml_base_send_status(
    mca_bml_base_btl_t* bml_btl,
    mca_btl_base_descriptor_t* des,
    mca_btl_base_tag_t tag)
{
    des->des_context = (void*) bml_btl;
    return bml_btl->btl_send(bml_btl->btl, bml_btl->btl_endpoint, des, tag);
}

static inline int  mca_bml_base_sendi(
    mca_bml_base_btl_t* bml_btl,
    struct ompi_convertor_t* convertor,
    void* header,
    size_t header_size,
    size_t payload_size,
    uint8_t order,
    uint32_t flags,
    mca_btl_base_tag_t tag,
    mca_btl_base_descriptor_t** descriptor)
{
    return bml_btl->btl_sendi(bml_btl->btl, bml_btl->btl_endpoint, 
                              convertor, header, header_size,
                              payload_size, order, flags, tag, descriptor);
}

static inline int mca_bml_base_put(mca_bml_base_btl_t* bml_btl, mca_btl_base_descriptor_t* des) { 
    des->des_context = (void*) bml_btl; 
    return bml_btl->btl_put(
                             bml_btl->btl,
                             bml_btl->btl_endpoint, 
                             des);
}

static inline int mca_bml_base_get(mca_bml_base_btl_t* bml_btl, mca_btl_base_descriptor_t* des) { 
    des->des_context = (void*) bml_btl; 
    return bml_btl->btl_get(
                             bml_btl->btl,
                             bml_btl->btl_endpoint, 
                             des);
}


static inline void mca_bml_base_prepare_src(mca_bml_base_btl_t* bml_btl, 
                                            mca_mpool_base_registration_t* reg, 
                                            struct ompi_convertor_t* conv, 
                                            uint8_t order,
                                            size_t reserve, 
                                            size_t *size,
                                            uint32_t flags,
                                            mca_btl_base_descriptor_t** des) { 
    *des = bml_btl->btl_prepare_src( bml_btl->btl, 
                                     bml_btl->btl_endpoint, 
                                     reg, 
                                     conv,
                                     order,
                                     reserve,
                                     size,
                                     flags );
    if( OPAL_LIKELY((*des) != NULL) ) {
        (*des)->des_context = (void*) bml_btl;
    }
}

static inline void mca_bml_base_prepare_dst(mca_bml_base_btl_t* bml_btl, 
                                            mca_mpool_base_registration_t* reg, 
                                            struct ompi_convertor_t* conv, 
                                            uint8_t order,
                                            size_t reserve, 
                                            size_t *size,
                                            uint32_t flags,
                                            mca_btl_base_descriptor_t** des) { 
    *des = bml_btl->btl_prepare_dst( bml_btl->btl, 
                                     bml_btl->btl_endpoint, 
                                     reg, 
                                     conv,
                                     order,
                                     reserve,
                                     size,
                                     flags );
    if( OPAL_LIKELY((*des) != NULL) ) {
        (*des)->des_context = (void*) bml_btl;
    }
}

/*
 *  BML component interface functions and datatype.
 */

/**
 * MCA->BML Initializes the BML component and creates specific BML
 * module(s).
 *
 * @param num_bmls (OUT) Returns the number of bml modules created, or 0
 *                       if the transport is not available.
 *
 * @param enable_progress_threads (IN) Whether this component is
 * allowed to run a hidden/progress thread or not.
 *
 * @param enable_mpi_threads (IN) Whether support for multiple MPI
 * threads is enabled or not (i.e., MPI_THREAD_MULTIPLE), which
 * indicates whether multiple threads may invoke this component
 * simultaneously or not.
 *
 * @return Array of pointers to BML modules, or NULL if the transport  
 *         is not available.
 *
 * During component initialization, the BML component should discover
 * the physical devices that are available for the given transport,
 * and create a BML module to represent each device. Any addressing 
 * information required by peers to reach the device should be published 
 * during this function via the mca_base_modex_send() interface. 
 *
 */

typedef struct mca_bml_base_module_t* (*mca_bml_base_component_init_fn_t)(
                                                                          int* priority,
                                                                          bool enable_progress_threads,
                                                                          bool enable_mpi_threads
                                                                          );

/**
 * MCA->BML Called to progress outstanding requests for
 * non-threaded polling environments.
 *
 * @param tstamp     Current time.
 * @return           OMPI_SUCCESS or error code on failure.
 */

typedef int (*mca_bml_base_module_progress_fn_t)(void);

/**
 *  BML component descriptor. Contains component version information
 *  and component open/close/init functions.
 */

struct mca_bml_base_component_1_0_0_t {
  mca_base_component_t bml_version;
  mca_base_component_data_1_0_0_t bml_data;
  mca_bml_base_component_init_fn_t bml_init;
};
typedef struct mca_bml_base_component_1_0_0_t mca_bml_base_component_1_0_0_t;
typedef struct mca_bml_base_component_1_0_0_t mca_bml_base_component_t;


/*
 * BML module interface functions and datatype.
 */

/**
 * MCA->BML Clean up any resources held by BML module 
 * before the module is unloaded.
 *  
 * @param bml (IN)   BML module.
 *
 * Prior to unloading a BML module, the MCA framework will call 
 * the BML finalize method of the module. Any resources held by 
 * the BML should be released and if required the memory corresponding
 * to the BML module freed.
 * 
 */
typedef int (*mca_bml_base_module_finalize_fn_t)( void );
                                                                                                         
/**
 * PML->BML notification of change in the process list. 
 *
 * @param nprocs (IN)         Number of processes
 * @param procs (IN)          Set of processes
 * @param endpoint (OUT)      Set of (optional) mca_bml_base_endpoint_t structures by BML.
 * @param reachable (OUT)     Bitmask indicating set of peer processes that are reachable by this BML.
 * @return                    OMPI_SUCCESS or error status on failure.
 *
 * The mca_bml_base_module_add_procs_fn_t() is called by the PML to 
 * determine the set of BMLs that should be used to reach each process.
 * Any addressing information exported by the peer via the mca_base_modex_send()
 * function should be available during this call via the corresponding 
 * mca_base_modex_recv() function. The BML may utilize this information to 
 * determine reachability of each peer process. 
 *
 * For each process that is reachable by the BML, the bit corresponding to the index 
 * into the proc array (nprocs) should be set in the reachable bitmask. The PML
 * provides the BML the option to return a pointer to a data structure defined
 * by the BML that is returned to the BML on subsequent calls to the BML data
 * transfer functions (e.g bml_send). This may be used by the BML to cache any addressing 
 * or connection information (e.g. TCP socket, IP queue pair).
 *
 * \note This function will return OMPI_ERR_UNREACH if one or more
 * processes can not be reached by the currently active BTLs.  This is
 * not a fatal error, and the calling layer is free to continue using
 * the BML interface.
 */
typedef int (*mca_bml_base_module_add_procs_fn_t)(
                                                  size_t nprocs,
                                                  struct ompi_proc_t** procs, 
                                                  struct mca_bml_base_endpoint_t** endpoints,
                                                  struct ompi_bitmap_t* reachable
                                                  );

/**
 * Notification of change to the process list.
 *
 * @param nprocs (IN)  Number of processes
 * @param proc (IN)    Set of processes
 * @return             Status indicating if cleanup was successful
 *
 * When the process list changes, the PML notifies the BML of the
 * change, to provide the opportunity to cleanup or release any
 * resources associated with the peer.
 */
typedef int (*mca_bml_base_module_del_procs_fn_t)(
                                                  size_t nprocs,
                                                  struct ompi_proc_t** procs
                                                  );

/**
 * Notification of change to the btl list.
 *
 * @param bml (IN)     BTL module
 * @return             Status indicating if cleanup was successful
 *
 * On recovery of a btl, add it to the set of forwarding
 * entries used by the BML.
 */
typedef int (*mca_bml_base_module_add_btl_fn_t)( struct mca_btl_base_module_t* );

/**
 * Notification of change to the btl list.
 *
 * @param bml (IN)     BTL module
 * @return             Status indicating if cleanup was successful
 *
 * On failure of a btl, remove it from the set of forwarding
 * entries used by the BML.
 */
typedef int (*mca_bml_base_module_del_btl_fn_t)( struct mca_btl_base_module_t* );

/**
 * Notification of change to the btl list.
 *
 * @param bml (IN)     BTL module
 * @return             Status indicating if cleanup was successful
 *
 * On failure of a btl, remove it from the set of forwarding
 * entries used by the BML.
 */
typedef int (*mca_bml_base_module_del_proc_btl_fn_t)( 
    struct ompi_proc_t*,
    struct mca_btl_base_module_t* );

/**
 * Register a callback function that is called on receipt
 * of a fragment.
 *
 * @param bml (IN)     BML module
 * @return             Status indicating if cleanup was successful
 *
 * When the process list changes, the PML notifies the BML of the
 * change, to provide the opportunity to cleanup or release any
 * resources associated with the peer.
 */
typedef int (*mca_bml_base_module_register_fn_t)(
                                                 mca_btl_base_tag_t tag,
                                                 mca_btl_base_module_recv_cb_fn_t cbfunc,
                                                 void* cbdata
                                                 );
/**
 * Register a callback function that is called of error.
 *
 * @param bml (IN)     BML module
 * @return             Status indicating if cleanup was successful
 *
 */
typedef int (*mca_bml_base_module_register_error_cb_fn_t)(
        mca_btl_base_module_error_cb_fn_t cbfunc
);

/**
 * Fault Tolerance Event Notification Function
 * @param status Checkpoint Status
 * @return OMPI_SUCCESS or failure status
 */
typedef int (*mca_bml_base_module_ft_event_fn_t)(int status);


/**
 * BML module interface functions and attributes.
 */
struct mca_bml_base_module_t {
    /* BML common attributes */
    mca_bml_base_component_t* bml_component; /**< pointer back to the BML component structure */
    size_t      bml_eager_limit;      /**< maximum size of first fragment -- eager send */
    size_t      bml_rndv_eager_limit;    /**< size of a first fragment of rndv protocol */
    size_t      bml_max_send_size;    /**< maximum send fragment size supported by the BML */
    size_t      bml_min_rdma_size;    /**< threshold below which the BML should not fragment */
    size_t      bml_max_rdma_size;    /**< maximum rdma fragment size supported by the BML */
    
    /* BML function table */
    mca_bml_base_module_add_procs_fn_t     bml_add_procs;
    mca_bml_base_module_del_procs_fn_t     bml_del_procs;
    mca_bml_base_module_add_btl_fn_t       bml_add_btl;
    mca_bml_base_module_del_btl_fn_t       bml_del_btl;
    mca_bml_base_module_del_proc_btl_fn_t  bml_del_proc_btl;
    mca_bml_base_module_register_fn_t      bml_register;
    mca_bml_base_module_register_error_cb_fn_t bml_register_error;

    mca_bml_base_module_finalize_fn_t      bml_finalize;

    mca_bml_base_module_progress_fn_t bml_progress;

    mca_bml_base_module_ft_event_fn_t      bml_ft_event;
};
typedef struct mca_bml_base_module_t mca_bml_base_module_t;

/*
 * Macro for use in modules that are of type bml v1.0.0
 */
#define MCA_BML_BASE_VERSION_1_0_0 \
  /*  v1.0 is chained to MCA v1.0 */ \
  MCA_BASE_VERSION_1_0_0, \
  /* bml v1.0 */ \
  "bml", 1, 0, 0

#endif /* OMPI_MCA_BML_H */