/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2006 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 * 
 * Additional copyrights may follow
 * 
 * $HEADER$
 */

#ifndef _OMPI_FIFO
#define _OMPI_FIFO

#include "ompi/constants.h"
#include "opal/sys/cache.h"
#include "opal/sys/atomic.h"
#include "ompi/mca/mpool/mpool.h"
#include "ompi/class/ompi_circular_buffer_fifo.h"


/** @file
 *
 * This defines a set of functions to create and manipulate a FIFO
 * implemented as a linked list of circular buffer FIFOs.  FIFO
 * elements are assumed to be pointers.  Pointers are written to the
 * head, and read from the tail.  For thread safety, a spin lock is
 * provided in the ompi_cb_fifo_ctl_t structure, but its use must be
 * managed by the calling routines -- it is not handled by this set
 * of routines.  When a write to a circular buffer queue would
 * overflow that queue, the next circular buffer queue in the linked
 * list is used if it is empty; otherwise a new one is inserted into
 * the list.
 *
 * This set of routines is currently exclusively used by the sm btl,
 * and has been tailored to meet its needs (i.e., it is probably not
 * suitable as a general purpose fifo). 
 *
 * Before describing any further, a note about mmap() is in order.
 * mmap() is used to create/attach shared memory segments to a
 * process.  It is used by OMPI to manage shared memory.
 * Specifically, each process ends up calling mmap() to create or
 * attach shared memory; the end result is that multiple processes
 * have the same shared memory segment attached to their process.
 * This shared memory is therefore used here in the fifo code.  
 *
 * However, it is important to note that when attaching the same
 * shared memory segment to multiple processes, mmap() does *not* need
 * to return the same virtual address to the beginning of the shared
 * memory segment to each process.  That is, the virtual address
 * returned in each process will point to the same shared memory
 * segment as all others, but its virtual address value may be
 * different.  Specifically, process A may get the value X back from
 * mmap(), while process B, who attached the same shared memory
 * segment as process A, may get back the value Y from mmap().
 * Process C may attach the same shared memory segment and get back
 * value X from mmap().  This is perfectly legal mmap() behavior.
 *
 * As such, our code -- including this fifo code -- needs to be able
 * to handle the cases where the base address is the same and the
 * cases where it is different.
 *
 * There are three main interface functions:
 *
 * ompi_fifo_init(): create a fifo.  The caller passes the offset
 * between the sender's and the receiver's mapping of the shared
 * memory segment, which lets the same code handle both the case
 * where the two processes share a common base address and the case
 * where they do not.
 *
 * ompi_fifo_write_to_head(): write a value to the head of the fifo.
 * The head is always handled in terms of the sender's virtual
 * addresses.
 *
 * ompi_fifo_read_from_tail(): read a value from the tail of the fifo.
 * The tail is always handled in terms of the receiver's virtual
 * addresses; sender-side addresses are converted by applying the
 * offset given to ompi_fifo_init().
 *
 * The data structures used in these fifos are carefully structured to
 * be lockless, even when used in shared memory.  However, this is
 * predicated upon there being only exactly *ONE* concurrent writer
 * and *ONE* concurrent reader (in terms of the sm btl, two fifos are
 * established between each process pair; one for data flowing A->B
 * and one for data flowing B->A).  Hence, the writer always looks at
 * the "head" and the reader always looks at the "tail."
 *
 * The general scheme of the fifo is that this class is an upper-level
 * manager for the ompi_circular_buffer_fifo_t class.  When an
 * ompi_fifo_t instance is created, it creates an
 * ompi_circular_buffer_fifo_t.  Items can then be put into the fifo
 * until the circular buffer fills up (i.e., items have not been
 * removed from the circular buffer, so it gets full).  The
 * ompi_fifo_t class will manage this case and create another
 * circular_buffer and start putting items in there.  This can
 * continue indefinitely; the ompi_fifo_t class will create a linked
 * list of circular buffers in order to create storage for any items
 * that need to be put in the fifo.  
 *
 * The tail will then read from these circular buffers in order,
 * draining them as it goes.
 *
 * The linked list of circular buffers is created in a circle, so if
 * you have N circular buffers, the fill pattern will essentially go
 * in a circle (assuming that the reader is dutifully reading/draining
 * behind the writer).  Yes, this means that we have a ring of
 * circular buffers.  A single circular buffer is treated as a
 * standalone entity; a reader/writer pair can utilize it
 * indefinitely; they will never move on to the next circular buffer
 * unless the writer gets so far ahead of the reader that the current
 * circular buffer fills up and the writer moves on to the next
 * circular buffer.  In this case, the reader will eventually drain
 * the current circular buffer and then move on to the next circular
 * buffer (and assumedly eventually catch up to the writer).
 *
 * The natural question of "why bother doing this instead of just
 * having an array of pointers that you realloc?" arises.  The intent
 * with this class is to have a lockless structure -- using realloc,
 * by definition, means that you would have to lock every single
 * access to the array to ensure that it doesn't get realloc'ed from
 * underneath you.  This is definitely something we want to avoid for
 * performance reasons.
 *
 * Hence, once you get your head wrapped around this scheme, it
 * actually does make sense (and gives good performance).
 *
 ********************************* NOTE *******************************
 * 
 * Although the scheme is designed to be lockless, there is currently
 * one lock used in this scheme.  There is a nasty race condition
 * between multiple processes: if the writer fills up a circular
 * buffer before anything is read, it can make the decision to
 * create a new circular buffer (because that one is full).  However,
 * if, at the same time, the reader takes over -- after the decision
 * has been made to make a new circular buffer, and after some [but
 * not all] of the data fields are updated to reflect this -- the
 * reader can drain the entire current circular buffer, obviating the
 * need to make a new circular buffer (because there's now space
 * available in the current one).  The reader will then update some
 * data fields in the fifo.
 *
 * This can lead to a fifo management consistency error -- the reader
 * thinks it is advancing to the next circular buffer but it really
 * ends up back on the same circular buffer (because the writer had
 * not updated the "next cb" field yet).  The reader is then stuck in
 * a cb where nothing will arrive until the writer loops all the way
 * around (i.e., through all other existing circular buffers) and
 * starts writing to the circular buffer where the reader is waiting.
 * This effectively means that the reader will miss a lot of messages.
 *
 * So we had to add a lock to protect this -- when the writer decides
 * to make a new circular buffer and when the reader decides to move
 * to the new circular buffer.  It is a rather coarse-grained lock; it
 * covers a relatively large chunk of code in ompi_fifo_write_to_head(),
 * but, interestingly enough, this seems to create *better*
 * performance for sending large messages via shared memory (i.e.,
 * netpipe graphs with and without this lock show that using the lock
 * gives better overall bandwidth for large messages).  We do lose a
 * bit of overall bandwidth for mid-range message sizes, though.
 *
 * We feel that this lock can probably be eventually removed from the
 * implementation; we recognized this race condition and ran out of
 * time to fix it properly (i.e., in a lockless way).  As such, we
 * employed a lock to serialize the access and protect it that way.
 * This issue should be revisited someday to remove the lock.
 *
 * See the notes in the writer function for more details on the lock.
 */

/*
 * Structure used by the ompi_fifo routines to keep track of some
 * extra queue information not needed by the ompi_cb_fifo routines.
 * Each circular buffer managed by an ompi_fifo_t is embedded in one
 * of these wrappers, and the wrappers are chained together through
 * next_fifo_wrapper.
 */
struct ompi_cb_fifo_wrapper_t {
    /* the circular buffer fifo itself (embedded in the wrapper, not a
       pointer) */
    ompi_cb_fifo_t cb_fifo;

    /* pointer to the next wrapper in the list.  This is always
       stored as an absolute address, as seen by the writer: the
       writer follows it directly, while the reader subtracts the
       fifo's offset before dereferencing it. */
    struct ompi_cb_fifo_wrapper_t *next_fifo_wrapper;

    /* flag indicating if cb_fifo has over flown - need this to force
     * release of entries already read.  Set by the writer when a
     * write to this circular buffer fails, cleared by the reader once
     * it has drained the buffer (or by the writer if its retry
     * succeeds). */
    volatile bool cb_overflow;
};

typedef struct ompi_cb_fifo_wrapper_t ompi_cb_fifo_wrapper_t;

/* data structure used to describe the fifo: bookkeeping for a list of
   circular buffers (ompi_cb_fifo_wrapper_t) with one writer at the
   head and one reader at the tail */
struct ompi_fifo_t {
    /* lock for multi-process synchronization.  Taken by the writer
       while it switches to (or allocates) the next circular buffer,
       and by the reader while it advances the tail. */
    opal_atomic_lock_t fifo_lock;

    /* lock for thread synchronization on the head.  Not initialized
       or used by the routines in this file; managed by the caller. */
    opal_atomic_lock_t *head_lock;

    /* lock for thread synchronization on the tail.  Not initialized
       or used by the routines in this file; managed by the caller. */
    opal_atomic_lock_t *tail_lock;

    /* size of each circular buffer (passed to ompi_cb_fifo_init()) */
    int size;

    /* number of circular buffers that may still be allocated:
       initialized to cb_num_limit - 1 by ompi_fifo_init() and
       decremented each time the writer allocates another buffer.
       The writer refuses to allocate when this is 0. */
    int cb_count;

    /* memory pool used to allocate the circular buffer wrappers */
    mca_mpool_base_module_t *fifo_mpool;

    /* memory pool handed to ompi_cb_fifo_init() for the head side of
       each circular buffer */
    mca_mpool_base_module_t *head_mpool;

    /* memory pool handed to ompi_cb_fifo_init() for the tail side of
       each circular buffer */
    mca_mpool_base_module_t *tail_mpool;

    /* offset between sender and receiver shared mapping
       (receiver-side address = sender-side address - offset) */
    ptrdiff_t offset;

    /* pointer to head (write) ompi_cb_fifo_wrapper_t structure.  This
       is always stored as a sender side address. */
    ompi_cb_fifo_wrapper_t *head;

    /* pointer to tail (read) ompi_cb_fifo_wrapper_t structure.  This
       is always stored as a receiver side address. */
    ompi_cb_fifo_wrapper_t *tail;
};

typedef struct ompi_fifo_t ompi_fifo_t;

/**
 * Initialize a fifo
 *
 * Allocates the first circular buffer wrapper from fifo_mpool,
 * initializes the circular buffer embedded in it, links the wrapper
 * to itself (a ring of one element) and makes both the head and the
 * tail refer to it.  The head_lock and tail_lock members of the fifo
 * are not touched by this routine.
 *
 * @param size_of_cb_fifo Length of each circular buffer fifo array;
 *                        also used for buffers allocated later (IN)
 *
 * @param lazy_free_freq  Lazy free frequency, passed unchanged to
 *                        ompi_cb_fifo_init() (IN)
 *
 * @param cb_num_limit    Limit on the number of circular buffers,
 *                        including the one allocated here; the fifo
 *                        stores cb_num_limit - 1 as the number of
 *                        buffers that may still be allocated (IN)
 *
 * @param fifo_mpool      Memory pool used to allocate the circular
 *                        buffer wrapper structures (IN)
 *
 * @param head_mpool      Memory pool passed to ompi_cb_fifo_init()
 *                        for the head side of the circular buffer (IN)
 *
 * @param tail_mpool      Memory pool passed to ompi_cb_fifo_init()
 *                        for the tail side of the circular buffer (IN)
 *
 * @param fifo            Pointer to data structure defining this
 *                        fifo; filled in by this routine (IN/OUT)
 *
 * @param offset          Offset between the sender's and the
 *                        receiver's mapping of the shared memory; the
 *                        tail is stored as head - offset (IN)
 *
 * @returncode OMPI_SUCCESS on success, OMPI_ERR_OUT_OF_RESOURCE if
 *             the wrapper could not be allocated, or the error code
 *             returned by ompi_cb_fifo_init().  On failure no memory
 *             allocated by this routine is left behind and fifo->head
 *             is NULL.
 *
 */
static inline int ompi_fifo_init(int size_of_cb_fifo,
        int lazy_free_freq, int cb_num_limit,
        mca_mpool_base_module_t *fifo_mpool,
        mca_mpool_base_module_t *head_mpool,
        mca_mpool_base_module_t *tail_mpool,
        ompi_fifo_t *fifo, ptrdiff_t offset)
{
    int error_code;

    fifo->offset = offset;
    fifo->size = size_of_cb_fifo;
    /* we allocate one cb below so subtract one here */
    fifo->cb_count = cb_num_limit - 1;
    fifo->fifo_mpool = fifo_mpool;
    fifo->head_mpool = head_mpool;
    fifo->tail_mpool = tail_mpool;

    /* allocate the first circular buffer wrapper, page aligned */
    fifo->head = (ompi_cb_fifo_wrapper_t*)fifo_mpool->mpool_alloc(
            fifo_mpool, sizeof(ompi_cb_fifo_wrapper_t), getpagesize(),
            0, NULL);
    if(NULL == fifo->head) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* initialize the circular buffer fifo head structure */
    error_code = ompi_cb_fifo_init(size_of_cb_fifo,
            lazy_free_freq, head_mpool, tail_mpool, &(fifo->head->cb_fifo),
            offset);
    if ( OMPI_SUCCESS != error_code ) {
        /* give the wrapper back to the pool instead of leaking it,
         * and do not leave a dangling pointer in the fifo */
        fifo_mpool->mpool_free(fifo_mpool, fifo->head, NULL);
        fifo->head = NULL;
        return error_code;
    }

    /* finish head initialization */
    opal_atomic_init(&(fifo->fifo_lock), OPAL_ATOMIC_UNLOCKED);
    /* a single wrapper: the ring closes on itself */
    fifo->head->next_fifo_wrapper = fifo->head;
    fifo->head->cb_overflow=false;  /* no attempt to overflow the queue */

    /* set the tail: same wrapper, expressed as a receiver side address */
    fifo->tail = (ompi_cb_fifo_wrapper_t*)((char*)fifo->head - offset);

    /* return */
    return error_code;
}

/**
 * Try to write pointer to the head of the queue
 *
 * The value is first written to the current head circular buffer.  If
 * that buffer is full, the fifo lock is taken, the write is retried,
 * and if it still fails the head is moved to the next circular buffer
 * in the list (allocating and linking in a new one when the next
 * buffer is itself still marked as overflown).
 *
 * @param data Pointer value to write in specified slot (IN)
 *
 * @param fifo Pointer to data structure defining this fifo (IN)
 *
 * @returncode OMPI_SUCCESS if the value was written;
 *             OMPI_ERR_OUT_OF_RESOURCE if a new circular buffer was
 *             needed but the buffer budget (cb_count) is exhausted or
 *             the allocation failed; the error code returned by
 *             ompi_cb_fifo_init() if the new buffer could not be
 *             initialized; OMPI_ERROR if the write to the new head
 *             failed.  On the OMPI_ERR_OUT_OF_RESOURCE and
 *             ompi_cb_fifo_init() failure paths the current head is
 *             left marked as overflown.
 *
 */
static inline int ompi_fifo_write_to_head(void *data, ompi_fifo_t *fifo)
{
    int error_code;
    ompi_cb_fifo_wrapper_t *next_ff;

    /* attempt to write data to the head ompi_cb_fifo_t (fast path,
       no lock taken) */
    error_code = ompi_cb_fifo_write_to_head(data, &fifo->head->cb_fifo);

    /* If the queue is full, move on to the next circular buffer
       (creating one if necessary) and put the data in it. */
    if(OMPI_CB_ERROR == error_code) {
        /* NOTE: This is the lock described in the top-level comment
           in this file.  There is a corresponding use of this lock in
           the read routine.  We need to protect this whole
           section -- setting cb_overflow to true through setting the
           next_fifo_wrapper to the next circular buffer.  It is
           likely possible to do this in a finer grain; indeed, it is
           likely that we can get rid of this lock altogether, but it
           will take some refactoring to make the data updates
           safe.  */
        opal_atomic_lock(&fifo->fifo_lock);

        /* mark queue as overflown */
        fifo->head->cb_overflow = true;

        /* We retry to write to the old head before creating new one just in
         * case consumer read all entries after first attempt failed, but
         * before we set cb_overflow to true */
        error_code = ompi_cb_fifo_write_to_head(data, &fifo->head->cb_fifo);

        /* the retry succeeded: the head is usable again, so undo the
         * overflow mark and stay on this circular buffer */
        if(error_code != OMPI_CB_ERROR) {
            fifo->head->cb_overflow = false;
            opal_atomic_unlock(&(fifo->fifo_lock));
            return OMPI_SUCCESS;
        }

        /* see if next queue is available - while the next queue
         * has not been emptied, it will be marked as overflown */
        next_ff = fifo->head->next_fifo_wrapper;

        /* if next queue not available, allocate new queue */
        if (next_ff->cb_overflow) {
            /* allocate a new wrapper, unless the budget of circular
             * buffers (cb_count) has been used up */
            if(0 == fifo->cb_count)
                next_ff = NULL;
            else
                next_ff = (ompi_cb_fifo_wrapper_t*)
                    fifo->fifo_mpool->mpool_alloc(fifo->fifo_mpool,
                            sizeof(ompi_cb_fifo_wrapper_t), getpagesize(), 0,
                            NULL);
            /* budget exhausted or allocation failed; note that the
             * current head stays marked as overflown */
            if (NULL == next_ff) {
                opal_atomic_unlock(&fifo->fifo_lock);
                return OMPI_ERR_OUT_OF_RESOURCE;
            }

            /* initialize the circular buffer fifo in the new wrapper,
             * reusing the settings of the current head */
            error_code = ompi_cb_fifo_init(fifo->size,
                    fifo->head->cb_fifo.lazy_free_frequency,
                    fifo->head_mpool, fifo->tail_mpool,
                    &(next_ff->cb_fifo), fifo->offset);
            if (OMPI_SUCCESS != error_code) {
                fifo->fifo_mpool->mpool_free(fifo->fifo_mpool, next_ff, NULL);
                opal_atomic_unlock(&fifo->fifo_lock);
                return error_code;
            }
            /* one fewer circular buffer may be allocated from now on */
            fifo->cb_count--;
            /* finish new element initialization */
            /* splice the new wrapper into the list right after the
             * current head; the new wrapper's own link is set before
             * the head is pointed at it */
            next_ff->next_fifo_wrapper = fifo->head->next_fifo_wrapper;
            next_ff->cb_overflow = false; /* no attempt to overflow the queue */
            fifo->head->next_fifo_wrapper = next_ff;
        }

        /* reset head pointer */
        fifo->head = next_ff;
        opal_atomic_unlock(&fifo->fifo_lock);

        /* write data to new head structure */
        error_code=ompi_cb_fifo_write_to_head(data, &fifo->head->cb_fifo);
        if( OMPI_CB_ERROR == error_code ) {
            return OMPI_ERROR;
        }
    }

    return OMPI_SUCCESS; 
}


/**
 * Try to read pointer from the tail of the queue
 *
 * Reads from the circular buffer currently at the tail.  When that
 * read reports the buffer as empty and the buffer had been marked as
 * overflown by the writer, the mark is cleared and, unless the writer
 * is still on this same buffer, the tail advances to the next
 * circular buffer in the list.
 *
 * @param fifo Pointer to data structure defining this fifo (IN)
 *
 * @returncode Pointer - OMPI_CB_FREE indicates no data to read
 *
 */
static inline
void *ompi_fifo_read_from_tail(ompi_fifo_t *fifo)
{
    bool drained;
    void *item;

    /* pull the next entry out of the tail circular buffer */
    item = ompi_cb_fifo_read_from_tail(&fifo->tail->cb_fifo,
            fifo->tail->cb_overflow, &drained);

    /* common case: the buffer still holds entries, nothing to manage */
    if (!drained) {
        return item;
    }

    /* The buffer was emptied; decide whether to move on to the next
       one.  See the big comment at the top of this file about this
       lock. */
    opal_atomic_lock(&(fifo->fifo_lock));
    if (fifo->tail->cb_overflow) {
        fifo->tail->cb_overflow = false;

        /* Advance only when the writer has already left this buffer;
           the head is a sender side address, so the tail is converted
           with the offset before comparing. */
        if (fifo->head !=
            (ompi_cb_fifo_wrapper_t*) (((char*)fifo->tail) + fifo->offset)) {
            fifo->tail = (ompi_cb_fifo_wrapper_t*)
                ((char*)fifo->tail->next_fifo_wrapper - fifo->offset);
        }
    }
    opal_atomic_unlock(&(fifo->fifo_lock));

    return item;
}

#endif				/* !_OMPI_FIFO */