/*
 * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2006 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2011      NVIDIA Corporation.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef OMPI_MCA_COMMON_CUDA_H
#define OMPI_MCA_COMMON_CUDA_H
#include "ompi/mca/btl/btl.h"
#include "opal/datatype/opal_convertor.h"

/* Lengths, in 64-bit words, of the opaque handle arrays stored in a CUDA
 * registration (8 words == 64 bytes each). */
#define MEMHANDLE_SIZE 8
#define EVTHANDLE_SIZE 8

/**
 * Memory pool registration extended with CUDA-specific data.
 *
 * The generic registration is embedded as the first member and is
 * followed by opaque 64-bit storage for two handles and one event.
 *
 * NOTE(review): the handle arrays are presumably sized to hold CUDA IPC
 * memory/event handles, and "event" presumably holds a CUDA event
 * object -- confirm against the implementation file.
 */
typedef struct mca_mpool_common_cuda_reg_t {
    mca_mpool_base_registration_t base;   /* generic registration data */
    uint64_t memHandle[MEMHANDLE_SIZE];   /* opaque memory handle storage */
    uint64_t evtHandle[EVTHANDLE_SIZE];   /* opaque event handle storage */
    uint64_t event;                       /* opaque event value */
} mca_mpool_common_cuda_reg_t;
/* Global flag; only declared here, the definition lives elsewhere.
 * NOTE(review): presumably true while CUDA support is active -- confirm
 * against the defining source file. */
extern bool mca_common_cuda_enabled;

/**
 * NOTE(review): judging by the name, registers the MCA variables of the
 * common CUDA code; the meaning of the int result is not visible in
 * this header -- confirm against the implementation.
 */
OMPI_DECLSPEC int mca_common_cuda_register_mca_variables(void);

/**
 * NOTE(review): presumably registers the "amount" bytes starting at
 * "ptr" with CUDA; "msg" looks like a caller-supplied label for
 * diagnostics -- confirm against the implementation.  No status is
 * returned to the caller.
 */
OMPI_DECLSPEC void mca_common_cuda_register(void *ptr, size_t amount, char *msg);

/**
 * NOTE(review): presumably the counterpart of
 * mca_common_cuda_register() for the region starting at "ptr"; "msg"
 * looks like a diagnostic label -- confirm.  No status is returned.
 */
OMPI_DECLSPEC void mca_common_cuda_unregister(void *ptr, char *msg);

/**
 * NOTE(review): presumably makes a stream wait on the event stored in
 * the given registration -- confirm against the implementation.
 */
OMPI_DECLSPEC void mca_common_wait_stream_synchronize(mca_mpool_common_cuda_reg_t *rget_reg);

/**
 * NOTE(review): semantics are inferred from names and signatures only;
 * confirm against the implementation.
 *
 * Presumably copies "amount" bytes from "src" to "dst".  The descriptor
 * "frag" and the out-parameter "done" suggest that the copy may finish
 * asynchronously, with *done telling the caller whether it has already
 * completed.  "msg" looks like a caller-supplied diagnostic label.
 */
OMPI_DECLSPEC int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
                                         struct mca_btl_base_descriptor_t *frag, int *done);

/*
 * Event recording, one function per kind of transfer (IPC,
 * device-to-host, host-to-device).
 * NOTE(review): each presumably records an event associated with
 * "frag" so that the matching progress_one_cuda_*_event() function can
 * later hand the descriptor back -- confirm.
 */
OMPI_DECLSPEC int mca_common_cuda_record_ipc_event(char *msg,
                                                   struct mca_btl_base_descriptor_t *frag);
OMPI_DECLSPEC int mca_common_cuda_record_dtoh_event(char *msg,
                                                    struct mca_btl_base_descriptor_t *frag);
OMPI_DECLSPEC int mca_common_cuda_record_htod_event(char *msg,
                                                    struct mca_btl_base_descriptor_t *frag);

/*
 * Accessors returning an untyped pointer.
 * NOTE(review): presumably the CUDA streams used for device-to-host
 * and host-to-device copies -- confirm.
 */
OMPI_DECLSPEC void *mca_common_cuda_get_dtoh_stream(void);
OMPI_DECLSPEC void *mca_common_cuda_get_htod_stream(void);

/*
 * Progress functions, one per kind of event.
 * NOTE(review): each presumably checks a single outstanding event and,
 * when it has completed, stores the associated descriptor in *frag;
 * the meaning of the int result is not visible here -- confirm.
 */
OMPI_DECLSPEC int progress_one_cuda_ipc_event(struct mca_btl_base_descriptor_t **frag);
OMPI_DECLSPEC int progress_one_cuda_dtoh_event(struct mca_btl_base_descriptor_t **frag);
OMPI_DECLSPEC int progress_one_cuda_htod_event(struct mca_btl_base_descriptor_t **frag);

/**
 * NOTE(review): presumably compares the memory handles stored in the
 * two registrations; which int value means "match" is not visible in
 * this header -- confirm against the implementation.
 */
OMPI_DECLSPEC int mca_common_cuda_memhandle_matches(mca_mpool_common_cuda_reg_t *new_reg,
                                                    mca_mpool_common_cuda_reg_t *old_reg);

/*
 * NOTE(review): presumably create an event together with a handle for
 * it (both returned through the pointer arguments), and destroy an
 * event created that way -- confirm ownership rules against the
 * implementation.  Neither function returns a status.
 */
OMPI_DECLSPEC void mca_common_cuda_construct_event_and_handle(uint64_t **event, void **handle);
OMPI_DECLSPEC void mca_common_cuda_destruct_event(uint64_t *event);

/*
 * Memory handle management.  The get/open functions take the same
 * argument list (base address, size, new registration, header
 * registration); the unget/close functions take the same pair
 * (registration data, registration).
 * NOTE(review): presumably get/unget act on the side that owns the
 * memory and open/close on the side that maps a peer's memory --
 * confirm against the implementation.
 */
OMPI_DECLSPEC int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *newreg,
                                    mca_mpool_base_registration_t *hdrreg);
OMPI_DECLSPEC int cuda_ungetmemhandle(void *reg_data, mca_mpool_base_registration_t *reg);
OMPI_DECLSPEC int cuda_openmemhandle(void *base, size_t size, mca_mpool_base_registration_t *newreg,
                                     mca_mpool_base_registration_t *hdrreg);
OMPI_DECLSPEC int cuda_closememhandle(void *reg_data, mca_mpool_base_registration_t *reg);

/* NOTE(review): presumably stores the current CUDA device number in
 * *devicenum -- confirm. */
OMPI_DECLSPEC int mca_common_cuda_get_device(int *devicenum);

/* NOTE(review): presumably stores in *access whether dev1 can access
 * memory on dev2 -- confirm direction and encoding. */
OMPI_DECLSPEC int mca_common_cuda_device_can_access_peer(int *access, int dev1, int dev2);

/* NOTE(review): name suggests the first step of a multi-stage
 * initialization -- confirm. */
OMPI_DECLSPEC int mca_common_cuda_stage_one_init(void);

/* NOTE(review): presumably returns, through pbase and psize, the start
 * and size of the allocation containing "base" -- confirm, including
 * what pbase actually points to given that it is declared void *. */
OMPI_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base);

/* The following declarations are only visible when OPAL_CUDA_GDR_SUPPORT
 * evaluates to non-zero. */
#if OPAL_CUDA_GDR_SUPPORT
OMPI_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg);
OMPI_DECLSPEC void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg);
#endif /* OPAL_CUDA_GDR_SUPPORT */
/**
 * Ask opal_convertor_need_buffers() whether packing is required, with
 * the CONVERTOR_CUDA flag hidden from it.
 *
 * The CUDA bit is masked out of pConvertor->flags before the call and
 * OR-ed back in afterwards if it was set, so the convertor is written
 * to even though the query is logically read-only.
 *
 * Return:   the result of opal_convertor_need_buffers(), i.e.
 *           0 if no packing is required for sending (the upper layer
 *             can use directly the pointer to the contiguous user
 *             buffer).
 *           1 if data does need to be packed, i.e. heterogeneous peers
 *             (source arch != dest arch) or non contiguous memory
 *             layout.
 */
static inline int32_t opal_convertor_cuda_need_buffers( opal_convertor_t* pConvertor )
{
    /* Remember whether the CUDA bit was set so it can be put back. */
    const uint32_t saved_cuda_bit = pConvertor->flags & CONVERTOR_CUDA;
    int32_t need_buffers;

    /* Hide the CUDA bit from the generic check. */
    pConvertor->flags &= ~CONVERTOR_CUDA;
    need_buffers = opal_convertor_need_buffers(pConvertor);

    /* Put the CUDA bit back exactly as it was found. */
    pConvertor->flags |= saved_cuda_bit;

    return need_buffers;
}

#endif /* OMPI_MCA_COMMON_CUDA_H */