1
1
openmpi/ompi/mca/mtl/portals4/mtl_portals4.h

339 строки
14 KiB
C
Исходник Обычный вид История

/*
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010-2012 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MTL_PORTALS_H_HAS_BEEN_INCLUDED
#define MTL_PORTALS_H_HAS_BEEN_INCLUDED
#include <portals4.h>
#include "opal/include/opal_config.h"
#include "opal/class/opal_free_list.h"
#include "opal/class/opal_list.h"
#include "opal/datatype/opal_convertor.h"
#include "ompi/mca/mtl/mtl.h"
#include "ompi/mca/mtl/base/base.h"
#include "mtl_portals4_flowctl.h"
BEGIN_C_DECLS
struct mca_mtl_portals4_send_request_t;
struct mca_mtl_portals4_module_t {
mca_mtl_base_module_t base;
/** Eager limit; messages greater than this use a rendezvous protocol */
MCA/base: Add new MCA variable system Features: - Support for an override parameter file (openmpi-mca-param-override.conf). Variable values in this file can not be overridden by any file or environment value. - Support for boolean, unsigned, and unsigned long long variables. - Support for true/false values. - Support for enumerations on integer variables. - Support for MPIT scope, verbosity, and binding. - Support for command line source. - Support for setting variable source via the environment using OMPI_MCA_SOURCE_<var name>=source (either command or file:filename) - Cleaner API. - Support for variable groups (equivalent to MPIT categories). Notes: - Variables must be created with a backing store (char **, int *, or bool *) that must live at least as long as the variable. - Creating a variable with the MCA_BASE_VAR_FLAG_SETTABLE enables the use of mca_base_var_set_value() to change the value. - String values are duplicated when the variable is registered. It is up to the caller to free the original value if necessary. The new value will be freed by the mca_base_var system and must not be freed by the user. - Variables with constant scope may not be settable. - Variable groups (and all associated variables) are deregistered when the component is closed or the component repository item is freed. This prevents a segmentation fault from accessing a variable after its component is unloaded. - After some discussion we decided we should remove the automatic registration of component priority variables. Few component actually made use of this feature. - The enumerator interface was updated to be general enough to handle future uses of the interface. - The code to generate ompi_info output has been moved into the MCA variable system. See mca_base_var_dump(). opal: update core and components to mca_base_var system orte: update core and components to mca_base_var system ompi: update core and components to mca_base_var system This commit also modifies the rmaps framework. 
The following variables were moved from ppr and lama: rmaps_base_pernode, rmaps_base_n_pernode, rmaps_base_n_persocket. Both lama and ppr create synonyms for these variables. This commit was SVN r28236.
2013-03-27 21:09:41 +00:00
unsigned long long eager_limit;
/** Size of short message blocks */
MCA/base: Add new MCA variable system Features: - Support for an override parameter file (openmpi-mca-param-override.conf). Variable values in this file can not be overridden by any file or environment value. - Support for boolean, unsigned, and unsigned long long variables. - Support for true/false values. - Support for enumerations on integer variables. - Support for MPIT scope, verbosity, and binding. - Support for command line source. - Support for setting variable source via the environment using OMPI_MCA_SOURCE_<var name>=source (either command or file:filename) - Cleaner API. - Support for variable groups (equivalent to MPIT categories). Notes: - Variables must be created with a backing store (char **, int *, or bool *) that must live at least as long as the variable. - Creating a variable with the MCA_BASE_VAR_FLAG_SETTABLE enables the use of mca_base_var_set_value() to change the value. - String values are duplicated when the variable is registered. It is up to the caller to free the original value if necessary. The new value will be freed by the mca_base_var system and must not be freed by the user. - Variables with constant scope may not be settable. - Variable groups (and all associated variables) are deregistered when the component is closed or the component repository item is freed. This prevents a segmentation fault from accessing a variable after its component is unloaded. - After some discussion we decided we should remove the automatic registration of component priority variables. Few component actually made use of this feature. - The enumerator interface was updated to be general enough to handle future uses of the interface. - The code to generate ompi_info output has been moved into the MCA variable system. See mca_base_var_dump(). opal: update core and components to mca_base_var system orte: update core and components to mca_base_var system ompi: update core and components to mca_base_var system This commit also modifies the rmaps framework. 
The following variables were moved from ppr and lama: rmaps_base_pernode, rmaps_base_n_pernode, rmaps_base_n_persocket. Both lama and ppr create synonyms for these variables. This commit was SVN r28236.
2013-03-27 21:09:41 +00:00
unsigned long long recv_short_size;
/** Number of short message blocks which should be created during startup */
int recv_short_num;
/** Length of the send event queues */
int send_queue_size;
/** Length of the receive event queues */
int recv_queue_size;
/** Protocol for long message transfer */
enum { eager, rndv } protocol;
/* free list of message for matched probe */
opal_free_list_t fl_message;
/** Network interface handle for matched interface */
ptl_handle_ni_t ni_h;
/** Uid for current user */
ptl_uid_t uid;
/** portals index for message matching */
ptl_pt_index_t recv_idx;
/** portals index for long message rendezvous */
ptl_pt_index_t read_idx;
/** portals index for flow control recovery */
ptl_pt_index_t flowctl_idx;
/** Event queue handles. See send_eq_h and recv_eq_h defines for
usage. Array for PtlEQPoll */
ptl_handle_eq_t eqs_h[2];
/** MD handle for sending ACKS */
ptl_handle_md_t zero_md_h;
/** Send MD handle(s). Use ompi_mtl_portals4_get_md() to get the right md */
#if OPAL_PORTALS4_MAX_MD_SIZE < OPAL_PORTALS4_MAX_VA_SIZE
ptl_handle_md_t *send_md_hs;
#else
ptl_handle_md_t send_md_h;
#endif
/** long message receive overflow ME. Persistent ME, first in
overflow list on the recv_idx portal table. */
ptl_handle_me_t long_overflow_me_h;
/** List of active short receive blocks. Active means that the ME
was posted to the overflow list and the UNLINK event has not
yet been received. */
opal_list_t active_recv_short_blocks;
/** List of short receive blocks waiting for FREE event. Blocks
are added to this list when the UNLINK event has been
received and removed when the FREE event is received. */
opal_list_t waiting_recv_short_blocks;
/** number of send-side operations started */
uint64_t opcount;
#if OPAL_ENABLE_DEBUG
/** number of receive-side operations started. Used only for
debugging */
uint64_t recv_opcount;
#endif
#if OMPI_MTL_PORTALS4_FLOW_CONTROL
ompi_mtl_portals4_flowctl_t flowctl;
#endif
};
typedef struct mca_mtl_portals4_module_t mca_mtl_portals4_module_t;
/* Named accessors for the two slots of the module's eqs_h[] array:
 * slot 0 is the send event queue and slot 1 the receive event queue.
 * These are plain object-like macros, so every identifier spelled
 * send_eq_h or recv_eq_h after this point is rewritten, e.g.
 * ompi_mtl_portals4.send_eq_h becomes ompi_mtl_portals4.eqs_h[0]. */
#define send_eq_h eqs_h[0]
#define recv_eq_h eqs_h[1]
/* The module instance.  Only declared here; the definition is not in
 * this header. */
extern mca_mtl_portals4_module_t ompi_mtl_portals4;
/* NOTE(review): judging by the names, these are the portal table
 * indices requested for the module's recv_idx, read_idx and
 * flowctl_idx entries -- confirm against the code that allocates the
 * portal table entries. */
#define REQ_RECV_TABLE_ID 12
#define REQ_READ_TABLE_ID 13
#define REQ_FLOWCTL_TABLE_ID 14
/* Flow-control constants.  NOTE(review): their consumers are not
 * visible in this header; see the flow-control code
 * (mtl_portals4_flowctl.h is included above) for how they are used. */
#define MTL_PORTALS4_FLOWCTL_TRIGGER 0x01
#define MTL_PORTALS4_FLOWCTL_ALERT 0x02
#define MTL_PORTALS4_FLOWCTL_FANIN 0x03
#define MTL_PORTALS4_FLOWCTL_FANOUT 0x04
/* match/ignore bit manipulation
 *
 * The 64 match bits are split into four fields (bit 63 is the most
 * significant bit):
 *
 *    63    60 59          48 47            32 31                     0
 *   +--------+--------------+----------------+------------------------+
 *   |protocol|  context id  |     source     |      message tag       |
 *   +--------+--------------+----------------+------------------------+
 *     4 bits      12 bits        16 bits              32 bits
 *
 * Each *_MASK macro below selects exactly one of these fields.
 */
#define MTL_PORTALS4_PROTOCOL_MASK 0xF000000000000000ULL
#define MTL_PORTALS4_CONTEXT_MASK 0x0FFF000000000000ULL
#define MTL_PORTALS4_SOURCE_MASK 0x0000FFFF00000000ULL
#define MTL_PORTALS4_TAG_MASK 0x00000000FFFFFFFFULL
/* Ignore-bit patterns, one per field.  Three of them equal the field
 * mask; MTL_PORTALS4_TAG_IGNR covers only the low 31 bits of the tag
 * field, so bit 31 of the tag is still compared when the tag is
 * otherwise ignored.
 * NOTE(review): presumably this keeps wildcard-tag receives from
 * matching tags whose top bit is set (negative tags) -- confirm. */
#define MTL_PORTALS4_PROTOCOL_IGNR MTL_PORTALS4_PROTOCOL_MASK
#define MTL_PORTALS4_CONTEXT_IGNR MTL_PORTALS4_CONTEXT_MASK
#define MTL_PORTALS4_SOURCE_IGNR MTL_PORTALS4_SOURCE_MASK
#define MTL_PORTALS4_TAG_IGNR 0x000000007FFFFFFFULL
/* Values stored in the protocol field (bits 63..60) */
#define MTL_PORTALS4_SHORT_MSG 0x1000000000000000ULL
#define MTL_PORTALS4_LONG_MSG 0x2000000000000000ULL
/* send posting
 *
 * Build the match bits for an outgoing message:
 *
 *   match_bits = (contextid << 48) | (source << 32)
 *                | (tag & MTL_PORTALS4_TAG_MASK) | type
 *
 * match_bits  lvalue that receives the result (evaluated several times)
 * contextid   value placed above the source field; not masked here
 * source      value placed in bits 47..32; not masked here
 * tag         message tag, truncated to the 32-bit tag field
 * type        protocol bits to OR in (e.g. MTL_PORTALS4_SHORT_MSG)
 *
 * Every argument is parenthesized in the expansion so that expression
 * arguments (for example "cond ? MTL_PORTALS4_LONG_MSG :
 * MTL_PORTALS4_SHORT_MSG" as type) are combined as a unit instead of
 * being split by operator precedence.  The expansion is deliberately
 * left as a bare brace block so existing call sites keep compiling.
 */
#define MTL_PORTALS4_SET_SEND_BITS(match_bits, contextid, source, tag, type) \
{                                                                    \
    (match_bits) = (contextid);                                      \
    (match_bits) = ((match_bits) << 16);                             \
    (match_bits) |= (source);                                        \
    (match_bits) = ((match_bits) << 32);                             \
    (match_bits) |= (MTL_PORTALS4_TAG_MASK & (tag)) | (type);        \
}
/* receive posting
 *
 * Build the match bits and ignore bits for a posted receive.
 *
 * match_bits   lvalue receiving the bits that must match
 * ignore_bits  lvalue receiving the bits to be ignored
 * contextid    value placed above the source field; not masked here
 * source       sender rank, or MPI_ANY_SOURCE to wildcard the source
 * tag          message tag, or MPI_ANY_TAG to wildcard the tag
 *
 * The protocol field is always ignored.  A wildcard source leaves the
 * source field zero and adds MTL_PORTALS4_SOURCE_IGNR; a wildcard tag
 * leaves the tag field zero and adds MTL_PORTALS4_TAG_IGNR.
 *
 * Arguments are parenthesized in the expansion so that expression
 * arguments (for example "any ? MPI_ANY_SOURCE : rank") are compared
 * and combined as a unit.  source and tag are each evaluated twice, so
 * they should be free of side effects.  The expansion is deliberately
 * left as a bare brace block so existing call sites keep compiling.
 */
#define MTL_PORTALS4_SET_RECV_BITS(match_bits, ignore_bits, contextid, source, tag) \
{                                                                    \
    (ignore_bits) = MTL_PORTALS4_PROTOCOL_IGNR;                      \
                                                                     \
    (match_bits) = (contextid);                                      \
    (match_bits) = ((match_bits) << 16);                             \
                                                                     \
    if (MPI_ANY_SOURCE == (source)) {                                \
        (match_bits) = ((match_bits) << 32);                         \
        (ignore_bits) |= MTL_PORTALS4_SOURCE_IGNR;                   \
    } else {                                                         \
        (match_bits) |= (source);                                    \
        (match_bits) = ((match_bits) << 32);                         \
    }                                                                \
                                                                     \
    if (MPI_ANY_TAG == (tag)) {                                      \
        (ignore_bits) |= MTL_PORTALS4_TAG_IGNR;                      \
    } else {                                                         \
        (match_bits) |= (MTL_PORTALS4_TAG_MASK & (tag));             \
    }                                                                \
}
/* Match-bit inspection helpers.
 *
 * The IS_*_MSG macros test a protocol flag and yield 1 or 0; GET_TAG
 * and GET_SOURCE extract the tag field (low 32 bits) and the source
 * field (bits 47..32) as int.  The argument is parenthesized in every
 * expansion so that an expression such as "a | b" is tested as a whole.
 */
#define MTL_PORTALS4_IS_SHORT_MSG(match_bits)           \
    (0 != (MTL_PORTALS4_SHORT_MSG & (match_bits)))
#define MTL_PORTALS4_IS_LONG_MSG(match_bits)            \
    (0 != (MTL_PORTALS4_LONG_MSG & (match_bits)))
/* NOTE(review): MTL_PORTALS4_READY_MSG is not defined in the visible
 * part of this header; unless it is defined elsewhere, any use of this
 * macro will fail to compile -- confirm, then define the flag or drop
 * the macro. */
#define MTL_PORTALS4_IS_READY_MSG(match_bits)           \
    (0 != (MTL_PORTALS4_READY_MSG & (match_bits)))
#define MTL_PORTALS4_GET_TAG(match_bits)                \
    ((int)((match_bits) & MTL_PORTALS4_TAG_MASK))
#define MTL_PORTALS4_GET_SOURCE(match_bits)             \
    ((int)(((match_bits) & MTL_PORTALS4_SOURCE_MASK) >> 32))
/* Header-data packing.
 *
 * The 64-bit header data word is laid out as:
 *
 *   bit  63      synchronous-send flag (MTL_PORTALS4_SYNC_MSG)
 *   bits 62..48  low 15 bits of the operation count
 *   bits 47..0   low 48 bits of the message length
 */
#define MTL_PORTALS4_SYNC_MSG 0x8000000000000000ULL
/* Pack sync flag, operation count and length into hdr_data (an lvalue,
 * evaluated several times).  opcount and length are parenthesized
 * before masking so that an expression argument cannot spill into a
 * neighbouring field.  The expansion is deliberately left as a bare
 * brace block so existing call sites keep compiling. */
#define MTL_PORTALS4_SET_HDR_DATA(hdr_data, opcount, length, sync)   \
{                                                                    \
    (hdr_data) = (sync) ? 1 : 0;                                     \
    (hdr_data) = ((hdr_data) << 15);                                 \
    (hdr_data) |= ((opcount) & 0x7FFFULL);                           \
    (hdr_data) = ((hdr_data) << 48);                                 \
    (hdr_data) |= ((length) & 0xFFFFFFFFFFFFULL);                    \
}
/* Extract the 48-bit length field as a size_t */
#define MTL_PORTALS4_GET_LENGTH(hdr_data) ((size_t)((hdr_data) & 0xFFFFFFFFFFFFULL))
/* Non-zero when the synchronous-send flag is set */
#define MTL_PORTALS4_IS_SYNC_MSG(hdr_data)              \
    (0 != (MTL_PORTALS4_SYNC_MSG & (hdr_data)))
/*
 * Send-side memory descriptor (MD) selection.
 *
 * Some Portals 4 implementations cannot bind one memory descriptor
 * over all of memory, but all of them can cover a large fraction of
 * it.  Instead of pinning per message, a set of memory descriptors is
 * used to cover the whole address space.  This works well as long as
 * the largest MD is a large fraction of the user virtual address
 * space (e.g. 46 bit MDs on a platform with 47 bits of user virtual
 * address space).
 *
 * The scheme: N memory descriptors contiguously cover the entire user
 * address space; another N-1 contiguous descriptors are offset by
 * half the MD size; and one last descriptor of half the normal size
 * covers the top of the address space, which avoids if statements in
 * the critical path.  A message of up to half the MD size therefore
 * never crosses an MD boundary.  Because every size involved is a
 * power of two, picking the MD and its base address needs only a
 * shift and a mask.
 *
 * ompi_mtl_portals4_get_md() leans on compiler constant folding: the
 * index mask folds to a constant, and the index itself becomes a
 * shift of the (non-constant) pointer by a constant followed by a
 * constant mask.
 *
 * When a single MD can cover all of memory the function reduces to
 * two assignments, which should become plain register assignments
 * for the Portals call once the function is inlined.
 *
 * ptr       address of the buffer about to be used
 * md_h      out: MD handle to use for ptr
 * base_ptr  out: start address associated with the selected MD; the
 *           index multiplied by 2^(OPAL_PORTALS4_MAX_MD_SIZE - 1), or
 *           0 when there is only one MD
 */
static inline void
ompi_mtl_portals4_get_md(const void *ptr, ptl_handle_md_t *md_h, void **base_ptr)
{
#if OPAL_PORTALS4_MAX_MD_SIZE >= OPAL_PORTALS4_MAX_VA_SIZE
    /* one descriptor spans the whole address space */
    *md_h = ompi_mtl_portals4.send_md_h;
    *base_ptr = 0;
#else
    /* the index is the address counted in half-MD-sized steps */
    int index_mask = (1ULL << (OPAL_PORTALS4_MAX_VA_SIZE - OPAL_PORTALS4_MAX_MD_SIZE + 1)) - 1;
    int md_index = (((uintptr_t) ptr) >> (OPAL_PORTALS4_MAX_MD_SIZE - 1)) & index_mask;

    *md_h = ompi_mtl_portals4.send_md_hs[md_index];
    *base_ptr = (void*) (((unsigned long long) md_index) << (OPAL_PORTALS4_MAX_MD_SIZE - 1));
#endif
}
/*
 * Number of send memory descriptors.
 *
 * Returns 1 when a single MD can cover the whole virtual address
 * space, and 2^(OPAL_PORTALS4_MAX_VA_SIZE - OPAL_PORTALS4_MAX_MD_SIZE + 1)
 * otherwise, which is one more than the largest index that
 * ompi_mtl_portals4_get_md() can produce.
 */
static inline int
ompi_mtl_portals4_get_num_mds(void)
{
    int md_count = 1;

#if OPAL_PORTALS4_MAX_MD_SIZE < OPAL_PORTALS4_MAX_VA_SIZE
    md_count <<= (OPAL_PORTALS4_MAX_VA_SIZE - OPAL_PORTALS4_MAX_MD_SIZE + 1);
#endif

    return md_count;
}
/* MTL interface functions
 *
 * Declarations only; none of these functions is defined in this
 * header.  Every function that takes an "mtl" argument receives a
 * pointer to the base MTL module structure, and all of them return
 * int.
 * NOTE(review): the return value is presumably an Open MPI status
 * code -- confirm against the implementations.
 */
/* Module teardown entry point */
extern int ompi_mtl_portals4_finalize(struct mca_mtl_base_module_t *mtl);
/* Process add/remove entry points; procs is an array of nprocs
 * process pointers */
extern int ompi_mtl_portals4_add_procs(struct mca_mtl_base_module_t* mtl,
size_t nprocs,
struct ompi_proc_t** procs);
extern int ompi_mtl_portals4_del_procs(struct mca_mtl_base_module_t* mtl,
size_t nprocs,
struct ompi_proc_t** procs);
/* Send entry points.  Both take the communicator, destination, tag,
 * a convertor and the send mode; isend additionally takes a
 * "blocking" flag and the MTL request to associate with the
 * operation. */
extern int ompi_mtl_portals4_send(struct mca_mtl_base_module_t* mtl,
struct ompi_communicator_t* comm,
int dest,
int tag,
struct opal_convertor_t *convertor,
mca_pml_base_send_mode_t mode);
extern int ompi_mtl_portals4_isend(struct mca_mtl_base_module_t* mtl,
struct ompi_communicator_t* comm,
int dest,
int tag,
struct opal_convertor_t *convertor,
mca_pml_base_send_mode_t mode,
bool blocking,
mca_mtl_request_t *mtl_request);
/* Receive entry point: communicator, source, tag, a convertor and
 * the MTL request to associate with the operation */
extern int ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl,
struct ompi_communicator_t *comm,
int src,
int tag,
struct opal_convertor_t *convertor,
mca_mtl_request_t *mtl_request);
/* Probe entry point; reports through the flag and status output
 * pointers */
extern int ompi_mtl_portals4_iprobe(struct mca_mtl_base_module_t* mtl,
struct ompi_communicator_t *comm,
int src,
int tag,
int *flag,
struct ompi_status_public_t *status);
/* Matched receive / matched probe entry points; both operate on a
 * pointer to a message pointer */
extern int ompi_mtl_portals4_imrecv(struct mca_mtl_base_module_t* mtl,
struct opal_convertor_t *convertor,
struct ompi_message_t **message,
struct mca_mtl_request_t *mtl_request);
extern int ompi_mtl_portals4_improbe(struct mca_mtl_base_module_t *mtl,
struct ompi_communicator_t *comm,
int src,
int tag,
int *matched,
struct ompi_message_t **message,
struct ompi_status_public_t *status);
/* Cancel entry point for a previously started MTL request.
 * NOTE(review): the meaning of "flag" is not visible here -- confirm
 * against the implementation. */
extern int ompi_mtl_portals4_cancel(struct mca_mtl_base_module_t* mtl,
mca_mtl_request_t *mtl_request,
int flag);
/* Communicator add/remove entry points */
extern int ompi_mtl_portals4_add_comm(struct mca_mtl_base_module_t *mtl,
struct ompi_communicator_t *comm);
extern int ompi_mtl_portals4_del_comm(struct mca_mtl_base_module_t *mtl,
struct ompi_communicator_t *comm);
/* Progress function; takes no arguments */
extern int ompi_mtl_portals4_progress(void);
/* Takes a Portals error code (per the parameter name) and returns an
 * int.  NOTE(review): presumably the corresponding Open MPI error
 * code -- confirm against the implementation. */
extern int ompi_mtl_portals4_get_error(int ptl_error);
END_C_DECLS
#endif /* MTL_PORTALS_H_HAS_BEEN_INCLUDED */