2015-02-19 23:41:41 +03:00
|
|
|
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
2013-07-12 00:47:08 +04:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
|
|
|
* Copyright (c) 2010-2012 Sandia National Laboratories. All rights reserved.
|
2015-01-29 20:07:53 +03:00
|
|
|
* Copyright (c) 2014 Bull SAS. All rights reserved.
|
2015-02-19 23:41:41 +03:00
|
|
|
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
|
|
|
* reserved.
|
2013-07-12 00:47:08 +04:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef BTL_PORTALS_H_HAS_BEEN_INCLUDED
|
|
|
|
#define BTL_PORTALS_H_HAS_BEEN_INCLUDED
|
|
|
|
|
|
|
|
#include <portals4.h>
|
|
|
|
#include <btl_portals4_frag.h>
|
|
|
|
|
|
|
|
#include "opal/class/opal_free_list.h"
|
|
|
|
#include "opal/class/opal_list.h"
|
|
|
|
#include "opal/datatype/opal_convertor.h"
|
George did the work and deserves all the credit for it. Ralph did the merge, and deserves whatever blame results from errors in it :-)
WHAT: Open our low-level communication infrastructure by moving all necessary components (btl/rcache/allocator/mpool) down in OPAL
All the components required for inter-process communications are currently deeply integrated in the OMPI layer. Several groups/institutions have expressed interest in having a more generic communication infrastructure, without all the OMPI layer dependencies. This communication layer should be made available at a different software level, available to all layers in the Open MPI software stack. As an example, our ORTE layer could replace the current OOB and instead use the BTL directly, gaining access to more reactive network interfaces than TCP. Similarly, external software libraries could take advantage of our highly optimized AM (active message) communication layer for their own purpose. UTK, with support from Sandia, developed a version of Open MPI where the entire communication infrastructure has been moved down to OPAL (btl/rcache/allocator/mpool). Most of the moved components have been updated to match the new schema, with few exceptions (mainly BTLs where I have no way of compiling/testing them). Thus, the completion of this RFC is tied to being able to complete this move for all BTLs. For this we need help from the rest of the Open MPI community, especially those supporting some of the BTLs. A non-exhaustive list of BTLs that qualify here is: mx, portals4, scif, udapl, ugni, usnic.
This commit was SVN r32317.
2014-07-26 04:47:28 +04:00
|
|
|
#include "opal/mca/btl/btl.h"
|
|
|
|
#include "opal/mca/btl/base/base.h"
|
2014-07-28 23:25:03 +04:00
|
|
|
#include "opal/mca/btl/base/btl_base_error.h"
|
2013-07-12 00:47:08 +04:00
|
|
|
|
|
|
|
BEGIN_C_DECLS
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Portals BTL component.
|
|
|
|
*/
|
|
|
|
struct mca_btl_portals4_component_t {
|
|
|
|
/* base BTL component */
|
|
|
|
mca_btl_base_component_2_0_0_t super;
|
2015-06-24 06:59:57 +03:00
|
|
|
|
2014-01-10 00:01:42 +04:00
|
|
|
unsigned int num_btls;
|
|
|
|
unsigned int max_btls; /* Maximum number of accepted Portals4 cards */
|
|
|
|
|
|
|
|
struct mca_btl_portals4_module_t** btls; /* array of available BTL modules */
|
|
|
|
|
2015-09-25 03:19:06 +03:00
|
|
|
/* add_procs() can get called multiple times. this prevents multiple calls to portals4_init_interface(). */
|
|
|
|
int need_init;
|
|
|
|
|
2015-01-29 20:04:55 +03:00
|
|
|
/* Use the logical to physical table to accelerate portals4 adressing: 1 (true) : 0 (false) */
|
|
|
|
int use_logical;
|
|
|
|
|
2013-07-12 00:47:08 +04:00
|
|
|
/* initial size of free lists */
|
|
|
|
int portals_free_list_init_num;
|
|
|
|
/* max size of free lists */
|
|
|
|
int portals_free_list_max_num;
|
|
|
|
/* numer of elements to grow free lists */
|
|
|
|
int portals_free_list_inc_num;
|
|
|
|
|
|
|
|
/* number of eager fragments */
|
|
|
|
int portals_free_list_eager_max_num;
|
|
|
|
|
|
|
|
/* do I need a portals ACK? */
|
|
|
|
int portals_need_ack;
|
2014-01-10 00:01:42 +04:00
|
|
|
|
|
|
|
/** Length of the receive event queues */
|
|
|
|
int recv_queue_size;
|
|
|
|
|
|
|
|
/* number outstanding sends and local rdma */
|
|
|
|
int32_t portals_max_outstanding_ops;
|
|
|
|
|
|
|
|
/* incoming send message receive memory descriptors */
|
|
|
|
int portals_recv_mds_num;
|
|
|
|
int portals_recv_mds_size;
|
|
|
|
|
|
|
|
/** Event queue handles table used in PtlEQPoll */
|
|
|
|
ptl_handle_eq_t *eqs_h;
|
2013-07-12 00:47:08 +04:00
|
|
|
};
|
|
|
|
|
|
|
|
typedef struct mca_btl_portals4_component_t mca_btl_portals4_component_t;
|
|
|
|
|
|
|
|
struct mca_btl_portals4_module_t {
|
|
|
|
/* base BTL module interface */
|
|
|
|
mca_btl_base_module_t super;
|
|
|
|
|
|
|
|
/* number of processes we're actively connected to. Needed to
|
|
|
|
know when to do activation / shutdown */
|
|
|
|
int32_t portals_num_procs;
|
|
|
|
|
2014-01-10 00:01:42 +04:00
|
|
|
/* number of the interface (btl) */
|
|
|
|
uint32_t interface_num;
|
2013-07-12 00:47:08 +04:00
|
|
|
|
|
|
|
/* fragment free lists */
|
2015-02-19 23:41:41 +03:00
|
|
|
opal_free_list_t portals_frag_eager;
|
|
|
|
opal_free_list_t portals_frag_max;
|
|
|
|
opal_free_list_t portals_frag_user;
|
2013-07-12 00:47:08 +04:00
|
|
|
|
|
|
|
opal_list_t portals_recv_blocks;
|
|
|
|
|
|
|
|
/** Length of the receive event queues */
|
|
|
|
int recv_queue_size;
|
|
|
|
|
|
|
|
/** Event queue handle */
|
|
|
|
ptl_handle_eq_t recv_eq_h;
|
|
|
|
|
|
|
|
/* number outstanding sends and local rdma */
|
|
|
|
volatile int32_t portals_outstanding_ops;
|
|
|
|
int32_t portals_max_outstanding_ops;
|
|
|
|
|
|
|
|
/* key to use for next rdma operation */
|
|
|
|
volatile int64_t portals_rdma_key;
|
|
|
|
|
|
|
|
/* our portals network interface */
|
|
|
|
ptl_handle_ni_t portals_ni_h;
|
|
|
|
|
|
|
|
/** portals index */
|
|
|
|
ptl_pt_index_t recv_idx;
|
|
|
|
|
|
|
|
/** MD handle for sending ACKS */
|
|
|
|
ptl_handle_md_t zero_md_h;
|
|
|
|
|
2015-05-11 19:25:57 +03:00
|
|
|
/** Send MD handle */
|
2013-07-12 01:00:11 +04:00
|
|
|
ptl_handle_md_t send_md_h;
|
2013-07-12 00:47:08 +04:00
|
|
|
|
|
|
|
/** long message receive overflow ME. Persistent ME, first in
|
|
|
|
overflow list on the recv_idx portal table. */
|
|
|
|
ptl_handle_me_t long_overflow_me_h;
|
|
|
|
};
|
|
|
|
|
|
|
|
typedef struct mca_btl_portals4_module_t mca_btl_portals4_module_t;
|
|
|
|
|
|
|
|
/*
 * Match/ignore bit manipulation.
 *
 * Layout of the 64 match bits, most significant bit first:
 *
 * 0123 4567 01234567 01234567 01234567 01234567 01234567 01234567 01234567
 *     |             |                 |
 * ^   | context id  |     source      |            message tag
 * |   |             |                 |
 * +---- protocol
 *
 * protocol: 4 bits, context id: 12 bits, source: 16 bits, tag: 32 bits.
 */

/* Field masks (each value is the field's all-ones pattern shifted into place). */
#define BTL_PORTALS4_PROTOCOL_MASK (0xFULL        << 60)
#define BTL_PORTALS4_CONTEXT_MASK  (0xFFFULL      << 48)
#define BTL_PORTALS4_SOURCE_MASK   (0xFFFFULL     << 32)
#define BTL_PORTALS4_TAG_MASK      (0xFFFFFFFFULL <<  0)

/* Ignore bits per field.  Note that the tag ignore pattern covers only
 * the low 31 bits, unlike BTL_PORTALS4_TAG_MASK. */
#define BTL_PORTALS4_PROTOCOL_IGNR BTL_PORTALS4_PROTOCOL_MASK
#define BTL_PORTALS4_CONTEXT_IGNR  BTL_PORTALS4_CONTEXT_MASK
#define BTL_PORTALS4_SOURCE_IGNR   BTL_PORTALS4_SOURCE_MASK
#define BTL_PORTALS4_TAG_IGNR      (0x7FFFFFFFULL <<  0)

/* Values of the protocol field. */
#define BTL_PORTALS4_SHORT_MSG (0x1ULL << 60)
#define BTL_PORTALS4_LONG_MSG  (0x2ULL << 60)
|
|
|
|
|
|
|
|
/*
 * send posting
 *
 * BTL_PORTALS4_SET_SEND_BITS(match_bits, contextid, source, tag, type)
 *
 * Builds the match bits for a send into the lvalue match_bits:
 *   - contextid is placed above bit 48,
 *   - source is OR-ed in at bits 32..47,
 *   - tag is masked with BTL_PORTALS4_TAG_MASK and placed in the low 32 bits,
 *   - type (one of the protocol values) is OR-ed in as given.
 *
 * contextid and source are NOT masked: values wider than their field
 * spill into the neighbouring field, so callers must keep them in range.
 *
 * Every argument is parenthesized so that expressions such as
 * "cond ? A : B" or "a | b" can be passed safely.  The body is wrapped
 * in do { } while (0) so the macro behaves as a single statement (usable
 * in an unbraced if/else); the invocation must end with a semicolon.
 * match_bits is evaluated several times and must be a side-effect-free
 * lvalue.
 */
#define BTL_PORTALS4_SET_SEND_BITS(match_bits, contextid, source, tag, type) \
    do {                                                                     \
        (match_bits) = (contextid);                                          \
        (match_bits) = ((match_bits) << 16);                                 \
        (match_bits) |= (source);                                            \
        (match_bits) = ((match_bits) << 32);                                 \
        (match_bits) |= (BTL_PORTALS4_TAG_MASK & (tag)) | (type);            \
    } while (0)
|
|
|
|
|
|
|
|
/*
 * BTL_PORTALS4_SET_HDR_DATA(hdr_data, opcount, length, sync)
 *
 * Packs three values into the lvalue hdr_data:
 *   bit  63      : 1 if sync is non-zero, else 0
 *   bits 48..62  : low 15 bits of opcount
 *   bits  0..47  : low 48 bits of length
 *
 * Every argument is parenthesized so that expressions (for example
 * "a | b" for opcount) are masked as a whole.  The body is wrapped in
 * do { } while (0) so the macro behaves as a single statement (usable in
 * an unbraced if/else); the invocation must end with a semicolon.
 * hdr_data is evaluated several times and must be a side-effect-free
 * lvalue of a 64-bit type.
 */
#define BTL_PORTALS4_SET_HDR_DATA(hdr_data, opcount, length, sync) \
    do {                                                           \
        (hdr_data) = (sync) ? 1 : 0;                               \
        (hdr_data) = ((hdr_data) << 15);                           \
        (hdr_data) |= (opcount) & 0x7FFFULL;                       \
        (hdr_data) = ((hdr_data) << 48);                           \
        (hdr_data) |= ((length) & 0xFFFFFFFFFFFFULL);              \
    } while (0)
|
|
|
|
|
2015-01-29 20:04:55 +03:00
|
|
|
/* NOTE(review): presumably the portal table index the BTL requests --
 * the users of this constant are not visible here; confirm. */
#define REQ_BTL_TABLE_ID 2
|
|
|
|
|
2013-07-12 00:47:08 +04:00
|
|
|
int mca_btl_portals4_component_progress(void);
|
2014-01-10 00:01:42 +04:00
|
|
|
void mca_btl_portals4_free_module(mca_btl_portals4_module_t *portals4_btl);
|
2013-07-12 00:47:08 +04:00
|
|
|
|
|
|
|
/* BTL interface functions */
|
|
|
|
int mca_btl_portals4_finalize(struct mca_btl_base_module_t* btl_base);
|
|
|
|
|
|
|
|
|
|
|
|
int mca_btl_portals4_add_procs(struct mca_btl_base_module_t* btl_base,
|
|
|
|
size_t nprocs,
|
George did the work and deserves all the credit for it. Ralph did the merge, and deserves whatever blame results from errors in it :-)
WHAT: Open our low-level communication infrastructure by moving all necessary components (btl/rcache/allocator/mpool) down in OPAL
All the components required for inter-process communications are currently deeply integrated in the OMPI layer. Several groups/institutions have express interest in having a more generic communication infrastructure, without all the OMPI layer dependencies. This communication layer should be made available at a different software level, available to all layers in the Open MPI software stack. As an example, our ORTE layer could replace the current OOB and instead use the BTL directly, gaining access to more reactive network interfaces than TCP. Similarly, external software libraries could take advantage of our highly optimized AM (active message) communication layer for their own purpose. UTK with support from Sandia, developped a version of Open MPI where the entire communication infrastucture has been moved down to OPAL (btl/rcache/allocator/mpool). Most of the moved components have been updated to match the new schema, with few exceptions (mainly BTLs where I have no way of compiling/testing them). Thus, the completion of this RFC is tied to being able to completing this move for all BTLs. For this we need help from the rest of the Open MPI community, especially those supporting some of the BTLs. A non-exhaustive list of BTLs that qualify here is: mx, portals4, scif, udapl, ugni, usnic.
This commit was SVN r32317.
2014-07-26 04:47:28 +04:00
|
|
|
struct opal_proc_t **procs,
|
2013-07-12 00:47:08 +04:00
|
|
|
struct mca_btl_base_endpoint_t** peers,
|
|
|
|
opal_bitmap_t* reachable);
|
|
|
|
|
|
|
|
int mca_btl_portals4_del_procs(struct mca_btl_base_module_t* btl_base,
|
|
|
|
size_t nprocs,
|
George did the work and deserves all the credit for it. Ralph did the merge, and deserves whatever blame results from errors in it :-)
WHAT: Open our low-level communication infrastructure by moving all necessary components (btl/rcache/allocator/mpool) down in OPAL
All the components required for inter-process communications are currently deeply integrated in the OMPI layer. Several groups/institutions have express interest in having a more generic communication infrastructure, without all the OMPI layer dependencies. This communication layer should be made available at a different software level, available to all layers in the Open MPI software stack. As an example, our ORTE layer could replace the current OOB and instead use the BTL directly, gaining access to more reactive network interfaces than TCP. Similarly, external software libraries could take advantage of our highly optimized AM (active message) communication layer for their own purpose. UTK with support from Sandia, developped a version of Open MPI where the entire communication infrastucture has been moved down to OPAL (btl/rcache/allocator/mpool). Most of the moved components have been updated to match the new schema, with few exceptions (mainly BTLs where I have no way of compiling/testing them). Thus, the completion of this RFC is tied to being able to completing this move for all BTLs. For this we need help from the rest of the Open MPI community, especially those supporting some of the BTLs. A non-exhaustive list of BTLs that qualify here is: mx, portals4, scif, udapl, ugni, usnic.
This commit was SVN r32317.
2014-07-26 04:47:28 +04:00
|
|
|
struct opal_proc_t **procs,
|
2013-07-12 00:47:08 +04:00
|
|
|
struct mca_btl_base_endpoint_t** peers);
|
|
|
|
|
2015-06-24 06:59:57 +03:00
|
|
|
/*
 * Descriptor management.  Only the declarations live in this header.
 */

/* Returns a pointer to a BTL descriptor. */
mca_btl_base_descriptor_t *
mca_btl_portals4_alloc(struct mca_btl_base_module_t *btl_base,
                       struct mca_btl_base_endpoint_t *endpoint,
                       uint8_t order,
                       size_t size,
                       uint32_t flags);

/* Takes a descriptor and returns an int status. */
int mca_btl_portals4_free(struct mca_btl_base_module_t *btl_base,
                          mca_btl_base_descriptor_t *des);

/* Returns a pointer to a BTL descriptor; size is passed by pointer. */
mca_btl_base_descriptor_t *
mca_btl_portals4_prepare_src(struct mca_btl_base_module_t *btl_base,
                             struct mca_btl_base_endpoint_t *peer,
                             struct opal_convertor_t *convertor,
                             uint8_t order,
                             size_t reserve,
                             size_t *size,
                             uint32_t flags);
|
|
|
|
|
|
|
|
int mca_btl_portals4_send(struct mca_btl_base_module_t* btl_base,
|
|
|
|
struct mca_btl_base_endpoint_t* btl_peer,
|
2015-06-24 06:59:57 +03:00
|
|
|
struct mca_btl_base_descriptor_t* descriptor,
|
2013-07-12 00:47:08 +04:00
|
|
|
mca_btl_base_tag_t tag);
|
|
|
|
|
|
|
|
|
|
|
|
int mca_btl_portals4_sendi(struct mca_btl_base_module_t* btl_base,
|
|
|
|
struct mca_btl_base_endpoint_t* endpoint,
|
|
|
|
struct opal_convertor_t* convertor,
|
|
|
|
void* header,
|
|
|
|
size_t header_size,
|
|
|
|
size_t payload_size,
|
|
|
|
uint8_t order,
|
|
|
|
uint32_t flags,
|
2015-06-24 06:59:57 +03:00
|
|
|
mca_btl_base_tag_t tag,
|
2013-07-12 00:47:08 +04:00
|
|
|
mca_btl_base_descriptor_t** des);
|
|
|
|
|
|
|
|
int mca_btl_portals4_put(struct mca_btl_base_module_t* btl_base,
|
|
|
|
struct mca_btl_base_endpoint_t* btl_peer,
|
|
|
|
struct mca_btl_base_descriptor_t* decriptor);
|
|
|
|
|
|
|
|
|
|
|
|
int mca_btl_portals4_get(struct mca_btl_base_module_t* btl_base,
|
|
|
|
struct mca_btl_base_endpoint_t* btl_peer,
|
2015-03-12 19:57:48 +03:00
|
|
|
void *local_address,
|
|
|
|
uint64_t remote_address,
|
|
|
|
struct mca_btl_base_registration_handle_t *local_handle,
|
|
|
|
struct mca_btl_base_registration_handle_t *remote_handle,
|
|
|
|
size_t size,
|
|
|
|
int flags,
|
|
|
|
int order,
|
|
|
|
mca_btl_base_rdma_completion_fn_t cbfunc,
|
|
|
|
void *cbcontext,
|
|
|
|
void *cbdata);
|
2013-07-12 00:47:08 +04:00
|
|
|
|
|
|
|
/* NOTE(review): name and parameter suggest this translates a Portals4
 * return code into another int error code; the mapping is not visible
 * in this header -- confirm against the definition. */
int mca_btl_portals4_get_error(int ptl_error);
|
|
|
|
|
2015-03-12 19:57:48 +03:00
|
|
|
struct mca_btl_base_registration_handle_t {
|
|
|
|
/** Portals4 match bits */
|
|
|
|
ptl_match_bits_t key;
|
2015-04-10 22:39:20 +03:00
|
|
|
/** Portals4 me_h */
|
|
|
|
ptl_handle_me_t me_h;
|
2015-03-12 19:57:48 +03:00
|
|
|
};
|
|
|
|
|
2013-07-12 00:47:08 +04:00
|
|
|
/*
|
|
|
|
* global structures
|
|
|
|
*/
|
George did the work and deserves all the credit for it. Ralph did the merge, and deserves whatever blame results from errors in it :-)
WHAT: Open our low-level communication infrastructure by moving all necessary components (btl/rcache/allocator/mpool) down in OPAL
All the components required for inter-process communications are currently deeply integrated in the OMPI layer. Several groups/institutions have expressed interest in having a more generic communication infrastructure, without all the OMPI layer dependencies. This communication layer should be made available at a different software level, available to all layers in the Open MPI software stack. As an example, our ORTE layer could replace the current OOB and instead use the BTL directly, gaining access to more reactive network interfaces than TCP. Similarly, external software libraries could take advantage of our highly optimized AM (active message) communication layer for their own purpose. UTK, with support from Sandia, developed a version of Open MPI where the entire communication infrastructure has been moved down to OPAL (btl/rcache/allocator/mpool). Most of the moved components have been updated to match the new schema, with few exceptions (mainly BTLs where I have no way of compiling/testing them). Thus, the completion of this RFC is tied to being able to complete this move for all BTLs. For this we need help from the rest of the Open MPI community, especially those supporting some of the BTLs. A non-exhaustive list of BTLs that qualify here is: mx, portals4, scif, udapl, ugni, usnic.
This commit was SVN r32317.
2014-07-26 04:47:28 +04:00
|
|
|
OPAL_MODULE_DECLSPEC extern mca_btl_portals4_component_t mca_btl_portals4_component;
|
2013-07-12 00:47:08 +04:00
|
|
|
extern mca_btl_portals4_module_t mca_btl_portals4_module;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* An abstraction that represents a connection to a endpoint process.
|
|
|
|
* An instance of mca_btl_base_endpoint_t is associated w/ each process
|
|
|
|
* and BTL pair at startup. However, connections to the endpoint
|
|
|
|
* are established dynamically on an as-needed basis:
|
|
|
|
*/
|
|
|
|
struct mca_btl_base_endpoint_t {
|
|
|
|
ptl_process_t ptl_proc;
|
|
|
|
};
|
|
|
|
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
|
|
|
|
|
|
|
|
END_C_DECLS
|
|
|
|
|
|
|
|
#endif /* BTL_PORTALS_H_HAS_BEEN_INCLUDED */
|