Enable RDMA for heterogeneous situations. Currently done by overloading
the ompi_convertor_need_buffers function to only return 0 if the convertor is homogeneous (which it never does on the trunk, but does on v1.2, but that's a different issue). Only enable the heterogeneous RDMA code for a btl if it supports it (via a flag), as some btls need some work for this to work properly. Currently only TCP and OpenIB have been extensively tested. This commit was SVN r15990.
Этот коммит содержится в:
родитель
dcf678dbab
Коммит
59b22533f2
@ -163,6 +163,7 @@ static inline int ompi_convertor_cleanup( ompi_convertor_t* convertor )
|
||||
static inline int32_t
|
||||
ompi_convertor_need_buffers( const ompi_convertor_t* pConvertor )
|
||||
{
|
||||
/* Convertor is not homogeneous (flag clear): always report that buffers are needed. */
if (OPAL_UNLIKELY(0 == (pConvertor->flags & CONVERTOR_HOMOGENEOUS))) return 1;
|
||||
if( pConvertor->flags & DT_FLAG_NO_GAPS ) return 0;
|
||||
if( (pConvertor->count == 1) && (pConvertor->flags & DT_FLAG_CONTIGUOUS) ) return 0;
|
||||
return 1;
|
||||
|
@ -437,8 +437,9 @@ int mca_bml_r2_add_procs(
|
||||
bml_endpoint->btl_max_send_size = btl->btl_max_send_size;
|
||||
|
||||
/* check flags - is rdma preferred */
|
||||
if(btl->btl_flags & (MCA_BTL_FLAGS_PUT|MCA_BTL_FLAGS_GET) &&
|
||||
proc->proc_arch == ompi_proc_local_proc->proc_arch) {
|
||||
if ((btl->btl_flags & (MCA_BTL_FLAGS_PUT|MCA_BTL_FLAGS_GET)) &&
|
||||
!((proc->proc_arch != ompi_proc_local_proc->proc_arch) &&
|
||||
(0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) {
|
||||
mca_bml_base_btl_t* bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma);
|
||||
*bml_btl_rdma = *bml_btl;
|
||||
if(bml_endpoint->btl_pipeline_send_length <
|
||||
|
@ -164,6 +164,9 @@ typedef uint8_t mca_btl_base_tag_t;
|
||||
/* btl needs local rdma completion */
|
||||
#define MCA_BTL_FLAGS_RDMA_COMPLETION 0x0080
|
||||
|
||||
/* btl can do heterogeneous rdma operations on byte buffers */
|
||||
#define MCA_BTL_FLAGS_HETEROGENEOUS_RDMA 0x0100
|
||||
|
||||
/* Default exclusivity levels */
|
||||
#define MCA_BTL_EXCLUSIVITY_HIGH (64*1024) /* internal loopback */
|
||||
#define MCA_BTL_EXCLUSIVITY_DEFAULT 1024 /* GM/IB/etc. */
|
||||
|
@ -35,6 +35,7 @@
|
||||
#include "btl_openib_endpoint.h"
|
||||
#include "ompi/datatype/convertor.h"
|
||||
#include "ompi/datatype/datatype.h"
|
||||
#include "ompi/datatype/dt_arch.h"
|
||||
#include "ompi/mca/mpool/base/base.h"
|
||||
#include "ompi/mca/mpool/mpool.h"
|
||||
#include "ompi/mca/mpool/rdma/mpool_rdma.h"
|
||||
@ -938,8 +939,17 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl,
|
||||
int ib_rc;
|
||||
|
||||
frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED;
|
||||
frag->wr_desc.sr_desc.wr.rdma.remote_addr = frag->base.des_dst->seg_addr.lval;
|
||||
frag->wr_desc.sr_desc.wr.rdma.rkey = frag->base.des_dst->seg_key.key32[0];
|
||||
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
if ((endpoint->endpoint_proc->proc_ompi->proc_arch & OMPI_ARCH_ISBIGENDIAN) !=
|
||||
(ompi_proc_local()->proc_arch & OMPI_ARCH_ISBIGENDIAN)) {
|
||||
frag->wr_desc.sr_desc.wr.rdma.remote_addr = opal_swap_bytes8(frag->base.des_dst->seg_addr.lval);
|
||||
frag->wr_desc.sr_desc.wr.rdma.rkey = opal_swap_bytes4(frag->base.des_dst->seg_key.key32[0]);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
frag->wr_desc.sr_desc.wr.rdma.remote_addr = frag->base.des_dst->seg_addr.lval;
|
||||
frag->wr_desc.sr_desc.wr.rdma.rkey = frag->base.des_dst->seg_key.key32[0];
|
||||
}
|
||||
frag->sg_entry.addr = (unsigned long) frag->base.des_src->seg_addr.pval;
|
||||
frag->sg_entry.length = frag->base.des_src->seg_len;
|
||||
|
||||
@ -997,8 +1007,17 @@ int mca_btl_openib_get( mca_btl_base_module_t* btl,
|
||||
} else {
|
||||
|
||||
frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED;
|
||||
frag->wr_desc.sr_desc.wr.rdma.remote_addr = frag->base.des_src->seg_addr.lval;
|
||||
frag->wr_desc.sr_desc.wr.rdma.rkey = frag->base.des_src->seg_key.key32[0];
|
||||
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
if ((endpoint->endpoint_proc->proc_ompi->proc_arch & OMPI_ARCH_ISBIGENDIAN) !=
|
||||
(ompi_proc_local()->proc_arch & OMPI_ARCH_ISBIGENDIAN)) {
|
||||
frag->wr_desc.sr_desc.wr.rdma.remote_addr = opal_swap_bytes8(frag->base.des_src->seg_addr.lval);
|
||||
frag->wr_desc.sr_desc.wr.rdma.rkey = opal_swap_bytes4(frag->base.des_src->seg_key.key32[0]);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
frag->wr_desc.sr_desc.wr.rdma.remote_addr = frag->base.des_src->seg_addr.lval;
|
||||
frag->wr_desc.sr_desc.wr.rdma.rkey = frag->base.des_src->seg_key.key32[0];
|
||||
}
|
||||
frag->sg_entry.addr = (unsigned long) frag->base.des_dst->seg_addr.pval;
|
||||
frag->sg_entry.length = frag->base.des_dst->seg_len;
|
||||
|
||||
|
@ -437,7 +437,7 @@ int btl_openib_register_mca_params(void)
|
||||
mca_btl_openib_module.super.btl_rdma_pipeline_frag_size = 1024 * 1024;
|
||||
mca_btl_openib_module.super.btl_min_rdma_pipeline_size = 256 * 1024;
|
||||
mca_btl_openib_module.super.btl_flags = MCA_BTL_FLAGS_RDMA |
|
||||
MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM;
|
||||
MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA;
|
||||
mca_btl_openib_module.super.btl_bandwidth = 800;
|
||||
mca_btl_openib_module.super.btl_latency = 10;
|
||||
ret = mca_btl_base_param_register(
|
||||
|
@ -231,7 +231,8 @@ int mca_btl_tcp_component_open(void)
|
||||
mca_btl_tcp_module.super.btl_flags = MCA_BTL_FLAGS_PUT |
|
||||
MCA_BTL_FLAGS_SEND_INPLACE |
|
||||
MCA_BTL_FLAGS_NEED_CSUM |
|
||||
MCA_BTL_FLAGS_NEED_ACK;
|
||||
MCA_BTL_FLAGS_NEED_ACK |
|
||||
MCA_BTL_FLAGS_HETEROGENEOUS_RDMA;
|
||||
mca_btl_tcp_module.super.btl_bandwidth = 100;
|
||||
mca_btl_tcp_module.super.btl_latency = 100;
|
||||
mca_btl_base_param_register(&mca_btl_tcp_component.super.btl_version,
|
||||
|
@ -240,7 +240,7 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
|
||||
|
||||
/* read header */
|
||||
if(frag->iov_cnt == 0) {
|
||||
if (btl_endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_NTOH(frag->hdr);
|
||||
if (btl_endpoint->endpoint_nbo && frag->iov_idx == 1) MCA_BTL_TCP_HDR_NTOH(frag->hdr);
|
||||
switch(frag->hdr.type) {
|
||||
case MCA_BTL_TCP_HDR_TYPE_SEND:
|
||||
if(frag->iov_idx == 1 && frag->hdr.size) {
|
||||
|
@ -134,6 +134,19 @@ struct mca_pml_ob1_rget_hdr_t {
|
||||
};
|
||||
typedef struct mca_pml_ob1_rget_hdr_t mca_pml_ob1_rget_hdr_t;
|
||||
|
||||
/**
 * Convert an RGET header from network to host byte order, in place.
 * The embedded rendezvous header (hdr_rndv) is converted with
 * MCA_PML_OB1_RNDV_HDR_NTOH and hdr_seg_cnt with ntohl().
 *
 * @param h  RGET header lvalue (e.g. hdr->hdr_rget); evaluated more than once.
 */
#define MCA_PML_OB1_RGET_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_RNDV_HDR_NTOH((h).hdr_rndv); \
|
||||
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
|
||||
} while (0)
|
||||
|
||||
/**
 * Convert an RGET header from host to network byte order, in place.
 * The embedded rendezvous header (hdr_rndv) is converted with
 * MCA_PML_OB1_RNDV_HDR_HTON and hdr_seg_cnt with htonl().
 *
 * @param h  RGET header lvalue (e.g. hdr->hdr_rget); evaluated more than once.
 */
#define MCA_PML_OB1_RGET_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_RNDV_HDR_HTON((h).hdr_rndv); \
|
||||
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
|
||||
} while (0)
|
||||
|
||||
|
||||
/**
|
||||
* Header for subsequent fragments.
|
||||
*/
|
||||
@ -209,6 +222,20 @@ struct mca_pml_ob1_rdma_hdr_t {
|
||||
};
|
||||
typedef struct mca_pml_ob1_rdma_hdr_t mca_pml_ob1_rdma_hdr_t;
|
||||
|
||||
/**
 * Convert an RDMA header from network to host byte order, in place.
 * The common header is converted with MCA_PML_OB1_COMMON_HDR_NTOH,
 * hdr_seg_cnt with ntohl() and hdr_rdma_offset with ntoh64().
 *
 * The argument is parenthesised at every use so that expression
 * arguments such as *hdr expand correctly.
 *
 * @param h  RDMA header lvalue (e.g. hdr->hdr_rdma); evaluated more than once.
 */
#define MCA_PML_OB1_RDMA_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
|
||||
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
|
||||
(h).hdr_rdma_offset = ntoh64((h).hdr_rdma_offset); \
|
||||
} while (0)
|
||||
|
||||
/**
 * Convert an RDMA header from host to network byte order, in place.
 * The common header is converted with MCA_PML_OB1_COMMON_HDR_HTON,
 * hdr_seg_cnt with htonl() and hdr_rdma_offset with hton64().
 *
 * @param h  RDMA header lvalue (e.g. *hdr); evaluated more than once.
 */
#define MCA_PML_OB1_RDMA_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
|
||||
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
|
||||
(h).hdr_rdma_offset = hton64((h).hdr_rdma_offset); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
* Header used to complete an RDMA operation.
|
||||
*/
|
||||
|
@ -107,10 +107,9 @@ void mca_pml_ob1_recv_frag_callback( mca_btl_base_module_t* btl,
|
||||
case MCA_PML_OB1_HDR_TYPE_RGET:
|
||||
{
|
||||
#if !defined(WORDS_BIGENDIAN) && OMPI_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
/* RDMA is currently disabled by bml if arch doesn't
|
||||
match, so this shouldn't be needed. here to make sure
|
||||
we remember if we ever change the bml. */
|
||||
assert(0 == (hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NBO));
|
||||
if (hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NBO) {
|
||||
MCA_PML_OB1_RGET_HDR_NTOH(hdr->hdr_rget);
|
||||
}
|
||||
#endif
|
||||
mca_pml_ob1_recv_frag_match(btl, &hdr->hdr_match, segments,des->des_dst_cnt);
|
||||
break;
|
||||
@ -161,10 +160,9 @@ void mca_pml_ob1_recv_frag_callback( mca_btl_base_module_t* btl,
|
||||
{
|
||||
mca_pml_ob1_send_request_t* sendreq;
|
||||
#if !defined(WORDS_BIGENDIAN) && OMPI_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
/* RDMA is currently disabled by bml if arch doesn't
|
||||
match, so this shouldn't be needed. here to make sure
|
||||
we remember if we ever change the bml. */
|
||||
assert(0 == (hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NBO));
|
||||
if (hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NBO) {
|
||||
MCA_PML_OB1_RDMA_HDR_NTOH(hdr->hdr_rdma);
|
||||
}
|
||||
#endif
|
||||
sendreq = (mca_pml_ob1_send_request_t*)hdr->hdr_rdma.hdr_req.pval;
|
||||
mca_pml_ob1_send_request_put(sendreq,btl,&hdr->hdr_rdma);
|
||||
|
@ -425,7 +425,15 @@ static void mca_pml_ob1_recv_request_rget(
|
||||
/* allocate/initialize a fragment */
|
||||
for(i = 0; i < hdr->hdr_seg_cnt; i++) {
|
||||
frag->rdma_segs[i] = hdr->hdr_segs[i];
|
||||
size += frag->rdma_segs[i].seg_len;
|
||||
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
if ((recvreq->req_recv.req_base.req_proc->proc_arch & OMPI_ARCH_ISBIGENDIAN) !=
|
||||
(ompi_proc_local()->proc_arch & OMPI_ARCH_ISBIGENDIAN)) {
|
||||
size += opal_swap_bytes4(hdr->hdr_segs[i].seg_len);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
size += hdr->hdr_segs[i].seg_len;
|
||||
}
|
||||
}
|
||||
frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
|
||||
if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) {
|
||||
@ -719,11 +727,10 @@ int mca_pml_ob1_recv_request_schedule_exclusive(
|
||||
/* if we are little endian and the remote side is big endian,
|
||||
we're responsible for making sure the data is in network byte
|
||||
order */
|
||||
/* RDMA is currently disabled by bml if arch doesn't
|
||||
match, so this shouldn't be needed. here to make sure
|
||||
we remember if we ever change the bml. */
|
||||
assert(0 == (recvreq->req_recv.req_base.req_proc->proc_arch &
|
||||
OMPI_ARCH_ISBIGENDIAN));
|
||||
if (recvreq->req_recv.req_base.req_proc->proc_arch & OMPI_ARCH_ISBIGENDIAN) {
|
||||
hdr->hdr_common.hdr_flags |= MCA_PML_OB1_HDR_FLAGS_NBO;
|
||||
MCA_PML_OB1_RDMA_HDR_HTON(*hdr);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
@ -722,11 +722,10 @@ int mca_pml_ob1_send_request_start_rdma(
|
||||
/* if we are little endian and the remote side is big endian,
|
||||
we're responsible for making sure the data is in network byte
|
||||
order */
|
||||
/* RDMA is currently disabled by bml if arch doesn't
|
||||
match, so this shouldn't be needed. here to make sure
|
||||
we remember if we ever change the bml. */
|
||||
assert(0 == (sendreq->req_send.req_base.req_proc->proc_arch &
|
||||
OMPI_ARCH_ISBIGENDIAN));
|
||||
if (sendreq->req_send.req_base.req_proc->proc_arch & OMPI_ARCH_ISBIGENDIAN) {
|
||||
hdr->hdr_common.hdr_flags |= MCA_PML_OB1_HDR_FLAGS_NBO;
|
||||
MCA_PML_OB1_RGET_HDR_HTON(hdr->hdr_rget);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@ -1226,7 +1225,16 @@ void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq,
|
||||
frag->rdma_segs[i].seg_addr.lval = hdr->hdr_segs[i].seg_addr.lval;
|
||||
frag->rdma_segs[i].seg_len = hdr->hdr_segs[i].seg_len;
|
||||
frag->rdma_segs[i].seg_key.key64 = hdr->hdr_segs[i].seg_key.key64;
|
||||
size += frag->rdma_segs[i].seg_len;
|
||||
|
||||
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
if ((sendreq->req_send.req_base.req_proc->proc_arch & OMPI_ARCH_ISBIGENDIAN) !=
|
||||
(ompi_proc_local()->proc_arch & OMPI_ARCH_ISBIGENDIAN)) {
|
||||
size += opal_swap_bytes4(frag->rdma_segs[i].seg_len);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
size == frag->rdma_segs[i].seg_len;
|
||||
}
|
||||
}
|
||||
|
||||
frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user