1
1

Heterogeneous fixes to the OpenIB BTL. This includes work by Nysal, Brian, and me.

This commit was SVN r13106.
Этот коммит содержится в:
Galen Shipman 2007-01-12 23:14:45 +00:00
родитель df099a4731
Коммит 2097d174f6
7 изменённых файлов: 193 добавлений и 75 удалений

Просмотреть файл

@ -152,6 +152,17 @@ struct mca_btl_openib_port_info_t {
};
typedef struct mca_btl_openib_port_info_t mca_btl_openib_port_info_t;
/*
 * Convert a port-info record in place for the receiving side: the mtu
 * field is passed through ntohl() and the subnet field through ntoh64().
 * `hdr` must be a modifiable lvalue of the port-info struct; it is
 * expanded several times, so pass an expression without side effects.
 * ntoh64 is not defined in this hunk and must be provided elsewhere.
 */
#define MCA_BTL_OPENIB_PORT_INFO_NTOH(hdr) \
do { \
(hdr).mtu = ntohl((hdr).mtu); \
(hdr).subnet = ntoh64((hdr).subnet); \
} while (0)
/*
 * Convert a port-info record in place for the sending side: the mtu
 * field is passed through htonl() and the subnet field through hton64().
 * `hdr` must be a modifiable lvalue of the port-info struct; it is
 * expanded several times, so pass an expression without side effects.
 * hton64 is not defined in this hunk and must be provided elsewhere.
 */
#define MCA_BTL_OPENIB_PORT_INFO_HTON(hdr) \
do { \
(hdr).mtu = htonl((hdr).mtu); \
(hdr).subnet = hton64((hdr).subnet); \
} while (0)
struct mca_btl_openib_hca_t {
struct ibv_device *ib_dev; /* the ib device */
#if OMPI_ENABLE_PROGRESS_THREADS == 1

Просмотреть файл

@ -181,6 +181,9 @@ static int btl_openib_modex_send(void)
for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
mca_btl_openib_module_t *btl = &mca_btl_openib_component.openib_btls[i];
ports[i] = btl->port_info;
#if !defined(WORDS_BIGENDIAN) && OMPI_ENABLE_HETEROGENEOUS_SUPPORT
MCA_BTL_OPENIB_PORT_INFO_HTON(ports[i]);
#endif
}
}
rc = mca_pml_base_modex_send (&mca_btl_openib_component.super.btl_version, ports, size);
@ -206,6 +209,7 @@ static void btl_openib_control(struct mca_btl_base_module_t* btl,
mca_btl_openib_eager_rdma_header_t *rdma_hdr;
mca_btl_openib_rdma_credits_header_t *credits_hdr;
if(frag->size == mca_btl_openib_component.eager_limit) {
/* if not sent via rdma */
if(!MCA_BTL_OPENIB_RDMA_FRAG(frag) &&
@ -215,27 +219,53 @@ static void btl_openib_control(struct mca_btl_base_module_t* btl,
} else {
OPAL_THREAD_ADD32(&endpoint->rd_credits[BTL_OPENIB_LP_QP], -1);
}
switch (ctl_hdr->type) {
case MCA_BTL_OPENIB_CONTROL_CREDITS:
credits_hdr = (mca_btl_openib_rdma_credits_header_t*)ctl_hdr;
if(endpoint->nbo) {
BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH((*credits_hdr));
}
if(credits_hdr->rdma_credits)
OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens,
credits_hdr->rdma_credits);
break;
break;
case MCA_BTL_OPENIB_CONTROL_RDMA:
rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)ctl_hdr;
BTL_VERBOSE(("prior to NTOH received rkey %lu, rdma_start.lval %llu, pval %p, ival %u, frag_t_len %llu\n",
rdma_hdr->rkey,
(unsigned long) rdma_hdr->rdma_start.lval,
rdma_hdr->rdma_start.pval,
rdma_hdr->rdma_start.ival,
(unsigned long) rdma_hdr->frag_t_len
));
if(endpoint->nbo) {
BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_NTOH((*rdma_hdr));
BTL_VERBOSE(("received rkey %lu, rdma_start.lval %llu, pval %p, ival %u, frag_t_len %llu\n",
rdma_hdr->rkey,
(unsigned long) rdma_hdr->rdma_start.lval,
rdma_hdr->rdma_start.pval,
rdma_hdr->rdma_start.ival,
(unsigned long) rdma_hdr->frag_t_len
));
}
if (endpoint->eager_rdma_remote.base.pval) {
BTL_ERROR(("Got RDMA connect twise!"));
BTL_ERROR(("Got RDMA connect twice!"));
return;
}
endpoint->eager_rdma_remote.rkey = rdma_hdr->rkey;
endpoint->eager_rdma_remote.base.pval = rdma_hdr->rdma_start.pval;
endpoint->eager_rdma_remote.base.lval = rdma_hdr->rdma_start.lval;
endpoint->eager_rdma_remote.frag_t_len = rdma_hdr->frag_t_len;
endpoint->eager_rdma_remote.tokens =
mca_btl_openib_component.eager_rdma_num - 1;
break;
default:
BTL_ERROR(("Unknown message type received by BTL"));
BTL_ERROR(("Unknown message type received by BTL"));
break;
}
}
@ -797,7 +827,9 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
size_t byte_len, const int prio)
{
ompi_free_list_t *free_list;
if(endpoint->nbo) {
BTL_OPENIB_HEADER_NTOH((*(frag->hdr)));
}
if(BTL_OPENIB_HP_QP == prio)
free_list = &openib_btl->recv_free_eager;
else
@ -841,6 +873,7 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
if (!endpoint->eager_rdma_local.base.pval &&
mca_btl_openib_component.use_eager_rdma &&
endpoint->use_eager_rdma &&
BTL_OPENIB_HP_QP == prio &&
openib_btl->eager_rdma_buffers_count <
mca_btl_openib_component.max_eager_rdma &&
@ -1120,12 +1153,15 @@ static int btl_openib_component_progress(void)
if(MCA_BTL_OPENIB_RDMA_FRAG_LOCAL(frag)) {
uint32_t size;
opal_atomic_rmb();
if(endpoint->nbo) {
BTL_OPENIB_FOOTER_NTOH((*frag->ftr));
}
size = MCA_BTL_OPENIB_RDMA_FRAG_GET_SIZE(frag->ftr);
#if OMPI_ENABLE_DEBUG
if (frag->ftr->seq != endpoint->eager_rdma_local.seq)
BTL_ERROR(("Eager RDMA wrong SEQ: received %d expected %d",
frag->ftr->seq,
endpoint->eager_rdma_local.seq));
frag->ftr->seq,
endpoint->eager_rdma_local.seq));
endpoint->eager_rdma_local.seq++;
#endif
MCA_BTL_OPENIB_RDMA_NEXT_INDEX(endpoint->eager_rdma_local.head);
@ -1176,7 +1212,7 @@ static int btl_openib_module_progress(mca_btl_openib_module_t* openib_btl)
if(ne < 0 || wc.status != IBV_WC_SUCCESS)
goto error;
frag = (mca_btl_openib_frag_t*) (unsigned long) wc.wr_id;
endpoint = frag->endpoint;
/* Handle work completions */

Просмотреть файл

@ -38,6 +38,7 @@ struct mca_btl_openib_eager_rdma_remote_t {
#if OMPI_ENABLE_DEBUG
uint32_t seq;
#endif
uint64_t frag_t_len; /**< remote's sizeof(mca_btl_openib_frag_t) */
};
typedef struct mca_btl_openib_eager_rdma_remote_t mca_btl_openib_eager_rdma_remote_t;

Просмотреть файл

@ -119,9 +119,9 @@ static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* ope
{
int do_rdma = 0, prio;
struct ibv_send_wr* bad_wr;
frag->wr_desc.sr_desc.opcode = IBV_WR_SEND;
frag->sg_entry.addr = (unsigned long) frag->hdr;
frag->sg_entry.addr = (unsigned long) frag->hdr;
prio = (frag->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY) ?
BTL_OPENIB_HP_QP : BTL_OPENIB_LP_QP;
@ -163,12 +163,17 @@ static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* ope
frag->segment.seg_len))->seq =
endpoint->eager_rdma_remote.seq++;
#endif
if(endpoint->nbo) {
BTL_OPENIB_HEADER_HTON((*(frag->hdr)));
BTL_OPENIB_FOOTER_HTON((*ftr));
}
frag->wr_desc.sr_desc.wr.rdma.rkey = endpoint->eager_rdma_remote.rkey;
frag->wr_desc.sr_desc.wr.rdma.remote_addr =
endpoint->eager_rdma_remote.base.lval +
endpoint->eager_rdma_remote.head *
openib_btl->eager_rdma_frag_size +
sizeof(mca_btl_openib_frag_t) +
endpoint->eager_rdma_remote.frag_t_len +
sizeof(mca_btl_openib_header_t) +
mca_btl_openib_component.eager_limit +
sizeof(mca_btl_openib_footer_t);
@ -179,10 +184,16 @@ static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* ope
frag->wr_desc.sr_desc.opcode = IBV_WR_SEND_WITH_IMM;
frag->wr_desc.sr_desc.imm_data = endpoint->rem_info.rem_index;
}
if(endpoint->nbo) {
BTL_OPENIB_HEADER_HTON((*(frag->hdr)));
}
}
if(ibv_post_send(endpoint->lcl_qp[prio], &frag->wr_desc.sr_desc,
&bad_wr)) {
if(endpoint->nbo) {
BTL_OPENIB_HEADER_NTOH((*(frag->hdr)));
}
if(BTL_OPENIB_IS_RDMA_CREDITS(frag->hdr->credits)) {
OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
BTL_OPENIB_CREDITS(frag->hdr->credits));
@ -280,6 +291,8 @@ static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
endpoint->rem_info.rem_psn_lp = 0;
endpoint->rem_info.rem_subnet_id = 0;
endpoint->rem_info.rem_mtu = 0;
endpoint->nbo = false;
endpoint->use_eager_rdma = true;
}
/*
@ -1061,7 +1074,6 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
mca_btl_openib_frag_t* frag;
struct ibv_send_wr* bad_wr;
mca_btl_openib_rdma_credits_header_t *credits_hdr;
frag = endpoint->credit_frag[prio];
credits_hdr =
(mca_btl_openib_rdma_credits_header_t*)frag->segment.seg_addr.pval;
@ -1075,6 +1087,7 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
if(endpoint->rd_credits[prio] > 0) {
frag->hdr->credits = endpoint->rd_credits[prio];
OPAL_THREAD_ADD32(&endpoint->rd_credits[prio], -frag->hdr->credits);
} else {
frag->hdr->credits = 0;
}
@ -1097,18 +1110,27 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
frag->sg_entry.length = sizeof(mca_btl_openib_header_t) +
sizeof(mca_btl_openib_rdma_credits_header_t);
frag->sg_entry.addr = (unsigned long) frag->hdr;
if(frag->sg_entry.length <= openib_btl->ib_inline_max) {
frag->wr_desc.sr_desc.send_flags = IBV_SEND_INLINE | IBV_SEND_SIGNALED;
} else {
frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED;
}
/* just do it all, regardless of eager rdma or not.. */
if(endpoint->nbo) {
BTL_OPENIB_HEADER_HTON((*frag->hdr));
BTL_OPENIB_RDMA_CREDITS_HEADER_HTON((*credits_hdr));
}
if(ibv_post_send(endpoint->lcl_qp[prio], &frag->wr_desc.sr_desc, &bad_wr)) {
if(endpoint->nbo) {
BTL_OPENIB_HEADER_NTOH((*frag->hdr));
BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH((*credits_hdr));
}
OPAL_THREAD_ADD32(&endpoint->sd_credits[prio], -1);
OPAL_THREAD_ADD32(&endpoint->rd_credits[prio], frag->hdr->credits);
OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
credits_hdr->rdma_credits);
OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits, credits_hdr->rdma_credits);
BTL_ERROR(("error posting send request errno %d says %s",
strerror(errno)));
}
@ -1146,8 +1168,28 @@ static int mca_btl_openib_endpoint_send_eager_rdma(
rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)frag->segment.seg_addr.pval;
rdma_hdr->control.type = MCA_BTL_OPENIB_CONTROL_RDMA;
rdma_hdr->rkey = endpoint->eager_rdma_local.reg->mr->rkey;
rdma_hdr->frag_t_len = sizeof(mca_btl_openib_frag_t);
rdma_hdr->rdma_start.lval = ompi_ptr_ptol(endpoint->eager_rdma_local.base.pval);
BTL_VERBOSE(("sending rkey %lu, rdma_start.lval %llu, pval %p, ival %u type %d and sizeof(rdma_hdr) %d\n",
rdma_hdr->rkey,
rdma_hdr->rdma_start.lval,
rdma_hdr->rdma_start.pval,
rdma_hdr->rdma_start.ival,
rdma_hdr->control.type,
sizeof(mca_btl_openib_eager_rdma_header_t)
));
frag->segment.seg_len = sizeof(mca_btl_openib_eager_rdma_header_t);
if(endpoint->nbo) {
BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON((*rdma_hdr));
BTL_VERBOSE(("after HTON: sending rkey %lu, rdma_start.lval %llu, pval %p, ival %u\n",
rdma_hdr->rkey,
rdma_hdr->rdma_start.lval,
rdma_hdr->rdma_start.pval,
rdma_hdr->rdma_start.ival
));
}
if (mca_btl_openib_endpoint_send(endpoint, frag) != OMPI_SUCCESS) {
MCA_BTL_IB_FRAG_RETURN(openib_btl, frag);
BTL_ERROR(("Error sending RDMA buffer", strerror(errno)));
@ -1194,7 +1236,7 @@ void mca_btl_openib_endpoint_connect_eager_rdma(
((mca_btl_openib_frag_t*)item)->endpoint = endpoint;
((mca_btl_openib_frag_t*)item)->type = MCA_BTL_OPENIB_FRAG_EAGER_RDMA;
}
/* set local rdma pointer to real value */
opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, (void*)1,
buf);

Просмотреть файл

@ -155,6 +155,8 @@ struct mca_btl_base_endpoint_t {
/**< info about local RDMA buffer */
uint32_t index; /**< index of the endpoint in endpoints array */
struct mca_btl_openib_frag_t *credit_frag[2]; /**< frags for sending explicit high priority credits */
bool nbo; /**< does the endpoint require network byte ordering? */
bool use_eager_rdma; /**< use eager rdma for this peer? */
};
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;

Просмотреть файл

@ -64,28 +64,39 @@ struct mca_btl_openib_footer_t {
};
typedef struct mca_btl_openib_footer_t mca_btl_openib_footer_t;
/*
 * Byte-swap helper for the footer's `u.buf` array.
 *
 * When WORDS_BIGENDIAN is defined the macro expands to nothing.
 * Otherwise it exchanges u.buf[0] and u.buf[2] in place, leaving
 * u.buf[1] untouched. Applying it twice restores the original bytes,
 * which is why the same macro can serve both the HTON and NTOH
 * directions.
 *
 * NOTE(review): presumably this reverses a 3-byte size field stored in
 * the footer union -- confirm against the mca_btl_openib_footer_t
 * definition, which is not visible in this hunk.
 *
 * `ftr` must be a modifiable lvalue; the expansion declares a local
 * named `tmp` inside its own do/while scope.
 */
#ifdef WORDS_BIGENDIAN
#define MCA_BTL_OPENIB_FTR_SIZE_REVERSE(ftr)
#else
#define MCA_BTL_OPENIB_FTR_SIZE_REVERSE(ftr) \
do { \
uint8_t tmp = (ftr).u.buf[0]; \
(ftr).u.buf[0]=(ftr).u.buf[2]; \
(ftr).u.buf[2]=tmp; \
} while (0)
#endif
#if OMPI_ENABLE_DEBUG
#define BTL_OPENIB_FOOTER_HTON(h) \
do { \
h.seq = htonl(h.seq); \
h.u.size = htonl(h.u.size); \
} while (0)
#define BTL_OPENIB_FOOTER_NTOH(h) \
do { \
h.seq = ntohs(h.seq); \
h.u.size = ntohl(h.u.size); \
} while (0)
#define BTL_OPENIB_FOOTER_HTON(h) \
do { \
h.seq = htonl(h.seq); \
MCA_BTL_OPENIB_FTR_SIZE_REVERSE(h); \
} while (0)
#define BTL_OPENIB_FOOTER_NTOH(h) \
do { \
h.seq = ntohl(h.seq); \
MCA_BTL_OPENIB_FTR_SIZE_REVERSE(h); \
} while (0)
#else
#define BTL_OPENIB_FOOTER_HTON(h) \
do { \
h.u.size = htonl(h.u.size); \
} while (0)
#define BTL_OPENIB_FOOTER_NTOH(h) \
do { \
h.u.size = ntohl(h.u.size); \
} while (0)
#define BTL_OPENIB_FOOTER_HTON(h) \
do { \
MCA_BTL_OPENIB_FTR_SIZE_REVERSE(h); \
} while (0)
#define BTL_OPENIB_FOOTER_NTOH(h) \
do { \
MCA_BTL_OPENIB_FTR_SIZE_REVERSE(h); \
} while (0)
#endif
@ -93,58 +104,50 @@ do { \
#define MCA_BTL_OPENIB_CONTROL_RDMA 1
struct mca_btl_openib_control_header_t {
uint32_t type;
uint8_t type;
};
typedef struct mca_btl_openib_control_header_t mca_btl_openib_control_header_t;
#define BTL_OPENIB_CONTROL_HEADER_HTON(h) \
do { \
h.type = htonl(h.type); \
} while (0)
#define BTL_OPENIB_CONTROL_HEADER_NTOH(h) \
do { \
h.type = ntohl(h.type); \
} while (0)
struct mca_btl_openib_eager_rdma_header_t {
mca_btl_openib_control_header_t control;
uint32_t rkey;
ompi_ptr_t rdma_start;
mca_btl_openib_control_header_t control;
uint8_t padding[3];
uint32_t rkey;
ompi_ptr_t rdma_start;
uint64_t frag_t_len;
};
typedef struct mca_btl_openib_eager_rdma_header_t mca_btl_openib_eager_rdma_header_t;
#define BTL_OPENIB_EAGER_RDMA_HEADER_HTON(h) \
do { \
BTL_OPENIB_CONTROL_HEADER_HTON(h.control); \
h.rkey = htonl(h.rkey); \
h.rdma_start.lval = hton64(h.rdma_start.lval); \
} while (0)
#define BTL_OPENIB_EAGER_RDMA_HEADER_NTOH(h) \
do { \
BTL_OPENIB_CONTROL_HEADER_NTOH(h.control); \
h.rkey = ntohl(h.rkey); \
h.rdma_start.lval = ntoh64(h.rdma_start.lval); \
} while (0)
/*
 * Prepare an eager-RDMA control header for sending, in place:
 * rkey goes through htonl(); rdma_start.lval and frag_t_len go through
 * hton64(). The control.type member is not touched by this macro.
 *
 * The argument `h` is NOT parenthesized inside the expansion, so callers
 * must pass an already-parenthesized lvalue, e.g. (*rdma_hdr).
 */
#define BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON(h) \
do { \
h.rkey = htonl(h.rkey); \
h.rdma_start.lval = hton64(h.rdma_start.lval); \
h.frag_t_len = hton64(h.frag_t_len); \
} while (0)
/*
 * Convert a received eager-RDMA control header in place:
 * rkey goes through ntohl(); rdma_start.lval and frag_t_len go through
 * ntoh64(). The control.type member is not touched by this macro.
 *
 * The argument `h` is NOT parenthesized inside the expansion, so callers
 * must pass an already-parenthesized lvalue, e.g. (*rdma_hdr).
 */
#define BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_NTOH(h) \
do { \
h.rkey = ntohl(h.rkey); \
h.rdma_start.lval = ntoh64(h.rdma_start.lval); \
h.frag_t_len = ntoh64(h.frag_t_len); \
} while (0)
struct mca_btl_openib_rdma_credits_header_t {
mca_btl_openib_control_header_t control;
uint8_t padding[1];
uint16_t rdma_credits;
};
typedef struct mca_btl_openib_rdma_credits_header_t mca_btl_openib_rdma_credits_header_t;
#define BTL_OPENIB_RDMA_CREDITS_HEADER_HTON(h) \
do { \
BTL_OPENIB_CONTROL_HEADER_HTON(h.control); \
/* BTL_OPENIB_CONTROL_HEADER_HTON(h.control); */ \
h.rdma_credits = htons(h.rdma_credits); \
} while (0)
#define BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(h) \
do { \
BTL_OPENIB_CONTROL_HEADER_NTOH(h.control); \
/* BTL_OPENIB_CONTROL_HEADER_NTOH(h.control); */ \
h.rdma_credits = ntohs(h.rdma_credits); \
} while (0)

Просмотреть файл

@ -20,6 +20,7 @@
#include "opal/class/opal_hash_table.h"
#include "ompi/mca/pml/base/pml_base_module_exchange.h"
#include "ompi/datatype/dt_arch.h"
#include "btl_openib.h"
#include "btl_openib_proc.h"
@ -98,7 +99,7 @@ mca_btl_openib_proc_t* mca_btl_openib_proc_create(ompi_proc_t* ompi_proc)
{
mca_btl_openib_proc_t* module_proc = NULL;
size_t size;
int rc;
int rc,i;
/* Check if we have already created a IB proc
* structure for this ompi process */
@ -145,8 +146,6 @@ mca_btl_openib_proc_t* mca_btl_openib_proc_create(ompi_proc_t* ompi_proc)
return NULL;
}
/* TODO - Endian Ordering fixups for the subnet and such.. just call hton, ntoh
always use NBO */
module_proc->proc_port_count = size/sizeof(mca_btl_openib_port_info_t);
if (0 == module_proc->proc_port_count) {
@ -155,7 +154,12 @@ mca_btl_openib_proc_t* mca_btl_openib_proc_create(ompi_proc_t* ompi_proc)
module_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
malloc(module_proc->proc_port_count * sizeof(mca_btl_base_endpoint_t*));
}
#if !defined(WORDS_BIGENDIAN) && OMPI_ENABLE_HETEROGENEOUS_SUPPORT
for(i=0; i < module_proc->proc_port_count; ++i) {
MCA_BTL_OPENIB_PORT_INFO_NTOH(module_proc->proc_ports[i]);
}
#endif
if(NULL == module_proc->proc_endpoints) {
OBJ_RELEASE(module_proc);
return NULL;
@ -173,6 +177,25 @@ int mca_btl_openib_proc_insert(mca_btl_openib_proc_t* module_proc,
mca_btl_base_endpoint_t* module_endpoint)
{
/* insert into endpoint array */
#ifndef WORDS_BIGENDIAN
/* if we are little endian and our peer is not so lucky, then we
need to put all information sent to him in big endian (aka
Network Byte Order) and expect all information received to
be in NBO. Since big endian machines always send and receive
in NBO, we don't care so much about that case. */
if (module_proc->proc_ompi->proc_arch & OMPI_ARCH_ISBIGENDIAN) {
module_endpoint->nbo = true;
}
#endif
/* only allow eager rdma if the peers agree on the size of a long */
if((module_proc->proc_ompi->proc_arch & OMPI_ARCH_LONGISxx) !=
(ompi_proc_local()->proc_arch & OMPI_ARCH_LONGISxx)) {
module_endpoint->use_eager_rdma = false;
}
module_endpoint->endpoint_proc = module_proc;
module_proc->proc_endpoints[module_proc->proc_endpoint_count++] = module_endpoint;
return OMPI_SUCCESS;