Added support for openib RDMA READ. Note that performance is currently an
issue, so PUT is the default. We are determining whether this is an openib issue or a btl issue, as we have seen performance increases on mvapi. This commit was SVN r6928.
Этот коммит содержится в:
родитель
166ecc9544
Коммит
afdfa70f73
@ -32,7 +32,6 @@
|
||||
#include "mca/mpool/openib/mpool_openib.h"
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
extern int errno;
|
||||
|
||||
mca_btl_openib_module_t mca_btl_openib_module = {
|
||||
{
|
||||
@ -57,7 +56,7 @@ mca_btl_openib_module_t mca_btl_openib_module = {
|
||||
mca_btl_openib_prepare_dst,
|
||||
mca_btl_openib_send,
|
||||
mca_btl_openib_put,
|
||||
NULL /* get */
|
||||
mca_btl_openib_get /* get */
|
||||
}
|
||||
};
|
||||
|
||||
@ -775,7 +774,7 @@ int mca_btl_openib_send(
|
||||
}
|
||||
|
||||
/*
|
||||
* RDMA local buffer to remote buffer address.
|
||||
* RDMA WRITE local buffer to remote buffer address.
|
||||
*/
|
||||
|
||||
int mca_btl_openib_put( mca_btl_base_module_t* btl,
|
||||
@ -812,6 +811,45 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl,
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* RDMA READ remote buffer to local buffer address.
|
||||
*/
|
||||
|
||||
int mca_btl_openib_get( mca_btl_base_module_t* btl,
|
||||
mca_btl_base_endpoint_t* endpoint,
|
||||
mca_btl_base_descriptor_t* descriptor)
|
||||
{
|
||||
struct ibv_send_wr* bad_wr;
|
||||
mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*) descriptor;
|
||||
frag->endpoint = endpoint;
|
||||
frag->wr_desc.sr_desc.opcode = IBV_WR_RDMA_READ;
|
||||
frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED;
|
||||
frag->wr_desc.sr_desc.wr.rdma.remote_addr = (uintptr_t) frag->base.des_src->seg_addr.pval;
|
||||
frag->wr_desc.sr_desc.wr.rdma.rkey = frag->base.des_src->seg_key.key32[0];
|
||||
frag->sg_entry.addr = (uintptr_t) frag->base.des_dst->seg_addr.pval;
|
||||
frag->sg_entry.length = frag->base.des_dst->seg_len;
|
||||
|
||||
BTL_VERBOSE(("frag->wr_desc.sr_desc.wr.rdma.remote_addr = %llu .rkey = %lu frag->sg_entry.addr = %llu .length = %lu"
|
||||
, frag->wr_desc.sr_desc.wr.rdma.remote_addr
|
||||
, frag->wr_desc.sr_desc.wr.rdma.rkey
|
||||
, frag->sg_entry.addr
|
||||
, frag->sg_entry.length));
|
||||
|
||||
if(ibv_post_send(endpoint->lcl_qp_low,
|
||||
&frag->wr_desc.sr_desc,
|
||||
&bad_wr)){
|
||||
BTL_ERROR(("error posting send request errno says %s", strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(endpoint, 1);
|
||||
MCA_BTL_OPENIB_ENDPOINT_POST_RR_LOW(endpoint, 1);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize the btl module by allocating a protection domain
|
||||
* and creating both the high and low priority completion queues
|
||||
|
@ -306,7 +306,24 @@ extern int mca_btl_openib_put(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* btl_peer,
|
||||
struct mca_btl_base_descriptor_t* decriptor
|
||||
);
|
||||
);
|
||||
|
||||
/**
 * PML->BTL Initiate a get (RDMA read) described by the given descriptor.
 *
 * @param btl (IN)        BTL instance
 * @param btl_peer (IN)   BTL peer addressing
 * @param decriptor (IN)  Descriptor of the transfer: its source segment
 *                        gives the remote address and key to read from,
 *                        its destination segment the local address and
 *                        length to read into.
 *
 * @return OMPI_SUCCESS if the read request was posted, OMPI_ERROR if
 *         posting it failed.
 */
|
||||
extern int mca_btl_openib_get(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* btl_peer,
|
||||
struct mca_btl_base_descriptor_t* decriptor
|
||||
);
|
||||
|
||||
|
||||
/**
|
||||
* Allocate a descriptor.
|
||||
|
@ -41,7 +41,6 @@
|
||||
#include <errno.h>
|
||||
#include <string.h> /* for strerror()*/
|
||||
|
||||
extern int errno;
|
||||
mca_btl_openib_component_t mca_btl_openib_component = {
|
||||
{
|
||||
/* First, the mca_base_component_t struct containing meta information
|
||||
@ -507,71 +506,67 @@ int mca_btl_openib_component_progress()
|
||||
* we will check the high priority and process them until there are none left.
|
||||
* note that low priority messages are only processed one per progress call.
|
||||
*/
|
||||
do{
|
||||
ne=ibv_poll_cq(openib_btl->ib_cq_high, 1, &wc );
|
||||
if(ne < 0 ){
|
||||
BTL_ERROR(("error polling CQ with %d errno says %s\n", ne, strerror(errno)));
|
||||
ne=ibv_poll_cq(openib_btl->ib_cq_high, 1, &wc );
|
||||
if(ne < 0 ){
|
||||
BTL_ERROR(("error polling CQ with %d errno says %s\n", ne, strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
else if(wc.status != IBV_WC_SUCCESS) {
|
||||
BTL_ERROR(("error polling CQ with status %d for wr_id %llu\n",
|
||||
wc.status, wc.wr_id));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
else if(1 == ne) {
|
||||
BTL_VERBOSE(("completion queue event says opcode is %d\n", wc.opcode));
|
||||
|
||||
/* Handle work completions */
|
||||
switch(wc.opcode) {
|
||||
case IBV_WC_RECV_RDMA_WITH_IMM:
|
||||
BTL_ERROR(("Got an RDMA with Immediate data Not supported!"));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
else if(wc.status != IBV_WC_SUCCESS) {
|
||||
BTL_ERROR(("error polling CQ with status %d for wr_id %llu\n",
|
||||
wc.status, wc.wr_id));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
else if(1 == ne) {
|
||||
BTL_VERBOSE(("completion queue event says opcode is %d\n", wc.opcode));
|
||||
|
||||
case IBV_WC_RECV:
|
||||
/* Process a RECV */
|
||||
|
||||
BTL_VERBOSE(("Got an recv on the completion queue"));
|
||||
frag = (mca_btl_openib_frag_t*) wc.wr_id;
|
||||
endpoint = (mca_btl_openib_endpoint_t*) frag->endpoint;
|
||||
frag->rc=OMPI_SUCCESS;
|
||||
frag->segment.seg_len =
|
||||
wc.byte_len-
|
||||
((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr);
|
||||
|
||||
|
||||
|
||||
/* advance the segment address past the header and subtract from the length..*/
|
||||
openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super,
|
||||
frag->hdr->tag,
|
||||
&frag->base,
|
||||
openib_btl->ib_reg[frag->hdr->tag].cbdata);
|
||||
|
||||
OPAL_THREAD_ADD32(&endpoint->rr_posted_high, -1);
|
||||
MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(((mca_btl_openib_frag_t*)wc.wr_id)->endpoint, 0);
|
||||
OMPI_FREE_LIST_RETURN(&(openib_btl->recv_free_eager), (opal_list_item_t*) frag);
|
||||
count++;
|
||||
break;
|
||||
|
||||
/* Handle work completions */
|
||||
switch(wc.opcode) {
|
||||
case IBV_WC_RECV_RDMA_WITH_IMM:
|
||||
BTL_ERROR(("Got an RDMA with Immediate data Not supported!"));
|
||||
return OMPI_ERROR;
|
||||
|
||||
case IBV_WC_RECV:
|
||||
/* Process a RECV */
|
||||
case IBV_WC_RDMA_READ:
|
||||
case IBV_WC_RDMA_WRITE:
|
||||
case IBV_WC_SEND :
|
||||
|
||||
BTL_VERBOSE(("Got an recv on the completion queue"));
|
||||
frag = (mca_btl_openib_frag_t*) wc.wr_id;
|
||||
endpoint = (mca_btl_openib_endpoint_t*) frag->endpoint;
|
||||
frag->rc=OMPI_SUCCESS;
|
||||
frag->segment.seg_len =
|
||||
wc.byte_len-
|
||||
((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr);
|
||||
|
||||
|
||||
/* Process a completed send or rdma write*/
|
||||
frag = (mca_btl_openib_frag_t*) wc.wr_id;
|
||||
frag->rc = OMPI_SUCCESS;
|
||||
frag->base.des_cbfunc(&openib_btl->super, frag->endpoint, &frag->base, frag->rc);
|
||||
count++;
|
||||
break;
|
||||
|
||||
/* advance the segment address past the header and subtract from the length..*/
|
||||
openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super,
|
||||
frag->hdr->tag,
|
||||
&frag->base,
|
||||
openib_btl->ib_reg[frag->hdr->tag].cbdata);
|
||||
|
||||
OPAL_THREAD_ADD32(&endpoint->rr_posted_high, -1);
|
||||
MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(((mca_btl_openib_frag_t*)wc.wr_id)->endpoint, 0);
|
||||
OMPI_FREE_LIST_RETURN(&(openib_btl->recv_free_eager), (opal_list_item_t*) frag);
|
||||
count++;
|
||||
break;
|
||||
|
||||
case IBV_WC_RDMA_WRITE:
|
||||
case IBV_WC_SEND :
|
||||
|
||||
/* Process a completed send or rdma write*/
|
||||
frag = (mca_btl_openib_frag_t*) wc.wr_id;
|
||||
frag->rc = OMPI_SUCCESS;
|
||||
frag->base.des_cbfunc(&openib_btl->super, frag->endpoint, &frag->base, frag->rc);
|
||||
count++;
|
||||
break;
|
||||
|
||||
|
||||
break;
|
||||
|
||||
default:
|
||||
BTL_ERROR(("Unhandled work completion opcode is %d", wc.opcode));
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
BTL_ERROR(("Unhandled work completion opcode is %d", wc.opcode));
|
||||
break;
|
||||
}
|
||||
}
|
||||
while(ne > 0);
|
||||
|
||||
ne=ibv_poll_cq(openib_btl->ib_cq_low, 1, &wc );
|
||||
if(ne < 0){
|
||||
@ -614,8 +609,10 @@ int mca_btl_openib_component_progress()
|
||||
count++;
|
||||
break;
|
||||
|
||||
case IBV_WC_RDMA_READ:
|
||||
case IBV_WC_RDMA_WRITE:
|
||||
case IBV_WC_SEND :
|
||||
|
||||
/* Process a completed send */
|
||||
frag = (mca_btl_openib_frag_t*) wc.wr_id;
|
||||
frag->rc = OMPI_SUCCESS;
|
||||
|
@ -32,7 +32,6 @@
|
||||
#include "class/ompi_free_list.h"
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
extern int errno;
|
||||
|
||||
static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint);
|
||||
static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint);
|
||||
@ -728,7 +727,7 @@ int mca_btl_openib_endpoint_create_qp(
|
||||
qp_attr->qp_state = IBV_QPS_INIT;
|
||||
qp_attr->pkey_index = mca_btl_openib_component.ib_pkey_ix;
|
||||
qp_attr->port_num = openib_btl->port_num;
|
||||
qp_attr->qp_access_flags = IBV_ACCESS_REMOTE_WRITE;
|
||||
qp_attr->qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
|
||||
|
||||
if(ibv_modify_qp((*qp), qp_attr,
|
||||
IBV_QP_STATE |
|
||||
|
@ -27,7 +27,6 @@
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include "mca/btl/base/btl_base_error.h"
|
||||
extern int errno;
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
|
@ -87,16 +87,6 @@ OBJ_CLASS_INSTANCE(
|
||||
|
||||
|
||||
|
||||
/**
 * Register a string parameter under the "mpool" / "openib" names and
 * return the value obtained by looking it up again.
 *
 * @param param_name     Name of the parameter to register
 * @param default_value  Default handed to the registration call
 *
 * @return The string stored by mca_base_param_lookup_string(), or NULL if
 *         the lookup did not store anything.
 */
static char* mca_mpool_openib_param_register_string(
    const char* param_name,
    const char* default_value)
{
    /* start from NULL so a lookup that stores nothing cannot leak an
     * indeterminate pointer to the caller */
    char *param_value = NULL;
    int id = mca_base_param_register_string("mpool","openib",param_name,NULL,default_value);

    mca_base_param_lookup_string(id, &param_value);
    return param_value;
}
|
||||
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
|
@ -46,10 +46,7 @@ void* mca_mpool_openib_alloc(
|
||||
size_t align,
|
||||
mca_mpool_base_registration_t** registration)
|
||||
{
|
||||
mca_mpool_openib_module_t* mpool_openib = (mca_mpool_openib_module_t*)mpool;
|
||||
/* void* addr_malloc = (void*)malloc((*size) + mca_mpool_openib_component.page_size); */
|
||||
/* void* addr = (void*) ALIGN_ADDR(addr_malloc, mca_mpool_openib_component.page_size_log); */
|
||||
|
||||
|
||||
void* addr_malloc = (void*)memalign(mca_mpool_openib_component.page_size, size);
|
||||
void* addr = addr_malloc;
|
||||
|
||||
@ -81,7 +78,7 @@ int mca_mpool_openib_register(mca_mpool_base_module_t* mpool,
|
||||
mpool_module->resources.ib_pd,
|
||||
addr,
|
||||
size,
|
||||
IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE
|
||||
IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ
|
||||
/* IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE */
|
||||
);
|
||||
|
||||
@ -104,7 +101,6 @@ int mca_mpool_openib_register(mca_mpool_base_module_t* mpool,
|
||||
int mca_mpool_openib_deregister(mca_mpool_base_module_t* mpool, void *addr, size_t size,
|
||||
mca_mpool_base_registration_t* registration){
|
||||
|
||||
mca_mpool_openib_module_t * mpool_openib = (mca_mpool_openib_module_t*) mpool;
|
||||
mca_mpool_openib_registration_t * openib_reg;
|
||||
openib_reg = (mca_mpool_openib_registration_t*) registration;
|
||||
if(ibv_dereg_mr(openib_reg->mr)){
|
||||
@ -127,7 +123,7 @@ void* mca_mpool_openib_realloc(
|
||||
mca_mpool_base_registration_t* old_reg = *registration;
|
||||
void* new_mem = mpool->mpool_alloc(mpool, size, 0, registration);
|
||||
memcpy(new_mem, addr, old_reg->bound - old_reg->base);
|
||||
mpool->mpool_free(mpool, addr, &old_reg);
|
||||
mpool->mpool_free(mpool, addr, old_reg);
|
||||
return new_mem;
|
||||
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user