1
1

Added support for openib RDMA READ. Note that performance is currently an
issue, so PUT remains the default. We are determining whether this is an openib
issue or a BTL issue, as we have seen performance increases on mvapi.

This commit was SVN r6928.
Этот коммит содержится в:
Galen Shipman 2005-08-18 17:08:27 +00:00
родитель 166ecc9544
Коммит afdfa70f73
7 изменённых файлов: 120 добавлений и 84 удалений

Просмотреть файл

@ -32,7 +32,6 @@
#include "mca/mpool/openib/mpool_openib.h"
#include <errno.h>
#include <string.h>
extern int errno;
mca_btl_openib_module_t mca_btl_openib_module = {
{
@ -57,7 +56,7 @@ mca_btl_openib_module_t mca_btl_openib_module = {
mca_btl_openib_prepare_dst,
mca_btl_openib_send,
mca_btl_openib_put,
NULL /* get */
mca_btl_openib_get /* get */
}
};
@ -775,7 +774,7 @@ int mca_btl_openib_send(
}
/*
* RDMA local buffer to remote buffer address.
* RDMA WRITE local buffer to remote buffer address.
*/
int mca_btl_openib_put( mca_btl_base_module_t* btl,
@ -812,6 +811,45 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl,
}
/*
 * RDMA READ remote buffer to local buffer address.
 *
 * Fills in the fragment's send work request as a signaled RDMA READ and
 * posts it on the endpoint's low priority queue pair.
 *
 * @param btl        (IN) BTL instance (not referenced by this function)
 * @param endpoint   (IN) Peer endpoint; its lcl_qp_low queue pair is used
 *                        to post the work request
 * @param descriptor (IN) Descriptor, treated as an mca_btl_openib_frag_t.
 *                        des_src supplies the remote address and key,
 *                        des_dst supplies the local address and length.
 * @return OMPI_SUCCESS if the work request was posted, OMPI_ERROR if
 *         ibv_post_send() reported a failure.
 */
int mca_btl_openib_get( mca_btl_base_module_t* btl,
                        mca_btl_base_endpoint_t* endpoint,
                        mca_btl_base_descriptor_t* descriptor)
{
    struct ibv_send_wr* bad_wr;
    mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*) descriptor;

    frag->endpoint = endpoint;

    /* Remote side of the read comes from the source segment. */
    frag->wr_desc.sr_desc.opcode = IBV_WR_RDMA_READ;
    frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED;
    frag->wr_desc.sr_desc.wr.rdma.remote_addr = (uintptr_t) frag->base.des_src->seg_addr.pval;
    frag->wr_desc.sr_desc.wr.rdma.rkey = frag->base.des_src->seg_key.key32[0];

    /* Local side of the read is the destination segment. */
    frag->sg_entry.addr = (uintptr_t) frag->base.des_dst->seg_addr.pval;
    frag->sg_entry.length = frag->base.des_dst->seg_len;

    /* The verbs fields are fixed-width integers (uint64_t / uint32_t per the
     * libibverbs headers); cast each one to the exact type its conversion
     * specifier expects so the variadic call is well defined on every ABI. */
    BTL_VERBOSE(("frag->wr_desc.sr_desc.wr.rdma.remote_addr = %llu .rkey = %lu frag->sg_entry.addr = %llu .length = %lu"
                 , (unsigned long long) frag->wr_desc.sr_desc.wr.rdma.remote_addr
                 , (unsigned long) frag->wr_desc.sr_desc.wr.rdma.rkey
                 , (unsigned long long) frag->sg_entry.addr
                 , (unsigned long) frag->sg_entry.length));

    /* NOTE(review): libibverbs documents ibv_post_send() as returning the
     * errno value on failure; confirm that errno itself is also set before
     * relying on strerror(errno) for the diagnostic. */
    if(ibv_post_send(endpoint->lcl_qp_low,
                     &frag->wr_desc.sr_desc,
                     &bad_wr)){
        BTL_ERROR(("error posting send request errno says %s", strerror(errno)));
        return OMPI_ERROR;
    }

    /* Presumably tops up the posted receive requests on both priorities;
     * the macros are defined elsewhere - verify against their definition. */
    MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(endpoint, 1);
    MCA_BTL_OPENIB_ENDPOINT_POST_RR_LOW(endpoint, 1);

    return OMPI_SUCCESS;
}
/*
* Initialize the btl module by allocating a protection domain
* and creating both the high and low priority completion queues

Просмотреть файл

@ -306,7 +306,24 @@ extern int mca_btl_openib_put(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* btl_peer,
struct mca_btl_base_descriptor_t* decriptor
);
);
/**
 * PML->BTL Initiate a get (RDMA READ) described by the given descriptor.
 *
 * @param btl (IN) BTL instance
 * @param btl_peer (IN) BTL peer addressing
 * @param decriptor (IN) Descriptor of the transfer: its source segment
 *                       identifies the remote buffer and its destination
 *                       segment the local buffer that receives the data
 * @return OMPI_SUCCESS if the read was posted, OMPI_ERROR if posting failed
 */
extern int mca_btl_openib_get(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* btl_peer,
struct mca_btl_base_descriptor_t* decriptor
);
/**
* Allocate a descriptor.

Просмотреть файл

@ -41,7 +41,6 @@
#include <errno.h>
#include <string.h> /* for strerror()*/
extern int errno;
mca_btl_openib_component_t mca_btl_openib_component = {
{
/* First, the mca_base_component_t struct containing meta information
@ -507,71 +506,67 @@ int mca_btl_openib_component_progress()
* we will check the high priority and process them until there are none left.
* note that low priority messages are only processed one per progress call.
*/
do{
ne=ibv_poll_cq(openib_btl->ib_cq_high, 1, &wc );
if(ne < 0 ){
BTL_ERROR(("error polling CQ with %d errno says %s\n", ne, strerror(errno)));
ne=ibv_poll_cq(openib_btl->ib_cq_high, 1, &wc );
if(ne < 0 ){
BTL_ERROR(("error polling CQ with %d errno says %s\n", ne, strerror(errno)));
return OMPI_ERROR;
}
else if(wc.status != IBV_WC_SUCCESS) {
BTL_ERROR(("error polling CQ with status %d for wr_id %llu\n",
wc.status, wc.wr_id));
return OMPI_ERROR;
}
else if(1 == ne) {
BTL_VERBOSE(("completion queue event says opcode is %d\n", wc.opcode));
/* Handle work completions */
switch(wc.opcode) {
case IBV_WC_RECV_RDMA_WITH_IMM:
BTL_ERROR(("Got an RDMA with Immediate data Not supported!"));
return OMPI_ERROR;
}
else if(wc.status != IBV_WC_SUCCESS) {
BTL_ERROR(("error polling CQ with status %d for wr_id %llu\n",
wc.status, wc.wr_id));
return OMPI_ERROR;
}
else if(1 == ne) {
BTL_VERBOSE(("completion queue event says opcode is %d\n", wc.opcode));
case IBV_WC_RECV:
/* Process a RECV */
BTL_VERBOSE(("Got an recv on the completion queue"));
frag = (mca_btl_openib_frag_t*) wc.wr_id;
endpoint = (mca_btl_openib_endpoint_t*) frag->endpoint;
frag->rc=OMPI_SUCCESS;
frag->segment.seg_len =
wc.byte_len-
((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr);
/* advance the segment address past the header and subtract from the length..*/
openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super,
frag->hdr->tag,
&frag->base,
openib_btl->ib_reg[frag->hdr->tag].cbdata);
OPAL_THREAD_ADD32(&endpoint->rr_posted_high, -1);
MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(((mca_btl_openib_frag_t*)wc.wr_id)->endpoint, 0);
OMPI_FREE_LIST_RETURN(&(openib_btl->recv_free_eager), (opal_list_item_t*) frag);
count++;
break;
/* Handle work completions */
switch(wc.opcode) {
case IBV_WC_RECV_RDMA_WITH_IMM:
BTL_ERROR(("Got an RDMA with Immediate data Not supported!"));
return OMPI_ERROR;
case IBV_WC_RECV:
/* Process a RECV */
case IBV_WC_RDMA_READ:
case IBV_WC_RDMA_WRITE:
case IBV_WC_SEND :
BTL_VERBOSE(("Got an recv on the completion queue"));
frag = (mca_btl_openib_frag_t*) wc.wr_id;
endpoint = (mca_btl_openib_endpoint_t*) frag->endpoint;
frag->rc=OMPI_SUCCESS;
frag->segment.seg_len =
wc.byte_len-
((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr);
/* Process a completed send or rdma write*/
frag = (mca_btl_openib_frag_t*) wc.wr_id;
frag->rc = OMPI_SUCCESS;
frag->base.des_cbfunc(&openib_btl->super, frag->endpoint, &frag->base, frag->rc);
count++;
break;
/* advance the segment address past the header and subtract from the length..*/
openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super,
frag->hdr->tag,
&frag->base,
openib_btl->ib_reg[frag->hdr->tag].cbdata);
OPAL_THREAD_ADD32(&endpoint->rr_posted_high, -1);
MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(((mca_btl_openib_frag_t*)wc.wr_id)->endpoint, 0);
OMPI_FREE_LIST_RETURN(&(openib_btl->recv_free_eager), (opal_list_item_t*) frag);
count++;
break;
case IBV_WC_RDMA_WRITE:
case IBV_WC_SEND :
/* Process a completed send or rdma write*/
frag = (mca_btl_openib_frag_t*) wc.wr_id;
frag->rc = OMPI_SUCCESS;
frag->base.des_cbfunc(&openib_btl->super, frag->endpoint, &frag->base, frag->rc);
count++;
break;
break;
default:
BTL_ERROR(("Unhandled work completion opcode is %d", wc.opcode));
break;
}
default:
BTL_ERROR(("Unhandled work completion opcode is %d", wc.opcode));
break;
}
}
while(ne > 0);
ne=ibv_poll_cq(openib_btl->ib_cq_low, 1, &wc );
if(ne < 0){
@ -614,8 +609,10 @@ int mca_btl_openib_component_progress()
count++;
break;
case IBV_WC_RDMA_READ:
case IBV_WC_RDMA_WRITE:
case IBV_WC_SEND :
/* Process a completed send */
frag = (mca_btl_openib_frag_t*) wc.wr_id;
frag->rc = OMPI_SUCCESS;

Просмотреть файл

@ -32,7 +32,6 @@
#include "class/ompi_free_list.h"
#include <errno.h>
#include <string.h>
extern int errno;
static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint);
static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint);
@ -728,7 +727,7 @@ int mca_btl_openib_endpoint_create_qp(
qp_attr->qp_state = IBV_QPS_INIT;
qp_attr->pkey_index = mca_btl_openib_component.ib_pkey_ix;
qp_attr->port_num = openib_btl->port_num;
qp_attr->qp_access_flags = IBV_ACCESS_REMOTE_WRITE;
qp_attr->qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
if(ibv_modify_qp((*qp), qp_attr,
IBV_QP_STATE |

Просмотреть файл

@ -27,7 +27,6 @@
#include <errno.h>
#include <string.h>
#include "mca/btl/base/btl_base_error.h"
extern int errno;
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {

Просмотреть файл

@ -87,16 +87,6 @@ OBJ_CLASS_INSTANCE(
/*
 * Register a string parameter under the "mpool" / "openib" names and
 * return the value obtained by looking it up again.
 *
 * @param param_name    (IN) Name of the parameter to register
 * @param default_value (IN) Default passed to the registration call
 * @return The string stored by mca_base_param_lookup_string(), or NULL if
 *         the lookup did not store anything.
 */
static char* mca_mpool_openib_param_register_string(
    const char* param_name,
    const char* default_value)
{
    /* Start from NULL so the caller never receives an indeterminate pointer
     * when the lookup leaves the output argument untouched. */
    char *param_value = NULL;
    int id = mca_base_param_register_string("mpool","openib",param_name,NULL,default_value);

    mca_base_param_lookup_string(id, &param_value);
    return param_value;
}
/**
* component open/close/init function

Просмотреть файл

@ -46,10 +46,7 @@ void* mca_mpool_openib_alloc(
size_t align,
mca_mpool_base_registration_t** registration)
{
mca_mpool_openib_module_t* mpool_openib = (mca_mpool_openib_module_t*)mpool;
/* void* addr_malloc = (void*)malloc((*size) + mca_mpool_openib_component.page_size); */
/* void* addr = (void*) ALIGN_ADDR(addr_malloc, mca_mpool_openib_component.page_size_log); */
void* addr_malloc = (void*)memalign(mca_mpool_openib_component.page_size, size);
void* addr = addr_malloc;
@ -81,7 +78,7 @@ int mca_mpool_openib_register(mca_mpool_base_module_t* mpool,
mpool_module->resources.ib_pd,
addr,
size,
IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE
IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ
/* IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE */
);
@ -104,7 +101,6 @@ int mca_mpool_openib_register(mca_mpool_base_module_t* mpool,
int mca_mpool_openib_deregister(mca_mpool_base_module_t* mpool, void *addr, size_t size,
mca_mpool_base_registration_t* registration){
mca_mpool_openib_module_t * mpool_openib = (mca_mpool_openib_module_t*) mpool;
mca_mpool_openib_registration_t * openib_reg;
openib_reg = (mca_mpool_openib_registration_t*) registration;
if(ibv_dereg_mr(openib_reg->mr)){
@ -127,7 +123,7 @@ void* mca_mpool_openib_realloc(
mca_mpool_base_registration_t* old_reg = *registration;
void* new_mem = mpool->mpool_alloc(mpool, size, 0, registration);
memcpy(new_mem, addr, old_reg->bound - old_reg->base);
mpool->mpool_free(mpool, addr, &old_reg);
mpool->mpool_free(mpool, addr, old_reg);
return new_mem;
}