1
1

* put rdma operations in the send event queue instead of receive because it's
  easier to do event accounting that way
* greatly increase receive event and buffer sizes.  We're still about half
  of what Cray defaults to, so I don't feel bad about the increases
* Implement a pre-pinning optimization for eager fragments - will be
  pinned on first use and left pinned for the life of the fragment
* Since we can't have two receive frag callbacks fired at the same time,
  don't have receive free list - just keep one receive fragment in the
  module.  Saves a big free list and all that interaction.

This commit was SVN r9915.
Этот коммит содержится в:
Brian Barrett 2006-05-14 04:23:26 +00:00
родитель db03ca0cc0
Коммит dcc6b47fa2
11 изменённых файлов: 277 добавлений и 470 удалений

Просмотреть файл

@ -153,7 +153,7 @@ mca_btl_portals_add_procs(struct mca_btl_base_module_t* btl_base,
/* fill in send memory descriptor */
mca_btl_portals_module.md_send.start = NULL;
mca_btl_portals_module.md_send.length = 0;
mca_btl_portals_module.md_send.threshold = 2; /* send and ack */
mca_btl_portals_module.md_send.threshold = PTL_MD_THRESH_INF;
mca_btl_portals_module.md_send.max_size = 0;
mca_btl_portals_module.md_send.options = PTL_MD_EVENT_START_DISABLE;
mca_btl_portals_module.md_send.user_ptr = NULL;
@ -163,9 +163,6 @@ mca_btl_portals_add_procs(struct mca_btl_base_module_t* btl_base,
ret = OMPI_SUCCESS;
}
opal_output_verbose(50, mca_btl_portals_component.portals_output,
"count: %d", mca_btl_portals_module.portals_num_procs);
return ret;
}
@ -233,21 +230,20 @@ mca_btl_base_descriptor_t*
mca_btl_portals_alloc(struct mca_btl_base_module_t* btl_base,
size_t size)
{
mca_btl_portals_frag_t* frag;
int rc;
mca_btl_portals_frag_t* frag;
assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);
OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"alloc called with size %d", size));
if (size <= mca_btl_portals_module.super.btl_eager_limit) {
OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(&mca_btl_portals_module, frag, rc);
if (OMPI_SUCCESS != rc) return NULL;
frag->segments[0].seg_len =
size <= mca_btl_portals_module.super.btl_eager_limit ?
size : mca_btl_portals_module.super.btl_eager_limit ;
} else {
OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(&mca_btl_portals_module, frag, rc);
if (OMPI_SUCCESS != rc) return NULL;
frag->segments[0].seg_len =
size <= mca_btl_portals_module.super.btl_max_send_size ?
size : mca_btl_portals_module.super.btl_max_send_size ;
@ -256,10 +252,6 @@ mca_btl_portals_alloc(struct mca_btl_base_module_t* btl_base,
frag->base.des_src_cnt = 1;
frag->base.des_flags = 0;
/* can't setup off an alloc right now - we don't know how much the
caller will actually use */
frag->md_h = PTL_INVALID_HANDLE;
return &frag->base;
}
@ -272,20 +264,23 @@ mca_btl_portals_free(struct mca_btl_base_module_t* btl_base,
assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);
if (frag->md_h != PTL_INVALID_HANDLE) {
OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"rdma frag free frag 0x%x, callback 0x%x, bits %lld",
frag, frag->base.des_cbfunc, frag->segments[0].seg_key.key64));
PtlMDUnlink(frag->md_h);
}
if (frag->size == 0) {
OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag);
} else if (frag->size == mca_btl_portals_module.super.btl_eager_limit){
if (frag->size == mca_btl_portals_module.super.btl_eager_limit){
/* don't ever unlink eager frags */
OMPI_BTL_PORTALS_FRAG_RETURN_EAGER(&mca_btl_portals_module.super, frag);
} else if (frag->size == mca_btl_portals_module.super.btl_max_send_size) {
if (frag->md_h != PTL_INVALID_HANDLE) {
PtlMDUnlink(frag->md_h);
frag->md_h = PTL_INVALID_HANDLE;
}
OMPI_BTL_PORTALS_FRAG_RETURN_MAX(&mca_btl_portals_module.super, frag);
} else {
} else if (frag->size == 0) {
if (frag->md_h != PTL_INVALID_HANDLE) {
PtlMDUnlink(frag->md_h);
frag->md_h = PTL_INVALID_HANDLE;
}
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag);
} else {
return OMPI_ERR_BAD_PARAM;
}
@ -310,165 +305,100 @@ mca_btl_portals_prepare_src(struct mca_btl_base_module_t* btl_base,
assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);
OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"prepare_src called with size %d", *size));
if (0 != ompi_convertor_need_buffers(convertor)) {
/* if we need to use buffers to pack the data, grab either an
eager or (if we need more space) max buffer, pack the data
into the first segment, and return */
if (max_data+reserve <= mca_btl_portals_module.super.btl_eager_limit) {
/*
* if we can't send out of the buffer directly and the
* requested size is less than the eager limit, pack into a
* fragment from the eager pool
*/
OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(&mca_btl_portals_module, frag, ret);
if (NULL == frag) {
return NULL;
}
iov.iov_len = max_data;
iov.iov_base = (unsigned char*) frag->segments[0].seg_addr.pval + reserve;
ret = ompi_convertor_pack(convertor, &iov, &iov_count,
&max_data, &free_after);
*size = max_data;
if (ret < 0) {
OMPI_BTL_PORTALS_FRAG_RETURN_EAGER(&mca_btl_portals_module, frag);
return NULL;
}
frag->segments[0].seg_len = max_data + reserve;
frag->base.des_src_cnt = 1;
} else {
/*
* otherwise pack as much data as we can into a fragment
* that is the max send size.
*/
OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(&mca_btl_portals_module, frag, ret);
if (NULL == frag) {
return NULL;
}
if (max_data + reserve > mca_btl_portals_module.super.btl_max_send_size){
max_data = mca_btl_portals_module.super.btl_max_send_size - reserve;
}
iov.iov_len = max_data;
iov.iov_base = (unsigned char*) frag->segments[0].seg_addr.pval + reserve;
ret = ompi_convertor_pack(convertor, &iov, &iov_count,
&max_data, &free_after);
*size = max_data;
if ( ret < 0 ) {
OMPI_BTL_PORTALS_FRAG_RETURN_MAX(&mca_btl_portals_module, frag);
return NULL;
}
frag->segments[0].seg_len = max_data + reserve;
frag->base.des_src_cnt = 1;
if (0 != reserve || 0 != ompi_convertor_need_buffers(convertor)) {
frag = (mca_btl_portals_frag_t*)
mca_btl_portals_alloc(btl_base, max_data + reserve);
if (NULL == frag) {
return NULL;
}
/* clearly a send - delay setup of memory descriptor until send */
frag->md_h = PTL_INVALID_HANDLE;
if (max_data + reserve > frag->size) {
max_data = frag->size - reserve;
}
iov.iov_len = max_data;
iov.iov_base = (unsigned char*) frag->segments[0].seg_addr.pval + reserve;
ret = ompi_convertor_pack(convertor, &iov, &iov_count,
&max_data, &free_after);
*size = max_data;
if ( ret < 0 ) {
return NULL;
}
frag->segments[0].seg_len = max_data + reserve;
frag->base.des_src_cnt = 1;
} else {
/* no need to pack - we can send directly out of the user's
buffer. If we have reserve space, use an eager fragment
and give the caller the eager space as reserve. If we have
no reserve space needs, use a user frag */
if (0 == reserve) {
ptl_md_t md;
ptl_handle_me_t me_h;
/* no need to pack - rdma operation out of user's buffer */
ptl_md_t md;
ptl_handle_me_t me_h;
/* user frags are always setup to use only one fragment */
OMPI_BTL_PORTALS_FRAG_ALLOC_USER(&mca_btl_portals_module.super, frag, ret);
if(NULL == frag){
return NULL;
}
iov.iov_len = max_data;
iov.iov_base = NULL;
/* reserve space in the event queue for rdma operations immediately */
while (OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, 1) >
mca_btl_portals_module.portals_max_outstanding_ops) {
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
mca_btl_portals_component_progress();
}
ompi_convertor_pack(convertor, &iov, &iov_count, &max_data,
&free_after);
OMPI_BTL_PORTALS_FRAG_ALLOC_USER(&mca_btl_portals_module.super, frag, ret);
if(NULL == frag){
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
return NULL;
}
iov.iov_len = max_data;
iov.iov_base = NULL;
frag->segments[0].seg_len = max_data;
frag->segments[0].seg_addr.pval = iov.iov_base;
frag->segments[0].seg_key.key64 = OPAL_THREAD_ADD64(&(mca_btl_portals_module.portals_rdma_key), 1);
frag->base.des_src_cnt = 1;
ompi_convertor_pack(convertor, &iov, &iov_count, &max_data,
&free_after);
/* either a put or get. figure out which later */
frag->segments[0].seg_len = max_data;
frag->segments[0].seg_addr.pval = iov.iov_base;
frag->segments[0].seg_key.key64 =
OPAL_THREAD_ADD64(&(mca_btl_portals_module.portals_rdma_key), 1);
frag->base.des_src_cnt = 1;
OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"rdma src posted for frag 0x%x, callback 0x%x, bits %lld",
frag, frag->base.des_cbfunc, frag->segments[0].seg_key.key64));
/* either a put or get. figure out which later */
OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"rdma src posted for frag 0x%x, callback 0x%x, bits %lld",
frag, frag->base.des_cbfunc, frag->segments[0].seg_key.key64));
/* create a match entry */
ret = PtlMEAttach(mca_btl_portals_module.portals_ni_h,
OMPI_BTL_PORTALS_RDMA_TABLE_ID,
*((mca_btl_base_endpoint_t*) peer),
frag->segments[0].seg_key.key64, /* match */
0, /* ignore */
PTL_UNLINK,
PTL_INS_AFTER,
&me_h);
if (PTL_OK != ret) {
opal_output(mca_btl_portals_component.portals_output,
"Error creating rdma src ME: %d", ret);
OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag);
return NULL;
}
/* create a match entry */
ret = PtlMEAttach(mca_btl_portals_module.portals_ni_h,
OMPI_BTL_PORTALS_RDMA_TABLE_ID,
*((mca_btl_base_endpoint_t*) peer),
frag->segments[0].seg_key.key64, /* match */
0, /* ignore */
PTL_UNLINK,
PTL_INS_AFTER,
&me_h);
if (PTL_OK != ret) {
opal_output(mca_btl_portals_component.portals_output,
"Error creating rdma src ME: %d", ret);
OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag);
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
return NULL;
}
/* setup the memory descriptor. RDMA should never need to be
retransmitted, so we set the threshold for the event it will
receive (PUT/GET START and END). No need to track the unlinks
later :) */
md.start = frag->segments[0].seg_addr.pval;
md.length = frag->segments[0].seg_len;
md.threshold = PTL_MD_THRESH_INF;
md.max_size = 0;
md.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_EVENT_START_DISABLE;
md.user_ptr = frag; /* keep a pointer to ourselves */
md.eq_handle = mca_btl_portals_module.portals_eq_handles[OMPI_BTL_PORTALS_EQ];
/* setup the memory descriptor */
md.start = frag->segments[0].seg_addr.pval;
md.length = frag->segments[0].seg_len;
md.threshold = PTL_MD_THRESH_INF;
md.max_size = 0;
md.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_EVENT_START_DISABLE;
md.user_ptr = frag; /* keep a pointer to ourselves */
md.eq_handle = mca_btl_portals_module.portals_eq_handles[OMPI_BTL_PORTALS_EQ_SEND];
ret = PtlMDAttach(me_h,
md,
PTL_UNLINK,
&(frag->md_h));
if (PTL_OK != ret) {
opal_output(mca_btl_portals_component.portals_output,
"Error creating rdma src MD: %d", ret);
PtlMEUnlink(me_h);
OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag);
return NULL;
}
} else {
OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(&mca_btl_portals_module, frag, ret);
if (NULL == frag) {
return NULL;
}
iov.iov_len = max_data;
iov.iov_base = NULL;
ret = ompi_convertor_pack(convertor, &iov, &iov_count,
&max_data, &free_after);
*size = max_data;
if (ret < 0) {
OMPI_BTL_PORTALS_FRAG_RETURN_EAGER(&mca_btl_portals_module, frag);
return NULL;
}
frag->segments[0].seg_len = reserve;
frag->segments[1].seg_addr.pval = iov.iov_base;
frag->segments[1].seg_len = max_data;
frag->base.des_src_cnt = 2;
frag->iov[0].iov_base = frag->segments[0].seg_addr.pval;
frag->iov[0].iov_len = frag->segments[0].seg_len;
frag->iov[1].iov_base = frag->segments[1].seg_addr.pval;
frag->iov[1].iov_len = frag->segments[1].seg_len;
/* clearly a send - delay setup of memory descriptor until send */
frag->md_h = PTL_INVALID_HANDLE;
ret = PtlMDAttach(me_h,
md,
PTL_UNLINK,
&(frag->md_h));
if (PTL_OK != ret) {
opal_output(mca_btl_portals_component.portals_output,
"Error creating rdma src MD: %d", ret);
PtlMEUnlink(me_h);
OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag);
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
return NULL;
}
}
@ -497,22 +427,29 @@ mca_btl_portals_prepare_dst(struct mca_btl_base_module_t* btl_base,
assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);
/* reserve space in the event queue for rdma operations immediately */
while (OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, 1) >
mca_btl_portals_module.portals_max_outstanding_ops) {
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
mca_btl_portals_component_progress();
}
OMPI_BTL_PORTALS_FRAG_ALLOC_USER(&mca_btl_portals_module.super, frag, ret);
if(NULL == frag) {
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
return NULL;
}
ompi_ddt_type_lb(convertor->pDesc, &lb);
frag->segments[0].seg_len = *size;
frag->segments[0].seg_addr.pval = convertor->pBaseBuf + lb + convertor->bConverted;
frag->segments[0].seg_key.key64 = OPAL_THREAD_ADD64(&(mca_btl_portals_module.portals_rdma_key), 1);
frag->segments[0].seg_key.key64 =
OPAL_THREAD_ADD64(&(mca_btl_portals_module.portals_rdma_key), 1);
frag->base.des_src = NULL;
frag->base.des_src_cnt = 0;
frag->base.des_dst = frag->segments;
frag->base.des_dst_cnt = 1;
frag->base.des_flags = 0;
frag->type = mca_btl_portals_frag_type_rdma;
OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"rdma dest posted for frag 0x%x, callback 0x%x, bits %lld",
@ -530,21 +467,19 @@ mca_btl_portals_prepare_dst(struct mca_btl_base_module_t* btl_base,
if (PTL_OK != ret) {
opal_output(mca_btl_portals_component.portals_output,
"Error creating rdma dest ME: %d", ret);
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag);
return NULL;
}
/* setup the memory descriptor. RDMA should never need to be
retransmitted, so we set the threshold for the event it will
receive (PUT/GET START and END). No need to track the unlinks
later :) */
/* setup the memory descriptor. */
md.start = frag->segments[0].seg_addr.pval;
md.length = frag->segments[0].seg_len;
md.threshold = PTL_MD_THRESH_INF;
md.max_size = 0;
md.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_EVENT_START_DISABLE;
md.user_ptr = frag; /* keep a pointer to ourselves */
md.eq_handle = mca_btl_portals_module.portals_eq_handles[OMPI_BTL_PORTALS_EQ];
md.eq_handle = mca_btl_portals_module.portals_eq_handles[OMPI_BTL_PORTALS_EQ_SEND];
ret = PtlMDAttach(me_h,
md,
@ -554,6 +489,7 @@ mca_btl_portals_prepare_dst(struct mca_btl_base_module_t* btl_base,
opal_output(mca_btl_portals_component.portals_output,
"Error creating rdma dest MD: %d", ret);
PtlMEUnlink(me_h);
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag);
return NULL;
}
@ -570,16 +506,10 @@ mca_btl_portals_finalize(struct mca_btl_base_module_t *btl_base)
assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);
/* finalize all communication */
while (mca_btl_portals_module.portals_outstanding_sends > 0) {
while (mca_btl_portals_module.portals_outstanding_ops > 0) {
mca_btl_portals_component_progress();
}
if (0 != opal_list_get_size(&(mca_btl_portals_module.portals_queued_sends))) {
opal_output(mca_btl_portals_component.portals_output,
"Warning: there were %d queued sends not sent",
opal_list_get_size(&(mca_btl_portals_module.portals_queued_sends)));
}
if (mca_btl_portals_module.portals_num_procs != 0) {
int i;
@ -597,7 +527,10 @@ mca_btl_portals_finalize(struct mca_btl_base_module_t *btl_base)
}
OBJ_DESTRUCT(&mca_btl_portals_module.portals_recv_blocks);
OBJ_DESTRUCT(&mca_btl_portals_module.portals_queued_sends);
OBJ_DESTRUCT(&mca_btl_portals_module.portals_recv_frag);
OBJ_DESTRUCT(&mca_btl_portals_module.portals_frag_eager);
OBJ_DESTRUCT(&mca_btl_portals_module.portals_frag_max);
OBJ_DESTRUCT(&mca_btl_portals_module.portals_frag_user);
if (PTL_INVALID_HANDLE != mca_btl_portals_module.portals_ni_h) {
ret = PtlNIFini(mca_btl_portals_module.portals_ni_h);

Просмотреть файл

@ -31,6 +31,7 @@
#include "orte/class/orte_proc_table.h"
#include "btl_portals_endpoint.h"
#include "btl_portals_frag.h"
#define OMPI_BTL_PORTALS_SEND_TABLE_ID (OMPI_BTL_PORTALS_STARTING_TABLE_ID + 0)
#define OMPI_BTL_PORTALS_RDMA_TABLE_ID (OMPI_BTL_PORTALS_STARTING_TABLE_ID + 1)
@ -66,12 +67,15 @@ struct mca_btl_portals_component_t {
int portals_free_list_max_num;
/* numer of elements to grow free lists */
int portals_free_list_inc_num;
/* number of eager fragments */
int portals_free_list_eager_max_num;
};
typedef struct mca_btl_portals_component_t mca_btl_portals_component_t;
#define OMPI_BTL_PORTALS_EQ_SEND 0
#define OMPI_BTL_PORTALS_EQ 1
#define OMPI_BTL_PORTALS_EQ_RECV 1
#define OMPI_BTL_PORTALS_EQ_SIZE 2
struct mca_btl_portals_module_t {
@ -89,13 +93,15 @@ struct mca_btl_portals_module_t {
ompi_free_list_t portals_frag_eager;
ompi_free_list_t portals_frag_max;
ompi_free_list_t portals_frag_user;
ompi_free_list_t portals_frag_recv;
/* incoming send message receive memory descriptors */
int portals_recv_mds_num;
int portals_recv_mds_size;
opal_list_t portals_recv_blocks;
/* frag for receive callbacks */
mca_btl_portals_frag_recv_t portals_recv_frag;
/* event queues. Keep sends on own eq, since we can't control
space for the ack otherwise */
int portals_eq_sizes[OMPI_BTL_PORTALS_EQ_SIZE];
@ -104,11 +110,11 @@ struct mca_btl_portals_module_t {
/* "reject" entry for recv match list */
ptl_handle_me_t portals_recv_reject_me_h;
/* number outstanding sends */
volatile int32_t portals_outstanding_sends;
int32_t portals_max_outstanding_sends;
/* number outstanding sends and local rdma */
volatile int32_t portals_outstanding_ops;
int32_t portals_max_outstanding_ops;
/* queued sends */
/* sends queued until there's time to send */
opal_list_t portals_queued_sends;
/* key to use for next rdma operation */

Просмотреть файл

@ -119,7 +119,7 @@ mca_btl_portals_add_procs_compat(struct mca_btl_portals_module_t* btl,
int ret;
if (use_modex) {
int my_rid;
int my_rid = 0;
ptl_process_id_t *info;
char *nidmap = NULL;
char *pidmap = NULL;

Просмотреть файл

@ -20,6 +20,7 @@
#include <sys/types.h>
#include <unistd.h>
#include <limits.h>
#if OMPI_BTL_PORTALS_REDSTORM
#include <catamount/cnos_mpi_os.h>
#endif
@ -91,11 +92,11 @@ mca_btl_portals_component_open(void)
"Debugging verbosity (0 - 100)",
false,
false,
OMPI_BTL_PORTALS_DEFAULT_DEBUG_LEVEL,
0,
&(portals_output_stream.lds_verbose_level));
#if OMPI_BTL_PORTALS_REDSTORM
asprintf(&(portals_output_stream.lds_prefix),
"btl: portals (%2d): ", cnos_get_rank());
"btl: portals (%5d): ", cnos_get_rank());
#else
asprintf(&(portals_output_stream.lds_prefix),
"btl: portals (%5d): ", getpid());
@ -118,22 +119,29 @@ mca_btl_portals_component_open(void)
"Initial number of elements to initialize in free lists",
false,
false,
OMPI_BTL_PORTALS_DEFAULT_FREE_LIST_INIT_NUM,
16,
&(mca_btl_portals_component.portals_free_list_init_num));
mca_base_param_reg_int(&mca_btl_portals_component.super.btl_version,
"free_list_max_num",
"Max number of elements to initialize in free lists",
false,
false,
OMPI_BTL_PORTALS_DEFAULT_FREE_LIST_MAX_NUM,
1024,
&(mca_btl_portals_component.portals_free_list_max_num));
mca_base_param_reg_int(&mca_btl_portals_component.super.btl_version,
"free_list_inc_num",
"Increment count for free lists",
false,
false,
OMPI_BTL_PORTALS_DEFAULT_FREE_LIST_INC_NUM,
16,
&(mca_btl_portals_component.portals_free_list_inc_num));
mca_base_param_reg_int(&mca_btl_portals_component.super.btl_version,
"eager_frag_limit",
"Maximum number of pre-pinned eager fragments",
false,
false,
32,
&(mca_btl_portals_component.portals_free_list_eager_max_num));
/*
* fill default module state
@ -143,7 +151,7 @@ mca_btl_portals_component_open(void)
"Maximum size for eager frag",
false,
false,
OMPI_BTL_PORTALS_DEFAULT_EAGER_LIMIT,
32 * 1024,
&dummy);
mca_btl_portals_module.super.btl_eager_limit = dummy;
@ -152,7 +160,7 @@ mca_btl_portals_component_open(void)
"Minimum size for a send frag",
false,
false,
OMPI_BTL_PORTALS_DEFAULT_MIN_SEND_SIZE,
32 * 1024,
&dummy);
mca_btl_portals_module.super.btl_min_send_size = dummy;
mca_base_param_reg_int(&mca_btl_portals_component.super.btl_version,
@ -160,7 +168,7 @@ mca_btl_portals_component_open(void)
"Maximum size for a send frag",
false,
false,
OMPI_BTL_PORTALS_DEFAULT_MAX_SEND_SIZE,
64 * 1024,
&dummy);
mca_btl_portals_module.super.btl_max_send_size = dummy;
mca_base_param_reg_int(&mca_btl_portals_component.super.btl_version,
@ -168,7 +176,7 @@ mca_btl_portals_component_open(void)
"Minimum size for a rdma frag",
false,
false,
OMPI_BTL_PORTALS_DEFAULT_MIN_RDMA_SIZE,
64 * 1024,
&dummy);
mca_btl_portals_module.super.btl_min_rdma_size = dummy;
mca_base_param_reg_int(&mca_btl_portals_component.super.btl_version,
@ -176,7 +184,7 @@ mca_btl_portals_component_open(void)
"Maximum size for a rdma frag",
false,
false,
OMPI_BTL_PORTALS_DEFAULT_MAX_RDMA_SIZE,
INT_MAX,
&dummy);
mca_btl_portals_module.super.btl_max_rdma_size = dummy;
@ -205,11 +213,10 @@ mca_btl_portals_component_open(void)
&dummy);
mca_btl_portals_module.super.btl_bandwidth = dummy;
#if 0 /* it appears that copying is faster than iovecs at present */
mca_btl_portals_module.super.btl_flags = MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE;
#else
/* send in place actually increases our latency because we have to
hold on to the buffer until we're done with it, rather than
copy and send. So don't use it for now. */
mca_btl_portals_module.super.btl_flags = MCA_BTL_FLAGS_RDMA;
#endif
mca_btl_portals_module.portals_num_procs = 0;
bzero(&(mca_btl_portals_module.portals_reg),
@ -222,23 +229,23 @@ mca_btl_portals_component_open(void)
/* eq handles will be created when the module is instantiated.
Set sizes here */
mca_base_param_reg_int(&mca_btl_portals_component.super.btl_version,
"eq_size",
"Size of the event queue",
"eq_recv_size",
"Size of the receive event queue",
false,
false,
OMPI_BTL_PORTALS_DEFAULT_RECV_QUEUE_SIZE,
&(mca_btl_portals_module.portals_eq_sizes[OMPI_BTL_PORTALS_EQ]));
16 * 1024,
&(mca_btl_portals_module.portals_eq_sizes[OMPI_BTL_PORTALS_EQ_RECV]));
mca_base_param_reg_int(&mca_btl_portals_component.super.btl_version,
"eq_send_max_pending",
"Maximum number of pending send frags",
"max_pending_ops",
"Maximum number of pending send/rdma frags",
false,
false,
OMPI_BTL_PORTALS_MAX_SENDS_PENDING,
&(mca_btl_portals_module.portals_max_outstanding_sends));
/* sends_pending * 2 for end, ack */
8 * 1024,
&(mca_btl_portals_module.portals_max_outstanding_ops));
/* ops_pending * 2 for end, ack */
mca_btl_portals_module.portals_eq_sizes[OMPI_BTL_PORTALS_EQ_SEND] =
mca_btl_portals_module.portals_max_outstanding_sends * 2;
mca_btl_portals_module.portals_max_outstanding_ops * 2;
mca_btl_portals_module.portals_recv_reject_me_h = PTL_INVALID_HANDLE;
@ -247,19 +254,19 @@ mca_btl_portals_component_open(void)
"Number of send frag receive descriptors",
false,
false,
OMPI_BTL_PORTALS_DEFAULT_RECV_MD_NUM,
3,
&(mca_btl_portals_module.portals_recv_mds_num));
mca_base_param_reg_int(&mca_btl_portals_component.super.btl_version,
"recv_md_size",
"Size of send frag receive descriptors",
false,
false,
OMPI_BTL_PORTALS_DEFAULT_RECV_MD_SIZE,
10 * 1024 * 1024,
&(mca_btl_portals_module.portals_recv_mds_size));
mca_btl_portals_module.portals_ni_h = PTL_INVALID_HANDLE;
mca_btl_portals_module.portals_sr_dropped = 0;
mca_btl_portals_module.portals_outstanding_sends = 0;
mca_btl_portals_module.portals_outstanding_ops = 0;
mca_btl_portals_module.portals_rdma_key = 1;
return OMPI_SUCCESS;
@ -315,7 +322,6 @@ mca_btl_portals_component_init(int *num_btls,
OBJ_CONSTRUCT(&(mca_btl_portals_module.portals_frag_eager), ompi_free_list_t);
OBJ_CONSTRUCT(&(mca_btl_portals_module.portals_frag_max), ompi_free_list_t);
OBJ_CONSTRUCT(&(mca_btl_portals_module.portals_frag_user), ompi_free_list_t);
OBJ_CONSTRUCT(&(mca_btl_portals_module.portals_frag_recv), ompi_free_list_t);
/* eager frags */
ompi_free_list_init(&(mca_btl_portals_module.portals_frag_eager),
@ -323,7 +329,7 @@ mca_btl_portals_component_init(int *num_btls,
mca_btl_portals_module.super.btl_eager_limit,
OBJ_CLASS(mca_btl_portals_frag_eager_t),
mca_btl_portals_component.portals_free_list_init_num,
mca_btl_portals_component.portals_free_list_max_num,
mca_btl_portals_component.portals_free_list_eager_max_num,
mca_btl_portals_component.portals_free_list_inc_num,
NULL);
@ -347,19 +353,16 @@ mca_btl_portals_component_init(int *num_btls,
NULL);
/* recv frags */
ompi_free_list_init(&(mca_btl_portals_module.portals_frag_recv),
sizeof(mca_btl_portals_frag_recv_t),
OBJ_CLASS(mca_btl_portals_frag_recv_t),
mca_btl_portals_component.portals_free_list_init_num,
mca_btl_portals_component.portals_free_list_max_num,
mca_btl_portals_component.portals_free_list_inc_num,
NULL);
OBJ_CONSTRUCT(&(mca_btl_portals_module.portals_recv_frag),
mca_btl_portals_frag_recv_t);
/* receive block list */
OBJ_CONSTRUCT(&(mca_btl_portals_module.portals_recv_blocks), opal_list_t);
/* pending sends */
OBJ_CONSTRUCT(&(mca_btl_portals_module.portals_queued_sends), opal_list_t);
/* list for send requests that have to be delayed */
OBJ_CONSTRUCT(&(mca_btl_portals_module.portals_queued_sends),
opal_list_t);
*num_btls = 1;
opal_output_verbose(20, mca_btl_portals_component.portals_output,
@ -386,18 +389,9 @@ mca_btl_portals_component_progress(void)
while (true) {
ret = PtlEQPoll(mca_btl_portals_module.portals_eq_handles,
OMPI_BTL_PORTALS_EQ_SIZE,
#if OMPI_BTL_PORTALS_REDSTORM
0, /* timeout */
#else
/* with a timeout of 0, the reference
implementation seems to get really unhappy
really fast when communication starts between
all peers at the same time. Slowing things
down a bit seems to help a bunch. */
1, /* timeout */
#endif
&ev,
&which);
0, /* timeout */
&ev, /* event structure to update */
&which); /* which queue the event came from - we don't care */
switch (ret) {
case PTL_OK:
frag = ev.md.user_ptr;
@ -406,7 +400,6 @@ mca_btl_portals_component_progress(void)
switch (ev.type) {
case PTL_EVENT_GET_START:
/* generated on source (target) when a get from memory starts */
OPAL_OUTPUT_VERBOSE((900, mca_btl_portals_component.portals_output,
"PTL_EVENT_GET_START for 0x%x, %d",
frag, (int) ev.hdr_data));
@ -415,7 +408,6 @@ mca_btl_portals_component_progress(void)
case PTL_EVENT_GET_END:
/* generated on source (target) when a get from memory ends */
OPAL_OUTPUT_VERBOSE((900, mca_btl_portals_component.portals_output,
"PTL_EVENT_GET_END for 0x%x, %d",
frag, (int) ev.hdr_data));
@ -424,7 +416,6 @@ mca_btl_portals_component_progress(void)
case PTL_EVENT_PUT_START:
/* generated on destination (target) when a put into memory starts */
OPAL_OUTPUT_VERBOSE((900, mca_btl_portals_component.portals_output,
"PTL_EVENT_PUT_START for 0x%x, %d",
frag, (int) ev.hdr_data));
@ -446,7 +437,6 @@ mca_btl_portals_component_progress(void)
case PTL_EVENT_PUT_END:
/* generated on destination (target) when a put into memory ends */
OPAL_OUTPUT_VERBOSE((900, mca_btl_portals_component.portals_output,
"PTL_EVENT_PUT_END for 0x%x, %d",
frag, (int) ev.hdr_data));
@ -465,13 +455,15 @@ mca_btl_portals_component_progress(void)
block = ev.md.user_ptr;
tag = ev.hdr_data;
OMPI_BTL_PORTALS_FRAG_ALLOC_RECV(&mca_btl_portals_module, frag, ret);
/* if we ever make this thread hot, need to do
something with the receive fragments */
frag = &mca_btl_portals_module.portals_recv_frag;
frag->segments[0].seg_addr.pval = (((char*) ev.md.start) + ev.offset);
frag->segments[0].seg_len = ev.mlength;
OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"received send fragment %x (thresh: %d)",
frag, ev.md.threshold));
"received send fragment %x (thresh: %d, length %d)",
frag, ev.md.threshold, (int) ev.mlength));
if (ev.md.length - (ev.offset + ev.mlength) < ev.md.max_size ||
ev.md.threshold == 1) {
@ -491,8 +483,6 @@ mca_btl_portals_component_progress(void)
tag,
&frag->base,
mca_btl_portals_module.portals_reg[tag].cbdata);
OMPI_BTL_PORTALS_FRAG_RETURN_RECV(&mca_btl_portals_module.super,
frag);
mca_btl_portals_return_block_part(&mca_btl_portals_module, block);
}
break;
@ -502,8 +492,8 @@ mca_btl_portals_component_progress(void)
returning data */
OPAL_OUTPUT_VERBOSE((900, mca_btl_portals_component.portals_output,
"PTL_EVENT_REPLY_START for 0x%x, %d, %d",
frag, (int) frag->type, (int) ev.hdr_data));
"PTL_EVENT_REPLY_START for 0x%x, %d",
frag, (int) ev.hdr_data));
break;
@ -512,8 +502,7 @@ mca_btl_portals_component_progress(void)
done returning data */
OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"PTL_EVENT_REPLY_END for 0x%x, %d",
frag, (int) frag->type));
"PTL_EVENT_REPLY_END for 0x%x", frag));
/* let the PML know we're done */
frag->base.des_cbfunc(&mca_btl_portals_module.super,
@ -528,18 +517,12 @@ mca_btl_portals_component_progress(void)
#if OMPI_ENABLE_DEBUG
OPAL_OUTPUT_VERBOSE((900, mca_btl_portals_component.portals_output,
"PTL_EVENT_SEND_START for 0x%x, %d, %d",
frag, (int) frag->type, (int) ev.hdr_data));
"PTL_EVENT_SEND_START for 0x%x, %d",
frag, (int) ev.hdr_data));
if (ev.ni_fail_type != PTL_NI_OK) {
opal_output(mca_btl_portals_component.portals_output,
"Failure to start send event\n");
if (ev.hdr_data < MCA_BTL_TAG_MAX) {
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_sends,
-1);
/* unlink, since we don't expect to get an end or ack */
}
PtlMDUnlink(ev.md_handle);
frag->base.des_cbfunc(&mca_btl_portals_module.super,
frag->endpoint,
&frag->base,
@ -552,18 +535,12 @@ mca_btl_portals_component_progress(void)
/* generated on source (origin) when put stops sending */
#if OMPI_ENABLE_DEBUG
OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"PTL_EVENT_SEND_END for 0x%x, %d, %d",
frag, (int) frag->type, (int) ev.hdr_data));
"PTL_EVENT_SEND_END for 0x%x, %d",
frag, (int) ev.hdr_data));
if (ev.ni_fail_type != PTL_NI_OK) {
opal_output(mca_btl_portals_component.portals_output,
"Failure to end send event\n");
if (ev.hdr_data < MCA_BTL_TAG_MAX) {
/* unlink, since we don't expect to get an ack */
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_sends,
-1);
PtlMDUnlink(ev.md_handle);
}
frag->base.des_cbfunc(&mca_btl_portals_module.super,
frag->endpoint,
&frag->base,
@ -580,20 +557,12 @@ mca_btl_portals_component_progress(void)
Requeue the put on badness */
OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"PTL_EVENT_ACK for 0x%x, %d",
frag, (int) frag->type));
if (frag->type == mca_btl_portals_frag_type_send) {
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_sends,
-1);
}
"PTL_EVENT_ACK for 0x%x", frag));
#if OMPI_ENABLE_DEBUG
if (ev.ni_fail_type != PTL_NI_OK) {
opal_output(mca_btl_portals_component.portals_output,
"Failure to ack event\n");
/* unlink, since we don't expect to get an ack */
PtlMDUnlink(ev.md_handle);
frag->base.des_cbfunc(&mca_btl_portals_module.super,
frag->endpoint,
&frag->base,
@ -608,14 +577,15 @@ mca_btl_portals_component_progress(void)
buffer space available for receiving */
opal_output_verbose(50,
mca_btl_portals_component.portals_output,
"message was dropped. Adding to front of queue list");
opal_list_prepend(&(mca_btl_portals_module.portals_queued_sends),
(opal_list_item_t*) frag);
"message was dropped. Trying again");
mca_btl_portals_send(&mca_btl_portals_module.super,
frag->endpoint,
&frag->base,
frag->hdr.tag);
} else {
/* other side received the message. should have
received entire thing */
/* let the PML know we're done */
frag->base.des_cbfunc(&mca_btl_portals_module.super,
frag->endpoint,
@ -623,7 +593,11 @@ mca_btl_portals_component_progress(void)
OMPI_SUCCESS);
}
if (frag->type == mca_btl_portals_frag_type_send) {
opal_output_verbose(50, mca_btl_portals_component.portals_output, "fuck");
if (0 != frag->size) {
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops,
-1);
MCA_BTL_PORTALS_PROGRESS_QUEUED_SENDS();
}

Просмотреть файл

@ -34,9 +34,7 @@ mca_btl_portals_frag_common_send_constructor(mca_btl_portals_frag_t* frag)
frag->segments[0].seg_len = frag->size;
frag->segments[0].seg_key.key64 = 0;
frag->segments[1].seg_addr.pval = 0;
frag->segments[1].seg_len = 0;
frag->segments[1].seg_key.key64 = 0;
frag->md_h = PTL_INVALID_HANDLE;
}
@ -48,6 +46,16 @@ mca_btl_portals_frag_eager_constructor(mca_btl_portals_frag_t* frag)
}
/*
 * Destructor for eager fragments.
 *
 * If the fragment holds a memory descriptor handle in frag->md_h, unlink
 * it and reset the handle to PTL_INVALID_HANDLE.  A fragment whose handle
 * is still PTL_INVALID_HANDLE has nothing to release and is left alone.
 */
static void
mca_btl_portals_frag_eager_destructor(mca_btl_portals_frag_t* frag)
{
    /* only unlink a handle that is actually valid; passing
       PTL_INVALID_HANDLE to PtlMDUnlink would be an error and would
       leave any real descriptor bound */
    if (PTL_INVALID_HANDLE != frag->md_h) {
        PtlMDUnlink(frag->md_h);
        frag->md_h = PTL_INVALID_HANDLE;
    }
}
static void
mca_btl_portals_frag_max_constructor(mca_btl_portals_frag_t* frag)
{
@ -76,7 +84,6 @@ mca_btl_portals_frag_recv_constructor(mca_btl_portals_frag_t* frag)
frag->base.des_src = NULL;
frag->base.des_src_cnt = 0;
frag->size = 0;
frag->type = mca_btl_portals_frag_type_recv;
}
@ -90,7 +97,7 @@ OBJ_CLASS_INSTANCE(
mca_btl_portals_frag_eager_t,
mca_btl_base_descriptor_t,
mca_btl_portals_frag_eager_constructor,
NULL);
mca_btl_portals_frag_eager_destructor);
OBJ_CLASS_INSTANCE(
mca_btl_portals_frag_max_t,

Просмотреть файл

@ -30,16 +30,15 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_portals_frag_t);
*/
struct mca_btl_portals_frag_t {
mca_btl_base_descriptor_t base;
mca_btl_base_segment_t segments[2];
ptl_md_iovec_t iov[2];
mca_btl_base_segment_t segments[1];
/* needed for retransmit case */
struct mca_btl_base_endpoint_t *endpoint;
/* needed for retransmit case */
mca_btl_base_header_t hdr;
enum { mca_btl_portals_frag_type_send,
mca_btl_portals_frag_type_recv,
mca_btl_portals_frag_type_rdma} type;
/* handle to use for communication */
ptl_handle_md_t md_h;
/* size of the allocated memory region -- not the amount of data
we need to send */
size_t size;
};
@ -59,69 +58,61 @@ OBJ_CLASS_DECLARATION(mca_btl_portals_frag_user_t);
typedef struct mca_btl_portals_frag_t mca_btl_portals_frag_recv_t;
OBJ_CLASS_DECLARATION(mca_btl_portals_frag_recv_t);
/*
* Macros to allocate/return descriptors from module specific
* free list(s).
*/
#define OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(btl_macro, frag, rc) \
#define OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(btl_macro, frag, rc) \
{ \
\
opal_list_item_t *item; \
OMPI_FREE_LIST_WAIT(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_eager, item, rc); \
frag = (mca_btl_portals_frag_t*) item; \
OMPI_FREE_LIST_GET(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_eager, item, rc); \
if (rc == OMPI_ERR_TEMP_OUT_OF_RESOURCE) { \
OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(btl_macro, frag, rc); \
} \
frag = (mca_btl_portals_frag_t*) item; \
}
#define OMPI_BTL_PORTALS_FRAG_RETURN_EAGER(btl_macro, frag) \
#define OMPI_BTL_PORTALS_FRAG_RETURN_EAGER(btl_macro, frag) \
{ \
OMPI_FREE_LIST_RETURN(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_eager, \
(opal_list_item_t*)(frag)); \
}
#define OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(btl_macro, frag, rc) \
#define OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(btl_macro, frag, rc) \
{ \
\
opal_list_item_t *item; \
OMPI_FREE_LIST_WAIT(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_max, item, rc); \
frag = (mca_btl_portals_frag_t*) item; \
frag = (mca_btl_portals_frag_t*) item; \
}
#define OMPI_BTL_PORTALS_FRAG_RETURN_MAX(btl_macro, frag) \
#define OMPI_BTL_PORTALS_FRAG_RETURN_MAX(btl_macro, frag) \
{ \
OMPI_FREE_LIST_RETURN(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_max, \
(opal_list_item_t*)(frag)); \
}
#define OMPI_BTL_PORTALS_FRAG_ALLOC_USER(btl_macro, frag, rc) \
#define OMPI_BTL_PORTALS_FRAG_ALLOC_USER(btl_macro, frag, rc) \
{ \
opal_list_item_t *item; \
OMPI_FREE_LIST_WAIT(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_user, item, rc); \
frag = (mca_btl_portals_frag_t*) item; \
frag = (mca_btl_portals_frag_t*) item; \
}
#define OMPI_BTL_PORTALS_FRAG_RETURN_USER(btl_macro, frag) \
#define OMPI_BTL_PORTALS_FRAG_RETURN_USER(btl_macro, frag) \
{ \
OMPI_FREE_LIST_RETURN(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_user, \
(opal_list_item_t*)(frag)); \
(opal_list_item_t*)(frag)); \
}
#define OMPI_BTL_PORTALS_FRAG_ALLOC_RECV(btl_macro, frag, rc) \
{ \
opal_list_item_t *item; \
OMPI_FREE_LIST_WAIT(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_recv, item, rc); \
frag = (mca_btl_portals_frag_t*) item; \
}
#define OMPI_BTL_PORTALS_FRAG_RETURN_RECV(btl_macro, frag) \
{ \
OMPI_FREE_LIST_RETURN(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_recv, \
(opal_list_item_t*)(frag)); \
}
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

Просмотреть файл

@ -42,7 +42,6 @@ mca_btl_portals_put(struct mca_btl_base_module_t* btl_base,
frag->endpoint = btl_peer;
frag->hdr.tag = MCA_BTL_TAG_MAX;
frag->type = mca_btl_portals_frag_type_rdma;
/* setup the send */
assert(1 == frag->base.des_src_cnt);
@ -58,7 +57,6 @@ mca_btl_portals_put(struct mca_btl_base_module_t* btl_base,
if (ret != PTL_OK) {
opal_output(mca_btl_portals_component.portals_output,
"PtlPut failed with error %d", ret);
PtlMDUnlink(frag->md_h);
return OMPI_ERROR;
}
@ -83,7 +81,6 @@ mca_btl_portals_get(struct mca_btl_base_module_t* btl_base,
frag->endpoint = btl_peer;
frag->hdr.tag = MCA_BTL_TAG_MAX;
frag->type = mca_btl_portals_frag_type_rdma;
ret = PtlGet(frag->md_h,
*((mca_btl_base_endpoint_t*) btl_peer),
@ -94,7 +91,6 @@ mca_btl_portals_get(struct mca_btl_base_module_t* btl_base,
if (ret != PTL_OK) {
opal_output(mca_btl_portals_component.portals_output,
"PtlGet failed with error %d", ret);
PtlMDUnlink(frag->md_h);
return OMPI_ERROR;
}

Просмотреть файл

@ -96,12 +96,12 @@ mca_btl_portals_activate_block(mca_btl_portals_recv_block_t *block)
md.length = block->length;
/* try to throttle incoming sends so that we don't overrun the incoming
queue size */
md.threshold = mca_btl_portals_module.portals_eq_sizes[OMPI_BTL_PORTALS_EQ] /
md.threshold = mca_btl_portals_module.portals_eq_sizes[OMPI_BTL_PORTALS_EQ_RECV] /
(mca_btl_portals_module.portals_recv_mds_num * 2);
md.max_size = block->btl->super.btl_max_send_size;
md.options = PTL_MD_OP_PUT | PTL_MD_MAX_SIZE;
md.user_ptr = block;
md.eq_handle = block->btl->portals_eq_handles[OMPI_BTL_PORTALS_EQ];
md.eq_handle = block->btl->portals_eq_handles[OMPI_BTL_PORTALS_EQ_RECV];
block->pending = 0;
block->full = false;

Просмотреть файл

@ -35,95 +35,70 @@ mca_btl_portals_send(struct mca_btl_base_module_t* btl_base,
mca_btl_base_tag_t tag)
{
mca_btl_portals_frag_t *frag = (mca_btl_portals_frag_t*) descriptor;
int32_t num_sends;
int ret;
assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);
assert(frag->md_h == PTL_INVALID_HANDLE);
frag->endpoint = endpoint;
frag->hdr.tag = tag;
frag->type = mca_btl_portals_frag_type_send;
num_sends = OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_sends, 1);
/* make sure that we have enough space to send. This means that
there is enough space in the event queue for all the events
that may be deposited by outstanding sends */
if (num_sends >= mca_btl_portals_module.portals_max_outstanding_sends) {
OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"PtlPut (send) fragment %x", frag));
if (OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, 1) >
mca_btl_portals_module.portals_max_outstanding_ops) {
/* no space - queue and continue */
opal_output_verbose(50, mca_btl_portals_component.portals_output,
"no space for message 0x%x. Adding to back of queue",
frag);
"no space for message 0x%x. Adding to back of queue",
frag);
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
opal_list_append(&(mca_btl_portals_module.portals_queued_sends),
(opal_list_item_t*) frag);
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_sends, -1);
}
ret = OMPI_SUCCESS;
} else {
int ret;
ptl_handle_md_t md_h;
OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"PtlPut (send) fragment %x", frag));
/* setup the send */
if (1 == frag->base.des_src_cnt) {
mca_btl_portals_module.md_send.start = frag->segments[0].seg_addr.pval;
mca_btl_portals_module.md_send.length = frag->segments[0].seg_len;
mca_btl_portals_module.md_send.options = PTL_MD_EVENT_START_DISABLE;
OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"fragment info:\n"
"\tstart: 0x%x\n"
"\tlen: %d",
frag->segments[0].seg_addr.pval,
frag->segments[0].seg_len));
} else {
assert(2 == frag->base.des_src_cnt);
mca_btl_portals_module.md_send.start = frag->iov;
mca_btl_portals_module.md_send.length = 2;
mca_btl_portals_module.md_send.options =
PTL_MD_EVENT_START_DISABLE | PTL_MD_IOVEC;
OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"fragment info:\n"
"\tiov[0].iov_base: 0x%x\n"
"\tiov[0].iov_len: %d\n"
"\tiov[1].iov_base: 0x%x\n"
"\tiov[1].iov_len: %d",
frag->iov[0].iov_base,
frag->iov[0].iov_len,
frag->iov[1].iov_base,
frag->iov[1].iov_len));
}
if (frag->md_h == PTL_INVALID_HANDLE) {
/* setup the send - always describe entire fragment */
mca_btl_portals_module.md_send.start = frag->segments[0].seg_addr.pval;
mca_btl_portals_module.md_send.length =
0 == frag->size ? frag->segments[0].seg_len : frag->size;
mca_btl_portals_module.md_send.options =
PTL_MD_EVENT_START_DISABLE;
mca_btl_portals_module.md_send.user_ptr = frag; /* keep a pointer to ourselves */
/* make a free-floater */
ret = PtlMDBind(mca_btl_portals_module.portals_ni_h,
mca_btl_portals_module.md_send,
PTL_UNLINK,
&md_h);
&frag->md_h);
if (ret != PTL_OK) {
opal_output(mca_btl_portals_component.portals_output,
"PtlMDBind failed with error %d", ret);
return OMPI_ERROR;
}
}
ret = PtlPut(md_h,
PTL_ACK_REQ,
*((mca_btl_base_endpoint_t*) endpoint),
OMPI_BTL_PORTALS_SEND_TABLE_ID,
0, /* ac_index - not used */
0, /* match bits */
0, /* remote offset - not used */
frag->hdr.tag); /* hdr_data - tag */
if (ret != PTL_OK) {
opal_output(mca_btl_portals_component.portals_output,
"send: PtlPut failed with error %d", ret);
PtlMDUnlink(md_h);
return OMPI_ERROR;
}
OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"fragment info:\n"
"\tstart: 0x%x\n"
"\tlen: %d",
frag->segments[0].seg_addr.pval,
frag->segments[0].seg_len));
return OMPI_SUCCESS;
ret = PtlPutRegion(frag->md_h, /* memory descriptor */
0, /* fragment offset */
frag->segments[0].seg_len, /* fragment length */
PTL_ACK_REQ,
*((mca_btl_base_endpoint_t*) endpoint),
OMPI_BTL_PORTALS_SEND_TABLE_ID,
0, /* ac_index - not used */
0, /* match bits */
0, /* remote offset - not used */
frag->hdr.tag); /* hdr_data: tag */
if (ret != PTL_OK) {
opal_output(mca_btl_portals_component.portals_output,
"send: PtlPut failed with error %d", ret);
return OMPI_ERROR;
}
return ret;
return OMPI_SUCCESS;
}

Просмотреть файл

@ -24,8 +24,8 @@
#define MCA_BTL_PORTALS_PROGRESS_QUEUED_SENDS() \
if ((0 != opal_list_get_size(&(mca_btl_portals_module.portals_queued_sends))) && \
(mca_btl_portals_module.portals_outstanding_sends < \
mca_btl_portals_module.portals_max_outstanding_sends)) { \
(mca_btl_portals_module.portals_outstanding_ops < \
mca_btl_portals_module.portals_max_outstanding_ops)) { \
mca_btl_portals_frag_t *qfrag = (mca_btl_portals_frag_t*) \
opal_list_remove_first(&(mca_btl_portals_module.portals_queued_sends)); \
OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, \

Просмотреть файл

@ -17,80 +17,6 @@
# $HEADER$
#
# MCA_btl_portals_CONFIG_VAL(config_name, define_name,
#                            default_val, description)
# -----------------------------------------------------
# Declare a --with-portals-<config_name> configure option and turn its
# value into a preprocessor define.
#
#   config_name  suffix of the option (--with-portals-<config_name>)
#   define_name  shell variable / preprocessor symbol that receives the value
#   default_val  value used when the option is not given
#   description  help string, also used as the define's description
#
# The shell variable examined is with_portals_<config_name> with every
# dash replaced by an underscore.  An empty value selects default_val,
# "no" (i.e. --without-portals-<config_name>) aborts configure with an
# error, and any other value is used as given.
AC_DEFUN([MCA_btl_portals_CONFIG_VAL], [
AC_ARG_WITH([portals-$1], AC_HELP_STRING([--with-portals-$1],
[$4 (default: $3)]))
case "[$with_]m4_bpatsubst([portals-$1], -, _)" in
"")
$2=$3
;;
"no")
AC_MSG_ERROR([--without-portals-$1 is invalid argument])
;;
*)
$2="[$with_]m4_bpatsubst([portals-$1], -, _)"
;;
esac
AC_DEFINE_UNQUOTED([$2], [[$]$2], [$4])
])
# MCA_btl_portals_CONFIG_VALS()
# ------------------------------
# Register every user-tunable default of the portals btl.  Each entry
# goes through MCA_btl_portals_CONFIG_VAL, giving one
# --with-portals-<name> option and one preprocessor define per entry.
#
# Option names must be unique: the receive memory descriptor size is
# md-size and the receive memory descriptor count is md-num, so the two
# can be set independently.
AC_DEFUN([MCA_btl_portals_CONFIG_VALS], [
    # User configuration options
    MCA_btl_portals_CONFIG_VAL([debug-level],
        [OMPI_BTL_PORTALS_DEFAULT_DEBUG_LEVEL], [0],
        [debugging level for portals btl])
    MCA_btl_portals_CONFIG_VAL([eager-limit],
        [OMPI_BTL_PORTALS_DEFAULT_EAGER_LIMIT], [32768],
        [max size for eager sends])
    MCA_btl_portals_CONFIG_VAL([min-send-size],
        [OMPI_BTL_PORTALS_DEFAULT_MIN_SEND_SIZE], [32768],
        [min size for send fragments])
    MCA_btl_portals_CONFIG_VAL([max-send-size],
        [OMPI_BTL_PORTALS_DEFAULT_MAX_SEND_SIZE], [65536],
        [max size for send fragments])
    MCA_btl_portals_CONFIG_VAL([md-size],
        [OMPI_BTL_PORTALS_DEFAULT_RECV_MD_SIZE], [1048576],
        [Size of receive memory descriptors])
    MCA_btl_portals_CONFIG_VAL([md-num],
        [OMPI_BTL_PORTALS_DEFAULT_RECV_MD_NUM], [3],
        [Number of receive memory descriptors])
    MCA_btl_portals_CONFIG_VAL([min-rdma-size],
        [OMPI_BTL_PORTALS_DEFAULT_MIN_RDMA_SIZE], [65536],
        [min size for rdma fragments])
    MCA_btl_portals_CONFIG_VAL([max-rdma-size],
        [OMPI_BTL_PORTALS_DEFAULT_MAX_RDMA_SIZE], [2147483647],
        [max size for rdma fragments])
    MCA_btl_portals_CONFIG_VAL([max-sends-pending],
        [OMPI_BTL_PORTALS_MAX_SENDS_PENDING], [64],
        [max number of sends pending at any time])
    MCA_btl_portals_CONFIG_VAL([recv-queue-size],
        [OMPI_BTL_PORTALS_DEFAULT_RECV_QUEUE_SIZE], [8192],
        [size of event queue for receiving frags])
    MCA_btl_portals_CONFIG_VAL([free-list-init-num],
        [OMPI_BTL_PORTALS_DEFAULT_FREE_LIST_INIT_NUM], [8],
        [starting size of free lists])
    MCA_btl_portals_CONFIG_VAL([free-list-max-num],
        [OMPI_BTL_PORTALS_DEFAULT_FREE_LIST_MAX_NUM], [1024],
        [maximum size of free lists])
    MCA_btl_portals_CONFIG_VAL([free-list-inc-num],
        [OMPI_BTL_PORTALS_DEFAULT_FREE_LIST_INC_NUM], [32],
        [grow size for freelists])
])
# _MCA_btl_portals_CONFIG_PLATFORM()
# ----------------------------------
AC_DEFUN([MCA_btl_portals_CONFIG_PLATFORM], [
@ -197,7 +123,6 @@ AC_DEFUN([MCA_btl_portals_CONFIG],[
AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <${btl_portals_header_prefix}portals3.h>],
[int i; PtlInit(&i);])],
[AC_MSG_RESULT([yes])
MCA_btl_portals_CONFIG_VALS()
btl_portals_WRAPPER_EXTRA_LDFLAGS="$btl_portals_LDFLAGS"
btl_portals_WRAPPER_EXTRA_LIBS="$btl_portals_LIBS"
$1],