1
1
openmpi/ompi/mca/btl/portals/btl_portals.c
George Bosilca e361bcb64c Send optimizations.
1. The send path get shorter. The BTL is allowed to return > 0 to specify that the
   descriptor was pushed to the networks, and that the memory attached to it is 
   available again for the upper layer. The MCA_BTL_DES_SEND_ALWAYS_CALLBACK flag
   can be used by the PML to force the BTL to always trigger the callback.
   Unmodified BTL will continue to work as expected, as they will return OMPI_SUCCESS
   which force the PML to have exactly the same behavior as before. Some BTLs have
   been modified: self, sm, tcp, mx.
2. Add send immediate interface to BTL.
   The idea is to have a mechanism of allowing the BTL to take advantage of
   send optimizations such as the ability to deliver data "inline". Some
   network APIs such as Portals allow data to be sent using a "thin" event
   without packing data into a memory descriptor. This interface change
   allows the BTL to use such capabilities and allows for other optimizations
   in the future. All existing BTLs except for Portals and sm have this interface
   set to NULL.

This commit was SVN r18551.
2008-05-30 03:58:39 +00:00

571 строка
20 KiB
C

/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <sys/types.h>
#include <unistd.h>
#include <stdio.h>
#include <errno.h>
#include <inttypes.h>
#include "ompi/constants.h"
#include "orte/util/output.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/datatype/convertor.h"
#include "ompi/datatype/datatype.h"
#include "btl_portals.h"
#include "btl_portals_endpoint.h"
#include "btl_portals_recv.h"
#include "btl_portals_frag.h"
mca_btl_portals_module_t mca_btl_portals_module = {
{
&mca_btl_portals_component.super,
/* NOTE: All these default values are set in
component_open() */
0, /* max size of first frag */
0, /* min send size */
0, /* max send size */
0, /* btl_rdma_pipeline_send_length */
0, /* btl_rdma_pipeline_frag_size */
0, /* btl_min_rdma_pipeline_size */
0, /* exclusivity - higher than sm, lower than self */
0, /* latency */
0, /* bandwidth */
0, /* btl flags */
mca_btl_portals_add_procs,
mca_btl_portals_del_procs,
NULL,
mca_btl_portals_finalize,
mca_btl_portals_alloc,
mca_btl_portals_free,
mca_btl_portals_prepare_src,
mca_btl_portals_prepare_dst,
mca_btl_portals_send,
mca_btl_portals_sendi,
mca_btl_portals_put,
mca_btl_portals_get,
mca_btl_base_dump,
NULL, /* mpool */
NULL, /* register error */
NULL
},
};
int
mca_btl_portals_add_procs(struct mca_btl_base_module_t* btl_base,
size_t nprocs, struct ompi_proc_t **procs,
struct mca_btl_base_endpoint_t** peers,
ompi_bitmap_t* reachable)
{
int ret;
struct ompi_proc_t *curr_proc = NULL;
ptl_process_id_t *portals_procs = NULL;
size_t i;
unsigned long distance;
bool need_activate = false;
bool accel;
assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);
orte_output_verbose(50, mca_btl_portals_component.portals_output,
"Adding %d procs (%d)", (int) nprocs,
(int) mca_btl_portals_module.portals_num_procs);
/* if we havne't already, get our network handle */
if (mca_btl_portals_module.portals_ni_h == PTL_INVALID_HANDLE) {
ret = ompi_common_portals_ni_initialize(&mca_btl_portals_module.portals_ni_h, &accel);
if (OMPI_SUCCESS != ret) return ret;
}
portals_procs = malloc(nprocs * sizeof(ptl_process_id_t));
ret = ompi_common_portals_get_procs(nprocs, procs, portals_procs);
if (OMPI_SUCCESS != ret) return ret;
if (0 == mca_btl_portals_module.portals_num_procs) {
need_activate = true;
}
/* loop through all procs, setting our reachable flag */
for (i= 0; i < nprocs ; ++i) {
curr_proc = procs[i];
/* portals doesn't support heterogeneous yet... */
if (ompi_proc_local()->proc_arch != curr_proc->proc_arch) {
continue;
}
peers[i] = malloc(sizeof(mca_btl_base_endpoint_t));
if (NULL == peers[i]) return OMPI_ERROR;
*((mca_btl_base_endpoint_t*) peers[i]) = portals_procs[i];
/* accelerated doesn't support PtlNIDist() */
if (accel == false) {
/* make sure we can reach the process - this is supposed to be
a cheap-ish operation */
ret = PtlNIDist(mca_btl_portals_module.portals_ni_h,
portals_procs[i],
&distance);
if (ret != PTL_OK) {
orte_output_verbose(10, mca_btl_portals_component.portals_output,
"Could not find distance to process %d", (int) i);
continue;
}
}
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_num_procs, 1);
/* and here we can reach */
ompi_bitmap_set_bit(reachable, i);
}
if (NULL != portals_procs) free(portals_procs);
if (need_activate && mca_btl_portals_module.portals_num_procs > 0) {
/* create eqs */
int i;
orte_output_verbose(50, mca_btl_portals_component.portals_output,
"Enabling progress");
for (i = 0 ; i < OMPI_BTL_PORTALS_EQ_SIZE ; ++i) {
int ptl_ret = PtlEQAlloc(mca_btl_portals_module.portals_ni_h,
mca_btl_portals_module.portals_eq_sizes[i],
PTL_EQ_HANDLER_NONE,
&(mca_btl_portals_module.portals_eq_handles[i]));
if (PTL_OK != ptl_ret) {
orte_output(mca_btl_portals_component.portals_output,
"Error creating EQ %d: %d", i, ptl_ret);
/* BWB - better error code? */
return OMPI_ERROR;
}
}
ret = mca_btl_portals_recv_enable(&mca_btl_portals_module);
/* fill in send memory descriptor */
mca_btl_portals_module.md_send.start = NULL;
mca_btl_portals_module.md_send.length = 0;
mca_btl_portals_module.md_send.threshold = PTL_MD_THRESH_INF;
mca_btl_portals_module.md_send.max_size = 0;
mca_btl_portals_module.md_send.options = PTL_MD_EVENT_START_DISABLE;
mca_btl_portals_module.md_send.user_ptr = NULL;
mca_btl_portals_module.md_send.eq_handle =
mca_btl_portals_module.portals_eq_handles[OMPI_BTL_PORTALS_EQ_SEND];
} else {
ret = OMPI_SUCCESS;
}
return ret;
}
int
mca_btl_portals_del_procs(struct mca_btl_base_module_t *btl_base,
size_t nprocs,
struct ompi_proc_t **procs,
struct mca_btl_base_endpoint_t **peers)
{
size_t i = 0;
int ret = OMPI_SUCCESS;
assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);
orte_output_verbose(50, mca_btl_portals_component.portals_output,
"Removing %d procs (%d)", (int) nprocs,
(int) mca_btl_portals_module.portals_num_procs);
for (i = 0 ; i < nprocs ; ++i) {
free(peers[i]);
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_num_procs, -1);
}
if (0 == mca_btl_portals_module.portals_num_procs) {
int i;
orte_output_verbose(50, mca_btl_portals_component.portals_output,
"Disabling progress");
ret = mca_btl_portals_recv_disable(&mca_btl_portals_module);
/* destroy eqs */
for (i = 0 ; i < OMPI_BTL_PORTALS_EQ_SIZE ; ++i) {
int ptl_ret = PtlEQFree(mca_btl_portals_module.portals_eq_handles[i]);
if (PTL_OK != ptl_ret) {
orte_output(mca_btl_portals_component.portals_output,
"Error freeing EQ %d: %d", i, ptl_ret);
}
}
} else {
ret = OMPI_SUCCESS;
}
return ret;
}
mca_btl_base_descriptor_t*
mca_btl_portals_alloc(struct mca_btl_base_module_t* btl_base,
struct mca_btl_base_endpoint_t* endpoint,
uint8_t order,
size_t size,
uint32_t flags)
{
int rc;
mca_btl_portals_frag_t* frag;
assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);
if (size <= mca_btl_portals_module.super.btl_eager_limit) {
OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(&mca_btl_portals_module, frag, rc);
if (OMPI_SUCCESS != rc) return NULL;
frag->segments[0].seg_len = size;
} else {
OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(&mca_btl_portals_module, frag, rc);
if (OMPI_SUCCESS != rc) return NULL;
frag->segments[0].seg_len =
size <= mca_btl_portals_module.super.btl_max_send_size ?
size : mca_btl_portals_module.super.btl_max_send_size ;
}
frag->base.des_src_cnt = 1;
frag->base.des_flags = flags | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
frag->base.order = MCA_BTL_NO_ORDER;
return &frag->base;
}
int
mca_btl_portals_free(struct mca_btl_base_module_t* btl_base,
mca_btl_base_descriptor_t* des)
{
mca_btl_portals_frag_t* frag = (mca_btl_portals_frag_t*) des;
assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);
if (BTL_PORTALS_FRAG_TYPE_EAGER == frag->type) {
/* don't ever unlink eager frags */
OMPI_BTL_PORTALS_FRAG_RETURN_EAGER(&mca_btl_portals_module.super, frag);
} else if (BTL_PORTALS_FRAG_TYPE_MAX == frag->type) {
if (frag->md_h != PTL_INVALID_HANDLE) {
PtlMDUnlink(frag->md_h);
frag->md_h = PTL_INVALID_HANDLE;
}
OMPI_BTL_PORTALS_FRAG_RETURN_MAX(&mca_btl_portals_module.super, frag);
} else if (BTL_PORTALS_FRAG_TYPE_USER == frag->type) {
if (frag->md_h != PTL_INVALID_HANDLE) {
PtlMDUnlink(frag->md_h);
frag->md_h = PTL_INVALID_HANDLE;
}
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag);
} else {
return OMPI_ERR_BAD_PARAM;
}
return OMPI_SUCCESS;
}
mca_btl_base_descriptor_t*
mca_btl_portals_prepare_src(struct mca_btl_base_module_t* btl_base,
struct mca_btl_base_endpoint_t* peer,
mca_mpool_base_registration_t* registration,
struct ompi_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size,
uint32_t flags)
{
mca_btl_portals_frag_t* frag;
size_t max_data = *size;
struct iovec iov;
uint32_t iov_count = 1;
int ret;
assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);
if (0 != reserve || 0 != ompi_convertor_need_buffers(convertor)) {
frag = (mca_btl_portals_frag_t*)
mca_btl_portals_alloc(btl_base, peer, MCA_BTL_NO_ORDER, max_data + reserve, flags);
if (NULL == frag) {
return NULL;
}
if (max_data + reserve > frag->size) {
max_data = frag->size - reserve;
}
iov.iov_len = max_data;
iov.iov_base = (unsigned char*) frag->segments[0].seg_addr.pval + reserve;
ret = ompi_convertor_pack(convertor, &iov, &iov_count,
&max_data );
*size = max_data;
if ( ret < 0 ) {
return NULL;
}
frag->segments[0].seg_len = max_data + reserve;
frag->base.des_src_cnt = 1;
} else {
/* no need to pack - rdma operation out of user's buffer */
ptl_md_t md;
ptl_handle_me_t me_h;
/* reserve space in the event queue for rdma operations immediately */
while (OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, 1) >
mca_btl_portals_module.portals_max_outstanding_ops) {
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
mca_btl_portals_component_progress();
}
OMPI_BTL_PORTALS_FRAG_ALLOC_USER(&mca_btl_portals_module.super, frag, ret);
if(NULL == frag){
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
return NULL;
}
iov.iov_len = max_data;
iov.iov_base = NULL;
ompi_convertor_pack(convertor, &iov, &iov_count, &max_data );
frag->segments[0].seg_len = max_data;
frag->segments[0].seg_addr.pval = iov.iov_base;
frag->segments[0].seg_key.key64 =
OPAL_THREAD_ADD64(&(mca_btl_portals_module.portals_rdma_key), 1);
frag->base.des_src_cnt = 1;
/* either a put or get. figure out which later */
ORTE_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"rdma src posted for frag 0x%lx, callback 0x%lx, bits %"PRIu64", flags say %d" ,
(unsigned long) frag,
(unsigned long) frag->base.des_cbfunc,
frag->segments[0].seg_key.key64, flags));
/* create a match entry */
ret = PtlMEAttach(mca_btl_portals_module.portals_ni_h,
OMPI_BTL_PORTALS_RDMA_TABLE_ID,
*((mca_btl_base_endpoint_t*) peer),
frag->segments[0].seg_key.key64, /* match */
0, /* ignore */
PTL_UNLINK,
PTL_INS_AFTER,
&me_h);
if (PTL_OK != ret) {
orte_output(mca_btl_portals_component.portals_output,
"Error creating rdma src ME: %d", ret);
OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag);
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
return NULL;
}
/* setup the memory descriptor */
md.start = frag->segments[0].seg_addr.pval;
md.length = frag->segments[0].seg_len;
md.threshold = PTL_MD_THRESH_INF;
md.max_size = 0;
md.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_EVENT_START_DISABLE;
md.user_ptr = frag; /* keep a pointer to ourselves */
md.eq_handle = mca_btl_portals_module.portals_eq_handles[OMPI_BTL_PORTALS_EQ_SEND];
ret = PtlMDAttach(me_h,
md,
PTL_UNLINK,
&(frag->md_h));
if (PTL_OK != ret) {
orte_output(mca_btl_portals_component.portals_output,
"Error creating rdma src MD: %d", ret);
PtlMEUnlink(me_h);
OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag);
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
return NULL;
}
}
frag->base.des_src = frag->segments;
frag->base.des_dst = NULL;
frag->base.des_dst_cnt = 0;
frag->base.des_flags = flags;
frag->base.order = MCA_BTL_NO_ORDER;
return &frag->base;
}
mca_btl_base_descriptor_t*
mca_btl_portals_prepare_dst(struct mca_btl_base_module_t* btl_base,
struct mca_btl_base_endpoint_t* peer,
mca_mpool_base_registration_t* registration,
struct ompi_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size,
uint32_t flags)
{
mca_btl_portals_frag_t* frag;
ptl_md_t md;
ptl_handle_me_t me_h;
int ret;
assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);
/* reserve space in the event queue for rdma operations immediately */
while (OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, 1) >
mca_btl_portals_module.portals_max_outstanding_ops) {
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
mca_btl_portals_component_progress();
}
OMPI_BTL_PORTALS_FRAG_ALLOC_USER(&mca_btl_portals_module.super, frag, ret);
if(NULL == frag) {
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
return NULL;
}
frag->segments[0].seg_len = *size;
ompi_convertor_get_current_pointer( convertor, (void**)&(frag->segments[0].seg_addr.pval) );
frag->segments[0].seg_key.key64 =
OPAL_THREAD_ADD64(&(mca_btl_portals_module.portals_rdma_key), 1);
frag->base.des_src = NULL;
frag->base.des_src_cnt = 0;
frag->base.des_dst = frag->segments;
frag->base.des_dst_cnt = 1;
frag->base.des_flags = flags;
ORTE_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"rdma dest posted for frag 0x%lx, callback 0x%lx, bits %" PRIu64 " flags %d",
(unsigned long) frag,
(unsigned long) frag->base.des_cbfunc,
frag->segments[0].seg_key.key64,
flags));
/* create a match entry */
ret = PtlMEAttach(mca_btl_portals_module.portals_ni_h,
OMPI_BTL_PORTALS_RDMA_TABLE_ID,
*((mca_btl_base_endpoint_t*) peer),
frag->segments[0].seg_key.key64, /* match */
0, /* ignore */
PTL_UNLINK,
PTL_INS_AFTER,
&me_h);
if (PTL_OK != ret) {
orte_output(mca_btl_portals_component.portals_output,
"Error creating rdma dest ME: %d", ret);
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag);
return NULL;
}
/* setup the memory descriptor. */
md.start = frag->segments[0].seg_addr.pval;
md.length = frag->segments[0].seg_len;
md.threshold = PTL_MD_THRESH_INF;
md.max_size = 0;
md.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_EVENT_START_DISABLE;
md.user_ptr = frag; /* keep a pointer to ourselves */
md.eq_handle = mca_btl_portals_module.portals_eq_handles[OMPI_BTL_PORTALS_EQ_SEND];
ret = PtlMDAttach(me_h,
md,
PTL_UNLINK,
&(frag->md_h));
if (PTL_OK != ret) {
orte_output(mca_btl_portals_component.portals_output,
"Error creating rdma dest MD: %d", ret);
PtlMEUnlink(me_h);
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag);
return NULL;
}
frag->base.order = MCA_BTL_NO_ORDER;
return &frag->base;
}
int
mca_btl_portals_finalize(struct mca_btl_base_module_t *btl_base)
{
int ret;
assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);
ORTE_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"in mca_btl_portals_finalize"));
/* sanity check */
assert(mca_btl_portals_module.portals_outstanding_ops >= 0);
/* finalize all communication */
while (mca_btl_portals_module.portals_outstanding_ops > 0) {
ORTE_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"portals_outstanding_ops: %d",
mca_btl_portals_module.portals_outstanding_ops));
mca_btl_portals_component_progress();
}
if (mca_btl_portals_module.portals_num_procs != 0) {
int i;
ret = mca_btl_portals_recv_disable(&mca_btl_portals_module);
/* destroy eqs */
for (i = 0 ; i < OMPI_BTL_PORTALS_EQ_SIZE ; ++i) {
int ptl_ret = PtlEQFree(mca_btl_portals_module.portals_eq_handles[i]);
if (PTL_OK != ptl_ret) {
#if (OMPI_PORTALS_CRAYXT3 || OMPI_PORTALS_CRAYXT3_MODEX)
if (i != OMPI_BTL_PORTALS_EQ_SEND && PTL_EQ_IN_USE != ptl_ret) {
/* The PML isn't great about cleaning up after itself.
Ignore related errors. */
#endif
orte_output(mca_btl_portals_component.portals_output,
"Error freeing EQ %d: %d", i, ptl_ret);
#if (OMPI_PORTALS_CRAYXT3 || OMPI_PORTALS_CRAYXT3_MODEX)
}
#endif
}
}
}
OBJ_DESTRUCT(&mca_btl_portals_module.portals_recv_blocks);
OBJ_DESTRUCT(&mca_btl_portals_module.portals_recv_frag);
OBJ_DESTRUCT(&mca_btl_portals_module.portals_frag_eager);
OBJ_DESTRUCT(&mca_btl_portals_module.portals_frag_max);
OBJ_DESTRUCT(&mca_btl_portals_module.portals_frag_user);
ompi_common_portals_ni_finalize();
ompi_common_portals_finalize();
orte_output_verbose(20, mca_btl_portals_component.portals_output,
"successfully finalized module");
return OMPI_SUCCESS;
}