1
1
openmpi/ompi/mca/btl/portals/btl_portals.c
Galen Shipman 3401bd2b07 Add optional ordering to the BTL interface.
This is required to tighten up the BTL semantics. Ordering is not guaranteed,
but if the BTL returns an order tag in a descriptor (other than
MCA_BTL_NO_ORDER), then we may request another descriptor that will obey
ordering w.r.t. the other descriptor.


This will allow sane behavior for RDMA networks, where local completion of an
RDMA operation on the active side does not imply remote completion on the
passive side. If we send a FIN message after local completion and the FIN is
not ordered w.r.t. the RDMA operation, then badness may occur: the passive
side may now try to deregister the memory while the RDMA operation is still
pending on the passive side.

Note that this has no impact on networks that don't suffer from this
limitation as the ORDER tag can simply always be specified as
MCA_BTL_NO_ORDER.

This commit was SVN r14768.
2007-05-24 19:51:26 +00:00

562 строки
20 KiB
C

/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <sys/types.h>
#include <unistd.h>
#include <stdio.h>
#include <errno.h>
#include "ompi/constants.h"
#include "opal/util/output.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/datatype/convertor.h"
#include "ompi/datatype/datatype.h"
#include "btl_portals.h"
#include "btl_portals_endpoint.h"
#include "btl_portals_recv.h"
#include "btl_portals_frag.h"
/*
 * The statically allocated module instance used by every function in
 * this file (each entry point asserts that the btl_base it is handed is
 * this object).
 *
 * The inner brace positionally initializes the first member of
 * mca_btl_portals_module_t -- presumably the base-module part that the
 * rest of this file reaches as ".super"; confirm against btl_portals.h.
 * Numeric tunables are left at zero here and, per the note below, get
 * their real values in component_open().  The remaining slots bind the
 * mca_btl_portals_* entry points and mca_btl_base_dump.
 */
mca_btl_portals_module_t mca_btl_portals_module = {
{
&mca_btl_portals_component.super,
/* NOTE: All these default values are set in
component_open() */
0, /* max size of first frag */
0, /* min send size */
0, /* max send size */
0, /* btl_rdma_pipeline_offset */
0, /* btl_rdma_pipeline_frag_size */
0, /* btl_min_rdma_pipeline_size */
0, /* exclusivity - higher than sm, lower than self */
0, /* latency */
0, /* bandwidth */
0, /* btl flags */
/* entry points, in the positional order of the base module struct */
mca_btl_portals_add_procs,
mca_btl_portals_del_procs,
mca_btl_portals_register,
mca_btl_portals_finalize,
mca_btl_portals_alloc,
mca_btl_portals_free,
mca_btl_portals_prepare_src,
mca_btl_portals_prepare_dst,
mca_btl_portals_send,
mca_btl_portals_put,
mca_btl_portals_get,
mca_btl_base_dump,
NULL, /* mpool */
NULL, /* register error */
NULL /* NOTE(review): unlabeled trailing slot - confirm which base-module field this fills */
},
};
/*
 * Make a set of peer processes known to the Portals BTL.
 *
 * Initializes the network interface handle on first use, translates
 * procs[] into Portals process ids, allocates an endpoint for every peer
 * with the same architecture as the local process, and sets the peer's
 * bit in "reachable" when PtlNIDist() succeeds for it.  When the module
 * goes from zero to a non-zero number of procs, the event queues are
 * created, receives are enabled and the shared send memory descriptor
 * is filled in.
 *
 * btl_base   must be &mca_btl_portals_module (asserted)
 * nprocs     number of entries in procs[] / peers[]
 * procs      processes to add
 * peers      out: peers[i] receives a malloc'ed endpoint holding the
 *            Portals id of procs[i] (left untouched for skipped procs)
 * reachable  out: bit i is set for every proc that can be reached
 *
 * Returns OMPI_SUCCESS, the error from the common Portals helpers or
 * from mca_btl_portals_recv_enable(), or OMPI_ERROR on allocation or
 * event-queue creation failure.
 */
int
mca_btl_portals_add_procs(struct mca_btl_base_module_t* btl_base,
                          size_t nprocs, struct ompi_proc_t **procs,
                          struct mca_btl_base_endpoint_t** peers,
                          ompi_bitmap_t* reachable)
{
    int ret;
    struct ompi_proc_t *curr_proc = NULL;
    ptl_process_id_t *portals_procs = NULL;
    size_t i;
    unsigned long distance;
    bool need_activate = false;

    assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);

    /* nprocs is a size_t; cast so the argument matches the %d conversion */
    opal_output_verbose(50, mca_btl_portals_component.portals_output,
                        "Adding %d procs (%d)", (int) nprocs,
                        mca_btl_portals_module.portals_num_procs);

    /* if we haven't already, get our network handle */
    if (mca_btl_portals_module.portals_ni_h == PTL_INVALID_HANDLE) {
        ret = ompi_common_portals_ni_initialize(&mca_btl_portals_module.portals_ni_h);
        if (OMPI_SUCCESS != ret) return ret;
    }

    portals_procs = malloc(nprocs * sizeof *portals_procs);
    /* malloc(0) may legitimately return NULL, so only fail for nprocs > 0 */
    if (NULL == portals_procs && nprocs > 0) return OMPI_ERROR;

    ret = ompi_common_portals_get_procs(nprocs, procs, portals_procs);
    if (OMPI_SUCCESS != ret) {
        free(portals_procs);
        return ret;
    }

    /* remember whether this call is the one that brings the module to life */
    if (0 == mca_btl_portals_module.portals_num_procs) {
        need_activate = true;
    }

    /* loop through all procs, setting our reachable flag */
    for (i = 0 ; i < nprocs ; ++i) {
        curr_proc = procs[i];

        /* portals doesn't support heterogeneous yet... */
        if (ompi_proc_local()->proc_arch != curr_proc->proc_arch) {
            continue;
        }

        peers[i] = malloc(sizeof(mca_btl_base_endpoint_t));
        if (NULL == peers[i]) {
            /* the id table is only a local scratch array; don't leak it */
            free(portals_procs);
            return OMPI_ERROR;
        }
        *((mca_btl_base_endpoint_t*) peers[i]) = portals_procs[i];

        /* make sure we can reach the process - this is supposed to be
           a cheap-ish operation */
        ret = PtlNIDist(mca_btl_portals_module.portals_ni_h,
                        portals_procs[i],
                        &distance);
        if (ret != PTL_OK) {
            opal_output_verbose(10, mca_btl_portals_component.portals_output,
                                "Could not find distance to process %d", (int) i);
            continue;
        }

        OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_num_procs, 1);
        /* and here we can reach */
        ompi_bitmap_set_bit(reachable, i);
    }

    free(portals_procs);
    portals_procs = NULL;

    if (need_activate && mca_btl_portals_module.portals_num_procs > 0) {
        /* create eqs */
        int eq;

        opal_output_verbose(50, mca_btl_portals_component.portals_output,
                            "Enabling progress");

        for (eq = 0 ; eq < OMPI_BTL_PORTALS_EQ_SIZE ; ++eq) {
            int ptl_ret = PtlEQAlloc(mca_btl_portals_module.portals_ni_h,
                                     mca_btl_portals_module.portals_eq_sizes[eq],
                                     PTL_EQ_HANDLER_NONE,
                                     &(mca_btl_portals_module.portals_eq_handles[eq]));
            if (PTL_OK != ptl_ret) {
                opal_output(mca_btl_portals_component.portals_output,
                            "Error creating EQ %d: %d", eq, ptl_ret);
                /* BWB - better error code? */
                return OMPI_ERROR;
            }
        }

        ret = mca_btl_portals_recv_enable(&mca_btl_portals_module);

        /* fill in send memory descriptor */
        mca_btl_portals_module.md_send.start = NULL;
        mca_btl_portals_module.md_send.length = 0;
        mca_btl_portals_module.md_send.threshold = PTL_MD_THRESH_INF;
        mca_btl_portals_module.md_send.max_size = 0;
        mca_btl_portals_module.md_send.options = PTL_MD_EVENT_START_DISABLE;
        mca_btl_portals_module.md_send.user_ptr = NULL;
        mca_btl_portals_module.md_send.eq_handle =
            mca_btl_portals_module.portals_eq_handles[OMPI_BTL_PORTALS_EQ_SEND];
    } else {
        ret = OMPI_SUCCESS;
    }

    return ret;
}
/*
 * Remove a set of peer processes from the Portals BTL.
 *
 * Frees every endpoint in peers[] and decrements the module's proc
 * counter once per entry.  If the counter reaches zero, receives are
 * disabled and all event queues are freed (failures to free a queue
 * are logged but do not change the return value).
 *
 * btl_base  must be &mca_btl_portals_module (asserted)
 * nprocs    number of entries in procs[] / peers[]
 * procs     processes being removed (not used here)
 * peers     endpoints to release
 *
 * Returns the result of mca_btl_portals_recv_disable() when the last
 * proc was removed, OMPI_SUCCESS otherwise.
 */
int
mca_btl_portals_del_procs(struct mca_btl_base_module_t *btl_base,
                          size_t nprocs,
                          struct ompi_proc_t **procs,
                          struct mca_btl_base_endpoint_t **peers)
{
    size_t i;
    int ret = OMPI_SUCCESS;

    assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);

    /* nprocs is a size_t; cast so the argument matches the %d conversion */
    opal_output_verbose(50, mca_btl_portals_component.portals_output,
                        "Removing %d procs (%d)", (int) nprocs,
                        mca_btl_portals_module.portals_num_procs);

    for (i = 0 ; i < nprocs ; ++i) {
        free(peers[i]);
        OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_num_procs, -1);
    }

    if (0 == mca_btl_portals_module.portals_num_procs) {
        int eq;

        opal_output_verbose(50, mca_btl_portals_component.portals_output,
                            "Disabling progress");

        ret = mca_btl_portals_recv_disable(&mca_btl_portals_module);

        /* destroy eqs */
        for (eq = 0 ; eq < OMPI_BTL_PORTALS_EQ_SIZE ; ++eq) {
            int ptl_ret = PtlEQFree(mca_btl_portals_module.portals_eq_handles[eq]);
            if (PTL_OK != ptl_ret) {
                opal_output(mca_btl_portals_component.portals_output,
                            "Error freeing EQ %d: %d", eq, ptl_ret);
            }
        }
    }

    return ret;
}
/*
 * Install the receive callback for a tag.
 *
 * Stores cbfunc and cbdata in the module's per-tag registration table,
 * overwriting whatever was recorded for that tag before.
 *
 * btl_base  must be &mca_btl_portals_module (asserted)
 * tag       index into the registration table
 * cbfunc    callback to record for the tag
 * cbdata    opaque pointer recorded next to the callback
 *
 * Always returns OMPI_SUCCESS.
 */
int
mca_btl_portals_register(struct mca_btl_base_module_t* btl_base,
                         mca_btl_base_tag_t tag,
                         mca_btl_base_module_recv_cb_fn_t cbfunc,
                         void* cbdata)
{
    mca_btl_portals_module_t *module = &mca_btl_portals_module;

    assert(module == (mca_btl_portals_module_t*) btl_base);

    module->portals_reg[tag].cbfunc = cbfunc;
    module->portals_reg[tag].cbdata = cbdata;

    return OMPI_SUCCESS;
}
/*
 * Allocate a send descriptor with room for "size" bytes.
 *
 * Requests up to the eager limit are served from the eager fragment
 * pool, larger ones from the max-send pool.  The length of the first
 * segment is the requested size, capped at the limit of the pool the
 * fragment came from.
 *
 * btl_base  must be &mca_btl_portals_module (asserted)
 * order     not consulted; the descriptor is always MCA_BTL_NO_ORDER
 * size      number of bytes the caller wants to send
 *
 * Returns the descriptor, or NULL if no fragment could be allocated.
 */
mca_btl_base_descriptor_t*
mca_btl_portals_alloc(struct mca_btl_base_module_t* btl_base,
                      uint8_t order,
                      size_t size)
{
    mca_btl_portals_frag_t* frag;
    size_t cap;
    int status;

    assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);

    if (size > mca_btl_portals_module.super.btl_eager_limit) {
        /* too big for an eager fragment: take one from the max-send pool */
        OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(&mca_btl_portals_module, frag, status);
        if (OMPI_SUCCESS != status) {
            return NULL;
        }
        cap = mca_btl_portals_module.super.btl_max_send_size;
    } else {
        OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(&mca_btl_portals_module, frag, status);
        if (OMPI_SUCCESS != status) {
            return NULL;
        }
        cap = mca_btl_portals_module.super.btl_eager_limit;
    }

    /* never advertise more than the pool's limit */
    if (size <= cap) {
        frag->segments[0].seg_len = size;
    } else {
        frag->segments[0].seg_len = cap;
    }

    frag->base.order = MCA_BTL_NO_ORDER;
    frag->base.des_flags = 0;
    frag->base.des_src_cnt = 1;

    return &frag->base;
}
/*
 * Give a descriptor back to the pool it was allocated from.
 *
 * Eager fragments are returned as they are.  Max-send and user
 * fragments first have their memory descriptor unlinked if one is
 * still attached; user fragments additionally release the
 * outstanding-operation slot they were holding.
 *
 * btl_base  must be &mca_btl_portals_module (asserted)
 * des       descriptor to release; must be a Portals fragment
 *
 * Returns OMPI_SUCCESS, or OMPI_ERR_BAD_PARAM for an unknown fragment
 * type (nothing is released in that case).
 */
int
mca_btl_portals_free(struct mca_btl_base_module_t* btl_base,
                     mca_btl_base_descriptor_t* des)
{
    mca_btl_portals_frag_t* frag = (mca_btl_portals_frag_t*) des;

    assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);

    if (BTL_PORTALS_FRAG_TYPE_EAGER == frag->type) {
        /* don't ever unlink eager frags */
        OMPI_BTL_PORTALS_FRAG_RETURN_EAGER(&mca_btl_portals_module.super, frag);
        return OMPI_SUCCESS;
    }

    if (BTL_PORTALS_FRAG_TYPE_MAX != frag->type &&
        BTL_PORTALS_FRAG_TYPE_USER != frag->type) {
        return OMPI_ERR_BAD_PARAM;
    }

    /* max-send and user fragments share the unlink step */
    if (PTL_INVALID_HANDLE != frag->md_h) {
        PtlMDUnlink(frag->md_h);
        frag->md_h = PTL_INVALID_HANDLE;
    }

    if (BTL_PORTALS_FRAG_TYPE_MAX == frag->type) {
        OMPI_BTL_PORTALS_FRAG_RETURN_MAX(&mca_btl_portals_module.super, frag);
    } else {
        /* a user fragment was holding an outstanding-operation slot */
        OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
        OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag);
    }

    return OMPI_SUCCESS;
}
/*
 * Build a source descriptor for the data described by "convertor".
 *
 * Two paths:
 *  - If header space is requested (reserve != 0) or the convertor says
 *    the data needs buffering, a send fragment is allocated and the data
 *    is packed into it after the reserved bytes.
 *  - Otherwise the user's buffer is exposed directly: an
 *    outstanding-operation slot is claimed (progressing the component
 *    until one is free), a user fragment is set up to point at the
 *    buffer, and a match entry plus memory descriptor are attached on
 *    the RDMA portal table under a freshly generated key.
 *
 * btl_base      must be &mca_btl_portals_module (asserted)
 * peer          endpoint whose Portals id the match entry is limited to
 * registration  not used here
 * convertor     describes the data to send
 * order         not consulted; the descriptor is always MCA_BTL_NO_ORDER
 * reserve       bytes to leave free at the start of the fragment
 * size          in: bytes wanted; out (packed path only): bytes packed
 *
 * Returns the descriptor, or NULL on allocation, pack or Portals failure.
 */
mca_btl_base_descriptor_t*
mca_btl_portals_prepare_src(struct mca_btl_base_module_t* btl_base,
                            struct mca_btl_base_endpoint_t* peer,
                            mca_mpool_base_registration_t* registration,
                            struct ompi_convertor_t* convertor,
                            uint8_t order,
                            size_t reserve,
                            size_t* size)
{
    mca_btl_portals_frag_t* frag;
    size_t max_data = *size;
    struct iovec iov;
    uint32_t iov_count = 1;
    int ret;

    assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);

    if (0 != reserve || 0 != ompi_convertor_need_buffers(convertor)) {
        /* alloc() takes (btl, order, size) and always hands back an
           unordered descriptor, so ask for MCA_BTL_NO_ORDER explicitly */
        frag = (mca_btl_portals_frag_t*)
            mca_btl_portals_alloc(btl_base, MCA_BTL_NO_ORDER, max_data + reserve);
        if (NULL == frag) {
            return NULL;
        }

        /* shrink the payload if the fragment is smaller than requested */
        if (max_data + reserve > frag->size) {
            max_data = frag->size - reserve;
        }

        iov.iov_len = max_data;
        iov.iov_base = (unsigned char*) frag->segments[0].seg_addr.pval + reserve;
        ret = ompi_convertor_pack(convertor, &iov, &iov_count,
                                  &max_data );
        *size = max_data;
        if ( ret < 0 ) {
            /* hand the fragment back to its pool instead of leaking it */
            mca_btl_portals_free(btl_base, &frag->base);
            return NULL;
        }

        frag->segments[0].seg_len = max_data + reserve;
        frag->base.des_src_cnt = 1;
    } else {
        /* no need to pack - rdma operation out of user's buffer */
        ptl_md_t md;
        ptl_handle_me_t me_h;

        /* reserve space in the event queue for rdma operations immediately */
        while (OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, 1) >
               mca_btl_portals_module.portals_max_outstanding_ops) {
            OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
            mca_btl_portals_component_progress();
        }

        OMPI_BTL_PORTALS_FRAG_ALLOC_USER(&mca_btl_portals_module.super, frag, ret);
        if(NULL == frag){
            OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
            return NULL;
        }

        /* a NULL iov_base is passed in; the segment address is taken from
           whatever the convertor leaves in iov_base afterwards */
        iov.iov_len = max_data;
        iov.iov_base = NULL;
        ompi_convertor_pack(convertor, &iov, &iov_count, &max_data );

        frag->segments[0].seg_len = max_data;
        frag->segments[0].seg_addr.pval = iov.iov_base;
        frag->segments[0].seg_key.key64 =
            OPAL_THREAD_ADD64(&(mca_btl_portals_module.portals_rdma_key), 1);
        frag->base.des_src_cnt = 1;

        /* either a put or get. figure out which later */
        OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
                             "rdma src posted for frag 0x%x, callback 0x%x, bits %lld",
                             frag, frag->base.des_cbfunc, frag->segments[0].seg_key.key64));

        /* create a match entry */
        ret = PtlMEAttach(mca_btl_portals_module.portals_ni_h,
                          OMPI_BTL_PORTALS_RDMA_TABLE_ID,
                          *((mca_btl_base_endpoint_t*) peer),
                          frag->segments[0].seg_key.key64, /* match */
                          0, /* ignore */
                          PTL_UNLINK,
                          PTL_INS_AFTER,
                          &me_h);
        if (PTL_OK != ret) {
            opal_output(mca_btl_portals_component.portals_output,
                        "Error creating rdma src ME: %d", ret);
            OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag);
            OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
            return NULL;
        }

        /* setup the memory descriptor */
        md.start = frag->segments[0].seg_addr.pval;
        md.length = frag->segments[0].seg_len;
        md.threshold = PTL_MD_THRESH_INF;
        md.max_size = 0;
        md.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_EVENT_START_DISABLE;
        md.user_ptr = frag; /* keep a pointer to ourselves */
        md.eq_handle = mca_btl_portals_module.portals_eq_handles[OMPI_BTL_PORTALS_EQ_SEND];

        ret = PtlMDAttach(me_h,
                          md,
                          PTL_UNLINK,
                          &(frag->md_h));
        if (PTL_OK != ret) {
            opal_output(mca_btl_portals_component.portals_output,
                        "Error creating rdma src MD: %d", ret);
            /* the match entry has no descriptor yet, so drop it by hand */
            PtlMEUnlink(me_h);
            OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag);
            OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
            return NULL;
        }
    }

    frag->base.des_src = frag->segments;
    frag->base.des_dst = NULL;
    frag->base.des_dst_cnt = 0;
    frag->base.des_flags = 0;
    frag->base.order = MCA_BTL_NO_ORDER;

    return &frag->base;
}
/*
 * Build a destination descriptor that exposes the convertor's current
 * buffer position for an RDMA operation.
 *
 * Claims an outstanding-operation slot (progressing the component until
 * one is free), takes a user fragment, points its single segment at the
 * convertor's current pointer with length *size and a freshly generated
 * key, then attaches a match entry and a memory descriptor for it on
 * the RDMA portal table.
 *
 * btl_base      must be &mca_btl_portals_module (asserted)
 * peer          endpoint whose Portals id the match entry is limited to
 * registration  not used here
 * convertor     supplies the address of the destination buffer
 * order         not consulted; the descriptor is always MCA_BTL_NO_ORDER
 * reserve       not used here
 * size          length in bytes of the exposed region (read only)
 *
 * Returns the descriptor, or NULL if the fragment, match entry or
 * memory descriptor could not be set up; the claimed slot is released
 * on every failure path.
 */
mca_btl_base_descriptor_t*
mca_btl_portals_prepare_dst(struct mca_btl_base_module_t* btl_base,
                            struct mca_btl_base_endpoint_t* peer,
                            mca_mpool_base_registration_t* registration,
                            struct ompi_convertor_t* convertor,
                            uint8_t order,
                            size_t reserve,
                            size_t* size)
{
    mca_btl_portals_frag_t* frag;
    ptl_handle_me_t match_h;
    ptl_md_t mem_desc;
    int rc;

    assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);

    /* reserve space in the event queue for rdma operations immediately */
    for (;;) {
        if (OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, 1) <=
            mca_btl_portals_module.portals_max_outstanding_ops) {
            break;
        }
        /* over the limit: give the slot back and let progress drain events */
        OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
        mca_btl_portals_component_progress();
    }

    OMPI_BTL_PORTALS_FRAG_ALLOC_USER(&mca_btl_portals_module.super, frag, rc);
    if (NULL == frag) {
        OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
        return NULL;
    }

    /* describe the destination region */
    frag->segments[0].seg_len = *size;
    ompi_convertor_get_current_pointer( convertor, (void**)&(frag->segments[0].seg_addr.pval) );
    frag->segments[0].seg_key.key64 =
        OPAL_THREAD_ADD64(&(mca_btl_portals_module.portals_rdma_key), 1);

    /* destination only: one dst segment, no src segments */
    frag->base.des_dst = frag->segments;
    frag->base.des_dst_cnt = 1;
    frag->base.des_src = NULL;
    frag->base.des_src_cnt = 0;
    frag->base.des_flags = 0;

    OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
                         "rdma dest posted for frag 0x%x, callback 0x%x, bits %lld",
                         frag, frag->base.des_cbfunc, frag->segments[0].seg_key.key64));

    /* create a match entry */
    rc = PtlMEAttach(mca_btl_portals_module.portals_ni_h,
                     OMPI_BTL_PORTALS_RDMA_TABLE_ID,
                     *((mca_btl_base_endpoint_t*) peer),
                     frag->segments[0].seg_key.key64, /* match */
                     0, /* ignore */
                     PTL_UNLINK,
                     PTL_INS_AFTER,
                     &match_h);
    if (PTL_OK != rc) {
        opal_output(mca_btl_portals_component.portals_output,
                    "Error creating rdma dest ME: %d", rc);
        goto release_frag;
    }

    /* setup the memory descriptor. */
    mem_desc.start = frag->segments[0].seg_addr.pval;
    mem_desc.length = frag->segments[0].seg_len;
    mem_desc.threshold = PTL_MD_THRESH_INF;
    mem_desc.max_size = 0;
    mem_desc.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_EVENT_START_DISABLE;
    mem_desc.user_ptr = frag; /* keep a pointer to ourselves */
    mem_desc.eq_handle = mca_btl_portals_module.portals_eq_handles[OMPI_BTL_PORTALS_EQ_SEND];

    rc = PtlMDAttach(match_h,
                     mem_desc,
                     PTL_UNLINK,
                     &(frag->md_h));
    if (PTL_OK != rc) {
        opal_output(mca_btl_portals_component.portals_output,
                    "Error creating rdma dest MD: %d", rc);
        goto unlink_me;
    }

    frag->base.order = MCA_BTL_NO_ORDER;
    return &frag->base;

unlink_me:
    /* the match entry never got a descriptor, so remove it by hand */
    PtlMEUnlink(match_h);
release_frag:
    OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
    OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag);
    return NULL;
}
/*
 * Shut the Portals BTL module down.
 *
 * Progresses the component until no operations are outstanding.  If
 * any procs are still registered, receives are disabled and the event
 * queues are freed.  The fragment and receive-block containers are then
 * destructed and the common Portals layer is finalized.
 *
 * btl_base  must be &mca_btl_portals_module (asserted)
 *
 * Always returns OMPI_SUCCESS; the result of disabling receives is
 * ignored and failures to free an event queue are only logged.
 */
int
mca_btl_portals_finalize(struct mca_btl_base_module_t *btl_base)
{
    assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);

    /* finalize all communication */
    while (0 < mca_btl_portals_module.portals_outstanding_ops) {
        mca_btl_portals_component_progress();
    }

    if (0 != mca_btl_portals_module.portals_num_procs) {
        int eq;

        /* the status of disabling receives is deliberately not checked */
        (void) mca_btl_portals_recv_disable(&mca_btl_portals_module);

        /* destroy eqs */
        for (eq = 0 ; eq < OMPI_BTL_PORTALS_EQ_SIZE ; ++eq) {
            int ptl_ret = PtlEQFree(mca_btl_portals_module.portals_eq_handles[eq]);
            bool report = (PTL_OK != ptl_ret);

#if OMPI_PORTALS_CRAYXT3
            /* The PML isn't great about cleaning up after itself.
               Ignore related errors. */
            if (OMPI_BTL_PORTALS_EQ_SEND == eq || PTL_EQ_IN_USE == ptl_ret) {
                report = false;
            }
#endif
            if (report) {
                opal_output(mca_btl_portals_component.portals_output,
                            "Error freeing EQ %d: %d", eq, ptl_ret);
            }
        }
    }

    OBJ_DESTRUCT(&mca_btl_portals_module.portals_recv_blocks);
    OBJ_DESTRUCT(&mca_btl_portals_module.portals_recv_frag);
    OBJ_DESTRUCT(&mca_btl_portals_module.portals_frag_eager);
    OBJ_DESTRUCT(&mca_btl_portals_module.portals_frag_max);
    OBJ_DESTRUCT(&mca_btl_portals_module.portals_frag_user);

    ompi_common_portals_ni_finalize();
    ompi_common_portals_finalize();

    opal_output_verbose(20, mca_btl_portals_component.portals_output,
                        "successfully finalized module");

    return OMPI_SUCCESS;
}