openmpi/ompi/mca/btl/vader/btl_vader.c
Last commit 28681deffa by Josh Hursey: Backout the ORCA commit. :(
There is a linking issue on Mac OS X that needs to be addressed before this
is able to come back into the trunk.

This commit was SVN r26676. (2012-06-27)


/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2011 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2009 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006-2007 Voltaire.  All rights reserved.
 * Copyright (c) 2009      Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2010-2012 Los Alamos National Security, LLC.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
#include "ompi_config.h"
#include "btl_vader.h"
#include "btl_vader_endpoint.h"
#include "btl_vader_fifo.h"
#include "btl_vader_fbox.h"

static int vader_del_procs (struct mca_btl_base_module_t *btl,
                            size_t nprocs, struct ompi_proc_t **procs,
                            struct mca_btl_base_endpoint_t **peers);

static int vader_register_error_cb (struct mca_btl_base_module_t* btl,
                                    mca_btl_base_module_error_cb_fn_t cbfunc);

static int vader_finalize (struct mca_btl_base_module_t* btl);

static int vader_free (struct mca_btl_base_module_t* btl, mca_btl_base_descriptor_t* des);

static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_module_t *btl,
                                                            struct mca_btl_base_endpoint_t *endpoint,
                                                            mca_mpool_base_registration_t *registration,
                                                            struct opal_convertor_t *convertor,
                                                            uint8_t order,
                                                            size_t reserve,
                                                            size_t *size,
                                                            uint32_t flags);

static struct mca_btl_base_descriptor_t *vader_prepare_dst (struct mca_btl_base_module_t *btl,
                                                            struct mca_btl_base_endpoint_t *endpoint,
                                                            struct mca_mpool_base_registration_t *registration,
                                                            struct opal_convertor_t *convertor,
                                                            uint8_t order,
                                                            size_t reserve,
                                                            size_t *size,
                                                            uint32_t flags);

static int vader_add_procs (struct mca_btl_base_module_t* btl,
                            size_t nprocs, struct ompi_proc_t **procs,
                            struct mca_btl_base_endpoint_t** peers,
                            struct opal_bitmap_t* reachability);

static int vader_ft_event (int state);

mca_btl_vader_t mca_btl_vader = {
    {
        &mca_btl_vader_component.super,
        0, /* btl_eager_limit */
        0, /* btl_rndv_eager_limit */
        0, /* btl_max_send_size */
        0, /* btl_rdma_pipeline_send_length */
        0, /* btl_rdma_pipeline_frag_size */
        0, /* btl_min_rdma_pipeline_size */
        0, /* btl_exclusivity */
        0, /* btl_latency */
        0, /* btl_bandwidth */
        0, /* btl_flags */
        0, /* btl segment size */
        vader_add_procs,
        vader_del_procs,
        NULL, /* btl_register */
        vader_finalize,
        mca_btl_vader_alloc,
        vader_free,
        vader_prepare_src,
        vader_prepare_dst,
        mca_btl_vader_send,
        mca_btl_vader_sendi,
        mca_btl_vader_put,
        mca_btl_vader_get,
        mca_btl_base_dump,
        NULL, /* btl_mpool */
        vader_register_error_cb, /* register error */
        vader_ft_event
    }
};
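
/* The limits and flags above are left zero here; the component fills them
 * in (from its MCA parameters) during initialization (see
 * btl_vader_component.c). */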

static inline int vader_init_mpool (mca_btl_vader_t *vader_btl, int n)
{
    mca_btl_vader_component_t *component = &mca_btl_vader_component;
    mca_mpool_base_resources_t res;

    res.mem_node = -1;

    /* determine how much memory to create */
    /*
     * This heuristic formula mostly says that we request memory for:
     * - a vader fifo
     * - eager fragments (2 * n of them, allocated in vader_free_list_inc chunks)
     *
     * On top of all that, we sprinkle in some number of "opal_cache_line_size"
     * additions to account for some padding and edge effects that may lie
     * in the allocator.
     */
    res.size = sizeof (vader_fifo_t) + 4 * opal_cache_line_size +
        (2 * n + component->vader_free_list_inc) * (component->eager_limit + 2 * opal_cache_line_size);
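
    /* Worked example (illustrative numbers only): with n = 16 local ranks,
     * 64-byte cache lines, a 4 KiB eager limit, and vader_free_list_inc = 32
     * (the actual defaults live in btl_vader_component.c), this requests
     *   sizeof (vader_fifo_t) + 4 * 64 + (2 * 16 + 32) * (4096 + 2 * 64)
     *   = sizeof (vader_fifo_t) + 256 + 64 * 4224  (about 264 KiB)
     * per rank, before the multiply by n below. */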

    /* before we multiply by n, make sure the result won't overflow */
    if ( ((double) res.size) * n > LONG_MAX - 4096 )
        return OMPI_ERR_OUT_OF_RESOURCE;

    /* Stick that little pad in, particularly since we'll eventually
     * need a little extra space.  E.g., in mca_mpool_vader_init() in
     * mpool_vader_component.c when sizeof(mca_common_sm_module_t) is
     * added.
     */
    res.size *= n;

    /* now, create it */
    component->vader_mpool =
        mca_mpool_base_module_create(component->vader_mpool_name,
                                     vader_btl, &res);

    /* Sanity check to ensure that we found it */
    if (NULL == component->vader_mpool) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    component->vader_mpool_base =
        component->vader_mpool->mpool_base (component->vader_mpool);

    return OMPI_SUCCESS;
}

static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
{
    mca_btl_vader_component_t *component = &mca_btl_vader_component;
    size_t size;
    char *vader_ctl_file;
    vader_fifo_t *my_fifos;
    ompi_proc_t **procs;
    size_t num_procs;
    int i, rc;

    rc = vader_init_mpool (vader_btl, n);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    /* create a list of peers */
    component->vader_peers = (struct mca_btl_base_endpoint_t **)
        calloc(n, sizeof(struct mca_btl_base_endpoint_t *));
    if (NULL == component->vader_peers)
        return OMPI_ERR_OUT_OF_RESOURCE;

    /* Allocate Shared Memory BTL process coordination
     * data structure.  This will reside in shared memory */

    /* set file name */
    if (asprintf(&vader_ctl_file, "%s"OPAL_PATH_SEP"vader_btl_module.%s",
                 orte_process_info.job_session_dir,
                 orte_process_info.nodename) < 0)
        return OMPI_ERR_OUT_OF_RESOURCE;

    /* Pass in a data segment alignment of 0 to get no data
       segment (only the shared control structure) */
    size = sizeof (mca_common_sm_seg_header_t) +
        n * (sizeof (vader_fifo_t *) + sizeof (char *)
             + sizeof (xpmem_segid_t)) + opal_cache_line_size;

    procs = ompi_proc_world(&num_procs);
    if (!(mca_btl_vader_component.vader_seg =
          mca_common_sm_init(procs, num_procs, size, vader_ctl_file,
                             sizeof (mca_common_sm_seg_header_t),
                             opal_cache_line_size))) {
        opal_output(0, "vader_add_procs: unable to create shared memory "
                    "BTL coordinating structure :: size %lu \n",
                    (unsigned long) size);
        free(procs);
        free(vader_ctl_file);
        return OMPI_ERROR;
    }

    free(procs);
    free(vader_ctl_file);
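
    /* Layout of the shared control segment (per-rank slots, carved out of
     * module_data_addr below in this order):
     *   n x vader_fifo_t *   -- shm_fifo: pointer to each rank's fifo
     *   n x char *           -- shm_bases: each rank's mpool base address
     *   n x xpmem_segid_t    -- shm_seg_ids: each rank's xpmem segment id
     * The extra cache line in the size above absorbs alignment slack. */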
    component->shm_fifo = (volatile vader_fifo_t **) component->vader_seg->module_data_addr;
    component->shm_bases = (char **)(component->shm_fifo + n);
    component->shm_seg_ids = (xpmem_segid_t *)(component->shm_bases + n);

    /* set the base of the shared memory segment */
    component->shm_bases[component->my_smp_rank] = (char *)component->vader_mpool_base;
    component->shm_seg_ids[component->my_smp_rank] = component->my_seg_id;

    /* initialize the fifo and fast boxes "owned" by this process */
    /* posix_memalign returns non-zero on failure and leaves the pointer
     * unspecified, so check its return code rather than the pointer */
    if (0 != posix_memalign ((void **)&my_fifos, getpagesize (),
                             (n + 1) * getpagesize ()))
        return OMPI_ERR_OUT_OF_RESOURCE;

    /* cache the pointer to the 2d fifo array.  These addresses
     * are valid in the current process space */
    component->fifo = (vader_fifo_t **) calloc (n, sizeof(vader_fifo_t *));
    if (NULL == component->fifo)
        return OMPI_ERR_OUT_OF_RESOURCE;

    component->shm_fifo[component->my_smp_rank] =
        component->fifo[component->my_smp_rank] = my_fifos;

    component->apids = (xpmem_apid_t *) calloc (n, sizeof (xpmem_apid_t));
    if (NULL == component->apids)
        return OMPI_ERR_OUT_OF_RESOURCE;

    component->xpmem_rcaches =
        (struct mca_rcache_base_module_t **) calloc (n, sizeof (struct mca_rcache_base_module_t *));
    if (NULL == component->xpmem_rcaches)
        return OMPI_ERR_OUT_OF_RESOURCE;

    component->vader_fboxes_in = (unsigned char **) calloc (n, sizeof (char *));
    if (NULL == component->vader_fboxes_in)
        return OMPI_ERR_OUT_OF_RESOURCE;

    component->vader_fboxes_out = (unsigned char **) calloc (n, sizeof (char *));
    if (NULL == component->vader_fboxes_out)
        return OMPI_ERR_OUT_OF_RESOURCE;

    component->vader_next_fbox_in = (unsigned char *) calloc (64, 1);
    if (NULL == component->vader_next_fbox_in)
        return OMPI_ERR_OUT_OF_RESOURCE;

    component->vader_next_fbox_out = (unsigned char *) calloc (64, 1);
    if (NULL == component->vader_next_fbox_out)
        return OMPI_ERR_OUT_OF_RESOURCE;

    /* initialize fragment descriptor free lists */
    /* initialize free list for send fragments */
    i = ompi_free_list_init_new(&component->vader_frags_eager,
                                sizeof (mca_btl_vader_frag_t),
                                opal_cache_line_size, OBJ_CLASS(mca_btl_vader_frag_t),
                                sizeof (mca_btl_vader_hdr_t) + component->eager_limit,
                                opal_cache_line_size,
                                component->vader_free_list_num,
                                component->vader_free_list_max,
                                component->vader_free_list_inc,
                                component->vader_mpool);
    if (OMPI_SUCCESS != i)
        return i;

    /* initialize free list for put/get fragments */
    i = ompi_free_list_init_new(&component->vader_frags_user,
                                sizeof(mca_btl_vader_frag_t),
                                opal_cache_line_size, OBJ_CLASS(mca_btl_vader_frag_t),
                                sizeof(mca_btl_vader_hdr_t) + mca_btl_vader_max_inline_send,
                                opal_cache_line_size,
                                component->vader_free_list_num,
                                component->vader_free_list_max,
                                component->vader_free_list_inc,
                                component->vader_mpool);
    if (OMPI_SUCCESS != i)
        return i;

    /* set flag indicating btl has been inited */
    vader_btl->btl_inited = true;

    return OMPI_SUCCESS;
}

static struct mca_btl_base_endpoint_t *
create_vader_endpoint (int local_proc, struct ompi_proc_t *proc)
{
    struct mca_btl_base_endpoint_t *ep = (struct mca_btl_base_endpoint_t *)
        calloc(1, sizeof (struct mca_btl_base_endpoint_t));

    if (NULL != ep) {
        ep->peer_smp_rank = local_proc + mca_btl_vader_component.num_smp_procs;
    }

    return ep;
}

/**
 * PML->BTL notification of change in the process list.
 * Called to set up shared memory connectivity with the given processes.
 * The virtual address of the shared memory segment may differ between
 * the sender and the receiver, so peer addresses are translated before
 * use.
 *
 * @param btl (IN)           BTL module
 * @param nprocs (IN)        Number of processes in procs
 * @param procs (IN)         Set of processes
 * @param peers (OUT)        Endpoint for each reachable process
 * @param reachability (OUT) Bitmap marking reachable processes
 * @return OMPI_SUCCESS or error status on failure.
 */
static int vader_add_procs (struct mca_btl_base_module_t* btl,
                            size_t nprocs, struct ompi_proc_t **procs,
                            struct mca_btl_base_endpoint_t **peers,
                            opal_bitmap_t *reachability)
{
    mca_btl_vader_component_t *component = &mca_btl_vader_component;
    mca_btl_vader_t *vader_btl = (mca_btl_vader_t *) btl;
    int32_t n_local_procs = 0, proc, i, my_smp_rank = -1;
    bool have_connected_peer = false;
    ompi_proc_t *my_proc;
    int rc = OMPI_SUCCESS;

    /* initialization */

    /* get pointer to my proc structure */
    if (NULL == (my_proc = ompi_proc_local())) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Get unique host identifier for each process in the list,
     * and identify procs that are on this host.  Add procs on this
     * host to the shared memory reachability list.  Also, get the
     * number of local procs in the procs list. */
    for (proc = 0; proc < (int32_t) nprocs; ++proc) {
        /* check to see if this proc can be reached via shmem (i.e.,
           if they're on my local host and in my job) */
        if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid ||
            !OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) {
            peers[proc] = NULL;
            continue;
        }

        /* check to see if this is me */
        if (my_proc == procs[proc]) {
            my_smp_rank = component->my_smp_rank = n_local_procs++;
            continue;
        }

        /* we have someone to talk to */
        have_connected_peer = true;

        if (!(peers[proc] = create_vader_endpoint (n_local_procs, procs[proc]))) {
            rc = OMPI_ERROR;
            goto CLEANUP;
        }
        n_local_procs++;

        /* add this proc to shared memory accessibility list */
        rc = opal_bitmap_set_bit (reachability, proc);
        if (OMPI_SUCCESS != rc) {
            goto CLEANUP;
        }
    }

    /* jump out if there's not someone we can talk to */
    if (!have_connected_peer) {
        goto CLEANUP;
    }

    /* make sure that my_smp_rank has been defined */
    if (-1 == my_smp_rank) {
        rc = OMPI_ERROR;
        goto CLEANUP;
    }

    if (!vader_btl->btl_inited) {
        rc = vader_btl_first_time_init (vader_btl, n_local_procs);
        if (OMPI_SUCCESS != rc) {
            goto CLEANUP;
        }
    }

    /* set local proc's smp rank in the peers structure for
     * rapid access and calculate reachability */
    for (proc = 0; proc < (int32_t) nprocs; ++proc) {
        if (NULL == peers[proc])
            continue;
        component->vader_peers[peers[proc]->peer_smp_rank] = peers[proc];
        peers[proc]->my_smp_rank = my_smp_rank;
    }

    /* initialize own FIFOs */
    /*
     * The receiver initializes all its FIFOs.  All components will
     * be allocated near the receiver.  Nothing will be local to
     * "the sender" since there will be many senders.
     */
    rc = vader_fifo_init (component->fifo[my_smp_rank]);
    if (OMPI_SUCCESS != rc) {
        goto CLEANUP;
    }
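
    /* Together with the write barrier below, the atomic increment of
     * seg_inited implements an n-way rendezvous: each rank publishes its
     * fifo, bumps the counter, then spins until all n_local_procs ranks
     * have checked in. The rmb in the spin loop ensures each peer's
     * published fifo contents are visible here once the count is reached. */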
    opal_atomic_wmb ();

    /* Sync with other local procs.  Force the FIFO initialization to always
     * happen before the readers access it.
     */
    opal_atomic_add_32 (&component->vader_seg->module_seg->seg_inited, 1);
    while (n_local_procs > component->vader_seg->module_seg->seg_inited) {
        opal_progress();
        opal_atomic_rmb();
    }

    /* coordinate with other processes */
    for (i = 0 ; i < n_local_procs ; ++i) {
        int peer_smp_rank = i + component->num_smp_procs;

        /* spin until this element is allocated */
        /* doesn't really wait for that process... FIFO might be allocated, but not initialized */
        while (NULL == component->shm_fifo[peer_smp_rank]) {
            opal_progress();
            opal_atomic_rmb();
        }

        if (my_smp_rank != peer_smp_rank) {
            void *rem_ptr = (void *) component->shm_fifo[peer_smp_rank];

            component->apids[peer_smp_rank] =
                xpmem_get (component->shm_seg_ids[peer_smp_rank],
                           XPMEM_RDWR, XPMEM_PERMIT_MODE, (void *) 0666);
            component->xpmem_rcaches[peer_smp_rank] = mca_rcache_base_module_create("vma");

            /* get a persistent pointer to the peer's fifo */
            component->fifo[peer_smp_rank] =
                vader_reg_to_ptr (vader_get_registation (peer_smp_rank, rem_ptr,
                                                         (n_local_procs + 1) * getpagesize (),
                                                         MCA_MPOOL_FLAGS_PERSIST), rem_ptr);

            /* fast boxes are allocated at the same time as the fifos */
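            /* Each rank's (n + 1)-page region holds its fifo in page 0 and
             * one fast-box page per peer after it: my in-box for this peer
             * is page (peer_smp_rank + 1) of my own region, and my out-box
             * is page (my_smp_rank + 1) of the peer's region, so my out-box
             * and the peer's in-box are the same page. */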
            component->vader_fboxes_in[peer_smp_rank] = (unsigned char *) component->fifo[my_smp_rank] +
                (peer_smp_rank + 1) * getpagesize ();
            component->vader_fboxes_out[peer_smp_rank] = (unsigned char *) component->fifo[peer_smp_rank] +
                (my_smp_rank + 1) * getpagesize ();

            component->vader_next_fbox_in[peer_smp_rank] = 0;
            component->vader_next_fbox_out[peer_smp_rank] = 0;

            memset (component->vader_fboxes_in[peer_smp_rank], MCA_BTL_VADER_FBOX_FREE, getpagesize());
        }
    }

    /* update the local smp process count */
    component->num_smp_procs += n_local_procs;

    /* make sure we have enough eager fragments for each process */
    rc = ompi_free_list_resize(&component->vader_frags_eager,
                               component->num_smp_procs * 2);

CLEANUP:
    return rc;
}

/**
 * PML->BTL notification of change in the process list.
 *
 * @param btl (IN)   BTL instance
 * @param proc (IN)  Peer process
 * @param peer (IN)  Peer addressing information.
 * @return           Status indicating if cleanup was successful
 */
static int vader_del_procs(struct mca_btl_base_module_t *btl,
                           size_t nprocs, struct ompi_proc_t **procs,
                           struct mca_btl_base_endpoint_t **peers)
{
    return OMPI_SUCCESS;
}

/**
 * MCA->BTL Clean up any resources held by BTL module
 * before the module is unloaded.
 *
 * @param btl (IN)   BTL module.
 *
 * Prior to unloading a BTL module, the MCA framework will call
 * the BTL finalize method of the module. Any resources held by
 * the BTL should be released and if required the memory corresponding
 * to the BTL module freed.
 */
static int vader_finalize(struct mca_btl_base_module_t *btl)
{
    return OMPI_SUCCESS;
}

/**
 * Register a callback function that is called on error.
 *
 * @param btl (IN)     BTL module
 * @param cbfunc (IN)  function to call on error
 * @return             Status indicating if registration was successful
 */
static int vader_register_error_cb(struct mca_btl_base_module_t* btl,
                                   mca_btl_base_module_error_cb_fn_t cbfunc)
{
    ((mca_btl_vader_t *)btl)->error_cb = cbfunc;
    return OMPI_SUCCESS;
}

/**
 * Allocate a segment.
 *
 * @param btl (IN)   BTL module
 * @param size (IN)  Requested segment size.
 */
mca_btl_base_descriptor_t *mca_btl_vader_alloc(struct mca_btl_base_module_t *btl,
                                               struct mca_btl_base_endpoint_t *endpoint,
                                               uint8_t order, size_t size, uint32_t flags)
{
    mca_btl_vader_frag_t *frag = NULL;

    if (size <= (size_t) mca_btl_vader_max_inline_send) {
        (void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag);
    } else if (size <= mca_btl_vader_component.eager_limit) {
        (void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag);
    }
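
    /* Requests that fit in max_inline_send draw from the user list (header
     * plus inline data); anything up to the eager limit draws from the
     * eager list. Larger requests leave frag NULL and the caller must fall
     * back to another protocol. */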
    if (OPAL_LIKELY(frag != NULL)) {
        frag->segment.seg_len = size;
        frag->endpoint = endpoint;

        frag->base.des_flags = flags;
        frag->base.order = order;
        frag->base.des_src = &frag->segment;
        frag->base.des_src_cnt = 1;
        frag->base.des_dst = &frag->segment;
        frag->base.des_dst_cnt = 1;
    }

    return (mca_btl_base_descriptor_t *) frag;
}

/**
 * Return a segment allocated by this BTL.
 *
 * @param btl (IN)      BTL module
 * @param segment (IN)  Allocated segment.
 */
static int vader_free (struct mca_btl_base_module_t *btl, mca_btl_base_descriptor_t *des)
{
    MCA_BTL_VADER_FRAG_RETURN((mca_btl_vader_frag_t *) des);

    return OMPI_SUCCESS;
}
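
/**
 * Prepare a descriptor for RDMA. No packing is done here; the fragment
 * just records the address and length of the user buffer so the transfer
 * can be performed directly by put/get.
 *
 * @param btl (IN)   BTL module
 */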
static struct mca_btl_base_descriptor_t *vader_prepare_dst(struct mca_btl_base_module_t *btl,
                                                           struct mca_btl_base_endpoint_t *endpoint,
                                                           struct mca_mpool_base_registration_t *registration,
                                                           struct opal_convertor_t *convertor,
                                                           uint8_t order, size_t reserve, size_t *size,
                                                           uint32_t flags)
{
    mca_btl_vader_frag_t *frag;
    void *data_ptr;

    (void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag);
    if (OPAL_UNLIKELY(NULL == frag)) {
        return NULL;
    }

    opal_convertor_get_current_pointer (convertor, (void **) &data_ptr);

    frag->segment.seg_addr.pval = data_ptr;
    frag->segment.seg_len = *size;

    frag->base.des_dst = &frag->segment;
    frag->base.des_dst_cnt = 1;
    frag->base.order = order;
    frag->base.des_flags = flags;

    frag->endpoint = endpoint;

    return &frag->base;
}

/**
 * Pack data and return a descriptor usable for send (non-zero reserve)
 * or for put/get (reserve == 0).
 *
 * @param btl (IN)       BTL module
 * @param endpoint (IN)  BTL peer addressing
 */
static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_module_t *btl,
                                                            struct mca_btl_base_endpoint_t *endpoint,
                                                            mca_mpool_base_registration_t *registration,
                                                            struct opal_convertor_t *convertor,
                                                            uint8_t order, size_t reserve, size_t *size,
                                                            uint32_t flags)
{
    struct iovec iov, *lcl_mem;
    mca_btl_vader_frag_t *frag;
    uint32_t iov_count = 1;
    void *data_ptr, *fbox_ptr;
    int rc;

    opal_convertor_get_current_pointer (convertor, &data_ptr);

    if (OPAL_LIKELY(reserve)) {
        /* in place send fragment */
        if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) {
            /* non-contiguous data requires the convertor */
            (void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag);
            if (OPAL_UNLIKELY(NULL == frag)) {
                return NULL;
            }

            iov.iov_len = *size;
            iov.iov_base =
                (IOVBASE_TYPE *)(((uintptr_t)(frag->segment.seg_addr.pval)) +
                                 reserve);

            rc = opal_convertor_pack (convertor, &iov, &iov_count, size);
            if (OPAL_UNLIKELY(rc < 0)) {
                MCA_BTL_VADER_FRAG_RETURN(frag);
                return NULL;
            }

            frag->segment.seg_len = reserve + *size;
        } else {
            (void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag);
            if (OPAL_UNLIKELY(NULL == frag)) {
                return NULL;
            }

            if ((*size + reserve) > (size_t) mca_btl_vader_max_inline_send) {
                /* single copy send */
                /* pack the iovec after the reserved memory */
                lcl_mem = (struct iovec *) ((uintptr_t)frag->segment.seg_addr.pval + reserve);

                frag->hdr->flags = MCA_BTL_VADER_FLAG_SINGLE_COPY;
                lcl_mem->iov_base = data_ptr;
                lcl_mem->iov_len  = *size;

                frag->segment.seg_len = reserve;
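
                /* Net effect: only the reserve (header) bytes travel through
                 * the fifo; the embedded iovec lets the receiver pull the
                 * payload straight out of the sender's memory. */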
            } else {
                /* inline send */

                /* try to reserve a fast box for this transfer */
                fbox_ptr = mca_btl_vader_reserve_fbox (endpoint->peer_smp_rank, reserve + *size);
                if (fbox_ptr) {
                    frag->hdr->flags |= MCA_BTL_VADER_FLAG_FBOX;
                    frag->segment.seg_addr.pval = fbox_ptr;
                }

                /* NTH: the convertor adds some latency so we bypass it here */
                vader_memmove ((void *)((uintptr_t)frag->segment.seg_addr.pval + reserve),
                               data_ptr, *size);
                frag->segment.seg_len = reserve + *size;
            }
        }
    } else {
        /* put/get fragment */
        (void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag);
        if (OPAL_UNLIKELY(NULL == frag)) {
            return NULL;
        }

        frag->segment.seg_addr.pval = data_ptr;
        frag->segment.seg_len = reserve + *size;
    }

    frag->base.des_src = &frag->segment;
    frag->base.des_src_cnt = 1;
    frag->base.order = order;
    frag->base.des_flags = flags;

    frag->endpoint = endpoint;

    return &frag->base;
}

/**
 * Fault Tolerance Event Notification Function
 * @param state Checkpoint State
 * @return OMPI_SUCCESS or failure status
 */
static int vader_ft_event (int state)
{
    return OMPI_SUCCESS;
}