/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2011 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2009 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010-2012 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2011 NVIDIA Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #include "ompi_config.h" #include "ompi/constants.h" #include "opal/util/output.h" #include "orte/util/proc_info.h" #include "orte/util/show_help.h" #include "orte/runtime/orte_globals.h" #include "opal/mca/base/mca_base_param.h" #include "ompi/mca/btl/base/btl_base_error.h" #include "btl_vader.h" #include "btl_vader_frag.h" #include "btl_vader_fifo.h" #include "btl_vader_fbox.h" static int mca_btl_vader_component_progress (void); static int mca_btl_vader_component_open(void); static int mca_btl_vader_component_close(void); static int mca_btl_vader_component_register(void); static mca_btl_base_module_t** mca_btl_vader_component_init(int *num_btls, bool enable_progress_threads, bool enable_mpi_threads); /* limit where we should switch from bcopy to memcpy */ int mca_btl_vader_memcpy_limit = 524288; int mca_btl_vader_log_align = 21; /* 2 MiB */ /* maximum size for using copy-in-copy out semantics for contiguous sends */ int mca_btl_vader_max_inline_send = 256; /* * Shared Memory (VADER) component instance. */ mca_btl_vader_component_t mca_btl_vader_component = { { /* First, the mca_base_component_t struct containing meta information about the component itself */ { MCA_BTL_BASE_VERSION_2_0_0, "vader", /* MCA component name */ OMPI_MAJOR_VERSION, /* MCA component major version */ OMPI_MINOR_VERSION, /* MCA component minor version */ OMPI_RELEASE_VERSION, /* MCA component release version */ mca_btl_vader_component_open, /* component open */ mca_btl_vader_component_close, /* component close */ NULL, mca_btl_vader_component_register, }, { /* The component is checkpoint ready */ MCA_BASE_METADATA_PARAM_CHECKPOINT }, mca_btl_vader_component_init, mca_btl_vader_component_progress, } /* end super */ }; /* * utility routines for parameter registration */ static inline char *mca_btl_vader_param_register_string(const char *param_name, const char *default_value) { char *param_value; int id = mca_base_param_register_string("btl", "vader", param_name, NULL, default_value); mca_base_param_lookup_string(id, ¶m_value); return param_value; } static inline int mca_btl_vader_param_register_int(const char *param_name, int value) { int id = mca_base_param_register_int("btl", "vader", param_name, NULL, value); mca_base_param_lookup_int(id, &value); return value; } static int mca_btl_vader_component_register (void) { /* register VADER component parameters */ mca_btl_vader_component.vader_free_list_num = mca_btl_vader_param_register_int("free_list_num", 8); mca_btl_vader_component.vader_free_list_max = mca_btl_vader_param_register_int("free_list_max", -1); mca_btl_vader_component.vader_free_list_inc = mca_btl_vader_param_register_int("free_list_inc", 64); mca_btl_vader_component.vader_mpool_name = mca_btl_vader_param_register_string("mpool", "sm"); mca_btl_vader_memcpy_limit = mca_btl_vader_param_register_int("memcpy_limit", mca_btl_vader_memcpy_limit); mca_btl_vader_log_align = mca_btl_vader_param_register_int("log_align", mca_btl_vader_log_align); /* limit segment alignment to be between 4k and 16M */ if (mca_btl_vader_log_align < 12) { mca_btl_vader_log_align = 12; } else if (mca_btl_vader_log_align > 25) { mca_btl_vader_log_align = 25; } mca_btl_vader_max_inline_send = mca_btl_vader_param_register_int("max_inline_send", mca_btl_vader_max_inline_send); mca_btl_vader.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH; mca_btl_vader.super.btl_eager_limit = 64 * 1024; mca_btl_vader.super.btl_rndv_eager_limit = mca_btl_vader.super.btl_eager_limit; mca_btl_vader.super.btl_max_send_size = mca_btl_vader.super.btl_eager_limit; mca_btl_vader.super.btl_rdma_pipeline_send_length = mca_btl_vader.super.btl_eager_limit; mca_btl_vader.super.btl_rdma_pipeline_frag_size = mca_btl_vader.super.btl_eager_limit; mca_btl_vader.super.btl_min_rdma_pipeline_size = mca_btl_vader.super.btl_eager_limit; mca_btl_vader.super.btl_flags = MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND_INPLACE; mca_btl_vader.super.btl_seg_size = sizeof (mca_btl_base_segment_t); mca_btl_vader.super.btl_bandwidth = 40000; /* Mbs */ mca_btl_vader.super.btl_latency = 1; /* Microsecs */ /* Call the BTL based to register its MCA params */ mca_btl_base_param_register(&mca_btl_vader_component.super.btl_version, &mca_btl_vader.super); return OMPI_SUCCESS; } /* * Called by MCA framework to open the component, registers * component parameters. */ static int mca_btl_vader_component_open(void) { mca_btl_vader_component.eager_limit = mca_btl_vader.super.btl_eager_limit; /* initialize objects */ OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_eager, ompi_free_list_t); OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_user, ompi_free_list_t); OBJ_CONSTRUCT(&mca_btl_vader_component.active_sends, opal_list_t); return OMPI_SUCCESS; } /* * component cleanup - sanity checking of queue lengths */ static int mca_btl_vader_component_close(void) { int return_value = OMPI_SUCCESS; /** * We don't have to destroy the fragment lists. They are allocated * directly into the mmapped file, they will auto-magically disappear * when the file get unmapped. */ /*OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_eager);*/ /* unmap the shared memory control structure */ if(mca_btl_vader_component.vader_seg != NULL) { return_value = mca_common_sm_fini( mca_btl_vader_component.vader_seg ); if( OMPI_SUCCESS != return_value ) { return_value=OMPI_ERROR; opal_output(0," mca_common_sm_fini failed\n"); goto CLEANUP; } /* unlink file, so that it will be deleted when all references * to it are gone - no error checking, since we want all procs * to call this, so that in an abnormal termination scenario, * this file will still get cleaned up */ /* XXX LANL TODO -- remove unlink once the shmem segment uses xpmem */ unlink(mca_btl_vader_component.vader_seg->shmem_ds.seg_name); OBJ_RELEASE(mca_btl_vader_component.vader_seg); } if (NULL != mca_btl_vader_component.vader_mpool_name) { free(mca_btl_vader_component.vader_mpool_name); } OBJ_DESTRUCT(&mca_btl_vader_component.active_sends); CLEANUP: /* return */ return return_value; } /* * VADER component initialization */ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls, bool enable_progress_threads, bool enable_mpi_threads) { mca_btl_vader_component_t *component = &mca_btl_vader_component; mca_btl_base_module_t **btls = NULL; *num_btls = 0; /* if no session directory was created, then we cannot be used */ /* XXX LANL FIXME -- this is not the case. we can use an anonymous segment */ if (!orte_create_session_dirs) { return NULL; } /* lookup/create shared memory pool only when used */ component->vader_mpool = NULL; component->vader_mpool_base = NULL; btls = (mca_btl_base_module_t **) calloc (1, sizeof (mca_btl_base_module_t *)); if (NULL == btls) { return NULL; } /* create an xpmem segment for the entire memory space */ component->my_seg_id = xpmem_make (0, 0xffffffffffffffffll, XPMEM_PERMIT_MODE, (void *)0666); if (-1 == component->my_seg_id) { free (btls); return NULL; } *num_btls = 1; /* get pointer to the btls */ btls[0] = (mca_btl_base_module_t *) &mca_btl_vader; /* initialize some BTL data */ /* start with no VADER procs */ component->num_smp_procs = 0; component->my_smp_rank = -1; /* not defined */ /* set flag indicating btl not inited */ mca_btl_vader.btl_inited = false; return btls; } static inline void mca_btl_vader_progress_sends (void) { opal_list_t *list = &mca_btl_vader_component.active_sends; opal_list_item_t *item, *next; mca_btl_vader_frag_t *frag; for (item = opal_list_get_first (list) ; item != opal_list_get_end (list) ; ) { frag = (mca_btl_vader_frag_t *) item; next = opal_list_get_next (item); if (OPAL_LIKELY(frag->hdr->complete)) { opal_list_remove_item (&mca_btl_vader_component.active_sends, item); mca_btl_vader_frag_complete (frag); } item = next; } } static int mca_btl_vader_component_progress (void) { int my_smp_rank = mca_btl_vader_component.my_smp_rank; vader_fifo_t *fifo = mca_btl_vader_component.fifo[my_smp_rank]; mca_btl_active_message_callback_t *reg; mca_btl_vader_frag_t frag; mca_btl_vader_hdr_t *hdr; mca_btl_base_segment_t segments[2]; mca_mpool_base_registration_t *xpmem_reg = NULL; /* check for messages in fast boxes */ mca_btl_vader_check_fboxes (); /* check active sends for completion */ mca_btl_vader_progress_sends (); /* poll the fifo once */ hdr = vader_fifo_read (fifo); if (NULL == hdr) { return 0; } reg = mca_btl_base_active_message_trigger + hdr->tag; frag.base.des_dst = segments; segments[0].seg_addr.pval = (void *) (hdr + 1); segments[0].seg_len = hdr->len; if (OPAL_UNLIKELY(hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY)) { struct iovec *rem_mem = (struct iovec *) ((uintptr_t)segments[0].seg_addr.pval + hdr->len); xpmem_reg = vader_get_registation (hdr->my_smp_rank, rem_mem->iov_base, rem_mem->iov_len, 0); segments[1].seg_addr.pval = vader_reg_to_ptr (xpmem_reg, rem_mem->iov_base); segments[1].seg_len = rem_mem->iov_len; /* recv upcall */ frag.base.des_dst_cnt = 2; reg->cbfunc(&mca_btl_vader.super, hdr->tag, &(frag.base), reg->cbdata); vader_return_registration (xpmem_reg, hdr->my_smp_rank); } else { frag.base.des_dst_cnt = 1; reg->cbfunc(&mca_btl_vader.super, hdr->tag, &(frag.base), reg->cbdata); } /* return the fragment */ hdr->complete = true; return 1; }