diff --git a/ompi/config/ompi_check_ugni.m4 b/ompi/config/ompi_check_ugni.m4 new file mode 100644 index 0000000000..24c65c8cdb --- /dev/null +++ b/ompi/config/ompi_check_ugni.m4 @@ -0,0 +1,95 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2006 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2006 QLogic Corp. All rights reserved. +# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2011 Los Alamos National Security, LLC. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# OMPI_CHECK_UGNI(prefix, [action-if-found], [action-if-not-found]) +# -------------------------------------------------------- +# check if GNI support can be found. sets prefix_{CPPFLAGS, +# LDFLAGS, LIBS} as needed and runs action-if-found if there is +# support, otherwise executes action-if-not-found +# +# NOTES +# on Cray XE6 systems, the GNI development header (gni_pub.h) is in a +# completely different place than the ugni library (libugni). +# +# EXAMPLE CONFIGURE USAGE: +# --with-ugni=/base/path/to/libugni --with-ugni-includedir=/path/to/gni_pub.h +# +# --with-ugni=/opt/cray/ugni/default --with-ugni-includedir=/opt/cray/gni-headers/default/include + +AC_DEFUN([OMPI_CHECK_UGNI], [ + AC_ARG_WITH([ugni], [ + AC_HELP_STRING([--with-ugni(=DIR)], + [Build GNI (Cray Gemini) support, optionally adding DIR/include, DIR/lib, and DIR/lib64 to the search path for headers and libraries])]) + + dnl does the path exist? 
+ OMPI_CHECK_WITHDIR([ugni], [$with_ugni], [.]) + + AC_ARG_WITH([ugni-libdir], [ + AC_HELP_STRING([--with-ugni-libdir=DIR], [ + Search for GNI (Cray Gemini) libraries in DIR])]) + OMPI_CHECK_WITHDIR([ugni-libdir], [$with_ugni_libdir], [libugni.*]) + + AC_ARG_WITH([ugni-includedir], [ + AC_HELP_STRING([--with-ugni-includedir=DIR], [ + Search for GNI (Cray Gemini) headers in DIR])]) + OMPI_CHECK_WITHDIR([ugni-includedir], [$with_ugni_includedir], [gni_pub.h]) + + AS_IF([test "$with_ugni_includedir" != "" -a "$with_ugni_includedir" != "yes" -a "$with_ugni_includedir" != "no"], + [$1_CPPFLAGS="$$1_CPPFLAGS -I$with_ugni_includedir"]) + + ompi_check_ugni_$1_save_CPPFLAGS="$CPPFLAGS" + ompi_check_ugni_$1_save_LDFLAGS="$LDFLAGS" + ompi_check_ugni_$1_save_LIBS="$LIBS" + + AS_IF([test "$with_ugni" != "no"], [ + AS_IF([test ! -z "$with_ugni" -a "$with_ugni" != "yes"], [ + ompi_check_ugni_dir="$with_ugni"]) + AS_IF([test ! -z "$with_ugni_libdir" -a "$with_ugni_libdir" != "yes"], [ + ompi_check_ugni_libdir="$with_ugni_libdir"]) + + OMPI_CHECK_PACKAGE([$1], + [ugni.h], + [ugni], + [GNI_CdmCreate], + [], + [$ompi_check_ugni_dir], + [$ompi_check_ugni_libdir], + [ompi_check_ugni_happy="yes"], + [ompi_check_ugni_happy="no"])], + [ompi_check_ugni_happy="no"]) + + CPPFLAGS="$ompi_check_ugni_$1_save_CPPFLAGS" + LDFLAGS="$ompi_check_ugni_$1_save_LDFLAGS" + LIBS="$ompi_check_ugni_$1_save_LIBS" + + dnl XXX not sure if this is true, but will assume so... + AS_IF([test "$ompi_check_ugni_happy" = "yes" -a "$enable_progress_threads" = "yes"], + [AC_MSG_WARN([GNI driver does not currently support progress threads. Disabling.]) + ompi_check_ugni_happy="no"]) + + AS_IF([test "$ompi_check_ugni_happy" = "yes"], + [$2], + [AS_IF([test ! -z "$with_ugni" -a "$with_ugni" != "no"], + [AC_MSG_ERROR([GNI support requested but not found. 
+ Cannot continue.])]) + $3]) +]) diff --git a/ompi/mca/btl/ugni/Makefile.am b/ompi/mca/btl/ugni/Makefile.am new file mode 100644 index 0000000000..f91484cfc3 --- /dev/null +++ b/ompi/mca/btl/ugni/Makefile.am @@ -0,0 +1,52 @@ +# -*- indent-tabs-mode:nil -*- +# +# Copyright (c) 2011 Los Alamos National Security, LLC. All rights +# reserved. +# Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Make the output library in this directory, and name it either +# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la +# (for static builds). + +AM_CPPFLAGS = $(btl_ugni_CPPFLAGS) + +if MCA_BUILD_ompi_btl_ugni_DSO +component_noinst = +component_install = mca_btl_ugni.la +else +component_noinst = libmca_btl_ugni.la +component_install = +endif + +ugni_SOURCES = \ + btl_ugni_component.c \ + btl_ugni_module.c \ + btl_ugni_add_procs.c \ + btl_ugni_endpoint.h \ + btl_ugni_endpoint.c \ + btl_ugni_frag.c \ + btl_ugni_frag.h \ + btl_ugni_rdma.h \ + btl_ugni_send.c \ + btl_ugni_sendi.c \ + btl_ugni_put.c \ + btl_ugni_get.c \ + btl_ugni.h + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_btl_ugni_la_SOURCES = $(ugni_SOURCES) +nodist_mca_btl_ugni_la_SOURCES = $(ugni_nodist_SOURCES) +mca_btl_ugni_la_LIBADD = $(btl_ugni_LIBS) +mca_btl_ugni_la_LDFLAGS = -module -avoid-version $(btl_ugni_LDFLAGS) + +noinst_LTLIBRARIES = $(component_noinst) +libmca_btl_ugni_la_SOURCES = $(ugni_SOURCES) +nodist_libmca_btl_ugni_la_SOURCES = $(ugni_nodist_SOURCES) +libmca_btl_ugni_la_LIBADD = $(btl_ugni_LIBS) +libmca_btl_ugni_la_LDFLAGS = -module -avoid-version $(btl_ugni_LDFLAGS) diff --git a/ompi/mca/btl/ugni/btl_ugni.h b/ompi/mca/btl/ugni/btl_ugni.h new file mode 100644 index 0000000000..c335d8aa54 --- /dev/null +++ b/ompi/mca/btl/ugni/btl_ugni.h @@ -0,0 +1,248 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
+ * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/* + * The ugni btl is implemented with native Cray Gemini. + * + * Known issues with ugni: + * - + */ + +#ifndef MCA_BTL_UGNI_H +#define MCA_BTL_UGNI_H + +#include "ompi_config.h" + +#include "ompi/mca/mpool/mpool.h" +#include "ompi/mca/mpool/base/base.h" +#include "ompi/mca/mpool/rdma/mpool_rdma.h" +#include "ompi/runtime/ompi_module_exchange.h" +#include "opal/util/output.h" +#include "opal_stdint.h" + +#include "ompi/mca/btl/btl.h" +#include "ompi/mca/btl/base/base.h" +#include "ompi/mca/btl/base/btl_base_error.h" +#include "ompi/class/ompi_free_list.h" + +#include "ompi/mca/common/ugni/common_ugni.h" + +#include +#include +#include +#include +#include +#include + +/* datagram message ids */ +#define MCA_BTL_UGNI_CONNECT_WILDCARD_ID 0x6b69726b00000000ull +#define MCA_BTL_UGNI_CONNECT_DIRECTED_ID 0x6b61686e00000000ull +#define MCA_BTL_UGNI_DATAGRAM_MASK 0xffffffff00000000ull + +typedef enum { + MCA_BTL_UGNI_TAG_SEND, + MCA_BTL_UGNI_TAG_DISCONNECT, + MCA_BTL_UGNI_TAG_PUT_INIT, + MCA_BTL_UGNI_TAG_PUT_COMPLETE +} mca_btl_ugni_smsg_tag_t; + +/* Maximum number of outstanding eager messages */ +extern int mca_btl_ugni_smsg_max_credits; +extern int mca_btl_ugni_smsg_mbox_size; + +struct mca_btl_ugni_module_t { + mca_btl_base_module_t super; + + ompi_common_ugni_device_t *device; + + size_t endpoint_count; + struct mca_btl_base_endpoint_t **endpoints; + + opal_list_t failed_frags; + + mca_mpool_base_module_t *smsg_mpool; + ompi_free_list_t smsg_mboxes; + + gni_ep_handle_t wildcard_ep; + gni_smsg_attr_t wc_remote_attr, wc_local_attr; + + gni_cq_handle_t bte_local_cq; + gni_cq_handle_t smsg_remote_cq; +}; +typedef struct mca_btl_ugni_module_t mca_btl_ugni_module_t; + +struct mca_btl_ugni_component_t { + /* base BTL component */ + mca_btl_base_component_2_0_0_t super; + + /* maximum supported btls. 
hardcoded to 1 for now */ + uint32_t ugni_max_btls; + /* Maximum number of entries a completion queue can hold */ + uint32_t cq_size; + + /* number of ugni modules */ + uint32_t ugni_num_btls; + /* ugni modules */ + mca_btl_ugni_module_t *modules; + + /* eager send limit in bytes */ + /* used as the threshold for switching from SMSG */ + size_t eager_limit; + + /* After this message size switch to BTE protocols */ + size_t btl_fma_limit; + /* Switch to put when trying to GET at or above this size */ + size_t btl_get_limit; + + /* eager fragment list */ + ompi_free_list_t ugni_frags_eager; + /* RDMA fragment list */ + ompi_free_list_t ugni_frags_rdma; + + /* initial free list size */ + int ugni_free_list_num; + /* maximum free list size */ + int ugni_free_list_max; + /* free list increment */ + int ugni_free_list_inc; + + /* number of times to retry a post */ + int rdma_max_retries; +}; +typedef struct mca_btl_ugni_component_t mca_btl_ugni_component_t; + +int mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module, + ompi_common_ugni_device_t *device); + +/** + * BML->BTL notification of change in the process list. + * + * location: btl_ugni_add_procs.c + * + * @param btl (IN) BTL module + * @param nprocs (IN) Number of processes + * @param procs (IN) Array of processes + * @param endpoint (OUT) Array of mca_btl_base_endpoint_t structures by BTL. + * @param reachable (OUT) Bitmask indicating set of peer processes that are reachable by this BTL. + * @return OMPI_SUCCESS or error status on failure. + */ +int +mca_btl_ugni_add_procs (struct mca_btl_base_module_t* btl, + size_t nprocs, + struct ompi_proc_t **procs, + struct mca_btl_base_endpoint_t **peers, + opal_bitmap_t *reachable); + +/** + * Notification of change to the process list. + * + * location: btl_ugni_add_procs.c + * + * @param btl (IN) BTL module + * @param nprocs (IN) Number of processes + * @param proc (IN) Set of processes + * @param peer (IN) Set of peer addressing information. 
+ * @return Status indicating if cleanup was successful + */ +int +mca_btl_ugni_del_procs (struct mca_btl_base_module_t *btl, + size_t nprocs, + struct ompi_proc_t **procs, + struct mca_btl_base_endpoint_t **peers); + +/** + * Initiate an asynchronous send. + * + * location: btl_ugni_send.c + * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param descriptor (IN) Description of the data to be transfered + * @param tag (IN) The tag value used to notify the peer. + */ +int +mca_btl_ugni_send (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *btl_peer, + struct mca_btl_base_descriptor_t *descriptor, + mca_btl_base_tag_t tag); + +/** + * Initiate an immediate blocking send. + * + * location: btl_ugni_sendi.c + * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param convertor (IN) Data type convertor + * @param header (IN) Pointer to header. + * @param header_size (IN) Size of header. + * @param payload_size (IN) Size of payload (from convertor). + * @param order (IN) The ordering tag (may be MCA_BTL_NO_ORDER) + * @param flags (IN) Flags. + * @param tag (IN) The tag value used to notify the peer. + * @param descriptor (OUT) The descriptor to be returned unable to be sent immediately + */ +int +mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, + void *header, size_t header_size, + size_t payload_size, uint8_t order, + uint32_t flags, mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t **descriptor); + +/** + * Initiate a get operation. 
+ * + * location: btl_ugni_get.c + * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param descriptor (IN) Description of the data to be transferred + */ +int +mca_btl_ugni_get (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + struct mca_btl_base_descriptor_t *des); + +/** + * Initiate a put operation. + * + * location: btl_ugni_put.c + * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param descriptor (IN) Description of the data to be transferred + */ +int +mca_btl_ugni_put (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + struct mca_btl_base_descriptor_t *des); + +mca_btl_base_descriptor_t * +mca_btl_ugni_alloc(struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + uint8_t order, size_t size, uint32_t flags); + +struct mca_btl_ugni_reg_t { + mca_mpool_base_registration_t base; + gni_mem_handle_t memory_hdl; + void *buffer; + size_t size; +}; +typedef struct mca_btl_ugni_reg_t mca_btl_ugni_reg_t; + +/* Global structures */ + +OMPI_MODULE_DECLSPEC extern mca_btl_ugni_component_t mca_btl_ugni_component; +OMPI_MODULE_DECLSPEC extern mca_btl_ugni_module_t mca_btl_ugni_module; + +#endif diff --git a/ompi/mca/btl/ugni/btl_ugni_add_procs.c b/ompi/mca/btl/ugni/btl_ugni_add_procs.c new file mode 100644 index 0000000000..2ecb3feb45 --- /dev/null +++ b/ompi/mca/btl/ugni/btl_ugni_add_procs.c @@ -0,0 +1,131 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include +#include +#include +#include +#include + +#include "ompi/constants.h" +#include "ompi/communicator/communicator.h" + +#include "ompi_config.h" + +#include "btl_ugni.h" +#include "btl_ugni_frag.h" +#include "btl_ugni_endpoint.h" + +int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl, + size_t nprocs, + struct ompi_proc_t **procs, + struct mca_btl_base_endpoint_t **peers, + opal_bitmap_t *reachable) { + mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl; + size_t ntotal_procs; + size_t i; + int rc; + + + if (NULL == ugni_module->endpoints) { + (void) ompi_proc_world (&ntotal_procs); + + ugni_module->endpoints = calloc (ntotal_procs, sizeof (mca_btl_base_endpoint_t *)); + + if (OPAL_UNLIKELY(NULL == ugni_module->endpoints)) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + rc = ompi_free_list_init_new (&mca_btl_ugni_component.ugni_frags_eager, + sizeof (mca_btl_ugni_base_frag_t), + opal_cache_line_size, OBJ_CLASS(mca_btl_ugni_base_frag_t), + sizeof (mca_btl_ugni_frag_hdr_t) + mca_btl_ugni_component.eager_limit, + opal_cache_line_size, + mca_btl_ugni_component.ugni_free_list_num, + mca_btl_ugni_component.ugni_free_list_max, + mca_btl_ugni_component.ugni_free_list_inc, + NULL); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + return rc; + } + + rc = ompi_free_list_init_new (&mca_btl_ugni_component.ugni_frags_rdma, + sizeof (mca_btl_ugni_rdma_frag_t), + opal_cache_line_size, OBJ_CLASS(mca_btl_ugni_rdma_frag_t), + 0, opal_cache_line_size, + mca_btl_ugni_component.ugni_free_list_num, + mca_btl_ugni_component.ugni_free_list_max, + mca_btl_ugni_component.ugni_free_list_inc, + NULL); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + return rc; + } + } + + for (i = 0 ; i < nprocs ; ++i) { + struct ompi_proc_t *ompi_proc = procs[i]; + uint32_t rem_rank = ompi_proc->proc_name.vpid; + + if (OPAL_PROC_ON_LOCAL_NODE(ompi_proc->proc_flags)) { + /* ignore local procs */ + peers[i] = 
NULL; + continue; + } + + /* Create and Init endpoints */ + rc = mca_btl_ugni_init_ep (peers + i, (mca_btl_ugni_module_t *) btl, ompi_proc); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + BTL_ERROR(("btl/ugni error initializing endpoint")); + return rc; + } + + /* Set the reachable bit */ + rc = opal_bitmap_set_bit (reachable, i); + + /* Store a reference to this peer */ + ugni_module->endpoints[rem_rank] = peers[i]; + } + + ugni_module->endpoint_count += nprocs; + + return OMPI_SUCCESS; +} + +int mca_btl_ugni_del_procs (struct mca_btl_base_module_t *btl, + size_t nprocs, struct ompi_proc_t **procs, + struct mca_btl_base_endpoint_t **peers) { + mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl; + size_t i; + + /* NTH: this function destroys the endpoint list which will cause bad + things to happen if the caller only wants to delete a few procs. */ + + for (i = 0 ; i < nprocs ; ++i) { + struct ompi_proc_t *ompi_proc = procs[i]; + uint32_t rem_rank = ompi_proc->proc_name.vpid; + + if (ugni_module->endpoints[rem_rank]) { + mca_btl_ugni_release_ep (ugni_module->endpoints[rem_rank]); + } + + ugni_module->endpoints[rem_rank] = NULL; + } + + ugni_module->endpoint_count -= nprocs; + + if (0 == ugni_module->endpoint_count) { + free (ugni_module->endpoints); + ugni_module->endpoints = NULL; + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/btl/ugni/btl_ugni_component.c b/ompi/mca/btl/ugni/btl_ugni_component.c new file mode 100644 index 0000000000..69eed74677 --- /dev/null +++ b/ompi/mca/btl/ugni/btl_ugni_component.c @@ -0,0 +1,627 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "btl_ugni.h" +#include "btl_ugni_endpoint.h" +#include "btl_ugni_frag.h" +#include "btl_ugni_rdma.h" + +#include "opal/mca/base/mca_base_param.h" +#include "opal/memoryhooks/memory.h" +#include "ompi/runtime/params.h" + +int mca_btl_ugni_smsg_max_credits = 32; +int mca_btl_ugni_smsg_mbox_size; + +static int btl_ugni_component_register(void); +static int btl_ugni_component_open(void); +static int btl_ugni_component_close(void); +static mca_btl_base_module_t **mca_btl_ugni_component_init(int *, bool, bool); +static int mca_btl_ugni_component_progress(void); + +mca_btl_ugni_component_t mca_btl_ugni_component = { + { + /* First, the mca_base_component_t struct containing meta information + about the component itself */ + + { + MCA_BTL_BASE_VERSION_2_0_0, + + "ugni", /* MCA component name */ + OMPI_MAJOR_VERSION, /* MCA component major version */ + OMPI_MINOR_VERSION, /* MCA component minor version */ + OMPI_RELEASE_VERSION, /* MCA component release version */ + btl_ugni_component_open, /* component open */ + btl_ugni_component_close, /* component close */ + NULL, /* component query */ + btl_ugni_component_register, /* component register */ + }, + { + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + mca_btl_ugni_component_init, + mca_btl_ugni_component_progress, + } +}; + +static inline char * +mca_btl_ugni_param_register_string(const char *param_name, + const char *default_value) +{ + char *param_value; + int id = mca_base_param_register_string("btl", "ugni", param_name, NULL, + default_value); + mca_base_param_lookup_string(id, ¶m_value); + return param_value; +} + +static inline int +mca_btl_ugni_param_register_int (const char *param_name, int value) +{ + int id = mca_base_param_register_int("btl", "ugni", param_name, NULL, value); + mca_base_param_lookup_int(id, &value); + return value; +} + +static int +btl_ugni_component_register(void) +{ + mca_btl_ugni_component.ugni_free_list_num 
= + mca_btl_ugni_param_register_int("free_list_num", 8); + mca_btl_ugni_component.ugni_free_list_max = + mca_btl_ugni_param_register_int("free_list_max", -1); + mca_btl_ugni_component.ugni_free_list_inc = + mca_btl_ugni_param_register_int("free_list_inc", 64); + + mca_btl_ugni_component.cq_size = + mca_btl_ugni_param_register_int("cq_size", 25000); + + mca_btl_ugni_component.btl_fma_limit = + mca_btl_ugni_param_register_int("fma_limit", 4 * 1024); + + mca_btl_ugni_component.btl_get_limit = + mca_btl_ugni_param_register_int("get_limit", 8 * 1024); + + mca_btl_ugni_component.rdma_max_retries = + mca_btl_ugni_param_register_int("rdma_max_retries", 8); + + mca_btl_ugni_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH; + + /* smsg threshold */ + mca_btl_ugni_module.super.btl_eager_limit = 0; /* set dynamically in module_init */ + mca_btl_ugni_module.super.btl_rndv_eager_limit = 8 * 1024; + mca_btl_ugni_module.super.btl_rdma_pipeline_frag_size = 2 * 1024 * 1024; + mca_btl_ugni_module.super.btl_max_send_size = 0; /* set this later */ + mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = 0; /* set this later */ + + /* threshold for put */ + mca_btl_ugni_module.super.btl_min_rdma_pipeline_size = 0; + + mca_btl_ugni_module.super.btl_flags = MCA_BTL_FLAGS_SEND | + MCA_BTL_FLAGS_RDMA | + MCA_BTL_FLAGS_RDMA_MATCHED; + + mca_btl_ugni_module.super.btl_bandwidth = 40000; /* Mbs */ + mca_btl_ugni_module.super.btl_latency = 2; /* Microsecs */ + + /* Call the BTL based to register its MCA params */ + mca_btl_base_param_register(&mca_btl_ugni_component.super.btl_version, + &mca_btl_ugni_module.super); + return OMPI_SUCCESS; +} + +static int +btl_ugni_component_open(void) +{ + mca_btl_ugni_component.ugni_num_btls = 0; + mca_btl_ugni_component.modules = NULL; + + OBJ_CONSTRUCT(&mca_btl_ugni_component.ugni_frags_eager, ompi_free_list_t); + OBJ_CONSTRUCT(&mca_btl_ugni_component.ugni_frags_rdma, ompi_free_list_t); + + return OMPI_SUCCESS; +} + +/* + * component cleanup - 
sanity checking of queue lengths + */ +static int +btl_ugni_component_close(void) +{ + ompi_common_ugni_fini (); + + OBJ_DESTRUCT(&mca_btl_ugni_component.ugni_frags_eager); + OBJ_DESTRUCT(&mca_btl_ugni_component.ugni_frags_rdma); + + return OMPI_SUCCESS; +} + +static void mca_btl_ugni_autoset_leave_pinned (void) { + mca_base_param_source_t source; + int index, rc, value; + + /* If we have a memory manager available, and + mpi_leave_pinned==-1, then unless the user explicitly set + mpi_leave_pinned_pipeline==0, then set mpi_leave_pinned to 1. + We have a memory manager if we have both FREE and MUNMAP + support */ + value = opal_mem_hooks_support_level(); + if ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) == + ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) & value)) { + rc = 0; + index = mca_base_param_find("mpi", NULL, "leave_pinned"); + if (index >= 0) { + if (OPAL_SUCCESS == mca_base_param_lookup_int(index, &value) && + -1 == value) { + ++rc; + } + } + index = mca_base_param_find("mpi", NULL, "leave_pinned_pipeline"); + if (index >= 0) { + if (OPAL_SUCCESS == mca_base_param_lookup_int(index, &value) && + OPAL_SUCCESS == mca_base_param_lookup_source(index, &source, + NULL)) { + if (0 == value && MCA_BASE_PARAM_SOURCE_DEFAULT == source) { + ++rc; + } + } + } + /* If we were good on both parameters, then set leave_pinned=1 */ + if (2 == rc) { + ompi_mpi_leave_pinned = 1; + ompi_mpi_leave_pinned_pipeline = 0; + } + } +} + +static int mca_btl_ugni_smsg_setup (void) { + gni_smsg_attr_t tmp_smsg_attrib; + unsigned int mbox_size; + int rc; + + /* calculate mailbox size */ + tmp_smsg_attrib.msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT; + tmp_smsg_attrib.msg_maxsize = mca_btl_ugni_component.eager_limit + sizeof (mca_btl_ugni_frag_hdr_t); + tmp_smsg_attrib.mbox_maxcredit = mca_btl_ugni_smsg_max_credits; + + rc = GNI_SmsgBufferSizeNeeded (&tmp_smsg_attrib, &mbox_size); + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { + BTL_ERROR(("error in 
GNI_SmsgBufferSizeNeeded")); + return ompi_common_rc_ugni_to_ompi (rc); + } + + mca_btl_ugni_smsg_mbox_size = ((mbox_size + opal_cache_line_size - 1)/opal_cache_line_size) * opal_cache_line_size; + + return OMPI_SUCCESS; +} + +static mca_btl_base_module_t ** +mca_btl_ugni_component_init (int *num_btl_modules, + bool enable_progress_threads, + bool enable_mpi_threads) +{ + struct mca_btl_base_module_t **base_modules; + mca_btl_ugni_module_t *ugni_modules; + unsigned int i; + size_t nprocs; + int rc; + + /* Initialize ugni library and create communication domain */ + rc = ompi_common_ugni_init(); + if (OMPI_SUCCESS != rc) { + return NULL; + } + + /* Create and initialize modules + * Create one module per device + * One btl == One module + */ + /* Manju: I should set this automatically, not hardcoded */ + mca_btl_ugni_component.ugni_num_btls = ompi_common_ugni_module.device_count; + + BTL_VERBOSE(("btl/ugni initializing")); + + ugni_modules = mca_btl_ugni_component.modules = (mca_btl_ugni_module_t *) + calloc (mca_btl_ugni_component.ugni_num_btls, + sizeof (mca_btl_ugni_module_t)); + + if (OPAL_UNLIKELY(NULL == mca_btl_ugni_component.modules)) { + BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__)); + return NULL; + } + + base_modules = (struct mca_btl_base_module_t **) + calloc (mca_btl_ugni_component.ugni_num_btls, + sizeof (struct mca_btl_base_module_t *)); + if (OPAL_UNLIKELY(NULL == base_modules)) { + BTL_ERROR(("Malloc failed : %s:%d", __FILE__, __LINE__)); + return NULL; + } + + mca_btl_ugni_autoset_leave_pinned (); + + (void) ompi_proc_world (&nprocs); + + if (0 == mca_btl_ugni_component.eager_limit) { + /* auto-set the eager limit based on the number of ranks */ + if (nprocs <= 1024) { + mca_btl_ugni_component.eager_limit = 1024; + } else if (nprocs <= 16384) { + mca_btl_ugni_component.eager_limit = 512; + } else { + mca_btl_ugni_component.eager_limit = 256; + } + } + + rc = mca_btl_ugni_smsg_setup (); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + return 
NULL; + } + + for (i = 0 ; i < mca_btl_ugni_component.ugni_num_btls ; ++i) { + mca_btl_ugni_module_t *ugni_module = ugni_modules + i; + + rc = mca_btl_ugni_module_init (ugni_module, + ompi_common_ugni_module.devices + i); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + BTL_ERROR(("Failed to initialize uGNI module @ %s:%d", __FILE__, + __LINE__)); + return NULL; + } + + base_modules[i] = (mca_btl_base_module_t *) ugni_module; + } + + *num_btl_modules = mca_btl_ugni_component.ugni_num_btls; + + /* XXX TODO remove before release */ + signal (SIGSEGV, SIG_DFL); + + BTL_VERBOSE(("btl/ugni done initializing modules")); + + return base_modules; +} + +static inline void mca_btl_ugni_callback_reverse_get (mca_btl_base_module_t *btl, + mca_btl_base_endpoint_t *ep, + mca_btl_base_descriptor_t *des, + int rc) +{ + mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl; + mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) des; + uint32_t msg_id = ORTE_PROC_MY_NAME->vpid; + + BTL_VERBOSE(("reverse get (put) for rem_ctx %p complete", des->des_cbdata)); + + /* tell peer the put is complete */ + rc = GNI_SmsgSendWTag (frag->endpoint->common->ep_handle, &des->des_cbdata, sizeof (void *), + NULL, 0, msg_id, MCA_BTL_UGNI_TAG_PUT_COMPLETE); + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { + /* turn off btl ownership for now */ + des->des_flags &= ~MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; + opal_list_append (&ugni_module->failed_frags, (opal_list_item_t *) des); + } else { + des->des_flags |= MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; + } +} + +static inline int mca_btl_ugni_start_progress_reverse_get (mca_btl_base_endpoint_t *ep, + mca_btl_base_segment_t *segments, + void *rem_ctx) +{ + mca_btl_ugni_base_frag_t *frag; + int rc; + + BTL_VERBOSE(("starting reverse get (put) for remote ctx: %p", rem_ctx)); + + MCA_BTL_UGNI_FRAG_ALLOC_RDMA(ep->btl, frag, rc); + if (OPAL_UNLIKELY(NULL == frag)) { + BTL_ERROR(("error allocating rdma frag for reverse get")); + return rc; + } + + 
frag->base.des_cbfunc = mca_btl_ugni_callback_reverse_get; + frag->base.des_cbdata = rem_ctx; + frag->endpoint = ep; + + memmove (&frag->segments, segments, 2 * sizeof (segments[0])); + + frag->base.des_src = frag->segments; + frag->base.des_src_cnt = 1; + frag->base.des_dst = frag->segments + 1; + frag->base.des_dst_cnt = 1; + + rc = mca_btl_ugni_put (&ep->btl->super, ep, &frag->base); + assert (OMPI_SUCCESS == rc); + + return rc; +} + +static inline int +mca_btl_ugni_smsg_process (mca_btl_base_endpoint_t *ep) +{ + mca_btl_active_message_callback_t *reg; + mca_btl_ugni_base_frag_t frag; + mca_btl_base_segment_t *segments; + mca_btl_ugni_frag_hdr_t *hdr; + uintptr_t data_ptr; + int tries = 3; + int count = 0; + int rc; + + do { + uint8_t tag = GNI_SMSG_ANY_TAG; + + rc = GNI_SmsgGetNextWTag (ep->common->ep_handle, (void **) &data_ptr, &tag); + if (GNI_RC_SUCCESS != rc) { + BTL_VERBOSE(("no smsg message waiting. rc = %d", rc)); + continue; + } + + if (OPAL_UNLIKELY(0 == data_ptr)) { + BTL_ERROR(("null data ptr!")); + return OMPI_ERROR; + } + + count++; + + BTL_VERBOSE(("got smsg fragment. tag = %d\n", tag)); + + switch (tag) { + case MCA_BTL_UGNI_TAG_SEND: + hdr = (mca_btl_ugni_frag_hdr_t *) data_ptr; + + BTL_VERBOSE(("received smsg fragment. 
hdr = {len = %u, tag = %d}", + (unsigned int) hdr->len, hdr->tag)); + + reg = mca_btl_base_active_message_trigger + hdr->tag; + frag.base.des_dst = frag.segments; + frag.base.des_dst_cnt = 1; + + frag.segments[0].seg_addr.pval = (void *)(data_ptr + sizeof (*hdr)); + frag.segments[0].seg_len = hdr->len; + + reg->cbfunc(&ep->btl->super, hdr->tag, &(frag.base), reg->cbdata); + + break; + case MCA_BTL_UGNI_TAG_DISCONNECT: + /* remote endpoint has disconnected */ + rc = GNI_SmsgRelease (ep->common->ep_handle); + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { + BTL_ERROR(("Smsg release failed!")); + return OMPI_ERROR; + } + + mca_btl_ugni_ep_disconnect (ep, false); + + return count; + case MCA_BTL_UGNI_TAG_PUT_INIT: + segments = (mca_btl_base_segment_t *) data_ptr; + + mca_btl_ugni_start_progress_reverse_get (ep, segments, + ((void **)(segments + 2))[0]); + + break; + case MCA_BTL_UGNI_TAG_PUT_COMPLETE: + mca_btl_ugni_post_frag_complete (((void **)data_ptr)[0], OMPI_SUCCESS); + + break; + default: + break; +/* BTL_ERROR(("unknown tag %d\n", tag)); */ + } + + rc = GNI_SmsgRelease (ep->common->ep_handle); + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { + BTL_ERROR(("Smsg release failed!")); + return OMPI_ERROR; + } + } while (tries--); + + /* finished processing events */ + return count; +} + +static inline int +mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *btl) +{ + uint32_t remote_addr, remote_id; + uint64_t datagram_id; + mca_btl_base_endpoint_t *ep; + gni_ep_handle_t handle; + gni_post_state_t post_state; + int rc, count; + + count = 0; + + post_state = GNI_POST_PENDING; + rc = GNI_PostDataProbeById (btl->device->dev_handle, &datagram_id); + if (OPAL_LIKELY(GNI_RC_SUCCESS != rc)) { + return 0; + } + + if ((datagram_id & MCA_BTL_UGNI_DATAGRAM_MASK) == + MCA_BTL_UGNI_CONNECT_WILDCARD_ID) { + handle = btl->wildcard_ep; + } else { + handle = + btl->endpoints[(uint32_t)(datagram_id & 0xffffffffull)]->common->ep_handle; + } + + /* wait for the incoming datagram to complete 
(in case it isn't) */ + rc = GNI_EpPostDataWaitById (handle, datagram_id, -1, &post_state, + &remote_addr, &remote_id); + if (GNI_RC_SUCCESS != rc) { + BTL_ERROR(("GNI_EpPostDataWaitById failed with rc = %d", rc)); + return ompi_common_rc_ugni_to_ompi (rc); + } + + BTL_VERBOSE(("got a datagram completion: id = %" PRIx64 ", state = %d, " + "peer = %d", datagram_id, post_state, remote_id)); + + ep = btl->endpoints[remote_id]; + + OPAL_THREAD_LOCK(&ep->common->lock); + + /* NTH: TODO -- error handling */ + (void) mca_btl_ugni_ep_connect_progress (ep); + + if (ep->smsgs_waiting && OMPI_COMMON_UGNI_CONNECTED == MCA_BTL_UGNI_EP_STATE(ep)) { + /* process messages waiting in the endpoint's smsg mailbox */ + while ((rc = mca_btl_ugni_smsg_process (ep) > 0)) count += rc; + ep->smsgs_waiting = false; + } + + OPAL_THREAD_UNLOCK(&ep->common->lock); + + if ((datagram_id & MCA_BTL_UGNI_DATAGRAM_MASK) == + MCA_BTL_UGNI_CONNECT_WILDCARD_ID) { + mca_btl_ugni_wildcard_ep_post (btl); + } + + return count; +} + +static inline int +mca_btl_ugni_handle_smsg_overrun (mca_btl_ugni_module_t *btl) +{ + gni_cq_entry_t event_data; + unsigned int ep_index; + int count, rc; + + BTL_VERBOSE(("btl/ugni_component detect SMSG CQ overrun. " + "processing message backlog...")); + + /* we don't know which endpoint lost an smsg completion. 
clear the + smsg cq and check all mailboxes */ + + /* clear out remote cq */ + do { + rc = GNI_CqGetEvent (btl->smsg_remote_cq, &event_data); + } while (GNI_RC_SUCCESS == rc); + + count = 0; + + for (ep_index = 0 ; ep_index < btl->endpoint_count ; ++ep_index) { + mca_btl_base_endpoint_t *ep = btl->endpoints[ep_index]; + + if (NULL == ep || OMPI_COMMON_UGNI_CONNECTED != MCA_BTL_UGNI_EP_STATE(ep)) { + continue; + } + + do { + /* clear out smsg mailbox */ + rc = mca_btl_ugni_smsg_process (ep); + if (rc > 0) + count += rc; + } while (rc > 0); + } + + return count; +} + +static inline int +mca_btl_ugni_progress_smsg (mca_btl_ugni_module_t *btl) +{ + mca_btl_base_endpoint_t *ep; + gni_cq_entry_t event_data; + int rc; + + rc = GNI_CqGetEvent (btl->smsg_remote_cq, &event_data); + if (GNI_RC_NOT_DONE == rc) { + return 0; + } + + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc || !GNI_CQ_STATUS_OK(event_data) || + GNI_CQ_OVERRUN(event_data))) { + if (GNI_RC_ERROR_RESOURCE == rc || + (GNI_RC_SUCCESS == rc && GNI_CQ_OVERRUN(event_data))) { + /* recover from smsg cq overrun */ + return mca_btl_ugni_handle_smsg_overrun (btl); + } + + BTL_ERROR(("unhandled error in GNI_CqGetEvent")); + + /* unhandled error: crash */ + assert (0); + return OMPI_ERROR; + } + + BTL_VERBOSE(("REMOTE CQ: Got event 0x%" PRIx64 ". msg id = %" PRIu64 + ". ok = %d, type = %" PRIu64 "\n", (uint64_t) event_data, + GNI_CQ_GET_MSG_ID(event_data), GNI_CQ_STATUS_OK(event_data), + GNI_CQ_GET_TYPE(event_data))); + + /* we could check the message type here but it seems to always be a POST */ + + ep = btl->endpoints[GNI_CQ_GET_MSG_ID(event_data)]; + if (OPAL_UNLIKELY(OMPI_COMMON_UGNI_CONNECTED != MCA_BTL_UGNI_EP_STATE(ep))) { + /* due to the nature of datagrams we may get a smsg completion before + we get mailbox info from the peer */ + BTL_VERBOSE(("event occurred on an unconnected endpoint! 
ep state = %d", MCA_BTL_UGNI_EP_STATE(ep))); + + /* flag the endpoint as having messages waiting */ + ep->smsgs_waiting = true; + return 0; + } + + return mca_btl_ugni_smsg_process (ep); +} + +static inline int +mca_btl_ugni_progress_bte (mca_btl_ugni_module_t *btl) +{ + (void) ompi_common_ugni_process_completed_post (btl->device, btl->bte_local_cq); + + return 1; +} + +static int +mca_btl_ugni_retry_failed (mca_btl_ugni_module_t *btl) +{ + int count = opal_list_get_size (&btl->failed_frags); + opal_list_item_t *item; + + while (count-- && NULL != (item = opal_list_remove_first (&btl->failed_frags))) { + fprintf (stderr, "retrying frag %p\n", (void *) item); + mca_btl_ugni_post_frag_complete ((void *) item, OMPI_SUCCESS); + } + + return 0; +} + +static int +mca_btl_ugni_component_progress (void) +{ + mca_btl_ugni_module_t *btl; + unsigned int i, j, k; + int count; + + count = ompi_common_ugni_progress (); + + for (i = 0 ; i < mca_btl_ugni_component.ugni_num_btls ; ++i) { + btl = mca_btl_ugni_component.modules + i; + + mca_btl_ugni_retry_failed (btl); + + count += mca_btl_ugni_progress_datagram (btl); + for (j = 0 ; j < 2 ; ++j) { + for (k = 0 ; k < 5 ; ++k) { + count += mca_btl_ugni_progress_smsg (btl); + } + + count += mca_btl_ugni_progress_bte (btl); + } + } + + return count; +} diff --git a/ompi/mca/btl/ugni/btl_ugni_endpoint.c b/ompi/mca/btl/ugni/btl_ugni_endpoint.c new file mode 100644 index 0000000000..385ee11a74 --- /dev/null +++ b/ompi/mca/btl/ugni/btl_ugni_endpoint.c @@ -0,0 +1,200 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "btl_ugni.h" +#include "btl_ugni_endpoint.h" +#include "btl_ugni_frag.h" + +static void mca_btl_ugni_ep_construct (mca_btl_base_endpoint_t *ep); +static void mca_btl_ugni_ep_destruct (mca_btl_base_endpoint_t *ep); + +OBJ_CLASS_INSTANCE(mca_btl_base_endpoint_t, opal_object_t, + mca_btl_ugni_ep_construct, mca_btl_ugni_ep_destruct); + +static void mca_btl_ugni_ep_construct (mca_btl_base_endpoint_t *ep) +{ + OBJ_CONSTRUCT(&ep->pending_list, opal_list_t); + ep->smsgs_waiting = false; +} + +static void mca_btl_ugni_ep_destruct (mca_btl_base_endpoint_t *ep) +{ + OBJ_DESTRUCT(&ep->pending_list); +} + +static void mca_btl_ugni_smsg_mbox_construct (mca_btl_ugni_smsg_mbox_t *mbox) { + struct mca_btl_ugni_reg_t *reg = + (struct mca_btl_ugni_reg_t *) mbox->super.registration; + + mbox->buffer = mbox->super.ptr; + + /* initialize mailbox attributes */ + mbox->smsg_attrib.msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT; + mbox->smsg_attrib.msg_maxsize = mca_btl_ugni_component.eager_limit + + sizeof (mca_btl_ugni_frag_hdr_t); + mbox->smsg_attrib.mbox_maxcredit = mca_btl_ugni_smsg_max_credits; + mbox->smsg_attrib.mbox_offset = 0; /* autoselect */ + mbox->smsg_attrib.msg_buffer = mbox->buffer; + mbox->smsg_attrib.buff_size = mca_btl_ugni_smsg_mbox_size; + mbox->smsg_attrib.mem_hndl = reg->memory_hdl; +} + +OBJ_CLASS_INSTANCE(mca_btl_ugni_smsg_mbox_t, ompi_free_list_item_t, + mca_btl_ugni_smsg_mbox_construct, NULL); + +static inline int mca_btl_ugni_ep_smsg_get_mbox (mca_btl_base_endpoint_t *ep) { + mca_btl_ugni_module_t *ugni_module = ep->btl; + ompi_free_list_item_t *mbox; + int rc; + + OMPI_FREE_LIST_GET(&ugni_module->smsg_mboxes, mbox, rc); + if (OPAL_UNLIKELY(NULL == mbox)) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + ep->mailbox = (mca_btl_ugni_smsg_mbox_t *) mbox; + + /* per ugni spec we need to zero mailbox data before connecting */ + memset (ep->mailbox->buffer, 0, 
mca_btl_ugni_smsg_mbox_size); + return rc; +} + +int mca_btl_ugni_ep_disconnect (mca_btl_base_endpoint_t *ep, bool send_disconnect) { + uint32_t msg_id = ORTE_PROC_MY_NAME->vpid; + char msg; + int rc; + + OPAL_THREAD_LOCK(&ep->common->lock); + + do { + if (OMPI_COMMON_UGNI_INIT == MCA_BTL_UGNI_EP_STATE(ep)) { + /* nothing to do */ + break; + } + + if (OMPI_COMMON_UGNI_CONNECTED == MCA_BTL_UGNI_EP_STATE(ep) && send_disconnect) { + rc = GNI_SmsgSendWTag (ep->common->ep_handle, &msg, 1, NULL, 0, msg_id, + MCA_BTL_UGNI_TAG_DISCONNECT); + if (GNI_RC_SUCCESS != rc) { + BTL_VERBOSE(("btl/ugni could not send close message")); + } + + /* we might want to wait for local completion here (do we even care) */ + } + + ep->common->state = OMPI_COMMON_UGNI_BOUND; + + /* drop the lock before we unbind */ + OPAL_THREAD_UNLOCK(&ep->common->lock); + rc = ompi_common_ugni_endpoint_unbind (ep->common); + OPAL_THREAD_LOCK(&ep->common->lock); + if (OMPI_SUCCESS != rc) { + BTL_VERBOSE(("btl/ugni error unbinding ugni endpoint")); + } + + OMPI_FREE_LIST_RETURN(&ep->btl->smsg_mboxes, ((ompi_free_list_item_t *) ep->mailbox)); + ep->mailbox = NULL; + } while (0); + + OPAL_THREAD_UNLOCK(&ep->common->lock); + + return OMPI_SUCCESS; +} + +static inline int mca_btl_ugni_ep_connect_start (mca_btl_base_endpoint_t *ep) { + int rc; + + BTL_VERBOSE(("initiaiting connection to remote peer with address: %u id: %u", + ep->common->ep_rem_addr, ep->common->ep_rem_id)); + + /* bind endpoint to remote address */ + OPAL_THREAD_UNLOCK(&ep->common->lock); + rc = ompi_common_ugni_endpoint_bind (ep->common); + OPAL_THREAD_LOCK(&ep->common->lock); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + return rc; + } + + MCA_BTL_UGNI_EP_STATE(ep) = OMPI_COMMON_UGNI_CONNECTING; + + /* build connection data */ + rc = mca_btl_ugni_ep_smsg_get_mbox (ep); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + return rc; + } + + memset (&ep->remote_smsg_attrib, 0, sizeof (ep->remote_smsg_attrib)); + + BTL_VERBOSE(("btl/ugni connection to 
remote peer initiated")); + + return OMPI_SUCCESS; +} + +static inline int mca_btl_ugni_ep_connect_finish (mca_btl_base_endpoint_t *ep) { + opal_list_item_t *item; + int rc; + + BTL_VERBOSE(("finishing connection. remote attributes: msg_type = %d, msg_buffer = %p, buff_size = %d, " + "mem_hndl = {qword1 = %" PRIu64 ", qword2 = %" PRIu64 "}, mbox = %d, mbox_maxcredit = %d, " + "msg_maxsize = %d", ep->remote_smsg_attrib.msg_type, ep->remote_smsg_attrib.msg_buffer, + ep->remote_smsg_attrib.buff_size, ep->remote_smsg_attrib.mem_hndl.qword1, + ep->remote_smsg_attrib.mem_hndl.qword2, ep->remote_smsg_attrib.mbox_offset, + ep->remote_smsg_attrib.mbox_maxcredit, ep->remote_smsg_attrib.msg_maxsize)); + + BTL_VERBOSE(("finishing connection. local attributes: msg_type = %d, msg_buffer = %p, buff_size = %d, " + "mem_hndl = {qword1 = %" PRIu64 ", qword2 = %" PRIu64 "}, mbox = %d, mbox_maxcredit = %d, " + "msg_maxsize = %d", ep->mailbox->smsg_attrib.msg_type, ep->mailbox->smsg_attrib.msg_buffer, + ep->mailbox->smsg_attrib.buff_size, ep->mailbox->smsg_attrib.mem_hndl.qword1, + ep->mailbox->smsg_attrib.mem_hndl.qword2, ep->mailbox->smsg_attrib.mbox_offset, + ep->mailbox->smsg_attrib.mbox_maxcredit, ep->mailbox->smsg_attrib.msg_maxsize)); + + rc = GNI_SmsgInit (ep->common->ep_handle, &ep->mailbox->smsg_attrib, &ep->remote_smsg_attrib); + if (GNI_RC_SUCCESS != rc) { + BTL_ERROR(("error initializing SMSG protocol. rc = %d", rc)); + return ompi_common_rc_ugni_to_ompi (rc); + } + + BTL_VERBOSE(("endpoint connected. 
posting %u sends", (unsigned int) opal_list_get_size (&ep->pending_list))); + + MCA_BTL_UGNI_EP_STATE(ep) = OMPI_COMMON_UGNI_CONNECTED; + + /* post pending sends */ + while (NULL != (item = opal_list_remove_first (&ep->pending_list))) { + mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) item; + + (void) mca_btl_ugni_send (&ep->btl->super, ep, &frag->base, frag->tag); + } + + return OMPI_SUCCESS; +} + +int mca_btl_ugni_ep_connect_progress (mca_btl_base_endpoint_t *ep) { + int rc; + + if (OMPI_COMMON_UGNI_CONNECTED == MCA_BTL_UGNI_EP_STATE(ep)) { + return OMPI_SUCCESS; + } + + if (OMPI_COMMON_UGNI_CONNECTING > ep->common->state) { + rc = mca_btl_ugni_ep_connect_start (ep); + if (OMPI_SUCCESS != rc) { + return rc; + } + } + + if (GNI_SMSG_TYPE_INVALID == ep->remote_smsg_attrib.msg_type) { + (void) mca_btl_ugni_directed_ep_post (ep); + return OMPI_ERR_RESOURCE_BUSY; + } + + return mca_btl_ugni_ep_connect_finish (ep); +} diff --git a/ompi/mca/btl/ugni/btl_ugni_endpoint.h b/ompi/mca/btl/ugni/btl_ugni_endpoint.h new file mode 100644 index 0000000000..9c5fcce552 --- /dev/null +++ b/ompi/mca/btl/ugni/btl_ugni_endpoint.h @@ -0,0 +1,135 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_BTL_UGNI_ENDPOINT_H +#define MCA_BTL_UGNI_ENDPOINT_H + +#include "btl_ugni.h" + +struct mca_btl_ugni_smsg_mbox_t { + ompi_free_list_item_t super; + + void *buffer; + gni_smsg_attr_t smsg_attrib; +}; +typedef struct mca_btl_ugni_smsg_mbox_t mca_btl_ugni_smsg_mbox_t; + +OBJ_CLASS_DECLARATION(mca_btl_ugni_smsg_mbox_t); + +struct mca_btl_base_endpoint_t { + opal_object_t super; + + ompi_common_ugni_endpoint_t *common; + + mca_btl_ugni_module_t *btl; + + gni_smsg_attr_t remote_smsg_attrib; + + mca_btl_ugni_smsg_mbox_t *mailbox; + + opal_list_t pending_list; + + /* true if a frag was received before the connection was complete */ + bool smsgs_waiting; +}; +typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t; + +#define MCA_BTL_UGNI_EP_STATE(ep) ((ep)->common->state) + +OBJ_CLASS_DECLARATION(mca_btl_base_endpoint_t); + +int mca_btl_ugni_ep_connect_progress (mca_btl_base_endpoint_t *ep); +int mca_btl_ugni_ep_disconnect (mca_btl_base_endpoint_t *ep, bool send_disconnect); + +static inline int mca_btl_ugni_init_ep (mca_btl_base_endpoint_t **ep, + mca_btl_ugni_module_t *btl, + ompi_proc_t *peer_proc) { + mca_btl_base_endpoint_t *endpoint; + int rc; + + endpoint = OBJ_NEW(mca_btl_base_endpoint_t); + assert (endpoint != NULL); + + rc = ompi_common_ugni_endpoint_for_proc (btl->device, peer_proc, &endpoint->common); + if (OMPI_SUCCESS != rc) { + assert (0); + return rc; + } + + endpoint->btl = btl; + + *ep = endpoint; + + return OMPI_SUCCESS; +} + +static inline void mca_btl_ugni_release_ep (mca_btl_base_endpoint_t *ep) { + int rc; + + rc = mca_btl_ugni_ep_disconnect (ep, false); + if (OMPI_SUCCESS == rc) { + BTL_VERBOSE(("btl/ugni error disconnecting endpoint")); + } + + ompi_common_ugni_endpoint_return (ep->common); + + OBJ_RELEASE(ep); +} + +static inline int mca_btl_ugni_check_endpoint_state (mca_btl_base_endpoint_t *ep) { + int rc; + + if 
(OPAL_LIKELY(OMPI_COMMON_UGNI_CONNECTED == ep->common->state)) { + return OMPI_SUCCESS; + } + + OPAL_THREAD_LOCK(&ep->common->lock); + + switch (ep->common->state) { + case OMPI_COMMON_UGNI_INIT: + rc = mca_btl_ugni_ep_connect_progress (ep); + if (OMPI_SUCCESS != rc) { + break; + } + case OMPI_COMMON_UGNI_CONNECTING: + rc = OMPI_ERR_RESOURCE_BUSY; + break; + default: + rc = OMPI_SUCCESS; + } + + OPAL_THREAD_UNLOCK(&ep->common->lock); + + return rc; +} + +static inline int mca_btl_ugni_wildcard_ep_post (mca_btl_ugni_module_t *ugni_module) { + int rc; + + memset (&ugni_module->wc_local_attr, 0, sizeof (ugni_module->wc_local_attr)); + rc = GNI_EpPostDataWId (ugni_module->wildcard_ep, &ugni_module->wc_local_attr, sizeof (ugni_module->wc_local_attr), + &ugni_module->wc_remote_attr, sizeof (ugni_module->wc_remote_attr), + MCA_BTL_UGNI_CONNECT_WILDCARD_ID | ORTE_PROC_MY_NAME->vpid); + + return ompi_common_rc_ugni_to_ompi (rc); +} + +static inline int mca_btl_ugni_directed_ep_post (mca_btl_base_endpoint_t *ep) { + int rc; + rc = GNI_EpPostDataWId (ep->common->ep_handle, &ep->mailbox->smsg_attrib, sizeof (ep->mailbox->smsg_attrib), + &ep->remote_smsg_attrib, sizeof (ep->remote_smsg_attrib), + MCA_BTL_UGNI_CONNECT_DIRECTED_ID | ep->common->ep_rem_id); + + return ompi_common_rc_ugni_to_ompi (rc); +} + +#endif /* MCA_BTL_UGNI_ENDPOINT_H */ diff --git a/ompi/mca/btl/ugni/btl_ugni_frag.c b/ompi/mca/btl/ugni/btl_ugni_frag.c new file mode 100644 index 0000000000..723663f87c --- /dev/null +++ b/ompi/mca/btl/ugni/btl_ugni_frag.c @@ -0,0 +1,41 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "btl_ugni.h" +#include "btl_ugni_frag.h" + +static inline void mca_btl_ugni_frag_constructor (mca_btl_ugni_base_frag_t *frag) +{ + /* send memory does not need to be registered so we do not need a mpool */ + frag->hdr = (mca_btl_ugni_frag_hdr_t *) calloc (1, sizeof (mca_btl_ugni_frag_hdr_t) + mca_btl_ugni_component.eager_limit); + frag->segments[0].seg_addr.pval = (void *) (frag->hdr + 1); +} + +static inline void mca_btl_ugni_frag_destructor (mca_btl_ugni_base_frag_t *frag) +{ + if (NULL != frag->hdr) { + free (frag->hdr); + } +} + +static inline void mca_btl_ugni_rdma_frag_constructor (mca_btl_ugni_base_frag_t *frag) +{ + /* we don't need any buffer memory for rdma frags */ + frag->hdr = NULL; + frag->segments[0].seg_addr.pval = NULL; +} + +OBJ_CLASS_INSTANCE(mca_btl_ugni_base_frag_t, mca_btl_base_descriptor_t, + mca_btl_ugni_frag_constructor, mca_btl_ugni_frag_destructor); + +OBJ_CLASS_INSTANCE(mca_btl_ugni_rdma_frag_t, mca_btl_base_descriptor_t, + mca_btl_ugni_rdma_frag_constructor, NULL); diff --git a/ompi/mca/btl/ugni/btl_ugni_frag.h b/ompi/mca/btl/ugni/btl_ugni_frag.h new file mode 100644 index 0000000000..b8f35b2a8d --- /dev/null +++ b/ompi/mca/btl/ugni/btl_ugni_frag.h @@ -0,0 +1,74 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#if !defined(MCA_BTL_UGNI_FRAG_H) +#define MCA_BTL_UGNI_FRAG_H + +#include "btl_ugni.h" +#include "btl_ugni_endpoint.h" + +struct mca_btl_ugni_frag_hdr_t { + size_t len; + mca_btl_base_tag_t tag; +}; +typedef struct mca_btl_ugni_frag_hdr_t mca_btl_ugni_frag_hdr_t; + +struct mca_btl_ugni_base_frag_t { + mca_btl_base_descriptor_t base; + mca_btl_base_segment_t segments[2]; + mca_btl_ugni_frag_hdr_t *hdr; + mca_btl_base_tag_t tag; + ompi_common_ugni_post_desc_t post_desc; + mca_btl_base_endpoint_t *endpoint; + mca_btl_ugni_reg_t *registration; + ompi_free_list_t *my_list; + mca_btl_ugni_module_t *btl; + int tries; +}; + +typedef struct mca_btl_ugni_base_frag_t mca_btl_ugni_base_frag_t; +typedef struct mca_btl_ugni_base_frag_t mca_btl_ugni_rdma_frag_t; + +#define MCA_BTL_UGNI_DESC_TO_FRAG(desc) ((mca_btl_ugni_base_frag_t *)((uintptr_t) (desc) - offsetof (mca_btl_ugni_base_frag_t, post_desc))) + +OBJ_CLASS_DECLARATION(mca_btl_ugni_base_frag_t); +OBJ_CLASS_DECLARATION(mca_btl_ugni_rdma_frag_t); + +#define MCA_BTL_UGNI_FRAG_ALLOC_EAGER(module, frag, rc) \ + do { \ + ompi_free_list_item_t *item; \ + OMPI_FREE_LIST_GET(&mca_btl_ugni_component.ugni_frags_eager, item, rc); \ + frag = (mca_btl_ugni_base_frag_t *) item; /* callers test frag against NULL afterwards, so never dereference a NULL item here */ \ + if (OPAL_LIKELY(NULL != frag)) frag->my_list = &mca_btl_ugni_component.ugni_frags_eager; \ + if (OPAL_LIKELY(NULL != frag)) frag->btl = (module); \ + } while (0) + +#define MCA_BTL_UGNI_FRAG_ALLOC_RDMA(module, frag, rc) \ + do { \ + ompi_free_list_item_t *item; \ + OMPI_FREE_LIST_GET(&mca_btl_ugni_component.ugni_frags_rdma, item, rc); \ + frag = (mca_btl_ugni_base_frag_t *) item; /* callers test frag against NULL afterwards, so never dereference a NULL item here */ \ + if (OPAL_LIKELY(NULL != frag)) frag->my_list = &mca_btl_ugni_component.ugni_frags_rdma; \ + if (OPAL_LIKELY(NULL != frag)) frag->btl = (module); \ + } while (0) + +#define MCA_BTL_UGNI_FRAG_RETURN(frag) \ + do { \ + if (OPAL_UNLIKELY(NULL != (frag)->registration)) { \ + (frag)->btl->super.btl_mpool->mpool_deregister((frag)->btl->super.btl_mpool, \ + &(frag)->registration->base); \ + (frag)->registration = NULL; \ + } \ 
+ OMPI_FREE_LIST_RETURN((frag)->my_list, (ompi_free_list_item_t *)(frag)); \ + } while (0); + +#endif /* MCA_BTL_UGNI_FRAG_H */ diff --git a/ompi/mca/btl/ugni/btl_ugni_get.c b/ompi/mca/btl/ugni/btl_ugni_get.c new file mode 100644 index 0000000000..acf3b246b6 --- /dev/null +++ b/ompi/mca/btl/ugni/btl_ugni_get.c @@ -0,0 +1,65 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "btl_ugni_rdma.h" + +/** + * Initiate a get operation. + * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param descriptor (IN) Description of the data to be transferred + */ +int mca_btl_ugni_get (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + struct mca_btl_base_descriptor_t *des) { + mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) des; + gni_mem_handle_t lcl_hdl, rem_hdl; + void *lcl_buffer, *rem_buffer; + size_t size; + int rc; + + BTL_VERBOSE(("Using RDMA Get")); + + /* Check if endpoint is connected */ + rc = mca_btl_ugni_check_endpoint_state(endpoint); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) + /* we should already be connected by this point (we got a smsg send) */ + return rc; + + /* Get remote memory handle */ + rem_buffer = (void *)(des->des_src->seg_addr.pval); + size = des->des_src->seg_len; + memcpy (&rem_hdl, (void *) des->des_src->seg_key.key64, sizeof (rem_hdl)); + + /* Get local memory handle */ + lcl_buffer = (void *)(des->des_dst->seg_addr.pval); + memcpy (&lcl_hdl, (void *) des->des_dst->seg_key.key64, sizeof (lcl_hdl)); + + if (OPAL_UNLIKELY(((uintptr_t)rem_buffer & 0x3) || ((uintptr_t)lcl_buffer & 0x3) || + size & 0x3 || size > mca_btl_ugni_component.btl_get_limit)) { + /* switch to put */ + return mca_btl_ugni_start_reverse_get (btl, 
frag); + } + + frag->tries = 0; + + if (size < mca_btl_ugni_component.btl_fma_limit) { + rc = post_fma_descriptor (frag, GNI_POST_FMA_GET, endpoint, size, + lcl_buffer, lcl_hdl, rem_buffer, rem_hdl); + } else { + rc = post_bte_descriptor (frag, GNI_POST_RDMA_GET, endpoint, size, + lcl_buffer, lcl_hdl, rem_buffer, rem_hdl); + } + + return rc; +} diff --git a/ompi/mca/btl/ugni/btl_ugni_module.c b/ompi/mca/btl/ugni/btl_ugni_module.c new file mode 100644 index 0000000000..d5107debd7 --- /dev/null +++ b/ompi/mca/btl/ugni/btl_ugni_module.c @@ -0,0 +1,518 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/constants.h" +#include "ompi/communicator/communicator.h" +#include "opal/util/show_help.h" +#include "opal/align.h" +#include "ompi/mca/btl/base/base.h" +#include "ompi/mca/dpm/dpm.h" +#include "orte/util/proc_info.h" +#include "ompi/mca/btl/btl.h" +#include "ompi/mca/btl/base/btl_base_error.h" + +#include "btl_ugni.h" +#include "btl_ugni_frag.h" +#include "btl_ugni_endpoint.h" + +#include +#include +#include +#include +#include + +static int +mca_btl_ugni_free (struct mca_btl_base_module_t *btl, + mca_btl_base_descriptor_t *des); + +static int +mca_btl_ugni_module_finalize (struct mca_btl_base_module_t* btl); + +static struct mca_btl_base_descriptor_t * +mca_btl_ugni_prepare_src (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + mca_mpool_base_registration_t *registration, + struct opal_convertor_t *convertor, + uint8_t order, size_t reserve, size_t *size, + uint32_t flags); + +static mca_btl_base_descriptor_t * +mca_btl_ugni_prepare_dst (mca_btl_base_module_t *btl, + mca_btl_base_endpoint_t *endpoint, + mca_mpool_base_registration_t *registration, + 
opal_convertor_t *convertor, uint8_t order, + size_t reserve, size_t *size, uint32_t flags); + +mca_btl_ugni_module_t mca_btl_ugni_module = { + { + /* .btl_component = */ &mca_btl_ugni_component.super, + + /* these are set in component_register */ + /* .btl_eager_limit = */ 0, + /* .btl_rndv_eager_limit = */ 0, + /* .btl_max_send_size = */ 0, + /* .btl_rdma_pipeline_send_length = */ 0, + /* .btl_rdma_pipeline_frag_size = */ 0, + /* .btl_min_rdma_pipeline_size = */ 0, + /* .btl_exclusivity = */ 0, + /* .btl_latency = */ 0, + /* .btl_bandwidth = */ 0, + /* .btl_flags = */ 0, + + /* member functions */ + mca_btl_ugni_add_procs, + mca_btl_ugni_del_procs, + NULL, /* register */ + mca_btl_ugni_module_finalize, + mca_btl_ugni_alloc, + mca_btl_ugni_free, + mca_btl_ugni_prepare_src, + mca_btl_ugni_prepare_dst, + mca_btl_ugni_send, + mca_btl_ugni_sendi, + mca_btl_ugni_put, + mca_btl_ugni_get, + NULL, /* mca_btl_base_dump, */ + NULL, /* mpool */ + NULL, /* mca_btl_ugni_register_error_cb - error callback registration */ + NULL, /* mca_btl_ugni_ft_event */ + } +}; + +static int ugni_reg_mem (void *reg_data, void *base, size_t size, + mca_mpool_base_registration_t *reg) +{ + mca_btl_ugni_module_t *btl = (mca_btl_ugni_module_t *) reg_data; + mca_btl_ugni_reg_t *ugni_reg = (mca_btl_ugni_reg_t *) reg; + int rc; + + rc = GNI_MemRegister (btl->device->dev_handle, (uint64_t)base, + size, NULL, GNI_MEM_READWRITE, -1, + &(ugni_reg->memory_hdl)); + + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + ugni_reg->buffer = base; + ugni_reg->size = size; + + return OMPI_SUCCESS; +} + +static int ugni_reg_smsg_mem (void *reg_data, void *base, size_t size, + mca_mpool_base_registration_t *reg) +{ + mca_btl_ugni_module_t *btl = (mca_btl_ugni_module_t *) reg_data; + mca_btl_ugni_reg_t *ugni_reg = (mca_btl_ugni_reg_t *) reg; + int rc; + + rc = GNI_MemRegister (btl->device->dev_handle, (uint64_t)base, + size, btl->smsg_remote_cq, GNI_MEM_READWRITE | + 
GNI_MEM_USE_GART, -1, &(ugni_reg->memory_hdl)); + + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + ugni_reg->buffer = base; + ugni_reg->size = size; + + return OMPI_SUCCESS; +} + +static int +ugni_dereg_mem (void *reg_data, mca_mpool_base_registration_t *reg) +{ + mca_btl_ugni_module_t *btl = (mca_btl_ugni_module_t *) reg_data; + mca_btl_ugni_reg_t *ugni_reg = (mca_btl_ugni_reg_t *)reg; + int rc; + + rc = GNI_MemDeregister (btl->device->dev_handle, &ugni_reg->memory_hdl); + if (GNI_RC_SUCCESS != rc) { + return OMPI_ERROR; + } + + ugni_reg->buffer = NULL; + ugni_reg->size = 0; + + return OMPI_SUCCESS; +} + +static int +mca_btl_ugni_module_setup_mpools (mca_btl_ugni_module_t *ugni_module) +{ + struct mca_mpool_base_resources_t mpool_resources; + int mbox_increment, rc; + size_t nprocs; + + (void) ompi_proc_world (&nprocs); + + mpool_resources.reg_data = (void *) ugni_module; + mpool_resources.sizeof_reg = sizeof (mca_btl_ugni_reg_t); + mpool_resources.register_mem = ugni_reg_mem; + mpool_resources.deregister_mem = ugni_dereg_mem; + ugni_module->super.btl_mpool = + mca_mpool_base_module_create("rdma", ugni_module->device, + &mpool_resources); + if (NULL == ugni_module->super.btl_mpool) { + BTL_ERROR(("error creating mpool")); + return OMPI_ERROR; + } + + mpool_resources.register_mem = ugni_reg_smsg_mem; + + ugni_module->smsg_mpool = + mca_mpool_base_module_create("rdma", ugni_module->device, + &mpool_resources); + + OBJ_CONSTRUCT(&ugni_module->smsg_mboxes, ompi_free_list_t); + + mbox_increment = nprocs; + + if (nprocs * mca_btl_ugni_smsg_mbox_size > 2 * 1024 * 1024) { + /* allocate at most 2 MB at a time */ + mbox_increment = (int) (2.0 * 1024.0 * 1024.0 / (float)mca_btl_ugni_smsg_mbox_size); + } + + if (nprocs < 1024) { + mbox_increment = nprocs / 2; + } else if (nprocs < 16384) { + mbox_increment = nprocs / 10; + } else { + mbox_increment = nprocs / 40; + } + + rc = ompi_free_list_init_new (&ugni_module->smsg_mboxes, + sizeof 
(mca_btl_ugni_smsg_mbox_t), 64, + OBJ_CLASS(mca_btl_ugni_smsg_mbox_t), + mca_btl_ugni_smsg_mbox_size, + opal_cache_line_size, 0, + nprocs, mbox_increment, + ugni_module->smsg_mpool); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + return rc; + } + + return OMPI_SUCCESS; +} + +int +mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module, + ompi_common_ugni_device_t *dev) +{ + int rc; + + BTL_VERBOSE(("binding module %p to device %p", (void *) ugni_module, + (void *) dev)); + + /* copy module defaults (and function pointers) */ + memmove (ugni_module, &mca_btl_ugni_module, sizeof (mca_btl_ugni_module)); + + OBJ_CONSTRUCT(&ugni_module->failed_frags, opal_list_t); + + /* module settings */ + ugni_module->super.btl_eager_limit = mca_btl_ugni_component.eager_limit; + + ugni_module->super.btl_max_send_size = ugni_module->super.btl_eager_limit; + ugni_module->super.btl_rdma_pipeline_send_length = ugni_module->super.btl_eager_limit; + + ugni_module->device = dev; + + /* create wildcard endpoint to listen for connections. + * there is no need to bind this endpoint. 
*/ + rc = GNI_EpCreate (ugni_module->device->dev_handle, NULL, + &ugni_module->wildcard_ep); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + BTL_ERROR(("error creating wildcard ugni endpoint")); + return ompi_common_rc_ugni_to_ompi (rc); + } + + /* post wildcard datagram */ + rc = mca_btl_ugni_wildcard_ep_post (ugni_module); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + BTL_ERROR(("error posting wildcard datagram")); + return rc; + } + + ugni_module->endpoints = NULL; + + rc = GNI_CqCreate (ugni_module->device->dev_handle, mca_btl_ugni_component.cq_size, + 0, GNI_CQ_NOBLOCK, NULL, NULL, &ugni_module->bte_local_cq); + if (GNI_RC_SUCCESS != rc) { + BTL_ERROR(("error creating local BTE CQ")); + return ompi_common_rc_ugni_to_ompi (rc); + } + + /* the smsg_remote_cq must be created before we setup the smsg mpool */ + rc = GNI_CqCreate (ugni_module->device->dev_handle, mca_btl_ugni_component.cq_size, + 0, GNI_CQ_NOBLOCK, NULL, NULL, &ugni_module->smsg_remote_cq); + if (GNI_RC_SUCCESS != rc) { + BTL_ERROR(("error creating remote SMSG CQ")); + return ompi_common_rc_ugni_to_ompi (rc); + } + + /* create rdma and smsg mpools */ + rc = mca_btl_ugni_module_setup_mpools (ugni_module); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + BTL_ERROR(("error setting up module mpools")); + return rc; + } + + return OMPI_SUCCESS; +} + +static int +mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl) +{ + mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *)btl; + size_t ntotal_procs, i; + int rc; + + /* close all open connections and release endpoints */ + if (NULL != ugni_module->endpoints) { + (void) ompi_proc_world (&ntotal_procs); + + for (i = 0 ; i < ntotal_procs ; ++i) { + if (ugni_module->endpoints[i]) { + mca_btl_ugni_release_ep (ugni_module->endpoints[i]); + } + + ugni_module->endpoints[i] = NULL; + } + + ugni_module->endpoint_count = 0; + ugni_module->endpoints = NULL; + } + + /* destroy all cqs */ + rc = GNI_CqDestroy (ugni_module->bte_local_cq); + if 
(GNI_RC_SUCCESS != rc) { + BTL_ERROR(("error tearing down local BTE CQ")); + } + + rc = GNI_CqDestroy (ugni_module->smsg_remote_cq); + if (GNI_RC_SUCCESS != rc) { + BTL_ERROR(("error tearing down remote SMSG CQ")); + } + + /* cancel wildcard post */ + rc = GNI_EpPostDataCancelById (ugni_module->wildcard_ep, + MCA_BTL_UGNI_CONNECT_WILDCARD_ID | + ORTE_PROC_MY_NAME->vpid); + if (GNI_RC_SUCCESS != rc) { + BTL_VERBOSE(("btl/ugni error cancelling wildcard post")); + } + + /* tear down wildcard endpoint */ + rc = GNI_EpDestroy (ugni_module->wildcard_ep); + if (GNI_RC_SUCCESS != rc) { + BTL_VERBOSE(("btl/ugni error destroying endpoint")); + } + + (void) mca_mpool_base_module_destroy (ugni_module->smsg_mpool); + ugni_module->smsg_mpool = NULL; + + (void) mca_mpool_base_module_destroy (ugni_module->super.btl_mpool); + ugni_module->super.btl_mpool = NULL; + + OBJ_DESTRUCT(&ugni_module->failed_frags); + + return OMPI_SUCCESS; +} + + +mca_btl_base_descriptor_t * +mca_btl_ugni_alloc(struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + uint8_t order, size_t size, uint32_t flags) +{ + mca_btl_ugni_base_frag_t *frag = NULL; + int rc = OMPI_SUCCESS; + + if (size <= mca_btl_ugni_component.eager_limit) { + MCA_BTL_UGNI_FRAG_ALLOC_EAGER((mca_btl_ugni_module_t *) btl, frag, rc); + } + + BTL_VERBOSE(("btl/ugni_module allocated frag of size: %u, flags: %x. 
frag = %p", + (unsigned int)size, flags, (void *) frag)); + + if (OPAL_LIKELY(NULL != frag)) { + frag->base.des_flags = flags; + frag->base.order = order; + frag->base.des_src = frag->segments; + frag->base.des_src_cnt = 1; + frag->base.des_dst = frag->segments; + frag->base.des_dst_cnt = 1; + + frag->segments[0].seg_len = size; + } + + return (mca_btl_base_descriptor_t *) frag; +} + +static int +mca_btl_ugni_free (struct mca_btl_base_module_t *btl, + mca_btl_base_descriptor_t *des) +{ + MCA_BTL_UGNI_FRAG_RETURN((mca_btl_ugni_base_frag_t *) des); + + return OMPI_SUCCESS; +} + +static struct mca_btl_base_descriptor_t * +mca_btl_ugni_prepare_src (struct mca_btl_base_module_t *btl, + mca_btl_base_endpoint_t *endpoint, + mca_mpool_base_registration_t *registration, + struct opal_convertor_t *convertor, + uint8_t order, size_t reserve, size_t *size, + uint32_t flags) +{ + mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl; + mca_btl_ugni_base_frag_t *frag = NULL; + void *data_ptr; + int rc; + + opal_convertor_get_current_pointer (convertor, &data_ptr); + + if (OPAL_LIKELY(reserve)) { + MCA_BTL_UGNI_FRAG_ALLOC_EAGER(ugni_module, frag, rc); + if (OPAL_UNLIKELY(NULL == frag)) { + return NULL; + } + if ((*size + reserve) > mca_btl_ugni_component.eager_limit) { + *size = mca_btl_ugni_component.eager_limit - reserve; + } + + BTL_VERBOSE(("preparing src for send fragment. 
size = %u", + (unsigned int)(*size + reserve))); + + if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) { + /* non-contiguous data requires using the convertor */ + uint32_t iov_count = 1; + struct iovec iov; + + iov.iov_len = mca_btl_ugni_component.eager_limit - reserve; + iov.iov_base = + (IOVBASE_TYPE *)(((uintptr_t)(frag->segments[0].seg_addr.pval)) + + reserve); + + rc = opal_convertor_pack (convertor, &iov, &iov_count, size); + if (OPAL_UNLIKELY(rc < 0)) { + MCA_BTL_UGNI_FRAG_RETURN(frag); + return NULL; + } + + frag->segments[0].seg_len = reserve + *size; + } + else { + memmove ((void *)((uintptr_t)frag->segments[0].seg_addr.pval + reserve), + data_ptr, *size); + frag->segments[0].seg_len = reserve + *size; + } + } else { + MCA_BTL_UGNI_FRAG_ALLOC_RDMA(ugni_module, frag, rc); + if (OPAL_UNLIKELY(NULL == frag)) { + return NULL; + } + + /* + * For medium message use FMA protocols and for large message + * use BTE protocols + */ + /* No need to register while using FMA Put (registration is + * non-null in get-- is this always true?) 
*/ + if (*size >= mca_btl_ugni_component.btl_fma_limit || (flags & MCA_BTL_DES_FLAGS_GET)) { + if (NULL == registration) { + rc = ugni_module->super.btl_mpool->mpool_register(ugni_module->super.btl_mpool, + data_ptr, *size, 0, + ®istration); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + BTL_ERROR(("btl/ugni error registering source memory")); + MCA_BTL_UGNI_FRAG_RETURN(frag); + return NULL; + } + + frag->registration = (mca_btl_ugni_reg_t*)registration; + } + + memcpy ((void *) frag->segments[0].seg_key.key64, + (void *)&((mca_btl_ugni_reg_t *)registration)->memory_hdl, + sizeof (((mca_btl_ugni_reg_t *)registration)->memory_hdl)); + } else { + memset ((void *) frag->segments[0].seg_key.key64, 0, + sizeof (frag->segments[0].seg_key.key64)); + } + + frag->segments[0].seg_len = *size; + frag->segments[0].seg_addr.pval = data_ptr; + } + + frag->base.des_src = frag->segments; + frag->base.des_src_cnt = 1; + frag->base.order = order; + frag->base.des_flags = flags; + frag->endpoint = endpoint; + + return &frag->base; +} + +static mca_btl_base_descriptor_t * +mca_btl_ugni_prepare_dst (mca_btl_base_module_t *btl, + mca_btl_base_endpoint_t *endpoint, + mca_mpool_base_registration_t *registration, + opal_convertor_t *convertor, uint8_t order, + size_t reserve, size_t *size, uint32_t flags) +{ + mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl; + mca_btl_ugni_base_frag_t *frag; + void *data_ptr; + int rc; + + opal_convertor_get_current_pointer (convertor, &data_ptr); + + /* no alignment restrictions on put */ + MCA_BTL_UGNI_FRAG_ALLOC_RDMA(ugni_module, frag, rc); + if (OPAL_UNLIKELY(NULL == frag)) { + return NULL; + } + + /* always need to register the buffer for put/get (even for fma) */ + if (NULL == registration) { + rc = ugni_module->super.btl_mpool->mpool_register(ugni_module->super.btl_mpool, + data_ptr, *size, 0, + ®istration); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + MCA_BTL_UGNI_FRAG_RETURN(frag); + return NULL; + } + + frag->registration = 
(mca_btl_ugni_reg_t*) registration; + } + + memcpy ((void *) frag->segments[0].seg_key.key64, + (void *)&((mca_btl_ugni_reg_t *)registration)->memory_hdl, + sizeof (((mca_btl_ugni_reg_t *)registration)->memory_hdl)); + + frag->segments[0].seg_len = *size; + frag->segments[0].seg_addr.pval = data_ptr; + + frag->base.des_dst = frag->segments; + frag->base.des_dst_cnt = 1; + frag->base.order = order; + frag->base.des_flags = flags; + frag->endpoint = endpoint; + + return (struct mca_btl_base_descriptor_t *) frag; +} diff --git a/ompi/mca/btl/ugni/btl_ugni_put.c b/ompi/mca/btl/ugni/btl_ugni_put.c new file mode 100644 index 0000000000..07ff3bddf8 --- /dev/null +++ b/ompi/mca/btl/ugni/btl_ugni_put.c @@ -0,0 +1,65 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" +#include "opal/include/opal_stdint.h" + +#include "btl_ugni_rdma.h" +#include "opal/util/opal_sos.h" + +/** + * Initiate a put operation. 
+ * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param descriptor (IN) Description of the data to be transferred + */ +int mca_btl_ugni_put (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + struct mca_btl_base_descriptor_t *des) { + mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) des; + gni_mem_handle_t lcl_hdl, rem_hdl; + void *lcl_buffer, *rem_buffer; + size_t size; + int rc; + + BTL_VERBOSE(("Using RDMA Put")); + + /* Check if endpoint is connected */ + rc = mca_btl_ugni_check_endpoint_state(endpoint); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + /* we should already be connected by this point (we got an rc send) */ + return rc; + } + + /* Get local memory handle */ + lcl_buffer = (void*)(des->des_src->seg_addr.pval); + size = des->des_src->seg_len; + memcpy (&lcl_hdl, (void *) des->des_src->seg_key.key64, + sizeof(gni_mem_handle_t)); + + /* Get remote memory handle */ + rem_buffer = (void*)(des->des_dst->seg_addr.pval); + memcpy (&rem_hdl, (void *) des->des_dst->seg_key.key64, + sizeof(gni_mem_handle_t)); + + frag->tries = 0; + + if (size < mca_btl_ugni_component.btl_fma_limit) { + rc = post_fma_descriptor (frag, GNI_POST_FMA_PUT, endpoint, size, + lcl_buffer, lcl_hdl, rem_buffer, rem_hdl); + } else { + rc = post_bte_descriptor (frag, GNI_POST_RDMA_PUT, endpoint, size, + lcl_buffer, lcl_hdl, rem_buffer, rem_hdl); + } + + return rc; +} diff --git a/ompi/mca/btl/ugni/btl_ugni_rdma.h b/ompi/mca/btl/ugni/btl_ugni_rdma.h new file mode 100644 index 0000000000..7f419f16d0 --- /dev/null +++ b/ompi/mca/btl/ugni/btl_ugni_rdma.h @@ -0,0 +1,128 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#if !defined(MCA_BTL_UGNI_RDMA_H) +#define MCA_BTL_UGNI_RDMA_H + +#include "btl_ugni.h" +#include "btl_ugni_frag.h" +#include "btl_ugni_endpoint.h" + +static inline void +mca_btl_ugni_post_frag_complete (ompi_common_ugni_post_desc_t *desc, int rc) { + mca_btl_ugni_base_frag_t *frag = MCA_BTL_UGNI_DESC_TO_FRAG(desc); + + /* always call put/get callback */ + frag->base.des_cbfunc(&frag->btl->super, frag->endpoint, &frag->base, rc); + + if (OPAL_LIKELY(frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) { + MCA_BTL_UGNI_FRAG_RETURN(frag); + } +} + +static inline int init_gni_post_desc(mca_btl_ugni_base_frag_t *frag, + mca_btl_base_endpoint_t *ep, + gni_post_type_t op_type, + uint64_t lcl_addr, + gni_mem_handle_t *lcl_mdh, + uint64_t rem_addr, + gni_mem_handle_t *rem_mdh, + uint64_t bufsize, + gni_cq_handle_t cq_hndl) { + frag->post_desc.base.type = op_type; + frag->post_desc.base.cq_mode = GNI_CQMODE_GLOBAL_EVENT; + frag->post_desc.base.dlvr_mode = GNI_DLVMODE_PERFORMANCE; + frag->post_desc.base.local_addr = (uint64_t) lcl_addr; + frag->post_desc.base.local_mem_hndl = *lcl_mdh; + frag->post_desc.base.remote_addr = (uint64_t) rem_addr; + frag->post_desc.base.remote_mem_hndl = *rem_mdh; + frag->post_desc.base.length = bufsize; + frag->post_desc.base.rdma_mode = 0; + frag->post_desc.base.src_cq_hndl = cq_hndl; + + frag->post_desc.cbfunc = mca_btl_ugni_post_frag_complete; + frag->post_desc.endpoint = ep->common; + + frag->post_desc.tries = 0; + + return 0; +} + +static inline int post_fma_descriptor (mca_btl_ugni_base_frag_t *frag, gni_post_type_t op_type, + struct mca_btl_base_endpoint_t *endpoint, + size_t size, void *lcl_buffer, gni_mem_handle_t lcl_hdl, + void *rem_buffer, gni_mem_handle_t rem_hdl) +{ + int rc; + + /* Post descriptor */ + init_gni_post_desc (frag, endpoint, op_type, (uint64_t)lcl_buffer, + &lcl_hdl, (uint64_t)rem_buffer, &rem_hdl, + size, 0); + + rc = GNI_PostFma 
(endpoint->common->ep_handle, &frag->post_desc.base); + if (GNI_RC_SUCCESS != rc) { + BTL_ERROR(("GNI_PostFma failed with rc = %d", rc)); + assert(rc < 4); + rc = OMPI_ERR_OUT_OF_RESOURCE; /* ompi_common_rc_ugni_to_ompi (rc);*/ + } + + return rc; +} + +static inline int post_bte_descriptor (mca_btl_ugni_base_frag_t *frag, gni_post_type_t op_type, + struct mca_btl_base_endpoint_t *endpoint, + size_t size, void *lcl_buffer, gni_mem_handle_t lcl_hdl, + void *rem_buffer, gni_mem_handle_t rem_hdl) { + int rc; + + /* Post descriptor */ + init_gni_post_desc (frag, endpoint, op_type, (uint64_t)lcl_buffer, + &lcl_hdl, (uint64_t)rem_buffer, &rem_hdl, + size, endpoint->btl->bte_local_cq); + + rc = GNI_PostRdma (endpoint->common->ep_handle, &frag->post_desc.base); + if (GNI_RC_SUCCESS != rc) { + assert(rc < 4); + rc = ompi_common_rc_ugni_to_ompi (rc); + BTL_ERROR(("GNI_PostRdma failed with rc = %d", rc)); + } + + return rc; +} + +static inline int mca_btl_ugni_start_reverse_get (struct mca_btl_base_module_t *btl, + mca_btl_ugni_base_frag_t *frag) { + /* off alignment/off size. 
switch to put */ + mca_btl_base_segment_t segments[2]; + uint32_t msg_id = ORTE_PROC_MY_NAME->vpid; + void *post_desc_ptr = &(frag->post_desc); + int rc; + + segments[0] = frag->base.des_src[0]; + segments[1] = frag->base.des_dst[0]; + + rc = GNI_SmsgSendWTag (frag->endpoint->common->ep_handle, segments, + sizeof (segments), &post_desc_ptr, sizeof (void *), + msg_id, MCA_BTL_UGNI_TAG_PUT_INIT); + if (OPAL_UNLIKELY(rc == GNI_RC_NOT_DONE)) { + BTL_ERROR(("GNI_SmsgSendWTag failed with rc = %d", rc)); + /* send this smsg packet later */ + return OMPI_ERR_OUT_OF_RESOURCE; + } + /* todo -- on failure try again */ + assert (GNI_RC_SUCCESS == rc); + + return OMPI_SUCCESS; +} + +#endif /* MCA_BTL_UGNI_RDMA_H */ diff --git a/ompi/mca/btl/ugni/btl_ugni_send.c b/ompi/mca/btl/ugni/btl_ugni_send.c new file mode 100644 index 0000000000..b7578aa860 --- /dev/null +++ b/ompi/mca/btl/ugni/btl_ugni_send.c @@ -0,0 +1,67 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "btl_ugni.h" +#include "btl_ugni_frag.h" +#include "btl_ugni_endpoint.h" + +int mca_btl_ugni_send (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *btl_peer, + struct mca_btl_base_descriptor_t *descriptor, + mca_btl_base_tag_t tag) +{ + mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) descriptor; + int rc; + + BTL_VERBOSE(("btl/ugni sending descriptor %p from %d -> %d. 
length = %d", (void *)descriptor, + ORTE_PROC_MY_NAME->vpid, btl_peer->common->ep_rem_id, frag->segments[0].seg_len)); + + rc = mca_btl_ugni_check_endpoint_state (btl_peer); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + frag->tag = tag; + descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; + + opal_list_append (&btl_peer->pending_list, (opal_list_item_t *) frag); + /* connection started and request queued or something bad happened */ + return OMPI_SUCCESS; + } + + frag->hdr->tag = tag; + frag->hdr->len = frag->segments[0].seg_len; + + /* check endpoint state */ + rc = GNI_SmsgSendWTag (btl_peer->common->ep_handle, frag->hdr, + descriptor->des_src->seg_len + sizeof (frag->hdr[0]), + NULL, 0, -1, MCA_BTL_UGNI_TAG_SEND); + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { + BTL_VERBOSE(("GNI_SmsgSendWTag failed with rc = %d", rc)); + + if (OPAL_LIKELY(GNI_RC_NOT_DONE == rc)) { + BTL_VERBOSE(("out of credits")); + + return OMPI_ERR_OUT_OF_RESOURCE; + } + + return OMPI_ERROR; + } + + if (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags) { + /* completion callback */ + frag->base.des_cbfunc(&btl_peer->btl->super, btl_peer, &frag->base, OMPI_SUCCESS); + } + + if (descriptor->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP) { + MCA_BTL_UGNI_FRAG_RETURN (frag); + } + + return 1; +} diff --git a/ompi/mca/btl/ugni/btl_ugni_sendi.c b/ompi/mca/btl/ugni/btl_ugni_sendi.c new file mode 100644 index 0000000000..fc932c294a --- /dev/null +++ b/ompi/mca/btl/ugni/btl_ugni_sendi.c @@ -0,0 +1,95 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "btl_ugni.h" +#include "btl_ugni_frag.h" +#include "btl_ugni_endpoint.h" + +int mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, + void *header, size_t header_size, + size_t payload_size, uint8_t order, + uint32_t flags, mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t **descriptor) +{ + size_t length = header_size + payload_size; + uint32_t msg_id = ORTE_PROC_MY_NAME->vpid; + mca_btl_ugni_base_frag_t *frag; + uint32_t iov_count = 1; + void *data_ptr = NULL; + struct iovec iov; + size_t max_data; + int rc; + + assert (length < mca_btl_ugni_component.eager_limit); + assert (0 == (flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)); + + if (OPAL_UNLIKELY(OMPI_SUCCESS != mca_btl_ugni_check_endpoint_state (endpoint))) { + /* can't complete inline send if the endpoint is not already connected */ + /* go ahead and start the connection */ + *descriptor = mca_btl_ugni_alloc (btl, endpoint, order, length, flags); + + return OMPI_ERR_RESOURCE_BUSY; + } + + MCA_BTL_UGNI_FRAG_ALLOC_EAGER((mca_btl_ugni_module_t *) btl, frag, rc); + if (OPAL_UNLIKELY(NULL == frag)) { + *descriptor = NULL; + return OMPI_ERR_OUT_OF_RESOURCE; + } + + BTL_VERBOSE(("btl/ugni sending inline descriptor %p from %d -> %d. length = %u", (void *) frag, + ORTE_PROC_MY_NAME->vpid, endpoint->common->ep_rem_id, (unsigned int) length)); + + /* write match header (with MPI comm/tag/etc. info) */ + memcpy (frag->segments[0].seg_addr.pval, header, header_size); + + frag->hdr->tag = tag; + frag->hdr->len = length; + + /* + We can add MEMCHECKER calls before and after the packing. 
+ */ + if (OPAL_UNLIKELY(payload_size && opal_convertor_need_buffers (convertor))) { + /* pack the data into the supplied buffer */ + iov.iov_base = (IOVBASE_TYPE *)((uintptr_t)frag->segments[0].seg_addr.pval + header_size); + iov.iov_len = max_data = payload_size; + + (void) opal_convertor_pack (convertor, &iov, &iov_count, &max_data); + + assert (max_data == payload_size); + + header_size += payload_size; + payload_size = 0; + } else if (payload_size) { + opal_convertor_get_current_pointer (convertor, &data_ptr); + } + + header_size += sizeof (frag->hdr[0]); + + /* check endpoint state */ + rc = GNI_SmsgSendWTag (endpoint->common->ep_handle, frag->hdr, header_size, + data_ptr, payload_size, msg_id, + MCA_BTL_UGNI_TAG_SEND); + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { + BTL_VERBOSE(("GNI_SmsgSendWTag failed with rc = %d", rc)); + MCA_BTL_UGNI_FRAG_RETURN (frag); + *descriptor = NULL; + + return OMPI_ERR_OUT_OF_RESOURCE; + } + + MCA_BTL_UGNI_FRAG_RETURN (frag); + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/btl/ugni/configure.m4 b/ompi/mca/btl/ugni/configure.m4 new file mode 100644 index 0000000000..290f7d6d95 --- /dev/null +++ b/ompi/mca/btl/ugni/configure.m4 @@ -0,0 +1,56 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2006 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2006 QLogic Corp. All rights reserved. +# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2011 Los Alamos National Security, LLC. +# All rights reserved. 
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# MCA_ompi_btl_ugni_CONFIG([action-if-can-compile], [action-if-cant-compile])
+# --------------------------------------------------------
+# configure the ugni BTL. runs OMPI_CHECK_UGNI with the btl_ugni prefix;
+# if GNI support is found, copies btl_ugni_{LDFLAGS, LIBS} into the wrapper
+# flags and runs the first argument, otherwise runs the second argument
+#
+# NOTES
+# on Cray XE6 systems, the GNI development header (gni_pub.h) is in a
+# completely different place than the ugni library (libugni).
+#
+# EXAMPLE CONFIGURE USAGE:
+# --with-ugni=/base/path/to/libugni --with-ugni-includedir=/path/to/gni_pub.h
+#
+# --with-ugni=/opt/cray/ugni/default --with-ugni-includedir=/opt/cray/gni-headers/default/include
+
+AC_DEFUN([MCA_ompi_btl_ugni_CONFIG],[
+    AC_CONFIG_FILES([ompi/mca/btl/ugni/Makefile])
+
+    OMPI_CHECK_UGNI([btl_ugni],
+                    [btl_ugni_happy="yes"],
+                    [btl_ugni_happy="no"])
+
+    AS_IF([test "$btl_ugni_happy" = "yes"],
+          [btl_ugni_WRAPPER_EXTRA_LDFLAGS="$btl_ugni_LDFLAGS"
+           btl_ugni_WRAPPER_EXTRA_LIBS="$btl_ugni_LIBS"
+           $1],
+          [$2])
+
+    # substitute in the things needed to build ugni
+    AC_SUBST([btl_ugni_CPPFLAGS])
+    AC_SUBST([btl_ugni_LDFLAGS])
+    AC_SUBST([btl_ugni_LIBS])
+])dnl
diff --git a/ompi/mca/common/ugni/Makefile.am b/ompi/mca/common/ugni/Makefile.am
new file mode 100644
index 0000000000..5d8e80b534
--- /dev/null
+++ b/ompi/mca/common/ugni/Makefile.am
@@ -0,0 +1,43 @@
+# -*- indent-tabs-mode:nil -*-
+#
+# Copyright (c) 2011      Los Alamos National Security, LLC. All rights
+#                         reserved.
+# Copyright (c) 2011      UT-Battelle, LLC. All rights reserved.
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# Make the output library in this directory, and name it either
+# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
+# (for static builds).
+
+AM_CPPFLAGS = $(common_ugni_CPPFLAGS)
+
+if MCA_BUILD_ompi_common_ugni_DSO
+component_noinst =
+component_install = mca_common_ugni.la
+else
+component_noinst = libmca_common_ugni.la
+component_install =
+endif
+
+headers = common_ugni.h \
+        common_ugni_ep.h
+
+# headers are listed with the sources so that automake distributes them
+ugni_SOURCES = $(headers) common_ugni.c common_ugni_ep.c
+
+mcacomponentdir = $(pkglibdir)
+mcacomponent_LTLIBRARIES = $(component_install)
+mca_common_ugni_la_SOURCES = $(ugni_SOURCES)
+nodist_mca_common_ugni_la_SOURCES = $(ugni_nodist_SOURCES)
+mca_common_ugni_la_LIBADD = $(common_ugni_LIBS)
+mca_common_ugni_la_LDFLAGS = -module -avoid-version $(common_ugni_LDFLAGS)
+
+noinst_LTLIBRARIES = $(component_noinst)
+libmca_common_ugni_la_SOURCES = $(ugni_SOURCES)
+nodist_libmca_common_ugni_la_SOURCES = $(ugni_nodist_SOURCES)
+libmca_common_ugni_la_LIBADD = $(common_ugni_LIBS)
+libmca_common_ugni_la_LDFLAGS = -module -avoid-version $(common_ugni_LDFLAGS)
diff --git a/ompi/mca/common/ugni/common_ugni.c b/ompi/mca/common/ugni/common_ugni.c
new file mode 100644
index 0000000000..046d534b64
--- /dev/null
+++ b/ompi/mca/common/ugni/common_ugni.c
@@ -0,0 +1,318 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2011      Los Alamos National Security, LLC. All rights
+ *                         reserved.
+ * Copyright (c) 2011      UT-Battelle, LLC. All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include "common_ugni.h" + +#include "ompi/proc/proc.h" + +/* NTH: we need some options from the btl */ +#include "ompi/mca/btl/ugni/btl_ugni.h" + +static int ompi_common_ugni_module_ref_count = 0; +ompi_common_ugni_module_t ompi_common_ugni_module; + +mca_base_component_t ompi_common_ugni_component = { + MCA_BASE_VERSION_2_0_0, + "common", + MCA_BASE_VERSION_2_0_0, + "ugni", + MCA_BASE_VERSION_2_0_0, + NULL, + NULL +}; + +static inline int +get_ptag(uint8_t *out_ptag) +{ + /* TODO no need for tmp */ + char *ptr; + uint8_t tmp_ptag; + + if (NULL == (ptr = getenv("PMI_GNI_PTAG"))) { + /* TODO add err msg - better rc? */ + return OMPI_ERR_NOT_FOUND; + } + errno = 0; + tmp_ptag = (uint8_t)strtoul (ptr, (char **)NULL, 10); + if (0 != errno) { + /* TODO add err msg - better rc? */ + return OMPI_ERR_VALUE_OUT_OF_BOUNDS; + } + *out_ptag = tmp_ptag; + return OMPI_SUCCESS; +} + +static inline int get_cookie (uint32_t *out_cookie) +{ + /* TODO no need for tmp */ + char *ptr; + uint32_t tmp_cookie; + + if (NULL == (ptr = getenv("PMI_GNI_COOKIE"))) { + /* TODO add err msg - better rc? */ + return OMPI_ERR_NOT_FOUND; + } + errno = 0; + tmp_cookie = (uint32_t) strtoul (ptr, NULL, 10); + if (0 != errno) { + /* TODO add err msg - better rc? 
*/
+        return OMPI_ERR_VALUE_OUT_OF_BOUNDS;
+    }
+
+    *out_cookie = tmp_cookie;
+
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Look up the NIC address of a uGNI device.
+ *
+ * If PMI_GNI_DEV_ID is not set the address is queried with
+ * GNI_CdmGetNicAddress. Otherwise device_id must appear in the
+ * ':'-separated PMI_GNI_DEV_ID list, and the address is the entry at
+ * index device_id of the ':'-separated PMI_GNI_LOC_ADDR list.
+ *
+ * Both lists are scanned in place: the strings returned by getenv are
+ * never written to.
+ *
+ * @param[IN] device_id  device to look up
+ *
+ * @return the NIC address, or (unsigned int)-1 if it can not be found
+ */
+static unsigned int
+ompi_common_ugni_get_nic_address(int device_id)
+{
+    unsigned int address, cpu_id;
+    gni_return_t status;
+    const char *field;
+    int i, found = 0;
+
+    field = getenv("PMI_GNI_DEV_ID");
+    if (!field) {
+        status = GNI_CdmGetNicAddress(device_id, &address, &cpu_id);
+        if(status != GNI_RC_SUCCESS) {
+            opal_output (0, "FAILED:GNI_CdmGetNicAddress returned error %d", status);
+            return (unsigned int)-1;
+        }
+        return address;
+    }
+
+    /* visit each non-empty field; atoi stops at the next ':' */
+    for (field += strspn (field, ":") ; '\0' != *field ;
+         field += strcspn (field, ":"), field += strspn (field, ":")) {
+        if (atoi (field) == device_id) {
+            found = 1;
+            break;
+        }
+    }
+
+    /* a list that does not contain device_id is a lookup failure */
+    if (OPAL_UNLIKELY(!found)) {
+        return (unsigned int)-1;
+    }
+
+    field = getenv("PMI_GNI_LOC_ADDR");
+    if (OPAL_UNLIKELY(NULL == field)) {
+        return (unsigned int)-1;
+    }
+
+    /* NOTE(review): the address list is indexed by the device id value, as
+     * in the original code, not by the position at which the id was found
+     * in PMI_GNI_DEV_ID -- confirm against the PMI documentation */
+    i = 0;
+    for (field += strspn (field, ":") ; '\0' != *field ;
+         field += strcspn (field, ":"), field += strspn (field, ":"), ++i) {
+        if (i == device_id) {
+            return (unsigned int) strtoul (field, NULL, 10);
+        }
+    }
+
+    return (unsigned int)-1;
+}
+
+static int ompi_common_ugni_device_init (ompi_common_ugni_device_t *device,
+                                         int comm_world_size, int device_id)
+{
+    int rc;
+
+    /* Create a NIC Adress */
+    device->dev_id = device_id; /* Minor number of the Gemini NIC */
+
+    device->dev_addr = ompi_common_ugni_get_nic_address (device->dev_id);
+
+    OPAL_OUTPUT((-1, "Got NIC Addr: 0x%08x, CPU ID: %d", device->dev_addr, device->dev_id));
+
+    /* Attach device to the communication domain */
+    rc = GNI_CdmAttach (ompi_common_ugni_module.cd_handle, device->dev_id,
+                        &device->dev_pe_addr, &device->dev_handle);
+    if (GNI_RC_SUCCESS != rc) {
+        OPAL_OUTPUT((0, "Error: Creating communication domain %d\n", rc));
+        return ompi_common_rc_ugni_to_ompi (rc);
+    }
+
+    /* Create a completion queue to attach to endpoints */
+    rc = GNI_CqCreate (device->dev_handle, ompi_common_ugni_module.local_cq_size,
+                       0, GNI_CQ_NOBLOCK, NULL, NULL, &device->dev_local_cq);
+    if
(GNI_RC_SUCCESS != rc) { + OPAL_OUTPUT((0, "Error creating SMSG local CQ. rc = %d", rc)); + return ompi_common_rc_ugni_to_ompi (rc); + } + + device->dev_eps = calloc (comm_world_size, sizeof (ompi_common_ugni_endpoint_t *)); + if (NULL == device->dev_eps) { + OPAL_OUTPUT((0, "Error allocating space for endpoint pointers")); + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} + +static int ompi_common_ugni_device_fini (ompi_common_ugni_device_t *dev) +{ + int rc; + + if (dev->dev_eps) { + free (dev->dev_eps); + dev->dev_eps = NULL; + } + + rc = GNI_CqDestroy (dev->dev_local_cq); + if (GNI_RC_SUCCESS != rc) { + OPAL_OUTPUT((-1, "btl/ugni error destroying cq. rc = %d", rc)); + } + + return OMPI_SUCCESS; +} + +/* + * Send local device information and other information + * required for setup + */ +static int ompi_common_ugni_send_modex (int my_rank) +{ + uint32_t modex_size, total_msg_size, msg_offset; + struct ompi_common_ugni_modex_t modex; + char *modex_msg; + int rc, i; + + modex_size = sizeof (struct ompi_common_ugni_modex_t); + total_msg_size = ompi_common_ugni_module.device_count * modex_size; + + modex_msg = (char *) malloc (total_msg_size); + if (NULL == modex_msg) { + OPAL_OUTPUT((-1, "Error allocating memory for modex @ %s:%d", + __FILE__, __LINE__)); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* pack modex for all available devices */ + for (i = 0, msg_offset = 0; i < ompi_common_ugni_module.device_count ; ++i) { + ompi_common_ugni_device_t *dev = ompi_common_ugni_module.devices + i; + + modex.addr = dev->dev_addr; + modex.id = my_rank; + + memcpy ((void *)((uintptr_t) modex_msg + msg_offset), + (void *)&modex, modex_size); + + msg_offset += modex_size; + } + + rc = ompi_modex_send(&ompi_common_ugni_component, + modex_msg, total_msg_size); + + free(modex_msg); + + return rc; +} + +int ompi_common_ugni_fini (void) +{ + int i, rc; + + if (0 == ompi_common_ugni_module_ref_count) { + return OMPI_SUCCESS; + } + + if (1 == ompi_common_ugni_module_ref_count) { + 
/* tear down component */ + if (ompi_common_ugni_module.devices) { + /* finalize devices */ + for (i = 0 ; i < ompi_common_ugni_module.device_count ; ++i) { + ompi_common_ugni_device_fini (ompi_common_ugni_module.devices + i); + } + + free (ompi_common_ugni_module.devices); + ompi_common_ugni_module.devices = NULL; + } + + /* finally, tear down the communication domain */ + rc = GNI_CdmDestroy (ompi_common_ugni_module.cd_handle); + if (GNI_RC_SUCCESS != rc) { + OPAL_OUTPUT((-1, "error destroying cdm")); + } + } + + ompi_common_ugni_module_ref_count--; + + return OMPI_SUCCESS; +} + +int ompi_common_ugni_init (void) +{ + int modes, rc, my_rank, i; + size_t comm_world_size; + ompi_proc_t *my_proc; + + ompi_common_ugni_module_ref_count ++; + + if (ompi_common_ugni_module_ref_count > 1) { + return OMPI_SUCCESS; + } + + my_proc = ompi_proc_local (); + my_rank = my_proc->proc_name.vpid; + + /* pull settings from ugni btl */ + ompi_common_ugni_module.rdma_max_retries = + mca_btl_ugni_component.rdma_max_retries; + ompi_common_ugni_module.local_cq_size = + mca_btl_ugni_component.cq_size; + + (void) ompi_proc_world (&comm_world_size); + + /* Create a communication domain */ + modes = GNI_CDM_MODE_FORK_FULLCOPY | GNI_CDM_MODE_CACHED_AMO_ENABLED | + GNI_CDM_MODE_DUAL_EVENTS | GNI_CDM_MODE_ERR_NO_KILL | + GNI_CDM_MODE_FAST_DATAGRAM_POLL; + + /* collect uGNI information */ + rc = get_ptag(&ompi_common_ugni_module.ptag); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + return rc; + } + + rc = get_cookie(&ompi_common_ugni_module.cookie); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + return rc; + } + + /* create a communication domain */ + rc = GNI_CdmCreate (my_rank, ompi_common_ugni_module.ptag, + ompi_common_ugni_module.cookie, modes, + &ompi_common_ugni_module.cd_handle); + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { + OPAL_OUTPUT((0, "Error: Creating communication domain %d\n",rc)); + return ompi_common_rc_ugni_to_ompi (rc); + } + + /* setup uGNI devices. 
we only support one device atm */ + ompi_common_ugni_module.device_count = 1; + ompi_common_ugni_module.devices = calloc (ompi_common_ugni_module.device_count, + sizeof (ompi_common_ugni_device_t)); + + for (i = 0 ; i < ompi_common_ugni_module.device_count ; ++i) { + rc = ompi_common_ugni_device_init (ompi_common_ugni_module.devices + i, + comm_world_size, i); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + OPAL_OUTPUT((-1, "error initializing uGNI device")); + return rc; + } + } + + /* send ugni modex */ + ompi_common_ugni_send_modex (my_rank); + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/common/ugni/common_ugni.h b/ompi/mca/common/ugni/common_ugni.h new file mode 100644 index 0000000000..105d96d2f8 --- /dev/null +++ b/ompi/mca/common/ugni/common_ugni.h @@ -0,0 +1,193 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "opal/util/output.h" +#include "opal/class/opal_list.h" +#include "opal/include/opal/prefetch.h" +#include "opal_stdint.h" + +#include "ompi/include/ompi/constants.h" +#include "ompi/runtime/ompi_module_exchange.h" +#include "ompi/proc/proc.h" + +#include +#include +#include +#include +#include +#include + +#include "common_ugni_ep.h" + +#if !defined(MPI_COMMON_UGNI_H) +#define MPI_COMMON_UGNI_H + +struct ompi_common_ugni_modex_t { + uint32_t addr; + int id; +}; +typedef struct ompi_common_ugni_modex_t ompi_common_ugni_modex_t; + +struct ompi_common_ugni_device_t { + opal_object_t super; + + gni_nic_handle_t dev_handle; + + /* Minor number of the Gemini NIC */ + int32_t dev_id; + uint32_t dev_pe_addr; + uint32_t dev_addr; + uint32_t dev_cpu_id; + + gni_cq_handle_t dev_local_cq; + + size_t dev_ep_count; + ompi_common_ugni_endpoint_t **dev_eps; +}; +typedef struct 
ompi_common_ugni_device_t ompi_common_ugni_device_t; + +struct ompi_common_ugni_module_t { + /* protection tag */ + uint8_t ptag; + + /* unique id for this process assigned by the system */ + uint32_t cookie; + + /* communication domain handle */ + gni_cdm_handle_t cd_handle; + + /* device count. to be used if we have more than 1 common per ugni device */ + int device_count; + ompi_common_ugni_device_t *devices; + + int local_cq_size; + + int rdma_max_retries; +}; +typedef struct ompi_common_ugni_module_t ompi_common_ugni_module_t; + +struct ompi_common_ugni_post_desc_t { + gni_post_descriptor_t base; + + ompi_common_ugni_endpoint_t *endpoint; + int tries; + + /* NTH: callback function for this post. this may change in the future */ + void (*cbfunc) (struct ompi_common_ugni_post_desc_t *, int); +}; +typedef struct ompi_common_ugni_post_desc_t ompi_common_ugni_post_desc_t; + +extern ompi_common_ugni_module_t ompi_common_ugni_module; +extern mca_base_component_t ompi_common_ugni_component; + +static inline int +ompi_common_rc_ugni_to_ompi (gni_return_t rc) +{ + int codes[] = {OMPI_SUCCESS, + OMPI_ERR_RESOURCE_BUSY, + OMPI_ERR_BAD_PARAM, + OMPI_ERR_OUT_OF_RESOURCE, + OMPI_ERR_TIMEOUT, + OMPI_ERR_PERM, + OMPI_ERROR, + OMPI_ERR_BAD_PARAM, + OMPI_ERR_BAD_PARAM, + OMPI_ERR_NOT_FOUND, + OMPI_ERR_VALUE_OUT_OF_BOUNDS, + OMPI_ERROR, + OMPI_ERR_NOT_SUPPORTED, + OMPI_ERR_OUT_OF_RESOURCE}; + return codes[rc]; +} + +/* + * Initialize uGNI communication domain and device(s). + */ +int ompi_common_ugni_init (void); + +/* + * Finalize uGNI communication domain and device(s). 
+ */ +int ompi_common_ugni_fini (void); + +static inline int +ompi_common_ugni_process_completed_post (ompi_common_ugni_device_t *dev, + gni_cq_handle_t cq_handle) { + ompi_common_ugni_post_desc_t *desc; + gni_return_t rc = GNI_RC_NOT_DONE; + gni_cq_entry_t event_data = 0; + uint32_t recoverable = 1; + + rc = GNI_CqGetEvent (cq_handle, &event_data); + if (GNI_RC_NOT_DONE == rc || GNI_CQ_GET_TYPE(event_data) != GNI_CQ_EVENT_TYPE_POST) { + /* ignore smsg completion */ + return 0; + } + + if (OPAL_UNLIKELY((GNI_RC_SUCCESS != rc && !event_data) || GNI_CQ_OVERRUN(event_data))) { + /* TODO -- need to handle overrun -- how do we do this without an event? + will the event eventually come back? Ask Cray */ + OPAL_OUTPUT((-1, "post error! cq overrun = %d", (int)GNI_CQ_OVERRUN(event_data))); + assert (GNI_RC_SUCCESS == rc); + return ompi_common_rc_ugni_to_ompi (rc); + } + + rc = GNI_GetCompleted (cq_handle, event_data, (gni_post_descriptor_t **) &desc); + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { + OPAL_OUTPUT((-1, "Error in GNI_GetComplete %s", gni_err_str[rc])); + return ompi_common_rc_ugni_to_ompi (rc); + } + + if (OPAL_UNLIKELY(!GNI_CQ_STATUS_OK(event_data))) { + (void) GNI_CqErrorRecoverable (event_data, &recoverable); + + if (OPAL_UNLIKELY(!recoverable || + ++desc->tries >= ompi_common_ugni_module.rdma_max_retries)) { + OPAL_OUTPUT((-1, "giving up on descriptor %p", (void *) desc)); + /* give up */ + desc->cbfunc (desc, OMPI_ERROR); + + return OMPI_ERROR; + } + + /* repost transaction */ + if (GNI_POST_RDMA_PUT == desc->base.type || + GNI_POST_RDMA_GET == desc->base.type) { + rc = GNI_PostRdma (desc->endpoint->ep_handle, &desc->base); + } else { + rc = GNI_PostFma (desc->endpoint->ep_handle, &desc->base); + } + + return ompi_common_rc_ugni_to_ompi (rc); + } + + desc->cbfunc (desc, OMPI_SUCCESS); + + return 1; +} + +static inline int ompi_common_ugni_progress (void) { + ompi_common_ugni_device_t *dev; + int count, i; + + for (i = 0, count = 0 ; i < 
ompi_common_ugni_module.device_count ; ++i) { + dev = ompi_common_ugni_module.devices + i; + /* progress fma transactions (ignore local smsg) */ + count += ompi_common_ugni_process_completed_post (dev, dev->dev_local_cq); + } + + return count; +} + +#endif /* MPI_COMMON_UGNI_H */ diff --git a/ompi/mca/common/ugni/common_ugni_ep.c b/ompi/mca/common/ugni/common_ugni_ep.c new file mode 100644 index 0000000000..2e0b787df5 --- /dev/null +++ b/ompi/mca/common/ugni/common_ugni_ep.c @@ -0,0 +1,147 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "common_ugni.h" + +static void ompi_common_ugni_ep_construct (ompi_common_ugni_endpoint_t *ep) +{ + OBJ_CONSTRUCT(&ep->lock, opal_mutex_t); + ep->state = OMPI_COMMON_UGNI_INIT; + ep->bind_count = 0; +} + +static void ompi_common_ugni_ep_destruct (ompi_common_ugni_endpoint_t *ep) +{ + OBJ_DESTRUCT(&ep->lock); + ompi_common_ugni_endpoint_unbind (ep); + ep->dev->dev_eps[ep->ep_rem_id] = NULL; +} + +OBJ_CLASS_INSTANCE(ompi_common_ugni_endpoint_t, opal_object_t, + ompi_common_ugni_ep_construct, ompi_common_ugni_ep_destruct); + +int ompi_common_ugni_endpoint_for_proc (ompi_common_ugni_device_t *dev, ompi_proc_t *peer_proc, + ompi_common_ugni_endpoint_t **ep) +{ + ompi_common_ugni_endpoint_t *endpoint; + ompi_common_ugni_modex_t *modex; + size_t msg_size; + int rem_id, rc; + + assert (NULL != dev && NULL != ep && peer_proc); + + rem_id = peer_proc->proc_name.vpid;; + + if (NULL == dev->dev_eps[rem_id]) { + endpoint = OBJ_NEW(ompi_common_ugni_endpoint_t); + if (OPAL_UNLIKELY(NULL == endpoint)) { + assert (0); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* Receive the modex */ + rc = ompi_modex_recv(&ompi_common_ugni_component, + peer_proc, (void *)&modex, &msg_size); + if 
(OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + OPAL_OUTPUT((-1, "btl/ugni error receiving modex")); + return rc; + } + + /* these should be the same */ + assert (rem_id == modex->id); + + endpoint->ep_rem_addr = modex->addr; + endpoint->ep_rem_id = modex->id; + + endpoint->dev = dev; + + *ep = endpoint; + + dev->dev_eps[rem_id] = endpoint; + } else { + OBJ_RETAIN(dev->dev_eps[rem_id]); + *ep = dev->dev_eps[rem_id]; + } + + return OMPI_SUCCESS; +} + +void ompi_common_ugni_endpoint_return (ompi_common_ugni_endpoint_t *ep) +{ + assert(NULL != ep); + + OBJ_RELEASE(ep); +} + +int ompi_common_ugni_endpoint_bind (ompi_common_ugni_endpoint_t *ep) +{ + int rc; + + assert (NULL != ep); + if (OPAL_UNLIKELY(NULL == ep)) { + return OPAL_ERR_BAD_PARAM; + } + + do { + if (OPAL_LIKELY(OMPI_COMMON_UGNI_BOUND <= ep->state)) { + return OMPI_SUCCESS; + } + + OPAL_THREAD_LOCK(&ep->lock); + /* create a uGNI endpoint handle and bind it to the remote peer */ + rc = GNI_EpCreate (ep->dev->dev_handle, ep->dev->dev_local_cq, + &ep->ep_handle); + if (GNI_RC_SUCCESS != rc) { + rc = ompi_common_rc_ugni_to_ompi (rc); + break; + } + + rc = GNI_EpBind (ep->ep_handle, ep->ep_rem_addr, ep->ep_rem_id); + if (GNI_RC_SUCCESS != rc) { + rc = ompi_common_rc_ugni_to_ompi (rc); + break; + } + + ep->state = OMPI_COMMON_UGNI_BOUND; + } while (0); + + OPAL_THREAD_UNLOCK(&ep->lock); + + return rc; +} + +int ompi_common_ugni_endpoint_unbind (ompi_common_ugni_endpoint_t *ep) +{ + int rc; + + if (0 == ep->bind_count) { + return OMPI_SUCCESS; + } + + assert (OMPI_COMMON_UGNI_BOUND == ep->state); + + rc = GNI_EpUnbind (ep->ep_handle); + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { + /* should warn */ + } + + GNI_EpDestroy (ep->ep_handle); + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { + /* should warn */ + } + + ep->state = OMPI_COMMON_UGNI_INIT; + ep->bind_count--; + + return OMPI_SUCCESS; +} + diff --git a/ompi/mca/common/ugni/common_ugni_ep.h b/ompi/mca/common/ugni/common_ugni_ep.h new file mode 100644 index 
0000000000..c2bc5d14a2
--- /dev/null
+++ b/ompi/mca/common/ugni/common_ugni_ep.h
@@ -0,0 +1,72 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2011      Los Alamos National Security, LLC. All rights
+ *                         reserved.
+ * Copyright (c) 2011      UT-Battelle, LLC. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#if !defined(MPI_COMMON_UGNI_EP_H)
+#define MPI_COMMON_UGNI_EP_H
+
+enum ompi_common_ugni_endpoint_state_t {
+    OMPI_COMMON_UGNI_INIT = 0,   /**< no uGNI handle bound; set by the constructor and by unbind */
+    OMPI_COMMON_UGNI_BOUND,      /**< handle created and bound by ompi_common_ugni_endpoint_bind */
+    OMPI_COMMON_UGNI_CONNECTING, /**< not assigned in common_ugni_ep.c; presumably set by users of the endpoint -- confirm */
+    OMPI_COMMON_UGNI_CONNECTED   /**< not assigned in common_ugni_ep.c; presumably set by users of the endpoint -- confirm */
+};
+typedef enum ompi_common_ugni_endpoint_state_t ompi_common_ugni_endpoint_state_t;
+
+struct ompi_common_ugni_device_t; /* forward declaration; only used through a pointer in this header */
+
+struct ompi_common_ugni_endpoint_t {
+    opal_object_t super;                     /**< base object (OBJ_RETAIN/OBJ_RELEASE reference counting) */
+    gni_ep_handle_t ep_handle;               /**< uGNI handle for this endpoint */
+    ompi_common_ugni_endpoint_state_t state; /**< bind/connection state */
+    uint32_t ep_rem_addr, ep_rem_id;         /**< remote address and id, copied from the peer's modex */
+    struct ompi_common_ugni_device_t *dev;   /**< device this endpoint is using */
+    opal_mutex_t lock;                       /**< taken by ompi_common_ugni_endpoint_bind while binding */
+    int bind_count;                          /**< bind reference count; unbind does nothing while it is 0 */
+};
+typedef struct ompi_common_ugni_endpoint_t ompi_common_ugni_endpoint_t;
+
+OBJ_CLASS_DECLARATION(ompi_common_ugni_endpoint_t);
+
+/*
+ * Get (and retain) a reference to the endpoint for peer_proc, creating it on
+ * first use. It must be given back with ompi_common_ugni_endpoint_return.
+ *
+ * @param[IN]  dev        uGNI device this endpoint should be bound to.
+ * @param[IN]  peer_proc  remote peer the endpoint will be connected to.
+ * @param[OUT] ep         uGNI endpoint for the peer (not written on failure)
+ */
+int ompi_common_ugni_endpoint_for_proc (struct ompi_common_ugni_device_t *dev, ompi_proc_t *peer_proc,
+                                        ompi_common_ugni_endpoint_t **ep);
+
+/*
+ * Allocate and bind a uGNI endpoint handle to the remote peer.
+ * Returns OMPI_SUCCESS without doing anything if the endpoint is already bound.
+ * @param[IN]  ep         uGNI endpoint to bind
+ */
+int ompi_common_ugni_endpoint_bind (ompi_common_ugni_endpoint_t *ep);
+
+/*
+ * Unbind and free the uGNI endpoint handle associated with this endpoint.
+ * Does nothing while the endpoint's bind_count is 0.
+ * @param[IN]  ep         uGNI endpoint to unbind
+ */
+int ompi_common_ugni_endpoint_unbind (ompi_common_ugni_endpoint_t *ep);
+
+/*
+ * Return (and possibly free) an endpoint. The caller's reference is released
+ * with OBJ_RELEASE; the endpoint may not be used once it is returned.
+ *
+ * @param[IN]  ep         uGNI endpoint to return
+ */
+void ompi_common_ugni_endpoint_return (ompi_common_ugni_endpoint_t *ep);
+
+#endif /* MPI_COMMON_UGNI_EP_H */
diff --git a/ompi/mca/common/ugni/configure.m4 b/ompi/mca/common/ugni/configure.m4
new file mode 100644
index 0000000000..7322775594
--- /dev/null
+++ b/ompi/mca/common/ugni/configure.m4
@@ -0,0 +1,56 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+#                         University Research and Technology
+#                         Corporation.  All rights reserved.
+# Copyright (c) 2004-2005 The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+#                         University of Stuttgart.  All rights reserved.
+# Copyright (c) 2004-2006 The Regents of the University of California.
+#                         All rights reserved.
+# Copyright (c) 2006      QLogic Corp. All rights reserved.
+# Copyright (c) 2009      Cisco Systems, Inc.  All rights reserved.
+# Copyright (c) 2011      Los Alamos National Security, LLC.
+#                         All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# MCA_ompi_common_ugni_CONFIG([action-if-can-compile], [action-if-cant-compile])
+# --------------------------------------------------------
+# runs OMPI_CHECK_UGNI with the prefix common_ugni.  if GNI support is
+# found, copies common_ugni_{LDFLAGS, LIBS} into the matching
+# common_ugni_WRAPPER_EXTRA_* variables and runs the first action,
+# otherwise runs the second.  common_ugni_{CPPFLAGS, LDFLAGS, LIBS} are
+# substituted either way.
+#
+# NOTES
+# on Cray XE6 systems, the GNI development header (gni_pub.h) is in a
+# completely different place than the ugni library (libugni).
+#
+# EXAMPLE CONFIGURE USAGE:
+# --with-ugni=/opt/cray/ugni/default --with-ugni-includedir=/opt/cray/gni-headers/default/include
+
+AC_DEFUN([MCA_ompi_common_ugni_CONFIG],[
+    AC_CONFIG_FILES([ompi/mca/common/ugni/Makefile])
+
+    OMPI_CHECK_UGNI([common_ugni],
+                    [common_ugni_happy="yes"],
+                    [common_ugni_happy="no"])
+
+    AS_IF([test "$common_ugni_happy" = "yes"],
+          [common_ugni_WRAPPER_EXTRA_LDFLAGS="$common_ugni_LDFLAGS"
+           common_ugni_WRAPPER_EXTRA_LIBS="$common_ugni_LIBS"
+           $1],
+          [$2])
+
+    # substitute in the things needed to build ugni
+    AC_SUBST([common_ugni_CPPFLAGS])
+    AC_SUBST([common_ugni_LDFLAGS])
+    AC_SUBST([common_ugni_LIBS])
+])dnl