diff --git a/opal/mca/btl/ugni/Makefile.am b/opal/mca/btl/ugni/Makefile.am index 371b83e2cf..2e9153641e 100644 --- a/opal/mca/btl/ugni/Makefile.am +++ b/opal/mca/btl/ugni/Makefile.am @@ -1,6 +1,6 @@ # -*- indent-tabs-mode:nil -*- # -# Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights +# Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights # reserved. # Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. # @@ -40,14 +40,15 @@ ugni_SOURCES = \ btl_ugni_smsg.c \ btl_ugni_progress_thread.c \ btl_ugni_prepare.h \ - btl_ugni_atomic.c + btl_ugni_atomic.c \ + btl_ugni_init.c \ + btl_ugni_device.h mcacomponentdir = $(opallibdir) mcacomponent_LTLIBRARIES = $(component_install) mca_btl_ugni_la_SOURCES = $(ugni_SOURCES) nodist_mca_btl_ugni_la_SOURCES = $(ugni_nodist_SOURCES) -mca_btl_ugni_la_LIBADD = $(btl_ugni_LIBS) \ - $(OPAL_TOP_BUILDDIR)/opal/mca/common/ugni/lib@OPAL_LIB_PREFIX@mca_common_ugni.la +mca_btl_ugni_la_LIBADD = $(btl_ugni_LIBS) mca_btl_ugni_la_LDFLAGS = -module -avoid-version $(btl_ugni_LDFLAGS) noinst_LTLIBRARIES = $(component_noinst) diff --git a/opal/mca/btl/ugni/btl_ugni.h b/opal/mca/btl/ugni/btl_ugni.h index e6487c7cd0..7b5f4e7456 100644 --- a/opal/mca/btl/ugni/btl_ugni.h +++ b/opal/mca/btl/ugni/btl_ugni.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. 
* Copyright (c) 2014 Research Organization for Information Science @@ -34,7 +34,6 @@ #include "opal/mca/btl/base/btl_base_error.h" #include "opal/class/opal_hash_table.h" #include "opal/class/opal_free_list.h" -#include "opal/mca/common/ugni/common_ugni.h" #include #include @@ -48,6 +47,23 @@ #define MCA_BTL_UGNI_CONNECT_DIRECTED_ID 0x8000000000000000ull #define MCA_BTL_UGNI_DATAGRAM_MASK 0x8000000000000000ull +/** maximum number of supported virtual devices */ +#define MCA_BTL_UGNI_MAX_DEV_HANDLES 128 + +/** number of rdma completion queue items to remove per progress loop */ +#define MCA_BTL_UGNI_COMPLETIONS_PER_LOOP 16 + +/** + * Modex data + */ +struct mca_btl_ugni_modex_t { + /** GNI NIC address */ + uint32_t addr; + /** CDM identifier (base) */ + int id; +}; +typedef struct mca_btl_ugni_modex_t mca_btl_ugni_modex_t; + /* ompi and smsg endpoint attributes */ typedef struct mca_btl_ugni_endpoint_attr_t { opal_process_name_t proc_name; @@ -61,12 +77,73 @@ enum { MCA_BTL_UGNI_RCACHE_GRDMA }; +enum mca_btl_ugni_free_list_id_t { + /* eager fragment list (registered) */ + MCA_BTL_UGNI_LIST_EAGER_SEND, + MCA_BTL_UGNI_LIST_EAGER_RECV, + /* SMSG fragment list (unregistered) */ + MCA_BTL_UGNI_LIST_SMSG, + /* RDMA fragment list */ + MCA_BTL_UGNI_LIST_RDMA, + MCA_BTL_UGNI_LIST_RDMA_INT, + MCA_BTL_UGNI_LIST_MAX, +}; + +struct mca_btl_ugni_cq_t { + /** ugni CQ handle */ + gni_cq_handle_t gni_handle; + /** number of completions expected on the CQ */ + int32_t active_operations; +}; +typedef struct mca_btl_ugni_cq_t mca_btl_ugni_cq_t; + +/** + * GNI virtual device + */ +struct mca_btl_ugni_device_t { + /** Communication domain handle */ + gni_cdm_handle_t dev_cd_handle; + + /** protection for ugni access */ + volatile int32_t lock; + + /** Index of device in module devices array */ + int dev_index; + + /** number of SMSG connections */ + volatile int32_t smsg_connections; + + /** uGNI device handle */ + gni_nic_handle_t dev_handle; + + /** uGNI rdma completion queue */ + 
mca_btl_ugni_cq_t dev_rdma_local_cq; + + /** local rdma completion queue (async) */ + mca_btl_ugni_cq_t dev_rdma_local_irq_cq; + + /** local SMSG completion queue */ + mca_btl_ugni_cq_t dev_smsg_local_cq; + + /** IRQ memory handle for this device */ + gni_mem_handle_t smsg_irq_mhndl; + + /** RDMA endpoint free list */ + opal_free_list_t endpoints; + + /** post descriptors pending resources */ + opal_list_t pending_post; +}; +typedef struct mca_btl_ugni_device_t mca_btl_ugni_device_t; + +typedef intptr_t (*mca_btl_ugni_device_serialize_fn_t) (mca_btl_ugni_device_t *device, void *arg); + typedef struct mca_btl_ugni_module_t { mca_btl_base_module_t super; bool initialized; - opal_common_ugni_device_t *device; + mca_btl_ugni_device_t devices[MCA_BTL_UGNI_MAX_DEV_HANDLES]; opal_mutex_t endpoint_lock; size_t endpoint_count; @@ -82,9 +159,6 @@ typedef struct mca_btl_ugni_module_t { opal_mutex_t eager_get_pending_lock; opal_list_t eager_get_pending; - opal_mutex_t pending_descriptors_lock; - opal_list_t pending_descriptors; - opal_free_list_t post_descriptors; mca_mpool_base_module_t *mpool; @@ -95,23 +169,11 @@ typedef struct mca_btl_ugni_module_t { struct mca_btl_ugni_endpoint_attr_t wc_remote_attr, wc_local_attr; - gni_cq_handle_t rdma_local_cq; gni_cq_handle_t smsg_remote_cq; - gni_cq_handle_t smsg_local_cq; gni_cq_handle_t smsg_remote_irq_cq; - gni_cq_handle_t rdma_local_irq_cq; - - /* eager fragment list (registered) */ - opal_free_list_t eager_frags_send; - opal_free_list_t eager_frags_recv; - - /* SMSG fragment list (unregistered) */ - opal_free_list_t smsg_frags; - - /* RDMA fragment list */ - opal_free_list_t rdma_frags; - opal_free_list_t rdma_int_frags; + /** fragment free lists (see enum mca_btl_ugni_free_list_id_t) */ + opal_free_list_t frags_lists[MCA_BTL_UGNI_LIST_MAX]; /* lock for this list */ opal_mutex_t ep_wait_list_lock; @@ -197,10 +259,62 @@ typedef struct mca_btl_ugni_component_t { /* Indicate whether progress thread allowed */ bool 
progress_thread_enabled; + /** Number of ugni device contexts to create per GNI device */ + int virtual_device_count; + + /** Protection tag */ + uint8_t ptag; + + /** Unique id for this process assigned by the system */ + uint32_t cookie; + + /** Starting value of communication identifier */ + uint32_t cdm_id_base; + + /** GNI CDM flags */ + uint32_t cdm_flags; + + /** NIC address */ + uint32_t dev_addr; } mca_btl_ugni_component_t; -int mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module, - opal_common_ugni_device_t *device); +/* Global structures */ + +OPAL_MODULE_DECLSPEC extern mca_btl_ugni_component_t mca_btl_ugni_component; +OPAL_MODULE_DECLSPEC extern mca_btl_ugni_module_t mca_btl_ugni_module; + +/** + * Get a virtual device for communication + */ +static inline mca_btl_ugni_device_t *mca_btl_ugni_ep_get_device (mca_btl_ugni_module_t *ugni_module) +{ + static volatile uint32_t device_index = (uint32_t) 0; + uint32_t dev_index; + + /* don't really care if the device index is atomically updated */ + dev_index = (device_index++) & (mca_btl_ugni_component.virtual_device_count - 1); + + return ugni_module->devices + dev_index; +} + +static inline int mca_btl_rc_ugni_to_opal (gni_return_t rc) +{ + static int codes[] = {OPAL_SUCCESS, + OPAL_ERR_RESOURCE_BUSY, + OPAL_ERR_BAD_PARAM, + OPAL_ERR_OUT_OF_RESOURCE, + OPAL_ERR_TIMEOUT, + OPAL_ERR_PERM, + OPAL_ERROR, + OPAL_ERR_BAD_PARAM, + OPAL_ERR_BAD_PARAM, + OPAL_ERR_NOT_FOUND, + OPAL_ERR_VALUE_OUT_OF_BOUNDS, + OPAL_ERROR, + OPAL_ERR_NOT_SUPPORTED, + OPAL_ERR_OUT_OF_RESOURCE}; + return codes[rc]; +} /** * BML->BTL notification of change in the process list. @@ -324,10 +438,32 @@ typedef struct mca_btl_ugni_reg_t { mca_btl_base_registration_handle_t handle; } mca_btl_ugni_reg_t; -/* Global structures */ +/** + * Initialize uGNI support. 
+ */ +int mca_btl_ugni_init (void); -OPAL_MODULE_DECLSPEC extern mca_btl_ugni_component_t mca_btl_ugni_component; -OPAL_MODULE_DECLSPEC extern mca_btl_ugni_module_t mca_btl_ugni_module; +/** + * Finalize uGNI support. + */ +int mca_btl_ugni_fini (void); + +int mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module); + +/** + * Initialize a virtual device for device index 0. + * + * @param[inout] device Device to initialize + * @param[in] virtual_device_id Virtual device identifier (up to max handles) + */ +int mca_btl_ugni_device_init (mca_btl_ugni_device_t *device, int virtual_device_id); + +/** + * Finalize a virtual device. + * + * @param[in] dev Device to finalize + */ +int mca_btl_ugni_device_fini (mca_btl_ugni_device_t *dev); /* Get a unique 64-bit id for the process name */ static inline uint64_t mca_btl_ugni_proc_name_to_id (opal_process_name_t name) { @@ -338,6 +474,57 @@ static inline uint64_t mca_btl_ugni_proc_name_to_id (opal_process_name_t name) { int mca_btl_ugni_spawn_progress_thread(struct mca_btl_base_module_t* btl); int mca_btl_ugni_kill_progress_thread(void); +/** + * Try to lock a uGNI device for exclusive access + */ +static inline int mca_btl_ugni_device_trylock (mca_btl_ugni_device_t *device) +{ + /* checking the lock non-atomically first can reduce the number of + * unnecessary atomic operations. 
*/ + return (device->lock || opal_atomic_swap_32 (&device->lock, 1)); +} + +/** + * Lock a uGNI device for exclusive access + */ +static inline void mca_btl_ugni_device_lock (mca_btl_ugni_device_t *device) +{ + while (mca_btl_ugni_device_trylock (device)); +} + +/** + * Release exclusive access to the device + */ +static inline void mca_btl_ugni_device_unlock (mca_btl_ugni_device_t *device) +{ + opal_atomic_wmb (); + device->lock = 0; +} + +/** + * Serialize an operation on a uGNI device + * + * @param[in] device ugni device + * @param[in] fn function to serialize + * @param[in] arg function argument + */ +static inline intptr_t mca_btl_ugni_device_serialize (mca_btl_ugni_device_t *device, + mca_btl_ugni_device_serialize_fn_t fn, void *arg) +{ + intptr_t rc; + + if (!opal_using_threads ()) { + return fn (device, arg); + } + + /* NTH: for now the device is just protected by a spin lock but this will change in the future */ + mca_btl_ugni_device_lock (device); + rc = fn (device, arg); + mca_btl_ugni_device_unlock (device); + return rc; +} + + /** Number of times the progress thread has woken up */ extern unsigned int mca_btl_ugni_progress_thread_wakeups; diff --git a/opal/mca/btl/ugni/btl_ugni_add_procs.c b/opal/mca/btl/ugni/btl_ugni_add_procs.c index 0bd1b0005b..a789149a9b 100644 --- a/opal/mca/btl/ugni/btl_ugni_add_procs.c +++ b/opal/mca/btl/ugni/btl_ugni_add_procs.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. 
@@ -20,7 +20,7 @@ #include "opal/include/opal/align.h" #include "opal/mca/pmix/pmix.h" -#define INITIAL_GNI_EPS 10000 +#define INITIAL_GNI_EPS 1024 static int mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module); @@ -50,7 +50,7 @@ int mca_btl_ugni_add_procs (struct mca_btl_base_module_t* btl, size_t nprocs, /* NTH: might want to vary this size based off the universe size (if * one exists). the table is only used for connection lookup and * endpoint removal. */ - rc = opal_hash_table_init (&ugni_module->id_to_endpoint, 512); + rc = opal_hash_table_init (&ugni_module->id_to_endpoint, INITIAL_GNI_EPS); if (OPAL_SUCCESS != rc) { BTL_ERROR(("error initializing the endpoint hash. rc = %d", rc)); return rc; @@ -58,93 +58,63 @@ int mca_btl_ugni_add_procs (struct mca_btl_base_module_t* btl, size_t nprocs, } for (size_t i = 0 ; i < nprocs ; ++i) { - struct opal_proc_t *opal_proc = procs[i]; - uint64_t proc_id = mca_btl_ugni_proc_name_to_id(opal_proc->proc_name); - - /* check for an existing endpoint */ - OPAL_THREAD_LOCK(&ugni_module->endpoint_lock); - if (OPAL_SUCCESS != opal_hash_table_get_value_uint64 (&ugni_module->id_to_endpoint, proc_id, (void **) (peers + i))) { - if (OPAL_PROC_ON_LOCAL_NODE(opal_proc->proc_flags)) { - ugni_module->nlocal_procs++; - - /* ugni is allowed on local processes to provide support for network - * atomic operations */ - } - - /* Create and Init endpoints */ - rc = mca_btl_ugni_init_ep (ugni_module, peers + i, (mca_btl_ugni_module_t *) btl, opal_proc); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - OPAL_THREAD_UNLOCK(&ugni_module->endpoint_lock); - BTL_ERROR(("btl/ugni error initializing endpoint")); - return rc; - } - - /* go ahead and connect the local endpoint for RDMA/CQ write */ - if (opal_proc == opal_proc_local_get ()) { - ugni_module->local_ep = peers[i]; - } - - /* Add this endpoint to the pointer array. 
*/ - BTL_VERBOSE(("initialized uGNI endpoint for proc id: 0x%" PRIx64 " ptr: %p", proc_id, (void *) peers[i])); - opal_hash_table_set_value_uint64 (&ugni_module->id_to_endpoint, proc_id, peers[i]); - - ++ugni_module->endpoint_count; + peers[i] = mca_btl_ugni_get_ep (btl, procs[i]); + if (NULL == peers[i]) { + continue; + } + + if (procs[i] == opal_proc_local_get ()) { + ugni_module->local_ep = peers[i]; } - OPAL_THREAD_UNLOCK(&ugni_module->endpoint_lock); /* Set the reachable bit if necessary */ if (reachable) { - rc = opal_bitmap_set_bit (reachable, i); + (void) opal_bitmap_set_bit (reachable, i); } } mca_btl_ugni_module_set_max_reg (ugni_module, ugni_module->nlocal_procs); if (false == ugni_module->initialized) { - OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); - rc = GNI_CqCreate (ugni_module->device->dev_handle, mca_btl_ugni_component.local_cq_size, - 0, GNI_CQ_NOBLOCK, NULL, NULL, &ugni_module->rdma_local_cq); - OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); - if (GNI_RC_SUCCESS != rc) { - BTL_ERROR(("error creating local BTE/FMA CQ")); - return opal_common_rc_ugni_to_opal (rc); + for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) { + mca_btl_ugni_device_t *device = ugni_module->devices + i; + rc = GNI_CqCreate (device->dev_handle, mca_btl_ugni_component.local_cq_size, 0, + GNI_CQ_NOBLOCK, NULL, NULL, &device->dev_rdma_local_cq.gni_handle); + if (GNI_RC_SUCCESS != rc) { + BTL_ERROR(("error creating local BTE/FMA CQ")); + return mca_btl_rc_ugni_to_opal (rc); + } + + rc = GNI_CqCreate (device->dev_handle, mca_btl_ugni_component.local_cq_size, + 0, GNI_CQ_NOBLOCK, NULL, NULL, &device->dev_smsg_local_cq.gni_handle); + if (GNI_RC_SUCCESS != rc) { + BTL_ERROR(("error creating local SMSG CQ")); + return mca_btl_rc_ugni_to_opal (rc); + } + + if (mca_btl_ugni_component.progress_thread_enabled) { + rc = GNI_CqCreate (device->dev_handle, mca_btl_ugni_component.local_cq_size, + 0, GNI_CQ_BLOCKING, NULL, NULL, 
&device->dev_rdma_local_irq_cq.gni_handle); + if (GNI_RC_SUCCESS != rc) { + BTL_ERROR(("error creating local BTE/FMA CQ")); + return mca_btl_rc_ugni_to_opal (rc); + } + } } - OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); - rc = GNI_CqCreate (ugni_module->device->dev_handle, mca_btl_ugni_component.local_cq_size, - 0, GNI_CQ_NOBLOCK, NULL, NULL, &ugni_module->smsg_local_cq); - OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); - if (GNI_RC_SUCCESS != rc) { - BTL_ERROR(("error creating local SMSG CQ")); - return opal_common_rc_ugni_to_opal (rc); - } - - OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); - rc = GNI_CqCreate (ugni_module->device->dev_handle, mca_btl_ugni_component.remote_cq_size, + rc = GNI_CqCreate (ugni_module->devices[0].dev_handle, mca_btl_ugni_component.remote_cq_size, 0, GNI_CQ_NOBLOCK, NULL, NULL, &ugni_module->smsg_remote_cq); - OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); if (GNI_RC_SUCCESS != rc) { BTL_ERROR(("error creating remote SMSG CQ")); - return opal_common_rc_ugni_to_opal (rc); + return mca_btl_rc_ugni_to_opal (rc); } if (mca_btl_ugni_component.progress_thread_enabled) { - OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); - rc = GNI_CqCreate (ugni_module->device->dev_handle, mca_btl_ugni_component.local_cq_size, - 0, GNI_CQ_BLOCKING, NULL, NULL, &ugni_module->rdma_local_irq_cq); - OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); - if (GNI_RC_SUCCESS != rc) { - BTL_ERROR(("error creating local BTE/FMA CQ")); - return opal_common_rc_ugni_to_opal (rc); - } - - OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); - rc = GNI_CqCreate (ugni_module->device->dev_handle, mca_btl_ugni_component.remote_cq_size, + rc = GNI_CqCreate (ugni_module->devices[0].dev_handle, mca_btl_ugni_component.remote_cq_size, 0, GNI_CQ_BLOCKING, NULL, NULL, &ugni_module->smsg_remote_irq_cq); - OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); if (GNI_RC_SUCCESS != rc) { BTL_ERROR(("error creating remote SMSG CQ")); - return opal_common_rc_ugni_to_opal 
(rc); + return mca_btl_rc_ugni_to_opal (rc); } } @@ -175,15 +145,13 @@ int mca_btl_ugni_add_procs (struct mca_btl_base_module_t* btl, size_t nprocs, return OPAL_ERR_OUT_OF_RESOURCE; } - OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); - rc = GNI_MemRegister(ugni_module->device->dev_handle, + rc = GNI_MemRegister(ugni_module->devices[0].dev_handle, (unsigned long)mmap_start_addr, 4096, ugni_module->smsg_remote_irq_cq, GNI_MEM_READWRITE, -1, - &ugni_module->device->smsg_irq_mhndl); - OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); + &ugni_module->devices[0].smsg_irq_mhndl); mca_btl_ugni_spawn_progress_thread(btl); } @@ -198,18 +166,10 @@ int mca_btl_ugni_del_procs (struct mca_btl_base_module_t *btl, size_t nprocs, struct opal_proc_t **procs, struct mca_btl_base_endpoint_t **peers) { mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl; - size_t i; - int rc; - while (ugni_module->active_send_count) { - /* ensure all sends are complete before removing and procs */ - rc = mca_btl_ugni_progress_local_smsg (ugni_module); - if (OPAL_SUCCESS != rc) { - break; - } - } + OPAL_THREAD_LOCK(&ugni_module->endpoint_lock); - for (i = 0 ; i < nprocs ; ++i) { + for (size_t i = 0 ; i < nprocs ; ++i) { struct opal_proc_t *opal_proc = procs[i]; uint64_t proc_id = mca_btl_ugni_proc_name_to_id(opal_proc->proc_name); mca_btl_base_endpoint_t *ep = NULL; @@ -224,10 +184,18 @@ int mca_btl_ugni_del_procs (struct mca_btl_base_module_t *btl, --ugni_module->endpoint_count; } + if (OPAL_PROC_ON_LOCAL_NODE(opal_proc->proc_flags)) { + --ugni_module->nlocal_procs; + } + /* remote the endpoint from the hash table */ opal_hash_table_set_value_uint64 (&ugni_module->id_to_endpoint, proc_id, NULL); } + OPAL_THREAD_UNLOCK(&ugni_module->endpoint_lock); + + mca_btl_ugni_module_set_max_reg (ugni_module, ugni_module->nlocal_procs); + return OPAL_SUCCESS; } @@ -244,9 +212,12 @@ struct mca_btl_base_endpoint_t *mca_btl_ugni_get_ep (struct mca_btl_base_module_ do { rc = 
opal_hash_table_get_value_uint64 (&ugni_module->id_to_endpoint, proc_id, (void **) &ep); if (OPAL_SUCCESS == rc) { + BTL_VERBOSE(("returning existing endpoint for proc %s", OPAL_NAME_PRINT(proc->proc_name))); break; } + BTL_VERBOSE(("initialized uGNI endpoint for proc id: 0x%" PRIx64 " ptr: %p", proc_id, (void *) proc)); + /* Create and Init endpoints */ rc = mca_btl_ugni_init_ep (ugni_module, &ep, ugni_module, proc); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { @@ -254,8 +225,13 @@ struct mca_btl_base_endpoint_t *mca_btl_ugni_get_ep (struct mca_btl_base_module_ break; } - /* Add this endpoint to the pointer array. */ - BTL_VERBOSE(("initialized uGNI endpoint for proc id: 0x%" PRIx64 " ptr: %p", proc_id, (void *) ep)); + /* ugni is allowed on local processes to provide support for network atomic operations */ + if (OPAL_PROC_ON_LOCAL_NODE(proc->proc_flags)) { + ++ugni_module->nlocal_procs; + } + ++ugni_module->endpoint_count; + + /* add this endpoint to the connection lookup table */ opal_hash_table_set_value_uint64 (&ugni_module->id_to_endpoint, proc_id, ep); } while (0); @@ -269,10 +245,8 @@ static int ugni_reg_mem (void *reg_data, void *base, size_t size, mca_rcache_base_registration_t *reg) { mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) reg_data; - mca_btl_ugni_reg_t *ugni_reg = (mca_btl_ugni_reg_t *) reg; - gni_cq_handle_t cq = NULL; - gni_return_t rc; - int flags; + gni_cq_handle_t cq = 0; + int flags, rc; if (ugni_module->reg_count >= ugni_module->reg_max) { return OPAL_ERR_OUT_OF_RESOURCE; @@ -293,37 +267,26 @@ static int ugni_reg_mem (void *reg_data, void *base, size_t size, cq = ugni_module->smsg_remote_cq; } - OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); - rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base, - size, cq, flags, -1, &(ugni_reg->handle.gni_handle)); - OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); - - if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { - return OPAL_ERR_OUT_OF_RESOURCE; + rc = 
mca_btl_ugni_reg_mem (ugni_module, base, size, (mca_btl_ugni_reg_t *) reg, cq, flags); + if (OPAL_LIKELY(OPAL_SUCCESS == rc)) { + opal_atomic_add_32(&ugni_module->reg_count,1); } - opal_atomic_add_32(&ugni_module->reg_count,1); - - return OPAL_SUCCESS; + return rc; } static int ugni_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg) { mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) reg_data; - mca_btl_ugni_reg_t *ugni_reg = (mca_btl_ugni_reg_t *)reg; - gni_return_t rc; + int rc; - OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); - rc = GNI_MemDeregister (ugni_module->device->dev_handle, &ugni_reg->handle.gni_handle); - OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); - if (GNI_RC_SUCCESS != rc) { - return OPAL_ERROR; + rc = mca_btl_ugni_dereg_mem (ugni_module, (mca_btl_ugni_reg_t *) reg); + if (OPAL_LIKELY(OPAL_SUCCESS == rc)) { + opal_atomic_add_32(&ugni_module->reg_count,-1); } - opal_atomic_add_32(&ugni_module->reg_count,-1); - - return OPAL_SUCCESS; + return rc; } static int @@ -356,7 +319,7 @@ mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module) return rc; } - rc = opal_free_list_init (&ugni_module->smsg_frags, + rc = opal_free_list_init (ugni_module->frags_lists + MCA_BTL_UGNI_LIST_SMSG, sizeof (mca_btl_ugni_smsg_frag_t), opal_cache_line_size, OBJ_CLASS(mca_btl_ugni_smsg_frag_t), mca_btl_ugni_component.ugni_smsg_limit, @@ -365,13 +328,13 @@ mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module) mca_btl_ugni_component.ugni_free_list_max, mca_btl_ugni_component.ugni_free_list_inc, NULL, 0, NULL, (opal_free_list_item_init_fn_t) mca_btl_ugni_frag_init, - (void *) ugni_module); + (void *) (intptr_t) MCA_BTL_UGNI_LIST_SMSG); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { BTL_ERROR(("error creating smsg fragment free list")); return rc; } - rc = opal_free_list_init (&ugni_module->rdma_frags, + rc = opal_free_list_init (ugni_module->frags_lists + MCA_BTL_UGNI_LIST_RDMA, sizeof (mca_btl_ugni_rdma_frag_t), 64, 
OBJ_CLASS(mca_btl_ugni_rdma_frag_t), 0, opal_cache_line_size, @@ -379,17 +342,17 @@ mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module) mca_btl_ugni_component.ugni_free_list_max, mca_btl_ugni_component.ugni_free_list_inc, NULL, 0, NULL, (opal_free_list_item_init_fn_t) mca_btl_ugni_frag_init, - (void *) ugni_module); + (void *) (intptr_t) MCA_BTL_UGNI_LIST_RDMA); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { return rc; } - rc = opal_free_list_init (&ugni_module->rdma_int_frags, + rc = opal_free_list_init (ugni_module->frags_lists + MCA_BTL_UGNI_LIST_RDMA_INT, sizeof (mca_btl_ugni_rdma_frag_t), 8, OBJ_CLASS(mca_btl_ugni_rdma_frag_t), 0, opal_cache_line_size, 0, -1, 64, NULL, 0, NULL, (opal_free_list_item_init_fn_t) mca_btl_ugni_frag_init, - (void *) ugni_module); + (void *) (intptr_t) MCA_BTL_UGNI_LIST_RDMA_INT); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { return rc; } @@ -419,14 +382,14 @@ mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module) } ugni_module->rcache = - mca_rcache_base_module_create (rcache_name, ugni_module->device, &rcache_resources.base); + mca_rcache_base_module_create (rcache_name, ugni_module->devices, &rcache_resources.base); if (NULL == ugni_module->rcache) { BTL_ERROR(("error creating registration cache")); return OPAL_ERROR; } - rc = opal_free_list_init (&ugni_module->eager_frags_send, + rc = opal_free_list_init (ugni_module->frags_lists + MCA_BTL_UGNI_LIST_EAGER_SEND, sizeof (mca_btl_ugni_eager_frag_t), 8, OBJ_CLASS(mca_btl_ugni_eager_frag_t), ugni_module->super.btl_eager_limit, 64, @@ -435,13 +398,13 @@ mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module) mca_btl_ugni_component.ugni_eager_inc, ugni_module->super.btl_mpool, 0, ugni_module->rcache, (opal_free_list_item_init_fn_t) mca_btl_ugni_frag_init, - (void *) ugni_module); + (void *) (intptr_t) MCA_BTL_UGNI_LIST_EAGER_SEND); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { BTL_ERROR(("error creating eager send fragment free list")); return rc; } - rc = opal_free_list_init 
(&ugni_module->eager_frags_recv, + rc = opal_free_list_init (ugni_module->frags_lists + MCA_BTL_UGNI_LIST_EAGER_RECV, sizeof (mca_btl_ugni_eager_frag_t), 8, OBJ_CLASS(mca_btl_ugni_eager_frag_t), ugni_module->super.btl_eager_limit, 64, @@ -450,7 +413,7 @@ mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module) mca_btl_ugni_component.ugni_eager_inc, ugni_module->super.btl_mpool, 0, ugni_module->rcache, (opal_free_list_item_init_fn_t) mca_btl_ugni_frag_init, - (void *) ugni_module); + (void *) (intptr_t) MCA_BTL_UGNI_LIST_EAGER_RECV); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { BTL_ERROR(("error creating eager receive fragment free list")); return rc; @@ -503,14 +466,22 @@ mca_btl_ugni_module_set_max_reg (mca_btl_ugni_module_t *ugni_module, int nlocal_ gni_return_t grc; int fuzz = 20; - grc = GNI_GetJobResInfo (ugni_module->device->dev_id, opal_common_ugni_module.ptag, + grc = GNI_GetJobResInfo (0, mca_btl_ugni_component.ptag, GNI_JOB_RES_MDD, &res_des); if (GNI_RC_SUCCESS == grc) { - ugni_module->reg_max = (res_des.limit - fuzz) / nlocal_procs; + if (nlocal_procs) { + ugni_module->reg_max = (res_des.limit - fuzz) / nlocal_procs; + } else { + ugni_module->reg_max = 0; + } } #else /* no way to determine the maximum registration count */ - ugni_module->reg_max = 1200 / nlocal_procs; + if (nlocal_procs) { + ugni_module->reg_max = 1200 / nlocal_procs; + } else { + ugni_module->reg_max = 0; + } #endif } else if (-1 == mca_btl_ugni_component.max_mem_reg) { ugni_module->reg_max = INT_MAX; @@ -557,7 +528,7 @@ static int mca_btl_ugni_smsg_setup (int nprocs) grc = GNI_SmsgBufferSizeNeeded (&tmp_smsg_attrib, &mbox_size); if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) { BTL_ERROR(("error in GNI_SmsgBufferSizeNeeded")); - return opal_common_rc_ugni_to_opal (grc); + return mca_btl_rc_ugni_to_opal (grc); } mca_btl_ugni_component.smsg_mbox_size = OPAL_ALIGN(mbox_size, 64, unsigned int); diff --git a/opal/mca/btl/ugni/btl_ugni_atomic.c b/opal/mca/btl/ugni/btl_ugni_atomic.c index 
3c62670da8..af29670f3b 100644 --- a/opal/mca/btl/ugni/btl_ugni_atomic.c +++ b/opal/mca/btl/ugni/btl_ugni_atomic.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights + * Copyright (c) 2014-2017 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -95,34 +95,23 @@ int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end return OPAL_ERR_NOT_SUPPORTED; } - rc = mca_btl_ugni_check_endpoint_state_rdma (endpoint); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - return rc; - } - - mca_btl_ugni_alloc_post_descriptor (endpoint, NULL, cbfunc, cbcontext, cbdata, &post_desc); + post_desc = mca_btl_ugni_alloc_post_descriptor (endpoint, NULL, cbfunc, cbcontext, cbdata); if (OPAL_UNLIKELY(NULL == post_desc)) { return OPAL_ERR_OUT_OF_RESOURCE; } - init_gni_post_desc (&post_desc->desc, order, GNI_POST_AMO, 0, dummy, remote_address, + init_gni_post_desc (post_desc, order, GNI_POST_AMO, 0, dummy, remote_address, remote_handle->gni_handle, size, 0); - post_desc->desc.base.amo_cmd = gni_op; + post_desc->desc.amo_cmd = gni_op; - post_desc->desc.base.first_operand = operand; + post_desc->desc.first_operand = operand; - OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock); - rc = GNI_PostFma (endpoint->rdma_ep_handle, &post_desc->desc.base); - OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock); - if (GNI_RC_SUCCESS != rc) { - mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc); - if (GNI_RC_ILLEGAL_OP == rc) { - return OPAL_ERR_NOT_SUPPORTED; - } - return OPAL_ERR_OUT_OF_RESOURCE; + rc = mca_btl_ugni_endpoint_post_fma (endpoint, post_desc); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + mca_btl_ugni_return_post_descriptor (post_desc); } - return OPAL_SUCCESS; + return rc; } int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, @@ -147,35 +136,24 @@ int mca_btl_ugni_afop (struct 
mca_btl_base_module_t *btl, struct mca_btl_base_en return OPAL_ERR_NOT_SUPPORTED; } - rc = mca_btl_ugni_check_endpoint_state_rdma (endpoint); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - return rc; - } - - mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata, &post_desc); + post_desc = mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata); if (OPAL_UNLIKELY(NULL == post_desc)) { return OPAL_ERR_OUT_OF_RESOURCE; } - init_gni_post_desc (&post_desc->desc, order, GNI_POST_AMO, (intptr_t) local_address, local_handle->gni_handle, + init_gni_post_desc (post_desc, order, GNI_POST_AMO, (intptr_t) local_address, local_handle->gni_handle, remote_address, remote_handle->gni_handle, size, 0); - post_desc->desc.base.amo_cmd = gni_op; + post_desc->desc.amo_cmd = gni_op; - post_desc->desc.base.first_operand = operand; + post_desc->desc.first_operand = operand; - OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock); - rc = GNI_PostFma (endpoint->rdma_ep_handle, &post_desc->desc.base); - OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock); - if (GNI_RC_SUCCESS != rc) { - mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc); - if (GNI_RC_ILLEGAL_OP == rc) { - return OPAL_ERR_NOT_SUPPORTED; - } - return OPAL_ERR_OUT_OF_RESOURCE; + rc = mca_btl_ugni_endpoint_post_fma (endpoint, post_desc); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + mca_btl_ugni_return_post_descriptor (post_desc); } - return OPAL_SUCCESS; + return rc; } int mca_btl_ugni_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, @@ -190,31 +168,23 @@ int mca_btl_ugni_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_ gni_op = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? GNI_FMA_ATOMIC2_CSWAP_S : GNI_FMA_ATOMIC_CSWAP; size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 
4 : 8; - rc = mca_btl_ugni_check_endpoint_state_rdma (endpoint); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - return rc; - } - - mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata, &post_desc); + post_desc = mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata); if (OPAL_UNLIKELY(NULL == post_desc)) { return OPAL_ERR_OUT_OF_RESOURCE; } - init_gni_post_desc (&post_desc->desc, order, GNI_POST_AMO, (intptr_t) local_address, local_handle->gni_handle, + init_gni_post_desc (post_desc, order, GNI_POST_AMO, (intptr_t) local_address, local_handle->gni_handle, remote_address, remote_handle->gni_handle, size, 0); - post_desc->desc.base.amo_cmd = gni_op; + post_desc->desc.amo_cmd = gni_op; - post_desc->desc.base.first_operand = compare; - post_desc->desc.base.second_operand = value; + post_desc->desc.first_operand = compare; + post_desc->desc.second_operand = value; - OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock); - rc = GNI_PostFma (endpoint->rdma_ep_handle, &post_desc->desc.base); - OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock); - if (GNI_RC_SUCCESS != rc) { - mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc); - return OPAL_ERR_OUT_OF_RESOURCE; + rc = mca_btl_ugni_endpoint_post_fma (endpoint, post_desc); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + mca_btl_ugni_return_post_descriptor (post_desc); } - return OPAL_SUCCESS; + return rc; } diff --git a/opal/mca/btl/ugni/btl_ugni_component.c b/opal/mca/btl/ugni/btl_ugni_component.c index 2bd9bfbb2d..b9fc44c050 100644 --- a/opal/mca/btl/ugni/btl_ugni_component.c +++ b/opal/mca/btl/ugni/btl_ugni_component.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. 
* $COPYRIGHT$ @@ -19,6 +19,7 @@ #include #include +#include #include "opal/memoryhooks/memory.h" #include "opal/runtime/opal_params.h" @@ -56,8 +57,49 @@ mca_base_var_enum_value_t rcache_values[] = { {-1, NULL} /* sentinal */ }; -static int -btl_ugni_component_register(void) +mca_base_var_enum_value_flag_t cdm_flags[] = { + {.flag = GNI_CDM_MODE_FORK_NOCOPY, .string = "fork-no-copy", .conflicting_flag = GNI_CDM_MODE_FORK_FULLCOPY | GNI_CDM_MODE_FORK_PARTCOPY}, + {.flag = GNI_CDM_MODE_FORK_FULLCOPY, .string = "fork-full-copy", .conflicting_flag = GNI_CDM_MODE_FORK_NOCOPY | GNI_CDM_MODE_FORK_PARTCOPY}, + {.flag = GNI_CDM_MODE_FORK_PARTCOPY, .string = "fork-part-copy", .conflicting_flag = GNI_CDM_MODE_FORK_NOCOPY | GNI_CDM_MODE_FORK_FULLCOPY}, + {.flag = GNI_CDM_MODE_ERR_NO_KILL, .string = "err-no-kill", .conflicting_flag = GNI_CDM_MODE_ERR_ALL_KILL}, + {.flag = GNI_CDM_MODE_ERR_ALL_KILL, .string = "err-all-kill", .conflicting_flag = GNI_CDM_MODE_ERR_NO_KILL}, + {.flag = GNI_CDM_MODE_FAST_DATAGRAM_POLL, .string = "fast-datagram-poll", .conflicting_flag = 0}, + {.flag = GNI_CDM_MODE_BTE_SINGLE_CHANNEL, .string = "bte-single-channel", .conflicting_flag = 0}, + {.flag = GNI_CDM_MODE_USE_PCI_IOMMU, .string = "use-pci-iommu", .conflicting_flag = 0}, + {.flag = GNI_CDM_MODE_MDD_DEDICATED, .string = "mdd-dedicated", .conflicting_flag = GNI_CDM_MODE_MDD_SHARED}, + {.flag = GNI_CDM_MODE_MDD_SHARED, .string = "mdd-shared", .conflicting_flag = GNI_CDM_MODE_MDD_DEDICATED}, + {.flag = GNI_CDM_MODE_FMA_DEDICATED, .string = "fma-dedicated", .conflicting_flag = GNI_CDM_MODE_FMA_SHARED}, + {.flag = GNI_CDM_MODE_FMA_SHARED, .string = "fma-shared", .conflicting_flag = GNI_CDM_MODE_FMA_DEDICATED}, + {.flag = GNI_CDM_MODE_CACHED_AMO_ENABLED, .string = "cached-amo-enabled", .conflicting_flag = 0}, + {.flag = GNI_CDM_MODE_CQ_NIC_LOCAL_PLACEMENT, .string = "cq-nic-placement", .conflicting_flag = 0}, + {.flag = GNI_CDM_MODE_FMA_SMALL_WINDOW, .string = "fma-small-window", .conflicting_flag = 
0}, + {.string = NULL} +}; + +static inline int mca_btl_ugni_get_stat (const mca_base_pvar_t *pvar, void *value, void *obj) +{ + gni_statistic_t statistic = (gni_statistic_t) (intptr_t) pvar->ctx; + gni_return_t rc = GNI_RC_SUCCESS; + + for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) { + rc = GNI_GetNicStat (mca_btl_ugni_component.modules[0].devices[i].dev_handle, statistic, + ((unsigned int *) value) + i); + } + + return mca_btl_rc_ugni_to_opal (rc); +} + +static inline int mca_btl_ugni_notify_stat (mca_base_pvar_t *pvar, mca_base_pvar_event_t event, void *obj, int *count) +{ + if (MCA_BASE_PVAR_HANDLE_BIND == event) { + /* one value for each virtual device handle */ + *count = mca_btl_ugni_component.virtual_device_count; + } + + return OPAL_SUCCESS; +} + +static int btl_ugni_component_register(void) { mca_base_var_enum_t *new_enum; gni_nic_device_t device_type; @@ -181,6 +223,31 @@ btl_ugni_component_register(void) MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.mbox_increment); + /* communication domain flags */ + rc = mca_base_var_enum_create_flag ("btl_ugni_cdm_flags", cdm_flags, (mca_base_var_enum_flag_t **) &new_enum); + if (OPAL_SUCCESS != rc) { + return rc; + } + + mca_btl_ugni_component.cdm_flags = GNI_CDM_MODE_FORK_PARTCOPY | GNI_CDM_MODE_ERR_NO_KILL | GNI_CDM_MODE_FAST_DATAGRAM_POLL | + GNI_CDM_MODE_MDD_SHARED | GNI_CDM_MODE_FMA_SHARED | GNI_CDM_MODE_FMA_SMALL_WINDOW; + (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, + "cdm_flags", "Flags to set when creating a communication domain " + " (default: fork-fullcopy,cached-amo-enabled,err-no-kill,fast-datagram-poll," + "fma-shared,fma-small-window)", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, new_enum, 0, + MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.cdm_flags); + OBJ_RELEASE(new_enum); + + mca_btl_ugni_component.virtual_device_count = 0; + (void) 
mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, + "virtual_device_count", "Number of virtual devices to create. Higher numbers may " + "result in better performance when using threads. (default: auto, max: 8)", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, + MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.virtual_device_count); + /* determine if there are get alignment restrictions */ GNI_GetDeviceType (&device_type); @@ -202,12 +269,9 @@ btl_ugni_component_register(void) } (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, - "smsg_page_size", "Page size to use for SMSG " - "mailbox allocation (default: detect)", - MCA_BASE_VAR_TYPE_INT, NULL, 0, - MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_LOCAL, - &mca_btl_ugni_component.smsg_page_size); + "smsg_page_size", "Page size to use for SMSG mailbox allocation (default: detect)", + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.smsg_page_size); mca_btl_ugni_component.progress_thread_requested = 0; (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, @@ -228,6 +292,31 @@ btl_ugni_component_register(void) MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS, NULL, NULL, NULL, &mca_btl_ugni_progress_thread_wakeups); + /* register network statistics as performance variables */ + for (int i = 0 ; i < GNI_NUM_STATS ; ++i) { + char name[128], desc[128]; + size_t str_len = strlen (gni_statistic_str[i]); + + assert (str_len < sizeof (name)); + + /* we can get an all-caps string for the variable from gni_statistic_str. need to make it lowercase + * to match ompi standards */ + for (size_t j = 0 ; j < str_len ; ++j) { + name[j] = tolower (gni_statistic_str[i][j]); + desc[j] = ('_' == name[j]) ? 
' ' : name[j]; + } + + name[str_len] = '\0'; + desc[str_len] = '\0'; + + (void) mca_base_component_pvar_register (&mca_btl_ugni_component.super.btl_version, name, desc, + OPAL_INFO_LVL_4, MCA_BASE_PVAR_CLASS_COUNTER, + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, MCA_BASE_VAR_BIND_NO_OBJECT, + MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS, + mca_btl_ugni_get_stat, NULL, mca_btl_ugni_notify_stat, + (void *) (intptr_t) i); + } + /* btl/ugni can only support only a fixed set of rcache components (these rcache components have compatible resource * structures) */ rc = mca_base_var_enum_create ("btl_ugni_rcache", rcache_values, &new_enum); @@ -235,9 +324,10 @@ btl_ugni_component_register(void) return rc; } - mca_btl_ugni_component.rcache_type = MCA_BTL_UGNI_RCACHE_UDREG; + /* NTH: there are known *serious* performance issues with udreg. if they are ever resolved it is the preferred rcache */ + mca_btl_ugni_component.rcache_type = MCA_BTL_UGNI_RCACHE_GRDMA; (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, - "rcache", "registration cache to use", MCA_BASE_VAR_TYPE_INT, new_enum, + "rcache", "registration cache to use (default: grdma)", MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.rcache_type); OBJ_RELEASE(new_enum); @@ -325,7 +415,7 @@ btl_ugni_component_open(void) static int btl_ugni_component_close(void) { - opal_common_ugni_fini (); + mca_btl_ugni_fini (); if (mca_btl_ugni_component.modules) { free (mca_btl_ugni_component.modules); @@ -342,7 +432,6 @@ mca_btl_ugni_component_init (int *num_btl_modules, { struct mca_btl_base_module_t **base_modules; mca_btl_ugni_module_t *ugni_modules; - unsigned int i; int rc; if (16384 < mca_btl_ugni_component.ugni_smsg_limit) { @@ -360,19 +449,18 @@ mca_btl_ugni_component_init (int *num_btl_modules, } /* Initialize ugni library and create communication domain */ - rc = opal_common_ugni_init(); + rc = 
mca_btl_ugni_init(); if (OPAL_SUCCESS != rc) { return NULL; } - /* Create and initialize one module per uGNI device */ - mca_btl_ugni_component.ugni_num_btls = opal_common_ugni_module.device_count; + /* For now only create a single BTL module */ + mca_btl_ugni_component.ugni_num_btls = 1; BTL_VERBOSE(("btl/ugni initializing")); ugni_modules = mca_btl_ugni_component.modules = (mca_btl_ugni_module_t *) - calloc (mca_btl_ugni_component.ugni_num_btls, - sizeof (mca_btl_ugni_module_t)); + calloc (mca_btl_ugni_component.ugni_num_btls, sizeof (mca_btl_ugni_module_t)); if (OPAL_UNLIKELY(NULL == mca_btl_ugni_component.modules)) { BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__)); @@ -395,20 +483,15 @@ mca_btl_ugni_component_init (int *num_btl_modules, mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = mca_btl_ugni_module.super.btl_eager_limit; - for (i = 0 ; i < mca_btl_ugni_component.ugni_num_btls ; ++i) { - mca_btl_ugni_module_t *ugni_module = ugni_modules + i; - - rc = mca_btl_ugni_module_init (ugni_module, - opal_common_ugni_module.devices + i); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - BTL_ERROR(("Failed to initialize uGNI module @ %s:%d", __FILE__, - __LINE__)); - return NULL; - } - - base_modules[i] = (mca_btl_base_module_t *) ugni_module; + rc = mca_btl_ugni_module_init (ugni_modules); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + BTL_ERROR(("Failed to initialize uGNI module @ %s:%d", __FILE__, + __LINE__)); + return NULL; } + *base_modules = (mca_btl_base_module_t *) ugni_modules; + *num_btl_modules = mca_btl_ugni_component.ugni_num_btls; BTL_VERBOSE(("btl/ugni done initializing %d module(s)", *num_btl_modules)); @@ -417,80 +500,47 @@ mca_btl_ugni_component_init (int *num_btl_modules, } static inline int -mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module) +mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device) { - uint64_t datagram_id, data, proc_id; - uint32_t remote_addr, remote_id; 
mca_btl_base_endpoint_t *ep; - gni_post_state_t post_state; gni_ep_handle_t handle; - gni_return_t grc; int count = 0, rc; - /* check for datagram completion */ - OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); /* TODO: may not need lock for this function */ - grc = GNI_PostDataProbeById (ugni_module->device->dev_handle, &datagram_id); - if (OPAL_LIKELY(GNI_RC_SUCCESS != grc)) { - OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); - return 0; + rc = mca_btl_ugni_get_datagram (ugni_module, device, &handle, &ep); + if (1 != rc) { + return rc; } - data = datagram_id & ~(MCA_BTL_UGNI_DATAGRAM_MASK); - - BTL_VERBOSE(("datgram_id: %" PRIx64 ", mask: %" PRIx64, datagram_id, (uint64_t) (datagram_id & MCA_BTL_UGNI_DATAGRAM_MASK))); - - if ((datagram_id & MCA_BTL_UGNI_DATAGRAM_MASK) == MCA_BTL_UGNI_CONNECT_DIRECTED_ID) { - ep = (mca_btl_base_endpoint_t *) opal_pointer_array_get_item (&ugni_module->endpoints, data); - handle = ep->smsg_ep_handle; - } else { - handle = ugni_module->wildcard_ep; - } - - /* wait for the incoming datagram to complete (in case it isn't) */ - grc = GNI_EpPostDataWaitById (handle, datagram_id, -1, &post_state, - &remote_addr, &remote_id); - OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); - if (GNI_RC_SUCCESS != grc) { - BTL_ERROR(("GNI_EpPostDataWaitById failed with rc = %d", grc)); - return opal_common_rc_ugni_to_opal (grc); - } + BTL_VERBOSE(("remote datagram completion on handle %p", handle)); /* if this is a wildcard endpoint lookup the remote peer by the proc id we received */ if (handle == ugni_module->wildcard_ep) { - proc_id = mca_btl_ugni_proc_name_to_id (ugni_module->wc_remote_attr.proc_name); + struct opal_proc_t *remote_proc = opal_proc_for_name (ugni_module->wc_remote_attr.proc_name); - BTL_VERBOSE(("received connection attempt on wildcard endpoint from proc id: %" PRIx64, - proc_id)); + BTL_VERBOSE(("received connection attempt on wildcard endpoint from proc: %s", + OPAL_NAME_PRINT(ugni_module->wc_remote_attr.proc_name))); - 
OPAL_THREAD_LOCK(&ugni_module->endpoint_lock); - rc = opal_hash_table_get_value_uint64 (&ugni_module->id_to_endpoint, proc_id, (void **) &ep); - OPAL_THREAD_UNLOCK(&ugni_module->endpoint_lock); - - /* check if the endpoint is known */ - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc || NULL == ep)) { - struct opal_proc_t *remote_proc = opal_proc_for_name (ugni_module->wc_remote_attr.proc_name); - BTL_VERBOSE(("Got connection request from an unknown peer {jobid = 0x%x, vid = 0x%x}", - ugni_module->wc_remote_attr.proc_name.jobid, ugni_module->wc_remote_attr.proc_name.vpid)); - ep = mca_btl_ugni_get_ep (&ugni_module->super, remote_proc); - if (OPAL_UNLIKELY(NULL == ep)) { - return rc; - } + ep = mca_btl_ugni_get_ep (&ugni_module->super, remote_proc); + if (OPAL_UNLIKELY(NULL == ep)) { + /* there is no way to recover from this error so just abort() */ + BTL_ERROR(("could not find/allocate a btl endpoint for peer %s", + OPAL_NAME_PRINT(ugni_module->wc_remote_attr.proc_name))); + abort (); + return OPAL_ERR_NOT_FOUND; } - } else { - BTL_VERBOSE(("directed datagram complete for endpoint %p", (void *) ep)); } /* should not have gotten a NULL endpoint */ assert (NULL != ep); - BTL_VERBOSE(("got a datagram completion: id = %" PRIx64 ", state = %d, " - "data = 0x%" PRIx64 ", ep = %p, remote id: %d", datagram_id, post_state, - data, (void *) ep, remote_id)); + BTL_VERBOSE(("got a datagram completion: ep = %p. 
wc = %d", (void *) ep, handle == ugni_module->wildcard_ep)); /* NTH: TODO -- error handling */ opal_mutex_lock (&ep->lock); if (handle != ugni_module->wildcard_ep) { /* directed post complete */ + BTL_VERBOSE(("directed datagram complete for endpoint %p", (void *) ep)); + ep->dg_posted = false; } @@ -514,106 +564,106 @@ mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module) static inline void btl_ugni_dump_post_desc (mca_btl_ugni_post_descriptor_t *desc) { - fprintf (stderr, "desc->desc.base.post_id = %" PRIx64 "\n", desc->desc.base.post_id); - fprintf (stderr, "desc->desc.base.status = %" PRIx64 "\n", desc->desc.base.status); - fprintf (stderr, "desc->desc.base.cq_mode_complete = %hu\n", desc->desc.base.cq_mode_complete); - fprintf (stderr, "desc->desc.base.type = %d\n", desc->desc.base.type); - fprintf (stderr, "desc->desc.base.cq_mode = %hu\n", desc->desc.base.cq_mode); - fprintf (stderr, "desc->desc.base.dlvr_mode = %hu\n", desc->desc.base.dlvr_mode); - fprintf (stderr, "desc->desc.base.local_addr = %" PRIx64 "\n", desc->desc.base.local_addr); - fprintf (stderr, "desc->desc.base.local_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->desc.base.local_mem_hndl.qword1, - desc->desc.base.local_mem_hndl.qword2); - fprintf (stderr, "desc->desc.base.remote_addr = %" PRIx64 "\n", desc->desc.base.remote_addr); - fprintf (stderr, "desc->desc.base.remote_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->desc.base.remote_mem_hndl.qword1, - desc->desc.base.remote_mem_hndl.qword2); - fprintf (stderr, "desc->desc.base.length = %" PRIu64 "\n", desc->desc.base.length); - fprintf (stderr, "desc->desc.base.rdma_mode = %hu\n", desc->desc.base.rdma_mode); - fprintf (stderr, "desc->desc.base.amo_cmd = %d\n", desc->desc.base.amo_cmd); + fprintf (stderr, "desc->desc.post_id = %" PRIx64 "\n", desc->desc.post_id); + fprintf (stderr, "desc->desc.status = %" PRIx64 "\n", desc->desc.status); + fprintf (stderr, "desc->desc.cq_mode_complete = %hu\n", desc->desc.cq_mode_complete); + 
fprintf (stderr, "desc->desc.type = %d\n", desc->desc.type); + fprintf (stderr, "desc->desc.cq_mode = %hu\n", desc->desc.cq_mode); + fprintf (stderr, "desc->desc.dlvr_mode = %hu\n", desc->desc.dlvr_mode); + fprintf (stderr, "desc->desc.local_addr = %" PRIx64 "\n", desc->desc.local_addr); + fprintf (stderr, "desc->desc.local_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->desc.local_mem_hndl.qword1, + desc->desc.local_mem_hndl.qword2); + fprintf (stderr, "desc->desc.remote_addr = %" PRIx64 "\n", desc->desc.remote_addr); + fprintf (stderr, "desc->desc.remote_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->desc.remote_mem_hndl.qword1, + desc->desc.remote_mem_hndl.qword2); + fprintf (stderr, "desc->desc.length = %" PRIu64 "\n", desc->desc.length); + fprintf (stderr, "desc->desc.rdma_mode = %hu\n", desc->desc.rdma_mode); + fprintf (stderr, "desc->desc.amo_cmd = %d\n", desc->desc.amo_cmd); } #endif -static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, int which_cq) -{ - mca_btl_ugni_post_descriptor_t *post_desc = NULL; - gni_cq_entry_t event_data = 0; - gni_post_descriptor_t *desc; - uint32_t recoverable = 1; - gni_return_t grc; - gni_cq_handle_t the_cq; - - the_cq = (which_cq == 0) ? ugni_module->rdma_local_cq : ugni_module->rdma_local_irq_cq; - - OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); - grc = GNI_CqGetEvent (the_cq, &event_data); - if (GNI_RC_NOT_DONE == grc) { - OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); - return 0; - } - - if (OPAL_UNLIKELY((GNI_RC_SUCCESS != grc && !event_data) || GNI_CQ_OVERRUN(event_data))) { - /* TODO -- need to handle overrun -- how do we do this without an event? - will the event eventually come back? Ask Cray */ - BTL_ERROR(("unhandled post error! 
ugni rc = %d %s", grc, gni_err_str[grc])); - OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); - - return opal_common_rc_ugni_to_opal (grc); - } - - grc = GNI_GetCompleted (the_cq, event_data, &desc); - OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); - if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc && GNI_RC_TRANSACTION_ERROR != grc)) { - BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[grc])); - return opal_common_rc_ugni_to_opal (grc); - } - - post_desc = MCA_BTL_UGNI_DESC_TO_PDESC(desc); - - if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc || !GNI_CQ_STATUS_OK(event_data))) { - (void) GNI_CqErrorRecoverable (event_data, &recoverable); - - if (OPAL_UNLIKELY(++post_desc->desc.tries >= mca_btl_ugni_component.rdma_max_retries || - !recoverable)) { - char char_buffer[1024]; - GNI_CqErrorStr (event_data, char_buffer, 1024); - /* give up */ - BTL_ERROR(("giving up on desciptor %p, recoverable %d: %s", (void *) post_desc, - recoverable, char_buffer)); -#if OPAL_ENABLE_DEBUG - btl_ugni_dump_post_desc (post_desc); -#endif - mca_btl_ugni_post_desc_complete (ugni_module, post_desc, OPAL_ERROR); - - return OPAL_ERROR; - } - - mca_btl_ugni_repost (ugni_module, post_desc); - - return 0; - } - - mca_btl_ugni_post_desc_complete (ugni_module, post_desc, opal_common_rc_ugni_to_opal (grc)); - - return 1; -} - static inline int -mca_btl_ugni_post_pending (mca_btl_ugni_module_t *ugni_module) +mca_btl_ugni_post_pending (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device) { - int count = opal_list_get_size (&ugni_module->pending_descriptors); - int i; + int pending_post_count = opal_list_get_size (&device->pending_post); + mca_btl_ugni_post_descriptor_t *post_desc; + int rc; - for (i = 0 ; i < count ; ++i) { - OPAL_THREAD_LOCK(&ugni_module->pending_descriptors_lock); - mca_btl_ugni_post_descriptor_t *post_desc = - (mca_btl_ugni_post_descriptor_t *) opal_list_remove_first (&ugni_module->pending_descriptors); - OPAL_THREAD_UNLOCK(&ugni_module->pending_descriptors_lock); + /* 
check if there are any posts pending resources */ + if (OPAL_LIKELY(0 == pending_post_count)) { + return 0; + } - if (OPAL_SUCCESS != mca_btl_ugni_repost (ugni_module, post_desc)) { + BTL_VERBOSE(("progressing %d pending FMA/RDMA operations", pending_post_count)); + for (int i = 0 ; i < pending_post_count ; ++i) { + mca_btl_ugni_device_lock (device); + post_desc = (mca_btl_ugni_post_descriptor_t *) opal_list_remove_first (&device->pending_post); + mca_btl_ugni_device_unlock (device); + if (NULL == post_desc) { + break; + } + rc = mca_btl_ugni_repost (ugni_module, post_desc); + if (OPAL_SUCCESS != rc) { + mca_btl_ugni_device_lock (device); + opal_list_prepend (&device->pending_post, (opal_list_item_t *) post_desc); + mca_btl_ugni_device_unlock (device); break; } } - return i; + return 1; +} + +static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device, + mca_btl_ugni_cq_t *cq) +{ + mca_btl_ugni_post_descriptor_t *post_desc[MCA_BTL_UGNI_COMPLETIONS_PER_LOOP]; + gni_cq_entry_t event_data[MCA_BTL_UGNI_COMPLETIONS_PER_LOOP]; + int rc; + + rc = mca_btl_ugni_cq_get_completed_desc (device, cq, event_data, post_desc, MCA_BTL_UGNI_COMPLETIONS_PER_LOOP); + if (0 >= rc) { + return rc; + } + + BTL_VERBOSE(("got %d completed rdma descriptors", rc)); + + for (int i = 0 ; i < rc ; ++i) { + BTL_VERBOSE(("post descriptor %p complete. 
GNI_CQ_STATUS_OK(): %d", post_desc[i], + GNI_CQ_STATUS_OK(event_data[i]))); + + if (OPAL_UNLIKELY(!GNI_CQ_STATUS_OK(event_data[i]))) { + uint32_t recoverable = 1; + + (void) GNI_CqErrorRecoverable (event_data[i], &recoverable); + + if (OPAL_UNLIKELY(++post_desc[i]->tries >= mca_btl_ugni_component.rdma_max_retries || + !recoverable)) { + char char_buffer[1024]; + GNI_CqErrorStr (event_data[i], char_buffer, 1024); + /* give up */ + BTL_ERROR(("giving up on desciptor %p, recoverable %d: %s", (void *) post_desc[i], + recoverable, char_buffer)); +#if OPAL_ENABLE_DEBUG + btl_ugni_dump_post_desc (post_desc[i]); +#endif + mca_btl_ugni_post_desc_complete (ugni_module, post_desc[i], OPAL_ERROR); + + return OPAL_ERROR; + } + + mca_btl_ugni_repost (ugni_module, post_desc[i]); + + return 0; + } + + mca_btl_ugni_post_desc_complete (ugni_module, post_desc[i], OPAL_SUCCESS); + } + + /* should be resources to progress the pending post list */ + (void) mca_btl_ugni_post_pending (ugni_module, device); + + return rc; } static inline int @@ -627,9 +677,14 @@ mca_btl_ugni_progress_wait_list (mca_btl_ugni_module_t *ugni_module) return 0; } + /* check the count before taking the lock to avoid unnecessary locking */ + count = opal_list_get_size(&ugni_module->ep_wait_list); + if (0 == count) { + return 0; + } + OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock); count = opal_list_get_size(&ugni_module->ep_wait_list); - do { endpoint = (mca_btl_base_endpoint_t *) opal_list_remove_first (&ugni_module->ep_wait_list); if (endpoint != NULL) { @@ -649,35 +704,34 @@ mca_btl_ugni_progress_wait_list (mca_btl_ugni_module_t *ugni_module) static int mca_btl_ugni_component_progress (void) { - mca_btl_ugni_module_t *ugni_module; - static int64_t call_count = 0; - int64_t cur_call_count = OPAL_THREAD_ADD64(&call_count, 1); - unsigned int i; + mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_component.modules; + static volatile int32_t call_count = 0; + int32_t current_call; int count = 0; - for (i = 0 ; i 
< mca_btl_ugni_component.ugni_num_btls ; ++i) { - ugni_module = mca_btl_ugni_component.modules + i; + current_call = OPAL_THREAD_ADD32(&call_count, 1); - if ((cur_call_count & 0x7) == 0) { - count += mca_btl_ugni_progress_datagram (ugni_module); - } + count += mca_btl_ugni_progress_remote_smsg (ugni_module); - if (ugni_module->connected_peer_count) { + if ((current_call & 0x7) == 0) { + count += mca_btl_ugni_progress_datagram (ugni_module, ugni_module->devices); + } + + for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) { + mca_btl_ugni_device_t *device = ugni_module->devices + i; + + if (device->smsg_connections) { + count += mca_btl_ugni_progress_local_smsg (ugni_module, device); mca_btl_ugni_progress_wait_list (ugni_module); - count += mca_btl_ugni_progress_local_smsg (ugni_module); - count += mca_btl_ugni_progress_remote_smsg (ugni_module); } - if (ugni_module->active_rdma_count) { - count += mca_btl_ugni_progress_rdma (ugni_module, 0); + if (device->dev_rdma_local_cq.active_operations) { + count += mca_btl_ugni_progress_rdma (ugni_module, device, &device->dev_rdma_local_cq); } - if (mca_btl_ugni_component.progress_thread_enabled) { - count += mca_btl_ugni_progress_rdma (ugni_module, 1); + if (mca_btl_ugni_component.progress_thread_enabled && device->dev_rdma_local_irq_cq.active_operations) { + count += mca_btl_ugni_progress_rdma (ugni_module, device, &device->dev_rdma_local_irq_cq); } - - /* post pending after progressing rdma */ - mca_btl_ugni_post_pending (ugni_module); } return count; diff --git a/opal/mca/btl/ugni/btl_ugni_device.h b/opal/mca/btl/ugni/btl_ugni_device.h new file mode 100644 index 0000000000..18a3b46416 --- /dev/null +++ b/opal/mca/btl/ugni/btl_ugni_device.h @@ -0,0 +1,430 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. 
+ * Copyright (c) 2014 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file This file contains wrappers for uGNI functionality. These wrappers are thread-safe + * and intended to provide a way to measure various different ways to handle mutual exclusion + * into the uGNI library (which is not thread safe). These functions are all defined to be + * inline to limit the cost to non-threaded users. + */ + +#if !defined(BTL_UGNI_DEVICE_H) +#define BTL_UGNI_DEVICE_H + +#include "btl_ugni_endpoint.h" +#include "btl_ugni_frag.h" + +/* helper functions */ + +typedef struct mca_btl_ugni_smsg_send_wtag_arg_t { + gni_ep_handle_t ep_handle; + void *hdr; + size_t hdr_len; + void *payload; + size_t payload_len; + uint32_t msg_id; + int tag; +} mca_btl_ugni_smsg_send_wtag_arg_t; + +static inline int mca_btl_ugni_smsg_send_wtag_device (mca_btl_ugni_device_t *device, void *arg) +{ + mca_btl_ugni_smsg_send_wtag_arg_t *args = (mca_btl_ugni_smsg_send_wtag_arg_t *) arg; + gni_return_t grc; + + grc = GNI_SmsgSendWTag (args->ep_handle, args->hdr, args->hdr_len, args->payload, + args->payload_len, args->msg_id, args->tag); + device->dev_smsg_local_cq.active_operations += (GNI_RC_SUCCESS == grc); + return grc; +} + +typedef struct mca_btl_ugni_smsg_get_next_wtag_arg_t { + gni_ep_handle_t ep_handle; + uintptr_t *data_ptr; + uint8_t *tag; +} mca_btl_ugni_smsg_get_next_wtag_arg_t; + +static inline intptr_t mca_btl_ugni_smsg_get_next_wtag_device (mca_btl_ugni_device_t *device, void *arg) +{ + mca_btl_ugni_smsg_get_next_wtag_arg_t *args = (mca_btl_ugni_smsg_get_next_wtag_arg_t *) arg; + return GNI_SmsgGetNextWTag(args->ep_handle, (void **) args->data_ptr, args->tag); +} + +static inline intptr_t mca_btl_ugni_smsg_release_device (mca_btl_ugni_device_t *device, void *arg) +{ + mca_btl_ugni_endpoint_handle_t *ep_handle = (mca_btl_ugni_endpoint_handle_t *) arg; + + 
return GNI_SmsgRelease (ep_handle->gni_handle); +} + +static inline intptr_t mca_btl_ugni_cq_clear_device (mca_btl_ugni_device_t *device, void *arg) +{ + gni_cq_handle_t cq = (gni_cq_handle_t) (intptr_t) arg; + gni_cq_entry_t event_data; + int rc; + + do { + rc = GNI_CqGetEvent (cq, &event_data); + } while (GNI_RC_NOT_DONE != rc); + + return OPAL_SUCCESS; +} + +typedef struct mca_btl_ugni_cq_get_event_args_t { + mca_btl_ugni_cq_t *cq; + gni_cq_entry_t *event_data; +} mca_btl_ugni_cq_get_event_args_t; + +static inline intptr_t mca_btl_ugni_cq_get_event_device (mca_btl_ugni_device_t *device, void *arg) +{ + mca_btl_ugni_cq_get_event_args_t *args = (mca_btl_ugni_cq_get_event_args_t *) arg; + gni_return_t rc; + + rc = GNI_CqGetEvent (args->cq->gni_handle, args->event_data); + args->cq->active_operations -= GNI_RC_NOT_DONE != rc; + return rc; +} + +typedef struct mca_btl_ugni_gni_cq_get_event_args_t { + gni_cq_handle_t cq; + gni_cq_entry_t *event_data; +} mca_btl_ugni_gni_cq_get_event_args_t; + +static inline intptr_t mca_btl_ugni_gni_cq_get_event_device (mca_btl_ugni_device_t *device, void *arg) +{ + mca_btl_ugni_gni_cq_get_event_args_t *args = (mca_btl_ugni_gni_cq_get_event_args_t *) arg; + + return GNI_CqGetEvent (args->cq, args->event_data); +} + +static inline intptr_t mca_btl_ugni_post_fma_device (mca_btl_ugni_device_t *device, void *arg) +{ + mca_btl_ugni_post_descriptor_t *desc = (mca_btl_ugni_post_descriptor_t *) arg; + bool ep_handle_allocated = false; + int rc; + + if (NULL == desc->ep_handle) { + desc->ep_handle = mca_btl_ugni_ep_get_rdma (desc->endpoint, device); + if (OPAL_UNLIKELY(NULL == desc->ep_handle)) { + return OPAL_ERR_TEMP_OUT_OF_RESOURCE; + } + ep_handle_allocated = true; + } + + BTL_VERBOSE(("Posting FMA descriptor %p with op_type %d, amo %d, ep_handle %p, remote_addr 0x%lx, " + "length %lu", desc, desc->desc.type, desc->desc.amo_cmd, desc->ep_handle, + desc->desc.remote_addr, desc->desc.length)); + + rc = GNI_PostFma 
(desc->ep_handle->gni_handle, &desc->desc); + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { + if (ep_handle_allocated) { + /* only return the endpoint handle if we allocated it. if we didn't allocate the + * handle this call was likely made from repost() */ + mca_btl_ugni_ep_return_rdma (desc->ep_handle); + desc->ep_handle = NULL; + } + } else { + ++device->dev_rdma_local_cq.active_operations; + } + + return mca_btl_rc_ugni_to_opal (rc); +} + +static inline intptr_t mca_btl_ugni_post_rdma_device (mca_btl_ugni_device_t *device, void *arg) +{ + mca_btl_ugni_post_descriptor_t *desc = (mca_btl_ugni_post_descriptor_t *) arg; + bool ep_handle_allocated = false; + int rc; + + if (NULL == desc->ep_handle) { + desc->ep_handle = mca_btl_ugni_ep_get_rdma (desc->endpoint, device); + if (OPAL_UNLIKELY(NULL == desc->ep_handle)) { + return OPAL_ERR_TEMP_OUT_OF_RESOURCE; + } + ep_handle_allocated = true; + } + + /* pick the appropriate CQ */ + desc->cq = mca_btl_ugni_component.progress_thread_enabled ? &device->dev_rdma_local_irq_cq : + &device->dev_rdma_local_cq; + + desc->desc.src_cq_hndl = desc->cq->gni_handle; + + BTL_VERBOSE(("Posting RDMA descriptor %p with op_type %d, ep_handle %p, remote_addr 0x%lx, " + "length %lu", desc, desc->desc.type, desc->ep_handle, desc->desc.remote_addr, + desc->desc.length)); + + rc = GNI_PostRdma (desc->ep_handle->gni_handle, &desc->desc); + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { + if (ep_handle_allocated) { + /* only return the endpoint handle if we allocated it. 
if we didn't allocate the + * handle this call was likely made from repost() */ + mca_btl_ugni_ep_return_rdma (desc->ep_handle); + desc->ep_handle = NULL; + } + } else { + ++desc->cq->active_operations; + } + + return mca_btl_rc_ugni_to_opal (rc); +} + +static inline intptr_t mca_btl_ugni_post_cqwrite_device (mca_btl_ugni_device_t *device, void *arg) +{ + mca_btl_ugni_post_descriptor_t *desc = (mca_btl_ugni_post_descriptor_t *) arg; + int rc; + + desc->ep_handle = mca_btl_ugni_ep_get_rdma (desc->endpoint, device); + if (OPAL_UNLIKELY(NULL == desc->ep_handle)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + desc->desc.src_cq_hndl = device->dev_rdma_local_cq.gni_handle; + + rc = GNI_PostCqWrite (desc->ep_handle->gni_handle, &desc->desc); + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { + mca_btl_ugni_ep_return_rdma (desc->ep_handle); + desc->ep_handle = NULL; + } + + return mca_btl_rc_ugni_to_opal (rc); +} + +typedef struct mca_btl_ugni_cq_get_completed_desc_arg_t { + mca_btl_ugni_cq_t *cq; + gni_cq_entry_t *event_data; + mca_btl_ugni_post_descriptor_t **post_desc; + int count; +} mca_btl_ugni_cq_get_completed_desc_arg_t; + +static inline intptr_t mca_btl_ugni_cq_get_completed_desc_device (mca_btl_ugni_device_t *device, void *arg0) +{ + mca_btl_ugni_cq_get_completed_desc_arg_t *args = (mca_btl_ugni_cq_get_completed_desc_arg_t *) arg0; + mca_btl_ugni_cq_t *cq = args->cq; + gni_post_descriptor_t *desc; + int rc; + + for (int i = 0 ; i < args->count ; ++i) { + rc = GNI_CqGetEvent (cq->gni_handle, args->event_data + i); + if (GNI_RC_NOT_DONE == rc) { + return i; + } + + if (OPAL_UNLIKELY((GNI_RC_SUCCESS != rc && !args->event_data[i]) || GNI_CQ_OVERRUN(args->event_data[i]))) { + /* TODO -- need to handle overrun -- how do we do this without an event? + will the event eventually come back? Ask Cray */ + BTL_ERROR(("unhandled post error! 
ugni rc = %d %s", rc, gni_err_str[rc])); + + return mca_btl_rc_ugni_to_opal (rc); + } + + rc = GNI_GetCompleted (cq->gni_handle, args->event_data[i], &desc); + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc && GNI_RC_TRANSACTION_ERROR != rc)) { + BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[rc])); + return mca_btl_rc_ugni_to_opal (rc); + } + + args->post_desc[i] = MCA_BTL_UGNI_DESC_TO_PDESC(desc); + /* return the endpoint handle while we have the lock. see the explanation in + * the documentation for mca_btl_ugni_ep_return_rdma() */ + if (OPAL_LIKELY(GNI_CQ_STATUS_OK(args->event_data[i]))) { + /* the operation completed successfully. return the endpoint handle now. otherwise + * we may still need the endpoint handle to start the repost(). */ + mca_btl_ugni_ep_return_rdma (args->post_desc[i]->ep_handle); + args->post_desc[i]->ep_handle = NULL; + } + --cq->active_operations; + } + + return args->count; +} + +typedef struct mca_btl_ugni_get_datagram_args_t { + mca_btl_ugni_module_t *ugni_module; + gni_ep_handle_t *handle; + mca_btl_base_endpoint_t **ep; +} mca_btl_ugni_get_datagram_args_t; + +static inline intptr_t mca_btl_ugni_get_datagram_device (mca_btl_ugni_device_t *device, void *arg0) +{ + mca_btl_ugni_get_datagram_args_t *args = (mca_btl_ugni_get_datagram_args_t *) arg0; + uint32_t remote_addr, remote_id; + uint64_t datagram_id; + gni_post_state_t post_state; + gni_return_t grc; + uint64_t data; + + grc = GNI_PostDataProbeById (device->dev_handle, &datagram_id); + if (OPAL_LIKELY(GNI_RC_SUCCESS != grc)) { + return 0; + } + + data = datagram_id & ~(MCA_BTL_UGNI_DATAGRAM_MASK); + + BTL_VERBOSE(("rc: %d, datgram_id: %" PRIx64 ", mask: %" PRIx64, grc, datagram_id, (uint64_t) (datagram_id & MCA_BTL_UGNI_DATAGRAM_MASK))); + + if ((datagram_id & MCA_BTL_UGNI_DATAGRAM_MASK) == MCA_BTL_UGNI_CONNECT_DIRECTED_ID) { + *(args->ep) = (mca_btl_base_endpoint_t *) opal_pointer_array_get_item (&args->ugni_module->endpoints, data); + *(args->handle) = 
(*args->ep)->smsg_ep_handle->gni_handle; + } else { + *(args->handle) = args->ugni_module->wildcard_ep; + } + + /* wait for the incoming datagram to complete (in case it isn't) */ + grc = GNI_EpPostDataWaitById (*args->handle, datagram_id, -1, &post_state, + &remote_addr, &remote_id); + if (GNI_RC_SUCCESS != grc) { + BTL_ERROR(("GNI_EpPostDataWaitById failed with rc = %d", grc)); + return mca_btl_rc_ugni_to_opal (grc); + } + + BTL_VERBOSE(("handled datagram completion. post_state: %d, remote_addr: %u, remote_id: %u, directed?: %d", + post_state, remote_addr, remote_id, (datagram_id & MCA_BTL_UGNI_DATAGRAM_MASK) == MCA_BTL_UGNI_CONNECT_DIRECTED_ID)); + + return 1; +} + +typedef struct mca_btl_ugni_reg_mem_args_t { + mca_btl_ugni_module_t *ugni_module; + void *base; + size_t size; + mca_btl_ugni_reg_t *ugni_reg; + gni_cq_handle_t cq; + int flags; +} mca_btl_ugni_reg_mem_args_t; + +static intptr_t mca_btl_ugni_reg_mem_device (mca_btl_ugni_device_t *device, void *arg) +{ + mca_btl_ugni_reg_mem_args_t *args = (mca_btl_ugni_reg_mem_args_t *) arg; + gni_return_t rc; + + rc = GNI_MemRegister (device->dev_handle, (uint64_t) args->base, args->size, args->cq, + args->flags, -1, &args->ugni_reg->handle.gni_handle); + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + return OPAL_SUCCESS; +} + +typedef struct mca_btl_ugni_dereg_mem_arg_t { + mca_btl_ugni_module_t *ugni_module; + mca_btl_ugni_reg_t *ugni_reg; +} mca_btl_ugni_dereg_mem_arg_t; + +static intptr_t mca_btl_ugni_dereg_mem_device (mca_btl_ugni_device_t *device, void *arg) +{ + mca_btl_ugni_dereg_mem_arg_t *args = (mca_btl_ugni_dereg_mem_arg_t *) arg; + gni_return_t rc; + + rc = GNI_MemDeregister (device->dev_handle, &args->ugni_reg->handle.gni_handle); + return mca_btl_rc_ugni_to_opal (rc); +} + +/* multi-thread safe interface to uGNI */ + +static inline int mca_btl_ugni_endpoint_smsg_send_wtag (mca_btl_base_endpoint_t *endpoint, void *hdr, size_t hdr_len, + void *payload, size_t 
payload_len, uint32_t msg_id, int tag) +{ + mca_btl_ugni_smsg_send_wtag_arg_t args = {.ep_handle = endpoint->smsg_ep_handle->gni_handle, + .hdr = hdr, .hdr_len = hdr_len, .payload = payload, + .payload_len = payload_len, .msg_id = msg_id, + .tag = tag}; + mca_btl_ugni_device_t *device = endpoint->smsg_ep_handle->device; + return (int) mca_btl_ugni_device_serialize (device, (mca_btl_ugni_device_serialize_fn_t) mca_btl_ugni_smsg_send_wtag_device, &args); +} + +static inline int mca_btl_ugni_smsg_get_next_wtag (mca_btl_ugni_endpoint_handle_t *ep_handle, uintptr_t *data_ptr, uint8_t *tag) +{ + mca_btl_ugni_device_t *device = ep_handle->device; + mca_btl_ugni_smsg_get_next_wtag_arg_t args = {.ep_handle = ep_handle->gni_handle, .data_ptr = data_ptr, .tag = tag}; + + return (int) mca_btl_ugni_device_serialize (device, (mca_btl_ugni_device_serialize_fn_t) mca_btl_ugni_smsg_get_next_wtag_device, &args); +} + +static inline int mca_btl_ugni_smsg_release (mca_btl_ugni_endpoint_handle_t *ep_handle) +{ + mca_btl_ugni_device_t *device = ep_handle->device; + + return (int) mca_btl_ugni_device_serialize (device, (mca_btl_ugni_device_serialize_fn_t) mca_btl_ugni_smsg_release_device, ep_handle); +} + +static inline void mca_btl_ugni_cq_clear (mca_btl_ugni_device_t *device, gni_cq_handle_t cq) +{ + (void) mca_btl_ugni_device_serialize (device, (mca_btl_ugni_device_serialize_fn_t) mca_btl_ugni_cq_clear_device, (void *) (intptr_t) cq); +} + +static inline int mca_btl_ugni_cq_get_event (mca_btl_ugni_device_t *device, mca_btl_ugni_cq_t *cq, gni_cq_entry_t *event_data) +{ + mca_btl_ugni_cq_get_event_args_t args = {.cq = cq, .event_data = event_data}; + return (int) mca_btl_ugni_device_serialize (device, (mca_btl_ugni_device_serialize_fn_t) mca_btl_ugni_cq_get_event_device, &args); +} + +static inline int mca_btl_ugni_gni_cq_get_event (mca_btl_ugni_device_t *device, gni_cq_handle_t cq, gni_cq_entry_t *event_data) +{ + mca_btl_ugni_gni_cq_get_event_args_t args = {.cq = cq, .event_data = 
event_data}; + return (int) mca_btl_ugni_device_serialize (device, (mca_btl_ugni_device_serialize_fn_t) mca_btl_ugni_gni_cq_get_event_device, &args); +} + +static inline int mca_btl_ugni_endpoint_post_fma (mca_btl_ugni_endpoint_t *endpoint, mca_btl_ugni_post_descriptor_t *desc) +{ + mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (endpoint); + mca_btl_ugni_device_t *device = desc->ep_handle ? desc->ep_handle->device : mca_btl_ugni_ep_get_device (ugni_module); + return (int) mca_btl_ugni_device_serialize (device, (mca_btl_ugni_device_serialize_fn_t) mca_btl_ugni_post_fma_device, desc); +} + +static inline int mca_btl_ugni_endpoint_post_rdma (mca_btl_ugni_endpoint_t *endpoint, mca_btl_ugni_post_descriptor_t *desc) +{ + mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (endpoint); + mca_btl_ugni_device_t *device = desc->ep_handle ? desc->ep_handle->device : mca_btl_ugni_ep_get_device (ugni_module); + return (int) mca_btl_ugni_device_serialize (device, (mca_btl_ugni_device_serialize_fn_t) mca_btl_ugni_post_rdma_device, desc); +} + +static inline int mca_btl_ugni_endpoint_post_cqwrite (mca_btl_ugni_endpoint_t *endpoint, mca_btl_ugni_post_descriptor_t *desc) +{ + mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (endpoint); + mca_btl_ugni_device_t *device = ugni_module->devices; + return (int) mca_btl_ugni_device_serialize (device, (mca_btl_ugni_device_serialize_fn_t) mca_btl_ugni_post_cqwrite_device, desc); +} + +static inline int mca_btl_ugni_cq_get_completed_desc (mca_btl_ugni_device_t *device, mca_btl_ugni_cq_t *cq, + gni_cq_entry_t *event_data, mca_btl_ugni_post_descriptor_t **post_desc, + int count) +{ + mca_btl_ugni_cq_get_completed_desc_arg_t args = {.cq = cq, .event_data = event_data, .post_desc = post_desc, .count = count}; + return (int) mca_btl_ugni_device_serialize (device, (mca_btl_ugni_device_serialize_fn_t) mca_btl_ugni_cq_get_completed_desc_device, &args); +} + +static inline int mca_btl_ugni_get_datagram (mca_btl_ugni_module_t 
*ugni_module, mca_btl_ugni_device_t *device, gni_ep_handle_t *gni_handle, + mca_btl_base_endpoint_t **ep) +{ + mca_btl_ugni_get_datagram_args_t args = {.ugni_module = ugni_module, .ep = ep, .handle = gni_handle}; + return (int) mca_btl_ugni_device_serialize (device, (mca_btl_ugni_device_serialize_fn_t) mca_btl_ugni_get_datagram_device, &args); +} + +static inline int mca_btl_ugni_reg_mem (mca_btl_ugni_module_t *ugni_module, void *base, size_t size, mca_btl_ugni_reg_t *ugni_reg, + gni_cq_handle_t cq, int flags) +{ + mca_btl_ugni_reg_mem_args_t args = {.ugni_module = ugni_module, .base = base, .size = size, + .ugni_reg = ugni_reg, .cq = cq, .flags = flags}; + mca_btl_ugni_device_t *device = ugni_module->devices; + return (int) mca_btl_ugni_device_serialize (device, (mca_btl_ugni_device_serialize_fn_t) mca_btl_ugni_reg_mem_device, &args); +} + +static inline int mca_btl_ugni_dereg_mem (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_reg_t *ugni_reg) +{ + mca_btl_ugni_dereg_mem_arg_t args = {.ugni_module = ugni_module, .ugni_reg = ugni_reg}; + mca_btl_ugni_device_t *device = ugni_module->devices; + return (int) mca_btl_ugni_device_serialize (device, (mca_btl_ugni_device_serialize_fn_t) mca_btl_ugni_dereg_mem_device, &args); +} + +#endif /* BTL_UGNI_DEVICE_H */ diff --git a/opal/mca/btl/ugni/btl_ugni_endpoint.c b/opal/mca/btl/ugni/btl_ugni_endpoint.c index 730df99c3b..e8224af346 100644 --- a/opal/mca/btl/ugni/btl_ugni_endpoint.c +++ b/opal/mca/btl/ugni/btl_ugni_endpoint.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011-2013 UT-Battelle, LLC. All rights reserved. 
* $COPYRIGHT$ @@ -12,6 +12,7 @@ #include "btl_ugni_endpoint.h" #include "btl_ugni_smsg.h" +#include "opal/mca/pmix/pmix.h" static void mca_btl_ugni_ep_construct (mca_btl_base_endpoint_t *ep) { @@ -24,15 +25,94 @@ static void mca_btl_ugni_ep_destruct (mca_btl_base_endpoint_t *ep) { OBJ_DESTRUCT(&ep->frag_wait_list); OBJ_DESTRUCT(&ep->lock); + free (ep->remote_attr); } OBJ_CLASS_INSTANCE(mca_btl_ugni_endpoint_t, opal_list_item_t, mca_btl_ugni_ep_construct, mca_btl_ugni_ep_destruct); +static int mca_btl_ugni_endpoint_get_modex (mca_btl_base_endpoint_t *ep) +{ + mca_btl_ugni_modex_t *modex; + size_t msg_size; + int rc; + + assert (NULL != ep && NULL != ep->peer_proc); + + /* Receive the modex */ + OPAL_MODEX_RECV(rc, &mca_btl_ugni_component.super.btl_version, + &ep->peer_proc->proc_name, (void **)&modex, &msg_size); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + BTL_ERROR(("error receiving modex")); + return rc; + } + + ep->ep_rem_addr = modex->addr; + ep->ep_rem_id = modex->id; + + + BTL_VERBOSE(("received modex for ep %p. addr: %d, id: %d", ep, ep->ep_rem_addr, ep->ep_rem_id)); + + free (modex); + + return OPAL_SUCCESS; +} + +int mca_btl_ugni_init_ep (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_endpoint_t **ep, + mca_btl_ugni_module_t *btl, opal_proc_t *peer_proc) +{ + mca_btl_ugni_endpoint_t *endpoint; + int rc; + + endpoint = OBJ_NEW(mca_btl_ugni_endpoint_t); + assert (endpoint != NULL); + + endpoint->smsg_progressing = 0; + endpoint->state = MCA_BTL_UGNI_EP_STATE_INIT; + endpoint->peer_proc = peer_proc; + + /* get the modex info for this endpoint and setup a ugni endpoint. this call may lead + * to re-entry through opal_progress(). 
*/ + rc = mca_btl_ugni_endpoint_get_modex (endpoint); + if (OPAL_SUCCESS != rc) { + assert (0); + return rc; + } + + /* add this endpoint to the pointer array */ + endpoint->index = opal_pointer_array_add (&ugni_module->endpoints, endpoint); + + *ep = endpoint; + + return OPAL_SUCCESS; +} + +void mca_btl_ugni_release_ep (mca_btl_ugni_endpoint_t *ep) +{ + mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (ep); + int rc; + + opal_mutex_lock (&ep->lock); + + rc = mca_btl_ugni_ep_disconnect (ep, false); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + BTL_VERBOSE(("btl/ugni error disconnecting endpoint")); + } + + /* TODO -- Clear space at the end of the endpoint array */ + opal_pointer_array_set_item (&ugni_module->endpoints, ep->index, NULL); + + opal_mutex_unlock (&ep->lock); + + OBJ_RELEASE(ep); +} + static inline int mca_btl_ugni_ep_smsg_get_mbox (mca_btl_base_endpoint_t *ep) { - mca_btl_ugni_module_t *ugni_module = ep->btl; + mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (ep); opal_free_list_item_t *mbox; + assert (NULL == ep->mailbox); + mbox = opal_free_list_get (&ugni_module->smsg_mboxes); if (OPAL_UNLIKELY(NULL == mbox)) { return OPAL_ERR_OUT_OF_RESOURCE; @@ -47,61 +127,103 @@ static inline int mca_btl_ugni_ep_smsg_get_mbox (mca_btl_base_endpoint_t *ep) { return OPAL_SUCCESS; } -int mca_btl_ugni_ep_disconnect (mca_btl_base_endpoint_t *ep, bool send_disconnect) { - gni_return_t rc; +static int mca_btl_ugni_ep_send_disconnect (mca_btl_base_endpoint_t *ep) +{ + int rc; + + do { + rc = mca_btl_ugni_endpoint_smsg_send_wtag (ep, NULL, 0, NULL, 0, -1, MCA_BTL_UGNI_TAG_DISCONNECT); + if (OPAL_LIKELY(GNI_RC_NOT_DONE != rc)) { + break; + } + + /* most likely got here because we are out of credits. 
check the remote CQ to get credit return */ + (void) mca_btl_ugni_progress_remote_smsg (mca_btl_ugni_ep_btl (ep)); + } while (1); + + return mca_btl_rc_ugni_to_opal (rc); +} + +int mca_btl_ugni_ep_disconnect (mca_btl_base_endpoint_t *ep, bool send_disconnect) +{ + mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (ep); + mca_btl_ugni_device_t *device; + int rc; if (MCA_BTL_UGNI_EP_STATE_INIT == ep->state) { /* nothing to do */ return OPAL_SUCCESS; } - if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state && send_disconnect) { - OPAL_THREAD_LOCK(&ep->common->dev->dev_lock); - rc = GNI_SmsgSendWTag (ep->smsg_ep_handle, NULL, 0, NULL, 0, -1, - MCA_BTL_UGNI_TAG_DISCONNECT); - OPAL_THREAD_UNLOCK(&ep->common->dev->dev_lock); - if (GNI_RC_SUCCESS != rc) { - BTL_VERBOSE(("btl/ugni could not send close message")); + device = ep->smsg_ep_handle->device; + + while (device->dev_smsg_local_cq.active_operations) { + /* ensure all sends are complete before removing any procs */ + rc = mca_btl_ugni_progress_local_smsg (ugni_module, device); + if (OPAL_SUCCESS != rc) { + break; } - - /* we might want to wait for local completion here (do we even care), yes we do */ - /* TODO: FIX FIX FIX */ - } - /* TODO: FIX GROSS */ - OPAL_THREAD_LOCK(&ep->common->dev->dev_lock); - (void) opal_common_ugni_ep_destroy (&ep->smsg_ep_handle); - (void) opal_common_ugni_ep_destroy (&ep->rdma_ep_handle); - OPAL_THREAD_UNLOCK(&ep->common->dev->dev_lock); + if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state && send_disconnect) { + rc = mca_btl_ugni_ep_send_disconnect (ep); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + BTL_VERBOSE(("could not send disconnect message to peer")); + } + + /* wait for the disconnect messages to go */ + do { + /* ensure all sends are complete before removing any procs */ + rc = mca_btl_ugni_progress_local_smsg (ugni_module, device); + if (OPAL_SUCCESS != rc) { + break; + } + } while (device->dev_smsg_local_cq.active_operations); + + (void) opal_atomic_add_32 
(&ep->smsg_ep_handle->device->smsg_connections, -1); + } + + mca_btl_ugni_device_lock (device); + + /* NTH: this call may not need the device lock. seems to work without it but + * the lock is here to be safe. */ + (void) mca_btl_ugni_ep_handle_destroy (ep->smsg_ep_handle); + ep->smsg_ep_handle = NULL; + + mca_btl_ugni_device_unlock (device); if (ep->mailbox) { - opal_free_list_return (&ep->btl->smsg_mboxes, ((opal_free_list_item_t *) ep->mailbox)); + opal_free_list_return (&ugni_module->smsg_mboxes, ((opal_free_list_item_t *) ep->mailbox)); ep->mailbox = NULL; } ep->state = MCA_BTL_UGNI_EP_STATE_INIT; - (void) opal_atomic_add_64 (&ep->btl->connected_peer_count, -11); return OPAL_SUCCESS; } static inline int mca_btl_ugni_ep_connect_start (mca_btl_base_endpoint_t *ep) { + mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (ep); + mca_btl_ugni_device_t *device = ugni_module->devices; int rc; - rc = mca_btl_ugni_ep_connect_rdma (ep); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - return rc; + /* protect against re-entry from opal_progress */ + if (OPAL_UNLIKELY(MCA_BTL_UGNI_EP_STATE_CONNECTING == ep->state)) { + return OPAL_ERR_RESOURCE_BUSY; } - BTL_VERBOSE(("initiaiting connection to remote peer with address: %u id: %u proc: %p", - ep->common->ep_rem_addr, ep->common->ep_rem_id, (void *)ep->peer_proc)); + ep->state = MCA_BTL_UGNI_EP_STATE_CONNECTING; + + BTL_VERBOSE(("initiating connection to remote peer with address: %u id: %u proc: %p", + ep->ep_rem_addr, ep->ep_rem_id, (void *)ep->peer_proc)); /* bind endpoint to remote address */ /* we bind two endpoints to seperate out local smsg completion and local fma completion */ - rc = opal_common_ugni_ep_create (ep->common, ep->btl->smsg_local_cq, &ep->smsg_ep_handle); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - return rc; + mca_btl_ugni_device_lock (device); + ep->smsg_ep_handle = mca_btl_ugni_ep_handle_create (ep, device->dev_smsg_local_cq.gni_handle, device); + mca_btl_ugni_device_unlock (device); + if 
(OPAL_UNLIKELY(NULL == ep->smsg_ep_handle)) { + return OPAL_ERR_OUT_OF_RESOURCE; } /* build connection data */ @@ -110,9 +232,10 @@ static inline int mca_btl_ugni_ep_connect_start (mca_btl_base_endpoint_t *ep) { return rc; } - ep->state = MCA_BTL_UGNI_EP_STATE_CONNECTING; - - memset (&ep->remote_attr, 0, sizeof (ep->remote_attr)); + ep->remote_attr = calloc (1, sizeof (*ep->remote_attr)); + if (OPAL_UNLIKELY(NULL == ep->remote_attr)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } BTL_VERBOSE(("btl/ugni connection to remote peer initiated")); @@ -120,15 +243,16 @@ static inline int mca_btl_ugni_ep_connect_start (mca_btl_base_endpoint_t *ep) { } static inline int mca_btl_ugni_ep_connect_finish (mca_btl_base_endpoint_t *ep) { + mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (ep); gni_return_t grc; int rc; BTL_VERBOSE(("finishing connection. remote attributes: msg_type = %d, msg_buffer = %p, buff_size = %d, " "mem_hndl = {qword1 = %" PRIu64 ", qword2 = %" PRIu64 "}, mbox = %d, mbox_maxcredit = %d, " - "msg_maxsize = %d", ep->remote_attr.smsg_attr.msg_type, ep->remote_attr.smsg_attr.msg_buffer, - ep->remote_attr.smsg_attr.buff_size, ep->remote_attr.smsg_attr.mem_hndl.qword1, - ep->remote_attr.smsg_attr.mem_hndl.qword2, ep->remote_attr.smsg_attr.mbox_offset, - ep->remote_attr.smsg_attr.mbox_maxcredit, ep->remote_attr.smsg_attr.msg_maxsize)); + "msg_maxsize = %d", ep->remote_attr->smsg_attr.msg_type, ep->remote_attr->smsg_attr.msg_buffer, + ep->remote_attr->smsg_attr.buff_size, ep->remote_attr->smsg_attr.mem_hndl.qword1, + ep->remote_attr->smsg_attr.mem_hndl.qword2, ep->remote_attr->smsg_attr.mbox_offset, + ep->remote_attr->smsg_attr.mbox_maxcredit, ep->remote_attr->smsg_attr.msg_maxsize)); BTL_VERBOSE(("finishing connection. 
local attributes: msg_type = %d, msg_buffer = %p, buff_size = %d, " "mem_hndl = {qword1 = %" PRIu64 ", qword2 = %" PRIu64 "}, mbox = %d, mbox_maxcredit = %d, " @@ -137,54 +261,78 @@ static inline int mca_btl_ugni_ep_connect_finish (mca_btl_base_endpoint_t *ep) { ep->mailbox->attr.smsg_attr.mem_hndl.qword2, ep->mailbox->attr.smsg_attr.mbox_offset, ep->mailbox->attr.smsg_attr.mbox_maxcredit, ep->mailbox->attr.smsg_attr.msg_maxsize)); - grc = GNI_SmsgInit (ep->smsg_ep_handle, &ep->mailbox->attr.smsg_attr, &ep->remote_attr.smsg_attr); + grc = GNI_SmsgInit (ep->smsg_ep_handle->gni_handle, &ep->mailbox->attr.smsg_attr, + &ep->remote_attr->smsg_attr); if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) { BTL_ERROR(("error initializing SMSG protocol. rc = %d", grc)); - return opal_common_rc_ugni_to_opal (grc); + return mca_btl_rc_ugni_to_opal (grc); } /* set the local event data to the local index and the remote event data to my * index on the remote peer. This makes lookup of endpoints on completion take * a single lookup in the endpoints array. we will not be able to change the * remote peer's index in the endpoint's array after this point. */ - GNI_EpSetEventData (ep->rdma_ep_handle, ep->index, ep->remote_attr.index); - GNI_EpSetEventData (ep->smsg_ep_handle, ep->index, ep->remote_attr.index); + GNI_EpSetEventData (ep->smsg_ep_handle->gni_handle, ep->index, ep->remote_attr->index); - ep->rmt_irq_mem_hndl = ep->remote_attr.rmt_irq_mem_hndl; + ep->rmt_irq_mem_hndl = ep->remote_attr->rmt_irq_mem_hndl; ep->state = MCA_BTL_UGNI_EP_STATE_CONNECTED; - (void) opal_atomic_add_64 (&ep->btl->connected_peer_count, 1); + (void) opal_atomic_add_32 (&ep->smsg_ep_handle->device->smsg_connections, 1); /* send all pending messages */ BTL_VERBOSE(("endpoint connected. 
posting %u sends", (unsigned int) opal_list_get_size (&ep->frag_wait_list))); rc = mca_btl_ugni_progress_send_wait_list (ep); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - OPAL_THREAD_LOCK(&ep->btl->ep_wait_list_lock); + OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock); if (false == ep->wait_listed) { - opal_list_append (&ep->btl->ep_wait_list, &ep->super); + opal_list_append (&ugni_module->ep_wait_list, &ep->super); ep->wait_listed = true; } - OPAL_THREAD_UNLOCK(&ep->btl->ep_wait_list_lock); + OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock); } + free (ep->remote_attr); + ep->remote_attr = NULL; + return OPAL_SUCCESS; } -static inline int mca_btl_ugni_directed_ep_post (mca_btl_base_endpoint_t *ep) { +static int mca_btl_ugni_directed_ep_post (mca_btl_base_endpoint_t *ep) +{ + mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (ep); + mca_btl_ugni_device_t *device = ep->smsg_ep_handle->device; gni_return_t rc; - BTL_VERBOSE(("posting directed datagram to remote id: %d for endpoint %p", ep->common->ep_rem_id, (void *)ep)); - ep->mailbox->attr.rmt_irq_mem_hndl = mca_btl_ugni_component.modules[0].device->smsg_irq_mhndl; + BTL_VERBOSE(("posting directed datagram to remote id: %d for endpoint %p", ep->ep_rem_id, (void *)ep)); + /* the irq cq is associated with only the first device */ + ep->mailbox->attr.rmt_irq_mem_hndl = ugni_module->devices->smsg_irq_mhndl; - rc = GNI_EpPostDataWId (ep->smsg_ep_handle, &ep->mailbox->attr, sizeof (ep->mailbox->attr), - &ep->remote_attr, sizeof (ep->remote_attr), + rc = GNI_EpPostDataWId (ep->smsg_ep_handle->gni_handle, &ep->mailbox->attr, sizeof (ep->mailbox->attr), + ep->remote_attr, sizeof (*ep->remote_attr), MCA_BTL_UGNI_CONNECT_DIRECTED_ID | ep->index); - return opal_common_rc_ugni_to_opal (rc); + return mca_btl_rc_ugni_to_opal (rc); } -int mca_btl_ugni_ep_connect_progress (mca_btl_base_endpoint_t *ep) { +int mca_btl_ugni_wildcard_ep_post (mca_btl_ugni_module_t *ugni_module) +{ + gni_return_t rc; + + BTL_VERBOSE(("posting 
wildcard datagram")); + + memset (&ugni_module->wc_local_attr, 0, sizeof (ugni_module->wc_local_attr)); + memset (&ugni_module->wc_remote_attr, 0, sizeof (ugni_module->wc_remote_attr)); + rc = GNI_EpPostDataWId (ugni_module->wildcard_ep, &ugni_module->wc_local_attr, + sizeof (ugni_module->wc_local_attr), &ugni_module->wc_remote_attr, + sizeof (ugni_module->wc_remote_attr), MCA_BTL_UGNI_CONNECT_WILDCARD_ID); + + return mca_btl_rc_ugni_to_opal (rc); +} + + +int mca_btl_ugni_ep_connect_progress (mca_btl_base_endpoint_t *ep) +{ int rc; BTL_VERBOSE(("progressing connection for endpoint %p with state %d", (void *)ep, ep->state)); @@ -193,14 +341,17 @@ int mca_btl_ugni_ep_connect_progress (mca_btl_base_endpoint_t *ep) { return OPAL_SUCCESS; } - if (MCA_BTL_UGNI_EP_STATE_RDMA >= ep->state) { + if (MCA_BTL_UGNI_EP_STATE_INIT == ep->state) { rc = mca_btl_ugni_ep_connect_start (ep); if (OPAL_SUCCESS != rc) { return rc; } } - if (GNI_SMSG_TYPE_INVALID == ep->remote_attr.smsg_attr.msg_type) { + BTL_VERBOSE(("ep->remote_attr->smsg_attr = {.msg_type = %d, .msg_buffer = 0x%lx}", ep->remote_attr->smsg_attr.msg_type, + ep->remote_attr->smsg_attr.msg_buffer)); + + if (GNI_SMSG_TYPE_INVALID == ep->remote_attr->smsg_attr.msg_type) { /* use datagram to exchange connection information with the remote peer */ if (!ep->dg_posted) { rc = mca_btl_ugni_directed_ep_post (ep); @@ -217,3 +368,77 @@ int mca_btl_ugni_ep_connect_progress (mca_btl_base_endpoint_t *ep) { return mca_btl_ugni_ep_connect_finish (ep); } + +int mca_btl_ugni_endpoint_handle_init_rdma (opal_free_list_item_t *item, void *ctx) +{ + mca_btl_ugni_endpoint_handle_t *handle = (mca_btl_ugni_endpoint_handle_t *) item; + mca_btl_ugni_device_t *device = (mca_btl_ugni_device_t *) ctx; + gni_return_t grc; + + grc = GNI_EpCreate (device->dev_handle, device->dev_rdma_local_cq.gni_handle, &handle->gni_handle); + handle->device = device; + return mca_btl_rc_ugni_to_opal (grc); +} + +static void mca_btl_ugni_endpoint_handle_construct 
(mca_btl_ugni_endpoint_handle_t *handle) +{ + handle->gni_handle = 0; +} + +static void mca_btl_ugni_endpoint_handle_destruct (mca_btl_ugni_endpoint_handle_t *handle) +{ + if (handle->gni_handle) { + GNI_EpDestroy (handle->gni_handle); + handle->gni_handle = 0; + } +} + +OBJ_CLASS_INSTANCE(mca_btl_ugni_endpoint_handle_t, opal_object_t, + mca_btl_ugni_endpoint_handle_construct, + mca_btl_ugni_endpoint_handle_destruct); + +mca_btl_ugni_endpoint_handle_t *mca_btl_ugni_ep_handle_create (mca_btl_ugni_endpoint_t *ep, gni_cq_handle_t cq, + mca_btl_ugni_device_t *device) +{ + mca_btl_ugni_endpoint_handle_t *ep_handle; + gni_return_t grc; + + ep_handle = OBJ_NEW(mca_btl_ugni_endpoint_handle_t); + if (OPAL_UNLIKELY(NULL == ep_handle)) { + return NULL; + } + + ep_handle->device = device; + + /* create a uGNI endpoint handle and bind it to the remote peer */ + grc = GNI_EpCreate (device->dev_handle, cq, &ep_handle->gni_handle); + if (OPAL_LIKELY(GNI_RC_SUCCESS == grc)) { + grc = GNI_EpBind (ep_handle->gni_handle, ep->ep_rem_addr, ep->ep_rem_id); + } + + if (GNI_RC_SUCCESS != grc) { + OBJ_RELEASE(ep_handle); + ep_handle = NULL; + } + + return ep_handle; +} + +int mca_btl_ugni_ep_handle_destroy (mca_btl_ugni_endpoint_handle_t *ep_handle) +{ + int rc; + + if (NULL == ep_handle || 0 == ep_handle->gni_handle) { + return OPAL_SUCCESS; + } + + /* TODO: need to fix, may be outstanding tx's, etc. */ + rc = GNI_EpUnbind (ep_handle->gni_handle); + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { + /* should warn */ + } + + OBJ_RELEASE(ep_handle); + + return OPAL_SUCCESS; +} diff --git a/opal/mca/btl/ugni/btl_ugni_endpoint.h b/opal/mca/btl/ugni/btl_ugni_endpoint.h index 308bae9ac8..9ee68df139 100644 --- a/opal/mca/btl/ugni/btl_ugni_endpoint.h +++ b/opal/mca/btl/ugni/btl_ugni_endpoint.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2016 Los Alamos National Security, LLC. 
All rights + * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. * $COPYRIGHT$ @@ -17,15 +17,22 @@ enum mca_btl_ugni_endpoint_state_t { MCA_BTL_UGNI_EP_STATE_INIT = 0, - MCA_BTL_UGNI_EP_STATE_START, - MCA_BTL_UGNI_EP_STATE_RDMA, MCA_BTL_UGNI_EP_STATE_CONNECTING, - MCA_BTL_UGNI_EP_STATE_CONNECTED + MCA_BTL_UGNI_EP_STATE_CONNECTED, }; typedef enum mca_btl_ugni_endpoint_state_t mca_btl_ugni_endpoint_state_t; struct mca_btl_ugni_smsg_mbox_t; +struct mca_btl_ugni_endpoint_handle_t { + opal_free_list_item_t super; + mca_btl_ugni_device_t *device; + gni_ep_handle_t gni_handle; +}; + +typedef struct mca_btl_ugni_endpoint_handle_t mca_btl_ugni_endpoint_handle_t; +OBJ_CLASS_DECLARATION(mca_btl_ugni_endpoint_handle_t); + typedef struct mca_btl_base_endpoint_t { opal_list_item_t super; @@ -37,24 +44,34 @@ typedef struct mca_btl_base_endpoint_t { opal_recursive_mutex_t lock; mca_btl_ugni_endpoint_state_t state; - opal_common_ugni_endpoint_t *common; + /** Remote NIC address */ + uint32_t ep_rem_addr; - mca_btl_ugni_module_t *btl; + /** Remote CDM identifier (base) */ + uint32_t ep_rem_id; - gni_ep_handle_t smsg_ep_handle; - gni_ep_handle_t rdma_ep_handle; + /** endpoint to use for SMSG messages */ + mca_btl_ugni_endpoint_handle_t *smsg_ep_handle; - mca_btl_ugni_endpoint_attr_t remote_attr; /* TODO: UGH, remove this */ + /** temporary space to store the remote SMSG attributes */ + mca_btl_ugni_endpoint_attr_t *remote_attr; + /** SMSG mailbox assigned to this endpoint */ struct mca_btl_ugni_smsg_mbox_t *mailbox; - gni_mem_handle_t rmt_irq_mem_hndl; + /** Remote IRQ handle (for async completion) */ + gni_mem_handle_t rmt_irq_mem_hndl; + /** frags waiting for SMSG credits */ opal_list_t frag_wait_list; + + /** endpoint is currently wait-listed for SMSG progress */ bool wait_listed; + /** protect against race on connection */ bool dg_posted; + /** protect against re-entry to SMSG */ int32_t 
smsg_progressing; int index; @@ -65,49 +82,10 @@ OBJ_CLASS_DECLARATION(mca_btl_ugni_endpoint_t); int mca_btl_ugni_ep_connect_progress (mca_btl_ugni_endpoint_t *ep); int mca_btl_ugni_ep_disconnect (mca_btl_ugni_endpoint_t *ep, bool send_disconnect); - -static inline int mca_btl_ugni_init_ep (mca_btl_ugni_module_t *ugni_module, - mca_btl_ugni_endpoint_t **ep, - mca_btl_ugni_module_t *btl, - opal_proc_t *peer_proc) { - mca_btl_ugni_endpoint_t *endpoint; - - endpoint = OBJ_NEW(mca_btl_ugni_endpoint_t); - assert (endpoint != NULL); - - endpoint->smsg_progressing = 0; - endpoint->state = MCA_BTL_UGNI_EP_STATE_INIT; - - endpoint->btl = btl; - endpoint->peer_proc = peer_proc; - endpoint->index = opal_pointer_array_add (&ugni_module->endpoints, endpoint); - - *ep = endpoint; - - return OPAL_SUCCESS; -} - -static inline void mca_btl_ugni_release_ep (mca_btl_ugni_endpoint_t *ep) { - int rc; - - if (ep->common) { - opal_mutex_lock (&ep->lock); - - rc = mca_btl_ugni_ep_disconnect (ep, false); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - BTL_VERBOSE(("btl/ugni error disconnecting endpoint")); - } - - /* TODO -- Clear space at the end of the endpoint array */ - opal_pointer_array_set_item (&ep->btl->endpoints, ep->index, NULL); - - opal_mutex_unlock (&ep->lock); - - opal_common_ugni_endpoint_return (ep->common); - } - - OBJ_RELEASE(ep); -} +int mca_btl_ugni_wildcard_ep_post (mca_btl_ugni_module_t *ugni_module); +void mca_btl_ugni_release_ep (mca_btl_ugni_endpoint_t *ep); +int mca_btl_ugni_init_ep (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_endpoint_t **ep, + mca_btl_ugni_module_t *btl, opal_proc_t *peer_proc); static inline int mca_btl_ugni_check_endpoint_state (mca_btl_ugni_endpoint_t *ep) { int rc; @@ -120,8 +98,6 @@ static inline int mca_btl_ugni_check_endpoint_state (mca_btl_ugni_endpoint_t *ep switch (ep->state) { case MCA_BTL_UGNI_EP_STATE_INIT: - case MCA_BTL_UGNI_EP_STATE_RDMA: - case MCA_BTL_UGNI_EP_STATE_START: rc = mca_btl_ugni_ep_connect_progress (ep); if 
(OPAL_SUCCESS != rc) { break; @@ -138,63 +114,91 @@ static inline int mca_btl_ugni_check_endpoint_state (mca_btl_ugni_endpoint_t *ep return rc; } -static inline int mca_btl_ugni_ep_connect_rdma (mca_btl_base_endpoint_t *ep) { - int rc; - - if (ep->state >= MCA_BTL_UGNI_EP_STATE_RDMA) { - return OPAL_SUCCESS; - } - - /* protect against re-entry from opal_progress */ - if (OPAL_UNLIKELY(MCA_BTL_UGNI_EP_STATE_START == ep->state)) { - return OPAL_ERR_RESOURCE_BUSY; - } - - ep->state = MCA_BTL_UGNI_EP_STATE_START; - - /* get the modex info for this endpoint and setup a ugni endpoint. this call may lead - * to re-entry through opal_progress(). */ - rc = opal_common_ugni_endpoint_for_proc (ep->btl->device, ep->peer_proc, &ep->common); - if (OPAL_SUCCESS != rc) { - assert (0); - return rc; - } - - /* bind endpoint to remote address */ - rc = opal_common_ugni_ep_create (ep->common, ep->btl->rdma_local_cq, &ep->rdma_ep_handle); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - return rc; - } - - ep->state = MCA_BTL_UGNI_EP_STATE_RDMA; - - return OPAL_SUCCESS; +/** + * Accessor function for endpoint btl + * + * @param[in] ep endpoint to query + * + * This helper function exists to make it easy to switch between using a single + * and multiple ugni modules. Currently there is only one so we just use the + * pointer in the component structure. This saves 4-8 bytes in the endpoint + * structure. + */ +static inline mca_btl_ugni_module_t *mca_btl_ugni_ep_btl (mca_btl_ugni_endpoint_t *ep) +{ + /* there is only one ugni module at this time. if that changes add a btl pointer back + * to the endpoint structure. */ + return mca_btl_ugni_component.modules; } -static inline int mca_btl_ugni_check_endpoint_state_rdma (mca_btl_base_endpoint_t *ep) { - int rc; - if (OPAL_LIKELY(MCA_BTL_UGNI_EP_STATE_INIT < ep->state)) { - return OPAL_SUCCESS; +/** + * Allocate and bind a uGNI endpoint handle to the remote peer. 
+ * + * @param[in] ep BTL endpoint + * @param[in] cq completion queue + * @param[in] device BTL device used to create the handle (the new handle is returned; NULL on failure) + */ +mca_btl_ugni_endpoint_handle_t *mca_btl_ugni_ep_handle_create (mca_btl_ugni_endpoint_t *ep, gni_cq_handle_t cq, + mca_btl_ugni_device_t *device); + +/** + * Unbind and free the uGNI endpoint handle. + * + * @param[in] ep_handle uGNI endpoint handle to unbind and release + */ +int mca_btl_ugni_ep_handle_destroy (mca_btl_ugni_endpoint_handle_t *ep_handle); + +/** + * Free list initialization function for endpoint handles (DO NOT CALL outside free list) + * + * @param[in] item Free list item to initialize + * @param[in] ctx Free list context + * + * @returns OPAL_SUCCESS on success + * @returns OPAL error code on error + */ +int mca_btl_ugni_endpoint_handle_init_rdma (opal_free_list_item_t *item, void *ctx); + +/** + * @brief get an endpoint handle from a device's free list + * + * @param[in] ep btl endpoint + * @param[in] device btl device to use + * + * This function MUST be called with the device lock held. This was done over using + * the atomic free list to avoid unnecessary atomics in the critical path. 
+ */ +static inline mca_btl_ugni_endpoint_handle_t * +mca_btl_ugni_ep_get_rdma (mca_btl_ugni_endpoint_t *ep, mca_btl_ugni_device_t *device) +{ + mca_btl_ugni_endpoint_handle_t *ep_handle; + gni_return_t grc; + + ep_handle = (mca_btl_ugni_endpoint_handle_t *) opal_free_list_get_st (&device->endpoints); + if (OPAL_UNLIKELY(NULL == ep_handle)) { + return NULL; + } + grc = GNI_EpBind (ep_handle->gni_handle, ep->ep_rem_addr, ep->ep_rem_id | device->dev_index); + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) { + opal_free_list_return_st (&device->endpoints, &ep_handle->super); + ep_handle = NULL; } - opal_mutex_lock (&ep->lock); - rc = mca_btl_ugni_ep_connect_rdma (ep); - opal_mutex_unlock (&ep->lock); - return rc; + return ep_handle; } -static inline int mca_btl_ugni_wildcard_ep_post (mca_btl_ugni_module_t *ugni_module) { - gni_return_t rc; - - BTL_VERBOSE(("posting wildcard datagram")); - - memset (&ugni_module->wc_local_attr, 0, sizeof (ugni_module->wc_local_attr)); - memset (&ugni_module->wc_remote_attr, 0, sizeof (ugni_module->wc_remote_attr)); - rc = GNI_EpPostDataWId (ugni_module->wildcard_ep, &ugni_module->wc_local_attr, - sizeof (ugni_module->wc_local_attr), &ugni_module->wc_remote_attr, - sizeof (ugni_module->wc_remote_attr), MCA_BTL_UGNI_CONNECT_WILDCARD_ID); - - return opal_common_rc_ugni_to_opal (rc); +/** + * @brief return an endpoint handle to a device's free list + * + * @param[in] ep_handle endpoint handle to return + * + * This function MUST be called with the device lock held. This was done over using + * the atomic free list to avoid unnecessary atomics in the critical path. 
+ */ +static inline void mca_btl_ugni_ep_return_rdma (mca_btl_ugni_endpoint_handle_t *ep_handle) +{ + (void) GNI_EpUnbind (ep_handle->gni_handle); + opal_free_list_return_st (&ep_handle->device->endpoints, &ep_handle->super); } #endif /* MCA_BTL_UGNI_ENDPOINT_H */ diff --git a/opal/mca/btl/ugni/btl_ugni_frag.c b/opal/mca/btl/ugni/btl_ugni_frag.c index a576c0c521..49891cc663 100644 --- a/opal/mca/btl/ugni/btl_ugni_frag.c +++ b/opal/mca/btl/ugni/btl_ugni_frag.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. * $COPYRIGHT$ @@ -38,11 +38,25 @@ OBJ_CLASS_INSTANCE(mca_btl_ugni_rdma_frag_t, mca_btl_base_descriptor_t, OBJ_CLASS_INSTANCE(mca_btl_ugni_eager_frag_t, mca_btl_base_descriptor_t, mca_btl_ugni_eager_frag_constructor, NULL); -OBJ_CLASS_INSTANCE(mca_btl_ugni_post_descriptor_t, opal_free_list_item_t, - NULL, NULL); - -int mca_btl_ugni_frag_init (mca_btl_ugni_base_frag_t *frag, mca_btl_ugni_module_t *ugni_module) +static void mca_btl_ugni_post_descriptor_constructor (mca_btl_ugni_post_descriptor_t *desc) { + desc->cq = NULL; + desc->ep_handle = NULL; +} + +OBJ_CLASS_INSTANCE(mca_btl_ugni_post_descriptor_t, opal_free_list_item_t, + mca_btl_ugni_post_descriptor_constructor, NULL); + +int mca_btl_ugni_frag_init (mca_btl_ugni_base_frag_t *frag, void *id) +{ + /* NTH: the id is a combination of the module id and the free list id. for now there + * is only ever one module so the module id is ignored. 
if this changes the code + * here and btl_ugni_add_procs.c (opal_free_list_init calls) needs to be updated */ + intptr_t free_list_id = (intptr_t) id & 0xff; + mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_component.modules; + frag->msg_id = opal_pointer_array_add (&ugni_module->pending_smsg_frags_bb, (void *) frag); + frag->my_list = ugni_module->frags_lists + free_list_id; + return OPAL_SUCCESS; } diff --git a/opal/mca/btl/ugni/btl_ugni_frag.h b/opal/mca/btl/ugni/btl_ugni_frag.h index 5faa0061b0..bb8a58cbc8 100644 --- a/opal/mca/btl/ugni/btl_ugni_frag.h +++ b/opal/mca/btl/ugni/btl_ugni_frag.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2013 The University of Tennessee and The University @@ -72,7 +72,7 @@ typedef struct mca_btl_ugni_base_frag_t { uint16_t flags; mca_btl_ugni_frag_hdr_t hdr; mca_btl_base_segment_t segments[2]; - opal_common_ugni_post_desc_t post_desc; + gni_post_descriptor_t post_desc; mca_btl_base_endpoint_t *endpoint; mca_btl_ugni_reg_t *registration; opal_free_list_t *my_list; @@ -88,12 +88,15 @@ typedef struct mca_btl_ugni_base_frag_t mca_btl_ugni_eager_frag_t; typedef struct mca_btl_ugni_post_descriptor_t { opal_free_list_item_t super; - opal_common_ugni_post_desc_t desc; + gni_post_descriptor_t desc; + mca_btl_ugni_endpoint_handle_t *ep_handle; mca_btl_base_endpoint_t *endpoint; mca_btl_base_registration_handle_t *local_handle; mca_btl_base_rdma_completion_fn_t cbfunc; + mca_btl_ugni_cq_t *cq; void *cbdata; void *ctx; + int tries; } mca_btl_ugni_post_descriptor_t; OBJ_CLASS_DECLARATION(mca_btl_ugni_post_descriptor_t); @@ -101,26 +104,38 @@ OBJ_CLASS_DECLARATION(mca_btl_ugni_post_descriptor_t); #define MCA_BTL_UGNI_DESC_TO_PDESC(desc) \ ((mca_btl_ugni_post_descriptor_t 
*)((uintptr_t) (desc) - offsetof (mca_btl_ugni_post_descriptor_t, desc))) -static inline void mca_btl_ugni_alloc_post_descriptor (mca_btl_base_endpoint_t *endpoint, mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata, - mca_btl_ugni_post_descriptor_t **desc) +static inline mca_btl_ugni_post_descriptor_t * +mca_btl_ugni_alloc_post_descriptor (mca_btl_base_endpoint_t *endpoint, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - *desc = (mca_btl_ugni_post_descriptor_t *) opal_free_list_get (&endpoint->btl->post_descriptors); - if (NULL != *desc) { - (*desc)->cbfunc = cbfunc; - (*desc)->ctx = cbcontext; - (*desc)->cbdata = cbdata; - (*desc)->local_handle = local_handle; - (*desc)->endpoint = endpoint; - (void) OPAL_THREAD_ADD64(&endpoint->btl->active_rdma_count, 1); + /* mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (endpoint); */ + mca_btl_ugni_post_descriptor_t *desc; + + desc = OBJ_NEW(mca_btl_ugni_post_descriptor_t); + /* (mca_btl_ugni_post_descriptor_t *) opal_free_list_get (&ugni_module->post_descriptors); */ + if (OPAL_UNLIKELY(NULL != desc)) { + desc->cbfunc = cbfunc; + desc->ctx = cbcontext; + desc->cbdata = cbdata; + desc->local_handle = local_handle; + desc->endpoint = endpoint; } + + return desc; } -static inline void mca_btl_ugni_return_post_descriptor (mca_btl_ugni_module_t *module, - mca_btl_ugni_post_descriptor_t *desc) +static inline void mca_btl_ugni_return_post_descriptor (mca_btl_ugni_post_descriptor_t *desc) { - (void) OPAL_THREAD_ADD64(&module->active_rdma_count, -1); - opal_free_list_return (&module->post_descriptors, &desc->super); + /* mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (desc->endpoint); */ + + if (NULL != desc->ep_handle) { + mca_btl_ugni_ep_return_rdma (desc->ep_handle); + /* desc->ep_handle = NULL; */ + } + + /* desc->cq = NULL; */ + /* opal_free_list_return 
(&ugni_module->post_descriptors, &desc->super); */ + free (desc); } static inline void mca_btl_ugni_post_desc_complete (mca_btl_ugni_module_t *module, mca_btl_ugni_post_descriptor_t *desc, int rc) @@ -129,40 +144,38 @@ static inline void mca_btl_ugni_post_desc_complete (mca_btl_ugni_module_t *modul if (NULL != desc->cbfunc) { /* call the user's callback function */ - desc->cbfunc (&module->super, desc->endpoint, (void *)(intptr_t) desc->desc.base.local_addr, + desc->cbfunc (&module->super, desc->endpoint, (void *)(intptr_t) desc->desc.local_addr, desc->local_handle, desc->ctx, desc->cbdata, rc); } /* the descriptor is no longer needed */ - mca_btl_ugni_return_post_descriptor (module, desc); + mca_btl_ugni_return_post_descriptor (desc); } OBJ_CLASS_DECLARATION(mca_btl_ugni_smsg_frag_t); OBJ_CLASS_DECLARATION(mca_btl_ugni_rdma_frag_t); OBJ_CLASS_DECLARATION(mca_btl_ugni_eager_frag_t); -int mca_btl_ugni_frag_init (mca_btl_ugni_base_frag_t *frag, mca_btl_ugni_module_t *ugni_module); +int mca_btl_ugni_frag_init (mca_btl_ugni_base_frag_t *frag, void *id); -static inline int mca_btl_ugni_frag_alloc (mca_btl_base_endpoint_t *ep, - opal_free_list_t *list, - mca_btl_ugni_base_frag_t **frag) +static inline mca_btl_ugni_base_frag_t *mca_btl_ugni_frag_alloc (mca_btl_base_endpoint_t *ep, + opal_free_list_t *list) { - *frag = (mca_btl_ugni_base_frag_t *) opal_free_list_get (list); - if (OPAL_LIKELY(NULL != *frag)) { - (*frag)->my_list = list; - (*frag)->endpoint = ep; - (*frag)->ref_cnt = 1; - return OPAL_SUCCESS; + mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) opal_free_list_get (list); + if (OPAL_LIKELY(NULL != frag)) { + frag->endpoint = ep; + frag->ref_cnt = 1; } - return OPAL_ERR_OUT_OF_RESOURCE; + return frag; } static inline int mca_btl_ugni_frag_return (mca_btl_ugni_base_frag_t *frag) { + mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (frag->endpoint); if (frag->registration) { - frag->endpoint->btl->rcache->rcache_deregister 
(frag->endpoint->btl->rcache, - (mca_rcache_base_registration_t *) frag->registration); + ugni_module->rcache->rcache_deregister (ugni_module->rcache, + (mca_rcache_base_registration_t *) frag->registration); frag->registration = NULL; } @@ -174,6 +187,7 @@ static inline int mca_btl_ugni_frag_return (mca_btl_ugni_base_frag_t *frag) } static inline bool mca_btl_ugni_frag_del_ref (mca_btl_ugni_base_frag_t *frag, int rc) { + mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (frag->endpoint); int32_t ref_cnt; opal_atomic_mb (); @@ -186,7 +200,7 @@ static inline bool mca_btl_ugni_frag_del_ref (mca_btl_ugni_base_frag_t *frag, in /* call callback if specified */ if (frag->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { - frag->base.des_cbfunc(&frag->endpoint->btl->super, frag->endpoint, &frag->base, rc); + frag->base.des_cbfunc(&ugni_module->super, frag->endpoint, &frag->base, rc); } if (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP) { @@ -208,15 +222,38 @@ static inline bool mca_btl_ugni_frag_check_complete (mca_btl_ugni_base_frag_t *f return !!(MCA_BTL_UGNI_FRAG_COMPLETE & frag->flags); } -#define MCA_BTL_UGNI_FRAG_ALLOC_SMSG(ep, frag) \ - mca_btl_ugni_frag_alloc((ep), &(ep)->btl->smsg_frags, &(frag)) -#define MCA_BTL_UGNI_FRAG_ALLOC_RDMA(ep, frag) \ - mca_btl_ugni_frag_alloc((ep), &(ep)->btl->rdma_frags, &(frag)) -#define MCA_BTL_UGNI_FRAG_ALLOC_RDMA_INT(ep, frag) \ - mca_btl_ugni_frag_alloc((ep), &(ep)->btl->rdma_int_frags, &(frag)) -#define MCA_BTL_UGNI_FRAG_ALLOC_EAGER_SEND(ep, frag) \ - mca_btl_ugni_frag_alloc((ep), &(ep)->btl->eager_frags_send, &(frag)) -#define MCA_BTL_UGNI_FRAG_ALLOC_EAGER_RECV(ep, frag) \ - mca_btl_ugni_frag_alloc((ep), &(ep)->btl->eager_frags_recv, &(frag)) + +void mca_btl_ugni_wait_list_append (mca_btl_ugni_module_t *ugni_module, mca_btl_base_endpoint_t *endpoint, + mca_btl_ugni_base_frag_t *frag); + +static inline mca_btl_ugni_base_frag_t *mca_btl_ugni_frag_alloc_smsg (mca_btl_base_endpoint_t *ep) +{ + 
mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (ep); + return mca_btl_ugni_frag_alloc (ep, ugni_module->frags_lists + MCA_BTL_UGNI_LIST_SMSG); +} + +static inline mca_btl_ugni_base_frag_t *mca_btl_ugni_frag_alloc_rdma (mca_btl_base_endpoint_t *ep) +{ + mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (ep); + return mca_btl_ugni_frag_alloc (ep, ugni_module->frags_lists + MCA_BTL_UGNI_LIST_RDMA); +} + +static inline mca_btl_ugni_base_frag_t *mca_btl_ugni_frag_alloc_rdma_int (mca_btl_base_endpoint_t *ep) +{ + mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (ep); + return mca_btl_ugni_frag_alloc (ep, ugni_module->frags_lists + MCA_BTL_UGNI_LIST_RDMA_INT); +} + +static inline mca_btl_ugni_base_frag_t *mca_btl_ugni_frag_alloc_eager_send (mca_btl_base_endpoint_t *ep) +{ + mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (ep); + return mca_btl_ugni_frag_alloc (ep, ugni_module->frags_lists + MCA_BTL_UGNI_LIST_EAGER_SEND); +} + +static inline mca_btl_ugni_base_frag_t *mca_btl_ugni_frag_alloc_eager_recv (mca_btl_base_endpoint_t *ep) +{ + mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (ep); + return mca_btl_ugni_frag_alloc (ep, ugni_module->frags_lists + MCA_BTL_UGNI_LIST_EAGER_RECV); +} #endif /* MCA_BTL_UGNI_FRAG_H */ diff --git a/opal/mca/btl/ugni/btl_ugni_get.c b/opal/mca/btl/ugni/btl_ugni_get.c index 9c04208cba..1f8ab248b0 100644 --- a/opal/mca/btl/ugni/btl_ugni_get.c +++ b/opal/mca/btl/ugni/btl_ugni_get.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. 
* $COPYRIGHT$ @@ -37,11 +37,8 @@ int mca_btl_ugni_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t return OPAL_ERR_NOT_AVAILABLE; } - BTL_VERBOSE(("Using RDMA/FMA Get from local address %p to remote address %" PRIx64, - local_address, remote_address)); - - /* cause endpoint to bind if it isn't already (bind is sufficient for rdma) */ - (void) mca_btl_ugni_check_endpoint_state_rdma (endpoint); + BTL_VERBOSE(("Using RDMA/FMA Get %lu bytes to local address %p to remote address %" PRIx64, + (unsigned long) size, local_address, remote_address)); return mca_btl_ugni_post (endpoint, true, size, local_address, remote_address, local_handle, remote_handle, order, cbfunc, cbcontext, cbdata); @@ -110,13 +107,15 @@ static void mca_btl_ugni_callback_eager_get (struct mca_btl_base_module_t *btl, } reg = mca_btl_base_active_message_trigger + tag; - reg->cbfunc(&frag->endpoint->btl->super, tag, &(tmp.base), reg->cbdata); + reg->cbfunc(&ugni_module->super, tag, &(tmp.base), reg->cbdata); /* fill in the response header */ frag->hdr.rdma.ctx = frag->hdr.eager.ctx; frag->flags = MCA_BTL_UGNI_FRAG_RESPONSE; frag->ref_cnt = 1; + frag->ref_cnt = 1; + /* once complete use this fragment for a pending eager get if any exist */ frag->base.des_cbfunc = mca_btl_ugni_callback_eager_get_progress_pending; @@ -125,16 +124,7 @@ static void mca_btl_ugni_callback_eager_get (struct mca_btl_base_module_t *btl, NULL, 0, MCA_BTL_UGNI_TAG_RDMA_COMPLETE); if (OPAL_UNLIKELY(0 > rc)) { /* queue fragment */ - OPAL_THREAD_LOCK(&endpoint->lock); - if (false == endpoint->wait_listed) { - OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock); - opal_list_append (&ugni_module->ep_wait_list, &endpoint->super); - OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock); - endpoint->wait_listed = true; - } - - opal_list_append (&endpoint->frag_wait_list, (opal_list_item_t *) frag); - OPAL_THREAD_UNLOCK(&endpoint->lock); + mca_btl_ugni_wait_list_append (ugni_module, endpoint, frag); } } @@ -142,7 +132,7 @@ int 
mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *endpoint, mca_btl_ugni_eager_ex_frag_hdr_t hdr, mca_btl_ugni_base_frag_t *frag) { - mca_btl_ugni_module_t *ugni_module = endpoint->btl; + mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (endpoint); size_t size; int rc; @@ -151,10 +141,10 @@ int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *endpoint, do { if (NULL == frag) { /* try to allocate a registered buffer */ - rc = MCA_BTL_UGNI_FRAG_ALLOC_EAGER_RECV(endpoint, frag); + frag = mca_btl_ugni_frag_alloc_eager_recv (endpoint); if (OPAL_UNLIKELY(NULL == frag)) { /* no registered buffers available. try again later */ - (void) MCA_BTL_UGNI_FRAG_ALLOC_RDMA_INT(endpoint, frag); + frag = mca_btl_ugni_frag_alloc_rdma_int (endpoint); /* not much can be done if a small fragment can not be allocated. abort! */ assert (NULL != frag); diff --git a/opal/mca/btl/ugni/btl_ugni_init.c b/opal/mca/btl/ugni/btl_ugni_init.c new file mode 100644 index 0000000000..87ebd3f20a --- /dev/null +++ b/opal/mca/btl/ugni/btl_ugni_init.c @@ -0,0 +1,306 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2014 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include "btl_ugni.h" +#include "btl_ugni_endpoint.h" + +#include "opal/class/opal_list.h" +#include "opal/dss/dss.h" +#include "opal/mca/pmix/pmix.h" +#include "opal/util/bit_ops.h" + +static inline int get_ptag(uint8_t *out_ptag) +{ + /* TODO no need for tmp */ + char *ptr; + uint8_t tmp_ptag; + + if (NULL == (ptr = getenv("PMI_GNI_PTAG"))) { + /* TODO add err msg - better rc? 
*/ + return OPAL_ERR_NOT_FOUND; + } + errno = 0; + tmp_ptag = (uint8_t)strtoul (ptr, (char **)NULL, 10); + if (0 != errno) { + /* TODO add err msg - better rc? */ + return OPAL_ERR_VALUE_OUT_OF_BOUNDS; + } + *out_ptag = tmp_ptag; + return OPAL_SUCCESS; +} + +static inline int get_cookie (uint32_t *out_cookie) +{ + /* TODO no need for tmp */ + char *ptr; + uint32_t tmp_cookie; + + if (NULL == (ptr = getenv("PMI_GNI_COOKIE"))) { + /* TODO add err msg - better rc? */ + return OPAL_ERR_NOT_FOUND; + } + errno = 0; + tmp_cookie = (uint32_t) strtoul (ptr, NULL, 10); + if (0 != errno) { + /* TODO add err msg - better rc? */ + return OPAL_ERR_VALUE_OUT_OF_BOUNDS; + } + + *out_cookie = tmp_cookie; + + return OPAL_SUCCESS; +} + +static unsigned int mca_btl_ugni_get_nic_address(int device_id) +{ + unsigned int address, cpu_id; + gni_return_t status; + int i, alps_dev_id = -1; + char *token,*p_ptr; + + p_ptr = getenv("PMI_GNI_DEV_ID"); + if (!p_ptr) { + status = GNI_CdmGetNicAddress(device_id, &address, &cpu_id); + if(status != GNI_RC_SUCCESS) { + opal_output (0, "FAILED:GNI_CdmGetNicAddress returned error %d", status); + return (unsigned int)-1; + } + return address; + } + + while (NULL != (token = strtok(p_ptr, ":"))) { + alps_dev_id = atoi(token); + if (alps_dev_id == device_id) { + break; + } + p_ptr = NULL; + } + + if (OPAL_UNLIKELY(-1 == alps_dev_id)) { + return (unsigned int)-1; + } + + p_ptr = getenv("PMI_GNI_LOC_ADDR"); + if (OPAL_UNLIKELY(NULL == p_ptr)) { + return (unsigned int)-1; + } + + i = 0; + while (NULL != (token = strtok(p_ptr, ":"))) { + if (i == alps_dev_id) { + return strtoul (token, NULL, 10); + } + p_ptr = NULL; + ++i; + } + + return (unsigned int)-1; +} + +int mca_btl_ugni_device_init (mca_btl_ugni_device_t *device, int virtual_device_id) +{ + uint32_t dev_pe_addr; + int rc; + + OBJ_CONSTRUCT(&device->endpoints, opal_free_list_t); + OBJ_CONSTRUCT(&device->pending_post, opal_list_t); + + rc = opal_free_list_init (&device->endpoints, sizeof 
(mca_btl_ugni_endpoint_handle_t), + 8, OBJ_CLASS(mca_btl_ugni_endpoint_handle_t), 0, 8, 0, + mca_btl_ugni_component.local_cq_size, 16, + NULL, 0, NULL, mca_btl_ugni_endpoint_handle_init_rdma, + (void *) device); + if (OPAL_SUCCESS != rc) { + OBJ_DESTRUCT(&device->endpoints); + return rc; + } + + /* create a communication domain */ + rc = GNI_CdmCreate (mca_btl_ugni_component.cdm_id_base | virtual_device_id, mca_btl_ugni_component.ptag, + mca_btl_ugni_component.cookie, mca_btl_ugni_component.cdm_flags, &device->dev_cd_handle); + if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { + /* this REALLY is an error but under alps + mapn we may not get any credentials */ + BTL_VERBOSE(("Error: Creating communication domain %d for virtual device %d", rc, virtual_device_id)); + return mca_btl_rc_ugni_to_opal (rc); + } + + device->dev_index = virtual_device_id; + + /* Create a NIC Adress */ + OPAL_OUTPUT((-1, "Got NIC Addr: 0x%08x, CPU ID: %d", mca_btl_ugni_component.dev_addr, 0)); + + /* Attach device to the communication domain */ + rc = GNI_CdmAttach (device->dev_cd_handle, 0, &dev_pe_addr, &device->dev_handle); + if (GNI_RC_SUCCESS != rc) { + BTL_VERBOSE(("Error: Attaching to communication domain. 
rc = %d, virtual device = %d", rc, virtual_device_id)); + return mca_btl_rc_ugni_to_opal (rc); + } + + device->lock = 0; + device->dev_rdma_local_cq.gni_handle = 0; + device->dev_rdma_local_cq.active_operations = 0; + device->dev_rdma_local_irq_cq.gni_handle = 0; + device->dev_rdma_local_irq_cq.active_operations = 0; + device->dev_smsg_local_cq.gni_handle = 0; + device->dev_smsg_local_cq.active_operations= 0; + + return OPAL_SUCCESS; +} + +int mca_btl_ugni_device_fini (mca_btl_ugni_device_t *dev) +{ + int rc; + + OBJ_DESTRUCT(&dev->endpoints); + OBJ_DESTRUCT(&dev->pending_post); + + if (0 != dev->dev_rdma_local_cq.gni_handle) { + GNI_CqDestroy (dev->dev_rdma_local_cq.gni_handle); + dev->dev_rdma_local_cq.gni_handle = 0; + } + + if (0 != dev->dev_rdma_local_irq_cq.gni_handle) { + GNI_CqDestroy (dev->dev_rdma_local_irq_cq.gni_handle); + dev->dev_rdma_local_irq_cq.gni_handle = 0; + } + + if (0 != dev->dev_smsg_local_cq.gni_handle) { + GNI_CqDestroy (dev->dev_smsg_local_cq.gni_handle); + dev->dev_smsg_local_cq.gni_handle = 0; + } + + rc = GNI_CdmDestroy (dev->dev_cd_handle); + if (GNI_RC_SUCCESS != rc) { + BTL_VERBOSE(("error destroying cdm handle")); + } + + return OPAL_SUCCESS; +} + +/* + * Send local device information and other information + * required for setup + */ +static int mca_btl_ugni_send_modex (void) +{ + struct mca_btl_ugni_modex_t modex; + uint32_t modex_size; + char *modex_msg; + int rc; + + modex_size = sizeof (struct mca_btl_ugni_modex_t); + + modex_msg = (char *) malloc (modex_size); + if (NULL == modex_msg) { + OPAL_OUTPUT((-1, "Error allocating memory for modex @ %s:%d", + __FILE__, __LINE__)); + return OPAL_ERR_OUT_OF_RESOURCE; + } + + modex.addr = mca_btl_ugni_component.dev_addr; + modex.id = mca_btl_ugni_component.cdm_id_base; + + BTL_VERBOSE(("sending modex. 
addr: %d, id: %d", modex.addr, modex.id)); + + memcpy ((void *) modex_msg, (void *) &modex, modex_size); + + /* + * need global for edge cases like MPI_Comm_spawn support with + * new ranks started on the same nodes as the spawnee ranks, etc. + */ + + OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL, + &mca_btl_ugni_component.super.btl_version, + modex_msg, modex_size); + + free (modex_msg); + + return rc; +} + +int mca_btl_ugni_fini (void) +{ + return OPAL_SUCCESS; +} + +int mca_btl_ugni_init (void) +{ + int32_t pid_max = 32768; + int rc, bit; + FILE *fh; + + if (0 == mca_btl_ugni_component.virtual_device_count) { + /* XXX -- TODO -- might want to improve this logic. One option would be to + * compare the number of local peers vs the number of cores or hyperthreads + * on the node. */ + + if (!opal_using_threads() || opal_process_info.num_local_peers >= 255) { + /* there is probably no benefit to using multiple device contexts when not + * using threads. */ + mca_btl_ugni_component.virtual_device_count = 1; + } else if (opal_process_info.num_local_peers >= 127) { + mca_btl_ugni_component.virtual_device_count = 2; + } else if (opal_process_info.num_local_peers >= 63) { + mca_btl_ugni_component.virtual_device_count = 4; + } else if (opal_process_info.num_local_peers >= 31) { + mca_btl_ugni_component.virtual_device_count = 8; + } else { + mca_btl_ugni_component.virtual_device_count = 16; + } + } else if (MCA_BTL_UGNI_MAX_DEV_HANDLES < mca_btl_ugni_component.virtual_device_count) { + mca_btl_ugni_component.virtual_device_count = MCA_BTL_UGNI_MAX_DEV_HANDLES; + } + + fh = fopen ("/proc/sys/kernel/pid_max", "r"); + if (NULL != fh) { + fscanf (fh, "%d", &pid_max); + fclose (fh); + } + + /* Use pid to generate the cdm_id. 
Although its not stated in the uGNI + * documentation, the cdm_id only needs to be unique within a node for a + * given ptag/cookie tuple */ + bit = opal_hibit (pid_max, 31); + if (bit >= 31) { + mca_btl_ugni_component.virtual_device_count = 1; + mca_btl_ugni_component.cdm_id_base = getpid(); + } else if (bit >= 30 && mca_btl_ugni_component.virtual_device_count > 2) { + mca_btl_ugni_component.virtual_device_count = 2; + mca_btl_ugni_component.cdm_id_base = getpid() << 1; + } else { + mca_btl_ugni_component.cdm_id_base = getpid() << 8; + } + + /* Create a communication domain */ + /* collect uGNI information */ + rc = get_ptag(&mca_btl_ugni_component.ptag); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + return rc; + } + + rc = get_cookie(&mca_btl_ugni_component.cookie); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + return rc; + } + + /* get the device address of the NIC */ + mca_btl_ugni_component.dev_addr = mca_btl_ugni_get_nic_address (0); + + /* send ugni modex */ + mca_btl_ugni_send_modex (); + + return OPAL_SUCCESS; +} diff --git a/opal/mca/btl/ugni/btl_ugni_module.c b/opal/mca/btl/ugni/btl_ugni_module.c index 90d4acd466..6452bc5a67 100644 --- a/opal/mca/btl/ugni/btl_ugni_module.c +++ b/opal/mca/btl/ugni/btl_ugni_module.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. 
* Copyright (c) 2014-2016 Research Organization for Information Science @@ -62,22 +62,18 @@ mca_btl_ugni_module_t mca_btl_ugni_module = { }; int -mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module, - opal_common_ugni_device_t *dev) +mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module) { int rc; - BTL_VERBOSE(("binding module %p to device %p", (void *) ugni_module, - (void *) dev)); + BTL_VERBOSE(("binding module %p to device 0", (void *) ugni_module)); /* copy module defaults (and function pointers) */ memmove (ugni_module, &mca_btl_ugni_module, sizeof (mca_btl_ugni_module)); ugni_module->initialized = false; ugni_module->nlocal_procs = 0; - ugni_module->active_send_count = 0; ugni_module->connected_peer_count = 0; - ugni_module->active_rdma_count = 0; OBJ_CONSTRUCT(&ugni_module->failed_frags, opal_list_t); OBJ_CONSTRUCT(&ugni_module->failed_frags_lock, opal_mutex_t); @@ -85,11 +81,10 @@ mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module, OBJ_CONSTRUCT(&ugni_module->eager_get_pending, opal_list_t); OBJ_CONSTRUCT(&ugni_module->eager_get_pending_lock,opal_mutex_t); - OBJ_CONSTRUCT(&ugni_module->eager_frags_send, opal_free_list_t); - OBJ_CONSTRUCT(&ugni_module->eager_frags_recv, opal_free_list_t); - OBJ_CONSTRUCT(&ugni_module->smsg_frags, opal_free_list_t); - OBJ_CONSTRUCT(&ugni_module->rdma_frags, opal_free_list_t); - OBJ_CONSTRUCT(&ugni_module->rdma_int_frags, opal_free_list_t); + for (int i = 0 ; i < MCA_BTL_UGNI_LIST_MAX ; ++i) { + OBJ_CONSTRUCT(ugni_module->frags_lists + i, opal_free_list_t); + } + OBJ_CONSTRUCT(&ugni_module->pending_smsg_frags_bb, opal_pointer_array_t); OBJ_CONSTRUCT(&ugni_module->ep_wait_list_lock,opal_mutex_t); OBJ_CONSTRUCT(&ugni_module->ep_wait_list, opal_list_t); @@ -97,22 +92,26 @@ mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module, OBJ_CONSTRUCT(&ugni_module->endpoints, opal_pointer_array_t); OBJ_CONSTRUCT(&ugni_module->id_to_endpoint, opal_hash_table_t); OBJ_CONSTRUCT(&ugni_module->smsg_mboxes, 
opal_free_list_t); - OBJ_CONSTRUCT(&ugni_module->pending_descriptors, opal_list_t); OBJ_CONSTRUCT(&ugni_module->eager_get_pending, opal_list_t); OBJ_CONSTRUCT(&ugni_module->post_descriptors, opal_free_list_t); - ugni_module->device = dev; - dev->btl_ctx = (void *) ugni_module; + /* set up virtual device handles */ + for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) { + rc = mca_btl_ugni_device_init (ugni_module->devices + i, i); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + BTL_VERBOSE(("error initializing uGNI device handle")); + return rc; + } + } - /* create wildcard endpoint to listen for connections. - * there is no need to bind this endpoint. */ - OPAL_THREAD_LOCK(&dev->dev_lock); - rc = GNI_EpCreate (ugni_module->device->dev_handle, NULL, + /* create wildcard endpoint on first device to listen for connections. + * there is no need to bind this endpoint. We are single threaded + * here so there is no need for a device lock. */ + rc = GNI_EpCreate (ugni_module->devices[0].dev_handle, NULL, &ugni_module->wildcard_ep); - OPAL_THREAD_UNLOCK(&dev->dev_lock); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { BTL_ERROR(("error creating wildcard ugni endpoint")); - return opal_common_rc_ugni_to_opal (rc); + return mca_btl_rc_ugni_to_opal (rc); } /* post wildcard datagram */ @@ -133,16 +132,8 @@ mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl) uint64_t key; int rc; - while (ugni_module->active_send_count) { - /* ensure all sends are complete before closing the module */ - rc = mca_btl_ugni_progress_local_smsg (ugni_module); - if (OPAL_SUCCESS != rc) { - break; - } - } - - /* close all open connections and release endpoints */ if (ugni_module->initialized) { + /* close all open connections and release endpoints */ OPAL_HASH_TABLE_FOREACH(key, uint64, ep, &ugni_module->id_to_endpoint) { if (NULL != ep) { mca_btl_ugni_release_ep (ep); @@ -154,28 +145,12 @@ mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl) } /* destroy all cqs */ 
- OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); - rc = GNI_CqDestroy (ugni_module->rdma_local_cq); - if (GNI_RC_SUCCESS != rc) { - BTL_ERROR(("error tearing down local BTE/FMA CQ - %s",gni_err_str[rc])); - } - - rc = GNI_CqDestroy (ugni_module->smsg_local_cq); - if (GNI_RC_SUCCESS != rc) { - BTL_ERROR(("error tearing down TX SMSG CQ - %s",gni_err_str[rc])); - } - rc = GNI_CqDestroy (ugni_module->smsg_remote_cq); if (GNI_RC_SUCCESS != rc) { BTL_ERROR(("error tearing down RX SMSG CQ - %s",gni_err_str[rc])); } if (mca_btl_ugni_component.progress_thread_enabled) { - rc = GNI_CqDestroy (ugni_module->rdma_local_irq_cq); - if (GNI_RC_SUCCESS != rc) { - BTL_ERROR(("error tearing down local BTE/FMA CQ - %s",gni_err_str[rc])); - } - rc = GNI_CqDestroy (ugni_module->smsg_remote_irq_cq); if (GNI_RC_SUCCESS != rc) { BTL_ERROR(("error tearing down remote SMSG CQ - %s",gni_err_str[rc])); @@ -195,14 +170,12 @@ mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl) if (GNI_RC_SUCCESS != rc) { BTL_VERBOSE(("btl/ugni error destroying endpoint - %s",gni_err_str[rc])); } - OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); } - OBJ_DESTRUCT(&ugni_module->eager_frags_send); - OBJ_DESTRUCT(&ugni_module->eager_frags_recv); - OBJ_DESTRUCT(&ugni_module->smsg_frags); - OBJ_DESTRUCT(&ugni_module->rdma_frags); - OBJ_DESTRUCT(&ugni_module->rdma_int_frags); + for (int i = 0 ; i < MCA_BTL_UGNI_LIST_MAX ; ++i) { + OBJ_DESTRUCT(ugni_module->frags_lists + i); + } + OBJ_DESTRUCT(&ugni_module->ep_wait_list); OBJ_DESTRUCT(&ugni_module->smsg_mboxes); OBJ_DESTRUCT(&ugni_module->pending_smsg_frags_bb); @@ -217,6 +190,10 @@ mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl) mca_rcache_base_module_destroy (ugni_module->rcache); } + for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) { + mca_btl_ugni_device_fini (ugni_module->devices + i); + } + ugni_module->initialized = false; return OPAL_SUCCESS; @@ -230,10 +207,17 @@ mca_btl_ugni_alloc(struct 
mca_btl_base_module_t *btl, { mca_btl_ugni_base_frag_t *frag = NULL; - if (size <= mca_btl_ugni_component.smsg_max_data) { - (void) MCA_BTL_UGNI_FRAG_ALLOC_SMSG(endpoint, frag); + /* do not allocate a fragment unless the wait list is relatively small. this + * reduces the potential for resource exhaustion. note the wait list only exists + * because we have no way to notify the sender that credits are available. */ + if (OPAL_UNLIKELY(opal_list_get_size (&endpoint->frag_wait_list) > 32)) { + return NULL; + } + + if (size <= mca_btl_ugni_component.smsg_max_data) { + frag = mca_btl_ugni_frag_alloc_smsg (endpoint); } else if (size <= btl->btl_eager_limit) { - (void) MCA_BTL_UGNI_FRAG_ALLOC_EAGER_SEND(endpoint, frag); + frag = mca_btl_ugni_frag_alloc_eager_send (endpoint); } if (OPAL_UNLIKELY(NULL == frag)) { @@ -284,6 +268,13 @@ mca_btl_ugni_prepare_src (struct mca_btl_base_module_t *btl, uint8_t order, size_t reserve, size_t *size, uint32_t flags) { + /* do not allocate a fragment unless the wait list is relatively small. this + * reduces the potential for resource exhaustion. note the wait list only exists + * because we have no way to notify the sender that credits are available. 
*/ + if (OPAL_UNLIKELY(opal_list_get_size (&endpoint->frag_wait_list) > 32)) { + return NULL; + } + return mca_btl_ugni_prepare_src_send (btl, endpoint, convertor, order, reserve, size, flags); } diff --git a/opal/mca/btl/ugni/btl_ugni_prepare.h b/opal/mca/btl/ugni/btl_ugni_prepare.h index 093c9f6cb0..9d2da1954c 100644 --- a/opal/mca/btl/ugni/btl_ugni_prepare.h +++ b/opal/mca/btl/ugni/btl_ugni_prepare.h @@ -26,7 +26,7 @@ mca_btl_ugni_prepare_src_send_nodata (struct mca_btl_base_module_t *btl, { mca_btl_ugni_base_frag_t *frag = NULL; - (void) MCA_BTL_UGNI_FRAG_ALLOC_RDMA(endpoint, frag); + frag = mca_btl_ugni_frag_alloc_rdma (endpoint); if (OPAL_UNLIKELY(NULL == frag)) { return NULL; } @@ -65,8 +65,7 @@ mca_btl_ugni_prepare_src_send_inplace (struct mca_btl_base_module_t *btl, opal_convertor_get_current_pointer (convertor, &data_ptr); - (void) MCA_BTL_UGNI_FRAG_ALLOC_RDMA(endpoint, frag); - + frag = mca_btl_ugni_frag_alloc_rdma (endpoint); if (OPAL_UNLIKELY(NULL == frag)) { return NULL; } @@ -123,7 +122,7 @@ mca_btl_ugni_prepare_src_send_buffered (struct mca_btl_base_module_t *btl, int rc; if (OPAL_UNLIKELY(true == use_eager_get)) { - (void) MCA_BTL_UGNI_FRAG_ALLOC_EAGER_SEND(endpoint, frag); + frag = mca_btl_ugni_frag_alloc_eager_send (endpoint); if (OPAL_UNLIKELY(NULL == frag)) { return NULL; } @@ -136,7 +135,7 @@ mca_btl_ugni_prepare_src_send_buffered (struct mca_btl_base_module_t *btl, frag->hdr_size = reserve + sizeof (frag->hdr.eager); frag->segments[0].seg_addr.pval = frag->hdr.eager_ex.pml_header; } else { - (void) MCA_BTL_UGNI_FRAG_ALLOC_SMSG(endpoint, frag); + frag = mca_btl_ugni_frag_alloc_smsg (endpoint); if (OPAL_UNLIKELY(NULL == frag)) { return NULL; } @@ -186,8 +185,8 @@ mca_btl_ugni_prepare_src_send (struct mca_btl_base_module_t *btl, opal_convertor_get_current_pointer (convertor, &data_ptr); - send_in_place = !(opal_convertor_need_buffers(convertor) || - (use_eager_get && ((uintptr_t)data_ptr & 3))); + send_in_place = (btl->btl_flags & 
MCA_BTL_FLAGS_SEND_INPLACE) && !(opal_convertor_need_buffers(convertor) || + (use_eager_get && ((uintptr_t)data_ptr & 3))); if (send_in_place) { return mca_btl_ugni_prepare_src_send_inplace (btl, endpoint, convertor, order, diff --git a/opal/mca/btl/ugni/btl_ugni_progress_thread.c b/opal/mca/btl/ugni/btl_ugni_progress_thread.c index 2af2a4ad75..3fa0391140 100644 --- a/opal/mca/btl/ugni/btl_ugni_progress_thread.c +++ b/opal/mca/btl/ugni/btl_ugni_progress_thread.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. * $COPYRIGHT$ @@ -29,17 +29,19 @@ static void *mca_btl_ugni_prog_thread_fn(void * data) { uint32_t which; gni_return_t status; - gni_cq_handle_t cq_vec[2]; + gni_cq_handle_t cq_vec[1 + MCA_BTL_UGNI_MAX_DEV_HANDLES]; struct mca_btl_ugni_module_t *btl = (mca_btl_ugni_module_t *)data; + int cq_count = 1 + mca_btl_ugni_component.virtual_device_count; /* * need to block signals */ cq_vec[0] = btl->smsg_remote_irq_cq; - cq_vec[1] = btl->rdma_local_irq_cq; - + for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) { + cq_vec[i + 1] = btl->devices[i].dev_rdma_local_irq_cq.gni_handle; + } while (stop_progress_thread == 0) { @@ -48,7 +50,7 @@ static void *mca_btl_ugni_prog_thread_fn(void * data) */ status = GNI_CqVectorMonitor(cq_vec, - 2, + cq_count, -1, &which); @@ -106,8 +108,8 @@ int mca_btl_ugni_kill_progress_thread(void) */ ret = mca_btl_ugni_post_cqwrite (mca_btl_ugni_component.modules[0].local_ep, - mca_btl_ugni_component.modules[0].rdma_local_cq, - mca_btl_ugni_component.modules[0].device->smsg_irq_mhndl, + &mca_btl_ugni_component.modules[0].devices[0].dev_rdma_local_cq, + mca_btl_ugni_component.modules[0].devices[0].smsg_irq_mhndl, 0xdead, NULL, NULL, NULL); /* * TODO: if error returned, need 
to kill off thread manually diff --git a/opal/mca/btl/ugni/btl_ugni_put.c b/opal/mca/btl/ugni/btl_ugni_put.c index 2729314e37..71ab146f04 100644 --- a/opal/mca/btl/ugni/btl_ugni_put.c +++ b/opal/mca/btl/ugni/btl_ugni_put.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. * $COPYRIGHT$ @@ -19,11 +19,8 @@ int mca_btl_ugni_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - BTL_VERBOSE(("Using RDMA/FMA Put from local address %p to remote address %" PRIx64, - local_address, remote_address)); - - /* cause endpoint to bind if it isn't already (bind is sufficient for rdma) */ - (void) mca_btl_ugni_check_endpoint_state_rdma (endpoint); + BTL_VERBOSE(("Using RDMA/FMA Put %lu bytes from local address %p to remote address %" PRIx64, + (unsigned long) size, local_address, remote_address)); return mca_btl_ugni_post (endpoint, false, size, local_address, remote_address, local_handle, remote_handle, order, cbfunc, cbcontext, cbdata); diff --git a/opal/mca/btl/ugni/btl_ugni_rdma.h b/opal/mca/btl/ugni/btl_ugni_rdma.h index 970feabd34..ccb9e55ae5 100644 --- a/opal/mca/btl/ugni/btl_ugni_rdma.h +++ b/opal/mca/btl/ugni/btl_ugni_rdma.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. 
* $COPYRIGHT$ @@ -15,12 +15,13 @@ #include "btl_ugni.h" #include "btl_ugni_frag.h" +#include "btl_ugni_device.h" int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep, mca_btl_ugni_eager_ex_frag_hdr_t hdr, mca_btl_ugni_base_frag_t *frag); -static inline void init_gni_post_desc (opal_common_ugni_post_desc_t *post_desc, +static inline void init_gni_post_desc (mca_btl_ugni_post_descriptor_t *post_desc, int order, gni_post_type_t op_type, uint64_t lcl_addr, gni_mem_handle_t lcl_mdh, @@ -28,20 +29,20 @@ static inline void init_gni_post_desc (opal_common_ugni_post_desc_t *post_desc, gni_mem_handle_t rem_mdh, uint64_t bufsize, gni_cq_handle_t cq_hndl) { - post_desc->base.type = op_type; - post_desc->base.cq_mode = GNI_CQMODE_GLOBAL_EVENT; + post_desc->desc.type = op_type; + post_desc->desc.cq_mode = GNI_CQMODE_GLOBAL_EVENT; if (MCA_BTL_NO_ORDER == order) { - post_desc->base.dlvr_mode = GNI_DLVMODE_PERFORMANCE; + post_desc->desc.dlvr_mode = GNI_DLVMODE_PERFORMANCE; } else { - post_desc->base.dlvr_mode = GNI_DLVMODE_NO_ADAPT; + post_desc->desc.dlvr_mode = GNI_DLVMODE_NO_ADAPT; } - post_desc->base.local_addr = (uint64_t) lcl_addr; - post_desc->base.local_mem_hndl = lcl_mdh; - post_desc->base.remote_addr = (uint64_t) rem_addr; - post_desc->base.remote_mem_hndl = rem_mdh; - post_desc->base.length = bufsize; - post_desc->base.rdma_mode = 0; - post_desc->base.src_cq_hndl = cq_hndl; + post_desc->desc.local_addr = (uint64_t) lcl_addr; + post_desc->desc.local_mem_hndl = lcl_mdh; + post_desc->desc.remote_addr = (uint64_t) rem_addr; + post_desc->desc.remote_mem_hndl = rem_mdh; + post_desc->desc.length = bufsize; + post_desc->desc.rdma_mode = 0; + post_desc->desc.src_cq_hndl = cq_hndl; post_desc->tries = 0; } @@ -54,38 +55,28 @@ static inline int mca_btl_ugni_post_fma (struct mca_btl_base_endpoint_t *endpoin { mca_btl_ugni_post_descriptor_t *post_desc; gni_mem_handle_t local_gni_handle = {0, 0}; - gni_return_t grc; + int rc; if (local_handle) { local_gni_handle = 
local_handle->gni_handle; } - mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata, &post_desc); + post_desc = mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata); if (OPAL_UNLIKELY(NULL == post_desc)) { return OPAL_ERR_OUT_OF_RESOURCE; } /* Post descriptor (CQ is ignored for FMA transactions) -- The CQ associated with the endpoint * is used. */ - init_gni_post_desc (&post_desc->desc, order, op_type, (intptr_t) local_address, local_gni_handle, + init_gni_post_desc (post_desc, order, op_type, (intptr_t) local_address, local_gni_handle, remote_address, remote_handle->gni_handle, size, 0); - OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock); - grc = GNI_PostFma (endpoint->rdma_ep_handle, &post_desc->desc.base); - OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock); - if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) { - mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc); - - if (GNI_RC_ALIGNMENT_ERROR == grc) { - BTL_VERBOSE(("GNI_PostFma failed with an alignment error")); - return OPAL_ERR_NOT_AVAILABLE; - } - - BTL_VERBOSE(("GNI_PostFma failed with gni rc: %d", grc)); - return OPAL_ERR_OUT_OF_RESOURCE; + rc = mca_btl_ugni_endpoint_post_fma (endpoint, post_desc); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + mca_btl_ugni_return_post_descriptor (post_desc); } - return OPAL_SUCCESS; + return rc; } static inline int mca_btl_ugni_post_bte (mca_btl_base_endpoint_t *endpoint, gni_post_type_t op_type, @@ -96,70 +87,53 @@ static inline int mca_btl_ugni_post_bte (mca_btl_base_endpoint_t *endpoint, gni_ void *cbcontext, void *cbdata) { mca_btl_ugni_post_descriptor_t *post_desc; - gni_cq_handle_t cq_handle = endpoint->btl->rdma_local_cq; - gni_return_t status; + int rc; - mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata, &post_desc); + post_desc = mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata); if (OPAL_UNLIKELY(NULL == 
post_desc)) { return OPAL_ERR_OUT_OF_RESOURCE; } - if (mca_btl_ugni_component.progress_thread_enabled) { - cq_handle = endpoint->btl->rdma_local_irq_cq; - } - /* Post descriptor */ - init_gni_post_desc (&post_desc->desc, order, op_type, (intptr_t) local_address, local_handle->gni_handle, - remote_address, remote_handle->gni_handle, size, cq_handle); + init_gni_post_desc (post_desc, order, op_type, (intptr_t) local_address, local_handle->gni_handle, + remote_address, remote_handle->gni_handle, size, 0); - OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock); - status = GNI_PostRdma (endpoint->rdma_ep_handle, &post_desc->desc.base); - OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock); - if (OPAL_UNLIKELY(GNI_RC_SUCCESS != status)) { - mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc); - - if (GNI_RC_ALIGNMENT_ERROR == status) { - BTL_VERBOSE(("GNI_PostRdma failed with an alignment error")); - return OPAL_ERR_NOT_AVAILABLE; - } - - BTL_VERBOSE(("GNI_PostRdma failed with gni rc: %d", status)); - return OPAL_ERR_OUT_OF_RESOURCE; + rc = mca_btl_ugni_endpoint_post_rdma (endpoint, post_desc); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + mca_btl_ugni_return_post_descriptor (post_desc); } - return OPAL_SUCCESS; + return rc; } -static inline int mca_btl_ugni_post_cqwrite (mca_btl_base_endpoint_t *endpoint, gni_cq_handle_t cq_handle, +static inline int mca_btl_ugni_post_cqwrite (mca_btl_base_endpoint_t *endpoint, mca_btl_ugni_cq_t *cq, gni_mem_handle_t irq_mhndl, uint64_t value, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { mca_btl_ugni_post_descriptor_t *post_desc; - gni_return_t grc; + int rc; - mca_btl_ugni_alloc_post_descriptor (endpoint, NULL, cbfunc, cbcontext, cbdata, &post_desc); + post_desc = mca_btl_ugni_alloc_post_descriptor (endpoint, NULL, cbfunc, cbcontext, cbdata); if (OPAL_UNLIKELY(NULL == post_desc)) { return OPAL_ERR_OUT_OF_RESOURCE; } - post_desc->desc.base.type = GNI_POST_CQWRITE; - 
post_desc->desc.base.cqwrite_value = value; /* up to 48 bytes here, not used for now */ - post_desc->desc.base.cq_mode = GNI_CQMODE_GLOBAL_EVENT; - post_desc->desc.base.dlvr_mode = GNI_DLVMODE_IN_ORDER; - post_desc->desc.base.src_cq_hndl = cq_handle; - post_desc->desc.base.remote_mem_hndl = irq_mhndl; - post_desc->desc.tries = 0; + post_desc->desc.type = GNI_POST_CQWRITE; + post_desc->desc.cqwrite_value = value; /* up to 48 bytes here, not used for now */ + post_desc->desc.cq_mode = GNI_CQMODE_GLOBAL_EVENT; + post_desc->desc.dlvr_mode = GNI_DLVMODE_IN_ORDER; + post_desc->desc.src_cq_hndl = cq->gni_handle; + post_desc->desc.remote_mem_hndl = irq_mhndl; + post_desc->tries = 0; + post_desc->cq = cq; - OPAL_THREAD_LOCK(&endpoint->common->dev->dev_lock); - grc = GNI_PostCqWrite(endpoint->rdma_ep_handle, &post_desc->desc.base); - OPAL_THREAD_UNLOCK(&endpoint->common->dev->dev_lock); - if (GNI_RC_SUCCESS != grc) { /* errors for PostCqWrite treated as non-fatal */ - BTL_VERBOSE(("GNI_PostCqWrite returned error - %s", gni_err_str[grc])); - mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc); + rc = mca_btl_ugni_endpoint_post_cqwrite (endpoint, post_desc); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { /* errors for PostCqWrite treated as non-fatal */ + mca_btl_ugni_return_post_descriptor (post_desc); } - return opal_common_rc_ugni_to_opal (grc); + return rc; } static inline int mca_btl_ugni_post (mca_btl_base_endpoint_t *endpoint, int get, size_t size, @@ -183,27 +157,11 @@ static inline int mca_btl_ugni_post (mca_btl_base_endpoint_t *endpoint, int get, static inline int mca_btl_ugni_repost (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_post_descriptor_t *post_desc) { - gni_return_t grc; - - OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); - if (GNI_POST_RDMA_PUT == post_desc->desc.base.type || - GNI_POST_RDMA_GET == post_desc->desc.base.type) { - grc = GNI_PostRdma (post_desc->endpoint->rdma_ep_handle, &post_desc->desc.base); - } else { - grc = GNI_PostFma 
(post_desc->endpoint->rdma_ep_handle, &post_desc->desc.base); - } - OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); - - if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) { - /* NTH: Should we even retry these? When this code was written there was no indication - * whether an error in post is recoverable. Clobber this code and the associated data - * structures if post errors are not recoverable. */ - OPAL_THREAD_LOCK(&ugni_module->pending_descriptors_lock); - opal_list_append (&ugni_module->pending_descriptors, (opal_list_item_t *) post_desc); - OPAL_THREAD_UNLOCK(&ugni_module->pending_descriptors_lock); + if (GNI_POST_RDMA_PUT == post_desc->desc.type || GNI_POST_RDMA_GET == post_desc->desc.type) { + return mca_btl_ugni_endpoint_post_rdma (post_desc->endpoint, post_desc); } - return opal_common_rc_ugni_to_opal (grc); + return mca_btl_ugni_endpoint_post_fma (post_desc->endpoint, post_desc); } #endif /* MCA_BTL_UGNI_RDMA_H */ diff --git a/opal/mca/btl/ugni/btl_ugni_send.c b/opal/mca/btl/ugni/btl_ugni_send.c index 45e17ec13f..081f8d4da1 100644 --- a/opal/mca/btl/ugni/btl_ugni_send.c +++ b/opal/mca/btl/ugni/btl_ugni_send.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2014 Research Organization for Information Science @@ -17,6 +17,30 @@ #include "btl_ugni_smsg.h" #include "btl_ugni_prepare.h" +void mca_btl_ugni_wait_list_append (mca_btl_ugni_module_t *ugni_module, mca_btl_base_endpoint_t *endpoint, + mca_btl_ugni_base_frag_t *frag) +{ + BTL_VERBOSE(("wait-listing fragment %p to %s. 
endpoint state %d\n", frag, OPAL_NAME_PRINT(endpoint->peer_proc->proc_name), endpoint->state)); + + frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; + + /* queue up request */ + OPAL_THREAD_LOCK(&endpoint->lock); + + opal_list_append (&endpoint->frag_wait_list, (opal_list_item_t *) frag); + + OPAL_THREAD_UNLOCK(&endpoint->lock); + + if (false == endpoint->wait_listed && MCA_BTL_UGNI_EP_STATE_CONNECTED == endpoint->state) { + OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock); + if (false == endpoint->wait_listed) { + opal_list_append (&ugni_module->ep_wait_list, &endpoint->super); + endpoint->wait_listed = true; + } + OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock); + } +} + int mca_btl_ugni_send (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, struct mca_btl_base_descriptor_t *descriptor, @@ -30,18 +54,15 @@ int mca_btl_ugni_send (struct mca_btl_base_module_t *btl, /* tag and len are at the same location in eager and smsg frag hdrs */ frag->hdr.send.lag = (tag << 24) | size; + BTL_VERBOSE(("btl/ugni sending descriptor %p from %d -> %d. length = %" PRIu64, (void *)descriptor, + OPAL_PROC_MY_NAME.vpid, endpoint->peer_proc->proc_name.vpid, size)); + rc = mca_btl_ugni_check_endpoint_state (endpoint); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - OPAL_THREAD_LOCK(&endpoint->lock); - opal_list_append (&endpoint->frag_wait_list, (opal_list_item_t *) frag); - OPAL_THREAD_UNLOCK(&endpoint->lock); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc || opal_list_get_size (&endpoint->frag_wait_list))) { + mca_btl_ugni_wait_list_append (ugni_module, endpoint, frag); return OPAL_SUCCESS; } - BTL_VERBOSE(("btl/ugni sending descriptor %p from %d -> %d. length = %" PRIu64, (void *)descriptor, - OPAL_PROC_MY_NAME.vpid, endpoint->common->ep_rem_id, size)); - /* add a reference to prevent the fragment from being returned until after the * completion flag is checked. 
*/ ++frag->ref_cnt; @@ -61,7 +82,7 @@ int mca_btl_ugni_send (struct mca_btl_base_module_t *btl, frag->flags &= ~MCA_BTL_DES_SEND_ALWAYS_CALLBACK; if (call_callback) { - frag->base.des_cbfunc(&frag->endpoint->btl->super, frag->endpoint, &frag->base, rc); + frag->base.des_cbfunc(&ugni_module->super, frag->endpoint, &frag->base, rc); } (void) mca_btl_ugni_frag_del_ref (frag, OPAL_SUCCESS); @@ -77,18 +98,7 @@ int mca_btl_ugni_send (struct mca_btl_base_module_t *btl, if (OPAL_UNLIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) { /* queue up request */ - if (false == endpoint->wait_listed) { - OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock); - if (false == endpoint->wait_listed) { - opal_list_append (&ugni_module->ep_wait_list, &endpoint->super); - endpoint->wait_listed = true; - } - OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock); - } - - OPAL_THREAD_LOCK(&endpoint->lock); - opal_list_append (&endpoint->frag_wait_list, (opal_list_item_t *) frag); - OPAL_THREAD_UNLOCK(&endpoint->lock); + mca_btl_ugni_wait_list_append (ugni_module, endpoint, frag); rc = OPAL_SUCCESS; } @@ -109,9 +119,9 @@ int mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl, int rc; do { - if (OPAL_UNLIKELY(OPAL_SUCCESS != mca_btl_ugni_check_endpoint_state (endpoint))) { - break; - } + BTL_VERBOSE(("btl/ugni isend sending fragment from %d -> %d. 
length = %" PRIu64 + " endoint state %d", OPAL_PROC_MY_NAME.vpid, endpoint->peer_proc->proc_name.vpid, + payload_size + header_size, endpoint->state)); flags |= MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; @@ -124,7 +134,8 @@ int mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl, } assert (packed_size == payload_size); - if (OPAL_UNLIKELY(NULL == frag)) { + if (OPAL_UNLIKELY(NULL == frag || OPAL_SUCCESS != mca_btl_ugni_check_endpoint_state (endpoint) || + opal_list_get_size (&endpoint->frag_wait_list))) { break; } @@ -141,8 +152,9 @@ int mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl, } while (0); if (NULL != descriptor) { - *descriptor = NULL; + *descriptor = &frag->base; } + return OPAL_ERR_OUT_OF_RESOURCE; } diff --git a/opal/mca/btl/ugni/btl_ugni_smsg.c b/opal/mca/btl/ugni/btl_ugni_smsg.c index b7848bfc66..0e338cc760 100644 --- a/opal/mca/btl/ugni/btl_ugni_smsg.c +++ b/opal/mca/btl/ugni/btl_ugni_smsg.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. 
* $COPYRIGHT$ @@ -28,7 +28,7 @@ static void mca_btl_ugni_smsg_mbox_construct (mca_btl_ugni_smsg_mbox_t *mbox) { mbox->attr.smsg_attr.buff_size = mca_btl_ugni_component.smsg_mbox_size; mbox->attr.smsg_attr.mem_hndl = ugni_reg->handle.gni_handle; mbox->attr.proc_name = OPAL_PROC_MY_NAME; - mbox->attr.rmt_irq_mem_hndl = mca_btl_ugni_component.modules[0].device->smsg_irq_mhndl; + mbox->attr.rmt_irq_mem_hndl = mca_btl_ugni_component.modules[0].devices[0].smsg_irq_mhndl; } OBJ_CLASS_INSTANCE(mca_btl_ugni_smsg_mbox_t, opal_free_list_item_t, @@ -39,11 +39,13 @@ int mca_btl_ugni_smsg_init (mca_btl_ugni_module_t *ugni_module) { gni_return_t rc; - rc = GNI_SmsgSetMaxRetrans (ugni_module->device->dev_handle, - mca_btl_ugni_component.smsg_max_retries); - if (GNI_RC_SUCCESS != rc) { - BTL_ERROR(("error setting maximum SMSG retries %s",gni_err_str[rc])); - return opal_common_rc_ugni_to_opal (rc); + for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) { + rc = GNI_SmsgSetMaxRetrans (ugni_module->devices[i].dev_handle, + mca_btl_ugni_component.smsg_max_retries); + if (GNI_RC_SUCCESS != rc) { + BTL_ERROR(("error setting maximum SMSG retries %s",gni_err_str[rc])); + return mca_btl_rc_ugni_to_opal (rc); + } } return OPAL_SUCCESS; @@ -52,6 +54,7 @@ int mca_btl_ugni_smsg_init (mca_btl_ugni_module_t *ugni_module) /* progress */ int mca_btl_ugni_smsg_process (mca_btl_base_endpoint_t *ep) { + mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (ep); mca_btl_active_message_callback_t *reg; mca_btl_ugni_base_frag_t frag; mca_btl_base_segment_t seg; @@ -70,27 +73,20 @@ int mca_btl_ugni_smsg_process (mca_btl_base_endpoint_t *ep) do { uint8_t tag = GNI_SMSG_ANY_TAG; - OPAL_THREAD_LOCK(&ep->common->dev->dev_lock); - rc = GNI_SmsgGetNextWTag (ep->smsg_ep_handle, (void **) &data_ptr, &tag); - OPAL_THREAD_UNLOCK(&ep->common->dev->dev_lock); - if (GNI_RC_NOT_DONE == rc) { - BTL_VERBOSE(("no smsg message waiting. 
rc = %s", gni_err_str[rc])); + rc = mca_btl_ugni_smsg_get_next_wtag (ep->smsg_ep_handle, &data_ptr, &tag); + if (GNI_RC_SUCCESS != rc) { + if (OPAL_LIKELY(GNI_RC_NOT_DONE == rc)) { + BTL_VERBOSE(("no smsg message waiting. rc = %s", gni_err_str[rc])); - ep->smsg_progressing = 0; + ep->smsg_progressing = 0; + return count; + } - return count; - } - - if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { - BTL_ERROR(("GNI_SmsgGetNextWTag returned error %s", gni_err_str[rc])); + BTL_ERROR(("unhandled GNI_SmsgGetNextWTag error")); return OPAL_ERROR; } - if (OPAL_UNLIKELY(0 == data_ptr)) { - BTL_ERROR(("null data ptr!")); - assert (0); - return OPAL_ERROR; - } + assert (0 != data_ptr); count++; @@ -114,7 +110,7 @@ int mca_btl_ugni_smsg_process (mca_btl_base_endpoint_t *ep) assert (NULL != reg->cbfunc); - reg->cbfunc(&ep->btl->super, tag, &(frag.base), reg->cbdata); + reg->cbfunc(&ugni_module->super, tag, &(frag.base), reg->cbdata); break; case MCA_BTL_UGNI_TAG_GET_INIT: @@ -141,16 +137,14 @@ int mca_btl_ugni_smsg_process (mca_btl_base_endpoint_t *ep) break; } - OPAL_THREAD_LOCK(&ep->common->dev->dev_lock); - rc = GNI_SmsgRelease (ep->smsg_ep_handle); - OPAL_THREAD_UNLOCK(&ep->common->dev->dev_lock); + rc = mca_btl_ugni_smsg_release (ep->smsg_ep_handle); if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { BTL_ERROR(("Smsg release failed! 
rc = %d", rc)); return OPAL_ERROR; } } while (!disconnect); - ep->smsg_progressing = false; + ep->smsg_progressing = 0; /* disconnect if we get here */ opal_mutex_lock (&ep->lock); @@ -165,7 +159,6 @@ int mca_btl_ugni_smsg_process (mca_btl_base_endpoint_t *ep) static inline int mca_btl_ugni_handle_remote_smsg_overrun (mca_btl_ugni_module_t *btl) { - gni_cq_entry_t event_data; size_t endpoint_count; unsigned int ep_index; int count, rc; @@ -177,11 +170,7 @@ mca_btl_ugni_handle_remote_smsg_overrun (mca_btl_ugni_module_t *btl) smsg remote cq and check all mailboxes */ /* clear out remote cq */ - do { - OPAL_THREAD_LOCK(&btl->device->dev_lock); - rc = GNI_CqGetEvent (btl->smsg_remote_cq, &event_data); - OPAL_THREAD_UNLOCK(&btl->device->dev_lock); - } while (GNI_RC_NOT_DONE != rc); + mca_btl_ugni_cq_clear (btl->devices, btl->smsg_remote_cq); endpoint_count = opal_pointer_array_get_size (&btl->endpoints); @@ -212,9 +201,7 @@ int mca_btl_ugni_progress_remote_smsg (mca_btl_ugni_module_t *btl) gni_return_t grc; uint64_t inst_id; - OPAL_THREAD_LOCK(&btl->device->dev_lock); - grc = GNI_CqGetEvent (btl->smsg_remote_cq, &event_data); - OPAL_THREAD_UNLOCK(&btl->device->dev_lock); + grc = mca_btl_ugni_gni_cq_get_event (btl->devices, btl->smsg_remote_cq, &event_data); if (GNI_RC_NOT_DONE == grc) { return 0; } @@ -231,12 +218,12 @@ int mca_btl_ugni_progress_remote_smsg (mca_btl_ugni_module_t *btl) /* unhandled error: crash */ assert (0); - return opal_common_rc_ugni_to_opal (grc); + return mca_btl_rc_ugni_to_opal (grc); } BTL_VERBOSE(("REMOTE CQ: Got event 0x%" PRIx64 ". msg id = %" PRIu64 - ". ok = %d, type = %" PRIu64 "\n", (uint64_t) event_data, - GNI_CQ_GET_MSG_ID(event_data), GNI_CQ_STATUS_OK(event_data), + ". 
ok = %d, type = %" PRIu64, (uint64_t) event_data, + GNI_CQ_GET_INST_ID(event_data), GNI_CQ_STATUS_OK(event_data), GNI_CQ_GET_TYPE(event_data))); inst_id = GNI_CQ_GET_INST_ID(event_data); diff --git a/opal/mca/btl/ugni/btl_ugni_smsg.h b/opal/mca/btl/ugni/btl_ugni_smsg.h index 7746e798a4..e3ba2d1960 100644 --- a/opal/mca/btl/ugni/btl_ugni_smsg.h +++ b/opal/mca/btl/ugni/btl_ugni_smsg.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. * $COPYRIGHT$ @@ -36,20 +36,13 @@ int mca_btl_ugni_smsg_init (mca_btl_ugni_module_t *ugni_module); int mca_btl_ugni_smsg_process (mca_btl_base_endpoint_t *ep); int mca_btl_ugni_progress_remote_smsg (mca_btl_ugni_module_t *btl); -static inline int mca_btl_ugni_progress_local_smsg (mca_btl_ugni_module_t *ugni_module) +static inline int mca_btl_ugni_progress_local_smsg (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device) { mca_btl_ugni_base_frag_t *frag; gni_cq_entry_t event_data; gni_return_t grc; - /* nothing to do */ - if (0 == ugni_module->active_send_count) { - return OPAL_SUCCESS; - } - - OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); - grc = GNI_CqGetEvent (ugni_module->smsg_local_cq, &event_data); - OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); + grc = mca_btl_ugni_cq_get_event (device, &device->dev_smsg_local_cq, &event_data); if (GNI_RC_NOT_DONE == grc) { return OPAL_SUCCESS; } @@ -59,7 +52,7 @@ static inline int mca_btl_ugni_progress_local_smsg (mca_btl_ugni_module_t *ugni_ will the event eventually come back? Ask Cray */ BTL_ERROR(("post error! 
cq overrun = %d", (int)GNI_CQ_OVERRUN(event_data))); assert (0); - return opal_common_rc_ugni_to_opal (grc); + return mca_btl_rc_ugni_to_opal (grc); } assert (GNI_CQ_GET_TYPE(event_data) == GNI_CQ_EVENT_TYPE_SMSG); @@ -71,8 +64,6 @@ static inline int mca_btl_ugni_progress_local_smsg (mca_btl_ugni_module_t *ugni_ return OPAL_ERROR; } - opal_atomic_add_32(&ugni_module->active_send_count,-1); - frag->flags |= MCA_BTL_UGNI_FRAG_SMSG_COMPLETE; if (!(frag->flags & MCA_BTL_UGNI_FRAG_IGNORE)) { @@ -87,26 +78,22 @@ static inline int opal_mca_btl_ugni_smsg_send (mca_btl_ugni_base_frag_t *frag, void *payload, size_t payload_len, mca_btl_ugni_smsg_tag_t tag) { + mca_btl_base_endpoint_t *endpoint = frag->endpoint; + mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (endpoint); gni_return_t grc; - OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock); - grc = GNI_SmsgSendWTag (frag->endpoint->smsg_ep_handle, hdr, hdr_len, - payload, payload_len, frag->msg_id, tag); - OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock); - + grc = mca_btl_ugni_endpoint_smsg_send_wtag (endpoint, hdr, hdr_len, payload, payload_len, + frag->msg_id, tag); if (OPAL_LIKELY(GNI_RC_SUCCESS == grc)) { - /* increment the active send counter */ - opal_atomic_add_32(&frag->endpoint->btl->active_send_count,1); - if (mca_btl_ugni_component.progress_thread_enabled) { if (frag->base.des_flags & MCA_BTL_DES_FLAGS_SIGNAL) { /* errors for PostCqWrite treated as non-fatal */ - (void) mca_btl_ugni_post_cqwrite (frag->endpoint, frag->endpoint->btl->rdma_local_cq, - frag->endpoint->rmt_irq_mem_hndl, 0xdead, NULL, NULL, NULL); + (void) mca_btl_ugni_post_cqwrite (endpoint, &ugni_module->devices[0].dev_rdma_local_cq, + endpoint->rmt_irq_mem_hndl, 0xdead, NULL, NULL, NULL); } } - (void) mca_btl_ugni_progress_local_smsg ((mca_btl_ugni_module_t *) frag->endpoint->btl); + (void) mca_btl_ugni_progress_local_smsg (ugni_module, endpoint->smsg_ep_handle->device); return OPAL_SUCCESS; } diff --git 
a/opal/mca/common/ugni/Makefile.am b/opal/mca/common/ugni/Makefile.am deleted file mode 100644 index ac7482c345..0000000000 --- a/opal/mca/common/ugni/Makefile.am +++ /dev/null @@ -1,67 +0,0 @@ -# -*- indent-tabs-mode:nil -*- -# -# Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights -# reserved. -# Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -AM_CPPFLAGS = $(common_ugni_CPPFLAGS) - -component_noinst = lib@OPAL_LIB_PREFIX@mca_common_ugni_noinst.la -component_install = lib@OPAL_LIB_PREFIX@mca_common_ugni.la - -if MCA_BUILD_opal_common_ugni_DSO -lib_LTLIBRARIES = $(component_install) -else -noinst_LTLIBRARIES = $(component_noinst) -endif - -headers = common_ugni.h \ - common_ugni_ep.h - -ugni_SOURCES = common_ugni.c \ - common_ugni_ep.c - -#mcacomponentdir = $(opallibdir) -lib@OPAL_LIB_PREFIX@mca_common_ugni_la_SOURCES = $(headers) $(ugni_SOURCES) -nodist_lib@OPAL_LIB_PREFIX@mca_common_ugni_la_SOURCES = $(ugni_nodist_SOURCES) -lib@OPAL_LIB_PREFIX@mca_common_ugni_la_LIBADD = $(common_ugni_LIBS) -lib@OPAL_LIB_PREFIX@mca_common_ugni_la_LDFLAGS = \ - -version-info $(libmca_opal_common_ugni_so_version) \ - $(common_ugni_LDFLAGS) - -lib@OPAL_LIB_PREFIX@mca_common_ugni_noinst_la_SOURCES = \ - $(headers) $(ugni_SOURCES) -nodist_lib@OPAL_LIB_PREFIX@mca_common_ugni_noinst_la_SOURCES = \ - $(ugni_nodist_SOURCES) -lib@OPAL_LIB_PREFIX@mca_common_ugni_noinst_la_LIBADD = $(common_ugni_LIBS) -lib@OPAL_LIB_PREFIX@mca_common_ugni_noinst_la_LDFLAGS = \ - -module -avoid-version $(common_ugni_LDFLAGS) - -# These two rules will sym link the "noinst" libtool library filename -# to the installable libtool library filename in the case where we are -# compiling this component statically (case 2), described above). 
- -V=0 -OMPI_V_LN_SCOMP = $(ompi__v_LN_SCOMP_$V) -ompi__v_LN_SCOMP_ = $(ompi__v_LN_SCOMP_$AM_DEFAULT_VERBOSITY) -ompi__v_LN_SCOMP_0 = @echo " LN_S " `basename $(component_install)`; - -all-local: - $(OMPI_V_LN_SCOMP) if test -z "$(lib_LTLIBRARIES)"; then \ - rm -f "$(component_install)"; \ - $(LN_S) "$(component_noinst)" "$(component_install)"; \ - fi - -clean-local: - if test -z "$(mcacomponent_LTLIBRARIES)"; then \ - rm -f "$(component_install)"; \ - fi diff --git a/opal/mca/common/ugni/common_ugni.c b/opal/mca/common/ugni/common_ugni.c deleted file mode 100644 index 2877d046d0..0000000000 --- a/opal/mca/common/ugni/common_ugni.c +++ /dev/null @@ -1,301 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. - * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - - -#include "common_ugni.h" - -#include "opal/class/opal_list.h" -#include "opal/dss/dss.h" -#include "opal/mca/pmix/pmix.h" - -/* NTH: we need some options from the btl */ -#include "opal/mca/btl/ugni/btl_ugni.h" - -static int opal_common_ugni_module_ref_count = 0; -opal_common_ugni_module_t opal_common_ugni_module = {0}; - -mca_base_component_t opal_common_ugni_component = { - OPAL_MCA_BASE_VERSION_2_1_0("common", 1, 0, 0), - .mca_component_name = "ugni", - .mca_component_major_version = 1, - .mca_component_minor_version = 0, - .mca_component_release_version = 0, -}; - -static inline int -get_ptag(uint8_t *out_ptag) -{ - /* TODO no need for tmp */ - char *ptr; - uint8_t tmp_ptag; - - if (NULL == (ptr = getenv("PMI_GNI_PTAG"))) { - /* TODO add err msg - better rc? 
*/ - return OPAL_ERR_NOT_FOUND; - } - errno = 0; - tmp_ptag = (uint8_t)strtoul (ptr, (char **)NULL, 10); - if (0 != errno) { - /* TODO add err msg - better rc? */ - return OPAL_ERR_VALUE_OUT_OF_BOUNDS; - } - *out_ptag = tmp_ptag; - return OPAL_SUCCESS; -} - -static inline int get_cookie (uint32_t *out_cookie) -{ - /* TODO no need for tmp */ - char *ptr; - uint32_t tmp_cookie; - - if (NULL == (ptr = getenv("PMI_GNI_COOKIE"))) { - /* TODO add err msg - better rc? */ - return OPAL_ERR_NOT_FOUND; - } - errno = 0; - tmp_cookie = (uint32_t) strtoul (ptr, NULL, 10); - if (0 != errno) { - /* TODO add err msg - better rc? */ - return OPAL_ERR_VALUE_OUT_OF_BOUNDS; - } - - *out_cookie = tmp_cookie; - - return OPAL_SUCCESS; -} - -static unsigned int -opal_common_ugni_get_nic_address(int device_id) -{ - unsigned int address, cpu_id; - gni_return_t status; - int i, alps_dev_id = -1; - char *token,*p_ptr; - - p_ptr = getenv("PMI_GNI_DEV_ID"); - if (!p_ptr) { - status = GNI_CdmGetNicAddress(device_id, &address, &cpu_id); - if(status != GNI_RC_SUCCESS) { - opal_output (0, "FAILED:GNI_CdmGetNicAddress returned error %d", status); - return (unsigned int)-1; - } - return address; - } - - while (NULL != (token = strtok(p_ptr, ":"))) { - alps_dev_id = atoi(token); - if (alps_dev_id == device_id) { - break; - } - p_ptr = NULL; - } - - if (OPAL_UNLIKELY(-1 == alps_dev_id)) { - return (unsigned int)-1; - } - - p_ptr = getenv("PMI_GNI_LOC_ADDR"); - if (OPAL_UNLIKELY(NULL == p_ptr)) { - return (unsigned int)-1; - } - - i = 0; - while (NULL != (token = strtok(p_ptr, ":"))) { - if (i == alps_dev_id) { - return strtoul (token, NULL, 10); - } - p_ptr = NULL; - ++i; - } - - return (unsigned int)-1; -} - -static int opal_common_ugni_device_init (opal_common_ugni_device_t *device, - int device_id) -{ - int rc; - - /* Create a NIC Adress */ - device->dev_id = device_id; /* Minor number of the Gemini NIC */ - - device->dev_addr = opal_common_ugni_get_nic_address (device->dev_id); - - OPAL_OUTPUT((-1, 
"Got NIC Addr: 0x%08x, CPU ID: %d", device->dev_addr, device->dev_id)); - - OBJ_CONSTRUCT(&device->dev_lock,opal_mutex_t); - - /* Attach device to the communication domain */ - rc = GNI_CdmAttach (opal_common_ugni_module.cd_handle, device->dev_id, - &device->dev_pe_addr, &device->dev_handle); - if (GNI_RC_SUCCESS != rc) { - OPAL_OUTPUT((0, "Error: Creating communication domain %d\n", rc)); - return opal_common_rc_ugni_to_opal (rc); - } - - return OPAL_SUCCESS; -} - -static int opal_common_ugni_device_fini (opal_common_ugni_device_t *dev) -{ - return OPAL_SUCCESS; -} - -/* - * Send local device information and other information - * required for setup - */ -static int opal_common_ugni_send_modex (int my_cdm_id) -{ - uint32_t modex_size, total_msg_size, msg_offset; - struct opal_common_ugni_modex_t modex; - char *modex_msg; - int rc, i; - - modex_size = sizeof (struct opal_common_ugni_modex_t); - total_msg_size = opal_common_ugni_module.device_count * modex_size; - - modex_msg = (char *) malloc (total_msg_size); - if (NULL == modex_msg) { - OPAL_OUTPUT((-1, "Error allocating memory for modex @ %s:%d", - __FILE__, __LINE__)); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - /* pack modex for all available devices */ - for (i = 0, msg_offset = 0; i < opal_common_ugni_module.device_count ; ++i) { - opal_common_ugni_device_t *dev = opal_common_ugni_module.devices + i; - - modex.addr = dev->dev_addr; - modex.id = my_cdm_id; - - memcpy ((void *)((uintptr_t) modex_msg + msg_offset), - (void *)&modex, modex_size); - - msg_offset += modex_size; - } - - /* - * need global for edge cases like MPI_Comm_spawn support with - * new ranks started on the same nodes as the spawnee ranks, etc. 
- */ - - OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL, - &opal_common_ugni_component, - modex_msg, total_msg_size); - - free(modex_msg); - - return rc; -} - -int opal_common_ugni_fini (void) -{ - int i, rc; - - if (0 == opal_common_ugni_module_ref_count) { - return OPAL_SUCCESS; - } - - if (1 == opal_common_ugni_module_ref_count) { - /* tear down component */ - if (opal_common_ugni_module.devices) { - /* finalize devices */ - for (i = 0 ; i < opal_common_ugni_module.device_count ; ++i) { - opal_common_ugni_device_fini (opal_common_ugni_module.devices + i); - } - - free (opal_common_ugni_module.devices); - opal_common_ugni_module.devices = NULL; - } - - /* finally, tear down the communication domain */ - rc = GNI_CdmDestroy (opal_common_ugni_module.cd_handle); - if (GNI_RC_SUCCESS != rc) { - OPAL_OUTPUT((-1, "error destroying cdm")); - } - } - - opal_common_ugni_module_ref_count--; - - return OPAL_SUCCESS; -} - -int opal_common_ugni_init (void) -{ - int modes, rc, i; - uint32_t my_cdm_id; - - opal_common_ugni_module_ref_count ++; - - if (opal_common_ugni_module_ref_count > 1) { - return OPAL_SUCCESS; - } - - /* use pid for my_cdm_id. 
Although its not stated in the uGNI - documentation, the cdm_id only needs to be unique - within a node for a given ptag/cookie tuple */ - - my_cdm_id = getpid(); /*TODO: eventually need something else for thread-hot support */ - - /* pull settings from ugni btl */ - opal_common_ugni_module.rdma_max_retries = - mca_btl_ugni_component.rdma_max_retries; - - /* Create a communication domain */ - - modes = GNI_CDM_MODE_FORK_FULLCOPY | GNI_CDM_MODE_CACHED_AMO_ENABLED | - GNI_CDM_MODE_ERR_NO_KILL | GNI_CDM_MODE_FAST_DATAGRAM_POLL | - GNI_CDM_MODE_FMA_SHARED; - - /* collect uGNI information */ - rc = get_ptag(&opal_common_ugni_module.ptag); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - return rc; - } - - rc = get_cookie(&opal_common_ugni_module.cookie); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - return rc; - } - - /* create a communication domain */ - rc = GNI_CdmCreate (my_cdm_id, opal_common_ugni_module.ptag, - opal_common_ugni_module.cookie, modes, - &opal_common_ugni_module.cd_handle); - if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { - OPAL_OUTPUT((0, "Error: Creating communication domain %d\n",rc)); - return opal_common_rc_ugni_to_opal (rc); - } - - /* setup uGNI devices. 
we only support one device atm */ - opal_common_ugni_module.device_count = 1; - opal_common_ugni_module.devices = calloc (opal_common_ugni_module.device_count, - sizeof (opal_common_ugni_device_t)); - - for (i = 0 ; i < opal_common_ugni_module.device_count ; ++i) { - rc = opal_common_ugni_device_init (opal_common_ugni_module.devices + i, i); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - OPAL_OUTPUT((-1, "error initializing uGNI device")); - return rc; - } - } - - /* send ugni modex */ - opal_common_ugni_send_modex (my_cdm_id); - - return OPAL_SUCCESS; -} diff --git a/opal/mca/common/ugni/common_ugni.h b/opal/mca/common/ugni/common_ugni.h deleted file mode 100644 index 5f39fd1851..0000000000 --- a/opal/mca/common/ugni/common_ugni.h +++ /dev/null @@ -1,117 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#include "opal/util/output.h" -#include "opal/util/proc.h" -#include "opal/class/opal_list.h" -#include "opal/include/opal/prefetch.h" -#include "opal_stdint.h" - -#include -#include -#include -#include -#include -#include - -#include "common_ugni_ep.h" - -#if !defined(MPI_COMMON_UGNI_H) -#define MPI_COMMON_UGNI_H - -struct opal_common_ugni_modex_t { - uint32_t addr; - int id; - gni_mem_handle_t irq_memhndl; -}; -typedef struct opal_common_ugni_modex_t opal_common_ugni_modex_t; - -struct opal_common_ugni_device_t { - opal_object_t super; - - gni_nic_handle_t dev_handle; - - /* Minor number of the Gemini NIC */ - int32_t dev_id; - uint32_t dev_pe_addr; - uint32_t dev_addr; - uint32_t dev_cpu_id; - - size_t dev_ep_count; - opal_mutex_t dev_lock; - gni_mem_handle_t smsg_irq_mhndl; - void *btl_ctx; -}; -typedef struct opal_common_ugni_device_t opal_common_ugni_device_t; - -struct 
opal_common_ugni_module_t { - /* protection tag */ - uint8_t ptag; - - /* unique id for this process assigned by the system */ - uint32_t cookie; - - /* communication domain handle */ - gni_cdm_handle_t cd_handle; - - /* device count. to be used if we have more than 1 common per ugni device */ - int device_count; - opal_common_ugni_device_t *devices; - - int rdma_max_retries; -}; -typedef struct opal_common_ugni_module_t opal_common_ugni_module_t; - -struct opal_common_ugni_post_desc_t { - gni_post_descriptor_t base; - - opal_common_ugni_endpoint_t *endpoint; - int tries; -}; -typedef struct opal_common_ugni_post_desc_t opal_common_ugni_post_desc_t; - -extern opal_common_ugni_module_t opal_common_ugni_module; -extern mca_base_component_t opal_common_ugni_component; - -static inline int -opal_common_rc_ugni_to_opal (gni_return_t rc) -{ - int codes[] = {OPAL_SUCCESS, - OPAL_ERR_RESOURCE_BUSY, - OPAL_ERR_BAD_PARAM, - OPAL_ERR_OUT_OF_RESOURCE, - OPAL_ERR_TIMEOUT, - OPAL_ERR_PERM, - OPAL_ERROR, - OPAL_ERR_BAD_PARAM, - OPAL_ERR_BAD_PARAM, - OPAL_ERR_NOT_FOUND, - OPAL_ERR_VALUE_OUT_OF_BOUNDS, - OPAL_ERROR, - OPAL_ERR_NOT_SUPPORTED, - OPAL_ERR_OUT_OF_RESOURCE}; - return codes[rc]; -} - -/* - * Initialize uGNI communication domain and device(s). - */ -int opal_common_ugni_init (void); - -/* - * Finalize uGNI communication domain and device(s). - */ -int opal_common_ugni_fini (void); - -#endif /* MPI_COMMON_UGNI_H */ diff --git a/opal/mca/common/ugni/common_ugni_ep.c b/opal/mca/common/ugni/common_ugni_ep.c deleted file mode 100644 index dadf39ac52..0000000000 --- a/opal/mca/common/ugni/common_ugni_ep.c +++ /dev/null @@ -1,118 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. - * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "common_ugni.h" -#include "opal/mca/pmix/pmix.h" - -OBJ_CLASS_INSTANCE(opal_common_ugni_endpoint_t, opal_object_t, NULL, NULL); - -int opal_common_ugni_endpoint_for_proc (opal_common_ugni_device_t *dev, opal_proc_t *peer_proc, - opal_common_ugni_endpoint_t **ep) -{ - opal_common_ugni_endpoint_t *endpoint; - opal_common_ugni_modex_t *modex; - size_t msg_size; - int rc; - - assert (NULL != dev && NULL != ep && peer_proc); - - endpoint = OBJ_NEW(opal_common_ugni_endpoint_t); - if (OPAL_UNLIKELY(NULL == endpoint)) { - assert (0); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - /* Receive the modex */ - OPAL_MODEX_RECV(rc, &opal_common_ugni_component, - &peer_proc->proc_name, (void **)&modex, &msg_size); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - OPAL_OUTPUT((-1, "btl/ugni error receiving modex")); - return rc; - } - - endpoint->ep_rem_addr = modex->addr; - endpoint->ep_rem_id = modex->id; - endpoint->ep_rem_irq_memhndl = modex->irq_memhndl; - - endpoint->dev = dev; - - *ep = endpoint; - - free (modex); - - return OPAL_SUCCESS; -} - -void opal_common_ugni_endpoint_return (opal_common_ugni_endpoint_t *ep) -{ - assert(NULL != ep); - - OBJ_RELEASE(ep); -} - -int opal_common_ugni_ep_create (opal_common_ugni_endpoint_t *cep, gni_cq_handle_t cq, - gni_ep_handle_t *ep_handle) -{ - gni_return_t grc; - - if (OPAL_UNLIKELY(NULL == cep)) { - assert (0); - return OPAL_ERR_BAD_PARAM; - } - - /* create a uGNI endpoint handle and bind it to the remote peer */ - OPAL_THREAD_LOCK(&cep->dev->dev_lock); - grc = GNI_EpCreate (cep->dev->dev_handle, cq, ep_handle); - OPAL_THREAD_UNLOCK(&cep->dev->dev_lock); - if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) { - return opal_common_rc_ugni_to_opal (grc); - } - - OPAL_THREAD_LOCK(&cep->dev->dev_lock); - grc = GNI_EpBind (*ep_handle, cep->ep_rem_addr, cep->ep_rem_id); - OPAL_THREAD_UNLOCK(&cep->dev->dev_lock); - - if (GNI_RC_SUCCESS != grc) { - 
OPAL_THREAD_LOCK(&cep->dev->dev_lock); - GNI_EpDestroy (*ep_handle); - OPAL_THREAD_UNLOCK(&cep->dev->dev_lock); - return opal_common_rc_ugni_to_opal (grc); - } - - return OPAL_SUCCESS; -} - -int opal_common_ugni_ep_destroy (gni_ep_handle_t *ep) -{ - int rc; - - if (NULL == ep || 0 == *ep) { - return OPAL_SUCCESS; - } - - /* TODO: need to fix, may be outstanding tx's, etc. */ - rc = GNI_EpUnbind (*ep); - if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { - /* should warn */ - } - - GNI_EpDestroy (*ep); - if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { - /* should warn */ - } - - *ep = 0; - - return OPAL_SUCCESS; -} - diff --git a/opal/mca/common/ugni/common_ugni_ep.h b/opal/mca/common/ugni/common_ugni_ep.h deleted file mode 100644 index 99f8d07ced..0000000000 --- a/opal/mca/common/ugni/common_ugni_ep.h +++ /dev/null @@ -1,63 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#if !defined(MPI_COMMON_UGNI_EP_H) -#define MPI_COMMON_UGNI_EP_H - -struct opal_common_ugni_device_t; - -struct opal_common_ugni_endpoint_t { - opal_object_t super; - uint32_t ep_rem_addr, ep_rem_id; /**< remote information */ - gni_mem_handle_t ep_rem_irq_memhndl; - struct opal_common_ugni_device_t *dev; /**< device this endpoint is using */ -}; -typedef struct opal_common_ugni_endpoint_t opal_common_ugni_endpoint_t; - -OBJ_CLASS_DECLARATION(opal_common_ugni_endpoint_t); - -/* - * Get (and retain) a reference to an endpoint to peer_proc. This endpoint - * needs to be returned with opal_common_ugni_endpoint_return. - * - * @param[IN] dev uGNI device this endpoint should be bound to. - * @param[IN] peer_proc remote peer the endpoint will be connected to. 
- * @param[OUT] ep uGNI endpoint for the peer - */ -int opal_common_ugni_endpoint_for_proc (struct opal_common_ugni_device_t *dev, opal_proc_t *peer_proc, - opal_common_ugni_endpoint_t **ep); - -/* - * Allocate and bind a uGNI endpoint handle to the remote peer. - * - * @param[IN] cep common endpoint - * @param[IN] cq completion queue - * @param[OUT] ep_handle uGNI endpoint handle - */ -int opal_common_ugni_ep_create (opal_common_ugni_endpoint_t *cep, gni_cq_handle_t cq, gni_ep_handle_t *ep_handle); - -/* - * Unbind and free the uGNI endpoint handle. - * - * @param[IN] ep_handle uGNI endpoint handle to unbind and release - */ -int opal_common_ugni_ep_destroy (gni_ep_handle_t *ep_handle); - -/* - * Return (and possibly free) a common endpoint. The endpoint may not be used - * once it is returned. - * - * @param[IN] ep uGNI endpoint to return - */ -void opal_common_ugni_endpoint_return (opal_common_ugni_endpoint_t *ep); - -#endif /* MPI_COMMON_UGNI_EP_H */ diff --git a/opal/mca/common/ugni/configure.m4 b/opal/mca/common/ugni/configure.m4 deleted file mode 100644 index b6870a683e..0000000000 --- a/opal/mca/common/ugni/configure.m4 +++ /dev/null @@ -1,54 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2006 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2006 QLogic Corp. All rights reserved. -# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2011 Los Alamos National Security, LLC. -# All rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# OPAL_CHECK_UGNI(prefix, [action-if-found], [action-if-not-found]) -# -------------------------------------------------------- -# check if GNI support can be found. sets prefix_{CPPFLAGS, -# LDFLAGS, LIBS} as needed and runs action-if-found if there is -# support, otherwise executes action-if-not-found -# -# NOTES -# on Cray XE6 systems, the GNI development header (gni_pub.h) is in a -# completely different place than the ugni library (libugni). -# -# EXAMPLE CONFIGURE USAGE: -# --with-ugni=/base/path/to/libugni --with-ugni-includedir=/path/to/gni_pub.h -# -# --with-ugni=/opt/cray/ugni/default --with-ugni-includedir=/opt/cray/gni-headers/default/include - -AC_DEFUN([MCA_opal_common_ugni_CONFIG],[ - AC_CONFIG_FILES([opal/mca/common/ugni/Makefile]) - - OPAL_CHECK_UGNI([common_ugni], - [common_ugni_happy="yes"], - [common_ugni_happy="no"]) - - AS_IF([test "$common_ugni_happy" = "yes"], - [$1], - [$2]) - - # substitute in the things needed to build ugni - AC_SUBST([common_ugni_CPPFLAGS]) - AC_SUBST([common_ugni_LDFLAGS]) - AC_SUBST([common_ugni_LIBS]) -])dnl diff --git a/opal/mca/common/ugni/owner.txt b/opal/mca/common/ugni/owner.txt deleted file mode 100644 index 48ac538cbb..0000000000 --- a/opal/mca/common/ugni/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: LANL -status: active