1
1
openmpi/opal/mca/btl/ugni/btl_ugni_component.c
Ralph Castain ecc8000136 Silence a flood of warnings when compiling with gcc on Cray
Signed-off-by: Ralph Castain <rhc@open-mpi.org>
2017-03-24 13:37:11 -06:00

737 строки
34 KiB
C

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_ugni.h"
#include "btl_ugni_frag.h"
#include "btl_ugni_rdma.h"
#include "btl_ugni_smsg.h"
#include "opal/util/sys_limits.h"
#include <stdlib.h>
#include <fcntl.h>
#include <ctype.h>
#include "opal/memoryhooks/memory.h"
#include "opal/runtime/opal_params.h"
#include "opal/mca/base/mca_base_pvar.h"
/* Component entry points. Each is defined later in this file and referenced by
 * the mca_btl_ugni_component initializer. */
static int btl_ugni_component_register(void);
static int btl_ugni_component_open(void);
static int btl_ugni_component_close(void);
static mca_btl_base_module_t **mca_btl_ugni_component_init(int *, bool, bool);
static int mca_btl_ugni_component_progress(void);
/* Page size parsed from the Gemini sysfs "mrt" file during parameter
 * registration (file value * 1024). Remains 0 when the file was not read. */
static unsigned long mca_btl_ugni_ugni_page_size = 0;
/* Component descriptor for the "ugni" BTL. It supplies the open, close and
 * parameter-registration callbacks together with the init and progress entry
 * points defined in this file. */
mca_btl_ugni_component_t mca_btl_ugni_component = {
.super = {
/* First, the mca_base_component_t struct containing meta information
about the component itself */
.btl_version = {
MCA_BTL_DEFAULT_VERSION("ugni"),
.mca_open_component = btl_ugni_component_open,
.mca_close_component = btl_ugni_component_close,
.mca_register_component_params = btl_ugni_component_register,
},
/* component metadata */
.btl_data = {
.param_field = MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* BTL-specific entry points */
.btl_init = mca_btl_ugni_component_init,
.btl_progress = mca_btl_ugni_component_progress,
}
};
/* Value/name pairs used to build the enumerator for the "rcache" MCA variable
 * (see btl_ugni_component_register). */
mca_base_var_enum_value_t rcache_values[] = {
{MCA_BTL_UGNI_RCACHE_UDREG, "udreg"},
{MCA_BTL_UGNI_RCACHE_GRDMA, "grdma"},
{-1, NULL} /* sentinel */
};
/* Flag table used to build the enumerator for the "cdm_flags" MCA variable.
 * Each entry pairs a GNI_CDM_MODE_* bit with the name accepted on the command
 * line and the set of flags it may not be combined with (0 when there is no
 * conflict). The entry with a NULL string terminates the table. */
mca_base_var_enum_value_flag_t cdm_flags[] = {
/* fork handling: the three modes are mutually exclusive */
{.flag = GNI_CDM_MODE_FORK_NOCOPY, .string = "fork-no-copy", .conflicting_flag = GNI_CDM_MODE_FORK_FULLCOPY | GNI_CDM_MODE_FORK_PARTCOPY},
{.flag = GNI_CDM_MODE_FORK_FULLCOPY, .string = "fork-full-copy", .conflicting_flag = GNI_CDM_MODE_FORK_NOCOPY | GNI_CDM_MODE_FORK_PARTCOPY},
{.flag = GNI_CDM_MODE_FORK_PARTCOPY, .string = "fork-part-copy", .conflicting_flag = GNI_CDM_MODE_FORK_NOCOPY | GNI_CDM_MODE_FORK_FULLCOPY},
/* error handling: no-kill and all-kill exclude each other */
{.flag = GNI_CDM_MODE_ERR_NO_KILL, .string = "err-no-kill", .conflicting_flag = GNI_CDM_MODE_ERR_ALL_KILL},
{.flag = GNI_CDM_MODE_ERR_ALL_KILL, .string = "err-all-kill", .conflicting_flag = GNI_CDM_MODE_ERR_NO_KILL},
{.flag = GNI_CDM_MODE_FAST_DATAGRAM_POLL, .string = "fast-datagram-poll", .conflicting_flag = 0},
{.flag = GNI_CDM_MODE_BTE_SINGLE_CHANNEL, .string = "bte-single-channel", .conflicting_flag = 0},
{.flag = GNI_CDM_MODE_USE_PCI_IOMMU, .string = "use-pci-iommu", .conflicting_flag = 0},
/* MDD and FMA resources: dedicated and shared exclude each other */
{.flag = GNI_CDM_MODE_MDD_DEDICATED, .string = "mdd-dedicated", .conflicting_flag = GNI_CDM_MODE_MDD_SHARED},
{.flag = GNI_CDM_MODE_MDD_SHARED, .string = "mdd-shared", .conflicting_flag = GNI_CDM_MODE_MDD_DEDICATED},
{.flag = GNI_CDM_MODE_FMA_DEDICATED, .string = "fma-dedicated", .conflicting_flag = GNI_CDM_MODE_FMA_SHARED},
{.flag = GNI_CDM_MODE_FMA_SHARED, .string = "fma-shared", .conflicting_flag = GNI_CDM_MODE_FMA_DEDICATED},
{.flag = GNI_CDM_MODE_CACHED_AMO_ENABLED, .string = "cached-amo-enabled", .conflicting_flag = 0},
{.flag = GNI_CDM_MODE_CQ_NIC_LOCAL_PLACEMENT, .string = "cq-nic-placement", .conflicting_flag = 0},
{.flag = GNI_CDM_MODE_FMA_SMALL_WINDOW, .string = "fma-small-window", .conflicting_flag = 0},
/* terminator */
{.string = NULL}
};
/**
 * Performance-variable read callback for a uGNI NIC statistic.
 *
 * The statistic to query is carried in pvar->ctx. One counter is read per
 * virtual device of the first module and stored at the matching index of the
 * output array.
 *
 * @param[in]  pvar   performance variable being read; ctx holds the statistic id
 * @param[out] value  array of unsigned int, indexed by virtual device
 * @param[in]  obj    unused
 *
 * @returns the OPAL translation of the first failing GNI_GetNicStat() result, or
 *          of GNI_RC_SUCCESS when every query succeeded (or there are no devices)
 */
static inline int mca_btl_ugni_get_stat (const mca_base_pvar_t *pvar, void *value, void *obj)
{
    gni_statistic_t statistic = (gni_statistic_t) (intptr_t) pvar->ctx;
    unsigned int *counters = (unsigned int *) value;
    gni_return_t first_error = GNI_RC_SUCCESS;

    (void) obj;

    for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
        gni_return_t rc = GNI_GetNicStat (mca_btl_ugni_component.modules[0].devices[i].dev_handle,
                                          statistic, counters + i);

        /* keep querying the remaining devices, but remember the first failure so
         * that a later successful query cannot mask it */
        if (GNI_RC_SUCCESS != rc && GNI_RC_SUCCESS == first_error) {
            first_error = rc;
        }
    }

    return mca_btl_rc_ugni_to_opal (first_error);
}
/**
 * Performance-variable notification callback for the NIC statistics.
 *
 * On a handle-bind event the number of values exposed by the variable is
 * reported through @p count: one per virtual device. All other events leave
 * @p count untouched.
 *
 * @returns OPAL_SUCCESS in every case
 */
static inline int mca_btl_ugni_notify_stat (mca_base_pvar_t *pvar, mca_base_pvar_event_t event, void *obj, int *count)
{
    if (MCA_BASE_PVAR_HANDLE_BIND != event) {
        return OPAL_SUCCESS;
    }

    /* a bound handle reads one counter per virtual device */
    *count = mca_btl_ugni_component.virtual_device_count;

    return OPAL_SUCCESS;
}
static int btl_ugni_component_register(void)
{
mca_base_var_enum_t *new_enum;
gni_nic_device_t device_type;
char *mpool_hints_tmp = NULL;
int rc;
(void) mca_base_var_group_component_register(&mca_btl_ugni_component.super.btl_version,
"uGNI byte transport layer");
mca_btl_ugni_component.ugni_free_list_num = 8;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"free_list_num", NULL, MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_free_list_num);
mca_btl_ugni_component.ugni_free_list_max = 4096;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"free_list_max", NULL, MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_free_list_max);
mca_btl_ugni_component.ugni_free_list_inc = 64;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"free_list_inc", NULL, MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_free_list_inc);
mca_btl_ugni_component.ugni_eager_num = 16;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"eager_num", NULL, MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_eager_num);
mca_btl_ugni_component.ugni_eager_max = 128;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"eager_max", NULL, MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_eager_max);
mca_btl_ugni_component.ugni_eager_inc = 16;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"eager_inc", NULL, MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_eager_inc);
mca_btl_ugni_component.remote_cq_size = 40000;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"remote_cq_size", "Remote SMSG completion queue "
"size (default 40000)", MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.remote_cq_size);
mca_btl_ugni_component.local_cq_size = 8192;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"local_cq_size", "Local completion queue size "
"(default 8192)", MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.local_cq_size);
mca_btl_ugni_component.ugni_smsg_limit = 0;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"smsg_limit", "Maximum size message that "
"will be sent using the SMSG/MSGQ protocol "
"(0 - autoselect(default), 16k max)",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_smsg_limit);
mca_btl_ugni_component.smsg_max_credits = 32;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"smsg_max_credits", "Maximum number of "
"outstanding SMSG/MSGQ message (default 32)",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.smsg_max_credits);
mca_btl_ugni_component.ugni_fma_limit = 1024;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"fma_limit", "Maximum size message that "
"will be sent using the FMA (Fast Memory "
"Access) protocol (default 1024, 64k max)",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_fma_limit);
mca_btl_ugni_component.rdma_max_retries = 16;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"rdma_max_retries", NULL, MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.rdma_max_retries);
mca_btl_ugni_component.smsg_max_retries = 16;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"smsg_max_retries", NULL, MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.smsg_max_retries);
mca_btl_ugni_component.max_mem_reg = 0;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"max_mem_reg", "Maximum number of "
"memory registrations a process can "
"hold (0 - autoselect, -1 - unlimited)"
" (default 0)", MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.max_mem_reg);
mca_btl_ugni_component.mbox_increment = 0;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"mbox_inc", "Number of SMSG mailboxes to "
"allocate in each block (0 - autoselect(default))",
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.mbox_increment);
/* communication domain flags */
rc = mca_base_var_enum_create_flag ("btl_ugni_cdm_flags", cdm_flags, (mca_base_var_enum_flag_t **) &new_enum);
if (OPAL_SUCCESS != rc) {
return rc;
}
mca_btl_ugni_component.cdm_flags = GNI_CDM_MODE_FORK_PARTCOPY | GNI_CDM_MODE_ERR_NO_KILL | GNI_CDM_MODE_FAST_DATAGRAM_POLL |
GNI_CDM_MODE_MDD_SHARED | GNI_CDM_MODE_FMA_SHARED | GNI_CDM_MODE_FMA_SMALL_WINDOW;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"cdm_flags", "Flags to set when creating a communication domain "
" (default: fork-fullcopy,cached-amo-enabled,err-no-kill,fast-datagram-poll,"
"fma-shared,fma-small-window)",
MCA_BASE_VAR_TYPE_UNSIGNED_INT, new_enum, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.cdm_flags);
OBJ_RELEASE(new_enum);
mca_btl_ugni_component.virtual_device_count = 0;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"virtual_device_count", "Number of virtual devices to create. Higher numbers may "
"result in better performance when using threads. (default: auto, max: 8)",
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.virtual_device_count);
/* determine if there are get alignment restrictions */
GNI_GetDeviceType (&device_type);
mca_btl_ugni_component.smsg_page_size = 2 << 20;
if (GNI_DEVICE_GEMINI == device_type) {
if (access ("/sys/class/gemini/ghal0/mrt", R_OK)) {
int fd = open ("/sys/class/gemini/ghal0/mrt", O_RDONLY);
char buffer[10];
if (0 <= fd) {
memset (buffer, 0, sizeof (buffer));
read (fd, buffer, sizeof (buffer) - 1);
close (fd);
mca_btl_ugni_ugni_page_size = strtol (buffer, NULL, 10) * 1024;
mca_btl_ugni_component.smsg_page_size = mca_btl_ugni_ugni_page_size;
}
}
}
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"smsg_page_size", "Page size to use for SMSG mailbox allocation (default: detect)",
MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.smsg_page_size);
mca_btl_ugni_component.progress_thread_requested = 0;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"request_progress_thread",
"Enable to request ugni btl progress thread - requires MPI_THREAD_MULTIPLE support",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.progress_thread_requested);
/* performance variables */
mca_btl_ugni_progress_thread_wakeups = 0;
(void) mca_base_component_pvar_register(&mca_btl_ugni_component.super.btl_version,
"progress_thread_wakeups", "Number of times the progress thread "
"has been woken", OPAL_INFO_LVL_9, MCA_BASE_PVAR_CLASS_COUNTER,
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, MCA_BASE_VAR_BIND_NO_OBJECT,
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS, NULL,
NULL, NULL, &mca_btl_ugni_progress_thread_wakeups);
/* register network statistics as performance variables */
for (int i = 0 ; i < GNI_NUM_STATS ; ++i) {
char name[128], desc[128];
size_t str_len = strlen (gni_statistic_str[i]);
assert (str_len < sizeof (name));
/* we can get an all-caps string for the variable from gni_statistic_str. need to make it lowercase
* to match ompi standards */
for (size_t j = 0 ; j < str_len ; ++j) {
name[j] = tolower (gni_statistic_str[i][j]);
desc[j] = ('_' == name[j]) ? ' ' : name[j];
}
name[str_len] = '\0';
desc[str_len] = '\0';
(void) mca_base_component_pvar_register (&mca_btl_ugni_component.super.btl_version, name, desc,
OPAL_INFO_LVL_4, MCA_BASE_PVAR_CLASS_COUNTER,
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, MCA_BASE_VAR_BIND_NO_OBJECT,
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS,
mca_btl_ugni_get_stat, NULL, mca_btl_ugni_notify_stat,
(void *) (intptr_t) i);
}
/* btl/ugni can only support only a fixed set of rcache components (these rcache components have compatible resource
* structures) */
rc = mca_base_var_enum_create ("btl_ugni_rcache", rcache_values, &new_enum);
if (OPAL_SUCCESS != rc) {
return rc;
}
/* NTH: there are known *serious* performance issues with udreg. if they are ever resolved it is the preferred rcache */
mca_btl_ugni_component.rcache_type = MCA_BTL_UGNI_RCACHE_GRDMA;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"rcache", "registration cache to use (default: grdma)", MCA_BASE_VAR_TYPE_INT, new_enum,
0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.rcache_type);
OBJ_RELEASE(new_enum);
if (mca_btl_ugni_ugni_page_size) {
rc = asprintf (&mpool_hints_tmp, "page_size=%lu", mca_btl_ugni_ugni_page_size);
if (rc < 0) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
mca_btl_ugni_component.mpool_hints = mpool_hints_tmp;
} else {
mca_btl_ugni_component.mpool_hints = "page_size=2M";
}
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"mpool_hints", "hints to use when selecting a memory pool (default: "
"\"page_size=2M\")", MCA_BASE_VAR_TYPE_STRING, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.mpool_hints);
free (mpool_hints_tmp);
/* ensure we loose send exclusivity to sm and vader if they are enabled */
mca_btl_ugni_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 2;
/* smsg threshold */
mca_btl_ugni_module.super.btl_eager_limit = 8 * 1024;
mca_btl_ugni_module.super.btl_rndv_eager_limit = 8 * 1024;
mca_btl_ugni_module.super.btl_rdma_pipeline_frag_size = 4 * 1024 * 1024;
mca_btl_ugni_module.super.btl_max_send_size = 8 * 1024;
mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = 8 * 1024;
mca_btl_ugni_module.super.btl_get_limit = 1 * 1024 * 1024;
/*
* see def. of ALIGNMENT_MASK to figure this one out
*/
/* both gemini and aries have a 4-byte alignment requirement on remote addresses */
mca_btl_ugni_module.super.btl_get_alignment = 4;
/* threshold for put */
mca_btl_ugni_module.super.btl_min_rdma_pipeline_size = 8 * 1024;
mca_btl_ugni_module.super.btl_flags = MCA_BTL_FLAGS_SEND |
MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_ATOMIC_OPS |
MCA_BTL_FLAGS_ATOMIC_FOPS;
mca_btl_ugni_module.super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD |
MCA_BTL_ATOMIC_SUPPORTS_AND | MCA_BTL_ATOMIC_SUPPORTS_OR | MCA_BTL_ATOMIC_SUPPORTS_XOR |
MCA_BTL_ATOMIC_SUPPORTS_CSWAP;
if (GNI_DEVICE_ARIES == device_type) {
/* aries supports additional atomic operations */
mca_btl_ugni_module.super.btl_atomic_flags |= MCA_BTL_ATOMIC_SUPPORTS_MIN | MCA_BTL_ATOMIC_SUPPORTS_MAX |
MCA_BTL_ATOMIC_SUPPORTS_LAND | MCA_BTL_ATOMIC_SUPPORTS_LOR | MCA_BTL_ATOMIC_SUPPORTS_LXOR |
MCA_BTL_ATOMIC_SUPPORTS_32BIT | MCA_BTL_ATOMIC_SUPPORTS_FLOAT;
}
mca_btl_ugni_module.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
mca_btl_ugni_module.super.btl_bandwidth = 40000; /* Mbs */
mca_btl_ugni_module.super.btl_latency = 2; /* Microsecs */
mca_btl_ugni_module.super.btl_get_local_registration_threshold = 0;
mca_btl_ugni_module.super.btl_put_local_registration_threshold = mca_btl_ugni_component.ugni_fma_limit;
/* Call the BTL based to register its MCA params */
mca_btl_base_param_register(&mca_btl_ugni_component.super.btl_version,
&mca_btl_ugni_module.super);
return OPAL_SUCCESS;
}
/*
 * component open - start out with no BTL modules allocated
 */
static int
btl_ugni_component_open(void)
{
    mca_btl_ugni_component.modules = NULL;
    mca_btl_ugni_component.ugni_num_btls = 0;

    return OPAL_SUCCESS;
}
/*
 * component close - calls mca_btl_ugni_fini() and then releases the module
 * array held by the component, leaving the pointer NULL
 */
static int
btl_ugni_component_close(void)
{
    mca_btl_ugni_fini ();

    /* free(NULL) is a no-op, so the array can be released unconditionally */
    free (mca_btl_ugni_component.modules);
    mca_btl_ugni_component.modules = NULL;

    return OPAL_SUCCESS;
}
/**
 * Initialize the uGNI BTL component and create its (single) module.
 *
 * Clamps the SMSG and FMA limits to their maxima (16384 and 65536), enables the
 * progress thread when it was requested and MPI threads are enabled, initializes
 * the uGNI layer and allocates/initializes one BTL module.
 *
 * @param[out] num_btl_modules          set to the number of modules on success
 * @param[in]  enable_progress_threads  unused
 * @param[in]  enable_mpi_threads       whether MPI thread support is enabled
 *
 * @returns a heap-allocated array of module pointers on success, NULL on failure.
 *          The module storage itself stays referenced by
 *          mca_btl_ugni_component.modules and is released by component close.
 */
static mca_btl_base_module_t **
mca_btl_ugni_component_init (int *num_btl_modules,
                             bool enable_progress_threads,
                             bool enable_mpi_threads)
{
    struct mca_btl_base_module_t **base_modules;
    mca_btl_ugni_module_t *ugni_modules;
    int rc;

    (void) enable_progress_threads;

    /* enforce the documented upper bounds on the user-settable limits */
    if (16384 < mca_btl_ugni_component.ugni_smsg_limit) {
        mca_btl_ugni_component.ugni_smsg_limit = 16384;
    }

    if (65536 < mca_btl_ugni_component.ugni_fma_limit) {
        mca_btl_ugni_component.ugni_fma_limit = 65536;
    }

    /* the put registration threshold tracks the (possibly clamped) FMA limit */
    mca_btl_ugni_module.super.btl_put_local_registration_threshold = mca_btl_ugni_component.ugni_fma_limit;

    if (enable_mpi_threads && mca_btl_ugni_component.progress_thread_requested) {
        mca_btl_ugni_component.progress_thread_enabled = 1;
    }

    /* Initialize ugni library and create communication domain */
    rc = mca_btl_ugni_init();
    if (OPAL_SUCCESS != rc) {
        return NULL;
    }

    /* For now only create a single BTL module */
    mca_btl_ugni_component.ugni_num_btls = 1;

    BTL_VERBOSE(("btl/ugni initializing"));

    ugni_modules = mca_btl_ugni_component.modules = (mca_btl_ugni_module_t *)
        calloc (mca_btl_ugni_component.ugni_num_btls, sizeof (mca_btl_ugni_module_t));
    if (OPAL_UNLIKELY(NULL == mca_btl_ugni_component.modules)) {
        BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
        return NULL;
    }

    base_modules = (struct mca_btl_base_module_t **)
        calloc (mca_btl_ugni_component.ugni_num_btls,
                sizeof (struct mca_btl_base_module_t *));
    if (OPAL_UNLIKELY(NULL == base_modules)) {
        /* the module array stays owned by the component and is freed in component close */
        BTL_ERROR(("Malloc failed : %s:%d", __FILE__, __LINE__));
        return NULL;
    }

    /* when the SMSG page size differs from the system page size make sure it is
     * at least as large as the detected uGNI page size */
    if (mca_btl_ugni_component.smsg_page_size != (unsigned long) opal_getpagesize ()) {
        if (mca_btl_ugni_ugni_page_size > mca_btl_ugni_component.smsg_page_size) {
            mca_btl_ugni_component.smsg_page_size = mca_btl_ugni_ugni_page_size;
        }
    }

    mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = mca_btl_ugni_module.super.btl_eager_limit;

    rc = mca_btl_ugni_module_init (ugni_modules);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        BTL_ERROR(("Failed to initialize uGNI module @ %s:%d", __FILE__,
                   __LINE__));
        /* the pointer array is local to this function: release it so that the
         * failure path does not leak it */
        free (base_modules);
        return NULL;
    }

    *base_modules = (mca_btl_base_module_t *) ugni_modules;

    *num_btl_modules = mca_btl_ugni_component.ugni_num_btls;

    BTL_VERBOSE(("btl/ugni done initializing %d module(s)", *num_btl_modules));

    return base_modules;
}
/**
 * Progress one completed datagram on @p device.
 *
 * When the completion arrived on the wildcard endpoint the peer endpoint is
 * looked up from the received proc name and the wildcard datagram is reposted
 * at the end. Connection progress runs under the endpoint lock; if the endpoint
 * is connected afterwards its SMSG mailbox is processed.
 *
 * @returns the result of mca_btl_ugni_get_datagram() when it is not 1, otherwise
 *          the number reported by mca_btl_ugni_smsg_process() (0 if the endpoint
 *          is not connected)
 */
int mca_btl_ugni_progress_datagram (mca_btl_ugni_device_t *device)
{
    mca_btl_ugni_module_t *module = mca_btl_ugni_component.modules;
    mca_btl_base_endpoint_t *ep;
    gni_ep_handle_t handle;
    int processed = 0;
    int rc;

    rc = mca_btl_ugni_get_datagram (module, device, &handle, &ep);
    if (1 != rc) {
        return rc;
    }

    BTL_VERBOSE(("remote datagram completion on handle %p", (void*)handle));

    if (module->wildcard_ep == handle) {
        /* wildcard endpoint: find the remote peer using the proc name we received */
        struct opal_proc_t *peer = opal_proc_for_name (module->wc_remote_attr.proc_name);

        BTL_VERBOSE(("received connection attempt on wildcard endpoint from proc: %s",
                     OPAL_NAME_PRINT(module->wc_remote_attr.proc_name)));

        ep = mca_btl_ugni_get_ep (&module->super, peer);
        if (OPAL_UNLIKELY(NULL == ep)) {
            /* unrecoverable: report and abort() */
            BTL_ERROR(("could not find/allocate a btl endpoint for peer %s",
                       OPAL_NAME_PRINT(module->wc_remote_attr.proc_name)));
            abort ();
            /* not reached */
            return OPAL_ERR_NOT_FOUND;
        }
    }

    /* a NULL endpoint is not expected at this point */
    assert (NULL != ep);

    BTL_VERBOSE(("got a datagram completion: ep = %p. wc = %d", (void *) ep, handle == module->wildcard_ep));

    /* NTH: TODO -- error handling */
    opal_mutex_lock (&ep->lock);

    if (module->wildcard_ep != handle) {
        /* a directed post finished: one fewer active datagram */
        BTL_VERBOSE(("directed datagram complete for endpoint %p", (void *) ep));

        ep->dg_posted = false;
        (void) opal_atomic_add_32 (&module->active_datagrams, -1);
    }

    (void) mca_btl_ugni_ep_connect_progress (ep);

    opal_mutex_unlock (&ep->lock);

    /* drain messages waiting in the mailbox of a connected endpoint */
    if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state) {
        processed = mca_btl_ugni_smsg_process (ep);
    }

    /* the wildcard datagram has to be posted again after it completes */
    if (module->wildcard_ep == handle) {
        mca_btl_ugni_wildcard_ep_post (module);
    }

    return processed;
}
#if OPAL_ENABLE_DEBUG
/* Debug helper: print every field of desc->desc to stderr, one "name = value"
 * line per field (memory handles are printed as a {qword1, qword2} pair). */
static inline void btl_ugni_dump_post_desc (mca_btl_ugni_post_descriptor_t *desc)
{
    FILE *out = stderr;

    /* identification and completion state */
    fprintf (out, "desc->desc.post_id = %" PRIx64 "\n", desc->desc.post_id);
    fprintf (out, "desc->desc.status = %" PRIx64 "\n", desc->desc.status);
    fprintf (out, "desc->desc.cq_mode_complete = %hu\n", desc->desc.cq_mode_complete);

    /* operation type and modes */
    fprintf (out, "desc->desc.type = %d\n", desc->desc.type);
    fprintf (out, "desc->desc.cq_mode = %hu\n", desc->desc.cq_mode);
    fprintf (out, "desc->desc.dlvr_mode = %hu\n", desc->desc.dlvr_mode);

    /* local side */
    fprintf (out, "desc->desc.local_addr = %" PRIx64 "\n", desc->desc.local_addr);
    fprintf (out, "desc->desc.local_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n",
             desc->desc.local_mem_hndl.qword1, desc->desc.local_mem_hndl.qword2);

    /* remote side */
    fprintf (out, "desc->desc.remote_addr = %" PRIx64 "\n", desc->desc.remote_addr);
    fprintf (out, "desc->desc.remote_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n",
             desc->desc.remote_mem_hndl.qword1, desc->desc.remote_mem_hndl.qword2);

    /* transfer parameters */
    fprintf (out, "desc->desc.length = %" PRIu64 "\n", desc->desc.length);
    fprintf (out, "desc->desc.rdma_mode = %hu\n", desc->desc.rdma_mode);
    fprintf (out, "desc->desc.amo_cmd = %d\n", desc->desc.amo_cmd);
}
#endif
/**
 * Retry the FMA/RDMA posts queued on the device's pending_post list.
 *
 * At most as many descriptors as were queued on entry are attempted. The walk
 * stops early when the list runs empty or when a repost fails; a descriptor
 * whose repost fails is put back at the head of the list. The list is only
 * touched while the device lock is held; the repost itself runs unlocked.
 *
 * @returns 0 when nothing was pending on entry, 1 otherwise
 */
static inline int
mca_btl_ugni_post_pending (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device)
{
    int remaining = opal_list_get_size (&device->pending_post);

    /* common case: no post is waiting for resources */
    if (OPAL_LIKELY(0 == remaining)) {
        return 0;
    }

    BTL_VERBOSE(("progressing %d pending FMA/RDMA operations", remaining));

    while (remaining-- > 0) {
        mca_btl_ugni_post_descriptor_t *pending;

        mca_btl_ugni_device_lock (device);
        pending = (mca_btl_ugni_post_descriptor_t *) opal_list_remove_first (&device->pending_post);
        mca_btl_ugni_device_unlock (device);

        if (NULL == pending) {
            /* the list was emptied in the meantime */
            break;
        }

        if (OPAL_SUCCESS != mca_btl_ugni_repost (ugni_module, pending)) {
            /* still cannot be posted: requeue at the front and stop for now */
            mca_btl_ugni_device_lock (device);
            opal_list_prepend (&device->pending_post, (opal_list_item_t *) pending);
            mca_btl_ugni_device_unlock (device);
            break;
        }
    }

    return 1;
}
/**
 * Drain up to MCA_BTL_UGNI_COMPLETIONS_PER_LOOP completions from @p cq and
 * finish, retry or fail the associated post descriptors.
 *
 * Every descriptor returned by mca_btl_ugni_cq_get_completed_desc() is handled:
 *  - a successful completion is completed with OPAL_SUCCESS;
 *  - a failed completion that is recoverable and still under
 *    rdma_max_retries is reposted;
 *  - any other failed completion is completed with OPAL_ERROR.
 * A failure of one descriptor does not stop the processing of the remaining
 * descriptors of the batch, since they have already been removed from the CQ
 * and would otherwise never be completed.
 *
 * @returns the (non-positive) result of mca_btl_ugni_cq_get_completed_desc()
 *          when no descriptor was retrieved, OPAL_ERROR when at least one
 *          descriptor was given up on, otherwise the number of descriptors
 *          completed successfully
 */
static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device,
                                              mca_btl_ugni_cq_t *cq)
{
    mca_btl_ugni_post_descriptor_t *post_desc[MCA_BTL_UGNI_COMPLETIONS_PER_LOOP];
    gni_cq_entry_t event_data[MCA_BTL_UGNI_COMPLETIONS_PER_LOOP];
    bool gave_up = false;
    int completed = 0;
    int rc;

    rc = mca_btl_ugni_cq_get_completed_desc (device, cq, event_data, post_desc, MCA_BTL_UGNI_COMPLETIONS_PER_LOOP);
    if (0 >= rc) {
        return rc;
    }

    BTL_VERBOSE(("got %d completed rdma descriptors", rc));

    for (int i = 0 ; i < rc ; ++i) {
        uint32_t recoverable = 1;

        BTL_VERBOSE(("post descriptor %p complete. GNI_CQ_STATUS_OK(): %d", (void*)post_desc[i],
                     GNI_CQ_STATUS_OK(event_data[i])));

        if (OPAL_LIKELY(GNI_CQ_STATUS_OK(event_data[i]))) {
            mca_btl_ugni_post_desc_complete (ugni_module, post_desc[i], OPAL_SUCCESS);
            ++completed;
            continue;
        }

        (void) GNI_CqErrorRecoverable (event_data[i], &recoverable);

        if (OPAL_UNLIKELY(++post_desc[i]->tries >= mca_btl_ugni_component.rdma_max_retries ||
                          !recoverable)) {
            char char_buffer[1024];
            GNI_CqErrorStr (event_data[i], char_buffer, 1024);
            /* give up */
            BTL_ERROR(("giving up on desciptor %p, recoverable %d: %s", (void *) post_desc[i],
                       recoverable, char_buffer));
#if OPAL_ENABLE_DEBUG
            btl_ugni_dump_post_desc (post_desc[i]);
#endif
            mca_btl_ugni_post_desc_complete (ugni_module, post_desc[i], OPAL_ERROR);
            /* remember the failure but keep going: the rest of the batch still needs handling */
            gave_up = true;
            continue;
        }

        /* recoverable error with retries left: post the operation again.
         * NOTE(review): the repost result is not checked here (as before) -- confirm
         * whether a failed repost needs to be queued on device->pending_post */
        (void) mca_btl_ugni_repost (ugni_module, post_desc[i]);
    }

    /* should be resources to progress the pending post list */
    (void) mca_btl_ugni_post_pending (ugni_module, device);

    return gave_up ? OPAL_ERROR : completed;
}
/**
 * Progress the endpoints queued on the module's ep_wait_list.
 *
 * Under ep_wait_list_lock, endpoints are taken from the front of the list and
 * handed to mca_btl_ugni_progress_send_wait_list(). An endpoint that still
 * cannot make progress goes back to the tail of the list; one that succeeds has
 * its wait_listed flag cleared. The number of endpoints visited is bounded by
 * the list length observed when the lock was taken.
 *
 * @returns 0 when the list was empty on entry, otherwise the status of the last
 *          endpoint processed (OPAL_SUCCESS if none was processed)
 */
static inline int
mca_btl_ugni_progress_wait_list (mca_btl_ugni_module_t *ugni_module)
{
    int status = OPAL_SUCCESS;
    int remaining;

    /* peek at the list size without the lock so the common empty case stays cheap */
    if (0 == opal_list_get_size(&ugni_module->ep_wait_list)) {
        return 0;
    }

    OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);

    remaining = opal_list_get_size(&ugni_module->ep_wait_list);
    for (;;) {
        mca_btl_base_endpoint_t *waiting =
            (mca_btl_base_endpoint_t *) opal_list_remove_first (&ugni_module->ep_wait_list);

        if (NULL == waiting) {
            /* list drained */
            break;
        }

        status = mca_btl_ugni_progress_send_wait_list (waiting);
        if (OPAL_SUCCESS == status) {
            waiting->wait_listed = false;
        } else {
            /* not done yet: queue the endpoint again at the tail */
            opal_list_append (&ugni_module->ep_wait_list, &waiting->super);
        }

        if (--remaining <= 0) {
            /* every endpoint present at lock time has been visited */
            break;
        }
    }

    OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);

    return status;
}
/**
 * Component progress function.
 *
 * Progresses remote SMSG traffic, then datagrams (only while some are active,
 * using the first device), and finally, for each virtual device: local SMSG
 * traffic plus the endpoint wait list when the device has SMSG connections, the
 * local RDMA completion queue when it has active operations, and the IRQ RDMA
 * completion queue when the progress thread is enabled and it has active
 * operations.
 *
 * @returns the sum of the values returned by the individual progress calls
 *          (the wait-list result is not included)
 */
static int mca_btl_ugni_component_progress (void)
{
    mca_btl_ugni_module_t *module = mca_btl_ugni_component.modules;
    int events = 0;

    events += mca_btl_ugni_progress_remote_smsg (module);

    if (module->active_datagrams) {
        events += mca_btl_ugni_progress_datagram (module->devices);
    }

    for (int dev = 0 ; dev < mca_btl_ugni_component.virtual_device_count ; ++dev) {
        mca_btl_ugni_device_t *device = &module->devices[dev];

        if (device->smsg_connections) {
            events += mca_btl_ugni_progress_local_smsg (module, device);
            mca_btl_ugni_progress_wait_list (module);
        }

        if (device->dev_rdma_local_cq.active_operations) {
            events += mca_btl_ugni_progress_rdma (module, device, &device->dev_rdma_local_cq);
        }

        /* the IRQ completion queue is only polled when the progress thread is in use */
        if (mca_btl_ugni_component.progress_thread_enabled && device->dev_rdma_local_irq_cq.active_operations) {
            events += mca_btl_ugni_progress_rdma (module, device, &device->dev_rdma_local_irq_cq);
        }
    }

    return events;
}