1
1
openmpi/opal/mca/btl/ugni/btl_ugni_component.c
Nathan T. Weeks 08f9ae97ee btl/ugni: update BTL_VERBOSE argument list
Signed-off-by: Nathan T. Weeks <weeks@iastate.edu>
2018-07-02 09:23:30 -06:00

770 строки
37 KiB
C

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_ugni.h"
#include "btl_ugni_frag.h"
#include "btl_ugni_rdma.h"
#include "btl_ugni_smsg.h"
#include "opal/util/sys_limits.h"
#include <stdlib.h>
#include <fcntl.h>
#include <ctype.h>
#include "opal/memoryhooks/memory.h"
#include "opal/runtime/opal_params.h"
#include "opal/mca/base/mca_base_pvar.h"
static int btl_ugni_component_register(void);
static int btl_ugni_component_open(void);
static int btl_ugni_component_close(void);
static mca_btl_base_module_t **mca_btl_ugni_component_init(int *, bool, bool);
static int mca_btl_ugni_component_progress(void);
static unsigned long mca_btl_ugni_ugni_page_size = 0;
mca_btl_ugni_component_t mca_btl_ugni_component = {
.super = {
/* First, the mca_base_component_t struct containing meta information
about the component itself */
.btl_version = {
MCA_BTL_DEFAULT_VERSION("ugni"),
.mca_open_component = btl_ugni_component_open,
.mca_close_component = btl_ugni_component_close,
.mca_register_component_params = btl_ugni_component_register,
},
.btl_data = {
.param_field = MCA_BASE_METADATA_PARAM_CHECKPOINT
},
.btl_init = mca_btl_ugni_component_init,
.btl_progress = mca_btl_ugni_component_progress,
}
};
mca_base_var_enum_value_t rcache_values[] = {
{MCA_BTL_UGNI_RCACHE_UDREG, "udreg"},
{MCA_BTL_UGNI_RCACHE_GRDMA, "grdma"},
{-1, NULL} /* sentinal */
};
mca_base_var_enum_value_flag_t cdm_flags[] = {
{.flag = GNI_CDM_MODE_FORK_NOCOPY, .string = "fork-no-copy", .conflicting_flag = GNI_CDM_MODE_FORK_FULLCOPY | GNI_CDM_MODE_FORK_PARTCOPY},
{.flag = GNI_CDM_MODE_FORK_FULLCOPY, .string = "fork-full-copy", .conflicting_flag = GNI_CDM_MODE_FORK_NOCOPY | GNI_CDM_MODE_FORK_PARTCOPY},
{.flag = GNI_CDM_MODE_FORK_PARTCOPY, .string = "fork-part-copy", .conflicting_flag = GNI_CDM_MODE_FORK_NOCOPY | GNI_CDM_MODE_FORK_FULLCOPY},
{.flag = GNI_CDM_MODE_ERR_NO_KILL, .string = "err-no-kill", .conflicting_flag = GNI_CDM_MODE_ERR_ALL_KILL},
{.flag = GNI_CDM_MODE_ERR_ALL_KILL, .string = "err-all-kill", .conflicting_flag = GNI_CDM_MODE_ERR_NO_KILL},
{.flag = GNI_CDM_MODE_FAST_DATAGRAM_POLL, .string = "fast-datagram-poll", .conflicting_flag = 0},
{.flag = GNI_CDM_MODE_BTE_SINGLE_CHANNEL, .string = "bte-single-channel", .conflicting_flag = 0},
{.flag = GNI_CDM_MODE_USE_PCI_IOMMU, .string = "use-pci-iommu", .conflicting_flag = 0},
{.flag = GNI_CDM_MODE_MDD_DEDICATED, .string = "mdd-dedicated", .conflicting_flag = GNI_CDM_MODE_MDD_SHARED},
{.flag = GNI_CDM_MODE_MDD_SHARED, .string = "mdd-shared", .conflicting_flag = GNI_CDM_MODE_MDD_DEDICATED},
{.flag = GNI_CDM_MODE_FMA_DEDICATED, .string = "fma-dedicated", .conflicting_flag = GNI_CDM_MODE_FMA_SHARED},
{.flag = GNI_CDM_MODE_FMA_SHARED, .string = "fma-shared", .conflicting_flag = GNI_CDM_MODE_FMA_DEDICATED},
{.flag = GNI_CDM_MODE_CACHED_AMO_ENABLED, .string = "cached-amo-enabled", .conflicting_flag = 0},
{.flag = GNI_CDM_MODE_CQ_NIC_LOCAL_PLACEMENT, .string = "cq-nic-placement", .conflicting_flag = 0},
{.flag = GNI_CDM_MODE_FMA_SMALL_WINDOW, .string = "fma-small-window", .conflicting_flag = 0},
{.string = NULL}
};
static inline int mca_btl_ugni_get_stat (const mca_base_pvar_t *pvar, void *value, void *obj)
{
gni_statistic_t statistic = (gni_statistic_t) (intptr_t) pvar->ctx;
gni_return_t rc = GNI_RC_SUCCESS;
for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
rc = GNI_GetNicStat (mca_btl_ugni_component.modules[0].devices[i].dev_handle, statistic,
((unsigned int *) value) + i);
}
return mca_btl_rc_ugni_to_opal (rc);
}
static inline int mca_btl_ugni_notify_stat (mca_base_pvar_t *pvar, mca_base_pvar_event_t event, void *obj, int *count)
{
if (MCA_BASE_PVAR_HANDLE_BIND == event) {
/* one value for each virtual device handle */
*count = mca_btl_ugni_component.virtual_device_count;
}
return OPAL_SUCCESS;
}
static int btl_ugni_component_register(void)
{
mca_base_var_enum_t *new_enum;
gni_nic_device_t device_type;
char *mpool_hints_tmp = NULL;
int rc;
(void) mca_base_var_group_component_register(&mca_btl_ugni_component.super.btl_version,
"uGNI byte transport layer");
mca_btl_ugni_component.ugni_free_list_num = 8;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"free_list_num", NULL, MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_free_list_num);
mca_btl_ugni_component.ugni_free_list_max = 4096;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"free_list_max", NULL, MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_free_list_max);
mca_btl_ugni_component.ugni_free_list_inc = 64;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"free_list_inc", NULL, MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_free_list_inc);
mca_btl_ugni_component.ugni_eager_num = 16;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"eager_num", NULL, MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_eager_num);
mca_btl_ugni_component.ugni_eager_max = 128;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"eager_max", NULL, MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_eager_max);
mca_btl_ugni_component.ugni_eager_inc = 16;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"eager_inc", NULL, MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_eager_inc);
mca_btl_ugni_component.remote_cq_size = 40000;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"remote_cq_size", "Remote SMSG completion queue "
"size (default 40000)", MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.remote_cq_size);
mca_btl_ugni_component.local_cq_size = 8192;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"local_cq_size", "Local SMSG completion queue size "
"(default 8k)", MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.local_cq_size);
mca_btl_ugni_component.local_rdma_cq_size = 1024;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"local_rdma_cq_size", "Local FMA/RDMA completion queue size "
"(default: 1024)",MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.local_rdma_cq_size);
mca_btl_ugni_component.ugni_smsg_limit = 0;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"smsg_limit", "Maximum size message that "
"will be sent using the SMSG/MSGQ protocol "
"(0 - autoselect(default), 16k max)",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_smsg_limit);
mca_btl_ugni_component.smsg_max_credits = 32;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"smsg_max_credits", "Maximum number of "
"outstanding SMSG/MSGQ message (default 32)",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.smsg_max_credits);
#if OPAL_C_HAVE__THREAD_LOCAL
mca_btl_ugni_component.bind_threads_to_devices = true;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"bind_devices", "Bind threads to virtual "
"devices. In general this should improve "
"RDMA performance (default: true)",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.bind_threads_to_devices);
#endif
mca_btl_ugni_component.ugni_fma_limit = -1;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"fma_limit", "Default maximum size message that "
"will be sent using the FMA (Fast Memory "
"Access) protocol (default: -1 (don't use), 64k max)",
MCA_BASE_VAR_TYPE_LONG, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE | MCA_BASE_VAR_FLAG_DEPRECATED,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_fma_limit);
mca_btl_ugni_component.ugni_fma_get_limit = 2048;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"fma_get_limit", "Maximum size message that "
"will be sent using the FMA (Fast Memory "
"Access) protocol for get (default 2k, "
"64k max)",
MCA_BASE_VAR_TYPE_LONG, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_fma_get_limit);
mca_btl_ugni_component.ugni_fma_put_limit = 4096;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"fma_put_limit", "Maximum size message that "
"will be sent using the FMA (Fast Memory "
"Access) protocol for put (default: 4k, "
"64k max)",
MCA_BASE_VAR_TYPE_LONG, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_fma_put_limit);
mca_btl_ugni_component.rdma_max_retries = 16;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"rdma_max_retries", NULL, MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.rdma_max_retries);
mca_btl_ugni_component.smsg_max_retries = 16;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"smsg_max_retries", NULL, MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.smsg_max_retries);
mca_btl_ugni_component.max_mem_reg = 0;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"max_mem_reg", "Maximum number of "
"memory registrations a process can "
"hold (0 - autoselect, -1 - unlimited)"
" (default 0)", MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.max_mem_reg);
mca_btl_ugni_component.mbox_increment = 0;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"mbox_inc", "Number of SMSG mailboxes to "
"allocate in each block (0 - autoselect(default))",
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.mbox_increment);
/* communication domain flags */
rc = mca_base_var_enum_create_flag ("btl_ugni_cdm_flags", cdm_flags, (mca_base_var_enum_flag_t **) &new_enum);
if (OPAL_SUCCESS != rc) {
return rc;
}
mca_btl_ugni_component.cdm_flags = GNI_CDM_MODE_FORK_PARTCOPY | GNI_CDM_MODE_ERR_NO_KILL | GNI_CDM_MODE_FAST_DATAGRAM_POLL |
GNI_CDM_MODE_MDD_SHARED | GNI_CDM_MODE_FMA_SHARED | GNI_CDM_MODE_FMA_SMALL_WINDOW;
mca_btl_ugni_component.cdm_flags_id = mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"cdm_flags", "Flags to set when creating a communication domain "
" (default: fork-full-copy,cached-amo-enabled,err-no-kill,fast-datagram-poll,"
"fma-shared,fma-small-window)",
MCA_BASE_VAR_TYPE_UNSIGNED_INT, new_enum, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.cdm_flags);
OBJ_RELEASE(new_enum);
mca_btl_ugni_component.virtual_device_count = 0;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"virtual_device_count", "Number of virtual devices to create. Higher numbers may "
"result in better performance when using threads. (default: 0 (auto), max: 128)",
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.virtual_device_count);
/* determine if there are get alignment restrictions */
GNI_GetDeviceType (&device_type);
mca_btl_ugni_component.smsg_page_size = 2 << 20;
if (GNI_DEVICE_GEMINI == device_type) {
if (access ("/sys/class/gemini/ghal0/mrt", R_OK)) {
int fd = open ("/sys/class/gemini/ghal0/mrt", O_RDONLY);
char buffer[10];
if (0 <= fd) {
memset (buffer, 0, sizeof (buffer));
read (fd, buffer, sizeof (buffer) - 1);
close (fd);
mca_btl_ugni_ugni_page_size = strtol (buffer, NULL, 10) * 1024;
mca_btl_ugni_component.smsg_page_size = mca_btl_ugni_ugni_page_size;
}
}
}
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"smsg_page_size", "Page size to use for SMSG mailbox allocation (default: detect)",
MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.smsg_page_size);
mca_btl_ugni_component.progress_thread_requested = 0;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"request_progress_thread",
"Enable to request ugni btl progress thread - requires MPI_THREAD_MULTIPLE support",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.progress_thread_requested);
/* performance variables */
mca_btl_ugni_progress_thread_wakeups = 0;
(void) mca_base_component_pvar_register(&mca_btl_ugni_component.super.btl_version,
"progress_thread_wakeups", "Number of times the progress thread "
"has been woken", OPAL_INFO_LVL_9, MCA_BASE_PVAR_CLASS_COUNTER,
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, MCA_BASE_VAR_BIND_NO_OBJECT,
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS, NULL,
NULL, NULL, &mca_btl_ugni_progress_thread_wakeups);
/* register network statistics as performance variables */
for (int i = 0 ; i < GNI_NUM_STATS ; ++i) {
char name[128], desc[128];
size_t str_len = strlen (gni_statistic_str[i]);
assert (str_len < sizeof (name));
/* we can get an all-caps string for the variable from gni_statistic_str. need to make it lowercase
* to match ompi standards */
for (size_t j = 0 ; j < str_len ; ++j) {
name[j] = tolower (gni_statistic_str[i][j]);
desc[j] = ('_' == name[j]) ? ' ' : name[j];
}
name[str_len] = '\0';
desc[str_len] = '\0';
(void) mca_base_component_pvar_register (&mca_btl_ugni_component.super.btl_version, name, desc,
OPAL_INFO_LVL_4, MCA_BASE_PVAR_CLASS_COUNTER,
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, MCA_BASE_VAR_BIND_NO_OBJECT,
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS,
mca_btl_ugni_get_stat, NULL, mca_btl_ugni_notify_stat,
(void *) (intptr_t) i);
}
/* btl/ugni can only support only a fixed set of rcache components (these rcache components have compatible resource
* structures) */
rc = mca_base_var_enum_create ("btl_ugni_rcache", rcache_values, &new_enum);
if (OPAL_SUCCESS != rc) {
return rc;
}
/* NTH: there are known *serious* performance issues with udreg. if they are ever resolved it is the preferred rcache */
mca_btl_ugni_component.rcache_type = MCA_BTL_UGNI_RCACHE_GRDMA;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"rcache", "registration cache to use (default: grdma)", MCA_BASE_VAR_TYPE_INT, new_enum,
0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.rcache_type);
OBJ_RELEASE(new_enum);
if (mca_btl_ugni_ugni_page_size) {
rc = asprintf (&mpool_hints_tmp, "page_size=%lu", mca_btl_ugni_ugni_page_size);
if (rc < 0) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
mca_btl_ugni_component.mpool_hints = mpool_hints_tmp;
} else {
mca_btl_ugni_component.mpool_hints = "page_size=2M";
}
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"mpool_hints", "hints to use when selecting a memory pool (default: "
"\"page_size=2M\")", MCA_BASE_VAR_TYPE_STRING, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.mpool_hints);
free (mpool_hints_tmp);
/* ensure we loose send exclusivity to sm and vader if they are enabled */
mca_btl_ugni_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 2;
/* smsg threshold */
mca_btl_ugni_module.super.btl_eager_limit = 8 * 1024;
mca_btl_ugni_module.super.btl_rndv_eager_limit = 8 * 1024;
mca_btl_ugni_module.super.btl_rdma_pipeline_frag_size = 4 * 1024 * 1024;
mca_btl_ugni_module.super.btl_max_send_size = 8 * 1024;
mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = 8 * 1024;
mca_btl_ugni_module.super.btl_get_limit = 1 * 1024 * 1024;
/*
* see def. of ALIGNMENT_MASK to figure this one out
*/
/* both gemini and aries have a 4-byte alignment requirement on remote addresses */
mca_btl_ugni_module.super.btl_get_alignment = 4;
/* threshold for put */
mca_btl_ugni_module.super.btl_min_rdma_pipeline_size = 8 * 1024;
mca_btl_ugni_module.super.btl_flags = MCA_BTL_FLAGS_SEND |
MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_ATOMIC_OPS |
MCA_BTL_FLAGS_ATOMIC_FOPS;
mca_btl_ugni_module.super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD |
MCA_BTL_ATOMIC_SUPPORTS_AND | MCA_BTL_ATOMIC_SUPPORTS_OR | MCA_BTL_ATOMIC_SUPPORTS_XOR |
MCA_BTL_ATOMIC_SUPPORTS_CSWAP;
if (GNI_DEVICE_ARIES == device_type) {
/* aries supports additional atomic operations */
mca_btl_ugni_module.super.btl_atomic_flags |= MCA_BTL_ATOMIC_SUPPORTS_MIN | MCA_BTL_ATOMIC_SUPPORTS_MAX |
MCA_BTL_ATOMIC_SUPPORTS_LAND | MCA_BTL_ATOMIC_SUPPORTS_LOR | MCA_BTL_ATOMIC_SUPPORTS_LXOR |
MCA_BTL_ATOMIC_SUPPORTS_32BIT | MCA_BTL_ATOMIC_SUPPORTS_FLOAT;
}
mca_btl_ugni_module.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
mca_btl_ugni_module.super.btl_bandwidth = 40000; /* Mbs */
mca_btl_ugni_module.super.btl_latency = 2; /* Microsecs */
mca_btl_ugni_module.super.btl_get_local_registration_threshold = 0;
mca_btl_ugni_module.super.btl_put_local_registration_threshold = mca_btl_ugni_component.ugni_fma_put_limit;
/* Call the BTL based to register its MCA params */
mca_btl_base_param_register(&mca_btl_ugni_component.super.btl_version,
&mca_btl_ugni_module.super);
return OPAL_SUCCESS;
}
static int
btl_ugni_component_open(void)
{
mca_btl_ugni_component.ugni_num_btls = 0;
mca_btl_ugni_component.modules = NULL;
return OPAL_SUCCESS;
}
/*
* component cleanup - sanity checking of queue lengths
*/
static int
btl_ugni_component_close(void)
{
mca_btl_ugni_fini ();
free (mca_btl_ugni_component.modules);
mca_btl_ugni_component.modules = NULL;
return OPAL_SUCCESS;
}
static mca_btl_base_module_t **
mca_btl_ugni_component_init (int *num_btl_modules,
bool enable_progress_threads,
bool enable_mpi_threads)
{
struct mca_btl_base_module_t **base_modules;
mca_btl_ugni_module_t *ugni_modules;
int rc;
if (16384 < mca_btl_ugni_component.ugni_smsg_limit) {
mca_btl_ugni_component.ugni_smsg_limit = 16384;
}
if (65536 < mca_btl_ugni_component.ugni_fma_limit) {
mca_btl_ugni_component.ugni_fma_limit = 65536;
}
if (-1 != mca_btl_ugni_component.ugni_fma_limit) {
mca_btl_ugni_component.ugni_fma_get_limit = mca_btl_ugni_component.ugni_fma_limit;
} else if (65536 < mca_btl_ugni_component.ugni_fma_get_limit) {
mca_btl_ugni_component.ugni_fma_get_limit = 65536;
}
if (-1 != mca_btl_ugni_component.ugni_fma_limit) {
mca_btl_ugni_component.ugni_fma_put_limit = mca_btl_ugni_component.ugni_fma_limit;
} else if (65536 < mca_btl_ugni_component.ugni_fma_put_limit) {
mca_btl_ugni_component.ugni_fma_put_limit = 65536;
}
mca_btl_ugni_module.super.btl_put_local_registration_threshold = mca_btl_ugni_component.ugni_fma_put_limit;
/* limit the number of outstanding RDMA operations over all devices */
mca_btl_ugni_component.active_rdma_threshold = mca_btl_ugni_component.local_rdma_cq_size;
if (enable_mpi_threads && mca_btl_ugni_component.progress_thread_requested) {
mca_btl_ugni_component.progress_thread_enabled = 1;
}
/* Initialize ugni library and create communication domain */
rc = mca_btl_ugni_init();
if (OPAL_SUCCESS != rc) {
return NULL;
}
/* For now only create a single BTL module */
mca_btl_ugni_component.ugni_num_btls = 1;
BTL_VERBOSE(("btl/ugni initializing"));
ugni_modules = mca_btl_ugni_component.modules = (mca_btl_ugni_module_t *)
calloc (mca_btl_ugni_component.ugni_num_btls, sizeof (mca_btl_ugni_module_t));
if (OPAL_UNLIKELY(NULL == mca_btl_ugni_component.modules)) {
BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
return NULL;
}
base_modules = (struct mca_btl_base_module_t **)
calloc (mca_btl_ugni_component.ugni_num_btls,
sizeof (struct mca_btl_base_module_t *));
if (OPAL_UNLIKELY(NULL == base_modules)) {
BTL_ERROR(("Malloc failed : %s:%d", __FILE__, __LINE__));
return NULL;
}
if (mca_btl_ugni_component.smsg_page_size != (unsigned long) opal_getpagesize ()) {
if (mca_btl_ugni_ugni_page_size > mca_btl_ugni_component.smsg_page_size) {
mca_btl_ugni_component.smsg_page_size = mca_btl_ugni_ugni_page_size;
}
}
mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = mca_btl_ugni_module.super.btl_eager_limit;
rc = mca_btl_ugni_module_init (ugni_modules);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
BTL_ERROR(("Failed to initialize uGNI module @ %s:%d", __FILE__,
__LINE__));
return NULL;
}
*base_modules = (mca_btl_base_module_t *) ugni_modules;
*num_btl_modules = mca_btl_ugni_component.ugni_num_btls;
BTL_VERBOSE(("btl/ugni done initializing %d module(s)", *num_btl_modules));
return base_modules;
}
int mca_btl_ugni_progress_datagram (mca_btl_ugni_device_t *device)
{
mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_component.modules;
mca_btl_base_endpoint_t *ep = NULL;
gni_ep_handle_t handle;
int count = 0, rc;
rc = mca_btl_ugni_get_datagram (ugni_module, device, &handle, &ep);
if (1 != rc) {
return rc;
}
BTL_VERBOSE(("remote datagram completion on handle %p", (void*)handle));
/* if this is a wildcard endpoint lookup the remote peer by the proc id we received */
if (handle == ugni_module->wildcard_ep) {
struct opal_proc_t *remote_proc = opal_proc_for_name (ugni_module->wc_remote_attr.proc_name);
BTL_VERBOSE(("received connection attempt on wildcard endpoint from proc: %s",
OPAL_NAME_PRINT(ugni_module->wc_remote_attr.proc_name)));
ep = mca_btl_ugni_get_ep (&ugni_module->super, remote_proc);
if (OPAL_UNLIKELY(NULL == ep)) {
/* there is no way to recover from this error so just abort() */
BTL_ERROR(("could not find/allocate a btl endpoint for peer %s",
OPAL_NAME_PRINT(ugni_module->wc_remote_attr.proc_name)));
abort ();
return OPAL_ERR_NOT_FOUND;
}
}
/* should not have gotten a NULL endpoint */
assert (NULL != ep);
BTL_VERBOSE(("got a datagram completion: ep = %p. wc = %d", (void *) ep, handle == ugni_module->wildcard_ep));
/* NTH: TODO -- error handling */
opal_mutex_lock (&ep->lock);
if (handle != ugni_module->wildcard_ep) {
/* directed post complete */
BTL_VERBOSE(("directed datagram complete for endpoint %p", (void *) ep));
ep->dg_posted = false;
(void) opal_atomic_add_fetch_32 (&ugni_module->active_datagrams, -1);
}
(void) mca_btl_ugni_ep_connect_progress (ep);
opal_mutex_unlock (&ep->lock);
if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state) {
/* process messages waiting in the endpoint's smsg mailbox */
count = mca_btl_ugni_smsg_process (ep);
}
/* repost the wildcard datagram */
if (handle == ugni_module->wildcard_ep) {
mca_btl_ugni_wildcard_ep_post (ugni_module);
}
return count;
}
void mca_btl_ugni_handle_rdma_completions (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device,
struct mca_btl_ugni_post_descriptor_t *post_desc, const int count)
{
int bte_complete = 0;
for (int i = 0 ; i < count ; ++i) {
BTL_VERBOSE(("post descriptor complete. status: %d", post_desc[i].rc));
if (OPAL_UNLIKELY(OPAL_SUCCESS != post_desc[i].rc)) {
/* dump the post descriptor if in a debug build */
btl_ugni_dump_post_desc (post_desc + i);
}
bte_complete += post_desc[i].use_bte == true;
mca_btl_ugni_post_desc_complete (ugni_module, post_desc + i, post_desc[i].rc);
}
if (bte_complete > 0) {
(void) OPAL_THREAD_FETCH_ADD32 (&ugni_module->active_rdma_count, -bte_complete);
}
}
static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device,
mca_btl_ugni_cq_t *cq)
{
mca_btl_ugni_post_descriptor_t post_desc[MCA_BTL_UGNI_COMPLETIONS_PER_LOOP];
int rc;
rc = mca_btl_ugni_cq_get_completed_desc (device, cq, post_desc, MCA_BTL_UGNI_COMPLETIONS_PER_LOOP);
if (0 >= rc) {
return rc;
}
BTL_VERBOSE(("got %d completed rdma descriptors", rc));
mca_btl_ugni_handle_rdma_completions (ugni_module, device, post_desc, rc);
return rc;
}
static inline int
mca_btl_ugni_progress_wait_list (mca_btl_ugni_module_t *ugni_module)
{
int rc = OPAL_SUCCESS;
mca_btl_base_endpoint_t *endpoint = NULL;
int count;
if (0 == opal_list_get_size(&ugni_module->ep_wait_list)) {
return 0;
}
/* check the count before taking the lock to avoid unnecessary locking */
count = opal_list_get_size(&ugni_module->ep_wait_list);
if (0 == count) {
return 0;
}
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
count = opal_list_get_size(&ugni_module->ep_wait_list);
do {
endpoint = (mca_btl_base_endpoint_t *) opal_list_remove_first (&ugni_module->ep_wait_list);
if (endpoint != NULL) {
rc = mca_btl_ugni_progress_send_wait_list (endpoint);
if (OPAL_SUCCESS != rc) {
opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
} else {
endpoint->wait_listed = false;
}
}
} while (endpoint != NULL && --count > 0) ;
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
return rc;
}
static int mca_btl_ugni_component_progress (void)
{
mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_component.modules;
int count = 0;
count += mca_btl_ugni_progress_remote_smsg (ugni_module);
if (ugni_module->active_datagrams) {
count += mca_btl_ugni_progress_datagram (ugni_module->devices);
}
for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
mca_btl_ugni_device_t *device = ugni_module->devices + i;
if (device->smsg_connections) {
count += mca_btl_ugni_progress_local_smsg (ugni_module, device);
mca_btl_ugni_progress_wait_list (ugni_module);
}
if (device->dev_rdma_local_cq.active_operations) {
count += mca_btl_ugni_progress_rdma (ugni_module, device, &device->dev_rdma_local_cq);
}
if (mca_btl_ugni_component.progress_thread_enabled && device->dev_rdma_local_irq_cq.active_operations) {
count += mca_btl_ugni_progress_rdma (ugni_module, device, &device->dev_rdma_local_irq_cq);
}
}
return count;
}
int mca_btl_ugni_flush (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint)
{
mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_component.modules;
for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
mca_btl_ugni_device_t *device = ugni_module->devices + i;
/* spin on progress until all active operations are complete. it is tempting to
* take an initial count then wait until that many operations have been completed
* but it is impossible to tell if those are the operations the caller is waiting
* on. */
while (device->dev_rdma_local_cq.active_operations) {
(void) mca_btl_ugni_progress_rdma (ugni_module, device, &device->dev_rdma_local_cq);
}
/* mark that the device was recently flushed */
device->flushed = true;
}
return OPAL_SUCCESS;
}
void btl_ugni_dump_post_desc (mca_btl_ugni_post_descriptor_t *desc)
{
fprintf (stderr, "desc->gni_desc.post_id = %" PRIx64 "\n", desc->gni_desc.post_id);
fprintf (stderr, "desc->gni_desc.status = %" PRIx64 "\n", desc->gni_desc.status);
fprintf (stderr, "desc->gni_desc.cq_mode_complete = %hu\n", desc->gni_desc.cq_mode_complete);
fprintf (stderr, "desc->gni_desc.type = %d\n", desc->gni_desc.type);
fprintf (stderr, "desc->gni_desc.cq_mode = %hu\n", desc->gni_desc.cq_mode);
fprintf (stderr, "desc->gni_desc.dlvr_mode = %hu\n", desc->gni_desc.dlvr_mode);
fprintf (stderr, "desc->gni_desc.local_addr = %" PRIx64 "\n", desc->gni_desc.local_addr);
fprintf (stderr, "desc->gni_desc.local_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->gni_desc.local_mem_hndl.qword1,
desc->gni_desc.local_mem_hndl.qword2);
fprintf (stderr, "desc->gni_desc.remote_addr = %" PRIx64 "\n", desc->gni_desc.remote_addr);
fprintf (stderr, "desc->gni_desc.remote_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->gni_desc.remote_mem_hndl.qword1,
desc->gni_desc.remote_mem_hndl.qword2);
fprintf (stderr, "desc->gni_desc.length = %" PRIu64 "\n", desc->gni_desc.length);
fprintf (stderr, "desc->gni_desc.rdma_mode = %hu\n", desc->gni_desc.rdma_mode);
fprintf (stderr, "desc->gni_desc.amo_cmd = %d\n", desc->gni_desc.amo_cmd);
}