08f9ae97ee
Signed-off-by: Nathan T. Weeks <weeks@iastate.edu>
770 строки
37 KiB
C
770 строки
37 KiB
C
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
/*
|
|
* Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights
|
|
* reserved.
|
|
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
|
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "btl_ugni.h"
|
|
#include "btl_ugni_frag.h"
|
|
#include "btl_ugni_rdma.h"
|
|
#include "btl_ugni_smsg.h"
|
|
|
|
#include "opal/util/sys_limits.h"
|
|
|
|
#include <stdlib.h>
|
|
#include <fcntl.h>
|
|
#include <ctype.h>
|
|
|
|
#include "opal/memoryhooks/memory.h"
|
|
#include "opal/runtime/opal_params.h"
|
|
|
|
#include "opal/mca/base/mca_base_pvar.h"
|
|
|
|
static int btl_ugni_component_register(void);
|
|
static int btl_ugni_component_open(void);
|
|
static int btl_ugni_component_close(void);
|
|
static mca_btl_base_module_t **mca_btl_ugni_component_init(int *, bool, bool);
|
|
static int mca_btl_ugni_component_progress(void);
|
|
static unsigned long mca_btl_ugni_ugni_page_size = 0;
|
|
|
|
mca_btl_ugni_component_t mca_btl_ugni_component = {
|
|
.super = {
|
|
/* First, the mca_base_component_t struct containing meta information
|
|
about the component itself */
|
|
.btl_version = {
|
|
MCA_BTL_DEFAULT_VERSION("ugni"),
|
|
.mca_open_component = btl_ugni_component_open,
|
|
.mca_close_component = btl_ugni_component_close,
|
|
.mca_register_component_params = btl_ugni_component_register,
|
|
},
|
|
.btl_data = {
|
|
.param_field = MCA_BASE_METADATA_PARAM_CHECKPOINT
|
|
},
|
|
.btl_init = mca_btl_ugni_component_init,
|
|
.btl_progress = mca_btl_ugni_component_progress,
|
|
}
|
|
};
|
|
|
|
mca_base_var_enum_value_t rcache_values[] = {
|
|
{MCA_BTL_UGNI_RCACHE_UDREG, "udreg"},
|
|
{MCA_BTL_UGNI_RCACHE_GRDMA, "grdma"},
|
|
{-1, NULL} /* sentinal */
|
|
};
|
|
|
|
mca_base_var_enum_value_flag_t cdm_flags[] = {
|
|
{.flag = GNI_CDM_MODE_FORK_NOCOPY, .string = "fork-no-copy", .conflicting_flag = GNI_CDM_MODE_FORK_FULLCOPY | GNI_CDM_MODE_FORK_PARTCOPY},
|
|
{.flag = GNI_CDM_MODE_FORK_FULLCOPY, .string = "fork-full-copy", .conflicting_flag = GNI_CDM_MODE_FORK_NOCOPY | GNI_CDM_MODE_FORK_PARTCOPY},
|
|
{.flag = GNI_CDM_MODE_FORK_PARTCOPY, .string = "fork-part-copy", .conflicting_flag = GNI_CDM_MODE_FORK_NOCOPY | GNI_CDM_MODE_FORK_FULLCOPY},
|
|
{.flag = GNI_CDM_MODE_ERR_NO_KILL, .string = "err-no-kill", .conflicting_flag = GNI_CDM_MODE_ERR_ALL_KILL},
|
|
{.flag = GNI_CDM_MODE_ERR_ALL_KILL, .string = "err-all-kill", .conflicting_flag = GNI_CDM_MODE_ERR_NO_KILL},
|
|
{.flag = GNI_CDM_MODE_FAST_DATAGRAM_POLL, .string = "fast-datagram-poll", .conflicting_flag = 0},
|
|
{.flag = GNI_CDM_MODE_BTE_SINGLE_CHANNEL, .string = "bte-single-channel", .conflicting_flag = 0},
|
|
{.flag = GNI_CDM_MODE_USE_PCI_IOMMU, .string = "use-pci-iommu", .conflicting_flag = 0},
|
|
{.flag = GNI_CDM_MODE_MDD_DEDICATED, .string = "mdd-dedicated", .conflicting_flag = GNI_CDM_MODE_MDD_SHARED},
|
|
{.flag = GNI_CDM_MODE_MDD_SHARED, .string = "mdd-shared", .conflicting_flag = GNI_CDM_MODE_MDD_DEDICATED},
|
|
{.flag = GNI_CDM_MODE_FMA_DEDICATED, .string = "fma-dedicated", .conflicting_flag = GNI_CDM_MODE_FMA_SHARED},
|
|
{.flag = GNI_CDM_MODE_FMA_SHARED, .string = "fma-shared", .conflicting_flag = GNI_CDM_MODE_FMA_DEDICATED},
|
|
{.flag = GNI_CDM_MODE_CACHED_AMO_ENABLED, .string = "cached-amo-enabled", .conflicting_flag = 0},
|
|
{.flag = GNI_CDM_MODE_CQ_NIC_LOCAL_PLACEMENT, .string = "cq-nic-placement", .conflicting_flag = 0},
|
|
{.flag = GNI_CDM_MODE_FMA_SMALL_WINDOW, .string = "fma-small-window", .conflicting_flag = 0},
|
|
{.string = NULL}
|
|
};
|
|
|
|
static inline int mca_btl_ugni_get_stat (const mca_base_pvar_t *pvar, void *value, void *obj)
|
|
{
|
|
gni_statistic_t statistic = (gni_statistic_t) (intptr_t) pvar->ctx;
|
|
gni_return_t rc = GNI_RC_SUCCESS;
|
|
|
|
for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
|
|
rc = GNI_GetNicStat (mca_btl_ugni_component.modules[0].devices[i].dev_handle, statistic,
|
|
((unsigned int *) value) + i);
|
|
}
|
|
|
|
return mca_btl_rc_ugni_to_opal (rc);
|
|
}
|
|
|
|
static inline int mca_btl_ugni_notify_stat (mca_base_pvar_t *pvar, mca_base_pvar_event_t event, void *obj, int *count)
|
|
{
|
|
if (MCA_BASE_PVAR_HANDLE_BIND == event) {
|
|
/* one value for each virtual device handle */
|
|
*count = mca_btl_ugni_component.virtual_device_count;
|
|
}
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
static int btl_ugni_component_register(void)
|
|
{
|
|
mca_base_var_enum_t *new_enum;
|
|
gni_nic_device_t device_type;
|
|
char *mpool_hints_tmp = NULL;
|
|
int rc;
|
|
|
|
(void) mca_base_var_group_component_register(&mca_btl_ugni_component.super.btl_version,
|
|
"uGNI byte transport layer");
|
|
|
|
mca_btl_ugni_component.ugni_free_list_num = 8;
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"free_list_num", NULL, MCA_BASE_VAR_TYPE_INT,
|
|
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
|
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_ugni_component.ugni_free_list_num);
|
|
mca_btl_ugni_component.ugni_free_list_max = 4096;
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"free_list_max", NULL, MCA_BASE_VAR_TYPE_INT,
|
|
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
|
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_ugni_component.ugni_free_list_max);
|
|
mca_btl_ugni_component.ugni_free_list_inc = 64;
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"free_list_inc", NULL, MCA_BASE_VAR_TYPE_INT,
|
|
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
|
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_ugni_component.ugni_free_list_inc);
|
|
|
|
mca_btl_ugni_component.ugni_eager_num = 16;
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"eager_num", NULL, MCA_BASE_VAR_TYPE_INT,
|
|
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
|
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_ugni_component.ugni_eager_num);
|
|
mca_btl_ugni_component.ugni_eager_max = 128;
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"eager_max", NULL, MCA_BASE_VAR_TYPE_INT,
|
|
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
|
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_ugni_component.ugni_eager_max);
|
|
mca_btl_ugni_component.ugni_eager_inc = 16;
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"eager_inc", NULL, MCA_BASE_VAR_TYPE_INT,
|
|
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
|
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_ugni_component.ugni_eager_inc);
|
|
|
|
mca_btl_ugni_component.remote_cq_size = 40000;
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"remote_cq_size", "Remote SMSG completion queue "
|
|
"size (default 40000)", MCA_BASE_VAR_TYPE_INT,
|
|
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
|
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_ugni_component.remote_cq_size);
|
|
|
|
mca_btl_ugni_component.local_cq_size = 8192;
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"local_cq_size", "Local SMSG completion queue size "
|
|
"(default 8k)", MCA_BASE_VAR_TYPE_INT,
|
|
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
|
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_ugni_component.local_cq_size);
|
|
|
|
mca_btl_ugni_component.local_rdma_cq_size = 1024;
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"local_rdma_cq_size", "Local FMA/RDMA completion queue size "
|
|
"(default: 1024)",MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
|
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
|
|
MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_ugni_component.local_rdma_cq_size);
|
|
|
|
mca_btl_ugni_component.ugni_smsg_limit = 0;
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"smsg_limit", "Maximum size message that "
|
|
"will be sent using the SMSG/MSGQ protocol "
|
|
"(0 - autoselect(default), 16k max)",
|
|
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
|
MCA_BASE_VAR_FLAG_SETTABLE,
|
|
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_ugni_component.ugni_smsg_limit);
|
|
|
|
mca_btl_ugni_component.smsg_max_credits = 32;
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"smsg_max_credits", "Maximum number of "
|
|
"outstanding SMSG/MSGQ message (default 32)",
|
|
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
|
MCA_BASE_VAR_FLAG_SETTABLE,
|
|
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_ugni_component.smsg_max_credits);
|
|
|
|
#if OPAL_C_HAVE__THREAD_LOCAL
|
|
mca_btl_ugni_component.bind_threads_to_devices = true;
|
|
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"bind_devices", "Bind threads to virtual "
|
|
"devices. In general this should improve "
|
|
"RDMA performance (default: true)",
|
|
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
|
|
MCA_BASE_VAR_FLAG_SETTABLE,
|
|
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_ugni_component.bind_threads_to_devices);
|
|
#endif
|
|
|
|
mca_btl_ugni_component.ugni_fma_limit = -1;
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"fma_limit", "Default maximum size message that "
|
|
"will be sent using the FMA (Fast Memory "
|
|
"Access) protocol (default: -1 (don't use), 64k max)",
|
|
MCA_BASE_VAR_TYPE_LONG, NULL, 0,
|
|
MCA_BASE_VAR_FLAG_SETTABLE | MCA_BASE_VAR_FLAG_DEPRECATED,
|
|
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_ugni_component.ugni_fma_limit);
|
|
|
|
mca_btl_ugni_component.ugni_fma_get_limit = 2048;
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"fma_get_limit", "Maximum size message that "
|
|
"will be sent using the FMA (Fast Memory "
|
|
"Access) protocol for get (default 2k, "
|
|
"64k max)",
|
|
MCA_BASE_VAR_TYPE_LONG, NULL, 0,
|
|
MCA_BASE_VAR_FLAG_SETTABLE,
|
|
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_ugni_component.ugni_fma_get_limit);
|
|
|
|
mca_btl_ugni_component.ugni_fma_put_limit = 4096;
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"fma_put_limit", "Maximum size message that "
|
|
"will be sent using the FMA (Fast Memory "
|
|
"Access) protocol for put (default: 4k, "
|
|
"64k max)",
|
|
MCA_BASE_VAR_TYPE_LONG, NULL, 0,
|
|
MCA_BASE_VAR_FLAG_SETTABLE,
|
|
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_ugni_component.ugni_fma_put_limit);
|
|
|
|
mca_btl_ugni_component.rdma_max_retries = 16;
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"rdma_max_retries", NULL, MCA_BASE_VAR_TYPE_INT,
|
|
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
|
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_ugni_component.rdma_max_retries);
|
|
|
|
mca_btl_ugni_component.smsg_max_retries = 16;
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"smsg_max_retries", NULL, MCA_BASE_VAR_TYPE_INT,
|
|
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
|
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_ugni_component.smsg_max_retries);
|
|
|
|
mca_btl_ugni_component.max_mem_reg = 0;
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"max_mem_reg", "Maximum number of "
|
|
"memory registrations a process can "
|
|
"hold (0 - autoselect, -1 - unlimited)"
|
|
" (default 0)", MCA_BASE_VAR_TYPE_INT,
|
|
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
|
OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_ugni_component.max_mem_reg);
|
|
|
|
mca_btl_ugni_component.mbox_increment = 0;
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"mbox_inc", "Number of SMSG mailboxes to "
|
|
"allocate in each block (0 - autoselect(default))",
|
|
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0,
|
|
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
|
|
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.mbox_increment);
|
|
|
|
/* communication domain flags */
|
|
rc = mca_base_var_enum_create_flag ("btl_ugni_cdm_flags", cdm_flags, (mca_base_var_enum_flag_t **) &new_enum);
|
|
if (OPAL_SUCCESS != rc) {
|
|
return rc;
|
|
}
|
|
|
|
mca_btl_ugni_component.cdm_flags = GNI_CDM_MODE_FORK_PARTCOPY | GNI_CDM_MODE_ERR_NO_KILL | GNI_CDM_MODE_FAST_DATAGRAM_POLL |
|
|
GNI_CDM_MODE_MDD_SHARED | GNI_CDM_MODE_FMA_SHARED | GNI_CDM_MODE_FMA_SMALL_WINDOW;
|
|
mca_btl_ugni_component.cdm_flags_id = mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"cdm_flags", "Flags to set when creating a communication domain "
|
|
" (default: fork-full-copy,cached-amo-enabled,err-no-kill,fast-datagram-poll,"
|
|
"fma-shared,fma-small-window)",
|
|
MCA_BASE_VAR_TYPE_UNSIGNED_INT, new_enum, 0,
|
|
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
|
|
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.cdm_flags);
|
|
OBJ_RELEASE(new_enum);
|
|
|
|
mca_btl_ugni_component.virtual_device_count = 0;
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"virtual_device_count", "Number of virtual devices to create. Higher numbers may "
|
|
"result in better performance when using threads. (default: 0 (auto), max: 128)",
|
|
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0,
|
|
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
|
|
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.virtual_device_count);
|
|
|
|
/* determine if there are get alignment restrictions */
|
|
GNI_GetDeviceType (&device_type);
|
|
|
|
|
|
mca_btl_ugni_component.smsg_page_size = 2 << 20;
|
|
if (GNI_DEVICE_GEMINI == device_type) {
|
|
if (access ("/sys/class/gemini/ghal0/mrt", R_OK)) {
|
|
int fd = open ("/sys/class/gemini/ghal0/mrt", O_RDONLY);
|
|
char buffer[10];
|
|
|
|
if (0 <= fd) {
|
|
memset (buffer, 0, sizeof (buffer));
|
|
read (fd, buffer, sizeof (buffer) - 1);
|
|
close (fd);
|
|
mca_btl_ugni_ugni_page_size = strtol (buffer, NULL, 10) * 1024;
|
|
mca_btl_ugni_component.smsg_page_size = mca_btl_ugni_ugni_page_size;
|
|
}
|
|
}
|
|
}
|
|
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"smsg_page_size", "Page size to use for SMSG mailbox allocation (default: detect)",
|
|
MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
|
|
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.smsg_page_size);
|
|
|
|
mca_btl_ugni_component.progress_thread_requested = 0;
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"request_progress_thread",
|
|
"Enable to request ugni btl progress thread - requires MPI_THREAD_MULTIPLE support",
|
|
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
|
|
MCA_BASE_VAR_FLAG_SETTABLE,
|
|
OPAL_INFO_LVL_3,
|
|
MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_ugni_component.progress_thread_requested);
|
|
|
|
/* performance variables */
|
|
mca_btl_ugni_progress_thread_wakeups = 0;
|
|
(void) mca_base_component_pvar_register(&mca_btl_ugni_component.super.btl_version,
|
|
"progress_thread_wakeups", "Number of times the progress thread "
|
|
"has been woken", OPAL_INFO_LVL_9, MCA_BASE_PVAR_CLASS_COUNTER,
|
|
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, MCA_BASE_VAR_BIND_NO_OBJECT,
|
|
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS, NULL,
|
|
NULL, NULL, &mca_btl_ugni_progress_thread_wakeups);
|
|
|
|
/* register network statistics as performance variables */
|
|
for (int i = 0 ; i < GNI_NUM_STATS ; ++i) {
|
|
char name[128], desc[128];
|
|
size_t str_len = strlen (gni_statistic_str[i]);
|
|
|
|
assert (str_len < sizeof (name));
|
|
|
|
/* we can get an all-caps string for the variable from gni_statistic_str. need to make it lowercase
|
|
* to match ompi standards */
|
|
for (size_t j = 0 ; j < str_len ; ++j) {
|
|
name[j] = tolower (gni_statistic_str[i][j]);
|
|
desc[j] = ('_' == name[j]) ? ' ' : name[j];
|
|
}
|
|
|
|
name[str_len] = '\0';
|
|
desc[str_len] = '\0';
|
|
|
|
(void) mca_base_component_pvar_register (&mca_btl_ugni_component.super.btl_version, name, desc,
|
|
OPAL_INFO_LVL_4, MCA_BASE_PVAR_CLASS_COUNTER,
|
|
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, MCA_BASE_VAR_BIND_NO_OBJECT,
|
|
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS,
|
|
mca_btl_ugni_get_stat, NULL, mca_btl_ugni_notify_stat,
|
|
(void *) (intptr_t) i);
|
|
}
|
|
|
|
/* btl/ugni can only support only a fixed set of rcache components (these rcache components have compatible resource
|
|
* structures) */
|
|
rc = mca_base_var_enum_create ("btl_ugni_rcache", rcache_values, &new_enum);
|
|
if (OPAL_SUCCESS != rc) {
|
|
return rc;
|
|
}
|
|
|
|
/* NTH: there are known *serious* performance issues with udreg. if they are ever resolved it is the preferred rcache */
|
|
mca_btl_ugni_component.rcache_type = MCA_BTL_UGNI_RCACHE_GRDMA;
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"rcache", "registration cache to use (default: grdma)", MCA_BASE_VAR_TYPE_INT, new_enum,
|
|
0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
|
|
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.rcache_type);
|
|
OBJ_RELEASE(new_enum);
|
|
|
|
if (mca_btl_ugni_ugni_page_size) {
|
|
rc = asprintf (&mpool_hints_tmp, "page_size=%lu", mca_btl_ugni_ugni_page_size);
|
|
if (rc < 0) {
|
|
return OPAL_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
|
|
mca_btl_ugni_component.mpool_hints = mpool_hints_tmp;
|
|
} else {
|
|
mca_btl_ugni_component.mpool_hints = "page_size=2M";
|
|
}
|
|
|
|
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
|
"mpool_hints", "hints to use when selecting a memory pool (default: "
|
|
"\"page_size=2M\")", MCA_BASE_VAR_TYPE_STRING, NULL, 0,
|
|
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
|
|
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.mpool_hints);
|
|
free (mpool_hints_tmp);
|
|
|
|
/* ensure we loose send exclusivity to sm and vader if they are enabled */
|
|
mca_btl_ugni_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 2;
|
|
|
|
/* smsg threshold */
|
|
mca_btl_ugni_module.super.btl_eager_limit = 8 * 1024;
|
|
mca_btl_ugni_module.super.btl_rndv_eager_limit = 8 * 1024;
|
|
mca_btl_ugni_module.super.btl_rdma_pipeline_frag_size = 4 * 1024 * 1024;
|
|
mca_btl_ugni_module.super.btl_max_send_size = 8 * 1024;
|
|
mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = 8 * 1024;
|
|
|
|
mca_btl_ugni_module.super.btl_get_limit = 1 * 1024 * 1024;
|
|
|
|
/*
|
|
* see def. of ALIGNMENT_MASK to figure this one out
|
|
*/
|
|
/* both gemini and aries have a 4-byte alignment requirement on remote addresses */
|
|
mca_btl_ugni_module.super.btl_get_alignment = 4;
|
|
|
|
/* threshold for put */
|
|
mca_btl_ugni_module.super.btl_min_rdma_pipeline_size = 8 * 1024;
|
|
|
|
mca_btl_ugni_module.super.btl_flags = MCA_BTL_FLAGS_SEND |
|
|
MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_ATOMIC_OPS |
|
|
MCA_BTL_FLAGS_ATOMIC_FOPS;
|
|
mca_btl_ugni_module.super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD |
|
|
MCA_BTL_ATOMIC_SUPPORTS_AND | MCA_BTL_ATOMIC_SUPPORTS_OR | MCA_BTL_ATOMIC_SUPPORTS_XOR |
|
|
MCA_BTL_ATOMIC_SUPPORTS_CSWAP;
|
|
|
|
if (GNI_DEVICE_ARIES == device_type) {
|
|
/* aries supports additional atomic operations */
|
|
mca_btl_ugni_module.super.btl_atomic_flags |= MCA_BTL_ATOMIC_SUPPORTS_MIN | MCA_BTL_ATOMIC_SUPPORTS_MAX |
|
|
MCA_BTL_ATOMIC_SUPPORTS_LAND | MCA_BTL_ATOMIC_SUPPORTS_LOR | MCA_BTL_ATOMIC_SUPPORTS_LXOR |
|
|
MCA_BTL_ATOMIC_SUPPORTS_32BIT | MCA_BTL_ATOMIC_SUPPORTS_FLOAT;
|
|
}
|
|
|
|
mca_btl_ugni_module.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
|
|
|
|
mca_btl_ugni_module.super.btl_bandwidth = 40000; /* Mbs */
|
|
mca_btl_ugni_module.super.btl_latency = 2; /* Microsecs */
|
|
|
|
mca_btl_ugni_module.super.btl_get_local_registration_threshold = 0;
|
|
mca_btl_ugni_module.super.btl_put_local_registration_threshold = mca_btl_ugni_component.ugni_fma_put_limit;
|
|
|
|
/* Call the BTL based to register its MCA params */
|
|
mca_btl_base_param_register(&mca_btl_ugni_component.super.btl_version,
|
|
&mca_btl_ugni_module.super);
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
static int
|
|
btl_ugni_component_open(void)
|
|
{
|
|
mca_btl_ugni_component.ugni_num_btls = 0;
|
|
mca_btl_ugni_component.modules = NULL;
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
/*
|
|
* component cleanup - sanity checking of queue lengths
|
|
*/
|
|
static int
|
|
btl_ugni_component_close(void)
|
|
{
|
|
mca_btl_ugni_fini ();
|
|
|
|
free (mca_btl_ugni_component.modules);
|
|
mca_btl_ugni_component.modules = NULL;
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
static mca_btl_base_module_t **
|
|
mca_btl_ugni_component_init (int *num_btl_modules,
|
|
bool enable_progress_threads,
|
|
bool enable_mpi_threads)
|
|
{
|
|
struct mca_btl_base_module_t **base_modules;
|
|
mca_btl_ugni_module_t *ugni_modules;
|
|
int rc;
|
|
|
|
if (16384 < mca_btl_ugni_component.ugni_smsg_limit) {
|
|
mca_btl_ugni_component.ugni_smsg_limit = 16384;
|
|
}
|
|
|
|
if (65536 < mca_btl_ugni_component.ugni_fma_limit) {
|
|
mca_btl_ugni_component.ugni_fma_limit = 65536;
|
|
}
|
|
|
|
if (-1 != mca_btl_ugni_component.ugni_fma_limit) {
|
|
mca_btl_ugni_component.ugni_fma_get_limit = mca_btl_ugni_component.ugni_fma_limit;
|
|
} else if (65536 < mca_btl_ugni_component.ugni_fma_get_limit) {
|
|
mca_btl_ugni_component.ugni_fma_get_limit = 65536;
|
|
}
|
|
|
|
if (-1 != mca_btl_ugni_component.ugni_fma_limit) {
|
|
mca_btl_ugni_component.ugni_fma_put_limit = mca_btl_ugni_component.ugni_fma_limit;
|
|
} else if (65536 < mca_btl_ugni_component.ugni_fma_put_limit) {
|
|
mca_btl_ugni_component.ugni_fma_put_limit = 65536;
|
|
}
|
|
|
|
mca_btl_ugni_module.super.btl_put_local_registration_threshold = mca_btl_ugni_component.ugni_fma_put_limit;
|
|
|
|
/* limit the number of outstanding RDMA operations over all devices */
|
|
mca_btl_ugni_component.active_rdma_threshold = mca_btl_ugni_component.local_rdma_cq_size;
|
|
|
|
if (enable_mpi_threads && mca_btl_ugni_component.progress_thread_requested) {
|
|
mca_btl_ugni_component.progress_thread_enabled = 1;
|
|
}
|
|
|
|
/* Initialize ugni library and create communication domain */
|
|
rc = mca_btl_ugni_init();
|
|
if (OPAL_SUCCESS != rc) {
|
|
return NULL;
|
|
}
|
|
|
|
/* For now only create a single BTL module */
|
|
mca_btl_ugni_component.ugni_num_btls = 1;
|
|
|
|
BTL_VERBOSE(("btl/ugni initializing"));
|
|
|
|
ugni_modules = mca_btl_ugni_component.modules = (mca_btl_ugni_module_t *)
|
|
calloc (mca_btl_ugni_component.ugni_num_btls, sizeof (mca_btl_ugni_module_t));
|
|
|
|
if (OPAL_UNLIKELY(NULL == mca_btl_ugni_component.modules)) {
|
|
BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
|
|
return NULL;
|
|
}
|
|
|
|
base_modules = (struct mca_btl_base_module_t **)
|
|
calloc (mca_btl_ugni_component.ugni_num_btls,
|
|
sizeof (struct mca_btl_base_module_t *));
|
|
if (OPAL_UNLIKELY(NULL == base_modules)) {
|
|
BTL_ERROR(("Malloc failed : %s:%d", __FILE__, __LINE__));
|
|
return NULL;
|
|
}
|
|
|
|
if (mca_btl_ugni_component.smsg_page_size != (unsigned long) opal_getpagesize ()) {
|
|
if (mca_btl_ugni_ugni_page_size > mca_btl_ugni_component.smsg_page_size) {
|
|
mca_btl_ugni_component.smsg_page_size = mca_btl_ugni_ugni_page_size;
|
|
}
|
|
}
|
|
|
|
mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = mca_btl_ugni_module.super.btl_eager_limit;
|
|
|
|
rc = mca_btl_ugni_module_init (ugni_modules);
|
|
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
|
BTL_ERROR(("Failed to initialize uGNI module @ %s:%d", __FILE__,
|
|
__LINE__));
|
|
return NULL;
|
|
}
|
|
|
|
*base_modules = (mca_btl_base_module_t *) ugni_modules;
|
|
|
|
*num_btl_modules = mca_btl_ugni_component.ugni_num_btls;
|
|
|
|
BTL_VERBOSE(("btl/ugni done initializing %d module(s)", *num_btl_modules));
|
|
|
|
return base_modules;
|
|
}
|
|
|
|
int mca_btl_ugni_progress_datagram (mca_btl_ugni_device_t *device)
|
|
{
|
|
mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_component.modules;
|
|
mca_btl_base_endpoint_t *ep = NULL;
|
|
gni_ep_handle_t handle;
|
|
int count = 0, rc;
|
|
|
|
rc = mca_btl_ugni_get_datagram (ugni_module, device, &handle, &ep);
|
|
if (1 != rc) {
|
|
return rc;
|
|
}
|
|
|
|
BTL_VERBOSE(("remote datagram completion on handle %p", (void*)handle));
|
|
|
|
/* if this is a wildcard endpoint lookup the remote peer by the proc id we received */
|
|
if (handle == ugni_module->wildcard_ep) {
|
|
struct opal_proc_t *remote_proc = opal_proc_for_name (ugni_module->wc_remote_attr.proc_name);
|
|
|
|
BTL_VERBOSE(("received connection attempt on wildcard endpoint from proc: %s",
|
|
OPAL_NAME_PRINT(ugni_module->wc_remote_attr.proc_name)));
|
|
|
|
ep = mca_btl_ugni_get_ep (&ugni_module->super, remote_proc);
|
|
if (OPAL_UNLIKELY(NULL == ep)) {
|
|
/* there is no way to recover from this error so just abort() */
|
|
BTL_ERROR(("could not find/allocate a btl endpoint for peer %s",
|
|
OPAL_NAME_PRINT(ugni_module->wc_remote_attr.proc_name)));
|
|
abort ();
|
|
return OPAL_ERR_NOT_FOUND;
|
|
}
|
|
}
|
|
|
|
/* should not have gotten a NULL endpoint */
|
|
assert (NULL != ep);
|
|
|
|
BTL_VERBOSE(("got a datagram completion: ep = %p. wc = %d", (void *) ep, handle == ugni_module->wildcard_ep));
|
|
|
|
/* NTH: TODO -- error handling */
|
|
opal_mutex_lock (&ep->lock);
|
|
if (handle != ugni_module->wildcard_ep) {
|
|
/* directed post complete */
|
|
BTL_VERBOSE(("directed datagram complete for endpoint %p", (void *) ep));
|
|
|
|
ep->dg_posted = false;
|
|
(void) opal_atomic_add_fetch_32 (&ugni_module->active_datagrams, -1);
|
|
}
|
|
|
|
(void) mca_btl_ugni_ep_connect_progress (ep);
|
|
opal_mutex_unlock (&ep->lock);
|
|
|
|
if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state) {
|
|
/* process messages waiting in the endpoint's smsg mailbox */
|
|
count = mca_btl_ugni_smsg_process (ep);
|
|
}
|
|
|
|
/* repost the wildcard datagram */
|
|
if (handle == ugni_module->wildcard_ep) {
|
|
mca_btl_ugni_wildcard_ep_post (ugni_module);
|
|
}
|
|
|
|
return count;
|
|
}
|
|
|
|
void mca_btl_ugni_handle_rdma_completions (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device,
|
|
struct mca_btl_ugni_post_descriptor_t *post_desc, const int count)
|
|
{
|
|
int bte_complete = 0;
|
|
|
|
for (int i = 0 ; i < count ; ++i) {
|
|
BTL_VERBOSE(("post descriptor complete. status: %d", post_desc[i].rc));
|
|
|
|
if (OPAL_UNLIKELY(OPAL_SUCCESS != post_desc[i].rc)) {
|
|
/* dump the post descriptor if in a debug build */
|
|
btl_ugni_dump_post_desc (post_desc + i);
|
|
}
|
|
|
|
bte_complete += post_desc[i].use_bte == true;
|
|
|
|
mca_btl_ugni_post_desc_complete (ugni_module, post_desc + i, post_desc[i].rc);
|
|
}
|
|
|
|
if (bte_complete > 0) {
|
|
(void) OPAL_THREAD_FETCH_ADD32 (&ugni_module->active_rdma_count, -bte_complete);
|
|
}
|
|
}
|
|
|
|
static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device,
|
|
mca_btl_ugni_cq_t *cq)
|
|
{
|
|
mca_btl_ugni_post_descriptor_t post_desc[MCA_BTL_UGNI_COMPLETIONS_PER_LOOP];
|
|
int rc;
|
|
|
|
rc = mca_btl_ugni_cq_get_completed_desc (device, cq, post_desc, MCA_BTL_UGNI_COMPLETIONS_PER_LOOP);
|
|
if (0 >= rc) {
|
|
return rc;
|
|
}
|
|
|
|
BTL_VERBOSE(("got %d completed rdma descriptors", rc));
|
|
|
|
mca_btl_ugni_handle_rdma_completions (ugni_module, device, post_desc, rc);
|
|
|
|
return rc;
|
|
}
|
|
|
|
static inline int
|
|
mca_btl_ugni_progress_wait_list (mca_btl_ugni_module_t *ugni_module)
|
|
{
|
|
int rc = OPAL_SUCCESS;
|
|
mca_btl_base_endpoint_t *endpoint = NULL;
|
|
int count;
|
|
|
|
if (0 == opal_list_get_size(&ugni_module->ep_wait_list)) {
|
|
return 0;
|
|
}
|
|
|
|
/* check the count before taking the lock to avoid unnecessary locking */
|
|
count = opal_list_get_size(&ugni_module->ep_wait_list);
|
|
if (0 == count) {
|
|
return 0;
|
|
}
|
|
|
|
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
|
|
count = opal_list_get_size(&ugni_module->ep_wait_list);
|
|
do {
|
|
endpoint = (mca_btl_base_endpoint_t *) opal_list_remove_first (&ugni_module->ep_wait_list);
|
|
if (endpoint != NULL) {
|
|
rc = mca_btl_ugni_progress_send_wait_list (endpoint);
|
|
|
|
if (OPAL_SUCCESS != rc) {
|
|
opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
|
|
} else {
|
|
endpoint->wait_listed = false;
|
|
}
|
|
}
|
|
} while (endpoint != NULL && --count > 0) ;
|
|
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
|
|
|
|
return rc;
|
|
}
|
|
|
|
static int mca_btl_ugni_component_progress (void)
|
|
{
|
|
mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_component.modules;
|
|
int count = 0;
|
|
|
|
count += mca_btl_ugni_progress_remote_smsg (ugni_module);
|
|
|
|
if (ugni_module->active_datagrams) {
|
|
count += mca_btl_ugni_progress_datagram (ugni_module->devices);
|
|
}
|
|
|
|
for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
|
|
mca_btl_ugni_device_t *device = ugni_module->devices + i;
|
|
|
|
if (device->smsg_connections) {
|
|
count += mca_btl_ugni_progress_local_smsg (ugni_module, device);
|
|
mca_btl_ugni_progress_wait_list (ugni_module);
|
|
}
|
|
|
|
if (device->dev_rdma_local_cq.active_operations) {
|
|
count += mca_btl_ugni_progress_rdma (ugni_module, device, &device->dev_rdma_local_cq);
|
|
}
|
|
|
|
if (mca_btl_ugni_component.progress_thread_enabled && device->dev_rdma_local_irq_cq.active_operations) {
|
|
count += mca_btl_ugni_progress_rdma (ugni_module, device, &device->dev_rdma_local_irq_cq);
|
|
}
|
|
}
|
|
|
|
return count;
|
|
}
|
|
|
|
int mca_btl_ugni_flush (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint)
|
|
{
|
|
mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_component.modules;
|
|
|
|
for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
|
|
mca_btl_ugni_device_t *device = ugni_module->devices + i;
|
|
/* spin on progress until all active operations are complete. it is tempting to
|
|
* take an initial count then wait until that many operations have been completed
|
|
* but it is impossible to tell if those are the operations the caller is waiting
|
|
* on. */
|
|
while (device->dev_rdma_local_cq.active_operations) {
|
|
(void) mca_btl_ugni_progress_rdma (ugni_module, device, &device->dev_rdma_local_cq);
|
|
}
|
|
|
|
/* mark that the device was recently flushed */
|
|
device->flushed = true;
|
|
}
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
void btl_ugni_dump_post_desc (mca_btl_ugni_post_descriptor_t *desc)
|
|
{
|
|
|
|
fprintf (stderr, "desc->gni_desc.post_id = %" PRIx64 "\n", desc->gni_desc.post_id);
|
|
fprintf (stderr, "desc->gni_desc.status = %" PRIx64 "\n", desc->gni_desc.status);
|
|
fprintf (stderr, "desc->gni_desc.cq_mode_complete = %hu\n", desc->gni_desc.cq_mode_complete);
|
|
fprintf (stderr, "desc->gni_desc.type = %d\n", desc->gni_desc.type);
|
|
fprintf (stderr, "desc->gni_desc.cq_mode = %hu\n", desc->gni_desc.cq_mode);
|
|
fprintf (stderr, "desc->gni_desc.dlvr_mode = %hu\n", desc->gni_desc.dlvr_mode);
|
|
fprintf (stderr, "desc->gni_desc.local_addr = %" PRIx64 "\n", desc->gni_desc.local_addr);
|
|
fprintf (stderr, "desc->gni_desc.local_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->gni_desc.local_mem_hndl.qword1,
|
|
desc->gni_desc.local_mem_hndl.qword2);
|
|
fprintf (stderr, "desc->gni_desc.remote_addr = %" PRIx64 "\n", desc->gni_desc.remote_addr);
|
|
fprintf (stderr, "desc->gni_desc.remote_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->gni_desc.remote_mem_hndl.qword1,
|
|
desc->gni_desc.remote_mem_hndl.qword2);
|
|
fprintf (stderr, "desc->gni_desc.length = %" PRIu64 "\n", desc->gni_desc.length);
|
|
fprintf (stderr, "desc->gni_desc.rdma_mode = %hu\n", desc->gni_desc.rdma_mode);
|
|
fprintf (stderr, "desc->gni_desc.amo_cmd = %d\n", desc->gni_desc.amo_cmd);
|
|
}
|