1
1

add a common ofi whitelist/blacklist

also add common verbose variable.

Note the verbosity thing is a little tricky owing to the way the MCA frameworks and components are registered
and initialized.  The BTLs are registered/initialized prior to the MTL components even getting registered.

Here's the change in ofi mtl mca parameters.  Before commit:

            MCA mtl ofi: parameter "mtl_ofi_provider_include" (current value: "psm2", data source: environment, level: 1 user/basic, type: string)
                          Comma-delimited list of OFI providers that are considered for use (e.g., "psm,psm2"; an empty value means that all providers will be considered). Mutually exclusive with mtl_ofi_provider_exclude.
             MCA mtl ofi: parameter "mtl_ofi_provider_exclude" (current value: "shm,sockets,tcp,udp,rstream", data source: default, level: 1 user/basic, type: string)
                          Comma-delimited list of OFI providers that are not considered for use (default: "sockets,mxm"; empty value means that all providers will be considered). Mutually exclusive with mtl_ofi_provider_include.

After commit:

             MCA btl ofi: parameter "btl_ofi_provider_include" (current value: "", data source: default, level: 1 user/basic, type: string, synonym of: opal_common_ofi_provider_include)
                          Comma-delimited list of OFI providers that are considered for use (e.g., "psm,psm2"; an empty value means that all providers will be considered). Mutually exclusive with mtl_ofi_provider_exclude.
             MCA btl ofi: parameter "btl_ofi_provider_exclude" (current value: "shm,sockets,tcp,udp,rstream", data source: default, level: 1 user/basic, type: string, synonym of: opal_common_ofi_provider_exclude)
                          Comma-delimited list of OFI providers that are not considered for use (default: "sockets,mxm"; empty value means that all providers will be considered). Mutually exclusive with mtl_ofi_provider_include.
             MCA mtl ofi: parameter "mtl_ofi_provider_exclude" (current value: "shm,sockets,tcp,udp,rstream", data source: default, level: 1 user/basic, type: string, synonym of: opal_common_ofi_provider_exclude)
                          Comma-delimited list of OFI providers that are not considered for use (default: "sockets,mxm"; empty value means that all providers will be considered). Mutually exclusive with mtl_ofi_provider_include.
             MCA mtl ofi: parameter "mtl_ofi_verbose" (current value: "0", data source: default, level: 3 user/all, type: int, synonym of: opal_common_ofi_verbose)

related to #7755

Signed-off-by: Howard Pritchard <howardp@lanl.gov>
(cherry picked from commit 9f1081a07a)
(cherry picked from commit 45b643d0cf)
Этот коммит содержится в:
Howard Pritchard 2020-04-14 09:19:56 -06:00 коммит произвёл Howard Pritchard
родитель 4a466e4f08
Коммит 8fcf9cee3b
8 изменённых файлов: 191 добавлений и 77 удалений

Просмотреть файл

@ -7,6 +7,8 @@
# Copyright (c) 2017 IBM Corporation. All rights reserved.
# Copyright (c) 2019 Research Organization for Information Science
# and Technology (RIST). All rights reserved.
# Copyright (c) 2020 Triad National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow

Просмотреть файл

@ -112,7 +112,7 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
*/
count = fi_av_insert(ompi_mtl_ofi.av, ep_names, nprocs, fi_addrs, 0, NULL);
if ((count < 0) || (nprocs != (size_t)count)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d: fi_av_insert failed: %d\n",
__FILE__, __LINE__, count);
ret = OMPI_ERROR;
@ -125,7 +125,7 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
for (i = 0; i < nprocs; ++i) {
endpoint = OBJ_NEW(mca_mtl_ofi_endpoint_t);
if (NULL == endpoint) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d: mtl/ofi: could not allocate endpoint"
" structure\n",
__FILE__, __LINE__);
@ -170,7 +170,7 @@ ompi_mtl_ofi_del_procs(struct mca_mtl_base_module_t *mtl,
endpoint = procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL];
ret = fi_av_remove(ompi_mtl_ofi.av, &endpoint->peer_fiaddr, 1, 0);
if (ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d: fi_av_remove failed: %s\n", __FILE__, __LINE__, fi_strerror(errno));
return ret;
}

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2019 Triad National Security, LLC. All rights
* Copyright (c) 2019-2020 Triad National Security, LLC. All rights
* reserved.
* Copyright (c) 2018-2020 Amazon.com, Inc. or its affiliates. All rights
* reserved.
@ -38,6 +38,7 @@
#include "ompi/mca/mtl/base/base.h"
#include "ompi/mca/mtl/base/mtl_base_datatype.h"
#include "ompi/message/message.h"
#include "opal/mca/common/ofi/common_ofi.h"
#include "mtl_ofi_opt.h"
#include "mtl_ofi_types.h"
@ -235,7 +236,7 @@ ompi_mtl_ofi_progress(void)
#define MTL_OFI_LOG_FI_ERR(err, string) \
do { \
opal_output_verbose(1, ompi_mtl_base_framework.framework_output, \
opal_output_verbose(1, opal_common_ofi.output, \
"%s:%d:%s: %s\n", \
__FILE__, __LINE__, string, fi_strerror(-err)); \
} while(0);
@ -377,7 +378,7 @@ ompi_mtl_ofi_ssend_recv(ompi_mtl_ofi_request_t *ack_req,
0, /* Exact match, no ignore bits */
(void *) &ack_req->ctx), ret);
if (OPAL_UNLIKELY(0 > ret)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d: fi_trecv failed: %s(%zd)",
__FILE__, __LINE__, fi_strerror(-ret), ret);
free(ack_req);
@ -663,7 +664,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
status->_ucount = wc->len;
if (OPAL_UNLIKELY(wc->len > ofi_req->length)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
opal_output_verbose(1, opal_common_ofi.output,
"truncate expected: %ld %ld",
wc->len, ofi_req->length);
status->MPI_ERROR = MPI_ERR_TRUNCATE;
@ -677,7 +678,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
ofi_req->buffer,
wc->len);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d: ompi_mtl_datatype_unpack failed: %d",
__FILE__, __LINE__, ompi_ret);
status->MPI_ERROR = ompi_ret;
@ -1330,7 +1331,7 @@ init_regular_ep:
if (MPI_COMM_WORLD == comm) {
ret = opal_progress_register(ompi_mtl_ofi_progress_no_inline);
if (OMPI_SUCCESS != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d: opal_progress_register failed: %d\n",
__FILE__, __LINE__, ret);
goto init_error;

Просмотреть файл

@ -5,6 +5,9 @@
* Copyright (c) 2014-2017 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
* Copyright (c) 2020 Triad National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -27,8 +30,6 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
bool enable_mpi_threads);
static int param_priority;
static char *prov_include;
static char *prov_exclude;
static int control_progress;
static int data_progress;
static int av_type;
@ -130,24 +131,6 @@ ompi_mtl_ofi_component_register(void)
MCA_BASE_VAR_SCOPE_READONLY,
&param_priority);
prov_include = NULL;
mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
"provider_include",
"Comma-delimited list of OFI providers that are considered for use (e.g., \"psm,psm2\"; an empty value means that all providers will be considered). Mutually exclusive with mtl_ofi_provider_exclude.",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_1,
MCA_BASE_VAR_SCOPE_READONLY,
&prov_include);
prov_exclude = "shm,sockets,tcp,udp,rstream";
mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
"provider_exclude",
"Comma-delimited list of OFI providers that are not considered for use (default: \"sockets,mxm\"; empty value means that all providers will be considered). Mutually exclusive with mtl_ofi_provider_include.",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_1,
MCA_BASE_VAR_SCOPE_READONLY,
&prov_exclude);
ompi_mtl_ofi.ofi_progress_event_count = MTL_OFI_MAX_PROG_EVENT_COUNT;
opal_asprintf(&desc, "Max number of events to read each call to OFI progress (default: %d events will be read per OFI progress call)", ompi_mtl_ofi.ofi_progress_event_count);
mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
@ -267,6 +250,8 @@ ompi_mtl_ofi_component_register(void)
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mtl_ofi.num_ofi_contexts);
opal_common_ofi_register_mca_variables(&mca_mtl_ofi_component.super.mtl_version);
return OMPI_SUCCESS;
}
@ -311,6 +296,7 @@ ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority)
static int
ompi_mtl_ofi_component_close(void)
{
opal_common_ofi_mca_deregister();
return OMPI_SUCCESS;
}
@ -349,7 +335,7 @@ select_ofi_provider(struct fi_info *providers,
if (NULL != include_list) {
while ((NULL != prov) &&
(!is_in_list(include_list, prov->fabric_attr->prov_name))) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d: mtl:ofi: \"%s\" not in include list\n",
__FILE__, __LINE__,
prov->fabric_attr->prov_name);
@ -358,7 +344,7 @@ select_ofi_provider(struct fi_info *providers,
} else if (NULL != exclude_list) {
while ((NULL != prov) &&
(is_in_list(exclude_list, prov->fabric_attr->prov_name))) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d: mtl:ofi: \"%s\" in exclude list\n",
__FILE__, __LINE__,
prov->fabric_attr->prov_name);
@ -366,7 +352,7 @@ select_ofi_provider(struct fi_info *providers,
}
}
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d: mtl:ofi:prov: %s\n",
__FILE__, __LINE__,
(prov ? prov->fabric_attr->prov_name : "none"));
@ -396,6 +382,7 @@ select_ofi_provider(struct fi_info *providers,
return prov;
}
/* Check if FI_REMOTE_CQ_DATA is supported, if so send the source rank there
* FI_DIRECTED_RECV is also needed so receives can discriminate the source
*/
@ -481,7 +468,7 @@ ompi_mtl_ofi_define_tag_mode(int ofi_tag_mode, int *bits_for_cid) {
do { \
ompi_mtl_ofi.comm_to_context = calloc(arr_size, sizeof(int)); \
if (OPAL_UNLIKELY(!ompi_mtl_ofi.comm_to_context)) { \
opal_output_verbose(1, ompi_mtl_base_framework.framework_output, \
opal_output_verbose(1, opal_common_ofi.output, \
"%s:%d: alloc of comm_to_context array failed: %s\n",\
__FILE__, __LINE__, strerror(errno)); \
return ret; \
@ -493,7 +480,7 @@ ompi_mtl_ofi_define_tag_mode(int ofi_tag_mode, int *bits_for_cid) {
ompi_mtl_ofi.ofi_ctxt = (mca_mtl_ofi_context_t *) malloc(ompi_mtl_ofi.num_ofi_contexts * \
sizeof(mca_mtl_ofi_context_t)); \
if (OPAL_UNLIKELY(!ompi_mtl_ofi.ofi_ctxt)) { \
opal_output_verbose(1, ompi_mtl_base_framework.framework_output, \
opal_output_verbose(1, opal_common_ofi.output, \
"%s:%d: alloc of ofi_ctxt array failed: %s\n", \
__FILE__, __LINE__, strerror(errno)); \
return ret; \
@ -641,17 +628,19 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
int universe_size;
char *univ_size_str;
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: mtl:ofi:provider_include = \"%s\"\n",
__FILE__, __LINE__, prov_include);
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: mtl:ofi:provider_exclude = \"%s\"\n",
__FILE__, __LINE__, prov_exclude);
opal_common_ofi_mca_register();
if (NULL != prov_include) {
include_list = opal_argv_split(prov_include, ',');
} else if (NULL != prov_exclude) {
exclude_list = opal_argv_split(prov_exclude, ',');
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d: mtl:ofi:provider_include = \"%s\"\n",
__FILE__, __LINE__, *opal_common_ofi.prov_include);
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d: mtl:ofi:provider_exclude = \"%s\"\n",
__FILE__, __LINE__, *opal_common_ofi.prov_exclude);
if (NULL != *opal_common_ofi.prov_include) {
include_list = opal_argv_split(*opal_common_ofi.prov_include, ',');
} else if (NULL != *opal_common_ofi.prov_exclude) {
exclude_list = opal_argv_split(*opal_common_ofi.prov_exclude, ',');
}
/**
@ -666,7 +655,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
*/
hints = fi_allocinfo();
if (!hints) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d: Could not allocate fi_info\n",
__FILE__, __LINE__);
goto error;
@ -752,7 +741,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, hints_dup, &providers);
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d: EFA specific fi_getinfo(): %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
@ -789,7 +778,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
hints, /* In: Hints to filter providers */
&providers); /* Out: List of matching providers */
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d: fi_getinfo(): %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
@ -810,7 +799,7 @@ select_prov:
*/
prov = select_ofi_provider(providers, include_list, exclude_list);
if (!prov) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d: select_ofi_provider: no provider found\n",
__FILE__, __LINE__);
goto error;
@ -839,7 +828,7 @@ select_prov:
/* Fallback to MTL_OFI_TAG_1 */
ompi_mtl_ofi_define_tag_mode(MTL_OFI_TAG_1, &ofi_tag_bits_for_cid);
} else { /* MTL_OFI_TAG_FULL */
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d: OFI provider %s does not support FI_REMOTE_CQ_DATA\n",
__FILE__, __LINE__, prov->fabric_attr->prov_name);
goto error;
@ -919,7 +908,7 @@ select_prov:
ompi_process_info.nodename, __FILE__, __LINE__);
goto error;
} else if (1 == sep_support_in_provider) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d: Scalable EP supported in %s provider. Enabling in MTL.\n",
__FILE__, __LINE__, prov->fabric_attr->prov_name);
}
@ -1078,7 +1067,7 @@ select_prov:
&ep_name,
namelen);
if (OMPI_SUCCESS != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d: modex_send failed: %d\n",
__FILE__, __LINE__, ret);
goto error;

Просмотреть файл

@ -13,6 +13,8 @@
# Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
# Copyright (c) 2017 IBM Corporation. All rights reserved.
# Copyright (c) 2018 Intel, inc. All rights reserved
# Copyright (c) 2020 Triad National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow

Просмотреть файл

@ -14,6 +14,9 @@
* reserved.
* Copyright (c) 2018 Intel, Inc, All rights reserved
*
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
* Copyright (c) 2020 Triad National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -41,7 +44,6 @@
#define MCA_BTL_OFI_REQUESTED_MR_MODE (FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_VIRT_ADDR)
static char *prov_include;
static char *ofi_progress_mode;
static bool disable_sep;
static int mca_btl_ofi_init_device(struct fi_info *info);
@ -107,20 +109,6 @@ static int mca_btl_ofi_component_register(void)
MCA_BASE_VAR_SCOPE_READONLY,
&mca_btl_ofi_component.mode);
/* fi_getinfo with prov_name == NULL means ALL provider.
* Since now we are using the first valid info returned, I'm not sure
* if we need to provide the support for comma limited provider list. */
prov_include = NULL;
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
"provider_include",
"OFI provider that ofi btl will query for. This parameter only "
"accept ONE provider name. "
"(e.g., \"psm2\"; an empty value means that all providers will "
"be considered.",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_READONLY,
&prov_include);
mca_btl_ofi_component.num_cqe_read = MCA_BTL_OFI_NUM_CQE_READ;
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
@ -185,6 +173,8 @@ static int mca_btl_ofi_component_register(void)
/* for now we want this component to lose to the MTL. */
module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 50;
opal_common_ofi_register_mca_variables(&mca_btl_ofi_component.super.btl_version);
return mca_btl_base_param_register (&mca_btl_ofi_component.super.btl_version,
&module->super);
}
@ -200,7 +190,8 @@ static int mca_btl_ofi_component_open(void)
*/
static int mca_btl_ofi_component_close(void)
{
/* If we don't sleep, sockets provider freaks out. */
opal_common_ofi_mca_deregister();
/* If we don't sleep, sockets provider freaks out. Ummm this is a scary comment */
sleep(1);
return OPAL_SUCCESS;
}
@ -225,6 +216,7 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
uint64_t progress_mode;
unsigned resource_count = 0;
struct mca_btl_base_module_t **base_modules;
char **include_list = NULL;
BTL_VERBOSE(("initializing ofi btl"));
@ -247,6 +239,8 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
struct fi_domain_attr domain_attr = {0};
uint64_t required_caps;
opal_common_ofi_mca_register();
switch (mca_btl_ofi_component.mode) {
case MCA_BTL_OFI_MODE_TWO_SIDED:
@ -266,8 +260,12 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
break;
}
/* Select the provider */
fabric_attr.prov_name = prov_include;
fabric_attr.prov_name = NULL;
/* Select the provider - sort of. we just take first element in list for now */
if (NULL != *opal_common_ofi.prov_include) {
include_list = opal_argv_split(*opal_common_ofi.prov_include, ',');
fabric_attr.prov_name = include_list[0];
}
domain_attr.mr_mode = MCA_BTL_OFI_REQUESTED_MR_MODE;
@ -312,9 +310,13 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
rc = fi_getinfo(FI_VERSION(1, 5), NULL, NULL, 0, &hints, &info_list);
if (0 != rc) {
BTL_VERBOSE(("fi_getinfo failed with code %d: %s",rc, fi_strerror(-rc)));
if (NULL != include_list) {
opal_argv_free(include_list);
}
return NULL;
}
/* count the number of resources/ */
info = info_list;
while(info) {
@ -356,6 +358,9 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
/* We are done with the returned info. */
fi_freeinfo(info_list);
if (NULL != include_list) {
opal_argv_free(include_list);
}
/* pass module array back to caller */
base_modules = calloc (mca_btl_ofi_component.module_count, sizeof (*base_modules));

Просмотреть файл

@ -2,6 +2,8 @@
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2020 Triad National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -9,22 +11,110 @@
* $HEADER$
*/
#include "opal_config.h"
#include "opal/constants.h"
#include "opal/mca/hwloc/base/base.h"
#include <errno.h>
#include <unistd.h>
#include "common_ofi.h"
#include "opal_config.h"
#include "opal/constants.h"
#include "opal/util/argv.h"
#include "opal/mca/base/mca_base_var.h"
#include "opal/mca/base/mca_base_framework.h"
#include "opal/mca/hwloc/base/base.h"
int mca_common_ofi_register_mca_variables(void)
OPAL_DECLSPEC opal_common_ofi_module_t opal_common_ofi = {
.prov_include = NULL,
.prov_exclude = NULL,
.registered = 0,
.verbose = 0
};
static const char default_prov_exclude_list[] = "shm,sockets,tcp,udp,rstream";
OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_component_t *component)
{
if (fi_version() >= FI_VERSION(1,0)) {
return OPAL_SUCCESS;
} else {
static int registered = 0;
static int include_index;
static int exclude_index;
static int verbose_index;
if (fi_version() < FI_VERSION(1,0)) {
return OPAL_ERROR;
}
if (!registered) {
/*
* This monkey business is needed because of the way the MCA VARs stuff tries to handle pointers to strings
* when destructing the MCA var database. If you don't do something like this, the MCA var framework will try
* to dereference a pointer which itself is no longer a valid address owing to having been previously dlclosed.
*/
opal_common_ofi.prov_include = (char **)malloc(sizeof(char *));
*opal_common_ofi.prov_include = NULL;
include_index = mca_base_var_register("opal", "opal_common", "ofi",
"provider_include",
"Comma-delimited list of OFI providers that are considered for use (e.g., \"psm,psm2\"; an empty value means that all providers will be considered). Mutually exclusive with mtl_ofi_provider_exclude.",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_1,
MCA_BASE_VAR_SCOPE_READONLY,
opal_common_ofi.prov_include);
opal_common_ofi.prov_exclude = (char **)malloc(sizeof(char *));
*opal_common_ofi.prov_exclude = strdup(default_prov_exclude_list);
exclude_index = mca_base_var_register("opal", "opal_common", "ofi",
"provider_exclude",
"Comma-delimited list of OFI providers that are not considered for use (default: \"sockets,mxm\"; empty value means that all providers will be considered). Mutually exclusive with mtl_ofi_provider_include.",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_1,
MCA_BASE_VAR_SCOPE_READONLY,
opal_common_ofi.prov_exclude);
verbose_index = mca_base_var_register("opal", "opal_common", "ofi", "verbose",
"Verbose level of the OFI components",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL,
&opal_common_ofi.verbose);
registered = 1;
}
if (component) {
mca_base_var_register_synonym(include_index, component->mca_project_name,
component->mca_type_name,
component->mca_component_name,
"provider_include", 0);
mca_base_var_register_synonym(exclude_index, component->mca_project_name,
component->mca_type_name,
component->mca_component_name,
"provider_exclude", 0);
mca_base_var_register_synonym(verbose_index, component->mca_project_name,
component->mca_type_name,
component->mca_component_name,
"verbose", 0);
}
return OPAL_SUCCESS;
}
/**
 * Take a reference on the shared OFI output stream.
 *
 * The first caller opens the stream; every caller (first or not)
 * re-applies the current opal_common_ofi.verbose level to it.
 */
OPAL_DECLSPEC void opal_common_ofi_mca_register(void)
{
    opal_common_ofi.registered += 1;

    /* only the first registration needs to create the stream */
    if (opal_common_ofi.registered <= 1) {
        opal_common_ofi.output = opal_output_open(NULL);
    }

    opal_output_set_verbosity(opal_common_ofi.output, opal_common_ofi.verbose);
}
/**
 * Release one reference on the shared OFI output stream.
 *
 * The stream is closed only when the reference count drops to zero.
 */
OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void)
{
    opal_common_ofi.registered -= 1;
    assert(opal_common_ofi.registered >= 0);

    /* the last deregister is the one that tears the stream down */
    if (0 == opal_common_ofi.registered) {
        opal_output_close(opal_common_ofi.output);
    }
}
/* check that the tx attributes match */

Просмотреть файл

@ -3,6 +3,9 @@
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2020 Triad National Security, LLC. All rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -12,9 +15,31 @@
#ifndef OPAL_MCA_COMMON_OFI_H
#define OPAL_MCA_COMMON_OFI_H
#include "opal_config.h"
#include "opal/mca/base/mca_base_var.h"
#include "opal/mca/base/mca_base_framework.h"
#include <rdma/fabric.h>
OPAL_DECLSPEC int mca_common_ofi_register_mca_variables(void);
BEGIN_C_DECLS
typedef struct opal_common_ofi_module {
char **prov_include;
char **prov_exclude;
int verbose;
int registered;
int output;
} opal_common_ofi_module_t;
extern opal_common_ofi_module_t opal_common_ofi;
OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_component_t *component);
OPAL_DECLSPEC void opal_common_ofi_mca_register(void);
OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void);
OPAL_DECLSPEC struct fi_info* opal_common_ofi_select_ofi_provider(struct fi_info *providers,
char *framework_name);
END_C_DECLS
struct fi_info* opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int rank);