add a common ofi whitelist/blacklist
also add common verbose variable. Note the verbosity thing is a little tricky owing to the way the MCA frameworks and components are registered and initialized. The BTLs are registered/initialized prior to the MTL components even getting registered. Here's the change in ofi mtl mca parameters. Before commit: MCA mtl ofi: parameter "mtl_ofi_provider_include" (current value: "psm2", data source: environment, level: 1 user/basic, type: string) Comma-delimited list of OFI providers that are considered for use (e.g., "psm,psm2"; an empty value means that all providers will be considered). Mutually exclusive with mtl_ofi_provider_exclude. MCA mtl ofi: parameter "mtl_ofi_provider_exclude" (current value: "shm,sockets,tcp,udp,rstream", data source: default, level: 1 user/basic, type: string) Comma-delimited list of OFI providers that are not considered for use (default: "sockets,mxm"; empty value means that all providers will be considered). Mutually exclusive with mtl_ofi_provider_include. After commit: MCA btl ofi: parameter "btl_ofi_provider_include" (current value: "", data source: default, level: 1 user/basic, type: string, synonym of: opal_common_ofi_provider_include) Comma-delimited list of OFI providers that are considered for use (e.g., "psm,psm2"; an empty value means that all providers will be considered). Mutually exclusive with mtl_ofi_provider_exclude. MCA btl ofi: parameter "btl_ofi_provider_exclude" (current value: "shm,sockets,tcp,udp,rstream", data source: default, level: 1 user/basic, type: string, synonym of: opal_common_ofi_provider_exclude) Comma-delimited list of OFI providers that are not considered for use (default: "sockets,mxm"; empty value means that all providers will be considered). Mutually exclusive with mtl_ofi_provider_include. 
MCA mtl ofi: parameter "mtl_ofi_provider_exclude" (current value: "shm,sockets,tcp,udp,rstream", data source: default, level: 1 user/basic, type: string, synonym of: opal_common_ofi_provider_exclude) Comma-delimited list of OFI providers that are not considered for use (default: "sockets,mxm"; empty value means that all providers will be considered). Mutually exclusive with mtl_ofi_provider_include. MCA mtl ofi: parameter "mtl_ofi_verbose" (current value: "0", data source: default, level: 3 user/all, type: int, synonym of: opal_common_ofi_verbose) related to #7755 Signed-off-by: Howard Pritchard <howardp@lanl.gov> (cherry picked from commit 9f1081a07a
) (cherry picked from commit 45b643d0cf
)
Этот коммит содержится в:
родитель
4a466e4f08
Коммит
8fcf9cee3b
@ -7,6 +7,8 @@
|
||||
# Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
# Copyright (c) 2019 Research Organization for Information Science
|
||||
# and Technology (RIST). All rights reserved.
|
||||
# Copyright (c) 2020 Triad National Security, LLC. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
|
@ -112,7 +112,7 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
|
||||
*/
|
||||
count = fi_av_insert(ompi_mtl_ofi.av, ep_names, nprocs, fi_addrs, 0, NULL);
|
||||
if ((count < 0) || (nprocs != (size_t)count)) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"%s:%d: fi_av_insert failed: %d\n",
|
||||
__FILE__, __LINE__, count);
|
||||
ret = OMPI_ERROR;
|
||||
@ -125,7 +125,7 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
|
||||
for (i = 0; i < nprocs; ++i) {
|
||||
endpoint = OBJ_NEW(mca_mtl_ofi_endpoint_t);
|
||||
if (NULL == endpoint) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"%s:%d: mtl/ofi: could not allocate endpoint"
|
||||
" structure\n",
|
||||
__FILE__, __LINE__);
|
||||
@ -170,7 +170,7 @@ ompi_mtl_ofi_del_procs(struct mca_mtl_base_module_t *mtl,
|
||||
endpoint = procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL];
|
||||
ret = fi_av_remove(ompi_mtl_ofi.av, &endpoint->peer_fiaddr, 1, 0);
|
||||
if (ret) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"%s:%d: fi_av_remove failed: %s\n", __FILE__, __LINE__, fi_strerror(errno));
|
||||
return ret;
|
||||
}
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2019 Triad National Security, LLC. All rights
|
||||
* Copyright (c) 2019-2020 Triad National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2018-2020 Amazon.com, Inc. or its affiliates. All rights
|
||||
* reserved.
|
||||
@ -38,6 +38,7 @@
|
||||
#include "ompi/mca/mtl/base/base.h"
|
||||
#include "ompi/mca/mtl/base/mtl_base_datatype.h"
|
||||
#include "ompi/message/message.h"
|
||||
#include "opal/mca/common/ofi/common_ofi.h"
|
||||
|
||||
#include "mtl_ofi_opt.h"
|
||||
#include "mtl_ofi_types.h"
|
||||
@ -235,7 +236,7 @@ ompi_mtl_ofi_progress(void)
|
||||
|
||||
#define MTL_OFI_LOG_FI_ERR(err, string) \
|
||||
do { \
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output, \
|
||||
opal_output_verbose(1, opal_common_ofi.output, \
|
||||
"%s:%d:%s: %s\n", \
|
||||
__FILE__, __LINE__, string, fi_strerror(-err)); \
|
||||
} while(0);
|
||||
@ -377,7 +378,7 @@ ompi_mtl_ofi_ssend_recv(ompi_mtl_ofi_request_t *ack_req,
|
||||
0, /* Exact match, no ignore bits */
|
||||
(void *) &ack_req->ctx), ret);
|
||||
if (OPAL_UNLIKELY(0 > ret)) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"%s:%d: fi_trecv failed: %s(%zd)",
|
||||
__FILE__, __LINE__, fi_strerror(-ret), ret);
|
||||
free(ack_req);
|
||||
@ -663,7 +664,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
|
||||
status->_ucount = wc->len;
|
||||
|
||||
if (OPAL_UNLIKELY(wc->len > ofi_req->length)) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"truncate expected: %ld %ld",
|
||||
wc->len, ofi_req->length);
|
||||
status->MPI_ERROR = MPI_ERR_TRUNCATE;
|
||||
@ -677,7 +678,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
|
||||
ofi_req->buffer,
|
||||
wc->len);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"%s:%d: ompi_mtl_datatype_unpack failed: %d",
|
||||
__FILE__, __LINE__, ompi_ret);
|
||||
status->MPI_ERROR = ompi_ret;
|
||||
@ -1330,7 +1331,7 @@ init_regular_ep:
|
||||
if (MPI_COMM_WORLD == comm) {
|
||||
ret = opal_progress_register(ompi_mtl_ofi_progress_no_inline);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"%s:%d: opal_progress_register failed: %d\n",
|
||||
__FILE__, __LINE__, ret);
|
||||
goto init_error;
|
||||
|
@ -5,6 +5,9 @@
|
||||
* Copyright (c) 2014-2017 Cisco Systems, Inc. All rights reserved
|
||||
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
|
||||
* Copyright (c) 2020 Triad National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -27,8 +30,6 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
||||
bool enable_mpi_threads);
|
||||
|
||||
static int param_priority;
|
||||
static char *prov_include;
|
||||
static char *prov_exclude;
|
||||
static int control_progress;
|
||||
static int data_progress;
|
||||
static int av_type;
|
||||
@ -130,24 +131,6 @@ ompi_mtl_ofi_component_register(void)
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
¶m_priority);
|
||||
|
||||
prov_include = NULL;
|
||||
mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
|
||||
"provider_include",
|
||||
"Comma-delimited list of OFI providers that are considered for use (e.g., \"psm,psm2\"; an empty value means that all providers will be considered). Mutually exclusive with mtl_ofi_provider_exclude.",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_1,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&prov_include);
|
||||
|
||||
prov_exclude = "shm,sockets,tcp,udp,rstream";
|
||||
mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
|
||||
"provider_exclude",
|
||||
"Comma-delimited list of OFI providers that are not considered for use (default: \"sockets,mxm\"; empty value means that all providers will be considered). Mutually exclusive with mtl_ofi_provider_include.",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_1,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&prov_exclude);
|
||||
|
||||
ompi_mtl_ofi.ofi_progress_event_count = MTL_OFI_MAX_PROG_EVENT_COUNT;
|
||||
opal_asprintf(&desc, "Max number of events to read each call to OFI progress (default: %d events will be read per OFI progress call)", ompi_mtl_ofi.ofi_progress_event_count);
|
||||
mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
|
||||
@ -267,6 +250,8 @@ ompi_mtl_ofi_component_register(void)
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&ompi_mtl_ofi.num_ofi_contexts);
|
||||
|
||||
opal_common_ofi_register_mca_variables(&mca_mtl_ofi_component.super.mtl_version);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
@ -311,6 +296,7 @@ ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority)
|
||||
static int
|
||||
ompi_mtl_ofi_component_close(void)
|
||||
{
|
||||
opal_common_ofi_mca_deregister();
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
@ -349,7 +335,7 @@ select_ofi_provider(struct fi_info *providers,
|
||||
if (NULL != include_list) {
|
||||
while ((NULL != prov) &&
|
||||
(!is_in_list(include_list, prov->fabric_attr->prov_name))) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"%s:%d: mtl:ofi: \"%s\" not in include list\n",
|
||||
__FILE__, __LINE__,
|
||||
prov->fabric_attr->prov_name);
|
||||
@ -358,7 +344,7 @@ select_ofi_provider(struct fi_info *providers,
|
||||
} else if (NULL != exclude_list) {
|
||||
while ((NULL != prov) &&
|
||||
(is_in_list(exclude_list, prov->fabric_attr->prov_name))) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"%s:%d: mtl:ofi: \"%s\" in exclude list\n",
|
||||
__FILE__, __LINE__,
|
||||
prov->fabric_attr->prov_name);
|
||||
@ -366,7 +352,7 @@ select_ofi_provider(struct fi_info *providers,
|
||||
}
|
||||
}
|
||||
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"%s:%d: mtl:ofi:prov: %s\n",
|
||||
__FILE__, __LINE__,
|
||||
(prov ? prov->fabric_attr->prov_name : "none"));
|
||||
@ -396,6 +382,7 @@ select_ofi_provider(struct fi_info *providers,
|
||||
return prov;
|
||||
}
|
||||
|
||||
|
||||
/* Check if FI_REMOTE_CQ_DATA is supported, if so send the source rank there
|
||||
* FI_DIRECTED_RECV is also needed so receives can discriminate the source
|
||||
*/
|
||||
@ -481,7 +468,7 @@ ompi_mtl_ofi_define_tag_mode(int ofi_tag_mode, int *bits_for_cid) {
|
||||
do { \
|
||||
ompi_mtl_ofi.comm_to_context = calloc(arr_size, sizeof(int)); \
|
||||
if (OPAL_UNLIKELY(!ompi_mtl_ofi.comm_to_context)) { \
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output, \
|
||||
opal_output_verbose(1, opal_common_ofi.output, \
|
||||
"%s:%d: alloc of comm_to_context array failed: %s\n",\
|
||||
__FILE__, __LINE__, strerror(errno)); \
|
||||
return ret; \
|
||||
@ -493,7 +480,7 @@ ompi_mtl_ofi_define_tag_mode(int ofi_tag_mode, int *bits_for_cid) {
|
||||
ompi_mtl_ofi.ofi_ctxt = (mca_mtl_ofi_context_t *) malloc(ompi_mtl_ofi.num_ofi_contexts * \
|
||||
sizeof(mca_mtl_ofi_context_t)); \
|
||||
if (OPAL_UNLIKELY(!ompi_mtl_ofi.ofi_ctxt)) { \
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output, \
|
||||
opal_output_verbose(1, opal_common_ofi.output, \
|
||||
"%s:%d: alloc of ofi_ctxt array failed: %s\n", \
|
||||
__FILE__, __LINE__, strerror(errno)); \
|
||||
return ret; \
|
||||
@ -641,17 +628,19 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
||||
int universe_size;
|
||||
char *univ_size_str;
|
||||
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: mtl:ofi:provider_include = \"%s\"\n",
|
||||
__FILE__, __LINE__, prov_include);
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: mtl:ofi:provider_exclude = \"%s\"\n",
|
||||
__FILE__, __LINE__, prov_exclude);
|
||||
opal_common_ofi_mca_register();
|
||||
|
||||
if (NULL != prov_include) {
|
||||
include_list = opal_argv_split(prov_include, ',');
|
||||
} else if (NULL != prov_exclude) {
|
||||
exclude_list = opal_argv_split(prov_exclude, ',');
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"%s:%d: mtl:ofi:provider_include = \"%s\"\n",
|
||||
__FILE__, __LINE__, *opal_common_ofi.prov_include);
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"%s:%d: mtl:ofi:provider_exclude = \"%s\"\n",
|
||||
__FILE__, __LINE__, *opal_common_ofi.prov_exclude);
|
||||
|
||||
if (NULL != *opal_common_ofi.prov_include) {
|
||||
include_list = opal_argv_split(*opal_common_ofi.prov_include, ',');
|
||||
} else if (NULL != *opal_common_ofi.prov_exclude) {
|
||||
exclude_list = opal_argv_split(*opal_common_ofi.prov_exclude, ',');
|
||||
}
|
||||
|
||||
/**
|
||||
@ -666,7 +655,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
||||
*/
|
||||
hints = fi_allocinfo();
|
||||
if (!hints) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"%s:%d: Could not allocate fi_info\n",
|
||||
__FILE__, __LINE__);
|
||||
goto error;
|
||||
@ -752,7 +741,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
||||
|
||||
ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, hints_dup, &providers);
|
||||
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"%s:%d: EFA specific fi_getinfo(): %s\n",
|
||||
__FILE__, __LINE__, fi_strerror(-ret));
|
||||
|
||||
@ -789,7 +778,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
||||
hints, /* In: Hints to filter providers */
|
||||
&providers); /* Out: List of matching providers */
|
||||
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"%s:%d: fi_getinfo(): %s\n",
|
||||
__FILE__, __LINE__, fi_strerror(-ret));
|
||||
|
||||
@ -810,7 +799,7 @@ select_prov:
|
||||
*/
|
||||
prov = select_ofi_provider(providers, include_list, exclude_list);
|
||||
if (!prov) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"%s:%d: select_ofi_provider: no provider found\n",
|
||||
__FILE__, __LINE__);
|
||||
goto error;
|
||||
@ -839,7 +828,7 @@ select_prov:
|
||||
/* Fallback to MTL_OFI_TAG_1 */
|
||||
ompi_mtl_ofi_define_tag_mode(MTL_OFI_TAG_1, &ofi_tag_bits_for_cid);
|
||||
} else { /* MTL_OFI_TAG_FULL */
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"%s:%d: OFI provider %s does not support FI_REMOTE_CQ_DATA\n",
|
||||
__FILE__, __LINE__, prov->fabric_attr->prov_name);
|
||||
goto error;
|
||||
@ -919,7 +908,7 @@ select_prov:
|
||||
ompi_process_info.nodename, __FILE__, __LINE__);
|
||||
goto error;
|
||||
} else if (1 == sep_support_in_provider) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"%s:%d: Scalable EP supported in %s provider. Enabling in MTL.\n",
|
||||
__FILE__, __LINE__, prov->fabric_attr->prov_name);
|
||||
}
|
||||
@ -1078,7 +1067,7 @@ select_prov:
|
||||
&ep_name,
|
||||
namelen);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"%s:%d: modex_send failed: %d\n",
|
||||
__FILE__, __LINE__, ret);
|
||||
goto error;
|
||||
|
@ -13,6 +13,8 @@
|
||||
# Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
|
||||
# Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
# Copyright (c) 2018 Intel, inc. All rights reserved
|
||||
# Copyright (c) 2020 Triad National Security, LLC. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
|
@ -14,6 +14,9 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2018 Intel, Inc, All rights reserved
|
||||
*
|
||||
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
|
||||
* Copyright (c) 2020 Triad National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -41,7 +44,6 @@
|
||||
|
||||
#define MCA_BTL_OFI_REQUESTED_MR_MODE (FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_VIRT_ADDR)
|
||||
|
||||
static char *prov_include;
|
||||
static char *ofi_progress_mode;
|
||||
static bool disable_sep;
|
||||
static int mca_btl_ofi_init_device(struct fi_info *info);
|
||||
@ -107,20 +109,6 @@ static int mca_btl_ofi_component_register(void)
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_btl_ofi_component.mode);
|
||||
|
||||
/* fi_getinfo with prov_name == NULL means ALL provider.
|
||||
* Since now we are using the first valid info returned, I'm not sure
|
||||
* if we need to provide the support for comma limited provider list. */
|
||||
prov_include = NULL;
|
||||
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
|
||||
"provider_include",
|
||||
"OFI provider that ofi btl will query for. This parameter only "
|
||||
"accept ONE provider name. "
|
||||
"(e.g., \"psm2\"; an empty value means that all providers will "
|
||||
"be considered.",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_4,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&prov_include);
|
||||
|
||||
mca_btl_ofi_component.num_cqe_read = MCA_BTL_OFI_NUM_CQE_READ;
|
||||
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
|
||||
@ -185,6 +173,8 @@ static int mca_btl_ofi_component_register(void)
|
||||
/* for now we want this component to lose to the MTL. */
|
||||
module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 50;
|
||||
|
||||
opal_common_ofi_register_mca_variables(&mca_btl_ofi_component.super.btl_version);
|
||||
|
||||
return mca_btl_base_param_register (&mca_btl_ofi_component.super.btl_version,
|
||||
&module->super);
|
||||
}
|
||||
@ -200,7 +190,8 @@ static int mca_btl_ofi_component_open(void)
|
||||
*/
|
||||
static int mca_btl_ofi_component_close(void)
|
||||
{
|
||||
/* If we don't sleep, sockets provider freaks out. */
|
||||
opal_common_ofi_mca_deregister();
|
||||
/* If we don't sleep, sockets provider freaks out. Ummm this is a scary comment */
|
||||
sleep(1);
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
@ -225,6 +216,7 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
|
||||
uint64_t progress_mode;
|
||||
unsigned resource_count = 0;
|
||||
struct mca_btl_base_module_t **base_modules;
|
||||
char **include_list = NULL;
|
||||
|
||||
BTL_VERBOSE(("initializing ofi btl"));
|
||||
|
||||
@ -247,6 +239,8 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
|
||||
struct fi_domain_attr domain_attr = {0};
|
||||
uint64_t required_caps;
|
||||
|
||||
opal_common_ofi_mca_register();
|
||||
|
||||
switch (mca_btl_ofi_component.mode) {
|
||||
|
||||
case MCA_BTL_OFI_MODE_TWO_SIDED:
|
||||
@ -266,8 +260,12 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
|
||||
break;
|
||||
}
|
||||
|
||||
/* Select the provider */
|
||||
fabric_attr.prov_name = prov_include;
|
||||
fabric_attr.prov_name = NULL;
|
||||
/* Select the provider - sort of. we just take first element in list for now */
|
||||
if (NULL != *opal_common_ofi.prov_include) {
|
||||
include_list = opal_argv_split(*opal_common_ofi.prov_include, ',');
|
||||
fabric_attr.prov_name = include_list[0];
|
||||
}
|
||||
|
||||
domain_attr.mr_mode = MCA_BTL_OFI_REQUESTED_MR_MODE;
|
||||
|
||||
@ -312,9 +310,13 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
|
||||
rc = fi_getinfo(FI_VERSION(1, 5), NULL, NULL, 0, &hints, &info_list);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("fi_getinfo failed with code %d: %s",rc, fi_strerror(-rc)));
|
||||
if (NULL != include_list) {
|
||||
opal_argv_free(include_list);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
/* count the number of resources */
|
||||
info = info_list;
|
||||
while(info) {
|
||||
@ -356,6 +358,9 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
|
||||
|
||||
/* We are done with the returned info. */
|
||||
fi_freeinfo(info_list);
|
||||
if (NULL != include_list) {
|
||||
opal_argv_free(include_list);
|
||||
}
|
||||
|
||||
/* pass module array back to caller */
|
||||
base_modules = calloc (mca_btl_ofi_component.module_count, sizeof (*base_modules));
|
||||
|
@ -2,6 +2,8 @@
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2020 Triad National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -9,22 +11,110 @@
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "opal_config.h"
|
||||
#include "opal/constants.h"
|
||||
#include "opal/mca/hwloc/base/base.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "common_ofi.h"
|
||||
#include "opal_config.h"
|
||||
#include "opal/constants.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/mca/base/mca_base_var.h"
|
||||
#include "opal/mca/base/mca_base_framework.h"
|
||||
#include "opal/mca/hwloc/base/base.h"
|
||||
|
||||
int mca_common_ofi_register_mca_variables(void)
|
||||
OPAL_DECLSPEC opal_common_ofi_module_t opal_common_ofi = {
|
||||
.prov_include = NULL,
|
||||
.prov_exclude = NULL,
|
||||
.registered = 0,
|
||||
.verbose = 0
|
||||
};
|
||||
|
||||
static const char default_prov_exclude_list[] = "shm,sockets,tcp,udp,rstream";
|
||||
|
||||
OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_component_t *component)
|
||||
{
|
||||
if (fi_version() >= FI_VERSION(1,0)) {
|
||||
return OPAL_SUCCESS;
|
||||
} else {
|
||||
static int registered = 0;
|
||||
static int include_index;
|
||||
static int exclude_index;
|
||||
static int verbose_index;
|
||||
|
||||
if (fi_version() < FI_VERSION(1,0)) {
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
if (!registered) {
|
||||
/*
|
||||
* this monkey business is needed because of the way the MCA VARs stuff tries to handle pointers to strings when
|
||||
* destructing the MCA var database. If you don't do something like this, the MCA var framework will try
|
||||
* to dereference a pointer which itself is no longer a valid address owing to having been previously dlclosed.
|
||||
*/
|
||||
opal_common_ofi.prov_include = (char **)malloc(sizeof(char *));
|
||||
*opal_common_ofi.prov_include = NULL;
|
||||
include_index = mca_base_var_register("opal", "opal_common", "ofi",
|
||||
"provider_include",
|
||||
"Comma-delimited list of OFI providers that are considered for use (e.g., \"psm,psm2\"; an empty value means that all providers will be considered). Mutually exclusive with mtl_ofi_provider_exclude.",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_1,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
opal_common_ofi.prov_include);
|
||||
opal_common_ofi.prov_exclude = (char **)malloc(sizeof(char *));
|
||||
*opal_common_ofi.prov_exclude = strdup(default_prov_exclude_list);
|
||||
exclude_index = mca_base_var_register("opal", "opal_common", "ofi",
|
||||
"provider_exclude",
|
||||
"Comma-delimited list of OFI providers that are not considered for use (default: \"sockets,mxm\"; empty value means that all providers will be considered). Mutually exclusive with mtl_ofi_provider_include.",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_1,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
opal_common_ofi.prov_exclude);
|
||||
verbose_index = mca_base_var_register("opal", "opal_common", "ofi", "verbose",
|
||||
"Verbose level of the OFI components",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
|
||||
MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&opal_common_ofi.verbose);
|
||||
registered = 1;
|
||||
}
|
||||
|
||||
if (component) {
|
||||
mca_base_var_register_synonym(include_index, component->mca_project_name,
|
||||
component->mca_type_name,
|
||||
component->mca_component_name,
|
||||
"provider_include", 0);
|
||||
mca_base_var_register_synonym(exclude_index, component->mca_project_name,
|
||||
component->mca_type_name,
|
||||
component->mca_component_name,
|
||||
"provider_exclude", 0);
|
||||
mca_base_var_register_synonym(verbose_index, component->mca_project_name,
|
||||
component->mca_type_name,
|
||||
component->mca_component_name,
|
||||
"verbose", 0);
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
OPAL_DECLSPEC void opal_common_ofi_mca_register(void)
|
||||
{
|
||||
opal_common_ofi.registered++;
|
||||
if (opal_common_ofi.registered > 1) {
|
||||
opal_output_set_verbosity(opal_common_ofi.output, opal_common_ofi.verbose);
|
||||
return;
|
||||
}
|
||||
|
||||
opal_common_ofi.output = opal_output_open(NULL);
|
||||
opal_output_set_verbosity(opal_common_ofi.output, opal_common_ofi.verbose);
|
||||
}
|
||||
|
||||
OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void)
|
||||
{
|
||||
/* unregister only on last deregister */
|
||||
opal_common_ofi.registered--;
|
||||
assert(opal_common_ofi.registered >= 0);
|
||||
if (opal_common_ofi.registered) {
|
||||
return;
|
||||
}
|
||||
opal_output_close(opal_common_ofi.output);
|
||||
}
|
||||
|
||||
/* check that the tx attributes match */
|
||||
|
@ -3,6 +3,9 @@
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2020 Triad National Security, LLC. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -12,9 +15,31 @@
|
||||
|
||||
#ifndef OPAL_MCA_COMMON_OFI_H
|
||||
#define OPAL_MCA_COMMON_OFI_H
|
||||
|
||||
#include "opal_config.h"
|
||||
#include "opal/mca/base/mca_base_var.h"
|
||||
#include "opal/mca/base/mca_base_framework.h"
|
||||
#include <rdma/fabric.h>
|
||||
|
||||
OPAL_DECLSPEC int mca_common_ofi_register_mca_variables(void);
|
||||
BEGIN_C_DECLS
|
||||
|
||||
typedef struct opal_common_ofi_module {
|
||||
char **prov_include;
|
||||
char **prov_exclude;
|
||||
int verbose;
|
||||
int registered;
|
||||
int output;
|
||||
} opal_common_ofi_module_t;
|
||||
|
||||
extern opal_common_ofi_module_t opal_common_ofi;
|
||||
|
||||
OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_component_t *component);
|
||||
OPAL_DECLSPEC void opal_common_ofi_mca_register(void);
|
||||
OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void);
|
||||
OPAL_DECLSPEC struct fi_info* opal_common_ofi_select_ofi_provider(struct fi_info *providers,
|
||||
char *framework_name);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
struct fi_info* opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int rank);
|
||||
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user