btl/usnic: update for libfabric v1.4
With libfabric v1.4, the usnic provider changed the values of its fabric and domain name strings (compared to libfabric <v1.4). Update the Open MPI usNIC BTL to handle both pre-v1.4 and v1.4 fabric/domain names. Signed-off-by: Jeff Squyres <jsquyres@cisco.com>
Этот коммит содержится в:
родитель
6de64ddbc1
Коммит
6f5e377fe0
@ -197,7 +197,7 @@ int opal_btl_usnic_connectivity_listen(opal_btl_usnic_module_t *module)
|
||||
/* Ensure that the passed strings are NUL-terminated */
|
||||
strncpy(cmd.nodename, opal_process_info.nodename,
|
||||
CONNECTIVITY_NODENAME_LEN - 1);
|
||||
strncpy(cmd.usnic_name, module->fabric_info->fabric_attr->name,
|
||||
strncpy(cmd.usnic_name, module->linux_device_name,
|
||||
CONNECTIVITY_IFNAME_LEN - 1);
|
||||
|
||||
if (OPAL_SUCCESS != opal_fd_write(agent_fd, sizeof(cmd), &cmd)) {
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2016 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -536,7 +536,7 @@ opal_btl_usnic_prepare_src(
|
||||
|
||||
#if MSGDEBUG2
|
||||
opal_output(0, "prep_src: %s %s frag %p, size=%d+%u (was %u), conv=%p\n",
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
(reserve + *size) <= module->max_frag_payload?"small":"large",
|
||||
(void *)frag, (int)reserve, (unsigned)*size, (unsigned)osize,
|
||||
(void *)convertor);
|
||||
@ -723,7 +723,7 @@ opal_btl_usnic_prepare_src(struct mca_btl_base_module_t *base_module,
|
||||
|
||||
#if MSGDEBUG2
|
||||
opal_output(0, "prep_src: %s %s frag %p, size=%d+%u (was %u), conv=%p\n",
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
(reserve + *size) <= module->max_frag_payload?"small":"large",
|
||||
(void *)frag, (int)reserve, (unsigned)*size, (unsigned)osize,
|
||||
(void *)convertor);
|
||||
|
@ -378,7 +378,7 @@ static int check_usnic_config(opal_btl_usnic_module_t *module,
|
||||
"not enough usnic resources",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
str);
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
@ -543,10 +543,12 @@ static bool filter_module(opal_btl_usnic_module_t *module,
|
||||
struct fi_usnic_info *uip;
|
||||
struct fi_info *info;
|
||||
bool match;
|
||||
const char *linux_device_name;
|
||||
|
||||
info = module->fabric_info;
|
||||
uip = &module->usnic_info;
|
||||
src = info->src_addr;
|
||||
linux_device_name = module->linux_device_name;
|
||||
module_mask = src->sin_addr.s_addr & uip->ui.v1.ui_netmask_be;
|
||||
match = false;
|
||||
for (i = 0; i < filter->n_elt; ++i) {
|
||||
@ -559,7 +561,7 @@ static bool filter_module(opal_btl_usnic_module_t *module,
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (strcmp(filter->elts[i].if_name, info->fabric_attr->name) == 0) {
|
||||
if (strcmp(filter->elts[i].if_name, linux_device_name) == 0) {
|
||||
match = true;
|
||||
break;
|
||||
}
|
||||
@ -590,6 +592,25 @@ static void free_filter(usnic_if_filter_t *filter)
|
||||
free(filter);
|
||||
}
|
||||
|
||||
static int do_fi_getinfo(uint32_t version, struct fi_info **info_list)
|
||||
{
|
||||
struct fi_info hints = {0};
|
||||
struct fi_ep_attr ep_attr = {0};
|
||||
struct fi_fabric_attr fabric_attr = {0};
|
||||
|
||||
/* We only want providers named "usnic" that are of type EP_DGRAM */
|
||||
fabric_attr.prov_name = "usnic";
|
||||
ep_attr.type = FI_EP_DGRAM;
|
||||
|
||||
hints.caps = FI_MSG;
|
||||
hints.mode = FI_LOCAL_MR | FI_MSG_PREFIX;
|
||||
hints.addr_format = FI_SOCKADDR;
|
||||
hints.ep_attr = &ep_attr;
|
||||
hints.fabric_attr = &fabric_attr;
|
||||
|
||||
return fi_getinfo(version, NULL, 0, 0, &hints, info_list);
|
||||
}
|
||||
|
||||
/*
|
||||
* UD component initialization:
|
||||
* (1) read interface list from kernel and compare against component
|
||||
@ -611,9 +632,6 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
||||
int min_distance, num_local_procs;
|
||||
struct fi_info *info_list;
|
||||
struct fi_info *info;
|
||||
struct fi_info hints = {0};
|
||||
struct fi_ep_attr ep_attr = {0};
|
||||
struct fi_fabric_attr fabric_attr = {0};
|
||||
struct fid_fabric *fabric;
|
||||
struct fid_domain *domain;
|
||||
int ret;
|
||||
@ -636,19 +654,9 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
||||
|
||||
OBJ_CONSTRUCT(&btl_usnic_lock, opal_recursive_mutex_t);
|
||||
|
||||
/* We only want providers named "usnic that are of type EP_DGRAM */
|
||||
fabric_attr.prov_name = "usnic";
|
||||
ep_attr.type = FI_EP_DGRAM;
|
||||
|
||||
hints.caps = FI_MSG;
|
||||
hints.mode = FI_LOCAL_MR | FI_MSG_PREFIX;
|
||||
hints.addr_format = FI_SOCKADDR;
|
||||
hints.ep_attr = &ep_attr;
|
||||
hints.fabric_attr = &fabric_attr;
|
||||
|
||||
/* This code understands libfabric API v1.0 and v1.1. Even if we
|
||||
were compiled with libfabric API v1.0, we still want to request
|
||||
v1.1 -- here's why:
|
||||
/* This code understands libfabric API versions v1.0, v1.1, and
|
||||
v1.4. Even if we were compiled with libfabric API v1.0, we
|
||||
still want to request v1.1 -- here's why:
|
||||
|
||||
- In libfabric v1.0.0 (i.e., API v1.0), the usnic provider did
|
||||
not check the value of the "version" parameter passed into
|
||||
@ -664,6 +672,17 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
||||
So never request API v1.0 -- always request a minimum of
|
||||
v1.1.
|
||||
|
||||
The usnic provider changed the strings in the fabric and domain
|
||||
names in API v1.4. With API <= v1.3:
|
||||
|
||||
- fabric name is "usnic_X" (device name)
|
||||
- domain name is NULL
|
||||
|
||||
With libfabric API >= v1.4:
|
||||
|
||||
- fabric name is "a.b.c.d/e" (CIDR notation of network)
|
||||
- domain name is "usnic_X" (device name)
|
||||
|
||||
NOTE: The configure.m4 in this component will require libfabric
|
||||
>= v1.1.0 (i.e., it won't accept v1.0.0) because of a critical
|
||||
bug in the usnic provider in libfabric v1.0.0. However, the
|
||||
@ -677,9 +696,17 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
||||
|
||||
Someday, #2 may no longer be true, and we may therefore rip out
|
||||
the libfabric v1.0.0 compatibility code. */
|
||||
|
||||
/* First try API version 1.4. If that doesn't work, try API
|
||||
version 1.1. */
|
||||
uint32_t libfabric_api;
|
||||
libfabric_api = FI_VERSION(1, 4);
|
||||
ret = do_fi_getinfo(libfabric_api, &info_list);
|
||||
// Libfabric core will return -FI_ENOSYS if it is too old
|
||||
if (-FI_ENOSYS == ret) {
|
||||
libfabric_api = FI_VERSION(1, 1);
|
||||
ret = fi_getinfo(libfabric_api, NULL, 0, 0, &hints, &info_list);
|
||||
ret = do_fi_getinfo(libfabric_api, &info_list);
|
||||
}
|
||||
if (0 != ret) {
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic: disqualifiying myself due to fi_getinfo failure: %s (%d)", strerror(-ret), ret);
|
||||
@ -800,13 +827,21 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
||||
i < mca_btl_usnic_component.max_modules);
|
||||
++i, info = info->next) {
|
||||
|
||||
// The fabric/domain names changed at libfabric API v1.4 (see above).
|
||||
char *linux_device_name;
|
||||
if (libfabric_api <= FI_VERSION(1, 3)) {
|
||||
linux_device_name = info->fabric_attr->name;
|
||||
} else {
|
||||
linux_device_name = info->domain_attr->name;
|
||||
}
|
||||
|
||||
ret = fi_fabric(info->fabric_attr, &fabric, NULL);
|
||||
if (0 != ret) {
|
||||
opal_show_help("help-mpi-btl-usnic.txt",
|
||||
"libfabric API failed",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
info->fabric_attr->name,
|
||||
linux_device_name,
|
||||
"fi_fabric()", __FILE__, __LINE__,
|
||||
ret,
|
||||
strerror(-ret));
|
||||
@ -820,7 +855,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
||||
"libfabric API failed",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
info->fabric_attr->name,
|
||||
linux_device_name,
|
||||
"fi_domain()", __FILE__, __LINE__,
|
||||
ret,
|
||||
strerror(-ret));
|
||||
@ -829,8 +864,8 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
||||
opal_memchecker_base_mem_defined(&domain, sizeof(domain));
|
||||
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic: found: usNIC direct device %s",
|
||||
info->fabric_attr->name);
|
||||
"btl:usnic: found: usNIC device %s",
|
||||
linux_device_name);
|
||||
|
||||
/* Save a little info on the module that we have already
|
||||
gathered. The rest of the module will be filled in
|
||||
@ -841,6 +876,12 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
||||
module->fabric = fabric;
|
||||
module->domain = domain;
|
||||
module->fabric_info = info;
|
||||
module->libfabric_api = libfabric_api;
|
||||
module->linux_device_name = strdup(linux_device_name);
|
||||
if (NULL == module->linux_device_name) {
|
||||
OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* Obtain usnic-specific device info (e.g., netmask) that
|
||||
doesn't come in the normal fi_getinfo(). This allows us to
|
||||
@ -850,7 +891,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
||||
if (ret != 0) {
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic: device %s fabric_open_ops failed %d (%s)",
|
||||
info->fabric_attr->name, ret, fi_strerror(-ret));
|
||||
module->linux_device_name, ret, fi_strerror(-ret));
|
||||
fi_close(&domain->fid);
|
||||
fi_close(&fabric->fid);
|
||||
continue;
|
||||
@ -863,14 +904,14 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
||||
if (ret != 0) {
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic: device %s usnic_getinfo failed %d (%s)",
|
||||
info->fabric_attr->name, ret, fi_strerror(-ret));
|
||||
module->linux_device_name, ret, fi_strerror(-ret));
|
||||
fi_close(&domain->fid);
|
||||
fi_close(&fabric->fid);
|
||||
continue;
|
||||
}
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic: device %s usnic_info: link speed=%d, netmask=0x%x, ifname=%s, num_vf=%d, qp/vf=%d, cq/vf=%d",
|
||||
info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
(unsigned int) module->usnic_info.ui.v1.ui_link_speed,
|
||||
(unsigned int) module->usnic_info.ui.v1.ui_netmask_be,
|
||||
module->usnic_info.ui.v1.ui_ifname,
|
||||
@ -884,7 +925,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic: %s %s due to %s",
|
||||
(keep_module ? "keeping" : "skipping"),
|
||||
info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
(filter_incl ? "if_include" : "if_exclude"));
|
||||
if (!keep_module) {
|
||||
fi_close(&domain->fid);
|
||||
@ -902,7 +943,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
||||
check_usnic_config(module, num_local_procs) != OPAL_SUCCESS) {
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic: device %s is not provisioned with enough resources -- skipping",
|
||||
info->fabric_attr->name);
|
||||
module->linux_device_name);
|
||||
fi_close(&domain->fid);
|
||||
fi_close(&fabric->fid);
|
||||
|
||||
@ -916,7 +957,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
||||
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic: device %s looks good!",
|
||||
info->fabric_attr->name);
|
||||
module->linux_device_name);
|
||||
|
||||
/* Let this module advance to the next round! */
|
||||
btls[j++] = &(module->super);
|
||||
@ -966,7 +1007,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
||||
btls[num_final_modules++] = &(module->super);
|
||||
|
||||
/* Output all of this module's values. */
|
||||
const char *devname = module->fabric_info->fabric_attr->name;
|
||||
const char *devname = module->linux_device_name;
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic: %s num sqe=%d, num rqe=%d, num cqe=%d, num aveqe=%d",
|
||||
devname,
|
||||
@ -1212,7 +1253,7 @@ usnic_handle_cq_error(opal_btl_usnic_module_t* module,
|
||||
|
||||
if (cq_ret != -FI_EAVAIL) {
|
||||
BTL_ERROR(("%s: cq_read ret = %d (%s)",
|
||||
module->fabric_info->fabric_attr->name, cq_ret,
|
||||
module->linux_device_name, cq_ret,
|
||||
fi_strerror(-cq_ret)));
|
||||
channel->chan_error = true;
|
||||
}
|
||||
@ -1222,7 +1263,7 @@ usnic_handle_cq_error(opal_btl_usnic_module_t* module,
|
||||
return;
|
||||
} else if (rc != mca_btl_usnic_component.cq_readerr_success_value) {
|
||||
BTL_ERROR(("%s: cq_readerr ret = %d (expected %d)",
|
||||
module->fabric_info->fabric_attr->name, rc,
|
||||
module->linux_device_name, rc,
|
||||
(int) mca_btl_usnic_component.cq_readerr_success_value));
|
||||
channel->chan_error = true;
|
||||
}
|
||||
@ -1235,7 +1276,7 @@ usnic_handle_cq_error(opal_btl_usnic_module_t* module,
|
||||
static int once = 0;
|
||||
if (once++ == 0) {
|
||||
BTL_ERROR(("%s: Channel %d, %s",
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
channel->chan_index,
|
||||
FI_ECRC == err_entry.prov_errno ?
|
||||
"CRC error" : "message truncation"));
|
||||
@ -1256,7 +1297,7 @@ usnic_handle_cq_error(opal_btl_usnic_module_t* module,
|
||||
}
|
||||
} else {
|
||||
BTL_ERROR(("%s: CQ[%d] prov_err = %d",
|
||||
module->fabric_info->fabric_attr->name, channel->chan_index,
|
||||
module->linux_device_name, channel->chan_index,
|
||||
err_entry.prov_errno));
|
||||
channel->chan_error = true;
|
||||
}
|
||||
@ -1469,7 +1510,7 @@ void opal_btl_usnic_component_debug(void)
|
||||
module = mca_btl_usnic_component.usnic_active_modules[i];
|
||||
|
||||
opal_output(0, "active_modules[%d]=%p %s max{frag,chunk,tiny}=%llu,%llu,%llu\n",
|
||||
i, (void *)module, module->fabric_info->fabric_attr->name,
|
||||
i, (void *)module, module->linux_device_name,
|
||||
(unsigned long long)module->max_frag_payload,
|
||||
(unsigned long long)module->max_chunk_payload,
|
||||
(unsigned long long)module->max_tiny_payload);
|
||||
|
@ -162,7 +162,7 @@ static hwloc_obj_t find_device_numa(opal_btl_usnic_module_t *module)
|
||||
if (obj->type != HWLOC_OBJ_NODE) {
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic:filter_numa: could not find NUMA node for %s; filtering by NUMA distance not possible",
|
||||
module->fabric_info->fabric_attr->name);
|
||||
module->linux_device_name);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -218,7 +218,7 @@ int opal_btl_usnic_hwloc_distance(opal_btl_usnic_module_t *module)
|
||||
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic:filter_numa: %s is distance %d from me",
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
module->numa_distance);
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -30,8 +30,8 @@ static int map_compare_modules(const void *aa, const void *bb)
|
||||
opal_btl_usnic_module_t *a = *((opal_btl_usnic_module_t**) aa);
|
||||
opal_btl_usnic_module_t *b = *((opal_btl_usnic_module_t**) bb);
|
||||
|
||||
return strcmp(a->fabric_info->fabric_attr->name,
|
||||
b->fabric_info->fabric_attr->name);
|
||||
return strcmp(a->linux_device_name,
|
||||
b->linux_device_name);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -74,7 +74,7 @@ static int map_output_modules(FILE *fp)
|
||||
prefix_len);
|
||||
|
||||
fprintf(fp, "device=%s,ip=%s,mss=%" PRIsize_t "\n",
|
||||
modules[i]->fabric_info->fabric_attr->name,
|
||||
modules[i]->linux_device_name,
|
||||
ipv4, modules[i]->fabric_info->ep_attr->max_msg_size);
|
||||
}
|
||||
|
||||
@ -102,8 +102,8 @@ static int map_compare_endpoints(const void *aa, const void *bb)
|
||||
return -1;
|
||||
}
|
||||
|
||||
return strcmp(a->endpoint_module->fabric_info->fabric_attr->name,
|
||||
b->endpoint_module->fabric_info->fabric_attr->name);
|
||||
return strcmp(a->endpoint_module->linux_device_name,
|
||||
b->endpoint_module->linux_device_name);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -148,7 +148,7 @@ static int map_output_endpoints(FILE *fp, opal_btl_usnic_proc_t *proc)
|
||||
eps[i]->endpoint_remote_modex.netmask);
|
||||
|
||||
fprintf(fp, "device=%s@peer_ip=%s",
|
||||
eps[i]->endpoint_module->fabric_info->fabric_attr->name,
|
||||
eps[i]->endpoint_module->linux_device_name,
|
||||
ipv4);
|
||||
++num_output;
|
||||
}
|
||||
|
@ -102,7 +102,7 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
|
||||
if (opal_proc == my_proc) {
|
||||
opal_output_verbose(75, USNIC_OUT,
|
||||
"btl:usnic:add_procs:%s: not connecting to self",
|
||||
module->fabric_info->fabric_attr->name);
|
||||
module->linux_device_name);
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -110,7 +110,7 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
|
||||
if (OPAL_PROC_ON_LOCAL_NODE(opal_proc->proc_flags)) {
|
||||
opal_output_verbose(75, USNIC_OUT,
|
||||
"btl:usnic:add_procs:%s: not connecting to %s on same server",
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
usnic_compat_proc_name_print(&opal_proc->proc_name));
|
||||
continue;
|
||||
}
|
||||
@ -126,7 +126,7 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
|
||||
skip it */
|
||||
opal_output_verbose(75, USNIC_OUT,
|
||||
"btl:usnic:add_procs:%s: peer %s on %s does not have usnic modex info; skipping",
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
usnic_compat_proc_name_print(&opal_proc->proc_name),
|
||||
opal_get_proc_hostname(opal_proc));
|
||||
continue;
|
||||
@ -142,7 +142,7 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic:add_procs:%s: unable to create endpoint to peer %s on %s",
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
usnic_compat_proc_name_print(&opal_proc->proc_name),
|
||||
opal_get_proc_hostname(opal_proc));
|
||||
OBJ_RELEASE(usnic_proc);
|
||||
@ -161,7 +161,7 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
|
||||
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic:add_procs:%s: new usnic peer endpoint: %s, proirity port %d, data port %d",
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
str,
|
||||
modex->ports[USNIC_PRIORITY_CHANNEL],
|
||||
modex->ports[USNIC_DATA_CHANNEL]);
|
||||
@ -197,14 +197,14 @@ static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module,
|
||||
|
||||
opal_output_verbose(15, USNIC_OUT,
|
||||
"btl:usnic: %s (which is %s) couldn't reach peer %s",
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
module->if_ipv4_addr_str,
|
||||
remote);
|
||||
opal_show_help("help-mpi-btl-usnic.txt", "unreachable peer IP",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->if_ipv4_addr_str,
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
|
||||
remote);
|
||||
}
|
||||
@ -303,7 +303,7 @@ add_procs_block_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
||||
"libfabric API failed",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
"async insertion result", __FILE__, __LINE__,
|
||||
err_entry.err,
|
||||
"Failed to insert address to AV");
|
||||
@ -327,7 +327,7 @@ add_procs_block_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
||||
"internal error during init",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
"fi_eq_readerr()", __FILE__, __LINE__,
|
||||
ret,
|
||||
"Returned != sizeof(err_entry)");
|
||||
@ -348,7 +348,7 @@ add_procs_block_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
||||
"internal error during init",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
"fi_eq_sread()", __FILE__, __LINE__,
|
||||
ret,
|
||||
"Returned != (sizeof(entry) or -FI_EAVAIL)");
|
||||
@ -904,6 +904,8 @@ static int usnic_finalize(struct mca_btl_base_module_t* btl)
|
||||
fi_close(&module->domain->fid);
|
||||
fi_close(&module->fabric->fid);
|
||||
|
||||
free(module->linux_device_name);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
@ -1423,7 +1425,7 @@ static void module_async_event_callback(int fd, short flags, void *arg)
|
||||
opal_show_help("help-mpi-btl-usnic.txt", "libfabric API failed",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
"fi_eq_read()", __FILE__, __LINE__,
|
||||
ret,
|
||||
"Failed to get domain event");
|
||||
@ -1442,7 +1444,7 @@ static void module_async_event_callback(int fd, short flags, void *arg)
|
||||
ignore it. */
|
||||
opal_output_verbose(10, USNIC_OUT,
|
||||
"btl:usnic: got LINK_UP on %s",
|
||||
module->fabric_info->fabric_attr->name);
|
||||
module->linux_device_name);
|
||||
break;
|
||||
|
||||
case 1: // USD_EVENT_LINK_DOWN:
|
||||
@ -1461,7 +1463,7 @@ static void module_async_event_callback(int fd, short flags, void *arg)
|
||||
opal_show_help("help-mpi-btl-usnic.txt", "async event",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
str, entry.data);
|
||||
fatal = true;
|
||||
}
|
||||
@ -1492,7 +1494,7 @@ static int create_ep(opal_btl_usnic_module_t* module,
|
||||
"internal error during init",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
"fi_dupinfo() failed", __FILE__, __LINE__,
|
||||
-1, "Unknown");
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
@ -1510,14 +1512,14 @@ static int create_ep(opal_btl_usnic_module_t* module,
|
||||
opal_process_info.my_local_rank);
|
||||
}
|
||||
|
||||
rc = fi_getinfo(FI_VERSION(1, 1), NULL, 0, 0, hint, &channel->info);
|
||||
rc = fi_getinfo(module->libfabric_api, NULL, 0, 0, hint, &channel->info);
|
||||
fi_freeinfo(hint);
|
||||
if (0 != rc) {
|
||||
opal_show_help("help-mpi-btl-usnic.txt",
|
||||
"internal error during init",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
"fi_getinfo() failed", __FILE__, __LINE__,
|
||||
rc, fi_strerror(-rc));
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
@ -1553,7 +1555,7 @@ static int create_ep(opal_btl_usnic_module_t* module,
|
||||
"internal error during init",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
"fi_endpoint() failed", __FILE__, __LINE__,
|
||||
rc, fi_strerror(-rc));
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
@ -1566,7 +1568,7 @@ static int create_ep(opal_btl_usnic_module_t* module,
|
||||
"internal error during init",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
"fi_ep_bind() SCQ to EP failed", __FILE__, __LINE__,
|
||||
rc, fi_strerror(-rc));
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
@ -1577,7 +1579,7 @@ static int create_ep(opal_btl_usnic_module_t* module,
|
||||
"internal error during init",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
"fi_ep_bind() RCQ to EP failed", __FILE__, __LINE__,
|
||||
rc, fi_strerror(-rc));
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
@ -1588,7 +1590,7 @@ static int create_ep(opal_btl_usnic_module_t* module,
|
||||
"internal error during init",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
"fi_ep_bind() AV to EP failed", __FILE__, __LINE__,
|
||||
rc, fi_strerror(-rc));
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
@ -1601,7 +1603,7 @@ static int create_ep(opal_btl_usnic_module_t* module,
|
||||
"internal error during init",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
"fi_enable() failed", __FILE__, __LINE__,
|
||||
rc, fi_strerror(-rc));
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
@ -1623,7 +1625,7 @@ static int create_ep(opal_btl_usnic_module_t* module,
|
||||
"internal error during init",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
"fi_getname() failed", __FILE__, __LINE__,
|
||||
rc, fi_strerror(-rc));
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
@ -1714,7 +1716,7 @@ static int init_one_channel(opal_btl_usnic_module_t *module,
|
||||
"internal error during init",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
"failed to create CQ", __FILE__, __LINE__);
|
||||
goto error;
|
||||
}
|
||||
@ -1770,7 +1772,7 @@ static int init_one_channel(opal_btl_usnic_module_t *module,
|
||||
"internal error during init",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
"Failed to get receive buffer from freelist",
|
||||
__FILE__, __LINE__);
|
||||
goto error;
|
||||
@ -1786,7 +1788,7 @@ static int init_one_channel(opal_btl_usnic_module_t *module,
|
||||
"internal error during init",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
"Failed to post receive buffer",
|
||||
__FILE__, __LINE__);
|
||||
goto error;
|
||||
@ -1853,7 +1855,7 @@ static void init_local_modex_part1(opal_btl_usnic_module_t *module)
|
||||
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic: %s IP charactertics: %s, %u Mbps",
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
module->if_ipv4_addr_str,
|
||||
modex->link_speed_mbps);
|
||||
}
|
||||
@ -2074,7 +2076,7 @@ static int init_mpool(opal_btl_usnic_module_t *module)
|
||||
"internal error during init",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
"create rcache", __FILE__, __LINE__);
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
@ -2082,7 +2084,7 @@ static int init_mpool(opal_btl_usnic_module_t *module)
|
||||
mca_mpool_base_module_lookup (mca_btl_usnic_component.usnic_mpool_hints);
|
||||
#else
|
||||
asprintf(&mpool_resources.pool_name, "%s",
|
||||
module->fabric_info->fabric_attr->name);
|
||||
module->linux_device_name);
|
||||
module->super.btl_mpool =
|
||||
mca_mpool_base_module_create(mca_btl_usnic_component.usnic_mpool_name,
|
||||
&module->super, &mpool_resources);
|
||||
@ -2092,7 +2094,7 @@ static int init_mpool(opal_btl_usnic_module_t *module)
|
||||
"internal error during init",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
"create mpool", __FILE__, __LINE__);
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
@ -2205,7 +2207,7 @@ static void init_async_event(opal_btl_usnic_module_t *module)
|
||||
"libfabric API failed",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
"fi_control(eq, FI_GETWAIT)", __FILE__, __LINE__,
|
||||
ret,
|
||||
fi_strerror(-ret));
|
||||
|
@ -103,8 +103,10 @@ typedef struct opal_btl_usnic_module_t {
|
||||
|
||||
/* Cache for use during component_init to associate a module with
|
||||
the libfabric device that it came from. */
|
||||
uint32_t libfabric_api;
|
||||
struct fid_fabric *fabric;
|
||||
struct fid_domain *domain;
|
||||
char *linux_device_name;
|
||||
struct fi_info *fabric_info;
|
||||
struct fi_usnic_ops_fabric *usnic_fabric_ops;
|
||||
struct fi_usnic_ops_av *usnic_av_ops;
|
||||
|
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -643,7 +643,7 @@ static int match_modex(opal_btl_usnic_module_t *module,
|
||||
opal_show_help("help-mpi-btl-usnic.txt", "MTU mismatch",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
module->fabric_info->ep_attr->max_msg_size,
|
||||
(NULL == proc->proc_opal->proc_hostname) ?
|
||||
"unknown" : proc->proc_opal->proc_hostname,
|
||||
@ -700,7 +700,7 @@ static int start_av_insert(opal_btl_usnic_module_t *module,
|
||||
opal_show_help("help-mpi-btl-usnic.txt", "libfabric API failed",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
"fi_av_insert()", __FILE__, __LINE__,
|
||||
ret,
|
||||
"Failed to initiate AV insert");
|
||||
|
@ -216,7 +216,7 @@ opal_btl_usnic_endpoint_send_segment(
|
||||
"CHUNK" : "FRAG",
|
||||
sseg->ss_base.us_btl_header->pkt_seq,
|
||||
sseg->ss_base.us_btl_header->sender,
|
||||
endpoint->endpoint_module->fabric_info->fabric_attr->name,
|
||||
endpoint->endpoint_module->linux_device_name,
|
||||
local_ip,
|
||||
module->local_modex.ports[sseg->ss_channel],
|
||||
(void*)sseg,
|
||||
|
@ -86,7 +86,7 @@ void opal_btl_usnic_print_stats(
|
||||
prefix,
|
||||
opal_proc_local_get()->proc_name.vpid,
|
||||
|
||||
module->fabric_info->fabric_attr->name,
|
||||
module->linux_device_name,
|
||||
|
||||
module->stats.num_total_sends,
|
||||
module->mod_channels[USNIC_PRIORITY_CHANNEL].num_channel_sends,
|
||||
@ -394,7 +394,7 @@ static void setup_mpit_pvars_enum(void)
|
||||
|
||||
devices[i].value = i;
|
||||
rc = asprintf(&str, "%s,%hhu.%hhu.%hhu.%hhu/%" PRIu32,
|
||||
m->fabric_info->fabric_attr->name,
|
||||
m->linux_device_name,
|
||||
c[0], c[1], c[2], c[3],
|
||||
usnic_netmask_to_cidrlen(sin->sin_addr.s_addr));
|
||||
assert(rc > 0);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user