usnic: don't overrun the fi_av_insert() EQ
Add endpoints in a blocked manner so that we don't overrun the fi_av_insert() event queue. Also make the AV EQ length an MCA param, and report it in mca_btl_base_verbose >=5 output.
Этот коммит содержится в:
родитель
d624e0d60f
Коммит
db825abc00
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -181,6 +181,9 @@ typedef struct opal_btl_usnic_component_t {
|
||||
/** max completion queue entries per module */
|
||||
int32_t cq_num;
|
||||
|
||||
/** max number of entries in AV EQ */
|
||||
int32_t av_eq_num;
|
||||
|
||||
/** retrans characteristics */
|
||||
int retrans_timeout;
|
||||
|
||||
|
@ -956,11 +956,12 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
||||
/* Output all of this module's values. */
|
||||
const char *devname = module->fabric_info->fabric_attr->name;
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic: %s num sqe=%d, num rqe=%d, num cqe=%d",
|
||||
"btl:usnic: %s num sqe=%d, num rqe=%d, num cqe=%d, num aveqe=%d",
|
||||
devname,
|
||||
module->sd_num,
|
||||
module->rd_num,
|
||||
module->cq_num);
|
||||
module->cq_num,
|
||||
module->av_eq_num);
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic: %s priority MTU = %" PRIsize_t,
|
||||
devname,
|
||||
|
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2008-2016 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
@ -162,6 +162,7 @@ int opal_btl_usnic_component_register(void)
|
||||
static int prio_sd_num;
|
||||
static int prio_rd_num;
|
||||
static int cq_num;
|
||||
static int av_eq_num;
|
||||
static int udp_port_base;
|
||||
static int max_tiny_msg_size;
|
||||
static int eager_limit;
|
||||
@ -235,6 +236,10 @@ int opal_btl_usnic_component_register(void)
|
||||
-1, &cq_num, REGINT_NEG_ONE_OK, OPAL_INFO_LVL_5));
|
||||
mca_btl_usnic_component.cq_num = (int32_t) cq_num;
|
||||
|
||||
CHECK(reg_int("av_eq_num", "Number of event queue entries for peer address resolution (-1 = pre-set defaults; depends on number and type of devices available; will error if ac_eq_num < 8)",
|
||||
-1, &av_eq_num, REGINT_NEG_ONE_OK, OPAL_INFO_LVL_5));
|
||||
mca_btl_usnic_component.av_eq_num = (int32_t) av_eq_num;
|
||||
|
||||
CHECK(reg_int("base_udp_port", "Base UDP port to use for usNIC communications. If 0, system will pick the port number. If non-zero, it will be added to each process' local rank to obtain the final port number (default: 0)",
|
||||
0, &udp_port_base, REGINT_GE_ZERO, OPAL_INFO_LVL_5));
|
||||
mca_btl_usnic_component.udp_port_base = (int) udp_port_base;
|
||||
|
@ -69,13 +69,14 @@ static void finalize_one_channel(opal_btl_usnic_module_t *module,
|
||||
|
||||
|
||||
/*
|
||||
* Loop over all procs sent to us in add_procs and see if we want to
|
||||
* add a proc/endpoint for them.
|
||||
* Loop over a block of procs sent to us in add_procs and see if we
|
||||
* want to add a proc/endpoint for them.
|
||||
*/
|
||||
static int add_procs_create_endpoints(opal_btl_usnic_module_t *module,
|
||||
size_t nprocs,
|
||||
opal_proc_t **procs,
|
||||
mca_btl_base_endpoint_t **endpoints)
|
||||
static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
|
||||
size_t block_offset,
|
||||
size_t block_len,
|
||||
opal_proc_t **procs,
|
||||
mca_btl_base_endpoint_t **endpoints)
|
||||
{
|
||||
int rc;
|
||||
opal_proc_t* my_proc;
|
||||
@ -87,8 +88,8 @@ static int add_procs_create_endpoints(opal_btl_usnic_module_t *module,
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* Loop over the procs we were given */
|
||||
for (size_t i = 0; i < nprocs; i++) {
|
||||
/* Loop over a block in the procs we were given */
|
||||
for (size_t i = block_offset; i < (block_offset + block_len); i++) {
|
||||
struct opal_proc_t* opal_proc = procs[i];
|
||||
opal_btl_usnic_proc_t* usnic_proc;
|
||||
mca_btl_base_endpoint_t* usnic_endpoint;
|
||||
@ -195,9 +196,10 @@ static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module,
|
||||
* invoked. Go reap them all.
|
||||
*/
|
||||
static int
|
||||
add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
||||
size_t array_len,
|
||||
struct mca_btl_base_endpoint_t **endpoints)
|
||||
add_procs_block_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
||||
size_t block_offset,
|
||||
size_t block_len,
|
||||
struct mca_btl_base_endpoint_t **endpoints)
|
||||
{
|
||||
int ret = OPAL_SUCCESS;
|
||||
int num_left;
|
||||
@ -205,12 +207,11 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
||||
uint32_t event;
|
||||
struct fi_eq_entry entry;
|
||||
struct fi_eq_err_entry err_entry;
|
||||
|
||||
bool error_occurred = false;
|
||||
|
||||
/* compute num fi_av_insert completions we are waiting for */
|
||||
num_left = 0;
|
||||
for (i = 0; i < array_len; ++i) {
|
||||
for (i = block_offset; i < (block_offset + block_len); ++i) {
|
||||
if (NULL != endpoints[i]) {
|
||||
num_left += USNIC_NUM_CHANNELS;
|
||||
}
|
||||
@ -266,7 +267,7 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
||||
We therefore only want to print a pretty
|
||||
warning about (and OBJ_RELEASE) that endpoint
|
||||
the *first* time it is reported. */
|
||||
for (i = 0; i < array_len; ++i) {
|
||||
for (i = block_offset; i < (block_offset + block_len); ++i) {
|
||||
if (endpoints[i] == context->endpoint) {
|
||||
add_procs_warn_unreachable(module,
|
||||
context->endpoint);
|
||||
@ -348,7 +349,7 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
||||
- If an otherwise-valid endpoint has no dest, that means we timed
|
||||
out trying to resolve it, so just release that endpoint. */
|
||||
size_t num_endpoints_created = 0;
|
||||
for (i = 0; i < array_len; i++) {
|
||||
for (i = block_offset; i < (block_offset + block_len); i++) {
|
||||
if (NULL != endpoints[i]) {
|
||||
bool happy;
|
||||
|
||||
@ -382,6 +383,79 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Create endpoints for the procs we were given in add_procs.
|
||||
*/
|
||||
static int add_procs_create_endpoints(struct opal_btl_usnic_module_t* module,
|
||||
size_t nprocs,
|
||||
struct opal_proc_t **procs,
|
||||
struct mca_btl_base_endpoint_t** endpoints)
|
||||
{
|
||||
/* We need to ensure that we don't overrun the libfabric AV EQ.
|
||||
Divide up all the peer address resolutions we need to do into a
|
||||
series of blocks; insert and complete each block before moving
|
||||
to the next (note: if performance mandates it, we can move to a
|
||||
sliding window style of AV inserts to get better concurrency of
|
||||
AV resolution). */
|
||||
|
||||
/* Leave a few empty slots in the AV EQ, just for good measure */
|
||||
if (module->av_eq_size < 8) {
|
||||
opal_show_help("help-mpi-btl-usnic.txt", "fi_av_eq too small",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->av_eq_size,
|
||||
8);
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
size_t eq_size = module->av_eq_size - 8;
|
||||
size_t block_len = eq_size;
|
||||
size_t num_av_inserts = nprocs * USNIC_NUM_CHANNELS;
|
||||
size_t num_blocks = num_av_inserts / eq_size;
|
||||
if (eq_size % num_av_inserts != 0) {
|
||||
++num_blocks;
|
||||
}
|
||||
|
||||
/* Per above, the blocks are expressed in terms of number of AV
|
||||
inserts. Convert them to be expressed in terms of number of
|
||||
procs. */
|
||||
block_len /= USNIC_NUM_CHANNELS;
|
||||
|
||||
/* Per above, loop over creating the endpoints so that we do not
|
||||
overrun the libfabric AV EQ. */
|
||||
int rc;
|
||||
for (size_t block_offset = 0, block = 0; block < num_blocks;
|
||||
block_offset += block_len, ++block) {
|
||||
/* Adjust for the last block */
|
||||
if (block_len > (nprocs - block_offset)) {
|
||||
block_len = nprocs - block_offset;
|
||||
}
|
||||
|
||||
/* First, create endpoints (and procs, if they're not already
|
||||
created) for the usnic-reachable procs we were given. */
|
||||
rc = add_procs_block_create_endpoints(module,
|
||||
block_offset, block_len,
|
||||
procs, endpoints);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* For each endpoint that was created, we initiated the
|
||||
process to create NUM_CHANNELS fi_addrs. Go finish all of
|
||||
those. This will be the final determination of whether we
|
||||
can use the endpoint or not because we'll find out if each
|
||||
endpoint is reachable or not. */
|
||||
rc = add_procs_block_reap_fi_av_inserts(module,
|
||||
block_offset, block_len,
|
||||
endpoints);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Add procs to this BTL module, receiving endpoint information from
|
||||
* the modex. This is done in 2 phases:
|
||||
@ -408,23 +482,13 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
|
||||
opal_btl_usnic_module_t* module = (opal_btl_usnic_module_t*) base_module;
|
||||
int rc;
|
||||
|
||||
/* First, create endpoints (and procs, if they're not already
|
||||
created) for all the usnic-reachable procs we were given. */
|
||||
/* Go create the endpoints (including all relevant address
|
||||
resolution) */
|
||||
rc = add_procs_create_endpoints(module, nprocs, procs, endpoints);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* For each endpoint that was created, we initiated the process to
|
||||
create NUM_CHANNELS fi_addrs. Go finish all of those. This
|
||||
will be the final determination of whether we can use the
|
||||
endpoint or not because we'll find out if each endpoint is
|
||||
reachable or not. */
|
||||
rc = add_procs_reap_fi_av_inserts(module, nprocs, endpoints);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* Find all the endpoints with a complete set of USD destinations
|
||||
and mark them as reachable */
|
||||
for (size_t i = 0; NULL != reachable && i < nprocs; ++i) {
|
||||
@ -1831,6 +1895,11 @@ static void init_queue_lengths(opal_btl_usnic_module_t *module)
|
||||
} else {
|
||||
module->cq_num = mca_btl_usnic_component.cq_num;
|
||||
}
|
||||
if (-1 == mca_btl_usnic_component.av_eq_num) {
|
||||
module->av_eq_num = 1024;
|
||||
} else {
|
||||
module->av_eq_num = mca_btl_usnic_component.av_eq_num;
|
||||
}
|
||||
|
||||
/*
|
||||
* Queue sizes for priority channel scale with # of endpoint. A
|
||||
@ -2018,12 +2087,15 @@ static int init_channels(opal_btl_usnic_module_t *module)
|
||||
}
|
||||
|
||||
memset(&eq_attr, 0, sizeof(eq_attr));
|
||||
eq_attr.size = 1024;
|
||||
eq_attr.size = module->av_eq_num;
|
||||
eq_attr.wait_obj = FI_WAIT_UNSPEC;
|
||||
rc = fi_eq_open(module->fabric, &eq_attr, &module->av_eq, NULL);
|
||||
if (rc != OPAL_SUCCESS) {
|
||||
goto destroy;
|
||||
}
|
||||
// Save the size of the created EQ
|
||||
module->av_eq_size = eq_attr.size;
|
||||
|
||||
eq_attr.wait_obj = FI_WAIT_FD;
|
||||
rc = fi_eq_open(module->fabric, &eq_attr, &module->dom_eq, NULL);
|
||||
if (rc != OPAL_SUCCESS) {
|
||||
|
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -110,6 +110,8 @@ typedef struct opal_btl_usnic_module_t {
|
||||
struct fid_eq *av_eq;
|
||||
struct fid_av *av;
|
||||
|
||||
size_t av_eq_size;
|
||||
|
||||
mca_btl_base_module_error_cb_fn_t pml_error_callback;
|
||||
|
||||
/* Information about the events */
|
||||
@ -127,6 +129,7 @@ typedef struct opal_btl_usnic_module_t {
|
||||
int sd_num;
|
||||
int rd_num;
|
||||
int cq_num;
|
||||
int av_eq_num;
|
||||
int prio_sd_num;
|
||||
int prio_rd_num;
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2012-2015 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2012-2016 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
@ -240,6 +240,20 @@ abort.
|
||||
usNIC interface: %s
|
||||
Current ARP timeout: %d (btl_usnic_arp_timeout MCA param)
|
||||
#
|
||||
[fi_av_eq too small]
|
||||
|
||||
The usnic BTL was told to create an address resolution queue that was
|
||||
too small via the mca_btl_usnic_av_eq_num MCA parameter. This
|
||||
parameter controls how many outstanding peer address resolutions can
|
||||
be outstanding at a time. Larger values allow more concurrent address
|
||||
resolutions, but consume more memory.
|
||||
|
||||
Server: %s
|
||||
av_eq_num param value: %d
|
||||
av_eq_num minimum value: %d
|
||||
|
||||
Your job will likely either perform poorly, or will abort.
|
||||
#
|
||||
[unreachable peer IP]
|
||||
WARNING: Open MPI failed to find a route to a peer IP address via a
|
||||
specific usNIC interface. This usually indicates a problem in the IP
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user