1
1

usnic: don't overrun the fi_av_insert() EQ

Add endpoints in a blocked manner so that we don't overrun the
fi_av_insert() event queue.  Also make the AV EQ length an MCA param,
and report it in mca_btl_base_verbose >=5 output.
Этот коммит содержится в:
Jeff Squyres 2016-01-30 06:58:46 -08:00
родитель d624e0d60f
Коммит db825abc00
6 изменённых файлов: 132 добавлений и 34 удалений

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -181,6 +181,9 @@ typedef struct opal_btl_usnic_component_t {
/** max completion queue entries per module */
int32_t cq_num;
/** max number of entries in AV EQ */
int32_t av_eq_num;
/** retrans characteristics */
int retrans_timeout;

Просмотреть файл

@ -956,11 +956,12 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
/* Output all of this module's values. */
const char *devname = module->fabric_info->fabric_attr->name;
opal_output_verbose(5, USNIC_OUT,
"btl:usnic: %s num sqe=%d, num rqe=%d, num cqe=%d",
"btl:usnic: %s num sqe=%d, num rqe=%d, num cqe=%d, num aveqe=%d",
devname,
module->sd_num,
module->rd_num,
module->cq_num);
module->cq_num,
module->av_eq_num);
opal_output_verbose(5, USNIC_OUT,
"btl:usnic: %s priority MTU = %" PRIsize_t,
devname,

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2008-2016 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
@ -162,6 +162,7 @@ int opal_btl_usnic_component_register(void)
static int prio_sd_num;
static int prio_rd_num;
static int cq_num;
static int av_eq_num;
static int udp_port_base;
static int max_tiny_msg_size;
static int eager_limit;
@ -235,6 +236,10 @@ int opal_btl_usnic_component_register(void)
-1, &cq_num, REGINT_NEG_ONE_OK, OPAL_INFO_LVL_5));
mca_btl_usnic_component.cq_num = (int32_t) cq_num;
CHECK(reg_int("av_eq_num", "Number of event queue entries for peer address resolution (-1 = pre-set defaults; depends on number and type of devices available; will error if ac_eq_num < 8)",
-1, &av_eq_num, REGINT_NEG_ONE_OK, OPAL_INFO_LVL_5));
mca_btl_usnic_component.av_eq_num = (int32_t) av_eq_num;
CHECK(reg_int("base_udp_port", "Base UDP port to use for usNIC communications. If 0, system will pick the port number. If non-zero, it will be added to each process' local rank to obtain the final port number (default: 0)",
0, &udp_port_base, REGINT_GE_ZERO, OPAL_INFO_LVL_5));
mca_btl_usnic_component.udp_port_base = (int) udp_port_base;

Просмотреть файл

@ -69,13 +69,14 @@ static void finalize_one_channel(opal_btl_usnic_module_t *module,
/*
* Loop over all procs sent to us in add_procs and see if we want to
* add a proc/endpoint for them.
* Loop over a block of procs sent to us in add_procs and see if we
* want to add a proc/endpoint for them.
*/
static int add_procs_create_endpoints(opal_btl_usnic_module_t *module,
size_t nprocs,
opal_proc_t **procs,
mca_btl_base_endpoint_t **endpoints)
static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
size_t block_offset,
size_t block_len,
opal_proc_t **procs,
mca_btl_base_endpoint_t **endpoints)
{
int rc;
opal_proc_t* my_proc;
@ -87,8 +88,8 @@ static int add_procs_create_endpoints(opal_btl_usnic_module_t *module,
return OPAL_ERR_OUT_OF_RESOURCE;
}
/* Loop over the procs we were given */
for (size_t i = 0; i < nprocs; i++) {
/* Loop over a block in the procs we were given */
for (size_t i = block_offset; i < (block_offset + block_len); i++) {
struct opal_proc_t* opal_proc = procs[i];
opal_btl_usnic_proc_t* usnic_proc;
mca_btl_base_endpoint_t* usnic_endpoint;
@ -195,9 +196,10 @@ static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module,
* invoked. Go reap them all.
*/
static int
add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
size_t array_len,
struct mca_btl_base_endpoint_t **endpoints)
add_procs_block_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
size_t block_offset,
size_t block_len,
struct mca_btl_base_endpoint_t **endpoints)
{
int ret = OPAL_SUCCESS;
int num_left;
@ -205,12 +207,11 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
uint32_t event;
struct fi_eq_entry entry;
struct fi_eq_err_entry err_entry;
bool error_occurred = false;
/* compute num fi_av_insert completions we are waiting for */
num_left = 0;
for (i = 0; i < array_len; ++i) {
for (i = block_offset; i < (block_offset + block_len); ++i) {
if (NULL != endpoints[i]) {
num_left += USNIC_NUM_CHANNELS;
}
@ -266,7 +267,7 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
We therefore only want to print a pretty
warning about (and OBJ_RELEASE) that endpoint
the *first* time it is reported. */
for (i = 0; i < array_len; ++i) {
for (i = block_offset; i < (block_offset + block_len); ++i) {
if (endpoints[i] == context->endpoint) {
add_procs_warn_unreachable(module,
context->endpoint);
@ -348,7 +349,7 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
- If an otherwise-valid endpoint has no dest, that means we timed
out trying to resolve it, so just release that endpoint. */
size_t num_endpoints_created = 0;
for (i = 0; i < array_len; i++) {
for (i = block_offset; i < (block_offset + block_len); i++) {
if (NULL != endpoints[i]) {
bool happy;
@ -382,6 +383,79 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
return ret;
}
/*
 * Create endpoints for the procs we were given in add_procs.
 *
 * The procs are processed in consecutive blocks so that the number of
 * fi_av_insert() completions pending at any one time never exceeds
 * the usable size of the libfabric AV EQ.  Each block is fully
 * inserted and then fully reaped before the next block is started.
 *
 * Returns OPAL_SUCCESS if every block was processed (trivially so
 * when nprocs is 0), OPAL_ERR_OUT_OF_RESOURCE if the AV EQ is too
 * small to hold even one proc's worth of inserts, or the first error
 * returned by one of the per-block helpers.
 */
static int add_procs_create_endpoints(struct opal_btl_usnic_module_t* module,
                                      size_t nprocs,
                                      struct opal_proc_t **procs,
                                      struct mca_btl_base_endpoint_t** endpoints)
{
    /* We need to ensure that we don't overrun the libfabric AV EQ.
       Divide up all the peer address resolutions we need to do into a
       series of blocks; insert and complete each block before moving
       to the next (note: if performance mandates it, we can move to a
       sliding window style of AV inserts to get better concurrency of
       AV resolution). */

    /* Leave a few empty slots in the AV EQ, just for good measure */
    const size_t reserved_slots = 8;

    /* Each proc generates USNIC_NUM_CHANNELS AV inserts, so the EQ
       must have room for at least one proc beyond the reserved
       slots.  Anything smaller would give a block length of zero
       procs and no forward progress. */
    const size_t min_eq_size = reserved_slots + USNIC_NUM_CHANNELS;

    if (module->av_eq_size < min_eq_size) {
        /* The help message formats both values with %d, so convert
           the size_t values explicitly. */
        opal_show_help("help-mpi-btl-usnic.txt", "fi_av_eq too small",
                       true,
                       opal_process_info.nodename,
                       (int) module->av_eq_size,
                       (int) min_eq_size);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Express the block size in procs rather than in AV inserts.
       The check above guarantees that this is at least 1. */
    const size_t procs_per_block =
        (module->av_eq_size - reserved_slots) / USNIC_NUM_CHANNELS;

    /* Walk the proc array one block at a time.  Driving the loop by
       the proc offset (instead of a precomputed block count) covers
       every proc exactly once and never issues an empty block. */
    for (size_t block_offset = 0; block_offset < nprocs;
         block_offset += procs_per_block) {
        int rc;

        /* The last block may be shorter than the others */
        size_t block_len = nprocs - block_offset;
        if (block_len > procs_per_block) {
            block_len = procs_per_block;
        }

        /* First, create endpoints (and procs, if they're not already
           created) for the usnic-reachable procs we were given. */
        rc = add_procs_block_create_endpoints(module,
                                              block_offset, block_len,
                                              procs, endpoints);
        if (OPAL_SUCCESS != rc) {
            return rc;
        }

        /* For each endpoint that was created, we initiated the
           process to create NUM_CHANNELS fi_addrs.  Go finish all of
           those.  This will be the final determination of whether we
           can use the endpoint or not because we'll find out if each
           endpoint is reachable or not. */
        rc = add_procs_block_reap_fi_av_inserts(module,
                                                block_offset, block_len,
                                                endpoints);
        if (OPAL_SUCCESS != rc) {
            return rc;
        }
    }

    return OPAL_SUCCESS;
}
/*
* Add procs to this BTL module, receiving endpoint information from
* the modex. This is done in 2 phases:
@ -408,23 +482,13 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
opal_btl_usnic_module_t* module = (opal_btl_usnic_module_t*) base_module;
int rc;
/* First, create endpoints (and procs, if they're not already
created) for all the usnic-reachable procs we were given. */
/* Go create the endpoints (including all relevant address
resolution) */
rc = add_procs_create_endpoints(module, nprocs, procs, endpoints);
if (OPAL_SUCCESS != rc) {
goto fail;
}
/* For each endpoint that was created, we initiated the process to
create NUM_CHANNELS fi_addrs. Go finish all of those. This
will be the final determination of whether we can use the
endpoint or not because we'll find out if each endpoint is
reachable or not. */
rc = add_procs_reap_fi_av_inserts(module, nprocs, endpoints);
if (OPAL_SUCCESS != rc) {
goto fail;
}
/* Find all the endpoints with a complete set of USD destinations
and mark them as reachable */
for (size_t i = 0; NULL != reachable && i < nprocs; ++i) {
@ -1831,6 +1895,11 @@ static void init_queue_lengths(opal_btl_usnic_module_t *module)
} else {
module->cq_num = mca_btl_usnic_component.cq_num;
}
if (-1 == mca_btl_usnic_component.av_eq_num) {
module->av_eq_num = 1024;
} else {
module->av_eq_num = mca_btl_usnic_component.av_eq_num;
}
/*
* Queue sizes for priority channel scale with # of endpoint. A
@ -2018,12 +2087,15 @@ static int init_channels(opal_btl_usnic_module_t *module)
}
memset(&eq_attr, 0, sizeof(eq_attr));
eq_attr.size = 1024;
eq_attr.size = module->av_eq_num;
eq_attr.wait_obj = FI_WAIT_UNSPEC;
rc = fi_eq_open(module->fabric, &eq_attr, &module->av_eq, NULL);
if (rc != OPAL_SUCCESS) {
goto destroy;
}
// Save the size of the created EQ
module->av_eq_size = eq_attr.size;
eq_attr.wait_obj = FI_WAIT_FD;
rc = fi_eq_open(module->fabric, &eq_attr, &module->dom_eq, NULL);
if (rc != OPAL_SUCCESS) {

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -110,6 +110,8 @@ typedef struct opal_btl_usnic_module_t {
struct fid_eq *av_eq;
struct fid_av *av;
size_t av_eq_size;
mca_btl_base_module_error_cb_fn_t pml_error_callback;
/* Information about the events */
@ -127,6 +129,7 @@ typedef struct opal_btl_usnic_module_t {
int sd_num;
int rd_num;
int cq_num;
int av_eq_num;
int prio_sd_num;
int prio_rd_num;

Просмотреть файл

@ -1,6 +1,6 @@
# -*- text -*-
#
# Copyright (c) 2012-2015 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2012-2016 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
@ -240,6 +240,20 @@ abort.
usNIC interface: %s
Current ARP timeout: %d (btl_usnic_arp_timeout MCA param)
#
[fi_av_eq too small]
The usnic BTL was told to create an address resolution queue that was
too small via the btl_usnic_av_eq_num MCA parameter.  This parameter
controls how many peer address resolutions can be outstanding at a
time.  Larger values allow more concurrent address resolutions, but
consume more memory.
Server: %s
av_eq_num param value: %d
av_eq_num minimum value: %d
Your job will likely either perform poorly, or will abort.
#
[unreachable peer IP]
WARNING: Open MPI failed to find a route to a peer IP address via a
specific usNIC interface. This usually indicates a problem in the IP