usnic: don't overrun the fi_av_insert() EQ
Add endpoints in a blocked manner so that we don't overrun the fi_av_insert() event queue. Also make the AV EQ length an MCA param, and report it in mca_btl_base_verbose >=5 output.
Этот коммит содержится в:
родитель
d624e0d60f
Коммит
db825abc00
@ -11,7 +11,7 @@
|
|||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -181,6 +181,9 @@ typedef struct opal_btl_usnic_component_t {
|
|||||||
/** max completion queue entries per module */
|
/** max completion queue entries per module */
|
||||||
int32_t cq_num;
|
int32_t cq_num;
|
||||||
|
|
||||||
|
/** max number of entries in AV EQ */
|
||||||
|
int32_t av_eq_num;
|
||||||
|
|
||||||
/** retrans characteristics */
|
/** retrans characteristics */
|
||||||
int retrans_timeout;
|
int retrans_timeout;
|
||||||
|
|
||||||
|
@ -956,11 +956,12 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
|||||||
/* Output all of this module's values. */
|
/* Output all of this module's values. */
|
||||||
const char *devname = module->fabric_info->fabric_attr->name;
|
const char *devname = module->fabric_info->fabric_attr->name;
|
||||||
opal_output_verbose(5, USNIC_OUT,
|
opal_output_verbose(5, USNIC_OUT,
|
||||||
"btl:usnic: %s num sqe=%d, num rqe=%d, num cqe=%d",
|
"btl:usnic: %s num sqe=%d, num rqe=%d, num cqe=%d, num aveqe=%d",
|
||||||
devname,
|
devname,
|
||||||
module->sd_num,
|
module->sd_num,
|
||||||
module->rd_num,
|
module->rd_num,
|
||||||
module->cq_num);
|
module->cq_num,
|
||||||
|
module->av_eq_num);
|
||||||
opal_output_verbose(5, USNIC_OUT,
|
opal_output_verbose(5, USNIC_OUT,
|
||||||
"btl:usnic: %s priority MTU = %" PRIsize_t,
|
"btl:usnic: %s priority MTU = %" PRIsize_t,
|
||||||
devname,
|
devname,
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2008-2016 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||||
@ -162,6 +162,7 @@ int opal_btl_usnic_component_register(void)
|
|||||||
static int prio_sd_num;
|
static int prio_sd_num;
|
||||||
static int prio_rd_num;
|
static int prio_rd_num;
|
||||||
static int cq_num;
|
static int cq_num;
|
||||||
|
static int av_eq_num;
|
||||||
static int udp_port_base;
|
static int udp_port_base;
|
||||||
static int max_tiny_msg_size;
|
static int max_tiny_msg_size;
|
||||||
static int eager_limit;
|
static int eager_limit;
|
||||||
@ -235,6 +236,10 @@ int opal_btl_usnic_component_register(void)
|
|||||||
-1, &cq_num, REGINT_NEG_ONE_OK, OPAL_INFO_LVL_5));
|
-1, &cq_num, REGINT_NEG_ONE_OK, OPAL_INFO_LVL_5));
|
||||||
mca_btl_usnic_component.cq_num = (int32_t) cq_num;
|
mca_btl_usnic_component.cq_num = (int32_t) cq_num;
|
||||||
|
|
||||||
|
CHECK(reg_int("av_eq_num", "Number of event queue entries for peer address resolution (-1 = pre-set defaults; depends on number and type of devices available; will error if ac_eq_num < 8)",
|
||||||
|
-1, &av_eq_num, REGINT_NEG_ONE_OK, OPAL_INFO_LVL_5));
|
||||||
|
mca_btl_usnic_component.av_eq_num = (int32_t) av_eq_num;
|
||||||
|
|
||||||
CHECK(reg_int("base_udp_port", "Base UDP port to use for usNIC communications. If 0, system will pick the port number. If non-zero, it will be added to each process' local rank to obtain the final port number (default: 0)",
|
CHECK(reg_int("base_udp_port", "Base UDP port to use for usNIC communications. If 0, system will pick the port number. If non-zero, it will be added to each process' local rank to obtain the final port number (default: 0)",
|
||||||
0, &udp_port_base, REGINT_GE_ZERO, OPAL_INFO_LVL_5));
|
0, &udp_port_base, REGINT_GE_ZERO, OPAL_INFO_LVL_5));
|
||||||
mca_btl_usnic_component.udp_port_base = (int) udp_port_base;
|
mca_btl_usnic_component.udp_port_base = (int) udp_port_base;
|
||||||
|
@ -69,13 +69,14 @@ static void finalize_one_channel(opal_btl_usnic_module_t *module,
|
|||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Loop over all procs sent to us in add_procs and see if we want to
|
* Loop over a block of procs sent to us in add_procs and see if we
|
||||||
* add a proc/endpoint for them.
|
* want to add a proc/endpoint for them.
|
||||||
*/
|
*/
|
||||||
static int add_procs_create_endpoints(opal_btl_usnic_module_t *module,
|
static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
|
||||||
size_t nprocs,
|
size_t block_offset,
|
||||||
opal_proc_t **procs,
|
size_t block_len,
|
||||||
mca_btl_base_endpoint_t **endpoints)
|
opal_proc_t **procs,
|
||||||
|
mca_btl_base_endpoint_t **endpoints)
|
||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
opal_proc_t* my_proc;
|
opal_proc_t* my_proc;
|
||||||
@ -87,8 +88,8 @@ static int add_procs_create_endpoints(opal_btl_usnic_module_t *module,
|
|||||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Loop over the procs we were given */
|
/* Loop over a block in the procs we were given */
|
||||||
for (size_t i = 0; i < nprocs; i++) {
|
for (size_t i = block_offset; i < (block_offset + block_len); i++) {
|
||||||
struct opal_proc_t* opal_proc = procs[i];
|
struct opal_proc_t* opal_proc = procs[i];
|
||||||
opal_btl_usnic_proc_t* usnic_proc;
|
opal_btl_usnic_proc_t* usnic_proc;
|
||||||
mca_btl_base_endpoint_t* usnic_endpoint;
|
mca_btl_base_endpoint_t* usnic_endpoint;
|
||||||
@ -195,9 +196,10 @@ static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module,
|
|||||||
* invoked. Go reap them all.
|
* invoked. Go reap them all.
|
||||||
*/
|
*/
|
||||||
static int
|
static int
|
||||||
add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
add_procs_block_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
||||||
size_t array_len,
|
size_t block_offset,
|
||||||
struct mca_btl_base_endpoint_t **endpoints)
|
size_t block_len,
|
||||||
|
struct mca_btl_base_endpoint_t **endpoints)
|
||||||
{
|
{
|
||||||
int ret = OPAL_SUCCESS;
|
int ret = OPAL_SUCCESS;
|
||||||
int num_left;
|
int num_left;
|
||||||
@ -205,12 +207,11 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
|||||||
uint32_t event;
|
uint32_t event;
|
||||||
struct fi_eq_entry entry;
|
struct fi_eq_entry entry;
|
||||||
struct fi_eq_err_entry err_entry;
|
struct fi_eq_err_entry err_entry;
|
||||||
|
|
||||||
bool error_occurred = false;
|
bool error_occurred = false;
|
||||||
|
|
||||||
/* compute num fi_av_insert completions we are waiting for */
|
/* compute num fi_av_insert completions we are waiting for */
|
||||||
num_left = 0;
|
num_left = 0;
|
||||||
for (i = 0; i < array_len; ++i) {
|
for (i = block_offset; i < (block_offset + block_len); ++i) {
|
||||||
if (NULL != endpoints[i]) {
|
if (NULL != endpoints[i]) {
|
||||||
num_left += USNIC_NUM_CHANNELS;
|
num_left += USNIC_NUM_CHANNELS;
|
||||||
}
|
}
|
||||||
@ -266,7 +267,7 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
|||||||
We therefore only want to print a pretty
|
We therefore only want to print a pretty
|
||||||
warning about (and OBJ_RELEASE) that endpoint
|
warning about (and OBJ_RELEASE) that endpoint
|
||||||
the *first* time it is reported. */
|
the *first* time it is reported. */
|
||||||
for (i = 0; i < array_len; ++i) {
|
for (i = block_offset; i < (block_offset + block_len); ++i) {
|
||||||
if (endpoints[i] == context->endpoint) {
|
if (endpoints[i] == context->endpoint) {
|
||||||
add_procs_warn_unreachable(module,
|
add_procs_warn_unreachable(module,
|
||||||
context->endpoint);
|
context->endpoint);
|
||||||
@ -348,7 +349,7 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
|||||||
- If an otherwise-valid endpoint has no dest, that means we timed
|
- If an otherwise-valid endpoint has no dest, that means we timed
|
||||||
out trying to resolve it, so just release that endpoint. */
|
out trying to resolve it, so just release that endpoint. */
|
||||||
size_t num_endpoints_created = 0;
|
size_t num_endpoints_created = 0;
|
||||||
for (i = 0; i < array_len; i++) {
|
for (i = block_offset; i < (block_offset + block_len); i++) {
|
||||||
if (NULL != endpoints[i]) {
|
if (NULL != endpoints[i]) {
|
||||||
bool happy;
|
bool happy;
|
||||||
|
|
||||||
@ -382,6 +383,79 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Create endpoints for the procs we were given in add_procs.
|
||||||
|
*/
|
||||||
|
static int add_procs_create_endpoints(struct opal_btl_usnic_module_t* module,
|
||||||
|
size_t nprocs,
|
||||||
|
struct opal_proc_t **procs,
|
||||||
|
struct mca_btl_base_endpoint_t** endpoints)
|
||||||
|
{
|
||||||
|
/* We need to ensure that we don't overrun the libfabric AV EQ.
|
||||||
|
Divide up all the peer address resolutions we need to do into a
|
||||||
|
series of blocks; insert and complete each block before moving
|
||||||
|
to the next (note: if performance mandates it, we can move to a
|
||||||
|
sliding window style of AV inserts to get better concurrency of
|
||||||
|
AV resolution). */
|
||||||
|
|
||||||
|
/* Leave a few empty slots in the AV EQ, just for good measure */
|
||||||
|
if (module->av_eq_size < 8) {
|
||||||
|
opal_show_help("help-mpi-btl-usnic.txt", "fi_av_eq too small",
|
||||||
|
true,
|
||||||
|
opal_process_info.nodename,
|
||||||
|
module->av_eq_size,
|
||||||
|
8);
|
||||||
|
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t eq_size = module->av_eq_size - 8;
|
||||||
|
size_t block_len = eq_size;
|
||||||
|
size_t num_av_inserts = nprocs * USNIC_NUM_CHANNELS;
|
||||||
|
size_t num_blocks = num_av_inserts / eq_size;
|
||||||
|
if (eq_size % num_av_inserts != 0) {
|
||||||
|
++num_blocks;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Per above, the blocks are expressed in terms of number of AV
|
||||||
|
inserts. Convert them to be expressed in terms of number of
|
||||||
|
procs. */
|
||||||
|
block_len /= USNIC_NUM_CHANNELS;
|
||||||
|
|
||||||
|
/* Per above, loop over creating the endpoints so that we do not
|
||||||
|
overrun the libfabric AV EQ. */
|
||||||
|
int rc;
|
||||||
|
for (size_t block_offset = 0, block = 0; block < num_blocks;
|
||||||
|
block_offset += block_len, ++block) {
|
||||||
|
/* Adjust for the last block */
|
||||||
|
if (block_len > (nprocs - block_offset)) {
|
||||||
|
block_len = nprocs - block_offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* First, create endpoints (and procs, if they're not already
|
||||||
|
created) for the usnic-reachable procs we were given. */
|
||||||
|
rc = add_procs_block_create_endpoints(module,
|
||||||
|
block_offset, block_len,
|
||||||
|
procs, endpoints);
|
||||||
|
if (OPAL_SUCCESS != rc) {
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* For each endpoint that was created, we initiated the
|
||||||
|
process to create NUM_CHANNELS fi_addrs. Go finish all of
|
||||||
|
those. This will be the final determination of whether we
|
||||||
|
can use the endpoint or not because we'll find out if each
|
||||||
|
endpoint is reachable or not. */
|
||||||
|
rc = add_procs_block_reap_fi_av_inserts(module,
|
||||||
|
block_offset, block_len,
|
||||||
|
endpoints);
|
||||||
|
if (OPAL_SUCCESS != rc) {
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return OPAL_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Add procs to this BTL module, receiving endpoint information from
|
* Add procs to this BTL module, receiving endpoint information from
|
||||||
* the modex. This is done in 2 phases:
|
* the modex. This is done in 2 phases:
|
||||||
@ -408,23 +482,13 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
|
|||||||
opal_btl_usnic_module_t* module = (opal_btl_usnic_module_t*) base_module;
|
opal_btl_usnic_module_t* module = (opal_btl_usnic_module_t*) base_module;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
/* First, create endpoints (and procs, if they're not already
|
/* Go create the endpoints (including all relevant address
|
||||||
created) for all the usnic-reachable procs we were given. */
|
resolution) */
|
||||||
rc = add_procs_create_endpoints(module, nprocs, procs, endpoints);
|
rc = add_procs_create_endpoints(module, nprocs, procs, endpoints);
|
||||||
if (OPAL_SUCCESS != rc) {
|
if (OPAL_SUCCESS != rc) {
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* For each endpoint that was created, we initiated the process to
|
|
||||||
create NUM_CHANNELS fi_addrs. Go finish all of those. This
|
|
||||||
will be the final determination of whether we can use the
|
|
||||||
endpoint or not because we'll find out if each endpoint is
|
|
||||||
reachable or not. */
|
|
||||||
rc = add_procs_reap_fi_av_inserts(module, nprocs, endpoints);
|
|
||||||
if (OPAL_SUCCESS != rc) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Find all the endpoints with a complete set of USD destinations
|
/* Find all the endpoints with a complete set of USD destinations
|
||||||
and mark them as reachable */
|
and mark them as reachable */
|
||||||
for (size_t i = 0; NULL != reachable && i < nprocs; ++i) {
|
for (size_t i = 0; NULL != reachable && i < nprocs; ++i) {
|
||||||
@ -1831,6 +1895,11 @@ static void init_queue_lengths(opal_btl_usnic_module_t *module)
|
|||||||
} else {
|
} else {
|
||||||
module->cq_num = mca_btl_usnic_component.cq_num;
|
module->cq_num = mca_btl_usnic_component.cq_num;
|
||||||
}
|
}
|
||||||
|
if (-1 == mca_btl_usnic_component.av_eq_num) {
|
||||||
|
module->av_eq_num = 1024;
|
||||||
|
} else {
|
||||||
|
module->av_eq_num = mca_btl_usnic_component.av_eq_num;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Queue sizes for priority channel scale with # of endpoint. A
|
* Queue sizes for priority channel scale with # of endpoint. A
|
||||||
@ -2018,12 +2087,15 @@ static int init_channels(opal_btl_usnic_module_t *module)
|
|||||||
}
|
}
|
||||||
|
|
||||||
memset(&eq_attr, 0, sizeof(eq_attr));
|
memset(&eq_attr, 0, sizeof(eq_attr));
|
||||||
eq_attr.size = 1024;
|
eq_attr.size = module->av_eq_num;
|
||||||
eq_attr.wait_obj = FI_WAIT_UNSPEC;
|
eq_attr.wait_obj = FI_WAIT_UNSPEC;
|
||||||
rc = fi_eq_open(module->fabric, &eq_attr, &module->av_eq, NULL);
|
rc = fi_eq_open(module->fabric, &eq_attr, &module->av_eq, NULL);
|
||||||
if (rc != OPAL_SUCCESS) {
|
if (rc != OPAL_SUCCESS) {
|
||||||
goto destroy;
|
goto destroy;
|
||||||
}
|
}
|
||||||
|
// Save the size of the created EQ
|
||||||
|
module->av_eq_size = eq_attr.size;
|
||||||
|
|
||||||
eq_attr.wait_obj = FI_WAIT_FD;
|
eq_attr.wait_obj = FI_WAIT_FD;
|
||||||
rc = fi_eq_open(module->fabric, &eq_attr, &module->dom_eq, NULL);
|
rc = fi_eq_open(module->fabric, &eq_attr, &module->dom_eq, NULL);
|
||||||
if (rc != OPAL_SUCCESS) {
|
if (rc != OPAL_SUCCESS) {
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -110,6 +110,8 @@ typedef struct opal_btl_usnic_module_t {
|
|||||||
struct fid_eq *av_eq;
|
struct fid_eq *av_eq;
|
||||||
struct fid_av *av;
|
struct fid_av *av;
|
||||||
|
|
||||||
|
size_t av_eq_size;
|
||||||
|
|
||||||
mca_btl_base_module_error_cb_fn_t pml_error_callback;
|
mca_btl_base_module_error_cb_fn_t pml_error_callback;
|
||||||
|
|
||||||
/* Information about the events */
|
/* Information about the events */
|
||||||
@ -127,6 +129,7 @@ typedef struct opal_btl_usnic_module_t {
|
|||||||
int sd_num;
|
int sd_num;
|
||||||
int rd_num;
|
int rd_num;
|
||||||
int cq_num;
|
int cq_num;
|
||||||
|
int av_eq_num;
|
||||||
int prio_sd_num;
|
int prio_sd_num;
|
||||||
int prio_rd_num;
|
int prio_rd_num;
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
# -*- text -*-
|
# -*- text -*-
|
||||||
#
|
#
|
||||||
# Copyright (c) 2012-2015 Cisco Systems, Inc. All rights reserved.
|
# Copyright (c) 2012-2016 Cisco Systems, Inc. All rights reserved.
|
||||||
#
|
#
|
||||||
# $COPYRIGHT$
|
# $COPYRIGHT$
|
||||||
#
|
#
|
||||||
@ -240,6 +240,20 @@ abort.
|
|||||||
usNIC interface: %s
|
usNIC interface: %s
|
||||||
Current ARP timeout: %d (btl_usnic_arp_timeout MCA param)
|
Current ARP timeout: %d (btl_usnic_arp_timeout MCA param)
|
||||||
#
|
#
|
||||||
|
[fi_av_eq too small]
|
||||||
|
|
||||||
|
The usnic BTL was told to create an address resolution queue that was
|
||||||
|
too small via the btl_usnic_av_eq_num MCA parameter. This
|
||||||
|
parameter controls how many peer address resolutions can
|
||||||
|
be outstanding at a time. Larger values allow more concurrent address
|
||||||
|
resolutions, but consume more memory.
|
||||||
|
|
||||||
|
Server: %s
|
||||||
|
av_eq_num param value: %d
|
||||||
|
av_eq_num minimum value: %d
|
||||||
|
|
||||||
|
Your job will likely either perform poorly, or will abort.
|
||||||
|
#
|
||||||
[unreachable peer IP]
|
[unreachable peer IP]
|
||||||
WARNING: Open MPI failed to find a route to a peer IP address via a
|
WARNING: Open MPI failed to find a route to a peer IP address via a
|
||||||
specific usNIC interface. This usually indicates a problem in the IP
|
specific usNIC interface. This usually indicates a problem in the IP
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user