1
1

usnic: fix fi_av_insert (ARP resolution) bugs

We had several problems in the old code:

1. We were specifying an arbitrary timeout (100 ms) and then abandoning
   all remaining pending AV insert operations.  We would then free the
   endpoint buffer that we gave to fi_av_insert(), usually causing
   libfabric's progress thread to write to a freed buffer.

2. We were claiming in a show_help message that the timeout was
   controllable via an MCA parameter.  This commit removes that
   parameter, since there's no good method for us to specify a timeout
   like this to libfabric right now.

3. We also weren't waiting for the correct number of fi_av_insert()
   operations to complete.  We were waiting for nprocs, which is
   accidentally fine for 2 procs on separate hosts, but not for most
   other proc counts.

Reviewed-by: Jeff Squyres <jsquyres@cisco.com>
Этот коммит содержится в:
Dave Goodell 2015-01-07 08:25:17 -08:00
родитель 06e071454e
Коммит 49069bc661
3 изменённых файлов: 19 добавлений и 32 удалений

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* Copyright (c) 2011-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -208,9 +208,6 @@ typedef struct opal_btl_usnic_component_t {
int connectivity_ack_timeout;
int connectivity_num_retries;
/* remote address handle (i.e., ARP) timeout */
int arp_timeout;
/** how many short packets have to be received before outputting
the "received short packets" warning? */
uint32_t max_short_packets;

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
@ -263,10 +263,6 @@ int opal_btl_usnic_component_register(void)
USNIC_DFLT_PACK_LAZY_THRESHOLD, &pack_lazy_threshold, REGINT_NEG_ONE_OK, OPAL_INFO_LVL_5));
mca_btl_usnic_component.pack_lazy_threshold = pack_lazy_threshold;
CHECK(reg_int("arp_timeout", "Timeout, in seconds, for the maximum delay between ARP replies (must be >=1)",
10, &mca_btl_usnic_component.arp_timeout,
REGINT_GE_ONE, OPAL_INFO_LVL_6));
CHECK(reg_int("max_short_packets", "Number of abnormally-short packets received before outputting a warning (0 = never show the warning)",
25, &max_short_packets,
REGINT_GE_ZERO, OPAL_INFO_LVL_5));

Просмотреть файл

@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved
@ -224,7 +224,7 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
struct mca_btl_base_endpoint_t **endpoints)
{
int ret = OPAL_SUCCESS;
int num_left = array_len;
int num_left;
size_t i, channel;
uint32_t event;
struct fi_eq_entry entry;
@ -232,12 +232,20 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
bool error_occurred = false;
/* compute num fi_av_insert completions we are waiting for */
num_left = 0;
for (i = 0; i < array_len; ++i) {
if (NULL != endpoints[i]) {
num_left += USNIC_NUM_CHANNELS;
}
}
/* Loop polling for USD destination creation completion (they were
individually started in btl_usnic_proc.c) */
while (num_left > 0) {
opal_btl_usnic_addr_context_t *context;
ret = fi_eq_sread(module->av_eq, &event, &entry, sizeof(entry), 100, 0);
ret = fi_eq_sread(module->av_eq, &event, &entry, sizeof(entry), -1, 0);
if (sizeof(entry) == ret) {
context = entry.context;
num_left -= entry.data;
@ -245,22 +253,6 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
ret = 0;
}
/* Did we timeout? )? If so, we're probably never going to
finish, so just mark all remaining endpoints as unreachable
and bail. */
else if (-FI_ETIMEDOUT == ret) {
opal_show_help("help-mpi-btl-usnic.txt",
"fi_av_insert timeout",
true,
opal_process_info.nodename,
module->fabric_info->fabric_attr->name,
mca_btl_usnic_component.arp_timeout);
/* Note: this is not an error; these endpoints are just
unreachable. The desintations we've already created
are good, so we'll keep those. */
break;
}
else if (-FI_EAVAIL == ret) {
ret = fi_eq_readerr(module->av_eq, &err_entry, 0);
if (sizeof(err_entry) == ret) {
@ -301,7 +293,8 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
"Failed to insert address to AV");
ret = OPAL_ERR_OUT_OF_RESOURCE;
error_occurred = true;
break;
/* we can't break here, need to finish reaping all inserts */
continue;
}
}
else {
@ -315,7 +308,8 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
"Failed to insert address to AV");
ret = OPAL_ERR_OUT_OF_RESOURCE;
error_occurred = true;
break;
/* we can't break here, need to finish reaping all inserts */
continue;
}
}
@ -330,9 +324,9 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
ret,
"Failed to insert address to AV");
ret = OPAL_ERR_OUT_OF_RESOURCE;
/* JMS handle the error */
error_occurred = true;
break;
/* we can't break here, need to finish reaping all inserts */
continue;
}
}