1
1

This is what r30852 should have been: Consolidate into a single, outer loop of ibv_create_ah() calls

Follow on to SVN trunk r30850: consolidate the ibv_create_ah() calls
into a single loop, MPI_WAITALL-style.  That is, call the (effectively
non-blocking) ibv_create_ah() for each endpoint.  If we get
NULL+EAGAIN, it means that the UDP ARP is still ongoing down in the
kernel, so just try again later.  We put these all into a single loop
because it allows us to parallelize the ARP progress in the kernel.

cmr=v1.7.5:ticket=trac:4253

This commit was SVN r30879.

The following SVN revision numbers were found above:
  r30850 --> open-mpi/ompi@3641500442
  r30852 --> open-mpi/ompi@4e282a3295

The following Trac tickets were found above:
  Ticket 4253 --> https://svn.open-mpi.org/trac/ompi/ticket/4253
Этот коммит содержится в:
Jeff Squyres 2014-02-27 17:19:50 +00:00
родитель 45810f0efb
Коммит 3cbdf33b88
5 изменённых файлов: 119 добавлений и 29 удалений

Просмотреть файл

@ -198,6 +198,9 @@ typedef struct ompi_btl_usnic_component_t {
bool connectivity_enabled;
int connectivity_ack_timeout;
int connectivity_num_retries;
/* ibv_create_ah() (i.e., ARP) timeout */
int arp_timeout;
} ompi_btl_usnic_component_t;
OMPI_MODULE_DECLSPEC extern ompi_btl_usnic_component_t mca_btl_usnic_component;

Просмотреть файл

@ -267,6 +267,10 @@ int ompi_btl_usnic_component_register(void)
USNIC_DFLT_PACK_LAZY_THRESHOLD, &pack_lazy_threshold, REGINT_NEG_ONE_OK, OPAL_INFO_LVL_5));
mca_btl_usnic_component.pack_lazy_threshold = pack_lazy_threshold;
CHECK(reg_int("arp_timeout", "Timeout, in seconds, for the maximum delay between ARP replies when using the usNIC/UDP transport (ignored when using the usNIC/L2 transport, must be >=1)",
10, &mca_btl_usnic_component.arp_timeout,
REGINT_GE_ONE, OPAL_INFO_LVL_6));
/* Default to bandwidth auto-detection */
ompi_btl_usnic_module_template.super.btl_bandwidth = 0;
ompi_btl_usnic_module_template.super.btl_latency = 4;

Просмотреть файл

@ -25,6 +25,7 @@
#include <string.h>
#include <unistd.h>
#include <stdlib.h>
#include <time.h>
#include "opal/class/opal_bitmap.h"
#include "opal/prefetch.h"
@ -77,6 +78,81 @@ static inline void compute_sf_size(ompi_btl_usnic_send_frag_t *sfrag)
sfrag->sf_size += frag->uf_src_seg[1].seg_len;
}
/*
 * Create the address handle (AH) for every endpoint in the array
 * that does not have one yet.
 *
 * The call to ibv_create_ah() may initiate an ARP resolution, and may
 * therefore take some time to complete.  Hence, it will return one of
 * three things:
 *
 * 1. a valid new ah
 * 2. NULL and errno == EAGAIN (ARP not complete; try again later)
 * 3. NULL and errno != EAGAIN (fatal error)
 *
 * Since ibv_create_ah() is therefore effectively non-blocking, we
 * gang all the endpoint ah creations here in this loop so that we can
 * get some parallelization of ARP resolution.
 *
 * array_len:     number of slots in endpoints[]; slots may be NULL
 * num_endpoints: number of address handles that must be created
 *                before this function reports success
 * endpoints:     array that is scanned; each non-NULL entry whose
 *                endpoint_remote_ah is NULL gets an AH created for it
 * module:        supplies the protection domain passed to
 *                ibv_create_ah() and the device/port/interface names
 *                used in error messages
 *
 * Returns OMPI_SUCCESS once num_endpoints handles have been created,
 * OMPI_ERR_OUT_OF_RESOURCE if ibv_create_ah() fails with an errno
 * other than EAGAIN, or OMPI_ERR_UNREACH if more than
 * mca_btl_usnic_component.arp_timeout seconds elapse without any new
 * handle being created.  On failure, handles that were already
 * created are left attached to their endpoints.
 */
static int create_ahs(size_t array_len, size_t num_endpoints,
                      struct mca_btl_base_endpoint_t** endpoints,
                      ompi_btl_usnic_module_t *module)
{
    size_t i;
    struct ibv_ah_attr ah_attr;
    size_t num_ah_created;
    time_t ts_last_created;

    /* memset the ah_attr to both silence valgrind warnings (since the
       attr struct ends up getting written down an fd to the kernel)
       and actually zero out all the fields that we don't care about /
       want to be logically false. */
    memset(&ah_attr, 0, sizeof(ah_attr));
    ah_attr.is_global = 1;
    ah_attr.port_num = 1;

    /* The timeout below is measured from the most recent successful
       AH creation (or from now, if none has succeeded yet). */
    ts_last_created = time(NULL);
    num_ah_created = 0;

    while (num_ah_created < num_endpoints) {
        /* One pass over the array: (re)try every endpoint that still
           lacks an address handle. */
        for (i = 0; i < array_len; i++) {
            if (NULL != endpoints[i] &&
                NULL == endpoints[i]->endpoint_remote_ah) {
                ah_attr.grh.dgid = endpoints[i]->endpoint_remote_addr.gid;
                endpoints[i]->endpoint_remote_ah =
                    ibv_create_ah(module->pd, &ah_attr);
                if (NULL != endpoints[i]->endpoint_remote_ah) {
                    /* Progress was made; restart the timeout window */
                    ts_last_created = time(NULL);
                    ++num_ah_created;
                } else if (EAGAIN != errno) {
                    /* Anything other than EAGAIN is a hard failure */
                    opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
                                   true,
                                   ompi_process_info.nodename,
                                   ibv_get_device_name(module->device),
                                   module->port_num,
                                   "ibv_create_ah()", __FILE__, __LINE__,
                                   "Failed to create an address handle");
                    return OMPI_ERR_OUT_OF_RESOURCE;
                }
                /* else: EAGAIN; leave the AH NULL and retry on the
                   next pass */
            }
        }

        /* Has it been too long since our last AH creation (ARP
           resolution)?  If so, we're probably never going to finish,
           so just bail. */
        if (num_ah_created < num_endpoints &&
            time(NULL) > (ts_last_created + mca_btl_usnic_component.arp_timeout)) {
            opal_show_help("help-mpi-btl-usnic.txt", "ibv_create_ah timeout",
                           true,
                           ompi_process_info.nodename,
                           ibv_get_device_name(module->device),
                           module->port_num,
                           module->if_name,
                           mca_btl_usnic_component.arp_timeout);
            return OMPI_ERR_UNREACH;
        }
    }

    opal_output_verbose(5, USNIC_OUT,
                        "btl:usnic: made %" PRIsize_t " address handles", num_endpoints);
    return OMPI_SUCCESS;
}
/*
* Add procs to this BTL module, receiving endpoint information from
* the modex.
@ -154,6 +230,22 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
opal_output_verbose(5, USNIC_OUT,
"btl:usnic: made %" PRIsize_t " endpoints", count);
/* Create address handles for all the newly-created endpoints */
rc = create_ahs(nprocs, count, endpoints, module);
if (OMPI_SUCCESS != rc) {
for (i = 0; i < nprocs; i++) {
if (NULL != endpoints[i]) {
OBJ_RELEASE(endpoints[i]);
endpoints[i] = NULL;
}
opal_bitmap_clear_bit(reachable, i);
}
/* Allow falling through to return OMPI_SUCCESS, even though
we couldn't reach everyone (and therefore we gave up). Let
the PML know that this module basically isn't reachable,
but it's free to continue with other BTL modules. */
}
return OMPI_SUCCESS;
}

Просмотреть файл

@ -623,7 +623,6 @@ ompi_btl_usnic_create_endpoint(ompi_btl_usnic_module_t *module,
{
int err;
int modex_index;
struct ibv_ah_attr ah_attr;
ompi_btl_usnic_endpoint_t *endpoint;
/* look for matching modex info */
@ -654,34 +653,10 @@ ompi_btl_usnic_create_endpoint(ompi_btl_usnic_module_t *module,
endpoint->endpoint_next_contig_seq_to_recv - 1;
endpoint->endpoint_rfstart = WINDOW_SIZE_MOD(endpoint->endpoint_next_contig_seq_to_recv);
/* Create the address handle on this endpoint from the modex info.
memset to both silence valgrind warnings (since the attr struct
ends up getting written down an fd to the kernel) and actually
zero out all the fields that we don't care about / want to be
logically false. */
memset(&ah_attr, 0, sizeof(ah_attr));
ah_attr.is_global = 1;
ah_attr.port_num = 1;
ah_attr.grh.dgid = endpoint->endpoint_remote_addr.gid;
while (1) {
endpoint->endpoint_remote_ah = ibv_create_ah(module->pd, &ah_attr);
if (NULL != endpoint->endpoint_remote_ah) {
break;
}
if (EAGAIN == errno) {
continue;
}
opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
true,
ompi_process_info.nodename,
ibv_get_device_name(module->device),
module->port_num,
"ibv_create_ah()", __FILE__, __LINE__,
"Failed to create an address handle");
OBJ_RELEASE(endpoint);
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* Defer creating the ibv_ah. Since calling ibv_create_ah() may
trigger ARP resolution, it's better to batch all the endpoints'
calls to ibv_create_ah() together to get some parallelism. */
endpoint->endpoint_remote_ah = NULL;
/* Now claim that modex slot */
proc->proc_modex_claimed[modex_index] = true;

Просмотреть файл

@ -270,3 +270,19 @@ Note that this behavior usually indicates some kind of network
misconfiguration. You should verify that UDP traffic with payloads up
to the "large message size" listed above can flow between these two
servers.
#
[ibv_create_ah timeout]
The usnic BTL failed to create addresses for remote peers within the
specified timeout. When using the usNIC/UDP transport, this usually
means that ARP requests failed to resolve in time. You may be able to
solve the problem by increasing the usnic BTL's ARP timeout. If that
doesn't work, you should diagnose why ARP replies are apparently not
being delivered in a timely manner.
The usNIC interface listed below will be ignored. Your MPI
application will likely run with degraded performance and/or
abort.
Server: %s
Device: %s:%d (%s)
Current ARP timeout: %d (btl_usnic_arp_timeout MCA param)