diff --git a/ompi/mca/btl/usnic/btl_usnic_module.c b/ompi/mca/btl/usnic/btl_usnic_module.c index 62f1ea94d4..a6d5ed50ba 100644 --- a/ompi/mca/btl/usnic/btl_usnic_module.c +++ b/ompi/mca/btl/usnic/btl_usnic_module.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "opal/class/opal_bitmap.h" #include "opal/prefetch.h" @@ -76,6 +77,80 @@ static inline void compute_sf_size(ompi_btl_usnic_send_frag_t *sfrag) sfrag->sf_size += frag->uf_src_seg[1].seg_len; } +/* The call to ibv_create_ah() may initiate an ARP resolution, and may + * therefore take some time to complete. Hence, it will return 1 of 3 + * things: + * + * 1. a valid new ah + * 2. NULL and errno == EAGAIN (ARP not complete; try again later) + * 3. NULL and errno != EAGAIN (fatal error) + * + * Since ibv_create_ah() is therefore effectively non-blocking, we + * gang all the endpoint ah creations here in this loop so that we can + * get some parallelization of ARP resolution. + */ +static int create_ahs(size_t num_procs, size_t num_endpoints, + struct mca_btl_base_endpoint_t** endpoints, + ompi_btl_usnic_module_t *module) +{ + size_t i; + struct ibv_ah_attr ah_attr; + size_t num_ah_created, last_num_ah_created; + time_t ts_last_created; + + /* memset the ah_attr to both silence valgrind warnings (since the + attr struct ends up getting written down an fd to the kernel) + and actually zero out all the fields that we don't care about / + want to be logically false. */ + memset(&ah_attr, 0, sizeof(ah_attr)); + ah_attr.is_global = 1; + ah_attr.port_num = 1; + + ts_last_created = time(NULL); + last_num_ah_created = num_ah_created = 0; + while (num_ah_created < num_endpoints) { + for (i = 0; i < num_procs; i++) { + if (NULL != endpoints[i] && + NULL == endpoints[i]->endpoint_remote_ah) { + ah_attr.grh.dgid = endpoints[i]->endpoint_remote_addr.gid; + endpoints[i]->endpoint_remote_ah = + ibv_create_ah(module->pd, &ah_attr); + if (NULL != endpoints[i]->endpoint_remote_ah) { + ts_last_created = time(NULL); + ++num_ah_created; + } else if (EAGAIN != errno) { + opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed", + true, + ompi_process_info.nodename, + ibv_get_device_name(module->device), + module->port_num, + "ibv_create_ah()", __FILE__, __LINE__, + "Failed to create an address handle"); + /* JMS WTF to do here? */ + //OBJ_RELEASE(endpoints[i]); + //return OMPI_ERR_OUT_OF_RESOURCE; + } + } + } + + /* Has it been too long since our last AH creation (ARP + resolution)? If so, we're probably never going to finish, + so just bail. */ + /* JMS PROBBALY WANT TO MAKE THIS AN MCA PARAM -- YAY I HAZ + ALL THE MCA PARAMS!! */ + if (num_ah_created < num_endpoints && + ts_last_created + 20 > time(NULL)) { + opal_output(0, "no ARP. life sux for you."); + // JMS make this a real show_help + abort(); // For the moment, die quietly, in the snow + } + } + + opal_output_verbose(5, USNIC_OUT, + "btl:usnic: made %" PRIsize_t " address handles", num_endpoints); + return OMPI_SUCCESS; +} + /* * Add procs to this BTL module, receiving endpoint information from * the modex. @@ -153,6 +228,9 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module, opal_output_verbose(5, USNIC_OUT, "btl:usnic: made %" PRIsize_t " endpoints", count); + /* Create address handles for all the newly-created endpoints */ + create_ahs(nprocs, count, endpoints, module); + return OMPI_SUCCESS; } diff --git a/ompi/mca/btl/usnic/btl_usnic_proc.c b/ompi/mca/btl/usnic/btl_usnic_proc.c index ba72b34f2d..9b1542486c 100644 --- a/ompi/mca/btl/usnic/btl_usnic_proc.c +++ b/ompi/mca/btl/usnic/btl_usnic_proc.c @@ -623,7 +623,6 @@ ompi_btl_usnic_create_endpoint(ompi_btl_usnic_module_t *module, { int err; int modex_index; - struct ibv_ah_attr ah_attr; ompi_btl_usnic_endpoint_t *endpoint; /* look for matching modex info */ @@ -654,34 +653,10 @@ ompi_btl_usnic_create_endpoint(ompi_btl_usnic_module_t *module, endpoint->endpoint_next_contig_seq_to_recv - 1; endpoint->endpoint_rfstart = WINDOW_SIZE_MOD(endpoint->endpoint_next_contig_seq_to_recv); - /* Create the address handle on this endpoint from the modex info. - memset to both silence valgrind warnings (since the attr struct - ends up getting written down an fd to the kernel) and actually - zero out all the fields that we don't care about / want to be - logically false. */ - memset(&ah_attr, 0, sizeof(ah_attr)); - ah_attr.is_global = 1; - ah_attr.port_num = 1; - ah_attr.grh.dgid = endpoint->endpoint_remote_addr.gid; - - while (1) { - endpoint->endpoint_remote_ah = ibv_create_ah(module->pd, &ah_attr); - if (NULL != endpoint->endpoint_remote_ah) { - break; - } - if (EAGAIN == errno) { - continue; - } - opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed", - true, - ompi_process_info.nodename, - ibv_get_device_name(module->device), - module->port_num, - "ibv_create_ah()", __FILE__, __LINE__, - "Failed to create an address handle"); - OBJ_RELEASE(endpoint); - return OMPI_ERR_OUT_OF_RESOURCE; - } + /* Defer creating the ibv_ah. Since calling ibv_create_ah() may + trigger ARP resolution, it's better to batch all the endpoints' + calls to ibv_create_ah() together to get some parallelism. */ + endpoint->endpoint_remote_ah = NULL; /* Now claim that modex slot */ proc->proc_modex_claimed[modex_index] = true;