diff --git a/opal/mca/btl/usnic/btl_usnic.h b/opal/mca/btl/usnic/btl_usnic.h index 6f4531184a..6444ca6b0f 100644 --- a/opal/mca/btl/usnic/btl_usnic.h +++ b/opal/mca/btl/usnic/btl_usnic.h @@ -220,6 +220,12 @@ typedef struct opal_btl_usnic_component_t { API >=v1.1, the usnic provider returned 1 upon success. */ ssize_t cq_readerr_success_value; ssize_t cq_readerr_try_again_value; + + /** Offset into the send buffer where the payload will go. For + libfabric v1.0.0 / API v1.0, this is 0. For libfabric >=v1.1 + / API >=v1.1, this is the endpoint.msg_prefix_size (i.e., + component.transport_header_len). */ + uint32_t prefix_send_offset; } opal_btl_usnic_component_t; OPAL_MODULE_DECLSPEC extern opal_btl_usnic_component_t mca_btl_usnic_component; diff --git a/opal/mca/btl/usnic/btl_usnic_component.c b/opal/mca/btl/usnic/btl_usnic_component.c index 26e66e630f..7302fe6f06 100644 --- a/opal/mca/btl/usnic/btl_usnic_component.c +++ b/opal/mca/btl/usnic/btl_usnic_component.c @@ -163,6 +163,7 @@ static int usnic_component_open(void) mca_btl_usnic_component.usnic_all_modules = NULL; mca_btl_usnic_component.usnic_active_modules = NULL; mca_btl_usnic_component.transport_header_len = -1; + mca_btl_usnic_component.prefix_send_offset = 0; /* initialize objects */ OBJ_CONSTRUCT(&mca_btl_usnic_component.usnic_procs, opal_list_t); @@ -630,7 +631,29 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, hints.ep_attr = &ep_attr; hints.fabric_attr = &fabric_attr; - ret = fi_getinfo(FI_VERSION(1, 0), NULL, 0, 0, &hints, &info_list); + /* This code understands libfabric API v1.0 and v1.1. 
Even if we + were compiled with libfabric API v1.0, we still want to request + v1.1 -- here's why: + + - In libfabric v1.0.0 (i.e., API v1.0), the usnic provider did + not check the value of the "version" parameter passed into + fi_getinfo() + + - If you pass FI_VERSION(1,0) to libfabric v1.1.0 (i.e., API + v1.1), the usnic provider will disable FI_MSG_PREFIX support + (on the assumption that the application will not handle + FI_MSG_PREFIX properly). This can happen if you compile OMPI + against libfabric v1.0.0 (i.e., API v1.0) and run OMPI + against libfabric v1.1.0 (i.e., API v1.1). + + So never request API v1.0 -- always request a minimum of + v1.1. */ + uint32_t libfabric_api; + libfabric_api = FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION); + if (libfabric_api == FI_VERSION(1, 0)) { + libfabric_api = FI_VERSION(1, 1); + } + ret = fi_getinfo(libfabric_api, NULL, 0, 0, &hints, &info_list); if (0 != ret) { opal_output_verbose(5, USNIC_OUT, "btl:usnic: disqualifiying myself due to fi_getinfo failure: %s (%d)", strerror(-ret), ret); @@ -671,8 +694,9 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, The ambiguities were clarified in libfabric v1.1.0 (i.e., API v1.1); the usnic provider returned 1 from fi_cq_readerr() upon success. - */ - uint32_t libfabric_api; + + So query to see what version of the libfabric API we are + running with, and adapt accordingly. 
*/ libfabric_api = fi_version(); if (1 == FI_MAJOR(libfabric_api) && 0 == FI_MINOR(libfabric_api)) { diff --git a/opal/mca/btl/usnic/btl_usnic_frag.c b/opal/mca/btl/usnic/btl_usnic_frag.c index 5944e02cba..02c80402e1 100644 --- a/opal/mca/btl/usnic/btl_usnic_frag.c +++ b/opal/mca/btl/usnic/btl_usnic_frag.c @@ -30,23 +30,22 @@ #include "btl_usnic_ack.h" static void -common_send_seg_helper( - opal_btl_usnic_send_segment_t *seg, - int offset) +common_send_seg_helper(opal_btl_usnic_send_segment_t *seg) { opal_btl_usnic_segment_t *bseg; - bseg = &seg->ss_base; - - bseg->us_btl_header = (opal_btl_usnic_btl_header_t *) - (((char*) bseg->us_list.ptr) + offset); - bseg->us_btl_header->sender = mca_btl_usnic_component.my_hashed_rte_name; - + /* send ptr for fi_send(). ss_len will be filled in right before + the actual send. */ + seg->ss_ptr = (uint8_t *) seg->ss_base.us_list.ptr; seg->ss_send_posted = 0; seg->ss_ack_pending = false; - /* send ptr, len will be filled in just before send */ - seg->ss_ptr = (uint8_t *)bseg->us_btl_header; + /* Offset the BTL header by (prefix_send_offset) bytes into the + raw buffer */ + bseg = &seg->ss_base; + bseg->us_btl_header = (opal_btl_usnic_btl_header_t *) + (seg->ss_ptr + mca_btl_usnic_component.prefix_send_offset); + bseg->us_btl_header->sender = mca_btl_usnic_component.my_hashed_rte_name; } static void @@ -59,7 +58,7 @@ chunk_seg_constructor( bseg->us_type = OPAL_BTL_USNIC_SEG_CHUNK; /* some more common initializaiton */ - common_send_seg_helper(seg, mca_btl_usnic_component.transport_header_len); + common_send_seg_helper(seg); /* payload starts next byte beyond BTL chunk header */ bseg->us_payload.raw = (uint8_t *)(bseg->us_btl_chunk_header + 1); @@ -77,7 +76,7 @@ frag_seg_constructor( bseg->us_type = OPAL_BTL_USNIC_SEG_FRAG; /* some more common initializaiton */ - common_send_seg_helper(seg, mca_btl_usnic_component.transport_header_len); + common_send_seg_helper(seg); /* payload starts next byte beyond BTL header */ 
bseg->us_payload.raw = (uint8_t *)(bseg->us_btl_header + 1); @@ -95,7 +94,7 @@ ack_seg_constructor( bseg->us_type = OPAL_BTL_USNIC_SEG_ACK; /* some more common initializaiton */ - common_send_seg_helper(ack, mca_btl_usnic_component.transport_header_len); + common_send_seg_helper(ack); /* ACK value embedded in BTL header */ bseg->us_btl_header->payload_type = OPAL_BTL_USNIC_PAYLOAD_TYPE_ACK; diff --git a/opal/mca/btl/usnic/btl_usnic_module.c b/opal/mca/btl/usnic/btl_usnic_module.c index 2b36af126b..56aacfb9ae 100644 --- a/opal/mca/btl/usnic/btl_usnic_module.c +++ b/opal/mca/btl/usnic/btl_usnic_module.c @@ -1421,7 +1421,7 @@ static int create_ep(opal_btl_usnic_module_t* module, opal_process_info.my_local_rank); } - rc = fi_getinfo(FI_VERSION(1, 0), NULL, 0, 0, hint, &channel->info); + rc = fi_getinfo(FI_VERSION(1, 1), NULL, 0, 0, hint, &channel->info); fi_freeinfo(hint); if (0 != rc) { opal_show_help("help-mpi-btl-usnic.txt", @@ -1634,6 +1634,9 @@ static int init_one_channel(opal_btl_usnic_module_t *module, goto error; } + assert(channel->info->ep_attr->msg_prefix_size == + (uint32_t) mca_btl_usnic_component.transport_header_len); + /* * Initialize pool of receive segments. Round MTU up to cache * line size so that each segment is guaranteed to start on a @@ -1777,6 +1780,33 @@ static void init_find_transport_header_len(opal_btl_usnic_module_t *module) module->fabric_info->ep_attr->msg_prefix_size; mca_btl_usnic_component.transport_protocol = module->fabric_info->ep_attr->protocol; + + /* The usnic provider in libfabric v1.0.0 (i.e., API v1.0) treated + FI_MSG_PREFIX inconsistently between senders and receivers. It + was corrected in libfabric v1.1.0 (i.e., API v1.1), meaning + that FI_MSG_PREFIX is treated consistently between senders and + receivers. + + So check what version of the libfabric API we have, and setup + to use the "old" (inconsistent) MSG_PREFIX behavior, or the + "new" MSG_PREFIX (consistent) behavior. 
+ + NOTE: This is a little redundant; we're setting a + component-level attribute during each module's setup. We do + this here (and not earlier, when we check fi_version() during + the component setup) because we can't obtain the value of the + endpoint msg_prefix_size until we setup the first module. + Also, it's safe because each module will set the component + attribute to the same value. So it's ok. */ + uint32_t libfabric_api; + libfabric_api = fi_version(); + if (1 == FI_MAJOR(libfabric_api) && + 0 == FI_MINOR(libfabric_api)) { + mca_btl_usnic_component.prefix_send_offset = 0; + } else { + mca_btl_usnic_component.prefix_send_offset = + module->fabric_info->ep_attr->msg_prefix_size; + } } /* @@ -1835,13 +1865,15 @@ static void init_payload_lengths(opal_btl_usnic_module_t *module) /* Find the max payload this port can handle */ module->max_frag_payload = module->local_modex.max_msg_size - /* start with the MTU */ - sizeof(opal_btl_usnic_btl_header_t); /* subtract size of - the BTL header */ + sizeof(opal_btl_usnic_btl_header_t) - /* subtract size of + the BTL header */ + mca_btl_usnic_component.prefix_send_offset; /* same, but use chunk header */ module->max_chunk_payload = module->local_modex.max_msg_size - - sizeof(opal_btl_usnic_btl_chunk_header_t); + sizeof(opal_btl_usnic_btl_chunk_header_t) - + mca_btl_usnic_component.prefix_send_offset; /* Priorirty queue MTU and max size */ if (0 == module->max_tiny_msg_size) { @@ -2097,7 +2129,6 @@ static void init_freelists(opal_btl_usnic_module_t *module) uint32_t segsize; segsize = (module->local_modex.max_msg_size + - module->fabric_info->ep_attr->msg_prefix_size + opal_cache_line_size - 1) & ~(opal_cache_line_size - 1); @@ -2105,7 +2136,7 @@ static void init_freelists(opal_btl_usnic_module_t *module) OBJ_CONSTRUCT(&module->small_send_frags, opal_free_list_t); rc = usnic_compat_free_list_init(&module->small_send_frags, sizeof(opal_btl_usnic_small_send_frag_t) + - mca_btl_usnic_component.transport_header_len, + 
mca_btl_usnic_component.prefix_send_offset, opal_cache_line_size, OBJ_CLASS(opal_btl_usnic_small_send_frag_t), segsize, @@ -2123,7 +2154,7 @@ static void init_freelists(opal_btl_usnic_module_t *module) OBJ_CONSTRUCT(&module->large_send_frags, opal_free_list_t); rc = usnic_compat_free_list_init(&module->large_send_frags, sizeof(opal_btl_usnic_large_send_frag_t) + - mca_btl_usnic_component.transport_header_len, + mca_btl_usnic_component.prefix_send_offset, opal_cache_line_size, OBJ_CLASS(opal_btl_usnic_large_send_frag_t), 0, /* payload size */ @@ -2141,7 +2172,7 @@ static void init_freelists(opal_btl_usnic_module_t *module) OBJ_CONSTRUCT(&module->put_dest_frags, opal_free_list_t); rc = usnic_compat_free_list_init(&module->put_dest_frags, sizeof(opal_btl_usnic_put_dest_frag_t) + - mca_btl_usnic_component.transport_header_len, + mca_btl_usnic_component.prefix_send_offset, opal_cache_line_size, OBJ_CLASS(opal_btl_usnic_put_dest_frag_t), 0, /* payload size */ @@ -2160,7 +2191,7 @@ static void init_freelists(opal_btl_usnic_module_t *module) OBJ_CONSTRUCT(&module->chunk_segs, opal_free_list_t); rc = usnic_compat_free_list_init(&module->chunk_segs, sizeof(opal_btl_usnic_chunk_segment_t) + - mca_btl_usnic_component.transport_header_len, + mca_btl_usnic_component.prefix_send_offset, opal_cache_line_size, OBJ_CLASS(opal_btl_usnic_chunk_segment_t), segsize, @@ -2178,12 +2209,15 @@ static void init_freelists(opal_btl_usnic_module_t *module) /* ACK segments freelist */ uint32_t ack_segment_len; + /* The BTL header is placed prefix_send_offset bytes into each + send buffer, so an ACK buffer must hold the prefix area plus + the BTL header (rounded up to a cache line). */ ack_segment_len = (sizeof(opal_btl_usnic_btl_header_t) + - module->fabric_info->ep_attr->msg_prefix_size + + mca_btl_usnic_component.prefix_send_offset + opal_cache_line_size - 1) & ~(opal_cache_line_size - 1); OBJ_CONSTRUCT(&module->ack_segs, opal_free_list_t); rc = usnic_compat_free_list_init(&module->ack_segs, sizeof(opal_btl_usnic_ack_segment_t) + - mca_btl_usnic_component.transport_header_len, + mca_btl_usnic_component.prefix_send_offset, opal_cache_line_size, OBJ_CLASS(opal_btl_usnic_ack_segment_t), ack_segment_len, diff --git
a/opal/mca/btl/usnic/btl_usnic_send.h b/opal/mca/btl/usnic/btl_usnic_send.h index 02fd2e9079..796008d2f7 100644 --- a/opal/mca/btl/usnic/btl_usnic_send.h +++ b/opal/mca/btl/usnic/btl_usnic_send.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -79,7 +79,7 @@ opal_btl_usnic_post_segment( /* Send the segment */ ret = fi_send(channel->ep, sseg->ss_ptr, - sseg->ss_len, + sseg->ss_len + mca_btl_usnic_component.prefix_send_offset, NULL, endpoint->endpoint_remote_addrs[channel_id], sseg); @@ -128,7 +128,7 @@ opal_btl_usnic_post_ack( ret = fi_send(channel->ep, sseg->ss_ptr, - sseg->ss_len, + sseg->ss_len + mca_btl_usnic_component.prefix_send_offset, NULL, endpoint->endpoint_remote_addrs[channel_id], sseg);