diff --git a/opal/mca/btl/usnic/btl_usnic.h b/opal/mca/btl/usnic/btl_usnic.h index 4907af5f37..c128f5cbba 100644 --- a/opal/mca/btl/usnic/btl_usnic.h +++ b/opal/mca/btl/usnic/btl_usnic.h @@ -57,7 +57,7 @@ extern uint64_t opal_btl_usnic_ticks; extern opal_recursive_mutex_t btl_usnic_lock; static inline uint64_t -get_nsec(void) +get_ticks(void) { return opal_btl_usnic_ticks; } @@ -190,6 +190,14 @@ typedef struct opal_btl_usnic_component_t { /** retrans characteristics */ int retrans_timeout; + /** max number of messages re-sent during a single progress + iteration */ + int max_resends_per_iteration; + + /** minimum number of times through component progress before + checking to see if standalone ACKs need to be sent */ + int ack_iteration_delay; + /** transport header length for all usNIC devices on this server (it is guaranteed that all usNIC devices on a single server will have the same underlying transport, and therefore the diff --git a/opal/mca/btl/usnic/btl_usnic_component.c b/opal/mca/btl/usnic/btl_usnic_component.c index 5fefb9be07..ea796e8e3b 100644 --- a/opal/mca/btl/usnic/btl_usnic_component.c +++ b/opal/mca/btl/usnic/btl_usnic_component.c @@ -380,8 +380,9 @@ static int check_usnic_config(opal_btl_usnic_module_t *module, static void usnic_clock_callback(int fd, short flags, void *timeout) { - /* 1ms == 1,000,000 ns */ - opal_btl_usnic_ticks += 1000000; + /* Increase by so many ticks that we will definitely force sending + any ACKs that are pending */ + opal_btl_usnic_ticks += 1000; /* run progress to make sure time change gets noticed */ usnic_component_progress(); @@ -1128,7 +1129,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, */ static int usnic_handle_completion(opal_btl_usnic_module_t* module, opal_btl_usnic_channel_t *channel, struct fi_cq_entry *completion); -static int usnic_component_progress_2(void); +static int usnic_component_progress_2(bool check_priority); static void 
usnic_handle_cq_error(opal_btl_usnic_module_t* module, opal_btl_usnic_channel_t *channel, int cq_ret); @@ -1141,9 +1142,7 @@ static int usnic_component_progress(void) struct fi_cq_entry completion; opal_btl_usnic_channel_t *channel; static bool fastpath_ok = true; - - /* update our simulated clock */ - opal_btl_usnic_ticks += 5000; + bool check_priority = true; count = 0; if (fastpath_ok) { @@ -1176,10 +1175,11 @@ static int usnic_component_progress(void) usnic_handle_cq_error(module, channel, ret); } } + check_priority = false; } fastpath_ok = true; - return count + usnic_component_progress_2(); + return count + usnic_component_progress_2(check_priority); } static int usnic_handle_completion( @@ -1300,7 +1300,7 @@ usnic_handle_cq_error(opal_btl_usnic_module_t* module, } } -static int usnic_component_progress_2(void) +static int usnic_component_progress_2(bool check_priority) { int i, j, count = 0, num_events, ret; opal_btl_usnic_module_t* module; @@ -1309,15 +1309,18 @@ static int usnic_component_progress_2(void) int rc; int c; - /* update our simulated clock */ - opal_btl_usnic_ticks += 5000; + opal_btl_usnic_ticks += 1; + + /* If we need to check priority, start with the priority channel. + Otherwise, just check the data channel. */ + int c_start = check_priority ? 
USNIC_PRIORITY_CHANNEL : USNIC_DATA_CHANNEL; /* Poll for completions */ for (i = 0; i < mca_btl_usnic_component.num_modules; i++) { module = mca_btl_usnic_component.usnic_active_modules[i]; /* poll each channel */ - for (c=0; c<USNIC_NUM_CHANNELS; ++c) { + for (c=c_start; c<USNIC_NUM_CHANNELS; ++c) { channel = &module->mod_channels[c]; if (channel->chan_deferred_recv != NULL) { diff --git a/opal/mca/btl/usnic/btl_usnic_frag.h b/opal/mca/btl/usnic/btl_usnic_frag.h index 428bac082c..16782debf4 100644 --- a/opal/mca/btl/usnic/btl_usnic_frag.h +++ b/opal/mca/btl/usnic/btl_usnic_frag.h @@ -138,18 +138,24 @@ typedef struct { the length of the packet to meet a minimum size */ uint16_t payload_len; - /* If this is an emulated PUT, store at this address on receiver */ - char *put_addr; - /* Type of BTL header (see enum, above) */ uint8_t payload_type; /* true if there is piggy-backed ACK */ uint8_t ack_present; + /* This field is ordered here so that we have no holes in the + struct. Technically this doesn't matter, because we're using + the __packed__ attribute (so there will be no holes anyway), + but ordering things nicely in the struct prevents the need for + unaligned reads/writes when using __packed__. 
*/ + /* If this is an emulated PUT, store at this address on + receiver */ + char *put_addr; + /* tag for upper layer */ mca_btl_base_tag_t tag; -} opal_btl_usnic_btl_header_t; +} __opal_attribute_packed__ opal_btl_usnic_btl_header_t; /** * BTL header for a chunk of a fragment diff --git a/opal/mca/btl/usnic/btl_usnic_mca.c b/opal/mca/btl/usnic/btl_usnic_mca.c index 7da0812186..42e436d5b2 100644 --- a/opal/mca/btl/usnic/btl_usnic_mca.c +++ b/opal/mca/btl/usnic/btl_usnic_mca.c @@ -246,9 +246,17 @@ int opal_btl_usnic_component_register(void) mca_btl_usnic_component.udp_port_base = (int) udp_port_base; CHECK(reg_int("retrans_timeout", "Number of microseconds before retransmitting a frame", - 5000, &mca_btl_usnic_component.retrans_timeout, + 100000, &mca_btl_usnic_component.retrans_timeout, REGINT_GE_ONE, OPAL_INFO_LVL_5)); + CHECK(reg_int("max_resends_per_iteration", "Maximum number of frames to resend in a single iteration through usNIC component progress", + 16, &mca_btl_usnic_component.max_resends_per_iteration, + REGINT_GE_ONE, OPAL_INFO_LVL_5)); + + CHECK(reg_int("ack_iteration_delay", "Minimum number of times through usNIC \"progress\" function before checking to see if standalone ACKs need to be sent", + 0, &mca_btl_usnic_component.ack_iteration_delay, + REGINT_GE_ZERO, OPAL_INFO_LVL_5)); + CHECK(reg_int("priority_limit", "Max size of \"priority\" messages (0 = use pre-set defaults; depends on number and type of devices available)", 0, &max_tiny_msg_size, REGINT_GE_ZERO, OPAL_INFO_LVL_5)); diff --git a/opal/mca/btl/usnic/btl_usnic_module.c b/opal/mca/btl/usnic/btl_usnic_module.c index 0d7e3d64a9..c16001908e 100644 --- a/opal/mca/btl/usnic/btl_usnic_module.c +++ b/opal/mca/btl/usnic/btl_usnic_module.c @@ -954,11 +954,12 @@ usnic_do_resends( opal_btl_usnic_send_segment_t *sseg; opal_btl_usnic_endpoint_t *endpoint; struct opal_btl_usnic_channel_t *data_channel; - int ret; + int ret, count; data_channel = &module->mod_channels[USNIC_DATA_CHANNEL]; - while 
((get_send_credits(data_channel) > 1) && + count = mca_btl_usnic_component.max_resends_per_iteration; + while (count > 0 && (get_send_credits(data_channel) > 1) && !opal_list_is_empty(&module->pending_resend_segs)) { /* @@ -999,6 +1000,8 @@ usnic_do_resends( if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { opal_btl_usnic_util_abort("hotel checkin failed\n", __FILE__, __LINE__); } + + --count; } } @@ -1226,7 +1229,7 @@ opal_btl_usnic_module_progress_sends( /* Is it time to send ACK? */ if (endpoint->endpoint_acktime == 0 || - endpoint->endpoint_acktime <= get_nsec()) { + endpoint->endpoint_acktime <= get_ticks()) { if (OPAL_LIKELY(opal_btl_usnic_ack_send(module, endpoint) == OPAL_SUCCESS)) { opal_btl_usnic_remove_from_endpoints_needing_ack(endpoint); } else { @@ -2344,14 +2347,14 @@ static void init_freelists(opal_btl_usnic_module_t *module) uint32_t segsize; segsize = (module->local_modex.max_msg_size + - opal_cache_line_size - 1) & + mca_btl_usnic_component.prefix_send_offset + + opal_cache_line_size - 1) & ~(opal_cache_line_size - 1); /* Send frags freelists */ OBJ_CONSTRUCT(&module->small_send_frags, opal_free_list_t); rc = usnic_compat_free_list_init(&module->small_send_frags, - sizeof(opal_btl_usnic_small_send_frag_t) + - mca_btl_usnic_component.prefix_send_offset, + sizeof(opal_btl_usnic_small_send_frag_t), opal_cache_line_size, OBJ_CLASS(opal_btl_usnic_small_send_frag_t), segsize, @@ -2368,8 +2371,7 @@ static void init_freelists(opal_btl_usnic_module_t *module) OBJ_CONSTRUCT(&module->large_send_frags, opal_free_list_t); rc = usnic_compat_free_list_init(&module->large_send_frags, - sizeof(opal_btl_usnic_large_send_frag_t) + - mca_btl_usnic_component.prefix_send_offset, + sizeof(opal_btl_usnic_large_send_frag_t), opal_cache_line_size, OBJ_CLASS(opal_btl_usnic_large_send_frag_t), 0, /* payload size */ @@ -2386,8 +2388,7 @@ static void init_freelists(opal_btl_usnic_module_t *module) OBJ_CONSTRUCT(&module->put_dest_frags, opal_free_list_t); rc = 
usnic_compat_free_list_init(&module->put_dest_frags, - sizeof(opal_btl_usnic_put_dest_frag_t) + - mca_btl_usnic_component.prefix_send_offset, + sizeof(opal_btl_usnic_put_dest_frag_t), opal_cache_line_size, OBJ_CLASS(opal_btl_usnic_put_dest_frag_t), 0, /* payload size */ @@ -2405,8 +2406,7 @@ static void init_freelists(opal_btl_usnic_module_t *module) /* list of segments to use for sending */ OBJ_CONSTRUCT(&module->chunk_segs, opal_free_list_t); rc = usnic_compat_free_list_init(&module->chunk_segs, - sizeof(opal_btl_usnic_chunk_segment_t) + - mca_btl_usnic_component.prefix_send_offset, + sizeof(opal_btl_usnic_chunk_segment_t), opal_cache_line_size, OBJ_CLASS(opal_btl_usnic_chunk_segment_t), segsize, @@ -2424,11 +2424,11 @@ static void init_freelists(opal_btl_usnic_module_t *module) /* ACK segments freelist */ uint32_t ack_segment_len; ack_segment_len = (sizeof(opal_btl_usnic_btl_header_t) + + mca_btl_usnic_component.prefix_send_offset + opal_cache_line_size - 1) & ~(opal_cache_line_size - 1); OBJ_CONSTRUCT(&module->ack_segs, opal_free_list_t); rc = usnic_compat_free_list_init(&module->ack_segs, - sizeof(opal_btl_usnic_ack_segment_t) + - mca_btl_usnic_component.prefix_send_offset, + sizeof(opal_btl_usnic_ack_segment_t), opal_cache_line_size, OBJ_CLASS(opal_btl_usnic_ack_segment_t), ack_segment_len, diff --git a/opal/mca/btl/usnic/btl_usnic_recv.h b/opal/mca/btl/usnic/btl_usnic_recv.h index 7e056e488d..7a178c1630 100644 --- a/opal/mca/btl/usnic/btl_usnic_recv.h +++ b/opal/mca/btl/usnic/btl_usnic_recv.h @@ -112,9 +112,12 @@ opal_btl_usnic_update_window( opal_btl_usnic_add_to_endpoints_needing_ack(endpoint); } - /* give this process a chance to send something before ACKing */ + /* A heuristic: set to send this ACK after we have checked our + incoming DATA_CHANNEL component.ack_iteration_delay times + (i.e., so we can piggyback an ACK on an outgoing send) */ if (0 == endpoint->endpoint_acktime) { - endpoint->endpoint_acktime = get_nsec() + 50000; /* 50 usec */ + 
endpoint->endpoint_acktime = + get_ticks() + mca_btl_usnic_component.ack_iteration_delay; } /* Save this incoming segment in the received segmentss array on the