d624e0d60f
Sequence numbers will wrap around; it is not sufficient to check for (seq-1) -- we must use the SEQ_DIFF macro to properly handle the wraparound. This bug wasn't serious; it just meant we might retransmit one or two extra times when retransmits were triggered and the sequence numbers wrapped around their sliding windows.
539 строки
19 KiB
C
539 строки
19 KiB
C
/*
|
|
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "opal_config.h"
|
|
|
|
#include <unistd.h>
|
|
#include <stdlib.h>
|
|
|
|
#include "opal/util/output.h"
|
|
#include "opal/mca/base/mca_base_var.h"
|
|
#include "opal/mca/base/mca_base_pvar.h"
|
|
|
|
#include "btl_usnic_compat.h"
|
|
#include "btl_usnic.h"
|
|
#include "btl_usnic_module.h"
|
|
#include "btl_usnic_stats.h"
|
|
#include "btl_usnic_util.h"
|
|
|
|
/*
|
|
* Local variables
|
|
*/
|
|
/* MPI_T variable type used when registering the statistics pvars.
   It stays at MCA_BASE_VAR_TYPE_MAX (which setup_mpit_pvar_type()
   treats as "no compatible type found") until setup_mpit_pvar_type()
   picks an unsigned type with the same size as uint64_t. */
static mca_base_var_type_t pvar_type = MCA_BASE_VAR_TYPE_MAX;
|
|
|
|
/*
 * Zero all of the counters and high-water marks kept in module->stats,
 * plus the per-channel send counters.  Fields of module->stats that
 * are not listed here (e.g., report_num and the timer state) are left
 * untouched.
 */
static inline void usnic_stats_reset(opal_btl_usnic_module_t *module)
{
    int chan;

    /* Send side */
    module->stats.num_total_sends = 0;
    module->stats.num_resends = 0;
    module->stats.num_chunk_sends = 0;
    module->stats.num_frag_sends = 0;
    module->stats.num_ack_recvs = 0;

    /* Receive side */
    module->stats.num_total_recvs = 0;
    module->stats.num_unk_recvs = 0;
    module->stats.num_dup_recvs = 0;
    module->stats.num_oow_low_recvs = 0;
    module->stats.num_oow_high_recvs = 0;
    module->stats.num_frag_recvs = 0;
    module->stats.num_chunk_recvs = 0;
    module->stats.num_badfrag_recvs = 0;
    module->stats.num_ack_sends = 0;
    module->stats.num_recv_reposts = 0;
    module->stats.num_crc_errors = 0;

    /* ACK bookkeeping and retransmissions */
    module->stats.num_old_dup_acks = 0;
    module->stats.num_dup_acks = 0;
    module->stats.num_fast_retrans = 0;
    module->stats.num_timeout_retrans = 0;

    /* High-water marks */
    module->stats.max_sent_window_size = 0;
    module->stats.max_rcvd_window_size = 0;

    /* PML interaction */
    module->stats.pml_module_sends = 0;
    module->stats.pml_send_callbacks = 0;

    /* Per-channel send counters */
    for (chan = 0; chan < USNIC_NUM_CHANNELS; ++chan) {
        module->mod_channels[chan].num_channel_sends = 0;
    }
}
|
|
|
|
/* Prints a few terse statistics lines via opal_output(0,...). The first
|
|
* line will be prefixed with the string "prefix". If "reset_stats" is true
|
|
* then the statistics will be reset after printing.
|
|
*
|
|
* NOTE: this routine ignores the setting of stats_enable, so it can be used
|
|
* for debugging routines even when normal stats reporting is not enabled.
|
|
*/
|
|
void opal_btl_usnic_print_stats(
|
|
opal_btl_usnic_module_t *module,
|
|
const char *prefix,
|
|
bool reset_stats)
|
|
{
|
|
char tmp[128], str[2048];
|
|
|
|
/* The usuals */
|
|
snprintf(str, sizeof(str), "%s:MCW:%3u, %s, ST(P+D)/F/C/R(T+F)/A:%8lu(%8u+%8u)/%8lu/%8lu/%4lu(%4lu+%4lu)/%8lu, RcvTot/Chk/F/C/L/H/D/BF/A:%8lu/%c%c/%8lu/%8lu/%4lu+%2lu/%4lu/%4lu/%6lu OA/DA %4lu/%4lu CRC:%4lu ",
|
|
prefix,
|
|
opal_proc_local_get()->proc_name.vpid,
|
|
|
|
module->fabric_info->fabric_attr->name,
|
|
|
|
module->stats.num_total_sends,
|
|
module->mod_channels[USNIC_PRIORITY_CHANNEL].num_channel_sends,
|
|
module->mod_channels[USNIC_DATA_CHANNEL].num_channel_sends,
|
|
module->stats.num_frag_sends,
|
|
module->stats.num_chunk_sends,
|
|
module->stats.num_resends,
|
|
module->stats.num_timeout_retrans,
|
|
module->stats.num_fast_retrans,
|
|
module->stats.num_ack_sends,
|
|
|
|
module->stats.num_total_recvs,
|
|
(module->stats.num_total_recvs -
|
|
module->stats.num_recv_reposts) == 0 ? 'g' : 'B',
|
|
(module->stats.num_total_recvs -
|
|
module->stats.num_frag_recvs -
|
|
module->stats.num_chunk_recvs -
|
|
module->stats.num_badfrag_recvs -
|
|
module->stats.num_oow_low_recvs -
|
|
module->stats.num_oow_high_recvs -
|
|
module->stats.num_dup_recvs -
|
|
module->stats.num_ack_recvs -
|
|
module->stats.num_unk_recvs) == 0 ? 'g' : 'B',
|
|
module->stats.num_frag_recvs,
|
|
module->stats.num_chunk_recvs,
|
|
module->stats.num_oow_low_recvs,
|
|
module->stats.num_oow_high_recvs,
|
|
module->stats.num_dup_recvs,
|
|
module->stats.num_badfrag_recvs,
|
|
module->stats.num_ack_recvs,
|
|
|
|
module->stats.num_old_dup_acks,
|
|
module->stats.num_dup_acks,
|
|
|
|
module->stats.num_crc_errors);
|
|
|
|
/* If our PML calls were 0, then show send and receive window
|
|
extents instead */
|
|
if (module->stats.pml_module_sends +
|
|
module->stats.pml_send_callbacks == 0) {
|
|
int64_t send_unacked, su_min = WINDOW_SIZE * 2, su_max = 0;
|
|
int64_t recv_depth, rd_min = WINDOW_SIZE * 2, rd_max = 0;
|
|
opal_btl_usnic_endpoint_t *endpoint;
|
|
opal_list_item_t *item;
|
|
|
|
rd_min = su_min = WINDOW_SIZE * 2;
|
|
rd_max = su_max = 0;
|
|
|
|
opal_mutex_lock(&module->all_endpoints_lock);
|
|
item = opal_list_get_first(&module->all_endpoints);
|
|
while (item != opal_list_get_end(&(module->all_endpoints))) {
|
|
endpoint = container_of(item, mca_btl_base_endpoint_t,
|
|
endpoint_endpoint_li);
|
|
item = opal_list_get_next(item);
|
|
|
|
/* Number of un-acked sends (i.e., sends for which we're
|
|
still waiting for ACK) */
|
|
send_unacked =
|
|
SEQ_DIFF(endpoint->endpoint_next_seq_to_send,
|
|
SEQ_DIFF(endpoint->endpoint_ack_seq_rcvd, 1));
|
|
|
|
if (send_unacked > su_max) su_max = send_unacked;
|
|
if (send_unacked < su_min) su_min = send_unacked;
|
|
|
|
/* Receive window depth (i.e., difference between highest
|
|
seq received and the next message we haven't ACKed
|
|
yet) */
|
|
recv_depth =
|
|
endpoint->endpoint_highest_seq_rcvd -
|
|
endpoint->endpoint_next_contig_seq_to_recv;
|
|
if (recv_depth > rd_max) rd_max = recv_depth;
|
|
if (recv_depth < rd_min) rd_min = recv_depth;
|
|
}
|
|
opal_mutex_unlock(&module->all_endpoints_lock);
|
|
snprintf(tmp, sizeof(tmp), "PML S:%1ld, Win!A/R:%4ld/%4ld %4ld/%4ld",
|
|
module->stats.pml_module_sends,
|
|
su_min, su_max,
|
|
rd_min, rd_max);
|
|
} else {
|
|
snprintf(tmp, sizeof(tmp), "PML S/CB/Diff:%4lu/%4lu=%4ld",
|
|
module->stats.pml_module_sends,
|
|
module->stats.pml_send_callbacks,
|
|
module->stats.pml_module_sends -
|
|
module->stats.pml_send_callbacks);
|
|
}
|
|
|
|
strncat(str, tmp, sizeof(str) - strlen(str) - 1);
|
|
opal_output(0, "%s", str);
|
|
|
|
if (reset_stats) {
|
|
usnic_stats_reset(module);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Callback routine for libevent
|
|
*/
|
|
static void usnic_stats_callback(int fd, short flags, void *arg)
|
|
{
|
|
opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) arg;
|
|
char tmp[128];
|
|
|
|
if (!mca_btl_usnic_component.stats_enabled) {
|
|
return;
|
|
}
|
|
|
|
snprintf(tmp, sizeof(tmp), "%4lu", ++module->stats.report_num);
|
|
|
|
opal_btl_usnic_print_stats(module, tmp,
|
|
/*reset=*/mca_btl_usnic_component.stats_relative);
|
|
}
|
|
|
|
/*
|
|
* Initialize usnic module statistics
|
|
*/
|
|
int opal_btl_usnic_stats_init(opal_btl_usnic_module_t *module)
|
|
{
|
|
if (mca_btl_usnic_component.stats_enabled) {
|
|
usnic_stats_reset(module);
|
|
|
|
module->stats.timeout.tv_sec = mca_btl_usnic_component.stats_frequency;
|
|
module->stats.timeout.tv_usec = 0;
|
|
|
|
opal_event_set(mca_btl_usnic_component.opal_evbase,
|
|
&(module->stats.timer_event),
|
|
-1, EV_TIMEOUT | EV_PERSIST,
|
|
&usnic_stats_callback, module);
|
|
opal_event_add(&(module->stats.timer_event),
|
|
&(module->stats.timeout));
|
|
}
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
/*
|
|
* Finalize usnic module statistics
|
|
*/
|
|
int opal_btl_usnic_stats_finalize(opal_btl_usnic_module_t *module)
|
|
{
|
|
/* Disable the stats callback event, and then call the stats
|
|
callback manually to display the final stats */
|
|
if (mca_btl_usnic_component.stats_enabled) {
|
|
opal_event_del(&(module->stats.timer_event));
|
|
opal_btl_usnic_print_stats(module, "final", /*reset_stats=*/false);
|
|
}
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
/************************************************************************/
|
|
|
|
/*
|
|
* Function called by the pvar base upon MPI_T_pvar_handle_alloc,
|
|
* handle_start, and handle_stop.
|
|
*/
|
|
static int usnic_pvar_notify(struct mca_base_pvar_t *pvar,
|
|
mca_base_pvar_event_t event,
|
|
void *obj, int *count)
|
|
{
|
|
if (MCA_BASE_PVAR_HANDLE_BIND == event) {
|
|
*count = mca_btl_usnic_component.num_modules;
|
|
}
|
|
|
|
/* Don't care about the other events */
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
|
|
/*
|
|
* Function called by the pvar base when a user wants to read the
|
|
* value of an MPI_T performance variable.
|
|
*/
|
|
static int usnic_pvar_read(const struct mca_base_pvar_t *pvar,
|
|
void *value, void *bound_obj)
|
|
{
|
|
size_t offset = (size_t) pvar->ctx;
|
|
uint64_t *array = (uint64_t*) value;
|
|
|
|
for (int i = 0; i < mca_btl_usnic_component.num_modules; ++i) {
|
|
char *base = (char*) &(mca_btl_usnic_component.usnic_active_modules[i]->stats);
|
|
array[i] = *((uint64_t*) (base + offset));
|
|
}
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
|
|
/*
|
|
* Register an MPI_T performance variable of type CLASS_HIGHWATERMARK.
|
|
*/
|
|
static void register_pvar_highwater(char *name, char *desc, size_t offset)
|
|
{
|
|
int rc __opal_attribute_unused__;
|
|
|
|
rc = mca_base_component_pvar_register(&mca_btl_usnic_component.super.btl_version,
|
|
name, desc,
|
|
OPAL_INFO_LVL_5,
|
|
MCA_BASE_PVAR_CLASS_HIGHWATERMARK,
|
|
pvar_type,
|
|
NULL, /* enumeration */
|
|
MCA_BASE_VAR_BIND_NO_OBJECT,
|
|
(MCA_BASE_PVAR_FLAG_READONLY |
|
|
MCA_BASE_PVAR_FLAG_CONTINUOUS),
|
|
usnic_pvar_read,
|
|
NULL, /* write function */
|
|
usnic_pvar_notify,
|
|
(void *) offset);
|
|
assert(rc >= 0);
|
|
}
|
|
|
|
|
|
/*
|
|
* Function called by the pvar base when a user wants to read the
|
|
* devices enum value. The array is a simple list of 0..num_modules,
|
|
* which will map to the strings in the devices_enum
|
|
* setup_mpit_pvar_type().
|
|
*/
|
|
static int usnic_pvar_enum_read(const struct mca_base_pvar_t *pvar,
|
|
void *value, void *bound_obj)
|
|
{
|
|
int *array = (int *) value;
|
|
|
|
for (int i = 0; i < mca_btl_usnic_component.num_modules; ++i) {
|
|
array[i] = i;
|
|
}
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
|
|
/*
|
|
* Register an MPI_T performance variable of type CLASS_COUNTER.
|
|
*/
|
|
static void register_pvar_counter(char *name, char *desc, size_t offset)
|
|
{
|
|
int rc __opal_attribute_unused__;
|
|
|
|
rc = mca_base_component_pvar_register(&mca_btl_usnic_component.super.btl_version,
|
|
name, desc,
|
|
OPAL_INFO_LVL_5,
|
|
MCA_BASE_PVAR_CLASS_COUNTER,
|
|
pvar_type,
|
|
NULL, /* enumeration */
|
|
MCA_BASE_VAR_BIND_NO_OBJECT,
|
|
(MCA_BASE_PVAR_FLAG_READONLY |
|
|
MCA_BASE_PVAR_FLAG_CONTINUOUS),
|
|
usnic_pvar_read,
|
|
NULL, /* write function */
|
|
usnic_pvar_notify,
|
|
(void *) offset);
|
|
assert(rc >= 0);
|
|
}
|
|
|
|
|
|
/*
|
|
* Find the MPI_T type corresponding to our uint64_t counters and
|
|
* highwatermarks.
|
|
*/
|
|
static bool setup_mpit_pvar_type(void)
|
|
{
|
|
/* Our stats variables are uint64_t's, so find a pvar type that is
|
|
compatible */
|
|
if (sizeof(uint64_t) == sizeof(unsigned int)) {
|
|
pvar_type = MCA_BASE_VAR_TYPE_UNSIGNED_INT;
|
|
} else if (sizeof(uint64_t) == sizeof(unsigned long)) {
|
|
pvar_type = MCA_BASE_VAR_TYPE_UNSIGNED_LONG;
|
|
#ifdef HAVE_UNSIGNED_LONG_LONG
|
|
} else if (sizeof(uint64_t) == sizeof(unsigned long long)) {
|
|
pvar_type = MCA_BASE_VAR_TYPE_UNSIGNED_LONG_LONG;
|
|
#endif
|
|
}
|
|
|
|
/* Let the caller know if we found a compatible type or not */
|
|
if (MCA_BASE_VAR_TYPE_MAX == pvar_type) {
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
|
|
/*
|
|
* Setup the usnic_X device enumeration pvar
|
|
*/
|
|
static void setup_mpit_pvars_enum(void)
|
|
{
|
|
int i;
|
|
int rc __opal_attribute_unused__;
|
|
mca_base_var_enum_value_t *devices;
|
|
static mca_base_var_enum_t *devices_enum;
|
|
opal_btl_usnic_module_t *m;
|
|
unsigned char *c;
|
|
struct sockaddr_in *sin;
|
|
|
|
devices = calloc(mca_btl_usnic_component.num_modules + 1,
|
|
sizeof(*devices));
|
|
assert(devices != NULL);
|
|
|
|
for (i = 0; i < mca_btl_usnic_component.num_modules; ++i) {
|
|
char *str;
|
|
|
|
m = mca_btl_usnic_component.usnic_active_modules[i];
|
|
sin = m->fabric_info->src_addr;
|
|
c = (unsigned char*) &sin->sin_addr.s_addr;
|
|
|
|
devices[i].value = i;
|
|
rc = asprintf(&str, "%s,%hhu.%hhu.%hhu.%hhu/%" PRIu32,
|
|
m->fabric_info->fabric_attr->name,
|
|
c[0], c[1], c[2], c[3],
|
|
usnic_netmask_to_cidrlen(sin->sin_addr.s_addr));
|
|
assert(rc > 0);
|
|
devices[i].string = str;
|
|
}
|
|
devices[i].string = NULL;
|
|
|
|
rc = mca_base_var_enum_create("btl_usnic", devices, &devices_enum);
|
|
assert(OPAL_SUCCESS == rc);
|
|
|
|
rc = mca_base_component_pvar_register(&mca_btl_usnic_component.super.btl_version,
|
|
"devices",
|
|
"Enumeration representing which slot in btl_usnic_* MPI_T pvar value arrays correspond to which usnic_X Linux device",
|
|
OPAL_INFO_LVL_5,
|
|
MCA_BASE_PVAR_CLASS_STATE,
|
|
MCA_BASE_VAR_TYPE_INT,
|
|
devices_enum,
|
|
MCA_BASE_VAR_BIND_NO_OBJECT,
|
|
(MCA_BASE_PVAR_FLAG_READONLY |
|
|
MCA_BASE_PVAR_FLAG_CONTINUOUS),
|
|
usnic_pvar_enum_read,
|
|
NULL, /* write function */
|
|
usnic_pvar_notify,
|
|
NULL /* context */);
|
|
assert(rc >= 0);
|
|
|
|
/* Free the strings (mca_base_var_enum_create() strdup()'ed them
|
|
into private storage, so we don't need them any more) */
|
|
for (int i = 0; i < mca_btl_usnic_component.num_modules; ++i) {
|
|
free((char*) devices[i].string);
|
|
}
|
|
free(devices);
|
|
|
|
/* The devices_enum has been RETAIN'ed by the pvar, so we can
|
|
RELEASE it here, and the enum will be destroyed when the pvar
|
|
is destroyed. */
|
|
OBJ_RELEASE(devices_enum);
|
|
}
|
|
|
|
|
|
/*
|
|
* Setup high watermark MPI_T performance variables
|
|
*/
|
|
static void setup_mpit_pvars_highwatermark(void)
|
|
{
|
|
#define REGISTERHW(field, desc) \
|
|
register_pvar_highwater(#field, (desc), offsetof(opal_btl_usnic_module_stats_t, field))
|
|
|
|
REGISTERHW(max_sent_window_size,
|
|
"Maximum number of entries in all send windows from this peer");
|
|
REGISTERHW(max_rcvd_window_size,
|
|
"Maximum number of entries in all receive windows to this peer");
|
|
}
|
|
|
|
|
|
/*
|
|
* Setup counter MPI_T performance variables
|
|
*/
|
|
static void setup_mpit_pvars_counters(void)
|
|
{
|
|
#define REGISTERC(field, desc) \
|
|
register_pvar_counter(#field, (desc), offsetof(opal_btl_usnic_module_stats_t, field))
|
|
|
|
REGISTERC(num_total_sends,
|
|
"Total number of sends (MPI data, ACKs, retransmissions, etc.)");
|
|
REGISTERC(num_resends,
|
|
"Total number of all retransmissions");
|
|
REGISTERC(num_timeout_retrans,
|
|
"Number of times chunk retransmissions have occured because an ACK was not received within the timeout");
|
|
REGISTERC(num_fast_retrans,
|
|
"Number of times chunk retransmissions have occured because due to a repeated ACK");
|
|
REGISTERC(num_chunk_sends,
|
|
"Number of sends that were part of a larger MPI message fragment (i.e., the MPI message was so long that it had to be split into multiple MTU/network sends)");
|
|
REGISTERC(num_frag_sends,
|
|
"Number of sends where the entire MPI message fragment fit into a single MTU/network send");
|
|
REGISTERC(num_ack_sends,
|
|
"Number of ACKs sent (i.e., usNIC-BTL-to-usNIC-BTL control messages)");
|
|
|
|
REGISTERC(num_total_recvs,
|
|
"Total number of receives completed");
|
|
REGISTERC(num_unk_recvs,
|
|
"Number of receives with an unknown source or type, and therefore ignored by the usNIC BTL (this should never be >0)");
|
|
REGISTERC(num_dup_recvs,
|
|
"Number of duplicate receives");
|
|
REGISTERC(num_oow_low_recvs,
|
|
"Number of times a receive was out of the sliding window (on the low side)");
|
|
REGISTERC(num_oow_high_recvs,
|
|
"Number of times a receive was out of the sliding window (on the high side)");
|
|
REGISTERC(num_frag_recvs,
|
|
"Number of receives where the entire MPI message fragment fit into a single MTU/network send");
|
|
REGISTERC(num_chunk_recvs,
|
|
"Number of receives that were part of a larger MPI message fragment (i.e., this receive was reassembled into a larger MPI message fragment)");
|
|
REGISTERC(num_badfrag_recvs,
|
|
"Number of chunks received that had a bad fragment ID (this should never be >0)");
|
|
|
|
REGISTERC(num_ack_recvs,
|
|
"Total number of ACKs received");
|
|
REGISTERC(num_old_dup_acks,
|
|
"Number of old duplicate ACKs received (i.e., before the current expected ACK)");
|
|
REGISTERC(num_dup_acks,
|
|
"Number of duplicate ACKs received (i.e., the current expected ACK)");
|
|
|
|
REGISTERC(num_recv_reposts,
|
|
"Number of times buffers have been reposted for receives");
|
|
REGISTERC(num_crc_errors,
|
|
"Number of times receives were aborted because of a CRC error");
|
|
|
|
REGISTERC(pml_module_sends,
|
|
"Number of times the PML has called down to send a message");
|
|
REGISTERC(pml_send_callbacks,
|
|
"Number of times the usNIC BTL has called up to the PML to complete a send");
|
|
}
|
|
|
|
|
|
/*
|
|
* Initialize MPI_T performance variables
|
|
*/
|
|
int opal_btl_usnic_setup_mpit_pvars(void)
|
|
{
|
|
/* If we cannot find a compatible pvar type, we're done (i.e.,
|
|
don't register any pvars) */
|
|
if (!setup_mpit_pvar_type()) {
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
/* Setup the usnic_X device enumeration pvar */
|
|
setup_mpit_pvars_enum();
|
|
|
|
/* Register watermark pvars */
|
|
setup_mpit_pvars_highwatermark();
|
|
|
|
/* If our counter stats are relative, don't report them through
|
|
MPI_T, because MPI_T expects counters to be monotonically
|
|
rising. */
|
|
if (!mca_btl_usnic_component.stats_relative) {
|
|
setup_mpit_pvars_counters();
|
|
}
|
|
|
|
/* All done */
|
|
return OPAL_SUCCESS;
|
|
}
|