From 6855ebb84bb1b95d72f29f443f09e28745de8303 Mon Sep 17 00:00:00 2001 From: Mark Allen Date: Wed, 1 Aug 2018 17:49:52 -0400 Subject: [PATCH] Adding -mca comm_method to print table of communication methods This is closely related to Platform-MPI's old -prot feature. The long-format of the tables it prints could look like this: > Host 0 [myhost001] ranks 0 - 1 > Host 1 [myhost002] ranks 2 - 3 > Host 2 [myhost003] ranks 4 > Host 3 [myhost004] ranks 5 > Host 4 [myhost005] ranks 6 > Host 5 [myhost006] ranks 7 > Host 6 [myhost007] ranks 8 > Host 7 [myhost008] ranks 9 > Host 8 [myhost009] ranks 10 > > host | 0 1 2 3 4 5 6 7 8 > ======|============================================== > 0 : sm tcp tcp tcp tcp tcp tcp tcp tcp > 1 : tcp sm tcp tcp tcp tcp tcp tcp tcp > 2 : tcp tcp self tcp tcp tcp tcp tcp tcp > 3 : tcp tcp tcp self tcp tcp tcp tcp tcp > 4 : tcp tcp tcp tcp self tcp tcp tcp tcp > 5 : tcp tcp tcp tcp tcp self tcp tcp tcp > 6 : tcp tcp tcp tcp tcp tcp self tcp tcp > 7 : tcp tcp tcp tcp tcp tcp tcp self tcp > 8 : tcp tcp tcp tcp tcp tcp tcp tcp self > > Connection summary: > on-host: all connections are sm or self > off-host: all connections are tcp In this example hosts 0 and 1 had multiple ranks so "sm" was more meaningful than "self" to identify how the ranks on the host are talking to each other. While host 2..8 were one rank per host so "self" was more meaningful as their btl. 
Above a certain number of hosts (12 by default) the above table gets too big so we shrink to a more abbreviated looking table that has the same data: > host | 0 1 2 3 4 8 > ======|==================== > 0 : A C C C C C C C C > 1 : C A C C C C C C C > 2 : C C B C C C C C C > 3 : C C C B C C C C C > 4 : C C C C B C C C C > 5 : C C C C C B C C C > 6 : C C C C C C B C C > 7 : C C C C C C C B C > 8 : C C C C C C C C B > key: A == sm > key: B == self > key: C == tcp Then above 36 hosts we stop printing the 2d table entirely and just print the summary: > Connection summary: > on-host: all connections are sm or self > off-host: all connections are tcp The options to control it are -mca comm_method 1 : print the above table at the end of MPI_Init -mca comm_method 2 : print the above table at the beginning of MPI_Finalize -mca comm_method_max <n> : number of hosts for which to print a full size 2d table -mca comm_method_brief 1 : only print summary output, no 2d table -mca comm_method_fakefile <filename> : for debugging only * printing at init vs finalize: The most important difference between these two is that when printing the table during MPI_Init(), we send extra messages to make sure all hosts are connected to each other. So the table ends up working against the idea of on-demand connections (although it's only forcing the n^2 connections in the number of hosts, not the total ranks). If printing at MPI_Finalize() we don't create any connections that aren't already connected, so the table is more likely to have "n/a" entries if some hosts never connected to each other. 
* how many hosts for which to print a full size 2d table The option -mca comm_method_max <n> can be used to specify a number of hosts (default 12) that controls at what host-count the unabbreviated / abbreviated 2d tables get printed: 1 - n : full size 2d table n+1 - 3n : shortened 2d table 3n+1 - inf : summary only, no 2d table * brief The option -mca comm_method_brief 1 can be used to skip the printing of the 2d table and only show the short summary. * fakefile This is a debugging option that allows easier testing of all the printout routines by letting all the detected communication methods between the hosts be overridden by fake data from a file. The source of the information used in the table is the component's .mca_component_name field. In the case of BTLs, the module always had a .btl_component linking back to the component. The vars mca_pml_base_selected_component and ompi_mtl_base_selected_component offer similar functionality for pml/mtl. So with the ability to identify the component, we can then access the component name with code like this: mca_pml_base_selected_component.pmlm_version.mca_component_name See the three lookup_{pml,mtl,btl}_name() functions in hook_comm_method_fns.c, and their use in comm_method() to parse the strings and produce an integer to represent the connection type being used. 
Signed-off-by: Mark Allen --- ompi/mca/hook/comm_method/Makefile.am | 20 + ompi/mca/hook/comm_method/configure.m4 | 25 + ompi/mca/hook/comm_method/hook_comm_method.h | 37 + .../comm_method/hook_comm_method_component.c | 179 ++++ .../hook/comm_method/hook_comm_method_fns.c | 882 ++++++++++++++++++ ompi/mca/hook/comm_method/owner.txt | 7 + ompi/mca/pml/base/pml_base_frame.c | 4 +- ompi/mca/pml/cm/pml_cm.c | 5 +- ompi/mca/pml/crcpw/pml_crcpw_module.c | 4 +- ompi/mca/pml/example/pml_example.c | 4 +- .../pml/monitoring/pml_monitoring_component.c | 4 +- ompi/mca/pml/ob1/pml_ob1.c | 5 +- ompi/mca/pml/ucx/pml_ucx.c | 4 +- ompi/mca/pml/yalla/pml_yalla.c | 2 + 14 files changed, 1174 insertions(+), 8 deletions(-) create mode 100644 ompi/mca/hook/comm_method/Makefile.am create mode 100644 ompi/mca/hook/comm_method/configure.m4 create mode 100644 ompi/mca/hook/comm_method/hook_comm_method.h create mode 100644 ompi/mca/hook/comm_method/hook_comm_method_component.c create mode 100644 ompi/mca/hook/comm_method/hook_comm_method_fns.c create mode 100644 ompi/mca/hook/comm_method/owner.txt diff --git a/ompi/mca/hook/comm_method/Makefile.am b/ompi/mca/hook/comm_method/Makefile.am new file mode 100644 index 0000000000..3cc2f3e993 --- /dev/null +++ b/ompi/mca/hook/comm_method/Makefile.am @@ -0,0 +1,20 @@ +# +# Copyright (c) 2018 IBM Corporation. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +sources = \ + hook_comm_method.h \ + hook_comm_method_component.c \ + hook_comm_method_fns.c + +# This component will only ever be built statically -- never as a DSO. 
+ +noinst_LTLIBRARIES = libmca_hook_comm_method.la + +libmca_hook_comm_method_la_SOURCES = $(sources) +libmca_hook_comm_method_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/hook/comm_method/configure.m4 b/ompi/mca/hook/comm_method/configure.m4 new file mode 100644 index 0000000000..d3dd70973a --- /dev/null +++ b/ompi/mca/hook/comm_method/configure.m4 @@ -0,0 +1,25 @@ +# +# Copyright (c) 2018 IBM Corporation. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Make this a static component +AC_DEFUN([MCA_ompi_hook_comm_method_COMPILE_MODE], [ + AC_MSG_CHECKING([for MCA component $2:$3 compile mode]) + $4="static" + AC_MSG_RESULT([$$4]) +]) + +# MCA_hook_comm_method_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +AC_DEFUN([MCA_ompi_hook_comm_method_CONFIG],[ + AC_CONFIG_FILES([ompi/mca/hook/comm_method/Makefile]) + + $1 +]) diff --git a/ompi/mca/hook/comm_method/hook_comm_method.h b/ompi/mca/hook/comm_method/hook_comm_method.h new file mode 100644 index 0000000000..e5251f25c5 --- /dev/null +++ b/ompi/mca/hook/comm_method/hook_comm_method.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2016-2018 IBM Corporation. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#ifndef MCA_HOOK_COMM_METHOD_H +#define MCA_HOOK_COMM_METHOD_H + +#include "ompi_config.h" + +#include "ompi/constants.h" + +#include "ompi/mca/hook/hook.h" +#include "ompi/mca/hook/base/base.h" + +BEGIN_C_DECLS + +OMPI_MODULE_DECLSPEC extern const ompi_hook_base_component_1_0_0_t mca_hook_comm_method_component; + +extern int mca_hook_comm_method_verbose; +extern int mca_hook_comm_method_output; +extern bool hook_comm_method_enable_mpi_init; +extern bool hook_comm_method_enable_mpi_finalize; +extern int hook_comm_method_max; +extern int hook_comm_method_brief; +extern char *hook_comm_method_fakefile; + +void ompi_hook_comm_method_mpi_init_bottom(int argc, char **argv, int requested, int *provided); + +void ompi_hook_comm_method_mpi_finalize_top(void); + +END_C_DECLS + +#endif /* MCA_HOOK_COMM_METHOD_H */ diff --git a/ompi/mca/hook/comm_method/hook_comm_method_component.c b/ompi/mca/hook/comm_method/hook_comm_method_component.c new file mode 100644 index 0000000000..e39fe78b3f --- /dev/null +++ b/ompi/mca/hook/comm_method/hook_comm_method_component.c @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2016-2018 IBM Corporation. All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#include "hook_comm_method.h"
+
+static int ompi_hook_comm_method_component_open(void);
+static int ompi_hook_comm_method_component_close(void);
+static int ompi_hook_comm_method_component_register(void);
+
+/*
+ * Public string showing the component version number
+ */
+const char *mca_hook_comm_method_component_version_string =
+    "Open MPI 'comm_method' hook MCA component version " OMPI_VERSION;
+
+/*
+ * Instantiate the public struct with all of our public information
+ * and pointers to our public functions in it
+ */
+const ompi_hook_base_component_1_0_0_t mca_hook_comm_method_component = {
+
+    /* First, the mca_component_t struct containing meta information
+     * about the component itself */
+    .hookm_version = {
+        OMPI_HOOK_BASE_VERSION_1_0_0,
+
+        /* Component name and version */
+        .mca_component_name = "comm_method",
+        MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
+                              OMPI_RELEASE_VERSION),
+
+        /* Component open and close functions */
+        .mca_open_component = ompi_hook_comm_method_component_open,
+        .mca_close_component = ompi_hook_comm_method_component_close,
+        .mca_register_component_params = ompi_hook_comm_method_component_register,
+
+        // Force this component to always be considered - component must be static
+        //.mca_component_flags = MCA_BASE_COMPONENT_FLAG_ALWAYS_CONSIDER,
+    },
+    .hookm_data = {
+        /* The component is checkpoint ready */
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    },
+
+    /* Component functions: only the bottom of MPI_Init and the top of
+     * MPI_Finalize are hooked, every other hook point is left empty. */
+    .hookm_mpi_initialized_top = NULL,
+    .hookm_mpi_initialized_bottom = NULL,
+
+    .hookm_mpi_finalized_top = NULL,
+    .hookm_mpi_finalized_bottom = NULL,
+
+    .hookm_mpi_init_top = NULL,
+    .hookm_mpi_init_top_post_opal = NULL,
+    .hookm_mpi_init_bottom = ompi_hook_comm_method_mpi_init_bottom,
+    .hookm_mpi_init_error = NULL,
+
+    .hookm_mpi_finalize_top = ompi_hook_comm_method_mpi_finalize_top,
+    .hookm_mpi_finalize_bottom = NULL,
+};
+
+/* Storage for the MCA variables registered below (declared in
+ * hook_comm_method.h). */
+int mca_hook_comm_method_verbose = 0;
+int mca_hook_comm_method_output = -1;
+bool hook_comm_method_enable_mpi_init = false;
+bool hook_comm_method_enable_mpi_finalize = false;
+int hook_comm_method_max = 12;
+int hook_comm_method_brief = 0;
+char *hook_comm_method_fakefile = NULL;
+
+/* Component open: nothing to set up, always succeeds. */
+static int ompi_hook_comm_method_component_open(void)
+{
+    // Nothing to do
+    return OMPI_SUCCESS;
+}
+
+/* Component close: releases the output stream that the register
+ * function opened, so it is not leaked.  Always succeeds. */
+static int ompi_hook_comm_method_component_close(void)
+{
+    if (-1 != mca_hook_comm_method_output) {
+        opal_output_close(mca_hook_comm_method_output);
+        mca_hook_comm_method_output = -1;
+    }
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Register the component's MCA variables and translate the user-facing
+ * "comm_method" selector (1 = report at MPI_Init, 2 = report at
+ * MPI_Finalize) into the two enable flags.  Any other selector value
+ * leaves the flags as set by their own variables.
+ *
+ * Always returns OMPI_SUCCESS; the results of the individual
+ * registrations are deliberately ignored.
+ */
+static int ompi_hook_comm_method_component_register(void)
+{
+    // The address of this variable is handed to the MCA variable system
+    // as the variable's storage, so it must outlive this call: it has
+    // static storage duration rather than living on the stack.
+    static int hook_comm_method;
+
+    /*
+     * Component verbosity level
+     */
+    // Inherit the verbosity of the base framework, but also allow this to be overridden
+    if( ompi_hook_base_framework.framework_verbose > MCA_BASE_VERBOSE_NONE ) {
+        mca_hook_comm_method_verbose = ompi_hook_base_framework.framework_verbose;
+    }
+    else {
+        mca_hook_comm_method_verbose = MCA_BASE_VERBOSE_NONE;
+    }
+    (void) mca_base_component_var_register(&mca_hook_comm_method_component.hookm_version, "verbose",
+                                           NULL,
+                                           MCA_BASE_VAR_TYPE_INT, NULL,
+                                           0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_hook_comm_method_verbose);
+
+    mca_hook_comm_method_output = opal_output_open(NULL);
+    opal_output_set_verbosity(mca_hook_comm_method_output, mca_hook_comm_method_verbose);
+
+    /*
+     * If the component is active for mpi_init / mpi_finalize
+     */
+    hook_comm_method_enable_mpi_init = false;
+    (void) mca_base_component_var_register(&mca_hook_comm_method_component.hookm_version, "enable_mpi_init",
+                                           "Enable comm_method behavior on mpi_init",
+                                           MCA_BASE_VAR_TYPE_BOOL, NULL,
+                                           0, 0,
+                                           OPAL_INFO_LVL_3,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &hook_comm_method_enable_mpi_init);
+
+    hook_comm_method_enable_mpi_finalize = false;
+    (void) mca_base_component_var_register(&mca_hook_comm_method_component.hookm_version, "enable_mpi_finalize",
+                                           "Enable comm_method behavior on mpi_finalize",
+                                           MCA_BASE_VAR_TYPE_BOOL, NULL,
+                                           0, 0,
+                                           OPAL_INFO_LVL_3,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &hook_comm_method_enable_mpi_finalize);
+
+    // User can set the comm_method mca variable too.
+    // Reset to "unset" before every registration, like the flags above.
+    hook_comm_method = -1;
+    (void) mca_base_var_register("ompi", NULL, NULL, "comm_method",
+                                 "Enable comm_method behavior (1) mpi_init or (2) mpi_finalize",
+                                 MCA_BASE_VAR_TYPE_INT, NULL,
+                                 0, 0,
+                                 OPAL_INFO_LVL_3,
+                                 MCA_BASE_VAR_SCOPE_READONLY,
+                                 &hook_comm_method);
+
+    if( 1 == hook_comm_method ) {
+        hook_comm_method_enable_mpi_init = true;
+    }
+    else if( 2 == hook_comm_method ) {
+        hook_comm_method_enable_mpi_finalize = true;
+    }
+
+    // comm_method_max
+    (void) mca_base_var_register("ompi", NULL, NULL, "comm_method_max",
+                                 "Number of hosts for which to print unabbreviated 2d table of comm methods.",
+                                 MCA_BASE_VAR_TYPE_INT, NULL,
+                                 0, 0,
+                                 OPAL_INFO_LVL_3,
+                                 MCA_BASE_VAR_SCOPE_READONLY,
+                                 &hook_comm_method_max);
+    // comm_method_brief
+    (void) mca_base_var_register("ompi", NULL, NULL, "comm_method_brief",
+                                 "Only print the comm method summary, skip the 2d table.",
+                                 MCA_BASE_VAR_TYPE_INT, NULL,
+                                 0, 0,
+                                 OPAL_INFO_LVL_3,
+                                 MCA_BASE_VAR_SCOPE_READONLY,
+                                 &hook_comm_method_brief);
+
+    // comm_method_fakefile is just for debugging, allows complete override of all the
+    // comm method in the table
+    (void) mca_base_var_register("ompi", NULL, NULL, "comm_method_fakefile",
+                                 "For debugging only: read comm methods from a file",
+                                 MCA_BASE_VAR_TYPE_STRING, NULL,
+                                 0, 0,
+                                 OPAL_INFO_LVL_3,
+                                 MCA_BASE_VAR_SCOPE_READONLY,
+                                 &hook_comm_method_fakefile);
+
+    return OMPI_SUCCESS;
+}
diff --git a/ompi/mca/hook/comm_method/hook_comm_method_fns.c b/ompi/mca/hook/comm_method/hook_comm_method_fns.c
new file mode 100644
index 0000000000..3266c54d2a
--- /dev/null
+++ b/ompi/mca/hook/comm_method/hook_comm_method_fns.c
@@ -0,0 +1,882 @@
+/*
+ * Copyright (c) 2016-2018 IBM Corporation. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#include "hook_comm_method.h"
+
+#ifdef HAVE_DLFCN_H
+#include <dlfcn.h>
+#endif
+
+#include "ompi/communicator/communicator.h"
+#include "ompi/mca/pml/pml.h"
+#include "opal/mca/btl/btl.h"
+#include "ompi/mca/pml/base/base.h"
+#include "ompi/mca/bml/base/base.h"
+#include "ompi/mca/mtl/base/base.h"
+
+// In regular strncpy up to n bytes are copied, so if the 'to' buffer
+// was char string[16] and you called strncpy(string, src, 16) you could
+// get 16 bytes of chars without a null.  My preferred API is to let
+// n be the size of the buffer, and to let n-1 chars be copied, and
+// to guarantee null termination.
+//
+//   to   - destination buffer of at least n bytes
+//   from - NUL-terminated source; NULL is treated as the empty string
+//   n    - size of 'to' in bytes; nothing is written when n <= 0
+static void
+mystrncpy(char *to, const char *from, int n) {
+    if (!to || n <= 0) { return; }
+    if (!from) {
+        // passing NULL to strncpy is undefined, hand back "" instead
+        to[0] = 0;
+        return;
+    }
+    strncpy(to, from, n-1);
+    to[n-1] = 0;
+}
+
+// For converting comm_method strings to comm_method id# and back.
+// This starts as our local set of strings, but gets Allreduced into
+// a global mapping so all the strings at all the ranks are represented.
+// If an MCA's name is more than 15 chars it gets truncated.
+#define COMM_METHOD_STRING_SIZE 16
+#define MAX_COMM_METHODS 50
+// n rows of str[] are in use; every row is one fixed-size name buffer.
+typedef struct {
+    int n;
+    char str[MAX_COMM_METHODS][COMM_METHOD_STRING_SIZE];
+} comm_method_string_conversion_t;
+
+static comm_method_string_conversion_t comm_method_string_conversion;
+
+#define MODE_IS_PML 1
+#define MODE_IS_MTL 2
+#define MODE_IS_BTL 3
+
+// ----------------------------------------------------------------------------
+
+// Name of the selected pml component (its .mca_component_name).
+static char*
+lookup_pml_name(void)
+{
+    return (char*) mca_pml_base_selected_component.pmlm_version.mca_component_name;
+}
+// Name of the selected mtl component (its .mca_component_name),
+// or NULL when no mtl component is selected.
+static char*
+lookup_mtl_name(void)
+{
+    return ompi_mtl_base_selected_component
+        ? (char*) ompi_mtl_base_selected_component->mtl_version.mca_component_name
+        : NULL;
+}
+// Name of the component behind the first send btl of the bml endpoint
+// for peer 'rank' of 'comm', or NULL when there is no endpoint, no send
+// btl array, or no btl in its first slot.
+// NOTE(review): the proc lookup result is passed on unchecked -- confirm
+// mca_bml_base_get_endpoint() tolerates a NULL proc.
+static char*
+lookup_btl_name_for_send(ompi_communicator_t* comm, int rank) {
+    ompi_proc_t *peer = ompi_group_peer_lookup_existing(comm->c_remote_group, rank);
+    mca_bml_base_endpoint_t* ep = mca_bml_base_get_endpoint(peer);
+
+    if (!ep) { return NULL; }
+    if (!ep->btl_send.bml_btls) { return NULL; }
+    if (!ep->btl_send.bml_btls[0].btl) { return NULL; }
+
+    return (char*)
+        ep->btl_send.bml_btls[0].btl->btl_component->btl_version.mca_component_name;
+}
+
+// Look up the mca_component_name of the BTL/MTL/PML used for sending
+// to the given rank, using the helpers above.  The optional 3rd
+// argument receives MODE_IS_BTL / MTL / PML (pass NULL if that info
+// is not wanted).
+//
+// The result is placed in malloced memory of size
+// COMM_METHOD_STRING_SIZE
+// that the caller has to free.
+// A component name that cannot be determined (a lookup returned NULL)
+// is reported as "n/a" rather than being passed to the string copy.
+// Returns NULL only when the allocation fails; *comm_mode is left
+// untouched in that case.
+static char *
+comm_method_string(MPI_Comm comm, int rank, int *comm_mode) {
+    char *pml_name;
+    char *name;
+    char *string = malloc(COMM_METHOD_STRING_SIZE);
+
+    if (!string) { return NULL; }
+
+    pml_name = lookup_pml_name();
+    if (pml_name && 0==strncmp("ob1", pml_name, 4)) { // BTL
+        if (comm_mode) { *comm_mode = MODE_IS_BTL; }
+        name = lookup_btl_name_for_send(comm, rank);
+    }
+    else if (pml_name && 0==strncmp("cm", pml_name, 3)) { // MTL
+        if (comm_mode) { *comm_mode = MODE_IS_MTL; }
+        name = lookup_mtl_name();
+    } else { // PML
+        if (comm_mode) { *comm_mode = MODE_IS_PML; }
+        name = pml_name;
+    }
+    mystrncpy(string, name ? name : "n/a", COMM_METHOD_STRING_SIZE);
+    return string;
+}
+
+// ----------------------------------------------------------------------------
+
+// Managing the comm_method_string_conversion structure
+// and using it to convert strings to/from id numbers:
+
+// The data should be
+// string 0 == "n/a" for unconnected / unknown
+// string 1,2,... == "tcp" "self", etc, sorted
+// self is important enough we want to make sure it's always in the list
+static void
+init_string_to_conversion_struct(comm_method_string_conversion_t *data)
+{
+    data->n = 0;
+    strcpy(data->str[data->n], "n/a");
+    ++(data->n);
+    strcpy(data->str[data->n], "self");
+    ++(data->n);
+}
+
+// Return the index of 'string' in data->str[], or 0 when it is absent
+// (or NULL).  Index 0 is also where "n/a" lives, so "not found" and
+// "n/a" are deliberately the same answer.
+// Only the first COMM_METHOD_STRING_SIZE-1 characters are compared,
+// so a name longer than the stored (truncated) form still matches it.
+static int
+lookup_string_in_conversion_struct(comm_method_string_conversion_t *data, char *string)
+{
+    int i;
+    if (!string) { return 0; }
+    for (i=0; i<data->n; ++i) {
+        if (0==strncmp(data->str[i], string, COMM_METHOD_STRING_SIZE - 1)) {
+            return i;
+        }
+    }
+    return 0;
+}
+
+// For qsort of the str[] array in a comm_method_string_conversion_t:
+// each element is one NUL-terminated char[COMM_METHOD_STRING_SIZE] row.
+static int mycompar(const void *a, const void *b) {
+    return strcmp(a, b);
+}
+
+// Add 'string' to the table unless it is NULL, empty, "n/a", already
+// present, or the table is full (in which case it is silently dropped).
+// Entries 1..n-1 are kept sorted; entry 0 ("n/a") never moves.
+static void
+add_string_to_conversion_struct(comm_method_string_conversion_t *data, char *string)
+{
+    if (!string || !string[0]) { return; }
+    if (0 == strcmp(string, "n/a")) { return; }
+
+    // "n/a" was ruled out above, so a non-zero index means "already there"
+    if (0 != lookup_string_in_conversion_struct(data, string)) { return; }
+    if (data->n >= MAX_COMM_METHODS) { return; }
+
+    mystrncpy(data->str[data->n], string, COMM_METHOD_STRING_SIZE);
+    ++(data->n);
+
+    // Only an insertion can disturb the order, and fewer than two
+    // sortable entries need no sorting.
+    if (data->n > 2) {
+        qsort(&data->str[1], data->n - 1, COMM_METHOD_STRING_SIZE, &mycompar);
+    }
+}
+
+// For MPI_Allreduce of a comm_method_string_conversion_t:
+// merges every table in invec[] into the matching table in inoutvec[].
+// The datatype argument is not used.
+static void myfn(void* invec, void* inoutvec, int *len, MPI_Datatype *dt) {
+    comm_method_string_conversion_t *a, *b;
+    int i, j;
+
+    (void) dt;
+    for (i=0; i<*len; ++i) {
+        b = &((comm_method_string_conversion_t*)invec)[i];
+        a = &((comm_method_string_conversion_t*)inoutvec)[i];
+        for (j=0; j<b->n; ++j) { // for each entry j in 'b', add it to 'a'
+            // each insertion re-sorts 'a', so no extra sort is needed here
+            add_string_to_conversion_struct(a, b->str[j]);
+        }
+    }
+}
+
+// ----------------------------------------------------------------------------
+
+// Use the static global comm_method_string_conversion to convert
+// between comm_method string and id number
+
+// This might convert "pami" for example to 1, "yalla" to 2, etc.
+static int
+string_to_comm_method(char *str) {
+    // default to "n/a" for any bad or unrecognized inputs
+    if (!str || !str[0]) { return 0; }
+
+    return lookup_string_in_conversion_struct(&comm_method_string_conversion, str);
+}
+
+// Inverse of string_to_comm_method().  An id outside the table
+// (negative, or >= the number of known methods) maps to entry 0.
+static char *
+comm_method_to_string(int id) {
+    if (id < 0 || id >= comm_method_string_conversion.n) { id = 0; }
+    return comm_method_string_conversion.str[id];
+}
+
+// Id number of the method used to send to 'rank' on 'comm';
+// 0 ("n/a") when the method string cannot be obtained or is unknown.
+static int
+comm_method(MPI_Comm comm, int rank) {
+    char *p = comm_method_string(comm, rank, NULL);
+    int id = string_to_comm_method(p);
+    free(p);
+    return id;
+}
+
+#define COMM_METHOD_SELF (string_to_comm_method("self"))
+#define NUM_COMM_METHODS (comm_method_string_conversion.n)
+
+// ----------------------------------------------------------------------------
+
+typedef void (*VoidFuncPtr)(void); // a function pointer to a function that takes no arguments and returns void.
+static char* comm_method_string(MPI_Comm comm, int rank, int *comm_mode); +static int comm_method(MPI_Comm comm, int rank); +static char* comm_method_to_string(int method); +static int icompar(const void *a, const void *b); +static void abbreviate_list_into_string(char *str, int max, int *list, int nlist); +static void ompi_report_comm_methods(int called_from_location); + +void ompi_hook_comm_method_mpi_init_bottom(int argc, char **argv, int requested, int *provided) +{ + if( hook_comm_method_enable_mpi_init ) { + ompi_report_comm_methods( 1 ); + } +} + +void ompi_hook_comm_method_mpi_finalize_top(void) +{ + if( hook_comm_method_enable_mpi_finalize ) { + ompi_report_comm_methods( 2 ); + } +} + +// ---------------------------------------------------------------------------- + +static int +icompar(const void *a, const void *b) { + if (*(int*)a < *(int*)b) { return -1; } + if (*(int*)a > *(int*)b) { return 1; } + return 0; +} + +// Input list[] is expected to be sorted +static void +abbreviate_list_into_string(char *str, int max, int *list, int nlist) +{ + int lo, hi; + int i; + int per, tmp; + +/* + * How much space do we need in strings to store rank numbers. + * A 10000 rank run needs more digits to write the rank numbers in than + * a 4 rank job. + */ + per = 1; + tmp = list[nlist-1]; + while (tmp >= 10) { ++per; tmp /= 10; } + + str[0] = 0; + lo = hi = -1; + for (i=0; i hi) { + if (strlen(str)==0 || str[strlen(str)-1] != '.') { + if (strlen(str) != 0) { + strcpy(&str[strlen(str)], ", "); + } + if (lo != hi) { + sprintf(&str[strlen(str)], "%d - %d", lo, hi); + } else { + sprintf(&str[strlen(str)], "%d", lo); + } + } +/* + * If we've almost written to the end of the string, and we haven't + * already written ".." to indicate we're not writing amy more, then + * add the "..". Also set hi=lo=i since the data we just wrote is + * for the previous contiguous chunk, and the current i is the start + * of the next chunk. 
+ */ + if (((int)strlen(str)) >= max - 5 - 2*per + && + (strlen(str) == 0 || str[strlen(str)-1] != '.')) + { + strcpy(&str[strlen(str)], ", .."); + break; + } + hi = lo = list[i]; + } + } + if (strlen(str)==0 || str[strlen(str)-1] != '.') { + if (strlen(str)!=0) { + strcpy(&str[strlen(str)], ", "); + } + if (lo != hi) { + sprintf(&str[strlen(str)], "%d - %d", lo, hi); + } else { + sprintf(&str[strlen(str)], "%d", lo); + } + } +} + +// Input argument tells where we're being called from: +// 1 for init, 2 for finalize. +// The other implicit input is an environment variable we look at. +// When activated from init: we establish connections before printing. +// When activated from finalize: we just print whatever info is available. +static void +ompi_report_comm_methods(int called_from_location) // 1 = from init, 2 = from finalize +{ + int numhosts, i, j, k; + int max2Dprottable = 12; + int max2D1Cprottable = 36; + int hpmp_myrank; + int mylocalrank, nlocalranks, myleaderrank, nleaderranks; + int ret; + ompi_communicator_t *local_comm, *leader_comm; + int *method; + char *hoststring; + char **allhoststrings; + int comm_mode; // MODE_IS_BTL / MTL / PML + +// early return in the case of spawn + // PMPI_Comm_get_parent(&parent); + if (ompi_mpi_comm_parent != MPI_COMM_NULL) { return; } + + hpmp_myrank = ompi_comm_rank(MPI_COMM_WORLD); + // hpmp_nprocs = ompi_comm_size(MPI_COMM_WORLD); + + max2Dprottable = hook_comm_method_max; + max2D1Cprottable = 3 * max2Dprottable; + if (hook_comm_method_brief) { + // force only the short summary output to be printed with no 2d table: + max2Dprottable = 0; + max2D1Cprottable = 0; + } + +// Gathering layout data the same way osc_rdma_component.c does + ret = ompi_comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, NULL, + &local_comm); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return; + } + mylocalrank = ompi_comm_rank(local_comm); + nlocalranks = ompi_comm_size(local_comm); + + ret = ompi_comm_split(MPI_COMM_WORLD, + (0 == 
mylocalrank) ? 0 : MPI_UNDEFINED, + hpmp_myrank, &leader_comm, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + ompi_comm_free(&local_comm); + return; + } + +// Non-host-leaders return early. + if (mylocalrank != 0) { + ompi_comm_free(&local_comm); + return; + } +// ------------------------------------------------- +// Only host-leaders exist from this point on. +// ------------------------------------------------- + myleaderrank = ompi_comm_rank(leader_comm); + nleaderranks = numhosts = ompi_comm_size(leader_comm); + +/* + * Allocate space for each rank to store its communication method + * on a per-host basis. But rank 0 gets enough space to store the + * data for all pairs of hosts. + */ + method = malloc(numhosts * sizeof(int) * (hpmp_myrank?1:numhosts)); + if (!method) { + ompi_comm_free(&local_comm); + ompi_comm_free(&leader_comm); + return; + } + +// Each host leader figures out a string of basic info for its host +// in hoststring. (allocated at all host leaders, can be different sizes) + + { + int len; + int *ranklist; // comm-world ranks contained in local_comm + // sorted into comm-world order (although + // local_comm should already be constructed + // in that way) + int *ranklist_in; + + ompi_group_t *local_group, *world_group; + ompi_comm_group(local_comm, &local_group); + ompi_comm_group(MPI_COMM_WORLD, &world_group); + ranklist = malloc(nlocalranks * sizeof(int) * 2); + ranklist_in = ranklist + nlocalranks; + for (i=0; i 1) { + char *p = comm_method_string(local_comm, 1, NULL); + add_string_to_conversion_struct(&comm_method_string_conversion, p); + free(p); + } + + MPI_Datatype mydt; + MPI_Op myop; + MPI_Type_contiguous(sizeof(comm_method_string_conversion_t), MPI_BYTE, &mydt); + MPI_Type_commit(&mydt); + MPI_Op_create(myfn, 1, &myop); + leader_comm->c_coll->coll_allreduce( + MPI_IN_PLACE, (void*)&comm_method_string_conversion, 1, mydt, myop, leader_comm, + leader_comm->c_coll->coll_allreduce_module); + MPI_Op_free(&myop); + 
MPI_Type_free(&mydt); + +// Each host leader fills in a "numhosts" sized array method[] of +// how it communicates with each peer. + for (i=0; i 1) { + method[i] = comm_method(local_comm, 1); + } + } + } + +// Gather the strings and the methods at rank 0. +// The gatherv of the strings takes a few steps since we have to get +// the sizes first and allocate the receiving string. + { + int len, *lens, *disps; + + len = strlen(hoststring) + 1; + if (myleaderrank == 0) { + lens = malloc(nleaderranks * sizeof(int)); + disps = malloc(nleaderranks * sizeof(int)); + } else { + lens = disps = NULL; + } + leader_comm->c_coll->coll_gather( + &len, 1, MPI_INT, + lens, 1, MPI_INT, + 0, leader_comm, leader_comm->c_coll->coll_gather_module); + if (myleaderrank == 0) { + int tlen = 0; + char *p; + for (i=0; ic_coll->coll_gatherv( + hoststring, strlen(hoststring) + 1, MPI_CHAR, + &allhoststrings[0][0], lens, disps, MPI_CHAR, + 0, leader_comm, leader_comm->c_coll->coll_gatherv_module); + } else { + // matching above call from rank 0, just &allhoststrings[0][0] + // isn't legal here, and those args aren't used at non-root anyway + leader_comm->c_coll->coll_gatherv( + hoststring, strlen(hoststring) + 1, MPI_CHAR, + NULL, NULL, NULL, MPI_CHAR, + 0, leader_comm, leader_comm->c_coll->coll_gatherv_module); + } + if (myleaderrank == 0) { + free(lens); + free(disps); + } +// and a simpler gather for the methods + leader_comm->c_coll->coll_gather( + method, nleaderranks, MPI_INT, + method, nleaderranks, MPI_INT, + 0, leader_comm, leader_comm->c_coll->coll_gather_module); + } + ompi_comm_free(&local_comm); + ompi_comm_free(&leader_comm); + +// Interception for testing purposes. Let rank-0 meddle with all its method[] +// settings, this is only for testing, eg to make sure the printing comes out +// right. 
+ if (myleaderrank == 0) { + if (hook_comm_method_fakefile) { + FILE *fp; + int setting; + fp = fopen(hook_comm_method_fakefile, "r"); + for (i=0; i= 10) { ++per; tmp /= 10; } + for (i=0; i per) { per = tmp+1; } + } + } + + str = malloc(nleaderranks * per + 1); + p = str; + for (i=0; i=str && ((*p)==' ')) { *(p--)=0; } + printf(" host | %s\n", str); + memset(str, (int)'=', tmp); + str[tmp] = 0; + printf("======|=%s\n", str); + + for (i=0; istr && *p==' ') { *(p--)=0; } + printf("%5d : %s\n", i, str); + } + printf("\n"); + free(str); + } + else if (nleaderranks <= max2D1Cprottable) { + char *str, *p; + int tmp, per, done; + char char_code[NUM_COMM_METHODS], next_char; + int method_count[NUM_COMM_METHODS]; + + // characters for the number column in the 2d table, + // must be large enough for the digits needed for host numbers + per = 2; + tmp = nleaderranks; + while (tmp >= 10) { ++per; tmp /= 10; } + + // pick a character code for each comm method based on + // how many times it's in the table, use 'A' for the least common + for (i=0; i=str && ((*p)==' ')) { *(p--)=0; } + tmp = (int)strlen(str) + 2; + printf(" host | %s\n", str); + memset(str, (int)'=', tmp); + str[tmp] = 0; + printf("======|=%s\n", str); + + for (i=0; istr && *p==' ') { *(p--)=0; } + printf("%5d : %s\n", i, str); + } + free(str); + for (i=0; i 0 && + majority_method_onhost == COMM_METHOD_SELF) + { + majority_method_onhost = i; + } + if (method_count[i] > method_count[majority_method_onhost]) { + if (i != COMM_METHOD_SELF) { + majority_method_onhost = i; + } + } + } + if (method_count[COMM_METHOD_SELF] > 0) { any_self = 1; } + + majority_method_offhost = -1; + uniformity_offhost = 1; + for (i=0; i 0 && majority_method_offhost == 0) { + majority_method_offhost = i; + } + if (method_count[i] > method_count[majority_method_offhost]) { + majority_method_offhost = i; + } + } + + char *all_or_most = "all"; + char *or_self = ""; + if (!uniformity_onhost) { + all_or_most = "most"; + } + if 
((majority_method_onhost != COMM_METHOD_SELF) && any_self) { + or_self = " or self"; + } + printf(" on-host: %s connections are %s%s\n", all_or_most, + comm_method_to_string(majority_method_onhost), or_self); + + all_or_most = "all"; + if (!uniformity_offhost) { + all_or_most = "most"; + } + printf(" off-host: %s connections are %s\n", all_or_most, + comm_method_to_string(majority_method_offhost)); + + if (!uniformity_onhost || !uniformity_offhost) { + printf("Exceptions:\n"); + for (i=0; i 0) { +// if (!first) { +// strcat(str, " /"); +// } + sprintf(&str[strlen(str)], + " [%dx %s]", + method_count[k], + comm_method_to_string(k)); +// first = 0; + } + } + printf("%s\n", str); + free(str); + } + } + } + printf("\n"); + } + } + + if (myleaderrank == 0) { + free(allhoststrings); + } + free(method); +} diff --git a/ompi/mca/hook/comm_method/owner.txt b/ompi/mca/hook/comm_method/owner.txt new file mode 100644 index 0000000000..2fd247dddb --- /dev/null +++ b/ompi/mca/hook/comm_method/owner.txt @@ -0,0 +1,7 @@ +# +# owner/status file +# owner: institution that is responsible for this package +# status: e.g. active, maintenance, unmaintained +# +owner: IBM +status: active diff --git a/ompi/mca/pml/base/pml_base_frame.c b/ompi/mca/pml/base/pml_base_frame.c index bf35186ef7..dd5cdc5009 100644 --- a/ompi/mca/pml/base/pml_base_frame.c +++ b/ompi/mca/pml/base/pml_base_frame.c @@ -15,6 +15,7 @@ * reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2018 IBM Corporation. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -78,7 +79,8 @@ mca_pml_base_module_t mca_pml = { NULL, /* pml_dump */ NULL, /* pml_ft_event */ 0, /* pml_max_contextid */ - 0 /* pml_max_tag */ + 0, /* pml_max_tag */ + 0 /* pml_flags */ }; mca_pml_base_component_t mca_pml_base_selected_component = {{0}}; diff --git a/ompi/mca/pml/cm/pml_cm.c b/ompi/mca/pml/cm/pml_cm.c index a7322e4c33..54b691053f 100644 --- a/ompi/mca/pml/cm/pml_cm.c +++ b/ompi/mca/pml/cm/pml_cm.c @@ -11,6 +11,7 @@ * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2018 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -28,6 +29,7 @@ #include "pml_cm.h" #include "pml_cm_sendreq.h" #include "pml_cm_recvreq.h" +#include "pml_cm_component.h" ompi_pml_cm_t ompi_pml_cm = { { @@ -53,7 +55,8 @@ ompi_pml_cm_t ompi_pml_cm = { mca_pml_cm_dump, NULL, 0, - 0 + 0, + 0 /* flags */ } }; diff --git a/ompi/mca/pml/crcpw/pml_crcpw_module.c b/ompi/mca/pml/crcpw/pml_crcpw_module.c index c5982c5538..aa287fddbf 100644 --- a/ompi/mca/pml/crcpw/pml_crcpw_module.c +++ b/ompi/mca/pml/crcpw/pml_crcpw_module.c @@ -13,6 +13,7 @@ * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2018 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -57,7 +58,8 @@ mca_pml_crcpw_module_t mca_pml_crcpw_module = { mca_pml_crcpw_ft_event, 32768, - INT_MAX + INT_MAX, + 0 /* flags */ } }; diff --git a/ompi/mca/pml/example/pml_example.c b/ompi/mca/pml/example/pml_example.c index 799e3abe45..146f3f0e99 100644 --- a/ompi/mca/pml/example/pml_example.c +++ b/ompi/mca/pml/example/pml_example.c @@ -6,6 +6,7 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. 
+ * Copyright (c) 2018 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -43,7 +44,8 @@ mca_pml_example_t mca_pml_example = { mca_pml_example_ft_event, 32768, - (0x7fffffff) + (0x7fffffff), + 0 /* flags */ } }; diff --git a/ompi/mca/pml/monitoring/pml_monitoring_component.c b/ompi/mca/pml/monitoring/pml_monitoring_component.c index 44aa555bca..31de1f98ec 100644 --- a/ompi/mca/pml/monitoring/pml_monitoring_component.c +++ b/ompi/mca/pml/monitoring/pml_monitoring_component.c @@ -6,6 +6,7 @@ * Copyright (c) 2015 Bull SAS. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2018 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -48,7 +49,8 @@ mca_pml_monitoring_module_t mca_pml_monitoring_module = { mca_pml_monitoring_dump, NULL, 65535, - INT_MAX + INT_MAX, + 0 /* flags */ }; /** diff --git a/ompi/mca/pml/ob1/pml_ob1.c b/ompi/mca/pml/ob1/pml_ob1.c index 703c918dc6..cc633e3a57 100644 --- a/ompi/mca/pml/ob1/pml_ob1.c +++ b/ompi/mca/pml/ob1/pml_ob1.c @@ -20,6 +20,7 @@ * Copyright (c) 2015 FUJITSU LIMITED. All rights reserved. * Copyright (c) 2018 Sandia National Laboratories * All rights reserved. + * Copyright (c) 2018 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -79,7 +80,8 @@ mca_pml_ob1_t mca_pml_ob1 = { mca_pml_ob1_dump, mca_pml_ob1_ft_event, 65535, - INT_MAX + INT_MAX, + 0 /* flags */ } }; @@ -1057,4 +1059,3 @@ int mca_pml_ob1_com_btl_comp(const void *v1, const void *v2) return 0; } - diff --git a/ompi/mca/pml/ucx/pml_ucx.c b/ompi/mca/pml/ucx/pml_ucx.c index cd5aa32810..5228f4e9ab 100644 --- a/ompi/mca/pml/ucx/pml_ucx.c +++ b/ompi/mca/pml/ucx/pml_ucx.c @@ -5,6 +5,7 @@ * reserved. * Copyright (c) 2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2018 IBM Corporation. 
All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -73,7 +74,8 @@ mca_pml_ucx_module_t ompi_pml_ucx = { .pml_dump = mca_pml_ucx_dump, .pml_ft_event = NULL, .pml_max_contextid = (1ul << (PML_UCX_CONTEXT_BITS)) - 1, - .pml_max_tag = (1ul << (PML_UCX_TAG_BITS - 1)) - 1 + .pml_max_tag = (1ul << (PML_UCX_TAG_BITS - 1)) - 1, + 0 /* flags */ }, .ucp_context = NULL, .ucp_worker = NULL diff --git a/ompi/mca/pml/yalla/pml_yalla.c b/ompi/mca/pml/yalla/pml_yalla.c index 03bb65d420..99ccb50fbb 100644 --- a/ompi/mca/pml/yalla/pml_yalla.c +++ b/ompi/mca/pml/yalla/pml_yalla.c @@ -3,6 +3,7 @@ * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2018 Cisco Systems, Inc. All rights reserved + * Copyright (c) 2018 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -51,6 +52,7 @@ mca_pml_yalla_module_t ompi_pml_yalla = { NULL, /* FT */ 1ul << ((sizeof(mxm_ctxid_t)*8) - 1), 1ul << ((sizeof(mxm_tag_t)*8 - 1) - 1), + 0 /* flags */ }, NULL, NULL,