/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2007 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2009      Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2015      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
Adding -mca comm_method to print table of communication methods
This is closely related to Platform-MPI's old -prot feature.
The long format of the table it prints can look like this:
> Host 0 [myhost001] ranks 0 - 1
> Host 1 [myhost002] ranks 2 - 3
> Host 2 [myhost003] ranks 4
> Host 3 [myhost004] ranks 5
> Host 4 [myhost005] ranks 6
> Host 5 [myhost006] ranks 7
> Host 6 [myhost007] ranks 8
> Host 7 [myhost008] ranks 9
> Host 8 [myhost009] ranks 10
>
> host | 0 1 2 3 4 5 6 7 8
> ======|==============================================
> 0 : sm tcp tcp tcp tcp tcp tcp tcp tcp
> 1 : tcp sm tcp tcp tcp tcp tcp tcp tcp
> 2 : tcp tcp self tcp tcp tcp tcp tcp tcp
> 3 : tcp tcp tcp self tcp tcp tcp tcp tcp
> 4 : tcp tcp tcp tcp self tcp tcp tcp tcp
> 5 : tcp tcp tcp tcp tcp self tcp tcp tcp
> 6 : tcp tcp tcp tcp tcp tcp self tcp tcp
> 7 : tcp tcp tcp tcp tcp tcp tcp self tcp
> 8 : tcp tcp tcp tcp tcp tcp tcp tcp self
>
> Connection summary:
> on-host: all connections are sm or self
> off-host: all connections are tcp
In this example hosts 0 and 1 had multiple ranks, so "sm" was more
meaningful than "self" for identifying how the ranks on those hosts are
talking to each other, while hosts 2..8 had one rank per host, so
"self" was more meaningful as their btl.
Above a certain number of hosts (12 by default) the table above gets too big,
so we shrink it to an abbreviated table that carries the same data:
> host | 0 1 2 3 4 8
> ======|====================
> 0 : A C C C C C C C C
> 1 : C A C C C C C C C
> 2 : C C B C C C C C C
> 3 : C C C B C C C C C
> 4 : C C C C B C C C C
> 5 : C C C C C B C C C
> 6 : C C C C C C B C C
> 7 : C C C C C C C B C
> 8 : C C C C C C C C B
> key: A == sm
> key: B == self
> key: C == tcp
Then above 36 hosts we stop printing the 2d table entirely and just print the
summary:
> Connection summary:
> on-host: all connections are sm or self
> off-host: all connections are tcp
The options that control it are:
-mca comm_method 1 : print the above table at the end of MPI_Init
-mca comm_method 2 : print the above table at the beginning of MPI_Finalize
-mca comm_method_max <n> : number of hosts <n> for which to print a full-size 2d table
-mca comm_method_brief 1 : only print the summary output, no 2d table
-mca comm_method_fakefile <filename> : for debugging only
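As an example invocation (the executable name and values are hypothetical, shown
only for illustration), the full table could be requested at the end of MPI_Init with:
    mpirun -np 8 -mca comm_method 1 -mca comm_method_max 16 ./my_mpi_app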
* printing at init vs finalize:
The most important difference between the two is that when printing the table
during MPI_Init(), we send extra messages to make sure all hosts are connected to
each other. So the table ends up working against the idea of on-demand connections
(although it only forces the n^2 connections in the number of hosts, not in the
total number of ranks). If printing at MPI_Finalize() we don't create any
connections that don't already exist, so the table is more likely to have "n/a"
entries if some hosts never connected to each other.
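A minimal sketch of that "force the hosts to connect" idea (this is not the actual
hook implementation; the host map, leader ranks, and the function name below are
assumptions made purely for illustration):

#include <mpi.h>
#include <stdlib.h>

/* One designated "leader" rank per host exchanges a zero-byte message with the
 * leader of every other host, forcing each pair of hosts to open a connection. */
static void force_host_connections(int my_host, int nhosts,
                                   const int *host_leader, int my_rank)
{
    if (my_rank != host_leader[my_host]) {
        return;                        /* only one rank per host participates */
    }
    MPI_Request *reqs = malloc(2 * (size_t)nhosts * sizeof(*reqs));
    int nreq = 0;
    for (int h = 0; h < nhosts; h++) {
        if (h == my_host) {
            continue;
        }
        MPI_Irecv(NULL, 0, MPI_BYTE, host_leader[h], 0, MPI_COMM_WORLD, &reqs[nreq++]);
        MPI_Isend(NULL, 0, MPI_BYTE, host_leader[h], 0, MPI_COMM_WORLD, &reqs[nreq++]);
    }
    MPI_Waitall(nreq, reqs, MPI_STATUSES_IGNORE);
    free(reqs);
}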
* how many hosts <n> for which to print a full size 2d table
The option -mca comm_method_max <n> can be used to specify a number of hosts <n>
(default 12) that controls at what host-count the unabbreviated / abbreviated
2d tables get printed:
1 - n : full size 2d table
n+1 - 3n : shortened 2d table
3n+1 - inf : summary only, no 2d table
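As a sketch of those thresholds (illustrative only; the enum and function name
are made up, and comm_method_max stands for the -mca comm_method_max value,
default 12):

enum table_mode { TABLE_FULL, TABLE_ABBREVIATED, TABLE_SUMMARY_ONLY };

static enum table_mode pick_table_mode(int nhosts, int comm_method_max)
{
    if (nhosts <= comm_method_max) {
        return TABLE_FULL;            /* 1 .. n hosts: full-size 2d table    */
    }
    if (nhosts <= 3 * comm_method_max) {
        return TABLE_ABBREVIATED;     /* n+1 .. 3n hosts: shortened 2d table */
    }
    return TABLE_SUMMARY_ONLY;        /* above 3n hosts: summary only        */
}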
* brief
The option -mca comm_method_brief 1 can be used to skip printing the 2d
table and only show the short summary.
* fakefile
This is a debugging option that allows easier testing of all the printout
routines by letting all the detected communication methods between the hosts
be overridden by fake data from a file.
The source of the information used in the table is each component's
.mca_component_name. In the case of BTLs, the module always has a .btl_component
pointer linking back to the component. The variables mca_pml_base_selected_component
and ompi_mtl_base_selected_component offer similar functionality for the pml/mtl.
So with the ability to identify the component, we can then access the component
name with code like this:
mca_pml_base_selected_component.pmlm_version.mca_component_name
See the three lookup_{pml,mtl,btl}_name() functions in hook_comm_method_fns.c,
and their use in comm_method() to parse the strings and produce an integer
to represent the connection type being used.
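A hedged sketch of that mapping (the real lookup_{pml,mtl,btl}_name() helpers in
hook_comm_method_fns.c differ in detail; the CM_* codes and the function name here
are made up for illustration):

#include <string.h>

enum { CM_UNKNOWN = 0, CM_SELF, CM_SM, CM_TCP };

static int comm_method_from_name(const char *name)
{
    if (NULL == name)              return CM_UNKNOWN;
    if (0 == strcmp(name, "self")) return CM_SELF;
    if (0 == strcmp(name, "sm"))   return CM_SM;
    if (0 == strcmp(name, "tcp"))  return CM_TCP;
    return CM_UNKNOWN;
}

/* e.g. for the PML, the name string would come from
 *   mca_pml_base_selected_component.pmlm_version.mca_component_name  */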
Signed-off-by: Mark Allen <markalle@us.ibm.com>
 * Copyright (c) 2018      IBM Corporation.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"
#include <stdio.h>

#include <string.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif  /* HAVE_UNISTD_H */
#include "ompi/mca/mca.h"
#include "opal/util/output.h"
#include "opal/mca/base/base.h"

#include "ompi/constants.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/pml/base/base.h"
#include "ompi/mca/pml/base/pml_base_request.h"

/*
 * The following file was created by configure.  It contains extern
 * statements and the definition of an array of pointers to each
 * component's public mca_base_component_t struct.
 */

#include "ompi/mca/pml/base/static-components.h"

int mca_pml_base_progress(void)
{
    return OMPI_SUCCESS;
}

#define xstringify(pml) #pml
#define stringify(pml) xstringify(pml)

/*
 * Global variables
 */
mca_pml_base_module_t mca_pml = {
    NULL,                       /* pml_add_procs */
    NULL,                       /* pml_del_procs */
    NULL,                       /* pml_enable */
    mca_pml_base_progress,      /* pml_progress */
    NULL,                       /* pml_add_comm */
    NULL,                       /* pml_del_comm */
    NULL,                       /* pml_irecv_init */
    NULL,                       /* pml_irecv */
    NULL,                       /* pml_recv */
    NULL,                       /* pml_isend_init */
    NULL,                       /* pml_isend */
    NULL,                       /* pml_send */
    NULL,                       /* pml_iprobe */
    NULL,                       /* pml_probe */
    NULL,                       /* pml_start */
    NULL,                       /* pml_dump */
    NULL,                       /* pml_ft_event */
    0,                          /* pml_max_contextid */
    0,                          /* pml_max_tag */
    0                           /* pml_flags */
};

mca_pml_base_component_t mca_pml_base_selected_component = {{0}};
opal_pointer_array_t mca_pml_base_pml = {{0}};
char *ompi_pml_base_bsend_allocator_name = NULL;

#if !MCA_ompi_pml_DIRECT_CALL && OPAL_ENABLE_FT_CR == 1
static char *ompi_pml_base_wrapper = NULL;
#endif

static int mca_pml_base_register(mca_base_register_flag_t flags)
{
#if !MCA_ompi_pml_DIRECT_CALL && OPAL_ENABLE_FT_CR == 1
    int var_id;
#endif

    ompi_pml_base_bsend_allocator_name = "basic";
    (void) mca_base_var_register("ompi", "pml", "base", "bsend_allocator", NULL,
                                 MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
                                 OPAL_INFO_LVL_9,
                                 MCA_BASE_VAR_SCOPE_READONLY,
                                 &ompi_pml_base_bsend_allocator_name);

#if !MCA_ompi_pml_DIRECT_CALL && OPAL_ENABLE_FT_CR == 1
    ompi_pml_base_wrapper = NULL;
    var_id = mca_base_var_register("ompi", "pml", "base", "wrapper",
                                   "Use a Wrapper component around the selected PML component",
                                   MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
                                   OPAL_INFO_LVL_9,
                                   MCA_BASE_VAR_SCOPE_READONLY,
                                   &ompi_pml_base_wrapper);
    (void) mca_base_var_register_synonym(var_id, "ompi", "pml", NULL, "wrapper", 0);
#endif

    return OMPI_SUCCESS;
}

int mca_pml_base_finalize(void) {
    if (NULL != mca_pml_base_selected_component.pmlm_finalize) {
        return mca_pml_base_selected_component.pmlm_finalize();
    }
    return OMPI_SUCCESS;
}

static int mca_pml_base_close(void)
{
    int i, j;

    /* turn off the progress code for the pml */
    if( NULL != mca_pml.pml_progress ) {
        opal_progress_unregister(mca_pml.pml_progress);
    }

    /* Blatantly ignore the return code (what would we do to recover,
       anyway?  This module is going away, so errors don't matter
       anymore) */

    /**
     * Destruct the send and receive queues. The opal_free_list_t destructor
     * will return the memory to the mpool, so this has to be done before the
     * mpool gets released by the PML close function.
     */
    OBJ_DESTRUCT(&mca_pml_base_send_requests);
    OBJ_DESTRUCT(&mca_pml_base_recv_requests);

    mca_pml.pml_progress = mca_pml_base_progress;

    /* Free all the strings in the array */
    j = opal_pointer_array_get_size(&mca_pml_base_pml);
    for (i = 0; i < j; ++i) {
        char *str;
        str = (char*) opal_pointer_array_get_item(&mca_pml_base_pml, i);
        free(str);
    }
    OBJ_DESTRUCT(&mca_pml_base_pml);

    /* Close all remaining available components */
    return mca_base_framework_components_close(&ompi_pml_base_framework, NULL);
}

/**
 * Function for finding and opening either all MCA components, or the one
 * that was specifically requested via a MCA parameter.
 */
static int mca_pml_base_open(mca_base_open_flag_t flags)
{
    /**
     * Construct the send and receive request queues. There are 2 reasons to do it
     * here. First, as they are globals it's better to construct them in one common
     * place. Second, in order to be able to allow the external debuggers to show
     * their content, they should get constructed as soon as possible once the MPI
     * process is started.
     */
    OBJ_CONSTRUCT(&mca_pml_base_send_requests, opal_free_list_t);
    OBJ_CONSTRUCT(&mca_pml_base_recv_requests, opal_free_list_t);

    OBJ_CONSTRUCT(&mca_pml_base_pml, opal_pointer_array_t);

    /* Open up all available components */
    if (OPAL_SUCCESS !=
        mca_base_framework_components_open(&ompi_pml_base_framework, flags)) {
        return OMPI_ERROR;
    }

    /* Set a sentinel in case we don't select any components (e.g.,
       ompi_info) */
    mca_pml_base_selected_component.pmlm_finalize = NULL;

    /**
     * Right now our selection of BTLs is completely broken. If we have
     * multiple PMLs that use BTLs then we will open all BTLs several times, leading to
     * undefined behaviors. The simplest solution, at least until we
     * figure out the correct way to do it, is to force a default PML that
     * uses BTLs and any other PMLs that do not in the mca_pml_base_pml array.
     */
#if MCA_ompi_pml_DIRECT_CALL
    opal_pointer_array_add(&mca_pml_base_pml,
                           strdup(stringify(MCA_ompi_pml_DIRECT_CALL_COMPONENT)));
#else
    {
        const char **default_pml = NULL;
        int var_id;

        var_id = mca_base_var_find("ompi", "pml", NULL, NULL);
        mca_base_var_get_value(var_id, &default_pml, NULL, NULL);

        if( (NULL == default_pml || NULL == default_pml[0] ||
             0 == strlen(default_pml[0])) || (default_pml[0][0] == '^') ) {
            opal_pointer_array_add(&mca_pml_base_pml, strdup("ob1"));
            opal_pointer_array_add(&mca_pml_base_pml, strdup("yalla"));
            opal_pointer_array_add(&mca_pml_base_pml, strdup("ucx"));
            opal_pointer_array_add(&mca_pml_base_pml, strdup("cm"));
        } else {
            opal_pointer_array_add(&mca_pml_base_pml, strdup(default_pml[0]));
        }
    }

#if OPAL_ENABLE_FT_CR == 1
    /*
     * Which PML Wrapper component to use, if any
     *  - NULL or "" = No wrapper
     *  - otherwise, select that specific wrapper component
     */
    if( NULL != ompi_pml_base_wrapper) {
        opal_pointer_array_add(&mca_pml_base_pml, ompi_pml_base_wrapper);
    }
#endif

#endif

    return OMPI_SUCCESS;
}

MCA_BASE_FRAMEWORK_DECLARE(ompi, pml, "OMPI PML", mca_pml_base_register,
                           mca_pml_base_open, mca_pml_base_close,
                           mca_pml_base_static_components, 0);