usnic: add btl_usnic_connectivity_map MCA param to output link information
If the btl_usnic_connectivity_map MCA param is set to a non-NULL value, then each MPI process will output a file named <prefix>-<hostname>.pid<pid>.job<jobid>.mcwrank<MCW rank>.txt. Its contents will detail which usNIC device(s) (and therefore which link(s)) are being used to communicate with each peer MPI process.

Here is a sample output file (named mpi005.pid26071.job1640759297.mcwrank0.txt):

{{{
device=usnic_0,interface=eth4,ip=10.10.0.5/16,mac=24:57:20:05:20:00,mtu=9000
device=usnic_1,interface=eth5,ip=10.2.0.5/16,mac=24:57:20:05:21:00,mtu=9000
device=usnic_2,interface=eth6,ip=10.3.0.5/16,mac=24:57:20:05:50:00,mtu=9000
peer=1,hostname=mpi006,device=usnic_0@peer_ip=10.10.0.6/16@peer_mac=24:57:20:06:20:00,device=usnic_1@peer_ip=10.2.0.6/16@peer_mac=24:57:20:06:21:00,device=usnic_2@peer_ip=10.3.0.6/16@peer_mac=24:57:20:06:50:00
peer=2,hostname=mpi007,device=usnic_0@peer_ip=10.10.0.7/16@peer_mac=24:57:20:07:20:00,device=usnic_1@peer_ip=10.2.0.7/16@peer_mac=24:57:20:07:21:00,device=usnic_2@peer_ip=10.3.0.7/16@peer_mac=24:57:20:07:50:00
peer=3,hostname=mpi008,device=usnic_0@peer_ip=10.10.0.8/16@peer_mac=24:57:20:08:20:00,device=usnic_1@peer_ip=10.2.0.8/16@peer_mac=24:57:20:08:21:00,device=usnic_2@peer_ip=10.3.0.8/16@peer_mac=24:57:20:08:50:00
}}}

Reviewed by Reese Faucette

cmr=v1.8.2

This commit was SVN r32156.
Этот коммит содержится в:
родитель
df82810d03
Коммит
1e17ab461b
@ -62,6 +62,7 @@ sources = \
|
||||
btl_usnic_graph.h \
|
||||
btl_usnic_graph.c \
|
||||
btl_usnic_hwloc.h \
|
||||
btl_usnic_map.c \
|
||||
btl_usnic_mca.c \
|
||||
btl_usnic_proc.c \
|
||||
btl_usnic_proc.h \
|
||||
|
@ -202,6 +202,10 @@ typedef struct ompi_btl_usnic_component_t {
|
||||
|
||||
/* ibv_create_ah() (i.e., ARP) timeout */
|
||||
int arp_timeout;
|
||||
|
||||
/* Prefix for the connectivity map filename (map will be output if
|
||||
the prefix is non-NULL) */
|
||||
char *connectivity_map_prefix;
|
||||
} ompi_btl_usnic_component_t;
|
||||
|
||||
OMPI_MODULE_DECLSPEC extern ompi_btl_usnic_component_t mca_btl_usnic_component;
|
||||
@ -235,5 +239,10 @@ int ompi_btl_usnic_component_register(void);
|
||||
* fragment, and segment state to standard output. */
|
||||
void ompi_btl_usnic_component_debug(void);
|
||||
|
||||
/**
|
||||
* Called to output the connectivity map
|
||||
*/
|
||||
void ompi_btl_usnic_connectivity_map(void);
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
||||
|
@ -291,6 +291,13 @@ int ompi_btl_usnic_component_register(void)
|
||||
&mca_btl_usnic_component.connectivity_num_retries,
|
||||
REGINT_GE_ONE, OPAL_INFO_LVL_3));
|
||||
|
||||
mca_btl_usnic_component.connectivity_map_prefix = NULL;
|
||||
CHECK(reg_string("connectivity_map",
|
||||
"Display the usNIC connectivity map. If this parameter is specified, it is the filename prefix emitted by each MPI process. The full filename emitted by each process is of the form: <prefix>-<hostname>.<pid>.<jobid>.<MCW rank>.txt.",
|
||||
mca_btl_usnic_component.connectivity_map_prefix,
|
||||
&mca_btl_usnic_component.connectivity_map_prefix,
|
||||
REGSTR_EMPTY_OK, OPAL_INFO_LVL_3));
|
||||
|
||||
/* Register some synonyms to the ompi common verbs component */
|
||||
ompi_common_verbs_mca_register(&mca_btl_usnic_component.super.btl_version);
|
||||
|
||||
|
@ -381,6 +381,20 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
|
||||
}
|
||||
}
|
||||
|
||||
/* This is fairly gross, but we need to output the connectivity
|
||||
map after add_procs() has been called on all existing usnic
|
||||
modules. The only way I can think to do that is to count each
|
||||
time add_procs() is called, and when we're at a multiple of
|
||||
component.num_modules (i.e., add_procs() has been called on
|
||||
each module -- both during MPI_INIT and dynamic process cases),
|
||||
call the function to output the map. */
|
||||
static int num_times_add_procs_called = 0;
|
||||
++num_times_add_procs_called;
|
||||
if (0 == (num_times_add_procs_called %
|
||||
mca_btl_usnic_component.num_modules)) {
|
||||
ompi_btl_usnic_connectivity_map();
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
fail:
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user