1
1

usnic: ensure that procs agree on use_udp value

Add the component use_udp value into the modex.  If my component's
use_udp value doesn't agree with the use_udp value from a peer's modex
data, print a helpful message and disqualify the usnic BTL (the usnic
BTL will not be used).  This prevents accidental customer
misconfigurations.

Reviewed by Dave Goodell

cmr=v1.8.2:reviewer=ompi-rm1.8

This commit was SVN r31689.
Этот коммит содержится в:
Jeff Squyres 2014-05-08 16:43:50 +00:00
родитель e9c3df652e
Коммит 184e4fc0ca
4 изменённых файлов: 38 добавлений и 0 удалений

Просмотреть файл

@@ -1145,6 +1145,7 @@ static int init_module_from_port(ompi_btl_usnic_module_t *module,
module->device_context = port->device->context;
module->port_num = port->port_num;
module->numa_distance = 0;
module->local_addr.use_udp = mca_btl_usnic_component.use_udp;
/* If we fail to query the GID, just warn and skip this port */
if (0 != ibv_query_gid(module->device_context,

Просмотреть файл

@@ -74,6 +74,7 @@ typedef struct ompi_btl_usnic_addr_t {
uint16_t mtu;
ompi_btl_usnic_seq_t isn;
uint8_t mac[6];
uint8_t use_udp;
} ompi_btl_usnic_addr_t;
struct ompi_btl_usnic_send_segment_t;

Просмотреть файл

@@ -233,6 +233,21 @@ static int create_proc(ompi_proc_t *ompi_proc,
return OMPI_ERR_UNREACH;
}
/* Sanity check: ensure that the remote proc agrees with this proc
on whether we're doing UDP or not. Note that all endpoints on
the remote proc will have the same "use_udp" value, so we only
need to check one of them. */
if (proc->proc_modex[0].use_udp !=
mca_btl_usnic_component.use_udp) {
opal_show_help("help-mpi-btl-usnic.txt",
"transport mismatch",
true,
ompi_process_info.nodename,
proc->proc_ompi->proc_hostname);
OBJ_RELEASE(proc);
return OMPI_ERR_BAD_PARAM;
}
proc->proc_modex_claimed = (bool*)
calloc(proc->proc_modex_count, sizeof(bool));
if (NULL == proc->proc_modex_claimed) {

Просмотреть файл

@@ -246,3 +246,24 @@ abort.
Server: %s
Device: %s:%d (%s)
Current ARP timeout: %d (btl_usnic_arp_timeout MCA param)
#
[transport mismatch]
The underlying transports used by the usNIC driver stack on multiple
servers do not match. This configuration is unsupported and is almost
certainly not what you want.
This error indicates that the VIC firmware, Linux usNIC kernel driver,
and/or Linux usNIC userspace drivers are not compatible between at
least the following two servers:
Local server: %s
Remote server: %s
The usnic MPI transport will be deactivated in at least the one local
MPI process that reported the problem. This may lead to performance
degradation, and may also result in aborting the overall MPI job.
It is usually easiest to have the same VIC firmware, Linux usNIC
kernel driver, and Linux usNIC userspace driver installed on all
servers.
#