usnic: ensure that procs agree on use_udp value
Add the component use_udp value into the modex. If my component's use_udp value doesn't agree with the use_udp value from a peer's modex data, print a helpful message and disqualify the usnic BTL (the usnic BTL will not be used). This prevents accidental customer misconfigurations. Reviewed by Dave Goodell. cmr=v1.8.2:reviewer=ompi-rm1.8. This commit was SVN r31689.
Этот коммит содержится в:
родитель
e9c3df652e
Коммит
184e4fc0ca
@ -1145,6 +1145,7 @@ static int init_module_from_port(ompi_btl_usnic_module_t *module,
|
||||
module->device_context = port->device->context;
|
||||
module->port_num = port->port_num;
|
||||
module->numa_distance = 0;
|
||||
module->local_addr.use_udp = mca_btl_usnic_component.use_udp;
|
||||
|
||||
/* If we fail to query the GID, just warn and skip this port */
|
||||
if (0 != ibv_query_gid(module->device_context,
|
||||
|
@ -74,6 +74,7 @@ typedef struct ompi_btl_usnic_addr_t {
|
||||
uint16_t mtu;
|
||||
ompi_btl_usnic_seq_t isn;
|
||||
uint8_t mac[6];
|
||||
uint8_t use_udp;
|
||||
} ompi_btl_usnic_addr_t;
|
||||
|
||||
struct ompi_btl_usnic_send_segment_t;
|
||||
|
@ -233,6 +233,21 @@ static int create_proc(ompi_proc_t *ompi_proc,
|
||||
return OMPI_ERR_UNREACH;
|
||||
}
|
||||
|
||||
/* Sanity check: ensure that the remote proc agrees with this proc
|
||||
on whether we're doing UDP or not. Note that all endpoints on
|
||||
the remote proc will have the same "use_udp" value, so we only
|
||||
need to check one of them. */
|
||||
if (proc->proc_modex[0].use_udp !=
|
||||
mca_btl_usnic_component.use_udp) {
|
||||
opal_show_help("help-mpi-btl-usnic.txt",
|
||||
"transport mismatch",
|
||||
true,
|
||||
ompi_process_info.nodename,
|
||||
proc->proc_ompi->proc_hostname);
|
||||
OBJ_RELEASE(proc);
|
||||
return OMPI_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
proc->proc_modex_claimed = (bool*)
|
||||
calloc(proc->proc_modex_count, sizeof(bool));
|
||||
if (NULL == proc->proc_modex_claimed) {
|
||||
|
@ -246,3 +246,24 @@ abort.
|
||||
Server: %s
|
||||
Device: %s:%d (%s)
|
||||
Current ARP timeout: %d (btl_usnic_arp_timeout MCA param)
|
||||
#
|
||||
[transport mismatch]
|
||||
The underlying transports used by the usNIC driver stack on multiple
|
||||
servers do not match. This configuration is unsupported and is almost
|
||||
certainly not what you want.
|
||||
|
||||
This error indicates that the VIC firmware, Linux usNIC kernel driver,
|
||||
and/or Linux usNIC userspace drivers are not compatible between at
|
||||
least the following two servers:
|
||||
|
||||
Local server: %s
|
||||
Remote server: %s
|
||||
|
||||
The usnic MPI transport will be deactivated in at least the one local
|
||||
MPI process that reported the problem. This may lead to performance
|
||||
degradation, and may also result in aborting the overall MPI job.
|
||||
|
||||
It is usually easiest to have the same VIC firmware, Linux usNIC
|
||||
kernel driver, and Linux usNIC userspace driver installed on all
|
||||
servers.
|
||||
#
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user