oob ud: better error msgs, tolerate systems without UD devices
It is perfectly ok to be on a system without UD devices. Also, make some of the error messages better -- so that the user has a clue about where the error messages are coming from, and what they should do.
Этот коммит содержится в:
родитель
e95010b095
Коммит
8f941a6613
@ -12,32 +12,34 @@
|
|||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
# 2015 Mellanox Technologies, Inc.
|
# 2015 Mellanox Technologies, Inc.
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
|
# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
||||||
# $COPYRIGHT$
|
# $COPYRIGHT$
|
||||||
#
|
#
|
||||||
# Additional copyrights may follow
|
# Additional copyrights may follow
|
||||||
#
|
#
|
||||||
# $HEADER$
|
# $HEADER$
|
||||||
#
|
#
|
||||||
[no-devices-available]
|
[no-devices-error]
|
||||||
No available RDMA devices found:
|
Open MPI has detected a failure in a basic verbs function call. This
|
||||||
|
is unusual, and may indicate that something is malfunctioning on this
|
||||||
|
system.
|
||||||
|
|
||||||
Hostname: %s
|
Your job will continue, but Open MPI will ignore the "ud" oob component
|
||||||
|
in this run.
|
||||||
|
|
||||||
|
Verbs function: ibv_get_device_list()
|
||||||
|
Error: %s
|
||||||
|
Hostname: %s
|
||||||
|
|
||||||
Please contact your system administrator.
|
Please contact your system administrator.
|
||||||
#
|
#
|
||||||
[no-devices-error]
|
|
||||||
Failed to get list of the available RDMA devices:
|
|
||||||
|
|
||||||
Hostname: %s
|
|
||||||
Error: %s
|
|
||||||
#
|
|
||||||
[no-devices-usable]
|
|
||||||
No usable devices found:
|
|
||||||
|
|
||||||
Hostname: %s
|
|
||||||
#
|
|
||||||
[no-ports-usable]
|
[no-ports-usable]
|
||||||
No usable ports found:
|
Open MPI has detected that there are UD-capable Verbs devices on your
|
||||||
|
system, but none of them could be set up properly. This may
|
||||||
|
indicate a problem on this system.
|
||||||
|
|
||||||
|
Your job will continue, but Open MPI will ignore the "ud" oob component
|
||||||
|
in this run.
|
||||||
|
|
||||||
Hostname: %s
|
Hostname: %s
|
||||||
#
|
#
|
||||||
|
@ -344,14 +344,16 @@ static int mca_oob_ud_component_startup(void)
|
|||||||
devices = ibv_get_device_list (&num_devices);
|
devices = ibv_get_device_list (&num_devices);
|
||||||
if (NULL == devices) {
|
if (NULL == devices) {
|
||||||
orte_show_help("help-oob-ud.txt", "no-devices-error", true,
|
orte_show_help("help-oob-ud.txt", "no-devices-error", true,
|
||||||
orte_process_info.nodename, strerror(errno));
|
strerror(errno),
|
||||||
|
orte_process_info.nodename);
|
||||||
return ORTE_ERROR;
|
return ORTE_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* If there are no devices, it is not an error; we just won't use
|
||||||
|
this component. */
|
||||||
if (0 == num_devices) {
|
if (0 == num_devices) {
|
||||||
orte_show_help("help-oob-ud.txt", "no-devices-available", true,
|
ibv_free_device_list(devices);
|
||||||
orte_process_info.nodename);
|
return ORTE_ERR_NOT_FOUND;
|
||||||
return ORTE_ERROR;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = 0 ; i < num_devices ; ++i) {
|
for (i = 0 ; i < num_devices ; ++i) {
|
||||||
@ -377,10 +379,10 @@ static int mca_oob_ud_component_startup(void)
|
|||||||
|
|
||||||
ibv_free_device_list (devices);
|
ibv_free_device_list (devices);
|
||||||
|
|
||||||
|
/* If no usable devices are found, then just ignore this component
|
||||||
|
in this run */
|
||||||
if (0 == opal_list_get_size (&mca_oob_ud_component.ud_devices)) {
|
if (0 == opal_list_get_size (&mca_oob_ud_component.ud_devices)) {
|
||||||
orte_show_help("help-oob-ud.txt", "no-devices-usable", true,
|
return ORTE_ERR_NOT_FOUND;
|
||||||
orte_process_info.nodename);
|
|
||||||
return ORTE_ERROR;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user