oob ud: better error msgs, tolerate systems without UD devices
It is perfectly ok to be on a system without UD devices. Also, make some of the error messages better -- so that the user has a clue about where the error messages are coming from, and what they should do.
Этот коммит содержится в:
родитель
e95010b095
Коммит
8f941a6613
@ -12,32 +12,34 @@
|
||||
# All rights reserved.
|
||||
# 2015 Mellanox Technologies, Inc.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
[no-devices-available]
|
||||
No available RDMA devices found:
|
||||
[no-devices-error]
|
||||
Open MPI has detected a failure in a basic verbs function call. This
|
||||
is unusual, and may indicate that something is malfunctioning on this
|
||||
system.
|
||||
|
||||
Hostname: %s
|
||||
Your job will continue, but Open MPI will ignore the "ud" oob component
|
||||
in this run.
|
||||
|
||||
Verbs function: ibv_get_device_list()
|
||||
Error: %s
|
||||
Hostname: %s
|
||||
|
||||
Please contact your system administrator.
|
||||
#
|
||||
[no-devices-error]
|
||||
Failed to get list of the available RDMA devices:
|
||||
|
||||
Hostname: %s
|
||||
Error: %s
|
||||
#
|
||||
[no-devices-usable]
|
||||
No usable devices found:
|
||||
|
||||
Hostname: %s
|
||||
#
|
||||
[no-ports-usable]
|
||||
No usable ports found:
|
||||
Open MPI has detected that there are UD-capable Verbs devices on your
|
||||
system, but none of them could be set up properly.  This may
|
||||
indicate a problem on this system.
|
||||
|
||||
Your job will continue, but Open MPI will ignore the "ud" oob component
|
||||
in this run.
|
||||
|
||||
Hostname: %s
|
||||
#
|
||||
|
@ -344,14 +344,16 @@ static int mca_oob_ud_component_startup(void)
|
||||
devices = ibv_get_device_list (&num_devices);
|
||||
if (NULL == devices) {
|
||||
orte_show_help("help-oob-ud.txt", "no-devices-error", true,
|
||||
orte_process_info.nodename, strerror(errno));
|
||||
strerror(errno),
|
||||
orte_process_info.nodename);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* If there are no devices, it is not an error; we just won't use
|
||||
this component. */
|
||||
if (0 == num_devices) {
|
||||
orte_show_help("help-oob-ud.txt", "no-devices-available", true,
|
||||
orte_process_info.nodename);
|
||||
return ORTE_ERROR;
|
||||
ibv_free_device_list(devices);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
for (i = 0 ; i < num_devices ; ++i) {
|
||||
@ -377,10 +379,10 @@ static int mca_oob_ud_component_startup(void)
|
||||
|
||||
ibv_free_device_list (devices);
|
||||
|
||||
/* If no usable devices are found, then just ignore this component
|
||||
in this run */
|
||||
if (0 == opal_list_get_size (&mca_oob_ud_component.ud_devices)) {
|
||||
orte_show_help("help-oob-ud.txt", "no-devices-usable", true,
|
||||
orte_process_info.nodename);
|
||||
return ORTE_ERROR;
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user