Refs trac:3626: that ticket specifically refers to the v1.6 branch; this
commit is the trunk version of what is needed for #3626.

Add the "ignore_device" field to the INI file. This allows us to
specifically list devices that should be ignored by the openib BTL (such as
the Intel Phi, at least as of May 2013 -- see #3626).

Also add the Intel Phi to the ini file, and set its ignore_device=1.

Finally, add the concept of counting intentionally ignored verbs devices.
Devices are ignored for one of two reasons:

 * If the number of allowed ports on that device is 0 (i.e., if
   if_include/if_exclude was set such that we're intentionally ignoring
   this device).
 * If the INI ignore_device field for this device is set to 1.

Once we have the count of devices that were intentionally ignored, only show
the "Hey, there's verbs devices that you're not using!" show_help message if
there are devices that were ''unintentionally'' ignored.

This commit was SVN r28589.

The following Trac tickets were found above:
  Ticket 3626 --> https://svn.open-mpi.org/trac/ompi/ticket/3626
Этот коммит содержится в:
родитель
3019b7a3f8
Коммит
713e3aa3db
@ -111,6 +111,8 @@ static void btl_openib_handle_incoming_completion(mca_btl_base_module_t* btl,
|
||||
*/
|
||||
static mca_btl_openib_device_t *receive_queues_device = NULL;
|
||||
static bool malloc_hook_set = false;
|
||||
static int num_devices_intentionally_ignored = 0;
|
||||
|
||||
mca_btl_openib_component_t mca_btl_openib_component = {
|
||||
{
|
||||
/* First, the mca_base_component_t struct containing meta information
|
||||
@ -1645,6 +1647,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
if (0 == port_cnt) {
|
||||
free(allowed_ports);
|
||||
ret = OMPI_SUCCESS;
|
||||
++num_devices_intentionally_ignored;
|
||||
goto error;
|
||||
}
|
||||
|
||||
@ -1672,6 +1675,17 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
device->ib_dev_attr.vendor_part_id);
|
||||
}
|
||||
}
|
||||
|
||||
/* If we're supposed to ignore devices of this vendor/part ID,
|
||||
then do so */
|
||||
if (values.ignore_device_set && values.ignore_device) {
|
||||
BTL_VERBOSE(("device %s skipped; ignore_device=1",
|
||||
ibv_get_device_name(device->ib_dev)));
|
||||
ret = OMPI_SUCCESS;
|
||||
++num_devices_intentionally_ignored;
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* Note that even if we don't find default values, "values" will
|
||||
be set indicating that it does not have good values */
|
||||
ret = ompi_btl_openib_ini_query(0, 0, &default_values);
|
||||
@ -2704,8 +2718,8 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
}
|
||||
|
||||
found = true;
|
||||
if (OMPI_SUCCESS !=
|
||||
(ret = init_one_device(&btl_list, dev_sorted[i].ib_dev))) {
|
||||
ret = init_one_device(&btl_list, dev_sorted[i].ib_dev);
|
||||
if (OMPI_SUCCESS != ret && OMPI_ERR_NOT_SUPPORTED != ret) {
|
||||
free(dev_sorted);
|
||||
goto no_btls;
|
||||
}
|
||||
@ -2737,8 +2751,13 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
}
|
||||
|
||||
if(0 == mca_btl_openib_component.ib_num_btls) {
|
||||
opal_show_help("help-mpi-btl-openib.txt",
|
||||
"no active ports found", true, ompi_process_info.nodename);
|
||||
/* If there were unusable devices that weren't specifically
|
||||
ignored, warn about it */
|
||||
if (num_devices_intentionally_ignored < num_devs) {
|
||||
opal_show_help("help-mpi-btl-openib.txt",
|
||||
"no active ports found", true,
|
||||
ompi_process_info.nodename);
|
||||
}
|
||||
goto no_btls;
|
||||
}
|
||||
|
||||
|
@ -413,6 +413,12 @@ static int parse_line(parsed_section_values_t *sv)
|
||||
sv->values.rdmacm_reject_causes_connect_error_set = true;
|
||||
}
|
||||
|
||||
else if (0 == strcasecmp(key_buffer, "ignore_device")) {
|
||||
/* Single value */
|
||||
sv->values.ignore_device = (bool) ompi_btl_openib_ini_intify(value);
|
||||
sv->values.ignore_device_set = true;
|
||||
}
|
||||
|
||||
else {
|
||||
/* Have no idea what this parameter is. Not an error -- just
|
||||
ignore it */
|
||||
@ -568,6 +574,11 @@ static int save_section(parsed_section_values_t *s)
|
||||
true;
|
||||
}
|
||||
|
||||
if (s->values.ignore_device_set) {
|
||||
h->values.ignore_device = s->values.ignore_device;
|
||||
h->values.ignore_device_set = true;
|
||||
}
|
||||
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2008 Mellanox Technologies. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -34,6 +34,9 @@ typedef struct ompi_btl_openib_ini_values_t {
|
||||
|
||||
bool rdmacm_reject_causes_connect_error;
|
||||
bool rdmacm_reject_causes_connect_error_set;
|
||||
|
||||
bool ignore_device;
|
||||
bool ignore_device_set;
|
||||
} ompi_btl_openib_ini_values_t;
|
||||
|
||||
|
||||
|
@ -225,11 +225,11 @@ You may need to consult with your system administrator to get this
|
||||
problem fixed.
|
||||
#
|
||||
[no active ports found]
|
||||
WARNING: There is at least one OpenFabrics device found but there are
|
||||
no active ports detected (or Open MPI was unable to use them). This
|
||||
is most certainly not what you wanted. Check your cables, subnet
|
||||
manager configuration, etc. The openib BTL will be ignored for this
|
||||
job.
|
||||
WARNING: There is at least one non-excluded OpenFabrics device found,
|
||||
but there are no active ports detected (or Open MPI was unable to use
|
||||
them). This is most certainly not what you wanted. Check your
|
||||
cables, subnet manager configuration, etc. The openib BTL will be
|
||||
ignored for this job.
|
||||
|
||||
Local host: %s
|
||||
#
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2006-2011 Mellanox Technologies. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
@ -268,3 +268,14 @@ use_eager_rdma = 1
|
||||
mtu = 2048
|
||||
receive_queues = P,65536,256,192,128
|
||||
max_inline_data = 64
|
||||
|
||||
############################################################################
|
||||
|
||||
# Intel has several OUI's, including 0x8086. Amusing. :-) Intel has
|
||||
# advised us (June, 2013) to ignore the Intel Phi OpenFabrics
|
||||
# device... at least for now.
|
||||
|
||||
[Intel Xeon Phi]
|
||||
vendor_id = 0x8086
|
||||
vendor_part_id = 0
|
||||
ignore_device = 1
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user