1
1

Refs trac:3626: that ticket specifically refers to the v1.6 branch; this

commit is the trunk version of what is needed for #3626.

Add the "ignore_device" field to the INI file.  This allows us to
specifically list devices that should be ignored by the openib BTL
(such as the Intel Phi, at least as of May 2013 -- see #3626).  

Also add the Intel Phi to the ini file, and set its ignore_device=1.

Finally, add the concept of counting intentionally ignored verbs
devices.  Devices are ignored for one of two reasons:

 * If the number of allowed ports on that device is 0 (i.e., if
   if_include/if_exclude was set such that we're intentionally
   ignoring this device).
 * If the INI ignore_device field for this device is set to 1.

Once we have the count of devices that were intentionally ignored,
only show the "Hey, there are verbs devices that you're not using!"
show_help message if there are devices that were ''unintentionally''
ignored.

This commit was SVN r28589.

The following Trac tickets were found above:
  Ticket 3626 --> https://svn.open-mpi.org/trac/ompi/ticket/3626
Этот коммит содержится в:
Jeff Squyres 2013-06-05 12:12:09 +00:00
родитель 3019b7a3f8
Коммит 713e3aa3db
5 изменённых файлов: 55 добавлений и 11 удалений

Просмотреть файл

@ -111,6 +111,8 @@ static void btl_openib_handle_incoming_completion(mca_btl_base_module_t* btl,
*/
static mca_btl_openib_device_t *receive_queues_device = NULL;
static bool malloc_hook_set = false;
static int num_devices_intentionally_ignored = 0;
mca_btl_openib_component_t mca_btl_openib_component = {
{
/* First, the mca_base_component_t struct containing meta information
@ -1645,6 +1647,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
if (0 == port_cnt) {
free(allowed_ports);
ret = OMPI_SUCCESS;
++num_devices_intentionally_ignored;
goto error;
}
@ -1672,6 +1675,17 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
device->ib_dev_attr.vendor_part_id);
}
}
/* If we're supposed to ignore devices of this vendor/part ID,
then do so */
if (values.ignore_device_set && values.ignore_device) {
BTL_VERBOSE(("device %s skipped; ignore_device=1",
ibv_get_device_name(device->ib_dev)));
ret = OMPI_SUCCESS;
++num_devices_intentionally_ignored;
goto error;
}
/* Note that even if we don't find default values, "values" will
be set indicating that it does not have good values */
ret = ompi_btl_openib_ini_query(0, 0, &default_values);
@ -2704,8 +2718,8 @@ btl_openib_component_init(int *num_btl_modules,
}
found = true;
if (OMPI_SUCCESS !=
(ret = init_one_device(&btl_list, dev_sorted[i].ib_dev))) {
ret = init_one_device(&btl_list, dev_sorted[i].ib_dev);
if (OMPI_SUCCESS != ret && OMPI_ERR_NOT_SUPPORTED != ret) {
free(dev_sorted);
goto no_btls;
}
@ -2737,8 +2751,13 @@ btl_openib_component_init(int *num_btl_modules,
}
if(0 == mca_btl_openib_component.ib_num_btls) {
opal_show_help("help-mpi-btl-openib.txt",
"no active ports found", true, ompi_process_info.nodename);
/* If there were unusable devices that weren't specifically
ignored, warn about it */
if (num_devices_intentionally_ignored < num_devs) {
opal_show_help("help-mpi-btl-openib.txt",
"no active ports found", true,
ompi_process_info.nodename);
}
goto no_btls;
}

Просмотреть файл

@ -413,6 +413,12 @@ static int parse_line(parsed_section_values_t *sv)
sv->values.rdmacm_reject_causes_connect_error_set = true;
}
else if (0 == strcasecmp(key_buffer, "ignore_device")) {
/* Single value */
sv->values.ignore_device = (bool) ompi_btl_openib_ini_intify(value);
sv->values.ignore_device_set = true;
}
else {
/* Have no idea what this parameter is. Not an error -- just
ignore it */
@ -568,6 +574,11 @@ static int save_section(parsed_section_values_t *s)
true;
}
if (s->values.ignore_device_set) {
h->values.ignore_device = s->values.ignore_device;
h->values.ignore_device_set = true;
}
found = true;
break;
}

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2008 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
@ -34,6 +34,9 @@ typedef struct ompi_btl_openib_ini_values_t {
bool rdmacm_reject_causes_connect_error;
bool rdmacm_reject_causes_connect_error_set;
bool ignore_device;
bool ignore_device_set;
} ompi_btl_openib_ini_values_t;

Просмотреть файл

@ -225,11 +225,11 @@ You may need to consult with your system administrator to get this
problem fixed.
#
[no active ports found]
WARNING: There is at least one OpenFabrics device found but there are
no active ports detected (or Open MPI was unable to use them). This
is most certainly not what you wanted. Check your cables, subnet
manager configuration, etc. The openib BTL will be ignored for this
job.
WARNING: There is at least one non-excluded OpenFabrics device found,
but there are no active ports detected (or Open MPI was unable to use
them). This is most certainly not what you wanted. Check your
cables, subnet manager configuration, etc. The openib BTL will be
ignored for this job.
Local host: %s
#

Просмотреть файл

@ -1,5 +1,5 @@
#
# Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2006-2011 Mellanox Technologies. All rights reserved.
# $COPYRIGHT$
#
@ -268,3 +268,14 @@ use_eager_rdma = 1
mtu = 2048
receive_queues = P,65536,256,192,128
max_inline_data = 64
############################################################################
# Intel has several OUIs, including 0x8086. Amusing. :-) Intel has
# advised us (June, 2013) to ignore the Intel Phi OpenFabrics
# device... at least for now.
[Intel Xeon Phi]
vendor_id = 0x8086
vendor_part_id = 0
ignore_device = 1