btl/openib: immediately release the device when no port is allowed
Many thanks to Sergey Oblomov for reporting this issue and for the countless traces provided while troubleshooting it.

This is a one-off commit for the v4.0.x branch since btl/openib has been removed from master.

Refs. open-mpi/ompi#6137

Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp>
Этот коммит содержится в:
родитель
c58c774981
Коммит
8da4605589
@@ -1045,7 +1045,7 @@ int mca_btl_openib_add_procs(
|
|||||||
opal_bitmap_clear_all_bits(reachable);
|
opal_bitmap_clear_all_bits(reachable);
|
||||||
opal_show_help("help-mpi-btl-openib.txt", "ib port not selected",
|
opal_show_help("help-mpi-btl-openib.txt", "ib port not selected",
|
||||||
true, opal_process_info.nodename,
|
true, opal_process_info.nodename,
|
||||||
ibv_get_device_name(openib_btl->device->ib_dev), openib_btl->port_num);
|
openib_btl->device_name, openib_btl->port_num);
|
||||||
return OPAL_SUCCESS;
|
return OPAL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1718,12 +1718,12 @@ static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl)
|
|||||||
free(openib_btl->cpcs[i]);
|
free(openib_btl->cpcs[i]);
|
||||||
}
|
}
|
||||||
free(openib_btl->cpcs);
|
free(openib_btl->cpcs);
|
||||||
}
|
|
||||||
|
|
||||||
/* Release device if there are no more users */
|
/* Release device if there are no more users */
|
||||||
if(!(--openib_btl->device->btls)) {
|
if(!(--openib_btl->device->allowed_btls)) {
|
||||||
OBJ_RELEASE(openib_btl->device);
|
OBJ_RELEASE(openib_btl->device);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (NULL != openib_btl->qps) {
|
if (NULL != openib_btl->qps) {
|
||||||
free(openib_btl->qps);
|
free(openib_btl->qps);
|
||||||
|
@@ -392,6 +392,7 @@ typedef struct mca_btl_openib_device_t {
|
|||||||
/* Whether this device supports eager RDMA */
|
/* Whether this device supports eager RDMA */
|
||||||
uint8_t use_eager_rdma;
|
uint8_t use_eager_rdma;
|
||||||
uint8_t btls; /** < number of btls using this device */
|
uint8_t btls; /** < number of btls using this device */
|
||||||
|
uint8_t allowed_btls; /** < number of allowed btls using this device */
|
||||||
opal_pointer_array_t *endpoints;
|
opal_pointer_array_t *endpoints;
|
||||||
opal_pointer_array_t *device_btls;
|
opal_pointer_array_t *device_btls;
|
||||||
uint16_t hp_cq_polls;
|
uint16_t hp_cq_polls;
|
||||||
@@ -483,6 +484,7 @@ struct mca_btl_openib_module_t {
|
|||||||
uint8_t num_cpcs;
|
uint8_t num_cpcs;
|
||||||
|
|
||||||
mca_btl_openib_device_t *device;
|
mca_btl_openib_device_t *device;
|
||||||
|
char * device_name;
|
||||||
uint8_t port_num; /**< ID of the PORT */
|
uint8_t port_num; /**< ID of the PORT */
|
||||||
uint16_t pkey_index;
|
uint16_t pkey_index;
|
||||||
struct ibv_port_attr ib_port_attr;
|
struct ibv_port_attr ib_port_attr;
|
||||||
|
@@ -648,9 +648,10 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
|
|||||||
sizeof(mca_btl_openib_module));
|
sizeof(mca_btl_openib_module));
|
||||||
ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
|
ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
|
||||||
ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl;
|
ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl;
|
||||||
openib_btl->device = device;
|
|
||||||
openib_btl->port_num = (uint8_t) port_num;
|
openib_btl->port_num = (uint8_t) port_num;
|
||||||
openib_btl->allowed = false;
|
openib_btl->allowed = false;
|
||||||
|
openib_btl->device = NULL;
|
||||||
|
openib_btl->device_name = strdup(ibv_get_device_name(device->ib_dev));
|
||||||
OBJ_CONSTRUCT(&openib_btl->ib_lock, opal_mutex_t);
|
OBJ_CONSTRUCT(&openib_btl->ib_lock, opal_mutex_t);
|
||||||
opal_list_append(btl_list, (opal_list_item_t*) ib_selected);
|
opal_list_append(btl_list, (opal_list_item_t*) ib_selected);
|
||||||
opal_pointer_array_add(device->device_btls, (void*) openib_btl);
|
opal_pointer_array_add(device->device_btls, (void*) openib_btl);
|
||||||
@@ -784,6 +785,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
|
|||||||
ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
|
ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
|
||||||
ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl;
|
ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl;
|
||||||
openib_btl->device = device;
|
openib_btl->device = device;
|
||||||
|
openib_btl->device_name = NULL;
|
||||||
openib_btl->port_num = (uint8_t) port_num;
|
openib_btl->port_num = (uint8_t) port_num;
|
||||||
openib_btl->pkey_index = pkey_index;
|
openib_btl->pkey_index = pkey_index;
|
||||||
openib_btl->lid = lid;
|
openib_btl->lid = lid;
|
||||||
@@ -904,6 +906,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
|
|||||||
opal_list_append(btl_list, (opal_list_item_t*) ib_selected);
|
opal_list_append(btl_list, (opal_list_item_t*) ib_selected);
|
||||||
opal_pointer_array_add(device->device_btls, (void*) openib_btl);
|
opal_pointer_array_add(device->device_btls, (void*) openib_btl);
|
||||||
++device->btls;
|
++device->btls;
|
||||||
|
++device->allowed_btls;
|
||||||
++mca_btl_openib_component.ib_num_btls;
|
++mca_btl_openib_component.ib_num_btls;
|
||||||
++mca_btl_openib_component.ib_allowed_btls;
|
++mca_btl_openib_component.ib_allowed_btls;
|
||||||
if (-1 != mca_btl_openib_component.ib_max_btls &&
|
if (-1 != mca_btl_openib_component.ib_max_btls &&
|
||||||
@@ -1933,7 +1936,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
|||||||
if (ib_port_attr.active_mtu < device->mtu){
|
if (ib_port_attr.active_mtu < device->mtu){
|
||||||
device->mtu = ib_port_attr.active_mtu;
|
device->mtu = ib_port_attr.active_mtu;
|
||||||
}
|
}
|
||||||
if (mca_btl_openib_component.apm_ports && device->btls > 0) {
|
if (mca_btl_openib_component.apm_ports && device->allowed_btls > 0) {
|
||||||
init_apm_port(device, i, ib_port_attr.lid);
|
init_apm_port(device, i, ib_port_attr.lid);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -1969,7 +1972,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
|||||||
|
|
||||||
/* If we made a BTL, check APM status and return. Otherwise, fall
|
/* If we made a BTL, check APM status and return. Otherwise, fall
|
||||||
through and destroy everything */
|
through and destroy everything */
|
||||||
if (device->btls > 0) {
|
if (device->allowed_btls > 0) {
|
||||||
/* if apm was enabled it should be > 1 */
|
/* if apm was enabled it should be > 1 */
|
||||||
if (1 == mca_btl_openib_component.apm_ports) {
|
if (1 == mca_btl_openib_component.apm_ports) {
|
||||||
opal_show_help("help-mpi-btl-openib.txt",
|
opal_show_help("help-mpi-btl-openib.txt",
|
||||||
@@ -2290,6 +2293,11 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
|||||||
good:
|
good:
|
||||||
mca_btl_openib_component.devices_count++;
|
mca_btl_openib_component.devices_count++;
|
||||||
return OPAL_SUCCESS;
|
return OPAL_SUCCESS;
|
||||||
|
} else if (device->btls > 0) {
|
||||||
|
/* no port is allowed to be used by btl/openib,
|
||||||
|
* so release the device right away */
|
||||||
|
OBJ_RELEASE(device);
|
||||||
|
return OPAL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
error:
|
error:
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user