735 строки
26 KiB
C
735 строки
26 KiB
C
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
/*
|
|
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
|
* reserved.
|
|
* Copyright (c) 2014 Research Organization for Information Science
|
|
* and Technology (RIST). All rights reserved.
|
|
* 2014 Mellanox Technologies, Inc.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*
|
|
*/
|
|
|
|
#include "orte_config.h"
|
|
#include "orte/types.h"
|
|
#include "opal/types.h"
|
|
|
|
#include "orte/runtime/orte_globals.h"
|
|
#include "orte/util/name_fns.h"
|
|
#include "orte/util/proc_info.h"
|
|
|
|
#include "oob_ud_component.h"
|
|
|
|
static int mca_oob_ud_component_open (void);
|
|
static int mca_oob_ud_component_close (void);
|
|
static int mca_oob_ud_component_register (void);
|
|
static bool mca_oob_ud_component_available(void);
|
|
static int mca_oob_ud_component_startup(void);
|
|
static int mca_oob_ud_component_send_nb(orte_rml_send_t *msg);
|
|
static void mca_oob_ud_component_shutdown(void);
|
|
static char* mca_oob_ud_component_get_addr(void);
|
|
static int mca_oob_ud_component_set_addr(orte_process_name_t *peer, char **uris);
|
|
static bool mca_oob_ud_component_is_reachable(orte_process_name_t *peer);
|
|
#if OPAL_ENABLE_FT_CR == 1
|
|
static int mca_oob_ud_component_ft_event(int state);
|
|
#endif // OPAL_ENABLE_FT_CR
|
|
|
|
static int mca_oob_ud_listen_create (mca_oob_ud_port_t *port);
|
|
static int mca_oob_ud_listen_destroy (mca_oob_ud_port_t *port);
|
|
static int mca_oob_ud_port_alloc_buffers (mca_oob_ud_port_t *port);
|
|
static inline int mca_oob_ud_port_recv_start (mca_oob_ud_port_t *port);
|
|
static inline int mca_oob_ud_alloc_reg_mem (struct ibv_pd *pd, mca_oob_ud_reg_mem_t *reg_mem,
|
|
const int buffer_len);
|
|
static inline void mca_oob_ud_free_reg_mem (mca_oob_ud_reg_mem_t *reg_mem);
|
|
static void mca_oob_ud_cancel_all_in_list (opal_list_t *list);
|
|
static void mca_oob_ud_empty_list (opal_list_t *list);
|
|
static void mca_oob_ud_port_construct (mca_oob_ud_port_t *port);
|
|
static void mca_oob_ud_port_destruct (mca_oob_ud_port_t *port);
|
|
static void mca_oob_ud_device_construct (mca_oob_ud_device_t *device);
|
|
static void mca_oob_ud_device_destruct (mca_oob_ud_device_t *device);
|
|
|
|
/*
|
|
* Struct of function pointers and all that to let us be initialized
|
|
*/
|
|
mca_oob_ud_component_t mca_oob_ud_component = {
|
|
{
|
|
{
|
|
MCA_OOB_BASE_VERSION_2_0_0,
|
|
"ud", /* MCA module name */
|
|
ORTE_MAJOR_VERSION,
|
|
ORTE_MINOR_VERSION,
|
|
ORTE_RELEASE_VERSION,
|
|
mca_oob_ud_component_open, /* component open */
|
|
mca_oob_ud_component_close, /* component close */
|
|
NULL, /* component query */
|
|
mca_oob_ud_component_register, /* component register */
|
|
},
|
|
{
|
|
/* The component is checkpoint ready */
|
|
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
|
},
|
|
0, // reserve space for an assigned index
|
|
0, //set the priority so that we will select this component only if someone directs to do so
|
|
mca_oob_ud_component_available, //available
|
|
mca_oob_ud_component_startup, //startup
|
|
mca_oob_ud_component_shutdown, //shutdown
|
|
mca_oob_ud_component_send_nb, //send_nb
|
|
mca_oob_ud_component_get_addr,
|
|
mca_oob_ud_component_set_addr,
|
|
mca_oob_ud_component_is_reachable, //is_reachable
|
|
#if OPAL_ENABLE_FT_CR == 1
|
|
mca_oob_ud_component_ft_event,
|
|
#endif // OPAL_ENABLE_FT_CR
|
|
},
|
|
};
|
|
|
|
static int mca_oob_ud_component_open (void)
|
|
{
|
|
OBJ_CONSTRUCT(&mca_oob_ud_component.ud_devices, opal_list_t);
|
|
OBJ_CONSTRUCT(&mca_oob_ud_component.ud_active_sends, opal_list_t);
|
|
|
|
OBJ_CONSTRUCT(&mca_oob_ud_component.ud_active_recvs, opal_list_t);
|
|
OBJ_CONSTRUCT(&mca_oob_ud_component.ud_event_queued_reqs, opal_list_t);
|
|
|
|
OBJ_CONSTRUCT(&mca_oob_ud_component.ud_event_processing_msgs, opal_list_t);
|
|
|
|
OBJ_CONSTRUCT(&mca_oob_ud_component.ud_lock, opal_mutex_t);
|
|
OBJ_CONSTRUCT(&mca_oob_ud_component.ud_match_lock, opal_mutex_t);
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
static int mca_oob_ud_component_close (void)
|
|
{
|
|
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
|
"%s oob:ud:component_close entering",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
|
|
|
OBJ_DESTRUCT(&mca_oob_ud_component.ud_devices);
|
|
OBJ_DESTRUCT(&mca_oob_ud_component.ud_active_sends);
|
|
|
|
OBJ_DESTRUCT(&mca_oob_ud_component.ud_active_recvs);
|
|
OBJ_DESTRUCT(&mca_oob_ud_component.ud_event_queued_reqs);
|
|
|
|
OBJ_DESTRUCT(&mca_oob_ud_component.ud_lock);
|
|
OBJ_DESTRUCT(&mca_oob_ud_component.ud_match_lock);
|
|
|
|
OBJ_DESTRUCT(&mca_oob_ud_component.ud_event_processing_msgs);
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
static int mca_oob_ud_component_register (void)
|
|
{
|
|
mca_base_component_t *component = &mca_oob_ud_component.super.oob_base;
|
|
|
|
mca_oob_ud_component.ud_min_qp = 8;
|
|
|
|
(void) mca_base_component_var_register (component, "min_qp", "Minimum number of UD queue pairs "
|
|
"to allocate (default: 8)", MCA_BASE_VAR_TYPE_INT, NULL,
|
|
0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
|
|
MCA_BASE_VAR_SCOPE_LOCAL, &mca_oob_ud_component.ud_min_qp);
|
|
|
|
mca_oob_ud_component.ud_max_qp = 32;
|
|
(void) mca_base_component_var_register (component, "max_qp", "Maximum number of UD queue pairs "
|
|
"to allocate (default: 32)", MCA_BASE_VAR_TYPE_INT, NULL,
|
|
0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
|
|
MCA_BASE_VAR_SCOPE_LOCAL, &mca_oob_ud_component.ud_max_qp);
|
|
|
|
mca_oob_ud_component.ud_recv_buffer_count = 512;
|
|
(void) mca_base_component_var_register (component, "recv_buffers", "Number of MTU sized recv "
|
|
"buffers to post (default: 512)", MCA_BASE_VAR_TYPE_INT, NULL,
|
|
0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
|
|
MCA_BASE_VAR_SCOPE_LOCAL, &mca_oob_ud_component.ud_recv_buffer_count);
|
|
|
|
mca_oob_ud_component.ud_send_buffer_count = 512;
|
|
(void) mca_base_component_var_register (component, "send_buffers", "Number of MTU sized send "
|
|
"buffers to allocate (default: 512)", MCA_BASE_VAR_TYPE_INT, NULL,
|
|
0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
|
|
MCA_BASE_VAR_SCOPE_LOCAL, &mca_oob_ud_component.ud_send_buffer_count);
|
|
|
|
mca_oob_ud_component.ud_max_retries = 5;
|
|
(void)mca_base_component_var_register(component, "peer_retries",
|
|
"Number of times to try shutting down a connection before giving up",
|
|
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
|
OPAL_INFO_LVL_9,
|
|
MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_oob_ud_component.ud_max_retries);
|
|
|
|
mca_oob_ud_component.ud_timeout_usec = 800000;
|
|
(void)mca_base_component_var_register(component, "peer_timeout",
|
|
"Timeout in microseconds between retransmission of data",
|
|
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
|
OPAL_INFO_LVL_9,
|
|
MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_oob_ud_component.ud_timeout_usec);
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
static bool mca_oob_ud_component_available(void) {
|
|
|
|
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
|
"oob:ud: component_available called");
|
|
|
|
/* set the module event base - this is where we would spin off a separate
|
|
* progress thread if so desired */
|
|
mca_oob_ud_module.ev_base = orte_event_base;
|
|
|
|
return true;
|
|
}
|
|
|
|
static int port_mtus[] = {0, 256, 512, 1024, 2048, 4096};
|
|
|
|
static inline int mca_oob_ud_port_setup (mca_oob_ud_port_t *port)
|
|
{
|
|
int rc;
|
|
struct ibv_port_attr port_attr;
|
|
|
|
rc = ibv_query_port (port->device->ib_context, port->port_num, &port_attr);
|
|
if (0 != rc || IBV_PORT_ACTIVE != port_attr.state || 0 == port_attr.lid) {
|
|
/* skip this port */
|
|
return ORTE_ERROR;
|
|
}
|
|
|
|
port->lid = port_attr.lid;
|
|
port->mtu = port_attr.active_mtu > IBV_MTU_4096 ? 2048 : port_mtus[port_attr.active_mtu];
|
|
|
|
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
|
"%s oob:ud:port_setup found port: num = %u, lid = %u, mtu = %u",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
port->port_num, port->lid, port->mtu);
|
|
|
|
return rc;
|
|
}
|
|
|
|
/*
 * mca_oob_ud_device_setup:
 *
 * Open the given verbs device, create its completion channel and
 * protection domain, and attach the first usable port to device->ports.
 *
 * @param device     device object to fill in
 * @param ib_device  verbs device to open
 *
 * Returns ORTE_SUCCESS when a usable port was found,
 * ORTE_ERR_OUT_OF_RESOURCE when a port object could not be allocated and
 * ORTE_ERROR for any other failure. Resources acquired before a failure
 * stay attached to the device object and are not released here.
 */
static inline int mca_oob_ud_device_setup (mca_oob_ud_device_t *device,
                                           struct ibv_device *ib_device)
{
    int rc, port_num;
    struct ibv_device_attr dev_attr;

    opal_output_verbose(5, orte_oob_base_framework.framework_output,
                        "%s oob:ud:device_setup attempting to setup ib device %p",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) ib_device);

    device->ib_context = ibv_open_device (ib_device);
    if (NULL == device->ib_context) {
        opal_output_verbose(5, orte_oob_base_framework.framework_output,
                            "%s oob:ud:device_setup error opening device. errno = %d",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
        return ORTE_ERROR;
    }

    rc = ibv_query_device (device->ib_context, &dev_attr);
    if (0 != rc) {
        opal_output_verbose(5, orte_oob_base_framework.framework_output,
                            "%s oob:ud:device_setup error querying device. errno = %d",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
        return ORTE_ERROR;
    }

    device->ib_channel = ibv_create_comp_channel (device->ib_context);
    if (NULL == device->ib_channel) {
        opal_output_verbose(5, orte_oob_base_framework.framework_output,
                            "%s oob:ud:device_setup error creating completion channel. "
                            "errno = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
        return ORTE_ERROR;
    }

    device->ib_pd = ibv_alloc_pd (device->ib_context);
    if (NULL == device->ib_pd) {
        opal_output_verbose(5, orte_oob_base_framework.framework_output,
                            "%s oob:ud:device_setup error allocating protection domain. "
                            "errno = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
        return ORTE_ERROR;
    }

    /* port numbers start at 1 */
    for (port_num = 1 ; port_num <= dev_attr.phys_port_cnt ; ++port_num) {
        mca_oob_ud_port_t *port = OBJ_NEW(mca_oob_ud_port_t);

        if (NULL == port) {
            opal_output (0, "oob:ud:device_setup malloc failure. errno = %d", errno);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }

        port->device = device;
        port->port_num = port_num;

        rc = mca_oob_ud_port_setup (port);
        if (ORTE_SUCCESS != rc) {
            /* unusable port: drop it and try the next one */
            OBJ_RELEASE(port);
            continue;
        }

        opal_list_append (&device->ports, (opal_list_item_t *) port);

        /* stop at the first usable port */
        break;
    }

    if (0 == opal_list_get_size(&device->ports)) {
        opal_output_verbose(5, orte_oob_base_framework.framework_output,
                            "%s oob:ud:device_setup could not init device. no usable "
                            "ports present", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return ORTE_ERROR;
    }

    return ORTE_SUCCESS;
}
|
|
|
|
static int mca_oob_ud_component_startup(void)
|
|
{
|
|
struct ibv_device **devices;
|
|
int num_devices, i, rc;
|
|
opal_list_item_t *item, *item2;
|
|
bool found_one = false;
|
|
|
|
devices = ibv_get_device_list (&num_devices);
|
|
if (NULL == devices || 0 == num_devices) {
|
|
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
|
"%s oob:ud:component_init no devices found",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
|
return ORTE_ERROR;
|
|
}
|
|
|
|
for (i = 0 ; i < num_devices ; ++i) {
|
|
mca_oob_ud_device_t *device = OBJ_NEW(mca_oob_ud_device_t);
|
|
|
|
if (NULL == device) {
|
|
opal_output (0, "oob:ud:component_init malloc failure. errno = %d",
|
|
errno);
|
|
return ORTE_ERROR;
|
|
}
|
|
|
|
rc = mca_oob_ud_device_setup (device, devices[i]);
|
|
if (ORTE_SUCCESS != rc) {
|
|
OBJ_RELEASE(device);
|
|
continue;
|
|
}
|
|
|
|
opal_list_append (&mca_oob_ud_component.ud_devices,
|
|
(opal_list_item_t *) device);
|
|
|
|
/* NTH: support only 1 device for now */
|
|
break;
|
|
}
|
|
|
|
ibv_free_device_list (devices);
|
|
|
|
if (0 == opal_list_get_size (&mca_oob_ud_component.ud_devices)) {
|
|
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
|
"%s oob:ud:component_init no usable devices found.",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
|
return ORTE_ERROR;
|
|
}
|
|
|
|
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
|
"%s oob:ud:init initializing oob/openib. # of devices = %u",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
(unsigned int) opal_list_get_size (&mca_oob_ud_component.ud_devices));
|
|
|
|
for (item = opal_list_get_first (&mca_oob_ud_component.ud_devices);
|
|
item != opal_list_get_end (&mca_oob_ud_component.ud_devices);
|
|
item = opal_list_get_next (item)) {
|
|
mca_oob_ud_device_t *device = (mca_oob_ud_device_t *) item;
|
|
|
|
/* start monitoring the device for completions */
|
|
for (item2 = opal_list_get_first (&device->ports) ;
|
|
item2 != opal_list_get_end (&device->ports) ;
|
|
item2 = opal_list_get_next (item2)) {
|
|
mca_oob_ud_port_t *port = (mca_oob_ud_port_t *) item2;
|
|
|
|
rc = mca_oob_ud_listen_create (port);
|
|
if (0 != rc) {
|
|
continue;
|
|
}
|
|
|
|
rc = mca_oob_ud_port_alloc_buffers (port);
|
|
if (ORTE_SUCCESS != rc) {
|
|
mca_oob_ud_listen_destroy (port);
|
|
continue;
|
|
}
|
|
|
|
rc = opal_free_list_init (&port->data_qps,
|
|
sizeof (mca_oob_ud_qp_t),
|
|
OBJ_CLASS(mca_oob_ud_qp_t),
|
|
mca_oob_ud_component.ud_min_qp,
|
|
mca_oob_ud_component.ud_max_qp,
|
|
2);
|
|
if (OPAL_SUCCESS != rc) {
|
|
mca_oob_ud_listen_destroy (port);
|
|
continue;
|
|
}
|
|
|
|
rc = mca_oob_ud_port_recv_start (port);
|
|
if (ORTE_SUCCESS != rc) {
|
|
mca_oob_ud_listen_destroy (port);
|
|
continue;
|
|
}
|
|
|
|
/* NTH: only supports one port for now */
|
|
found_one = true;
|
|
|
|
/* NTH: since we only support one port start monitoring now */
|
|
mca_oob_ud_event_start_monitor (device);
|
|
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!found_one) {
|
|
return ORTE_ERR_NOT_FOUND;
|
|
}
|
|
|
|
/* have to call the module init here so we can test for available qpair */
|
|
if ((NULL != mca_oob_ud_module.api.init) && (ORTE_SUCCESS != (rc = mca_oob_ud_module.api.init()))){
|
|
return ORTE_ERROR;
|
|
}
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
static void mca_oob_ud_component_shutdown(void)
|
|
{
|
|
mca_oob_ud_peer_t *peer;
|
|
opal_list_item_t *item;
|
|
|
|
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
|
"%s oob:ud:fini entering",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
|
|
|
OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_lock);
|
|
OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_match_lock);
|
|
|
|
if (ORTE_VPID_INVALID != ORTE_PROC_MY_PARENT->vpid) {
|
|
if (ORTE_SUCCESS == mca_oob_ud_peer_lookup (ORTE_PROC_MY_PARENT, &peer) && NULL != peer) {
|
|
mca_oob_ud_peer_handle_end (peer);
|
|
}
|
|
}
|
|
|
|
/* abort active receives */
|
|
mca_oob_ud_cancel_all_in_list (&mca_oob_ud_component.ud_active_recvs);
|
|
mca_oob_ud_cancel_all_in_list (&mca_oob_ud_component.ud_active_sends);
|
|
|
|
mca_oob_ud_empty_list (&mca_oob_ud_component.ud_event_queued_reqs);
|
|
|
|
OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_match_lock);
|
|
|
|
if (NULL != mca_oob_ud_module.api.finalize) {
|
|
mca_oob_ud_module.api.finalize(&peer);
|
|
}
|
|
|
|
for (item = opal_list_get_first (&mca_oob_ud_component.ud_devices);
|
|
item != opal_list_get_end (&mca_oob_ud_component.ud_devices);
|
|
item = opal_list_get_next (item)) {
|
|
mca_oob_ud_event_stop_monitor ((mca_oob_ud_device_t *) item);
|
|
}
|
|
|
|
mca_oob_ud_empty_list (&mca_oob_ud_component.ud_devices);
|
|
OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_lock);
|
|
}
|
|
|
|
static char* mca_oob_ud_component_get_addr(void) {
|
|
/* NTH: qp_num - 32 bits (10), lid - 16 bits (5), port - 8 bits (3) + ud:// + 3 .'s + \0 = 27 chars */
|
|
char *contact_info = (char *) calloc(opal_list_get_size(&mca_oob_ud_component.ud_devices) * 27, 1);
|
|
char *ptr = contact_info;
|
|
opal_list_item_t *item, *port_item;
|
|
*ptr = 0;
|
|
|
|
for (item = opal_list_get_first (&mca_oob_ud_component.ud_devices) ;
|
|
item != opal_list_get_end (&mca_oob_ud_component.ud_devices) ;
|
|
item = opal_list_get_next (item)) {
|
|
|
|
mca_oob_ud_device_t *device = (mca_oob_ud_device_t *) item;
|
|
|
|
for (port_item = opal_list_get_first (&device->ports);
|
|
port_item != opal_list_get_end (&device->ports);
|
|
port_item = opal_list_get_next (port_item)) {
|
|
|
|
if (ptr != contact_info) {
|
|
ptr += sprintf (ptr, ";");
|
|
}
|
|
|
|
mca_oob_ud_port_get_uri ((mca_oob_ud_port_t *) port_item, ptr);
|
|
ptr += strlen (ptr);
|
|
}
|
|
}
|
|
|
|
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
|
"%s oob:ud:get_addr contact information: %s",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), contact_info);
|
|
|
|
return contact_info;
|
|
}
|
|
|
|
/*
 * mca_oob_ud_component_send_nb:
 *
 * Hand the message to the module's send_nb hook.
 *
 * Returns ORTE_SUCCESS once the hook has been called, or ORTE_ERROR when
 * the module has no send_nb hook.
 */
static int mca_oob_ud_component_send_nb(orte_rml_send_t *msg)
{
    if (NULL == mca_oob_ud_module.api.send_nb) {
        return ORTE_ERROR;
    }

    mca_oob_ud_module.api.send_nb(msg);

    return ORTE_SUCCESS;
}
|
|
|
|
/*
 * mca_oob_ud_component_set_addr:
 *
 * Pass each URI of a NULL-terminated array to the module's set_addr
 * hook, stopping at the first failure.
 *
 * @param peer  process the URIs belong to
 * @param uris  NULL-terminated array of URI strings (a NULL array is
 *              treated as empty)
 *
 * Returns ORTE_SUCCESS when every URI was accepted (or the module has no
 * set_addr hook), otherwise the error code of the first failing call.
 * ud_lock is held while the hook runs and is released on every path.
 */
static int mca_oob_ud_component_set_addr(orte_process_name_t *peer, char **uris)
{
    int rc = ORTE_SUCCESS;

    OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_lock);

    if (NULL != uris && NULL != mca_oob_ud_module.api.set_addr) {
        for (int i = 0; NULL != uris[i]; i++) {
            rc = mca_oob_ud_module.api.set_addr(peer, uris[i]);
            if (ORTE_SUCCESS != rc) {
                /* leave the loop rather than returning so that the lock
                 * below is always released */
                break;
            }
        }
    }

    OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_lock);

    return rc;
}
|
|
|
|
#if OPAL_ENABLE_FT_CR == 1
/*
 * mca_oob_ud_component_ft_event:
 *
 * Fault-tolerance event hook. The state argument is ignored and
 * ORTE_SUCCESS is always returned.
 */
static int mca_oob_ud_component_ft_event(int state)
{
    (void) state;

    return ORTE_SUCCESS;
}
#endif // OPAL_ENABLE_FT_CR
|
|
|
|
/*
 * mca_oob_ud_port_alloc_buffers:
 *
 * Allocate and register the port's GRH buffer (one struct ibv_grh per
 * receive buffer) and its message buffer (one MTU-sized slot per receive
 * and per send buffer), initialize the free list of send messages, and
 * bind each free message to its own slot of the message buffer.
 *
 * Returns ORTE_SUCCESS or the error code of the first failing step.
 */
static int mca_oob_ud_port_alloc_buffers (mca_oob_ud_port_t *port)
{
    int slot_count = mca_oob_ud_component.ud_recv_buffer_count +
        mca_oob_ud_component.ud_send_buffer_count;
    opal_list_item_t *cur;
    int rc, idx;

    rc = mca_oob_ud_alloc_reg_mem (port->device->ib_pd, &port->grh_buf,
                                   mca_oob_ud_component.ud_recv_buffer_count * sizeof (struct ibv_grh));
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    rc = mca_oob_ud_alloc_reg_mem (port->device->ib_pd, &port->msg_buf,
                                   slot_count * port->mtu);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    rc = opal_free_list_init (&port->free_msgs, sizeof (mca_oob_ud_msg_t),
                              OBJ_CLASS(mca_oob_ud_msg_t), mca_oob_ud_component.ud_send_buffer_count,
                              mca_oob_ud_component.ud_send_buffer_count, 0);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    /* send slots start after the ud_recv_buffer_count receive slots */
    idx = 0;
    cur = opal_list_get_first (&port->free_msgs.super);
    while (cur != opal_list_get_end (&port->free_msgs.super)) {
        char *slot = port->msg_buf.ptr + (idx + mca_oob_ud_component.ud_recv_buffer_count) *
            port->mtu;

        mca_oob_ud_msg_init ((mca_oob_ud_msg_t *) cur, port,
                             slot, port->msg_buf.mr);

        cur = opal_list_get_next (cur);
        ++idx;
    }

    return rc;
}
|
|
|
|
/*
 * mca_oob_ud_component_is_reachable:
 *
 * A peer counts as reachable when the routed framework returns a route
 * to it whose jobid and vpid are both valid. An unreachable peer is
 * reported through opal_output.
 */
static bool mca_oob_ud_component_is_reachable(orte_process_name_t *peer_name)
{
    /* if we have a route to this peer, then we can reach it */
    orte_process_name_t next_hop = orte_routed.get_route(peer_name);

    if (ORTE_JOBID_INVALID != next_hop.jobid &&
        ORTE_VPID_INVALID != next_hop.vpid) {
        return true;
    }

    opal_output (0, "%s oob:ud:component_is_reachable peer %s is unreachable",
                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer_name));

    return false;
}
|
|
|
|
/*
 * mca_oob_ud_port_construct:
 *
 * Constructor for mca_oob_ud_port_t. Zeroes every member that follows
 * the embedded super object, then constructs the data queue pair free
 * list, the free message list and the listen queue pair.
 */
static void mca_oob_ud_port_construct (mca_oob_ud_port_t *port)
{
    /* leave the super object alone: only the members after it are cleared */
    char *members = (char *) port + sizeof (port->super);

    memset(members, 0, sizeof (*port) - sizeof (port->super));

    OBJ_CONSTRUCT(&port->data_qps, opal_free_list_t);
    OBJ_CONSTRUCT(&port->free_msgs, opal_free_list_t);
    OBJ_CONSTRUCT(&port->listen_qp, opal_free_list_item_t);
}
|
|
|
|
/*
 * mca_oob_ud_port_destruct:
 *
 * Destructor for mca_oob_ud_port_t. Destroys the listen queue pair,
 * destructs the two free lists and finally releases the registered GRH
 * and message buffers.
 */
static void mca_oob_ud_port_destruct (mca_oob_ud_port_t *port)
{
    /* queue pairs and lists go first */
    (void) mca_oob_ud_listen_destroy (port);
    OBJ_DESTRUCT(&port->data_qps);
    OBJ_DESTRUCT(&port->free_msgs);

    /* then the registered memory */
    mca_oob_ud_free_reg_mem (&port->grh_buf);
    mca_oob_ud_free_reg_mem (&port->msg_buf);
}
|
|
|
|
/* Class definition for mca_oob_ud_port_t, derived from opal_list_item_t */
OBJ_CLASS_INSTANCE(mca_oob_ud_port_t,
                   opal_list_item_t,
                   mca_oob_ud_port_construct,
                   mca_oob_ud_port_destruct);
|
|
|
|
/*
 * mca_oob_ud_listen_create:
 *
 * Initialize the port's listen queue pair on the completion channel of
 * the device that owns the port. Returns the result of
 * mca_oob_ud_qp_init.
 */
static int mca_oob_ud_listen_create (mca_oob_ud_port_t *port)
{
    return mca_oob_ud_qp_init (&port->listen_qp, port, port->device->ib_channel, NULL, false);
}
|
|
|
|
/* mca_oob_ud_listen_destroy:
|
|
*
|
|
* Destory the listen queue pair associated with a port.
|
|
*/
|
|
static int mca_oob_ud_listen_destroy (mca_oob_ud_port_t *port)
|
|
{
|
|
if (NULL == port || NULL == port->listen_qp.ib_qp) {
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
OBJ_DESTRUCT(&port->listen_qp);
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
/*
 * mca_oob_ud_port_recv_start:
 *
 * Move the port's listen queue pair to the ready-to-send state, post
 * ud_recv_buffer_count receives on the port and request completion
 * notifications on the listen queue pair's receive completion queue.
 *
 * Returns ORTE_SUCCESS, the error code of the failing step, or
 * ORTE_ERROR when notifications cannot be requested.
 */
static inline int mca_oob_ud_port_recv_start (mca_oob_ud_port_t *port)
{
    int i, rc;

    rc = mca_oob_ud_qp_to_rts (&port->listen_qp);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    opal_output_verbose(5, orte_oob_base_framework.framework_output,
                        "%s oob:ud:port_recv_start posting "
                        "%d message buffers", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        mca_oob_ud_component.ud_recv_buffer_count);

    for (i = 0 ; i < mca_oob_ud_component.ud_recv_buffer_count ; ++i) {
        rc = mca_oob_ud_port_post_one_recv (port, i);
        if (ORTE_SUCCESS != rc) {
            return rc;
        }
    }

    rc = ibv_req_notify_cq (port->listen_qp.ib_recv_cq, 0);
    if (0 != rc) {
        opal_output (0, "%s oob:ud:port_recv_start error requesting completion "
                     "notifications. rc = %d, errno = %d",
                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc, errno);
        return ORTE_ERROR;
    }

    return ORTE_SUCCESS;
}
|
|
|
|
/*
 * mca_oob_ud_alloc_reg_mem:
 *
 * Allocate a page-aligned, zero-filled buffer of buffer_len bytes and
 * register it with the given protection domain for local write access.
 *
 * @param pd          protection domain to register the memory with
 * @param reg_mem     descriptor that receives the buffer, its length and
 *                    its memory region
 * @param buffer_len  size of the buffer in bytes
 *
 * Returns ORTE_SUCCESS on success, ORTE_ERR_OUT_OF_RESOURCE when the
 * buffer cannot be allocated and ORTE_ERROR when registration fails. On
 * failure no buffer is left attached to reg_mem.
 */
static inline int mca_oob_ud_alloc_reg_mem (struct ibv_pd *pd, mca_oob_ud_reg_mem_t *reg_mem,
                                            const int buffer_len)
{
    void *buffer = NULL;
    int rc;

    reg_mem->len = buffer_len;
    reg_mem->ptr = NULL;
    reg_mem->mr  = NULL;

    opal_output_verbose(5, orte_oob_base_framework.framework_output,
                        "%s oob:ud:alloc_reg_mem allocing and registering %d bytes of memory with pd %p",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), buffer_len, (void *) pd);

    /* posix_memalign reports failure through its return value, not errno.
     * Going through a void * local avoids casting the address of a typed
     * pointer to void **. */
    rc = posix_memalign (&buffer, sysconf(_SC_PAGESIZE), buffer_len);
    if (0 != rc || NULL == buffer) {
        opal_output (0, "%s oob:ud:alloc_reg_mem malloc failed! errno = %d",
                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    memset (buffer, 0, buffer_len);
    reg_mem->ptr = buffer;

    reg_mem->mr = ibv_reg_mr (pd, reg_mem->ptr, buffer_len, IBV_ACCESS_LOCAL_WRITE);
    if (NULL == reg_mem->mr) {
        opal_output (0, "%s oob:ud:alloc_reg_mem failed to register memory. errno = %d",
                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
        /* do not hand back an unregistered buffer */
        free (buffer);
        reg_mem->ptr = NULL;
        return ORTE_ERROR;
    }

    return ORTE_SUCCESS;
}
|
|
|
|
/*
 * mca_oob_ud_free_reg_mem:
 *
 * Deregister the memory region (if any) and free the buffer (if any)
 * held by reg_mem, then zero the descriptor.
 */
static inline void mca_oob_ud_free_reg_mem (mca_oob_ud_reg_mem_t *reg_mem)
{
    /* deregister before the underlying buffer goes away */
    if (NULL != reg_mem->mr) {
        (void) ibv_dereg_mr (reg_mem->mr);
    }

    if (NULL != reg_mem->ptr) {
        free (reg_mem->ptr);
    }

    memset (reg_mem, 0, sizeof (mca_oob_ud_reg_mem_t));
}
|
|
|
|
/*
 * mca_oob_ud_cancel_all_in_list:
 *
 * Drain a list of requests: each request is removed from the list, its
 * req_list back-pointer is cleared, and it is aborted.
 */
static void mca_oob_ud_cancel_all_in_list (opal_list_t *list)
{
    mca_oob_ud_req_t *req;

    for (req = (mca_oob_ud_req_t *) opal_list_remove_first (list) ;
         NULL != req ;
         req = (mca_oob_ud_req_t *) opal_list_remove_first (list)) {
        /* already unlinked above, so clear the back-pointer before aborting */
        req->req_list = NULL;
        mca_oob_ud_req_abort (req);
    }
}
|
|
|
|
/*
 * mca_oob_ud_empty_list:
 *
 * Remove every item from the list, releasing one reference on each.
 */
static void mca_oob_ud_empty_list (opal_list_t *list)
{
    opal_list_item_t *entry;

    for (entry = opal_list_remove_first (list) ;
         NULL != entry ;
         entry = opal_list_remove_first (list)) {
        OBJ_RELEASE(entry);
    }
}
|
|
|
|
/*
 * mca_oob_ud_device_construct:
 *
 * Constructor for mca_oob_ud_device_t. Zeroes every member that follows
 * the embedded super object and constructs the list of ports.
 */
static void mca_oob_ud_device_construct (mca_oob_ud_device_t *device)
{
    /* leave the super object alone: only the members after it are cleared */
    char *members = (char *) device + sizeof (device->super);

    memset(members, 0, sizeof (*device) - sizeof (device->super));

    OBJ_CONSTRUCT(&device->ports, opal_list_t);
}
|
|
|
|
/*
 * mca_oob_ud_device_destruct:
 *
 * Destructor for mca_oob_ud_device_t. Releases every port on the
 * device, then deallocates the protection domain, destroys the
 * completion channel and closes the device context (each only if it was
 * set), destructs the port list and zeroes the device's own members.
 */
static void mca_oob_ud_device_destruct (mca_oob_ud_device_t *device)
{
    opal_list_item_t *item;

    /* ports go first, while the verbs objects below still exist */
    while (NULL != (item = opal_list_remove_first (&device->ports))) {
        OBJ_RELEASE(item);
    }

    if (device->ib_pd) {
        (void) ibv_dealloc_pd (device->ib_pd);
    }

    if (device->ib_channel) {
        (void) ibv_destroy_comp_channel (device->ib_channel);
    }

    if (device->ib_context) {
        (void) ibv_close_device (device->ib_context);
    }

    OBJ_DESTRUCT(&device->ports);

    /* Mirror the constructor: clear only the members after the embedded
     * super object, leaving the base object untouched. */
    memset ((char *) device + sizeof (device->super), 0,
            sizeof (*device) - sizeof (device->super));
}
|
|
|
|
OBJ_CLASS_INSTANCE(mca_oob_ud_device_t, opal_list_item_t,
|
|
mca_oob_ud_device_construct,
|
|
mca_oob_ud_device_destruct);
|
|
|
|
OBJ_CLASS_INSTANCE(mca_oob_ud_msg_op_t,
|
|
opal_object_t,
|
|
NULL, NULL);
|
|
|
|
OBJ_CLASS_INSTANCE(mca_oob_ud_ping_t,
|
|
opal_object_t,
|
|
NULL, NULL);
|