1
1

OPENIB BTL/CONNECT: Add support for AF_IB addressing in rdmacm.

This commit was SVN r30875.
Этот коммит содержится в:
Vasily Filipov 2014-02-27 11:29:47 +00:00
родитель e466fee747
Коммит f2014b96e7
3 изменённых файлов: 309 добавлений и 34 удалений

Просмотреть файл

@ -283,6 +283,10 @@ AC_DEFUN([OMPI_CHECK_OPENFABRICS_CM_ARGS],[
AC_ARG_ENABLE([openib-rdmacm], AC_ARG_ENABLE([openib-rdmacm],
[AC_HELP_STRING([--enable-openib-rdmacm], [AC_HELP_STRING([--enable-openib-rdmacm],
[Enable Open Fabrics RDMACM support in openib BTL (default: enabled)])]) [Enable Open Fabrics RDMACM support in openib BTL (default: enabled)])])
dnl Requesting IB addressing implies rdmacm itself.  The third argument is
dnl run for both the enable and the disable spelling of the option, so
dnl rdmacm is only switched on when the option was really enabled.
AC_ARG_ENABLE([openib-rdmacm-ibaddr],
    [AC_HELP_STRING([--enable-openib-rdmacm-ibaddr],
        [Enable Open Fabrics RDMACM with IB addressing support in openib BTL (default: disabled)])],
    [if test "x$enableval" = "xyes"; then enable_openib_rdmacm=yes; fi])
])dnl ])dnl
AC_DEFUN([OMPI_CHECK_OPENFABRICS_CM],[ AC_DEFUN([OMPI_CHECK_OPENFABRICS_CM],[

Просмотреть файл

@ -71,6 +71,42 @@ AC_DEFUN([MCA_ompi_btl_openib_CONFIG],[
if test "x$btl_openib_have_rdmacm" = "x1" -a \ if test "x$btl_openib_have_rdmacm" = "x1" -a \
"$have_threads" != "none"; then "$have_threads" != "none"; then
cpcs="$cpcs rdmacm" cpcs="$cpcs rdmacm"
if test "$enable_openib_rdmacm_ibaddr" = "yes"; then
    dnl Probe at configure time whether rsockets can open an AF_IB socket.
    dnl BTL_OPENIB_RDMACM_IB_ADDR is defined only when the probe program
    dnl runs successfully.  LDFLAGS and LIBS are extended for the probe
    dnl and restored afterwards.
    LDFLAGS_save="$LDFLAGS"
    LIBS_save="$LIBS"
    LDFLAGS="$LDFLAGS $btl_openib_LDFLAGS"
    LIBS="$LIBS $btl_openib_LIBS"
    AC_LANG(C)
    AC_MSG_CHECKING([if rsockets supports AF_IB addressing])
    AC_RUN_IFELSE(
        [AC_LANG_PROGRAM(
            [
#include <stdio.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/types.h>
#include <rdma/rsocket.h>
#include <infiniband/ib.h>
            ],
            [
                int rsock;
                rsock = rsocket(AF_IB, SOCK_STREAM, 0);
                if (rsock < 0) {
                    return -1;
                }
                rclose(rsock);
            ]
        )],
        [ AC_MSG_RESULT([yes])
          AC_DEFINE(BTL_OPENIB_RDMACM_IB_ADDR, 1, [rdmacm with rsockets support])
        ],
        [ AC_MSG_RESULT([no])
          AC_MSG_WARN([rsockets cannot open an AF_IB socket; rdmacm IB addressing support is disabled.])
        ],
        [ AC_MSG_RESULT([no (cross compiling, cannot run the test)])
          AC_MSG_WARN([cannot probe AF_IB support when cross compiling; rdmacm IB addressing support is disabled.])
        ]
    )
    LDFLAGS="$LDFLAGS_save"
    LIBS="$LIBS_save"
fi
fi fi
if test "x$btl_openib_have_udcm" = "x1" -a \ if test "x$btl_openib_have_udcm" = "x1" -a \
"$have_threads" != "none"; then "$have_threads" != "none"; then

Просмотреть файл

@ -58,6 +58,10 @@
#include "btl_openib_ip.h" #include "btl_openib_ip.h"
#include "btl_openib_ini.h" #include "btl_openib_ini.h"
#if BTL_OPENIB_RDMACM_IB_ADDR
#include <infiniband/ib.h>
#endif
#define mymin(a, b) ((a) < (b) ? (a) : (b)) #define mymin(a, b) ((a) < (b) ? (a) : (b))
static void rdmacm_component_register(void); static void rdmacm_component_register(void);
@ -85,8 +89,13 @@ typedef struct {
/* Dummy QP only used when we expect the connection to be /* Dummy QP only used when we expect the connection to be
rejected */ rejected */
struct ibv_cq *dummy_cq; struct ibv_cq *dummy_cq;
#if BTL_OPENIB_RDMACM_IB_ADDR
union ibv_gid gid;
uint64_t service_id;
#else
uint32_t ipaddr; uint32_t ipaddr;
uint16_t tcp_port; uint16_t tcp_port;
#endif
/* server==false means that this proc initiated the connection; /* server==false means that this proc initiated the connection;
server==true means that this proc accepted the incoming server==true means that this proc accepted the incoming
connection. Note that this may be different than the "one way" connection. Note that this may be different than the "one way"
@ -114,8 +123,13 @@ OBJ_CLASS_INSTANCE(rdmacm_contents_t, opal_list_item_t,
typedef struct { typedef struct {
int device_max_qp_rd_atom; int device_max_qp_rd_atom;
int device_max_qp_init_rd_atom; int device_max_qp_init_rd_atom;
#if BTL_OPENIB_RDMACM_IB_ADDR
uint8_t gid[16];
uint64_t service_id;
#else
uint32_t ipaddr; uint32_t ipaddr;
uint16_t tcp_port; uint16_t tcp_port;
#endif
uint8_t end; uint8_t end;
} modex_message_t; } modex_message_t;
@ -147,10 +161,15 @@ OBJ_CLASS_INSTANCE(id_context_t, opal_list_item_t,
typedef struct { typedef struct {
uint32_t rem_index; uint32_t rem_index;
#if BTL_OPENIB_RDMACM_IB_ADDR
uint64_t rem_port;
#else
uint16_t rem_port; uint16_t rem_port;
#endif
uint8_t qpnum; uint8_t qpnum;
} private_data_t; } private_data_t;
#ifndef BTL_OPENIB_RDMACM_IB_ADDR
/* Used to send a specific show_help message from the service_thread /* Used to send a specific show_help message from the service_thread
to the main thread (because we can't call show_help from the to the main thread (because we can't call show_help from the
service_thread) */ service_thread) */
@ -159,6 +178,7 @@ typedef struct {
uint32_t peer_ip_addr; uint32_t peer_ip_addr;
uint32_t peer_tcp_port; uint32_t peer_tcp_port;
} cant_find_endpoint_context_t; } cant_find_endpoint_context_t;
#endif
static opal_list_t server_listener_list; static opal_list_t server_listener_list;
static opal_list_t client_list; static opal_list_t client_list;
@ -166,7 +186,11 @@ static opal_mutex_t client_list_lock;
static struct rdma_event_channel *event_channel = NULL; static struct rdma_event_channel *event_channel = NULL;
static int rdmacm_priority = 30; static int rdmacm_priority = 30;
static unsigned int rdmacm_port = 0; static unsigned int rdmacm_port = 0;
#ifndef BTL_OPENIB_RDMACM_IB_ADDR
static uint32_t rdmacm_addr = 0; static uint32_t rdmacm_addr = 0;
#endif
static int rdmacm_resolve_timeout = 30000; static int rdmacm_resolve_timeout = 30000;
static int rdmacm_resolve_max_retry_count = 20; static int rdmacm_resolve_max_retry_count = 20;
static bool rdmacm_reject_causes_connect_error = false; static bool rdmacm_reject_causes_connect_error = false;
@ -209,8 +233,12 @@ static void rdmacm_contents_constructor(rdmacm_contents_t *contents)
contents->endpoint = NULL; contents->endpoint = NULL;
contents->openib_btl = NULL; contents->openib_btl = NULL;
contents->dummy_cq = NULL; contents->dummy_cq = NULL;
#if BTL_OPENIB_RDMACM_IB_ADDR
contents->service_id = 0;
#else
contents->ipaddr = 0; contents->ipaddr = 0;
contents->tcp_port = 0; contents->tcp_port = 0;
#endif
contents->server = false; contents->server = false;
contents->on_client_list = false; contents->on_client_list = false;
OBJ_CONSTRUCT(&(contents->ids), opal_list_t); OBJ_CONSTRUCT(&(contents->ids), opal_list_t);
@ -327,13 +355,23 @@ static char *stringify(uint32_t addr)
*/ */
static mca_btl_openib_endpoint_t *rdmacm_find_endpoint(rdmacm_contents_t *contents, static mca_btl_openib_endpoint_t *rdmacm_find_endpoint(rdmacm_contents_t *contents,
struct rdma_cm_id *id, struct rdma_cm_id *id,
#if BTL_OPENIB_RDMACM_IB_ADDR
uint64_t rem_port)
#else
uint16_t rem_port) uint16_t rem_port)
#endif
{ {
int i; int i;
mca_btl_openib_endpoint_t *ep = NULL; mca_btl_openib_endpoint_t *ep = NULL;
opal_pointer_array_t *endpoints = contents->openib_btl->device->endpoints; opal_pointer_array_t *endpoints = contents->openib_btl->device->endpoints;
struct sockaddr *peeraddr = rdma_get_peer_addr(id); struct sockaddr *peeraddr = rdma_get_peer_addr(id);
uint32_t peeripaddr = ((struct sockaddr_in *)peeraddr)->sin_addr.s_addr; #if BTL_OPENIB_RDMACM_IB_ADDR
union ibv_gid *ep_gid, peer_gid;
memcpy(peer_gid.raw, ((struct sockaddr_ib *) peeraddr)->sib_addr.sib_raw, sizeof peer_gid);
#else
uint32_t peeripaddr = ((struct sockaddr_in *) peeraddr)->sin_addr.s_addr;
#if OPAL_ENABLE_DEBUG #if OPAL_ENABLE_DEBUG
char *a; char *a;
#endif #endif
@ -343,6 +381,8 @@ static mca_btl_openib_endpoint_t *rdmacm_find_endpoint(rdmacm_contents_t *conten
#if OPAL_ENABLE_DEBUG #if OPAL_ENABLE_DEBUG
free(a); free(a);
#endif #endif
#endif
for (i = 0; i < opal_pointer_array_get_size(endpoints); i++) { for (i = 0; i < opal_pointer_array_get_size(endpoints); i++) {
mca_btl_openib_endpoint_t *endpoint; mca_btl_openib_endpoint_t *endpoint;
modex_message_t *message; modex_message_t *message;
@ -353,12 +393,22 @@ static mca_btl_openib_endpoint_t *rdmacm_find_endpoint(rdmacm_contents_t *conten
} }
message = (modex_message_t *) endpoint->endpoint_remote_cpc_data->cbm_modex_message; message = (modex_message_t *) endpoint->endpoint_remote_cpc_data->cbm_modex_message;
#ifndef BTL_OPENIB_RDMACM_IB_ADDR
OPAL_OUTPUT((-1, "message ipaddr = %s port %d", OPAL_OUTPUT((-1, "message ipaddr = %s port %d",
a = stringify(message->ipaddr), message->tcp_port)); a = stringify(message->ipaddr), message->tcp_port));
#if OPAL_ENABLE_DEBUG #if OPAL_ENABLE_DEBUG
free(a); free(a);
#endif #endif
#endif
#if BTL_OPENIB_RDMACM_IB_ADDR
ep_gid = (union ibv_gid *) message->gid;
if (ep_gid->global.interface_id == peer_gid.global.interface_id &&
ep_gid->global.subnet_prefix == peer_gid.global.subnet_prefix &&
message->service_id == rem_port) {
#else
if (message->ipaddr == peeripaddr && message->tcp_port == rem_port) { if (message->ipaddr == peeripaddr && message->tcp_port == rem_port) {
#endif
ep = endpoint; ep = endpoint;
break; break;
} }
@ -495,34 +545,93 @@ out:
* IP addresses are the same (i.e., the MPI procs are on the same * IP addresses are the same (i.e., the MPI procs are on the same
* node), then the process with the lower TCP port wins. * node), then the process with the lower TCP port wins.
*/ */
static bool i_initiate(uint32_t local_ipaddr, uint16_t local_port, static bool i_initiate(uint64_t local_port, uint64_t remote_port,
uint32_t remote_ipaddr, uint16_t remote_port) #if BTL_OPENIB_RDMACM_IB_ADDR
union ibv_gid *local_gid, union ibv_gid *remote_gid)
{
#else
uint32_t local_ipaddr, uint32_t remote_ipaddr)
{ {
#if OPAL_ENABLE_DEBUG #if OPAL_ENABLE_DEBUG
char *a = stringify(local_ipaddr); char *a = stringify(local_ipaddr);
char *b = stringify(remote_ipaddr); char *b = stringify(remote_ipaddr);
#endif #endif
#endif
#if BTL_OPENIB_RDMACM_IB_ADDR
if (local_gid->global.subnet_prefix < remote_gid->global.subnet_prefix ||
(local_gid->global.subnet_prefix == remote_gid->global.subnet_prefix &&
local_gid->global.interface_id < remote_gid->global.interface_id) ||
(local_gid->global.subnet_prefix == remote_gid->global.subnet_prefix &&
local_gid->global.interface_id == remote_gid->global.interface_id &&
#else
if (local_ipaddr > remote_ipaddr || if (local_ipaddr > remote_ipaddr ||
(local_ipaddr == remote_ipaddr && local_port < remote_port)) { (local_ipaddr == remote_ipaddr &&
#endif
local_port < remote_port)) {
#ifndef BTL_OPENIB_RDMACM_IB_ADDR
OPAL_OUTPUT((-1, "i_initiate (I WIN): local ipaddr %s, remote ipaddr %s", OPAL_OUTPUT((-1, "i_initiate (I WIN): local ipaddr %s, remote ipaddr %s",
a, b)); a, b));
#if OPAL_ENABLE_DEBUG #if OPAL_ENABLE_DEBUG
free(a); free(a);
free(b); free(b);
#endif #endif
return true;
} else {
OPAL_OUTPUT((-1, "i_initiate (I lose): local ipaddr %s, remote ipaddr %s",
a, b));
#if OPAL_ENABLE_DEBUG
free(a);
free(b);
#endif #endif
return false; return true;
} }
#ifndef BTL_OPENIB_RDMACM_IB_ADDR
OPAL_OUTPUT((-1, "i_initiate (I lose): local ipaddr %s, remote ipaddr %s",
a, b));
#if OPAL_ENABLE_DEBUG
free(a);
free(b);
#endif
#endif
return false;
} }
#if BTL_OPENIB_RDMACM_IB_ADDR
/*
 * Look up the rdma_addrinfo needed to bind (server) or resolve (client)
 * an AF_IB address.
 *
 * src        numeric source address string; always looked up first, with
 *            RAI_PASSIVE set
 * dst        numeric destination address string; only consulted when
 *            server is zero
 * rdma_addr  out: on OMPI_SUCCESS receives the lookup result, which the
 *            caller releases with rdma_freeaddrinfo(); not written on
 *            failure
 * server     non-zero: hand back the source lookup as is;
 *            zero: hand back the destination lookup, which is performed
 *            with the resolved source address filled into the hints
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR when either rdma_getaddrinfo()
 * call fails.
 */
static int get_rdma_addr(char *src, char *dst,
                         struct rdma_addrinfo **rdma_addr,
                         int server)
{
    struct rdma_addrinfo criteria;
    struct rdma_addrinfo *local_info;
    struct rdma_addrinfo *remote_info;
    int status;

    memset(&criteria, 0, sizeof(criteria));
    criteria.ai_flags = RAI_NUMERICHOST | RAI_FAMILY | RAI_PASSIVE;
    criteria.ai_family = AF_IB;
    criteria.ai_port_space = RDMA_PS_TCP;

    if (0 != rdma_getaddrinfo(src, NULL, &criteria, &local_info)) {
        return OMPI_ERROR;
    }

    /* A server only needs its own (source) address. */
    if (server) {
        *rdma_addr = local_info;
        return OMPI_SUCCESS;
    }

    /* Client: resolve the destination relative to the source just found. */
    criteria.ai_flags &= ~RAI_PASSIVE;
    criteria.ai_src_addr = local_info->ai_src_addr;
    criteria.ai_src_len = local_info->ai_src_len;
    status = rdma_getaddrinfo(dst, NULL, &criteria, &remote_info);

    /* The source lookup is released whether or not the destination
       lookup succeeded. */
    rdma_freeaddrinfo(local_info);

    if (0 != status) {
        return OMPI_ERROR;
    }

    *rdma_addr = remote_info;
    return OMPI_SUCCESS;
}
#endif
/* /*
* Invoked by main thread * Invoked by main thread
*/ */
@ -530,11 +639,17 @@ static int rdmacm_client_connect_one(rdmacm_contents_t *contents,
modex_message_t *message, modex_message_t *message,
int num) int num)
{ {
struct sockaddr_in src_in, dest_in;
id_context_t *context;
int rc; int rc;
id_context_t *context;
#if BTL_OPENIB_RDMACM_IB_ADDR
/* inet_ntop(AF_INET6, ...) may need up to INET6_ADDRSTRLEN bytes; with a
   smaller buffer it fails for GIDs whose text form is long */
char src_addr[INET6_ADDRSTRLEN], dst_addr[INET6_ADDRSTRLEN];
struct rdma_addrinfo *rdma_addr;
#else
struct sockaddr_in src_in, dest_in;
#if OPAL_ENABLE_DEBUG #if OPAL_ENABLE_DEBUG
char *a, *b; char *a, *b;
#endif
#endif #endif
/* We'll need to access some data in the event handler. We can /* We'll need to access some data in the event handler. We can
@ -558,7 +673,7 @@ static int rdmacm_client_connect_one(rdmacm_contents_t *contents,
BTL_ERROR(("Failed to create a rdma id with %d", rc)); BTL_ERROR(("Failed to create a rdma id with %d", rc));
goto out1; goto out1;
} }
#ifndef BTL_OPENIB_RDMACM_IB_ADDR
/* Source address (we must specify this to ensure that the traffic /* Source address (we must specify this to ensure that the traffic
goes out on the device+port that we expect it go out). */ goes out on the device+port that we expect it go out). */
memset(&src_in, 0, sizeof(src_in)); memset(&src_in, 0, sizeof(src_in));
@ -585,7 +700,7 @@ static int rdmacm_client_connect_one(rdmacm_contents_t *contents,
free(a); free(a);
free(b); free(b);
#endif #endif
#endif
/* This is odd and worth explaining: when we place the context on /* This is odd and worth explaining: when we place the context on
the ids list, we need to add an extra RETAIN to the context. the ids list, we need to add an extra RETAIN to the context.
The reason is because of a race condition. Let's explain The reason is because of a race condition. Let's explain
@ -623,15 +738,46 @@ static int rdmacm_client_connect_one(rdmacm_contents_t *contents,
*/ */
OBJ_RETAIN(context); OBJ_RETAIN(context);
opal_list_append(&(contents->ids), &(context->super)); opal_list_append(&(contents->ids), &(context->super));
#if BTL_OPENIB_RDMACM_IB_ADDR
if (NULL == inet_ntop(AF_INET6, contents->gid.raw,
src_addr, sizeof src_addr)) {
BTL_ERROR(("local addr string creating fail"));
goto out1;
}
if (NULL == inet_ntop(AF_INET6, message->gid,
dst_addr, sizeof dst_addr)) {
BTL_ERROR(("remote addr string creating fail"));
goto out1;
}
rc = get_rdma_addr(src_addr, dst_addr, &rdma_addr, 0);
if (OMPI_SUCCESS != rc) {
BTL_ERROR(("server: create rdma addr error"));
goto out1;
}
((struct sockaddr_ib *) (rdma_addr->ai_dst_addr))->sib_sid = message->service_id;
#endif
rc = rdma_resolve_addr(context->id, rc = rdma_resolve_addr(context->id,
#if BTL_OPENIB_RDMACM_IB_ADDR
rdma_addr->ai_src_addr,
rdma_addr->ai_dst_addr,
#else
(struct sockaddr *) &src_in, (struct sockaddr *) &src_in,
(struct sockaddr *) &dest_in, (struct sockaddr *) &dest_in,
#endif
rdmacm_resolve_timeout); rdmacm_resolve_timeout);
if (0 != rc) { if (0 != rc) {
BTL_ERROR(("Failed to resolve the remote address with %d", rc)); BTL_ERROR(("Failed to resolve the remote address with %d", rc));
#if BTL_OPENIB_RDMACM_IB_ADDR
rdma_freeaddrinfo(rdma_addr);
#endif
goto out1; goto out1;
} }
#if BTL_OPENIB_RDMACM_IB_ADDR
rdma_freeaddrinfo(rdma_addr);
#endif
return OMPI_SUCCESS; return OMPI_SUCCESS;
@ -655,10 +801,11 @@ static int rdmacm_module_start_connect(ompi_btl_openib_connect_base_module_t *cp
modex_message_t *message, *local_message; modex_message_t *message, *local_message;
int rc, qp; int rc, qp;
opal_list_item_t *item; opal_list_item_t *item;
#ifndef BTL_OPENIB_RDMACM_IB_ADDR
#if OPAL_ENABLE_DEBUG #if OPAL_ENABLE_DEBUG
char *a, *b; char *a, *b;
#endif #endif
#endif
/* Don't use the CPC to get the message, because this function is /* Don't use the CPC to get the message, because this function is
invoked from the event_handler (to initiate connections in the invoked from the event_handler (to initiate connections in the
Right direction), where we don't have the CPC, so it'll be Right direction), where we don't have the CPC, so it'll be
@ -667,7 +814,7 @@ static int rdmacm_module_start_connect(ompi_btl_openib_connect_base_module_t *cp
(modex_message_t *) endpoint->endpoint_local_cpc->data.cbm_modex_message; (modex_message_t *) endpoint->endpoint_local_cpc->data.cbm_modex_message;
message = (modex_message_t *) message = (modex_message_t *)
endpoint->endpoint_remote_cpc_data->cbm_modex_message; endpoint->endpoint_remote_cpc_data->cbm_modex_message;
#ifndef BTL_OPENIB_RDMACM_IB_ADDR
OPAL_OUTPUT((-1, "Connecting from IP %s:%d to remote IP %s:%d ep state = %d", OPAL_OUTPUT((-1, "Connecting from IP %s:%d to remote IP %s:%d ep state = %d",
a = stringify(local_message->ipaddr), local_message->tcp_port, a = stringify(local_message->ipaddr), local_message->tcp_port,
b = stringify(message->ipaddr), message->tcp_port, endpoint->endpoint_state)); b = stringify(message->ipaddr), message->tcp_port, endpoint->endpoint_state));
@ -677,7 +824,7 @@ static int rdmacm_module_start_connect(ompi_btl_openib_connect_base_module_t *cp
#endif #endif
BTL_VERBOSE(("Connecting to remote ip addr = %x, port = %d ep state = %d", BTL_VERBOSE(("Connecting to remote ip addr = %x, port = %d ep state = %d",
message->ipaddr, message->tcp_port, endpoint->endpoint_state)); message->ipaddr, message->tcp_port, endpoint->endpoint_state));
#endif
if (MCA_BTL_IB_CONNECTED == endpoint->endpoint_state || if (MCA_BTL_IB_CONNECTED == endpoint->endpoint_state ||
MCA_BTL_IB_CONNECTING == endpoint->endpoint_state || MCA_BTL_IB_CONNECTING == endpoint->endpoint_state ||
MCA_BTL_IB_CONNECT_ACK == endpoint->endpoint_state) { MCA_BTL_IB_CONNECT_ACK == endpoint->endpoint_state) {
@ -705,14 +852,25 @@ static int rdmacm_module_start_connect(ompi_btl_openib_connect_base_module_t *cp
* is being connected from, in the case where there are multiple * is being connected from, in the case where there are multiple
* listeners on the local system. * listeners on the local system.
*/ */
#if BTL_OPENIB_RDMACM_IB_ADDR
memcpy(contents->gid.raw, local_message->gid, sizeof(contents->gid));
contents->service_id = local_message->service_id;
#else
contents->ipaddr = local_message->ipaddr; contents->ipaddr = local_message->ipaddr;
contents->tcp_port = local_message->tcp_port; contents->tcp_port = local_message->tcp_port;
#endif
/* Are we the initiator? Or do we expect this connect request to /* Are we the initiator? Or do we expect this connect request to
be rejected? */ be rejected? */
endpoint->endpoint_initiator = endpoint->endpoint_initiator =
i_initiate(contents->ipaddr, contents->tcp_port, i_initiate(
message->ipaddr, message->tcp_port); #if BTL_OPENIB_RDMACM_IB_ADDR
contents->service_id, message->service_id,
&contents->gid, (union ibv_gid *) message->gid);
#else
contents->tcp_port, message->tcp_port,
contents->ipaddr, message->ipaddr);
#endif
OPAL_OUTPUT((-1, "MAIN Start connect; ep=%p (%p), I %s the initiator to %s", OPAL_OUTPUT((-1, "MAIN Start connect; ep=%p (%p), I %s the initiator to %s",
(void*) endpoint, (void*) endpoint,
(void*) endpoint->endpoint_local_cpc, (void*) endpoint->endpoint_local_cpc,
@ -760,6 +918,7 @@ out:
return rc; return rc;
} }
#ifndef BTL_OPENIB_RDMACM_IB_ADDR
static void *show_help_cant_find_endpoint(void *context) static void *show_help_cant_find_endpoint(void *context)
{ {
char *msg; char *msg;
@ -786,6 +945,7 @@ static void *show_help_cant_find_endpoint(void *context)
mca_btl_openib_endpoint_invoke_error(NULL); mca_btl_openib_endpoint_invoke_error(NULL);
return NULL; return NULL;
} }
#endif
/* /*
* Invoked by service thread * Invoked by service thread
@ -809,7 +969,11 @@ static int handle_connect_request(struct rdma_cm_event *event)
private_data_t msg; private_data_t msg;
int rc = -1, qpnum; int rc = -1, qpnum;
uint32_t rem_index; uint32_t rem_index;
#if BTL_OPENIB_RDMACM_IB_ADDR
uint64_t rem_port;
#else
uint16_t rem_port; uint16_t rem_port;
#endif
qpnum = ((private_data_t *)event->param.conn.private_data)->qpnum; qpnum = ((private_data_t *)event->param.conn.private_data)->qpnum;
rem_port = ((private_data_t *)event->param.conn.private_data)->rem_port; rem_port = ((private_data_t *)event->param.conn.private_data)->rem_port;
@ -819,6 +983,7 @@ static int handle_connect_request(struct rdma_cm_event *event)
to; use the listener's context->contents to figure it out */ to; use the listener's context->contents to figure it out */
endpoint = rdmacm_find_endpoint(contents, event->id, rem_port); endpoint = rdmacm_find_endpoint(contents, event->id, rem_port);
if (NULL == endpoint) { if (NULL == endpoint) {
#ifndef BTL_OPENIB_RDMACM_IB_ADDR
struct sockaddr *peeraddr = rdma_get_peer_addr(event->id); struct sockaddr *peeraddr = rdma_get_peer_addr(event->id);
cant_find_endpoint_context_t *c = (cant_find_endpoint_context_t *) calloc(1, sizeof(*c)); cant_find_endpoint_context_t *c = (cant_find_endpoint_context_t *) calloc(1, sizeof(*c));
if (NULL != c) { if (NULL != c) {
@ -831,18 +996,25 @@ static int handle_connect_request(struct rdma_cm_event *event)
c->peer_tcp_port = rdma_get_dst_port(event->id); c->peer_tcp_port = rdma_get_dst_port(event->id);
} }
ompi_btl_openib_fd_run_in_main(show_help_cant_find_endpoint, c); ompi_btl_openib_fd_run_in_main(show_help_cant_find_endpoint, c);
#else
BTL_ERROR(("Cannot find endpoint."));
#endif
goto out; goto out;
} }
message = (modex_message_t *) endpoint->endpoint_remote_cpc_data->cbm_modex_message; message = (modex_message_t *) endpoint->endpoint_remote_cpc_data->cbm_modex_message;
endpoint->endpoint_initiator = endpoint->endpoint_initiator =
i_initiate(contents->ipaddr, contents->tcp_port, i_initiate(
message->ipaddr, rem_port); #if BTL_OPENIB_RDMACM_IB_ADDR
contents->service_id, rem_port,
&contents->gid, (union ibv_gid *) message->gid);
#else
contents->tcp_port, rem_port,
contents->ipaddr, message->ipaddr);
BTL_VERBOSE(("ep state = %d, local ipaddr = %x, remote ipaddr = %x, local port = %d, remote port = %d", BTL_VERBOSE(("ep state = %d, local ipaddr = %x, remote ipaddr = %x, local port = %d, remote port = %d",
endpoint->endpoint_state, contents->ipaddr, message->ipaddr, endpoint->endpoint_state, contents->ipaddr, message->ipaddr,
contents->tcp_port, rem_port)); contents->tcp_port, rem_port));
#endif
OPAL_OUTPUT((-1, "SERVICE in handle_connect_request; ep=%p (%p), I still %s the initiator to %s", OPAL_OUTPUT((-1, "SERVICE in handle_connect_request; ep=%p (%p), I still %s the initiator to %s",
(void*) endpoint, (void*) endpoint,
(void*) endpoint->endpoint_local_cpc, (void*) endpoint->endpoint_local_cpc,
@ -1161,11 +1333,13 @@ static int rdmacm_connect_endpoint(id_context_t *context,
} }
message = (modex_message_t *) endpoint->endpoint_remote_cpc_data->cbm_modex_message; message = (modex_message_t *) endpoint->endpoint_remote_cpc_data->cbm_modex_message;
#ifndef BTL_OPENIB_RDMACM_IB_ADDR
BTL_VERBOSE(("%s connected!!! local %x remote %x state = %d", BTL_VERBOSE(("%s connected!!! local %x remote %x state = %d",
contents->server?"server":"client", contents->server?"server":"client",
contents->ipaddr, contents->ipaddr,
message->ipaddr, message->ipaddr,
endpoint->endpoint_state)); endpoint->endpoint_state));
#endif
/* Ensure that all the writes back to the endpoint and associated /* Ensure that all the writes back to the endpoint and associated
data structures have completed */ data structures have completed */
@ -1348,13 +1522,17 @@ static int finish_connect(id_context_t *context)
private_data_t msg; private_data_t msg;
int rc; int rc;
struct sockaddr *peeraddr; struct sockaddr *peeraddr;
#ifndef BTL_OPENIB_RDMACM_IB_ADDR
uint32_t remoteipaddr; uint32_t remoteipaddr;
#endif
uint16_t remoteport; uint16_t remoteport;
modex_message_t *message; modex_message_t *message;
remoteport = rdma_get_dst_port(context->id); remoteport = rdma_get_dst_port(context->id);
peeraddr = rdma_get_peer_addr(context->id); peeraddr = rdma_get_peer_addr(context->id);
#ifndef BTL_OPENIB_RDMACM_IB_ADDR
remoteipaddr = ((struct sockaddr_in *)peeraddr)->sin_addr.s_addr; remoteipaddr = ((struct sockaddr_in *)peeraddr)->sin_addr.s_addr;
#endif
message = (modex_message_t *) message = (modex_message_t *)
context->endpoint->endpoint_remote_cpc_data->cbm_modex_message; context->endpoint->endpoint_remote_cpc_data->cbm_modex_message;
@ -1431,6 +1609,9 @@ static int finish_connect(id_context_t *context)
msg.qpnum = context->qpnum; msg.qpnum = context->qpnum;
msg.rem_index = contents->endpoint->index; msg.rem_index = contents->endpoint->index;
#if BTL_OPENIB_RDMACM_IB_ADDR
msg.rem_port = contents->service_id;
#else
msg.rem_port = contents->tcp_port; msg.rem_port = contents->tcp_port;
if (contents->endpoint->endpoint_initiator) { if (contents->endpoint->endpoint_initiator) {
#if OPAL_ENABLE_DEBUG #if OPAL_ENABLE_DEBUG
@ -1447,6 +1628,7 @@ static int finish_connect(id_context_t *context)
free(a); free(a);
#endif #endif
} }
#endif
/* Now all of the local setup has been done. The remote system /* Now all of the local setup has been done. The remote system
should now get a RDMA_CM_EVENT_CONNECT_REQUEST event to further should now get a RDMA_CM_EVENT_CONNECT_REQUEST event to further
@ -1512,8 +1694,10 @@ static int event_handler(struct rdma_cm_event *event)
{ {
id_context_t *context = (id_context_t*) event->id->context; id_context_t *context = (id_context_t*) event->id->context;
rdmacm_contents_t *contents; rdmacm_contents_t *contents;
#ifndef BTL_OPENIB_RDMACM_IB_ADDR
struct sockaddr *peeraddr, *localaddr; struct sockaddr *peeraddr, *localaddr;
uint32_t peeripaddr, localipaddr; uint32_t peeripaddr, localipaddr;
#endif
int rc = -1; int rc = -1;
ompi_btl_openib_ini_values_t ini; ompi_btl_openib_ini_values_t ini;
bool found; bool found;
@ -1523,6 +1707,8 @@ static int event_handler(struct rdma_cm_event *event)
} }
contents = context->contents; contents = context->contents;
#ifndef BTL_OPENIB_RDMACM_IB_ADDR
localaddr = rdma_get_local_addr(event->id); localaddr = rdma_get_local_addr(event->id);
peeraddr = rdma_get_peer_addr(event->id); peeraddr = rdma_get_peer_addr(event->id);
localipaddr = ((struct sockaddr_in *)localaddr)->sin_addr.s_addr; localipaddr = ((struct sockaddr_in *)localaddr)->sin_addr.s_addr;
@ -1533,6 +1719,7 @@ static int event_handler(struct rdma_cm_event *event)
rdma_event_str(event->event), rdma_event_str(event->event),
event->status, event->status,
peeripaddr)); peeripaddr));
#endif
switch (event->event) { switch (event->event) {
case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ADDR_RESOLVED:
@ -1542,7 +1729,9 @@ static int event_handler(struct rdma_cm_event *event)
case RDMA_CM_EVENT_ROUTE_RESOLVED: case RDMA_CM_EVENT_ROUTE_RESOLVED:
OPAL_OUTPUT((-1, "SERVICE Got ROUTE_RESOLVED: ID %p", (void*) context->id)); OPAL_OUTPUT((-1, "SERVICE Got ROUTE_RESOLVED: ID %p", (void*) context->id));
#ifndef BTL_OPENIB_RDMACM_IB_ADDR
contents->ipaddr = localipaddr; contents->ipaddr = localipaddr;
#endif
rc = finish_connect(context); rc = finish_connect(context);
break; break;
@ -1717,6 +1906,7 @@ static int rdmacm_init(mca_btl_openib_endpoint_t *endpoint)
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
#ifndef BTL_OPENIB_RDMACM_IB_ADDR
static int ipaddrcheck(id_context_t *context, static int ipaddrcheck(id_context_t *context,
mca_btl_openib_module_t *openib_btl) mca_btl_openib_module_t *openib_btl)
{ {
@ -1781,14 +1971,17 @@ static int ipaddrcheck(id_context_t *context,
return already_exists ? OMPI_ERROR : OMPI_SUCCESS; return already_exists ? OMPI_ERROR : OMPI_SUCCESS;
} }
#endif
static int create_message(rdmacm_contents_t *server, static int create_message(rdmacm_contents_t *server,
mca_btl_openib_module_t *openib_btl, mca_btl_openib_module_t *openib_btl,
ompi_btl_openib_connect_base_module_data_t *data) ompi_btl_openib_connect_base_module_data_t *data)
{ {
modex_message_t *message; modex_message_t *message;
#ifndef BTL_OPENIB_RDMACM_IB_ADDR
#if OPAL_ENABLE_DEBUG #if OPAL_ENABLE_DEBUG
char *a; char *a;
#endif
#endif #endif
message = (modex_message_t *) malloc(sizeof(modex_message_t)); message = (modex_message_t *) malloc(sizeof(modex_message_t));
@ -1801,6 +1994,11 @@ static int create_message(rdmacm_contents_t *server,
openib_btl->device->ib_dev_attr.max_qp_rd_atom; openib_btl->device->ib_dev_attr.max_qp_rd_atom;
message->device_max_qp_init_rd_atom = message->device_max_qp_init_rd_atom =
openib_btl->device->ib_dev_attr.max_qp_init_rd_atom; openib_btl->device->ib_dev_attr.max_qp_init_rd_atom;
#if BTL_OPENIB_RDMACM_IB_ADDR
memcpy(message->gid, server->gid.raw, sizeof(server->gid));
message->service_id = server->service_id;
#else
message->ipaddr = server->ipaddr; message->ipaddr = server->ipaddr;
message->tcp_port = server->tcp_port; message->tcp_port = server->tcp_port;
@ -1808,6 +2006,7 @@ static int create_message(rdmacm_contents_t *server,
a = stringify(message->ipaddr), message->tcp_port)); a = stringify(message->ipaddr), message->tcp_port));
#if OPAL_ENABLE_DEBUG #if OPAL_ENABLE_DEBUG
free(a); free(a);
#endif
#endif #endif
data->cbm_modex_message = message; data->cbm_modex_message = message;
data->cbm_modex_message_len = message_len; data->cbm_modex_message_len = message_len;
@ -1823,11 +2022,18 @@ static int create_message(rdmacm_contents_t *server,
*/ */
static int rdmacm_component_query(mca_btl_openib_module_t *openib_btl, ompi_btl_openib_connect_base_module_t **cpc) static int rdmacm_component_query(mca_btl_openib_module_t *openib_btl, ompi_btl_openib_connect_base_module_t **cpc)
{ {
rdmacm_contents_t *server = NULL;
struct sockaddr_in sin;
id_context_t *context;
int rc; int rc;
id_context_t *context;
rdmacm_contents_t *server = NULL;
#if BTL_OPENIB_RDMACM_IB_ADDR
/* inet_ntop(AF_INET6, ...) may need up to INET6_ADDRSTRLEN bytes; with a
   smaller buffer it fails for GIDs whose text form is long */
char rdmacm_addr_str[INET6_ADDRSTRLEN];
struct rdma_addrinfo *rdma_addr;
#else
struct sockaddr_in sin;
#endif
/* RDMACM is not supported if we have any XRC QPs */ /* RDMACM is not supported if we have any XRC QPs */
if (mca_btl_openib_component.num_xrc_qps > 0) { if (mca_btl_openib_component.num_xrc_qps > 0) {
BTL_VERBOSE(("rdmacm CPC not supported with XRC receive queues, please try xoob CPC; skipped on %s:%d", BTL_VERBOSE(("rdmacm CPC not supported with XRC receive queues, please try xoob CPC; skipped on %s:%d",
@ -1883,23 +2089,49 @@ static int rdmacm_component_query(mca_btl_openib_module_t *openib_btl, ompi_btl_
rc = OMPI_ERR_OUT_OF_RESOURCE; rc = OMPI_ERR_OUT_OF_RESOURCE;
goto out4; goto out4;
} }
#ifndef BTL_OPENIB_RDMACM_IB_ADDR
memset(&sin, 0, sizeof(sin)); memset(&sin, 0, sizeof(sin));
sin.sin_family = AF_INET; sin.sin_family = AF_INET;
sin.sin_addr.s_addr = rdmacm_addr; sin.sin_addr.s_addr = rdmacm_addr;
sin.sin_port = (uint16_t) rdmacm_port; sin.sin_port = (uint16_t) rdmacm_port;
#else
rc = ibv_query_gid(openib_btl->device->ib_pd->context, openib_btl->port_num, 0, &server->gid);
if (0 != rc) {
BTL_ERROR(("local gid query failed"));
goto out4;
}
if (NULL == inet_ntop(AF_INET6, server->gid.raw,
rdmacm_addr_str, sizeof rdmacm_addr_str)) {
BTL_ERROR(("local gaddr string creating fail"));
goto out4;
}
rc = get_rdma_addr(rdmacm_addr_str, NULL, &rdma_addr, 1);
if (OMPI_SUCCESS != rc) {
BTL_ERROR(("server: create rdma addr error"));
goto out4;
}
#endif
/* Bind the rdmacm server to the local IP address and an ephemeral /* Bind the rdmacm server to the local IP address and an ephemeral
* port or one specified by a command arg. * port or one specified by a command arg.
*/ */
rc = rdma_bind_addr(context->id, (struct sockaddr *)&sin); rc = rdma_bind_addr(context->id,
#if BTL_OPENIB_RDMACM_IB_ADDR
rdma_addr->ai_src_addr);
#else
(struct sockaddr *)&sin);
#endif
if (0 != rc) { if (0 != rc) {
opal_output_verbose(5, ompi_btl_base_framework.framework_output, opal_output_verbose(5, ompi_btl_base_framework.framework_output,
"openib BTL: rdmacm CPC unable to bind to address"); "openib BTL: rdmacm CPC unable to bind to address");
rc = OMPI_ERR_UNREACH; rc = OMPI_ERR_UNREACH;
goto out5; goto out5;
} }
#if BTL_OPENIB_RDMACM_IB_ADDR
server->service_id = ((struct sockaddr_ib *) (&context->id->route.addr.src_addr))->sib_sid;
rdma_freeaddrinfo(rdma_addr);
#else
/* Verify that the device has a valid IP address on it, or we /* Verify that the device has a valid IP address on it, or we
cannot use the cpc */ cannot use the cpc */
rc = ipaddrcheck(context, openib_btl); rc = ipaddrcheck(context, openib_btl);
@ -1909,7 +2141,7 @@ static int rdmacm_component_query(mca_btl_openib_module_t *openib_btl, ompi_btl_
rc = OMPI_ERR_NOT_SUPPORTED; rc = OMPI_ERR_NOT_SUPPORTED;
goto out5; goto out5;
} }
#endif
/* Listen on the specified address/port with the rdmacm, limit the /* Listen on the specified address/port with the rdmacm, limit the
amount of incoming connections to 1024 */ amount of incoming connections to 1024 */
/* FIXME - 1024 should be (num of connectors * /* FIXME - 1024 should be (num of connectors *
@ -1944,6 +2176,9 @@ out5:
* But don't do it here since it's part of out4:OBJ_RELEASE(context), * But don't do it here since it's part of out4:OBJ_RELEASE(context),
* and we don't want to do it twice. * and we don't want to do it twice.
*/ */
#if BTL_OPENIB_RDMACM_IB_ADDR
rdma_freeaddrinfo(rdma_addr);
#endif
out4: out4:
opal_list_remove_first(&(server->ids)); opal_list_remove_first(&(server->ids));
OBJ_RELEASE(context); OBJ_RELEASE(context);