diff --git a/config/ompi_check_openfabrics.m4 b/config/ompi_check_openfabrics.m4 index c0a69c34b2..8a258c94b1 100644 --- a/config/ompi_check_openfabrics.m4 +++ b/config/ompi_check_openfabrics.m4 @@ -283,6 +283,10 @@ AC_DEFUN([OMPI_CHECK_OPENFABRICS_CM_ARGS],[ AC_ARG_ENABLE([openib-rdmacm], [AC_HELP_STRING([--enable-openib-rdmacm], [Enable Open Fabrics RDMACM support in openib BTL (default: enabled)])]) + AC_ARG_ENABLE([openib-rdmacm-ibaddr], + [AC_HELP_STRING([--enable-openib-rdmacm-ibaddr], + [Enable Open Fabrics RDMACM with IB addressing support in openib BTL (default: disabled)])], + [if test "$enableval" = "yes"; then enable_openib_rdmacm=yes; fi]) ])dnl AC_DEFUN([OMPI_CHECK_OPENFABRICS_CM],[ diff --git a/ompi/mca/btl/openib/configure.m4 b/ompi/mca/btl/openib/configure.m4 index d68a4b8c8c..9534e35b48 100644 --- a/ompi/mca/btl/openib/configure.m4 +++ b/ompi/mca/btl/openib/configure.m4 @@ -71,6 +71,42 @@ AC_DEFUN([MCA_ompi_btl_openib_CONFIG],[ if test "x$btl_openib_have_rdmacm" = "x1" -a \ "$have_threads" != "none"; then cpcs="$cpcs rdmacm" + if test "$enable_openib_rdmacm_ibaddr" = "yes"; then + LDFLAGS_save="$LDFLAGS" + LIBS_save="$LIBS" + LDFLAGS="$LDFLAGS $btl_openib_LDFLAGS" + LIBS="$LIBS $btl_openib_LIBS" + AC_LANG(C) + AC_MSG_CHECKING([rsockets AF_IB addressing support]) + AC_RUN_IFELSE( + [AC_LANG_PROGRAM( + [ + #include + #include + #include + #include + #include + #include + ], + [ + int rsock; + rsock = rsocket(AF_IB, SOCK_STREAM, 0); + if (rsock < 0) { + return -1; + } + rclose(rsock); + ] + )], + [ AC_MSG_RESULT([yes]) + AC_DEFINE(BTL_OPENIB_RDMACM_IB_ADDR, 1, rdmacm with rsockets support) + ], + [ AC_MSG_RESULT([no]) + AC_MSG_WARN([rsockets does not support AF_IB addressing.
librdmacm 1.0.18 or beyond is needed.]) + ], [ AC_MSG_RESULT([no (cross-compiling)]) ] + ) + LDFLAGS="$LDFLAGS_save" + LIBS="$LIBS_save" + fi fi if test "x$btl_openib_have_udcm" = "x1" -a \ "$have_threads" != "none"; then diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c b/ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c index dae82e6d8d..f4e3bce7bb 100644 --- a/ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c @@ -58,6 +58,10 @@ #include "btl_openib_ip.h" #include "btl_openib_ini.h" +#if BTL_OPENIB_RDMACM_IB_ADDR +#include +#endif + #define mymin(a, b) ((a) < (b) ? (a) : (b)) static void rdmacm_component_register(void); @@ -85,8 +89,13 @@ typedef struct { /* Dummy QP only used when we expect the connection to be rejected */ struct ibv_cq *dummy_cq; +#if BTL_OPENIB_RDMACM_IB_ADDR + union ibv_gid gid; + uint64_t service_id; +#else uint32_t ipaddr; uint16_t tcp_port; +#endif /* server==false means that this proc initiated the connection; server==true means that this proc accepted the incoming connection.
Note that this may be different than the "one way" @@ -114,8 +123,13 @@ OBJ_CLASS_INSTANCE(rdmacm_contents_t, opal_list_item_t, typedef struct { int device_max_qp_rd_atom; int device_max_qp_init_rd_atom; +#if BTL_OPENIB_RDMACM_IB_ADDR + uint8_t gid[16]; + uint64_t service_id; +#else uint32_t ipaddr; uint16_t tcp_port; +#endif uint8_t end; } modex_message_t; @@ -147,10 +161,15 @@ OBJ_CLASS_INSTANCE(id_context_t, opal_list_item_t, typedef struct { uint32_t rem_index; +#if BTL_OPENIB_RDMACM_IB_ADDR + uint64_t rem_port; +#else uint16_t rem_port; +#endif uint8_t qpnum; } private_data_t; +#ifndef BTL_OPENIB_RDMACM_IB_ADDR /* Used to send a specific show_help message from the service_thread to the main thread (because we can't call show_help from the service_thread) */ @@ -159,6 +178,7 @@ typedef struct { uint32_t peer_ip_addr; uint32_t peer_tcp_port; } cant_find_endpoint_context_t; +#endif static opal_list_t server_listener_list; static opal_list_t client_list; @@ -166,7 +186,11 @@ static opal_mutex_t client_list_lock; static struct rdma_event_channel *event_channel = NULL; static int rdmacm_priority = 30; static unsigned int rdmacm_port = 0; + +#ifndef BTL_OPENIB_RDMACM_IB_ADDR static uint32_t rdmacm_addr = 0; +#endif + static int rdmacm_resolve_timeout = 30000; static int rdmacm_resolve_max_retry_count = 20; static bool rdmacm_reject_causes_connect_error = false; @@ -209,8 +233,12 @@ static void rdmacm_contents_constructor(rdmacm_contents_t *contents) contents->endpoint = NULL; contents->openib_btl = NULL; contents->dummy_cq = NULL; +#if BTL_OPENIB_RDMACM_IB_ADDR + contents->service_id = 0; +#else contents->ipaddr = 0; contents->tcp_port = 0; +#endif contents->server = false; contents->on_client_list = false; OBJ_CONSTRUCT(&(contents->ids), opal_list_t); @@ -327,13 +355,23 @@ static char *stringify(uint32_t addr) */ static mca_btl_openib_endpoint_t *rdmacm_find_endpoint(rdmacm_contents_t *contents, struct rdma_cm_id *id, +#if BTL_OPENIB_RDMACM_IB_ADDR + uint64_t 
rem_port) +#else uint16_t rem_port) +#endif { int i; mca_btl_openib_endpoint_t *ep = NULL; opal_pointer_array_t *endpoints = contents->openib_btl->device->endpoints; + struct sockaddr *peeraddr = rdma_get_peer_addr(id); - uint32_t peeripaddr = ((struct sockaddr_in *)peeraddr)->sin_addr.s_addr; +#if BTL_OPENIB_RDMACM_IB_ADDR + union ibv_gid *ep_gid, peer_gid; + memcpy(peer_gid.raw, ((struct sockaddr_ib *) peeraddr)->sib_addr.sib_raw, sizeof peer_gid); +#else + uint32_t peeripaddr = ((struct sockaddr_in *) peeraddr)->sin_addr.s_addr; + #if OPAL_ENABLE_DEBUG char *a; #endif @@ -343,6 +381,8 @@ static mca_btl_openib_endpoint_t *rdmacm_find_endpoint(rdmacm_contents_t *conten #if OPAL_ENABLE_DEBUG free(a); #endif +#endif + for (i = 0; i < opal_pointer_array_get_size(endpoints); i++) { mca_btl_openib_endpoint_t *endpoint; modex_message_t *message; @@ -353,12 +393,22 @@ static mca_btl_openib_endpoint_t *rdmacm_find_endpoint(rdmacm_contents_t *conten } message = (modex_message_t *) endpoint->endpoint_remote_cpc_data->cbm_modex_message; +#ifndef BTL_OPENIB_RDMACM_IB_ADDR OPAL_OUTPUT((-1, "message ipaddr = %s port %d", a = stringify(message->ipaddr), message->tcp_port)); #if OPAL_ENABLE_DEBUG free(a); #endif +#endif + +#if BTL_OPENIB_RDMACM_IB_ADDR + ep_gid = (union ibv_gid *) message->gid; + if (ep_gid->global.interface_id == peer_gid.global.interface_id && + ep_gid->global.subnet_prefix == peer_gid.global.subnet_prefix && + message->service_id == rem_port) { +#else if (message->ipaddr == peeripaddr && message->tcp_port == rem_port) { +#endif ep = endpoint; break; } @@ -495,34 +545,93 @@ out: * IP addresses are the same (i.e., the MPI procs are on the same * node), then the process with the lower TCP port wins. 
*/ -static bool i_initiate(uint32_t local_ipaddr, uint16_t local_port, - uint32_t remote_ipaddr, uint16_t remote_port) +static bool i_initiate(uint64_t local_port, uint64_t remote_port, +#if BTL_OPENIB_RDMACM_IB_ADDR + union ibv_gid *local_gid, union ibv_gid *remote_gid) +{ +#else + uint32_t local_ipaddr, uint32_t remote_ipaddr) { #if OPAL_ENABLE_DEBUG char *a = stringify(local_ipaddr); char *b = stringify(remote_ipaddr); #endif +#endif +#if BTL_OPENIB_RDMACM_IB_ADDR + if (local_gid->global.subnet_prefix < remote_gid->global.subnet_prefix || + (local_gid->global.subnet_prefix == remote_gid->global.subnet_prefix && + local_gid->global.interface_id < remote_gid->global.interface_id) || + (local_gid->global.subnet_prefix == remote_gid->global.subnet_prefix && + local_gid->global.interface_id == remote_gid->global.interface_id && +#else if (local_ipaddr > remote_ipaddr || - (local_ipaddr == remote_ipaddr && local_port < remote_port)) { + (local_ipaddr == remote_ipaddr && +#endif + local_port < remote_port)) { +#ifndef BTL_OPENIB_RDMACM_IB_ADDR OPAL_OUTPUT((-1, "i_initiate (I WIN): local ipaddr %s, remote ipaddr %s", a, b)); #if OPAL_ENABLE_DEBUG free(a); free(b); #endif - return true; - } else { - OPAL_OUTPUT((-1, "i_initiate (I lose): local ipaddr %s, remote ipaddr %s", - a, b)); -#if OPAL_ENABLE_DEBUG - free(a); - free(b); #endif - return false; + return true; } +#ifndef BTL_OPENIB_RDMACM_IB_ADDR + OPAL_OUTPUT((-1, "i_initiate (I lose): local ipaddr %s, remote ipaddr %s", + a, b)); +#if OPAL_ENABLE_DEBUG + free(a); + free(b); +#endif +#endif + return false; } +#if BTL_OPENIB_RDMACM_IB_ADDR +static int get_rdma_addr(char *src, char *dst, + struct rdma_addrinfo **rdma_addr, + int server) +{ + int rc; + struct rdma_addrinfo hints, *sres, *dres; + + memset(&hints, 0, sizeof hints); + + hints.ai_family = AF_IB; + hints.ai_port_space = RDMA_PS_TCP; + hints.ai_flags = RAI_NUMERICHOST | RAI_FAMILY | RAI_PASSIVE; + + rc = rdma_getaddrinfo(src, NULL, &hints, &sres); + if (0 != 
rc) { + return OMPI_ERROR; + } + + if (server) { + *rdma_addr = sres; + return OMPI_SUCCESS; + } + + hints.ai_src_len = sres->ai_src_len; + hints.ai_src_addr = sres->ai_src_addr; + + hints.ai_flags &= ~RAI_PASSIVE; + + rc = rdma_getaddrinfo(dst, NULL, &hints, &dres); + if (0 != rc) { + rdma_freeaddrinfo(sres); + return OMPI_ERROR; + } + + rdma_freeaddrinfo(sres); + *rdma_addr = dres; + + return OMPI_SUCCESS; +} +#endif + /* * Invoked by main thread */ @@ -530,11 +639,17 @@ static int rdmacm_client_connect_one(rdmacm_contents_t *contents, modex_message_t *message, int num) { - struct sockaddr_in src_in, dest_in; - id_context_t *context; int rc; + id_context_t *context; +#if BTL_OPENIB_RDMACM_IB_ADDR + char src_addr[INET6_ADDRSTRLEN], dst_addr[INET6_ADDRSTRLEN]; /* GIDs are rendered with inet_ntop(AF_INET6), which can need more than 32 bytes */ + struct rdma_addrinfo *rdma_addr; +#else + struct sockaddr_in src_in, dest_in; + #if OPAL_ENABLE_DEBUG char *a, *b; +#endif #endif /* We'll need to access some data in the event handler. We can @@ -558,7 +673,7 @@ static int rdmacm_client_connect_one(rdmacm_contents_t *contents, BTL_ERROR(("Failed to create a rdma id with %d", rc)); goto out1; } - +#ifndef BTL_OPENIB_RDMACM_IB_ADDR /* Source address (we must specify this to ensure that the traffic goes out on the device+port that we expect it go out). */ memset(&src_in, 0, sizeof(src_in)); @@ -585,7 +700,7 @@ static int rdmacm_client_connect_one(rdmacm_contents_t *contents, free(a); free(b); #endif - +#endif /* This is odd an worth explaining: when we place the context on the ids list, we need to add an extra RETAIN to the context. The reason is because of a race condition.
Let's explain @@ -623,15 +738,46 @@ static int rdmacm_client_connect_one(rdmacm_contents_t *contents, */ OBJ_RETAIN(context); opal_list_append(&(contents->ids), &(context->super)); +#if BTL_OPENIB_RDMACM_IB_ADDR + if (NULL == inet_ntop(AF_INET6, contents->gid.raw, + src_addr, sizeof src_addr)) { + BTL_ERROR(("local addr string creating fail")); + goto out1; + } + if (NULL == inet_ntop(AF_INET6, message->gid, + dst_addr, sizeof dst_addr)) { + BTL_ERROR(("remote addr string creating fail")); + goto out1; + } + + rc = get_rdma_addr(src_addr, dst_addr, &rdma_addr, 0); + if (OMPI_SUCCESS != rc) { + BTL_ERROR(("server: create rdma addr error")); + goto out1; + } + + ((struct sockaddr_ib *) (rdma_addr->ai_dst_addr))->sib_sid = message->service_id; +#endif rc = rdma_resolve_addr(context->id, +#if BTL_OPENIB_RDMACM_IB_ADDR + rdma_addr->ai_src_addr, + rdma_addr->ai_dst_addr, +#else (struct sockaddr *) &src_in, (struct sockaddr *) &dest_in, +#endif rdmacm_resolve_timeout); if (0 != rc) { BTL_ERROR(("Failed to resolve the remote address with %d", rc)); +#if BTL_OPENIB_RDMACM_IB_ADDR + rdma_freeaddrinfo(rdma_addr); +#endif goto out1; } +#if BTL_OPENIB_RDMACM_IB_ADDR + rdma_freeaddrinfo(rdma_addr); +#endif return OMPI_SUCCESS; @@ -655,10 +801,11 @@ static int rdmacm_module_start_connect(ompi_btl_openib_connect_base_module_t *cp modex_message_t *message, *local_message; int rc, qp; opal_list_item_t *item; +#ifndef BTL_OPENIB_RDMACM_IB_ADDR #if OPAL_ENABLE_DEBUG char *a, *b; #endif - +#endif /* Don't use the CPC to get the message, because this function is invoked from the event_handler (to intitiate connections in the Right direction), where we don't have the CPC, so it'll be @@ -667,7 +814,7 @@ static int rdmacm_module_start_connect(ompi_btl_openib_connect_base_module_t *cp (modex_message_t *) endpoint->endpoint_local_cpc->data.cbm_modex_message; message = (modex_message_t *) endpoint->endpoint_remote_cpc_data->cbm_modex_message; - +#ifndef BTL_OPENIB_RDMACM_IB_ADDR 
OPAL_OUTPUT((-1, "Connecting from IP %s:%d to remote IP %s:%d ep state = %d", a = stringify(local_message->ipaddr), local_message->tcp_port, b = stringify(message->ipaddr), message->tcp_port, endpoint->endpoint_state)); @@ -677,7 +824,7 @@ static int rdmacm_module_start_connect(ompi_btl_openib_connect_base_module_t *cp #endif BTL_VERBOSE(("Connecting to remote ip addr = %x, port = %d ep state = %d", message->ipaddr, message->tcp_port, endpoint->endpoint_state)); - +#endif if (MCA_BTL_IB_CONNECTED == endpoint->endpoint_state || MCA_BTL_IB_CONNECTING == endpoint->endpoint_state || MCA_BTL_IB_CONNECT_ACK == endpoint->endpoint_state) { @@ -705,14 +852,25 @@ static int rdmacm_module_start_connect(ompi_btl_openib_connect_base_module_t *cp * is being connected from, in the case where there are multiple * listeners on the local system. */ +#if BTL_OPENIB_RDMACM_IB_ADDR + memcpy(contents->gid.raw, local_message->gid, sizeof(contents->gid)); + contents->service_id = local_message->service_id; +#else contents->ipaddr = local_message->ipaddr; contents->tcp_port = local_message->tcp_port; +#endif /* Are we the initiator? Or do we expect this connect request to be rejected? 
*/ endpoint->endpoint_initiator = - i_initiate(contents->ipaddr, contents->tcp_port, - message->ipaddr, message->tcp_port); + i_initiate( +#if BTL_OPENIB_RDMACM_IB_ADDR + contents->service_id, message->service_id, + &contents->gid, (union ibv_gid *) message->gid); +#else + contents->tcp_port, message->tcp_port, + contents->ipaddr, message->ipaddr); +#endif OPAL_OUTPUT((-1, "MAIN Start connect; ep=%p (%p), I %s the initiator to %s", (void*) endpoint, (void*) endpoint->endpoint_local_cpc, @@ -760,6 +918,7 @@ out: return rc; } +#ifndef BTL_OPENIB_RDMACM_IB_ADDR static void *show_help_cant_find_endpoint(void *context) { char *msg; @@ -786,6 +945,7 @@ static void *show_help_cant_find_endpoint(void *context) mca_btl_openib_endpoint_invoke_error(NULL); return NULL; } +#endif /* * Invoked by service thread @@ -809,7 +969,11 @@ static int handle_connect_request(struct rdma_cm_event *event) private_data_t msg; int rc = -1, qpnum; uint32_t rem_index; +#if BTL_OPENIB_RDMACM_IB_ADDR + uint64_t rem_port; +#else uint16_t rem_port; +#endif qpnum = ((private_data_t *)event->param.conn.private_data)->qpnum; rem_port = ((private_data_t *)event->param.conn.private_data)->rem_port; @@ -819,6 +983,7 @@ static int handle_connect_request(struct rdma_cm_event *event) to; use the listener's context->contents to figure it out */ endpoint = rdmacm_find_endpoint(contents, event->id, rem_port); if (NULL == endpoint) { +#ifndef BTL_OPENIB_RDMACM_IB_ADDR struct sockaddr *peeraddr = rdma_get_peer_addr(event->id); cant_find_endpoint_context_t *c = (cant_find_endpoint_context_t *) calloc(1, sizeof(*c)); if (NULL != c) { @@ -831,18 +996,25 @@ static int handle_connect_request(struct rdma_cm_event *event) c->peer_tcp_port = rdma_get_dst_port(event->id); } ompi_btl_openib_fd_run_in_main(show_help_cant_find_endpoint, c); +#else + BTL_ERROR(("Cannot find endpoint.")); +#endif goto out; } message = (modex_message_t *) endpoint->endpoint_remote_cpc_data->cbm_modex_message; endpoint->endpoint_initiator = - 
i_initiate(contents->ipaddr, contents->tcp_port, - message->ipaddr, rem_port); - + i_initiate( +#if BTL_OPENIB_RDMACM_IB_ADDR + contents->service_id, rem_port, + &contents->gid, (union ibv_gid *) message->gid); +#else + contents->tcp_port, rem_port, + contents->ipaddr, message->ipaddr); BTL_VERBOSE(("ep state = %d, local ipaddr = %x, remote ipaddr = %x, local port = %d, remote port = %d", - endpoint->endpoint_state, contents->ipaddr, message->ipaddr, - contents->tcp_port, rem_port)); - + endpoint->endpoint_state, contents->ipaddr, message->ipaddr, + contents->tcp_port, rem_port)); +#endif OPAL_OUTPUT((-1, "SERVICE in handle_connect_request; ep=%p (%p), I still %s the initiator to %s", (void*) endpoint, (void*) endpoint->endpoint_local_cpc, @@ -1161,11 +1333,13 @@ static int rdmacm_connect_endpoint(id_context_t *context, } message = (modex_message_t *) endpoint->endpoint_remote_cpc_data->cbm_modex_message; +#ifndef BTL_OPENIB_RDMACM_IB_ADDR BTL_VERBOSE(("%s connected!!! local %x remote %x state = %d", contents->server?"server":"client", contents->ipaddr, message->ipaddr, endpoint->endpoint_state)); +#endif /* Ensure that all the writes back to the endpoint and associated data structures have completed */ @@ -1348,13 +1522,17 @@ static int finish_connect(id_context_t *context) private_data_t msg; int rc; struct sockaddr *peeraddr; +#ifndef BTL_OPENIB_RDMACM_IB_ADDR uint32_t remoteipaddr; +#endif uint16_t remoteport; modex_message_t *message; remoteport = rdma_get_dst_port(context->id); peeraddr = rdma_get_peer_addr(context->id); +#ifndef BTL_OPENIB_RDMACM_IB_ADDR remoteipaddr = ((struct sockaddr_in *)peeraddr)->sin_addr.s_addr; +#endif message = (modex_message_t *) context->endpoint->endpoint_remote_cpc_data->cbm_modex_message; @@ -1431,6 +1609,9 @@ static int finish_connect(id_context_t *context) msg.qpnum = context->qpnum; msg.rem_index = contents->endpoint->index; +#if BTL_OPENIB_RDMACM_IB_ADDR + msg.rem_port = contents->service_id; +#else msg.rem_port = 
contents->tcp_port; if (contents->endpoint->endpoint_initiator) { #if OPAL_ENABLE_DEBUG @@ -1447,6 +1628,7 @@ static int finish_connect(id_context_t *context) free(a); #endif } +#endif /* Now all of the local setup has been done. The remote system should now get a RDMA_CM_EVENT_CONNECT_REQUEST event to further @@ -1512,8 +1694,10 @@ static int event_handler(struct rdma_cm_event *event) { id_context_t *context = (id_context_t*) event->id->context; rdmacm_contents_t *contents; +#ifndef BTL_OPENIB_RDMACM_IB_ADDR struct sockaddr *peeraddr, *localaddr; uint32_t peeripaddr, localipaddr; +#endif int rc = -1; ompi_btl_openib_ini_values_t ini; bool found; @@ -1523,6 +1707,8 @@ static int event_handler(struct rdma_cm_event *event) } contents = context->contents; + +#ifndef BTL_OPENIB_RDMACM_IB_ADDR localaddr = rdma_get_local_addr(event->id); peeraddr = rdma_get_peer_addr(event->id); localipaddr = ((struct sockaddr_in *)localaddr)->sin_addr.s_addr; @@ -1533,6 +1719,7 @@ static int event_handler(struct rdma_cm_event *event) rdma_event_str(event->event), event->status, peeripaddr)); +#endif switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: @@ -1542,7 +1729,9 @@ static int event_handler(struct rdma_cm_event *event) case RDMA_CM_EVENT_ROUTE_RESOLVED: OPAL_OUTPUT((-1, "SERVICE Got ROUTE_RESOLVED: ID %p", (void*) context->id)); +#ifndef BTL_OPENIB_RDMACM_IB_ADDR contents->ipaddr = localipaddr; +#endif rc = finish_connect(context); break; @@ -1717,6 +1906,7 @@ static int rdmacm_init(mca_btl_openib_endpoint_t *endpoint) return OMPI_SUCCESS; } +#ifndef BTL_OPENIB_RDMACM_IB_ADDR static int ipaddrcheck(id_context_t *context, mca_btl_openib_module_t *openib_btl) { @@ -1781,14 +1971,17 @@ static int ipaddrcheck(id_context_t *context, return already_exists ? 
OMPI_ERROR : OMPI_SUCCESS; } +#endif static int create_message(rdmacm_contents_t *server, mca_btl_openib_module_t *openib_btl, ompi_btl_openib_connect_base_module_data_t *data) { modex_message_t *message; +#ifndef BTL_OPENIB_RDMACM_IB_ADDR #if OPAL_ENABLE_DEBUG char *a; +#endif #endif message = (modex_message_t *) malloc(sizeof(modex_message_t)); @@ -1801,6 +1994,11 @@ static int create_message(rdmacm_contents_t *server, openib_btl->device->ib_dev_attr.max_qp_rd_atom; message->device_max_qp_init_rd_atom = openib_btl->device->ib_dev_attr.max_qp_init_rd_atom; + +#if BTL_OPENIB_RDMACM_IB_ADDR + memcpy(message->gid, server->gid.raw, sizeof(server->gid)); + message->service_id = server->service_id; +#else message->ipaddr = server->ipaddr; message->tcp_port = server->tcp_port; @@ -1808,6 +2006,7 @@ static int create_message(rdmacm_contents_t *server, a = stringify(message->ipaddr), message->tcp_port)); #if OPAL_ENABLE_DEBUG free(a); +#endif #endif data->cbm_modex_message = message; data->cbm_modex_message_len = message_len; @@ -1823,11 +2022,18 @@ static int create_message(rdmacm_contents_t *server, */ static int rdmacm_component_query(mca_btl_openib_module_t *openib_btl, ompi_btl_openib_connect_base_module_t **cpc) { - rdmacm_contents_t *server = NULL; - struct sockaddr_in sin; - id_context_t *context; int rc; + id_context_t *context; + rdmacm_contents_t *server = NULL; + +#if BTL_OPENIB_RDMACM_IB_ADDR + char rdmacm_addr_str[INET6_ADDRSTRLEN]; /* GID is rendered with inet_ntop(AF_INET6), which can need more than 32 bytes */ + struct rdma_addrinfo *rdma_addr = NULL; /* non-NULL only while this function still owns the addrinfo */ +#else + struct sockaddr_in sin; +#endif + /* RDMACM is not supported if we have any XRC QPs */ if (mca_btl_openib_component.num_xrc_qps > 0) { BTL_VERBOSE(("rdmacm CPC not supported with XRC receive queues, please try xoob CPC; skipped on %s:%d", @@ -1883,23 +2089,49 @@ static int rdmacm_component_query(mca_btl_openib_module_t *openib_btl, ompi_btl_ rc = OMPI_ERR_OUT_OF_RESOURCE; goto out4; } - +#ifndef BTL_OPENIB_RDMACM_IB_ADDR memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_addr.s_addr =
rdmacm_addr; sin.sin_port = (uint16_t) rdmacm_port; +#else + rc = ibv_query_gid(openib_btl->device->ib_pd->context, openib_btl->port_num, 0, &server->gid); + if (0 != rc) { + BTL_ERROR(("local gid query failed")); + goto out4; + } + if (NULL == inet_ntop(AF_INET6, server->gid.raw, + rdmacm_addr_str, sizeof rdmacm_addr_str)) { + BTL_ERROR(("local gaddr string creating fail")); + rc = OMPI_ERROR; goto out4; /* rc is still 0 from ibv_query_gid(); do not enter cleanup looking like success */ + } + + rc = get_rdma_addr(rdmacm_addr_str, NULL, &rdma_addr, 1); + if (OMPI_SUCCESS != rc) { + BTL_ERROR(("server: create rdma addr error")); + goto out4; + } +#endif /* Bind the rdmacm server to the local IP address and an ephemerial * port or one specified by a comand arg. */ - rc = rdma_bind_addr(context->id, (struct sockaddr *)&sin); + rc = rdma_bind_addr(context->id, +#if BTL_OPENIB_RDMACM_IB_ADDR + rdma_addr->ai_src_addr); +#else + (struct sockaddr *)&sin); +#endif if (0 != rc) { opal_output_verbose(5, ompi_btl_base_framework.framework_output, "openib BTL: rdmacm CPC unable to bind to address"); rc = OMPI_ERR_UNREACH; goto out5; } - +#if BTL_OPENIB_RDMACM_IB_ADDR + server->service_id = ((struct sockaddr_ib *) (&context->id->route.addr.src_addr))->sib_sid; + rdma_freeaddrinfo(rdma_addr); rdma_addr = NULL; /* released here; NULL keeps the out5 cleanup from freeing it again */ +#else /* Verify that the device has a valid IP address on it, or we cannot use the cpc */ rc = ipaddrcheck(context, openib_btl); @@ -1909,7 +2141,7 @@ static int rdmacm_component_query(mca_btl_openib_module_t *openib_btl, ompi_btl_ rc = OMPI_ERR_NOT_SUPPORTED; goto out5; } - +#endif /* Listen on the specified address/port with the rdmacm, limit the amount of incoming connections to 1024 */ /* FIXME - 1024 should be (num of connectors * @@ -1944,6 +2176,9 @@ out5: * But don't do it here since it's part of out4:OBJ_RELEASE(context), * and we don't want to do it twice. */ +#if BTL_OPENIB_RDMACM_IB_ADDR + if (NULL != rdma_addr) { rdma_freeaddrinfo(rdma_addr); } /* still owned only when we jumped here before the release after bind (e.g. bind failure) */ +#endif out4: opal_list_remove_first(&(server->ids)); OBJ_RELEASE(context);