1
1
 - Added enable/disable configuration parameter for dynamic SL
 - All the dynamic SL code is now conditionally compiled
 - Removed libibmad dependency
 - Using only one include - ib_types.h (part of the opensm-devel package)
 - Removed all the macro and data type definitions, using the
   existing definitions from ib_types.h instead
 - General cleanup here and there

The async mode is not implemented yet - stay tuned...

This commit was SVN r24830.
Этот коммит содержится в:
Yevgeny Kliteynik 2011-06-28 14:28:29 +00:00
родитель 84be81df95
Коммит b05211148d
4 изменённых файлов: 182 добавлений и 180 удалений

Просмотреть файл

@ -155,11 +155,21 @@ dnl [enable_openib_ibcm="$enableval"], [enable_openib_ibc
[$ompi_cv_func_ibv_create_cq_args], [$ompi_cv_func_ibv_create_cq_args],
[Number of arguments to ibv_create_cq])])]) [Number of arguments to ibv_create_cq])])])
#
# OpenIB dynamic SL
#
AC_ARG_ENABLE([openib-dynamic-sl],
[AC_HELP_STRING([--enable-openib-dynamic-sl],
[Enable openib BTL to query Subnet Manager for IB SL (default: enabled)])],
[enable_openib_dynamic_sl="$enableval"],
[enable_openib_dynamic_sl="yes"])
# Set these up so that we can do an AC_DEFINE below # Set these up so that we can do an AC_DEFINE below
# (unconditionally) # (unconditionally)
$1_have_xrc=0 $1_have_xrc=0
$1_have_rdmacm=0 $1_have_rdmacm=0
$1_have_ibcm=0 $1_have_ibcm=0
$1_have_dynamic_sl=0
# If we have the openib stuff available, find out what we've got # If we have the openib stuff available, find out what we've got
AS_IF([test "$ompi_check_openib_happy" = "yes"], AS_IF([test "$ompi_check_openib_happy" = "yes"],
@ -176,6 +186,19 @@ dnl [enable_openib_ibcm="$enableval"], [enable_openib_ibc
AC_CHECK_FUNCS([ibv_create_xrc_rcv_qp], [$1_have_xrc=1]) AC_CHECK_FUNCS([ibv_create_xrc_rcv_qp], [$1_have_xrc=1])
fi fi
if test "$enable_openib_dynamic_sl" = "yes"; then
# We need ib_types.h file, which is installed with opensm-devel
# package. However, ib_types.h has a bad include directive,
# which will cause AC_CHECK_HEADER to fail.
# So instead, we will look for another file that is also
# installed as part of opensm-devel package and included in
# ib_types.h, but it doesn't include any other IB-related files.
AC_CHECK_HEADER([infiniband/complib/cl_types_osd.h],
[$1_have_dynamic_sl=1],
[AC_MSG_ERROR([opensm-devel package not found - please install it or disable dynamic SL support with \"--disable-openib-dynamic-sl\"])],
[])
fi
# Do we have a recent enough RDMA CM? Need to have the # Do we have a recent enough RDMA CM? Need to have the
# rdma_get_peer_addr (inline) function (originally appeared # rdma_get_peer_addr (inline) function (originally appeared
# in OFED v1.3). # in OFED v1.3).
@ -245,6 +268,15 @@ dnl fi
AC_MSG_RESULT([no]) AC_MSG_RESULT([no])
fi fi
AC_MSG_CHECKING([if dynamic SL is enabled])
AC_DEFINE_UNQUOTED([OMPI_ENABLE_DYNAMIC_SL], [$$1_have_dynamic_sl],
[Enable features required for dynamic SL support])
if test "1" = "$$1_have_dynamic_sl"; then
AC_MSG_RESULT([yes])
else
AC_MSG_RESULT([no])
fi
AC_MSG_CHECKING([if OpenFabrics RDMACM support is enabled]) AC_MSG_CHECKING([if OpenFabrics RDMACM support is enabled])
AC_DEFINE_UNQUOTED([OMPI_HAVE_RDMACM], [$$1_have_rdmacm], AC_DEFINE_UNQUOTED([OMPI_HAVE_RDMACM], [$$1_have_rdmacm],
[Whether RDMA CM is available or not]) [Whether RDMA CM is available or not])
@ -267,7 +299,11 @@ dnl fi
AC_MSG_RESULT([no]) AC_MSG_RESULT([no])
fi fi
CPPFLAGS="$ompi_check_openib_$1_save_CPPFLAGS" AS_IF([test -z "$ompi_check_openib_dir"],
[openib_include_dir="/usr/include"],
[openib_include_dir="$ompi_check_openib_dir/include"])
CPPFLAGS="$ompi_check_openib_$1_save_CPPFLAGS -I$openib_include_dir/infiniband"
LDFLAGS="$ompi_check_openib_$1_save_LDFLAGS" LDFLAGS="$ompi_check_openib_$1_save_LDFLAGS"
LIBS="$ompi_check_openib_$1_save_LIBS" LIBS="$ompi_check_openib_$1_save_LIBS"

Просмотреть файл

@ -52,6 +52,7 @@
BEGIN_C_DECLS BEGIN_C_DECLS
#define HAVE_XRC (1 == OMPI_HAVE_CONNECTX_XRC) #define HAVE_XRC (1 == OMPI_HAVE_CONNECTX_XRC)
#define ENABLE_DYNAMIC_SL (1 == OMPI_ENABLE_DYNAMIC_SL)
#define MCA_BTL_IB_LEAVE_PINNED 1 #define MCA_BTL_IB_LEAVE_PINNED 1
#define IB_DEFAULT_GID_PREFIX 0xfe80000000000000ll #define IB_DEFAULT_GID_PREFIX 0xfe80000000000000ll
@ -215,7 +216,9 @@ struct mca_btl_openib_component_t {
uint32_t ib_rnr_retry; uint32_t ib_rnr_retry;
uint32_t ib_max_rdma_dst_ops; uint32_t ib_max_rdma_dst_ops;
uint32_t ib_service_level; uint32_t ib_service_level;
uint32_t ib_path_rec_service_level; #if (ENABLE_DYNAMIC_SL)
uint32_t ib_path_record_service_level;
#endif
int32_t use_eager_rdma; int32_t use_eager_rdma;
int32_t eager_rdma_threshold; /**< After this number of msg, use RDMA for short messages, always */ int32_t eager_rdma_threshold; /**< After this number of msg, use RDMA for short messages, always */
int32_t eager_rdma_num; int32_t eager_rdma_num;

Просмотреть файл

@ -398,10 +398,14 @@ int btl_openib_register_mca_params(void)
} }
mca_btl_openib_component.ib_service_level = (uint32_t) ival; mca_btl_openib_component.ib_service_level = (uint32_t) ival;
CHECK(reg_int("ib_path_rec_service_level", NULL, "Enable getting InfiniBand service level from PathRecord " #if (ENABLE_DYNAMIC_SL)
"(must be >= 0, 0 = disabled, positive = try to get the service level from PathRecord)", CHECK(reg_int("ib_path_record_service_level", NULL,
"Enable getting InfiniBand service level from PathRecord "
"(must be >= 0, 0 = disabled, positive = try to get the "
"service level from PathRecord)",
0, &ival, REGINT_GE_ZERO)); 0, &ival, REGINT_GE_ZERO));
mca_btl_openib_component.ib_path_rec_service_level = (uint32_t) ival; mca_btl_openib_component.ib_path_record_service_level = (uint32_t) ival;
#endif
CHECK(reg_int("use_eager_rdma", NULL, "Use RDMA for eager messages " CHECK(reg_int("use_eager_rdma", NULL, "Use RDMA for eager messages "
"(-1 = use device default, 0 = do not use eager RDMA, " "(-1 = use device default, 0 = do not use eager RDMA, "

Просмотреть файл

@ -44,6 +44,10 @@
#include "connect/connect.h" #include "connect/connect.h"
#include "orte/util/show_help.h" #include "orte/util/show_help.h"
#if (ENABLE_DYNAMIC_SL)
#include <infiniband/iba/ib_types.h>
#endif
#ifdef HAVE_UNISTD_H #ifdef HAVE_UNISTD_H
#include <unistd.h> #include <unistd.h>
#endif #endif
@ -54,109 +58,17 @@ typedef enum {
ENDPOINT_CONNECT_ACK ENDPOINT_CONNECT_ACK
} connect_message_type_t; } connect_message_type_t;
#ifndef __WINDOWS__ #define SL_NOT_PRESENT 0xFF
#define PACK_SUFFIX __attribute__((packed))
#else
#define PACK_SUFFIX
#endif
#define SL_NOT_PRESENT 0x7F
#define MAX_GET_SL_REC_RETRIES 20 #define MAX_GET_SL_REC_RETRIES 20
#define GET_SL_REC_RETRIES_TIMEOUT_MS 2000000 #define GET_SL_REC_RETRIES_TIMEOUT_MS 2000000
#define IB_SA_QPN 1 #if (ENABLE_DYNAMIC_SL)
#define IB_GLOBAL_QKEY 0x80010000UL
#define IB_MGMT_BASE_VERSION 1
#define IB_MGMT_CLASS_SUBN_ADM 0x03
#define IB_MGMT_METHOD_GET 0x01
#define IB_SA_TID_GET_PATH_REC_0 0xCA000000UL
#define IB_SA_TID_GET_PATH_REC_1 0xBEEF0000UL
#define IB_PATH_REC_SL_MASK 0x000F
#define IB_SA_ATTR_PATH_REC 0x35
#define IB_SA_PATH_REC_DLID (1<<4)
#define IB_SA_PATH_REC_SLID (1<<5)
#ifdef __WINDOWS__
#pragma pack(push)
#pragma pack(1)
#endif
struct ib_mad_hdr {
uint8_t base_version;
uint8_t mgmt_class;
uint8_t class_version;
uint8_t method;
uint16_t status;
uint16_t class_spec;
uint32_t tid[2];
uint16_t attr_id;
uint16_t resv;
uint32_t attr_mod;
} PACK_SUFFIX;
struct ib_rmpp_hdr {
uint32_t raw[3];
} PACK_SUFFIX;
struct ib_sa_hdr {
uint32_t sm_key[2];
uint16_t reserved;
uint16_t attrib_offset;
uint32_t comp_mask[2];
} PACK_SUFFIX;
typedef union _ib_gid {
uint8_t raw[16];
struct _ib_gid_unicast {
uint64_t prefix;
uint64_t interface_id;
} PACK_SUFFIX unicast;
struct _ib_gid_multicast {
uint8_t header[2];
uint8_t raw_group_id[14];
} PACK_SUFFIX multicast;
} PACK_SUFFIX ib_gid_t;
struct ib_path_record {
uint64_t service_id;
ib_gid_t dgit;
ib_gid_t sgit;
uint16_t dlid;
uint16_t slid;
uint32_t hop_flow_raw;
uint8_t tclass;
uint8_t num_path;
uint16_t pkey;
uint8_t reserved1;
uint8_t qos_class_sl;
uint8_t mtu;
uint8_t rate;
uint32_t preference__packet_lifetime__packet_lifetime_selector;
uint32_t reserved2[35];
} PACK_SUFFIX;
union ib_sa_data {
struct ib_path_record path_record;
} PACK_SUFFIX;
struct ib_mad_sa {
struct ib_mad_hdr mad_hdr;
struct ib_rmpp_hdr rmpp_hdr;
struct ib_sa_hdr sa_hdr;
union ib_sa_data sa_data;
} PACK_SUFFIX;
#ifdef __WINDOWS__
#pragma pack(pop)
#endif
static struct mca_btl_openib_sa_qp_cache { static struct mca_btl_openib_sa_qp_cache {
/* There will be a MR with the one send and receive buffer together */ /* There will be a MR with the one send and receive buffer together */
/* The send buffer is first, the receive buffer is second */ /* The send buffer is first, the receive buffer is second */
/* The receive buffer in a UD queue pair needs room for the 40 byte GRH */ /* The receive buffer in a UD queue pair needs room for the 40 byte GRH */
/* The buffers are first in the structure for page alignment */ /* The buffers are first in the structure for page alignment */
char send_recv_buffer[sizeof(struct ib_mad_sa) * 2 + 40]; char send_recv_buffer[MAD_BLOCK_SIZE * 2 + 40];
struct mca_btl_openib_sa_qp_cache *next; struct mca_btl_openib_sa_qp_cache *next;
struct ibv_context *context; struct ibv_context *context;
char *device_name; char *device_name;
@ -168,8 +80,9 @@ static struct mca_btl_openib_sa_qp_cache {
struct ibv_pd *pd; struct ibv_pd *pd;
struct ibv_recv_wr rwr; struct ibv_recv_wr rwr;
struct ibv_sge rsge; struct ibv_sge rsge;
char sl_values[65536]; uint8_t sl_values[65536]; /* 64K */
} *sa_qp_cache = 0; } *sa_qp_cache = 0;
#endif
static int oob_priority = 50; static int oob_priority = 50;
static bool rml_recv_posted = false; static bool rml_recv_posted = false;
@ -198,27 +111,31 @@ static void rml_send_cb(int status, orte_process_name_t* endpoint,
static void rml_recv_cb(int status, orte_process_name_t* process_name, static void rml_recv_cb(int status, orte_process_name_t* process_name,
opal_buffer_t* buffer, orte_rml_tag_t tag, opal_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata); void* cbdata);
#if (ENABLE_DYNAMIC_SL)
static int init_ud_qp(struct ibv_context *context_arg, static int init_ud_qp(struct ibv_context *context_arg,
struct mca_btl_openib_sa_qp_cache *cache); struct mca_btl_openib_sa_qp_cache *cache);
static void init_sa_mad(struct mca_btl_openib_sa_qp_cache *cache, static void init_sa_mad(struct mca_btl_openib_sa_qp_cache *cache,
struct ib_mad_sa *sag, ib_sa_mad_t *sa_mad,
struct ibv_send_wr *swr, struct ibv_send_wr *swr,
struct ibv_sge *ssge, struct ibv_sge *ssge,
uint16_t lid, uint16_t lid,
uint16_t rem_lid); uint16_t rem_lid);
static int get_pathrecord_info(struct mca_btl_openib_sa_qp_cache *cache, static int get_pathrecord_info(struct mca_btl_openib_sa_qp_cache *cache,
struct ib_mad_sa *sag, ib_sa_mad_t *sa_mad,
struct ib_mad_sa *sar, ib_sa_mad_t *sar,
struct ibv_send_wr *swr, struct ibv_send_wr *swr,
uint16_t lid,
uint16_t rem_lid);
static int init_device(struct ibv_context *context_arg,
struct mca_btl_openib_sa_qp_cache *cache,
uint32_t port_num);
static int get_pathrecord_sl(struct ibv_context *context_arg,
uint32_t port_num,
uint16_t lid, uint16_t lid,
uint16_t rem_lid); uint16_t rem_lid);
static int init_device(struct ibv_context *context_arg,
struct mca_btl_openib_sa_qp_cache *cache,
uint32_t port_num);
static int get_pathrecord_sl(struct ibv_context *context_arg,
uint32_t port_num,
uint16_t lid,
uint16_t rem_lid);
static void free_sa_qp_cache(void);
#endif
/* /*
* The "component" struct -- the top-level function pointers for the * The "component" struct -- the top-level function pointers for the
@ -351,6 +268,33 @@ static int oob_module_start_connect(ompi_btl_openib_connect_base_module_t *cpc,
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
#if (ENABLE_DYNAMIC_SL)
/*
 * Release every entry of the SA QP cache list and reset the list head.
 *
 * For each entry the verbs objects it holds are destroyed (QP, AH, CQ,
 * MR, PD - in that order), then the device context that the cache
 * opened for itself with ibv_open_device() is closed, and finally the
 * device name string and the entry itself are freed.
 *
 * The list head is cleared up front because this function can run more
 * than once: it is called on the error path of qp_connect_all() and
 * again from oob_component_finalize().  Leaving sa_qp_cache pointing at
 * freed entries would make the second call (or a later cache lookup)
 * touch freed memory.
 *
 * Takes no arguments and returns nothing; return codes of the verbs
 * destroy calls are ignored (best-effort cleanup).
 */
static void free_sa_qp_cache(void)
{
    struct mca_btl_openib_sa_qp_cache *cache, *tmp;

    /* Detach the whole list first so the global never dangles. */
    cache = sa_qp_cache;
    sa_qp_cache = NULL;

    while (NULL != cache) {
        /* free cache data; free(NULL) is a no-op, so no guard needed */
        free(cache->device_name);

        if (NULL != cache->qp) {
            ibv_destroy_qp(cache->qp);
        }
        if (NULL != cache->ah) {
            ibv_destroy_ah(cache->ah);
        }
        if (NULL != cache->cq) {
            ibv_destroy_cq(cache->cq);
        }
        if (NULL != cache->mr) {
            ibv_dereg_mr(cache->mr);
        }
        if (NULL != cache->pd) {
            ibv_dealloc_pd(cache->pd);
        }
        /* The context must go last: every object above belongs to it. */
        if (NULL != cache->context) {
            ibv_close_device(cache->context);
        }

        /* Read the link before the entry is released. */
        tmp = cache->next;
        free(cache);
        cache = tmp;
    }
}
#endif
/* /*
* Component finalize function. Cleanup RML non-blocking receive. * Component finalize function. Cleanup RML non-blocking receive.
*/ */
@ -360,7 +304,9 @@ static int oob_component_finalize(void)
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, OMPI_RML_TAG_OPENIB); orte_rml.recv_cancel(ORTE_NAME_WILDCARD, OMPI_RML_TAG_OPENIB);
rml_recv_posted = false; rml_recv_posted = false;
} }
#if (ENABLE_DYNAMIC_SL)
free_sa_qp_cache();
#endif
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
@ -425,7 +371,7 @@ static int set_remote_info(mca_btl_base_endpoint_t* endpoint,
*/ */
static int qp_connect_all(mca_btl_openib_endpoint_t *endpoint) static int qp_connect_all(mca_btl_openib_endpoint_t *endpoint)
{ {
int i, rc; int i;
mca_btl_openib_module_t* openib_btl = mca_btl_openib_module_t* openib_btl =
(mca_btl_openib_module_t*)endpoint->endpoint_btl; (mca_btl_openib_module_t*)endpoint->endpoint_btl;
@ -446,18 +392,24 @@ static int qp_connect_all(mca_btl_openib_endpoint_t *endpoint)
attr.ah_attr.dlid = endpoint->rem_info.rem_lid; attr.ah_attr.dlid = endpoint->rem_info.rem_lid;
attr.ah_attr.src_path_bits = openib_btl->src_path_bits; attr.ah_attr.src_path_bits = openib_btl->src_path_bits;
attr.ah_attr.port_num = openib_btl->port_num; attr.ah_attr.port_num = openib_btl->port_num;
attr.ah_attr.sl = mca_btl_openib_component.ib_service_level;
/* if user enable ib_path_rec_service_level - dynamically get the sl from PathRecord */ #if (ENABLE_DYNAMIC_SL)
if (mca_btl_openib_component.ib_path_rec_service_level > 0) { /* if user enabled dynamic SL, get it from PathRecord */
rc = get_pathrecord_sl(qp->context, if (0 != mca_btl_openib_component.ib_path_record_service_level) {
int rc = get_pathrecord_sl(qp->context,
attr.ah_attr.port_num, attr.ah_attr.port_num,
openib_btl->lid, openib_btl->lid,
attr.ah_attr.dlid); attr.ah_attr.dlid);
if (OMPI_ERROR == rc) { if (OMPI_ERROR == rc) {
free_sa_qp_cache();
return OMPI_ERROR; return OMPI_ERROR;
} }
attr.ah_attr.sl = rc; attr.ah_attr.sl = rc;
} }
#else
attr.ah_attr.sl = mca_btl_openib_component.ib_service_level;
#endif
/* JMS to be filled in later dynamically */ /* JMS to be filled in later dynamically */
attr.ah_attr.static_rate = 0; attr.ah_attr.static_rate = 0;
@ -1056,6 +1008,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock); OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);
} }
#if (ENABLE_DYNAMIC_SL)
static int init_ud_qp(struct ibv_context *context_arg, static int init_ud_qp(struct ibv_context *context_arg,
struct mca_btl_openib_sa_qp_cache *cache) struct mca_btl_openib_sa_qp_cache *cache)
{ {
@ -1094,7 +1047,7 @@ static int init_ud_qp(struct ibv_context *context_arg,
memset(&mattr, 0, sizeof(mattr)); memset(&mattr, 0, sizeof(mattr));
mattr.qp_state = IBV_QPS_INIT; mattr.qp_state = IBV_QPS_INIT;
mattr.port_num = cache->port_num; mattr.port_num = cache->port_num;
mattr.qkey = IB_GLOBAL_QKEY; mattr.qkey = ntohl(IB_QP1_WELL_KNOWN_Q_KEY);
rc = ibv_modify_qp(cache->qp, &mattr, rc = ibv_modify_qp(cache->qp, &mattr,
IBV_QP_STATE | IBV_QP_STATE |
IBV_QP_PKEY_INDEX | IBV_QP_PKEY_INDEX |
@ -1128,61 +1081,75 @@ static int init_ud_qp(struct ibv_context *context_arg,
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
static void init_sa_mad(struct mca_btl_openib_sa_qp_cache *cache, static void init_sa_mad(struct mca_btl_openib_sa_qp_cache *cache,
struct ib_mad_sa *sag, ib_sa_mad_t *sa_mad,
struct ibv_send_wr *swr, struct ibv_send_wr *swr,
struct ibv_sge *ssge, struct ibv_sge *ssge,
uint16_t lid, uint16_t lid,
uint16_t rem_lid) uint16_t rem_lid)
{ {
memset(sag, 0, sizeof(*sag)); ib_path_rec_t *path_record = (ib_path_rec_t*)sa_mad->data;
memset(swr, 0, sizeof(*swr)); memset(swr, 0, sizeof(*swr));
memset(ssge, 0, sizeof(*ssge)); memset(ssge, 0, sizeof(*ssge));
sag->mad_hdr.base_version = IB_MGMT_BASE_VERSION; /* Initialize the standard MAD header. */
sag->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; memset(sa_mad, 0, MAD_BLOCK_SIZE);
sag->mad_hdr.class_version = 2; ib_mad_init_new((ib_mad_t *)sa_mad, /* mad header pointer */
sag->mad_hdr.method = IB_MGMT_METHOD_GET; IB_MCLASS_SUBN_ADM, /* management class */
sag->mad_hdr.attr_id = htons (IB_SA_ATTR_PATH_REC); (uint8_t) 2, /* version */
sag->mad_hdr.tid[0] = IB_SA_TID_GET_PATH_REC_0 + cache->qp->qp_num; IB_MAD_METHOD_GET, /* method */
sag->mad_hdr.tid[1] = IB_SA_TID_GET_PATH_REC_1 + rem_lid; hton64((uint64_t)lid << 48 | /* transaction ID */
sag->sa_hdr.comp_mask[1] = (uint64_t)rem_lid << 32 |
htonl(IB_SA_PATH_REC_DLID | IB_SA_PATH_REC_SLID); (uint64_t)cache->qp->qp_num << 8),
sag->sa_data.path_record.dlid = htons(rem_lid); IB_MAD_ATTR_PATH_RECORD, /* attribute ID */
sag->sa_data.path_record.slid = htons(lid); 0); /* attribute modifier */
sa_mad->comp_mask = IB_PR_COMPMASK_DLID | IB_PR_COMPMASK_SLID;
path_record->dlid = htons(rem_lid);
path_record->slid = htons(lid);
swr->sg_list = ssge; swr->sg_list = ssge;
swr->num_sge = 1; swr->num_sge = 1;
swr->opcode = IBV_WR_SEND; swr->opcode = IBV_WR_SEND;
swr->wr.ud.ah = cache->ah; swr->wr.ud.ah = cache->ah;
swr->wr.ud.remote_qpn = IB_SA_QPN; swr->wr.ud.remote_qpn = ntohl(IB_QP1);
swr->wr.ud.remote_qkey = IB_GLOBAL_QKEY; swr->wr.ud.remote_qkey = ntohl(IB_QP1_WELL_KNOWN_Q_KEY);
swr->send_flags = IBV_SEND_SIGNALED | IBV_SEND_SOLICITED; swr->send_flags = IBV_SEND_SIGNALED | IBV_SEND_SOLICITED;
ssge->addr = (uint64_t)(void *)sag; ssge->addr = (uint64_t)(void *)sa_mad;
ssge->length = sizeof(*sag); ssge->length = MAD_BLOCK_SIZE;
ssge->lkey = cache->mr->lkey; ssge->lkey = cache->mr->lkey;
} }
static int get_pathrecord_info(struct mca_btl_openib_sa_qp_cache *cache, static int get_pathrecord_info(struct mca_btl_openib_sa_qp_cache *cache,
struct ib_mad_sa *sag, ib_sa_mad_t *req_mad,
struct ib_mad_sa *sar, ib_sa_mad_t *resp_mad,
struct ibv_send_wr *swr, struct ibv_send_wr *swr,
uint16_t lid, uint16_t lid,
uint16_t rem_lid) uint16_t rem_lid)
{ {
struct ibv_send_wr *bswr; struct ibv_send_wr *bswr;
struct ibv_wc wc; struct ibv_wc wc;
struct timeval get_sl_rec_last_sent, get_sl_rec_last_poll; struct timeval get_sl_rec_last_sent, get_sl_rec_last_poll;
struct ibv_recv_wr *brwr; struct ibv_recv_wr *brwr;
int got_sl_value, get_sl_rec_retries, rc, ne, i; int got_sl_value, get_sl_rec_retries, rc, ne, i;
ib_path_rec_t *req_path_record = ib_sa_mad_get_payload_ptr(req_mad);
ib_path_rec_t *resp_path_record = ib_sa_mad_get_payload_ptr(resp_mad);
got_sl_value = 0; got_sl_value = 0;
get_sl_rec_retries = 0; get_sl_rec_retries = 0;
rc = ibv_post_recv(cache->qp, &(cache->rwr), &brwr);
if (0 != rc) {
BTL_ERROR(("error posting receive on QP [0x%x] errno says: %s [%d]",
cache->qp->qp_num, strerror(errno), errno));
return OMPI_ERROR;
}
while (0 == got_sl_value) { while (0 == got_sl_value) {
rc = ibv_post_send(cache->qp, swr, &bswr); rc = ibv_post_send(cache->qp, swr, &bswr);
if (0 != rc) { if (0 != rc) {
BTL_ERROR(("error posing send on QP[%x] errno says: %s [%d]", BTL_ERROR(("error posting send on QP [0x%x] errno says: %s [%d]",
cache->qp->qp_num, strerror(errno), errno)); cache->qp->qp_num, strerror(errno), errno));
return OMPI_ERROR; return OMPI_ERROR;
} }
@ -1190,25 +1157,23 @@ static int get_pathrecord_info(struct mca_btl_openib_sa_qp_cache *cache,
while (0 == got_sl_value) { while (0 == got_sl_value) {
ne = ibv_poll_cq(cache->cq, 1, &wc); ne = ibv_poll_cq(cache->cq, 1, &wc);
if (ne > 0 if (ne > 0 &&
&& wc.status == IBV_WC_SUCCESS IBV_WC_SUCCESS == wc.status &&
&& wc.opcode == IBV_WC_RECV IBV_WC_RECV == wc.opcode &&
&& wc.byte_len >= sizeof(*sar) wc.byte_len >= MAD_BLOCK_SIZE &&
&& sar->mad_hdr.tid[0] == sag->mad_hdr.tid[0] resp_mad->trans_id == req_mad->trans_id) {
&& sar->mad_hdr.tid[1] == sag->mad_hdr.tid[1]) { if (0 == resp_mad->status &&
if (0 == sar->mad_hdr.status req_path_record->slid == htons(lid) &&
&& sar->sa_data.path_record.slid == htons(lid) req_path_record->dlid == htons(rem_lid)) {
&& sar->sa_data.path_record.dlid == htons(rem_lid)) {
/* Everything matches, so we have the desired SL */ /* Everything matches, so we have the desired SL */
cache->sl_values[rem_lid] = cache->sl_values[rem_lid] = ib_path_rec_sl(resp_path_record);
sar->sa_data.path_record.qos_class_sl & IB_PATH_REC_SL_MASK;
got_sl_value = 1; /* still must repost recieve buf */ got_sl_value = 1; /* still must repost recieve buf */
} else { } else {
/* Probably bad status, unlikely bad lid match. We will */ /* Probably bad status, unlikely bad lid match. We will */
/* ignore response and let it time out so that we do a */ /* ignore response and let it time out so that we do a */
/* retry, but after a delay. We must make a new TID so */ /* retry, but after a delay. We must make a new TID so */
/* the SM doesn't see it as the same request. */ /* the SM doesn't see it as the same request. */
sag->mad_hdr.tid[1] += 0x10000; req_mad->trans_id += hton64(1);
} }
rc = ibv_post_recv(cache->qp, &(cache->rwr), &brwr); rc = ibv_post_recv(cache->qp, &(cache->rwr), &brwr);
if (0 != rc) { if (0 != rc) {
@ -1249,7 +1214,6 @@ static int init_device(struct ibv_context *context_arg,
{ {
struct ibv_ah_attr aattr; struct ibv_ah_attr aattr;
struct ibv_port_attr pattr; struct ibv_port_attr pattr;
struct ibv_recv_wr *brwr;
int rc; int rc;
cache->context = ibv_open_device(context_arg->device); cache->context = ibv_open_device(context_arg->device);
@ -1315,16 +1279,10 @@ static int init_device(struct ibv_context *context_arg,
cache->rwr.sg_list = &(cache->rsge); cache->rwr.sg_list = &(cache->rsge);
memset(&(cache->rsge), 0, sizeof(cache->rsge)); memset(&(cache->rsge), 0, sizeof(cache->rsge));
cache->rsge.addr = (uint64_t)(void *) cache->rsge.addr = (uint64_t)(void *)
(cache->send_recv_buffer + sizeof(struct ib_mad_sa)); (cache->send_recv_buffer + MAD_BLOCK_SIZE);
cache->rsge.length = sizeof(struct ib_mad_sa) + 40; cache->rsge.length = MAD_BLOCK_SIZE + 40;
cache->rsge.lkey = cache->mr->lkey; cache->rsge.lkey = cache->mr->lkey;
rc = ibv_post_recv(cache->qp, &(cache->rwr), &brwr);
if (0 != rc) {
BTL_ERROR(("error posing receive on QP[%x] errno says: %s [%d]",
cache->qp->qp_num, strerror(errno), errno));
return OMPI_ERROR;
}
return 0; return 0;
} }
@ -1334,7 +1292,7 @@ static int get_pathrecord_sl(struct ibv_context *context_arg,
uint16_t rem_lid) uint16_t rem_lid)
{ {
struct ibv_send_wr swr; struct ibv_send_wr swr;
struct ib_mad_sa *sag, *sar; ib_sa_mad_t *req_mad, *resp_mad;
struct ibv_sge ssge; struct ibv_sge ssge;
struct mca_btl_openib_sa_qp_cache *cache; struct mca_btl_openib_sa_qp_cache *cache;
long page_size = sysconf(_SC_PAGESIZE); long page_size = sysconf(_SC_PAGESIZE);
@ -1342,8 +1300,8 @@ static int get_pathrecord_sl(struct ibv_context *context_arg,
/* search for a cached item */ /* search for a cached item */
for (cache = sa_qp_cache; cache; cache = cache->next) { for (cache = sa_qp_cache; cache; cache = cache->next) {
if (strcmp(cache->device_name, if (0 == strcmp(cache->device_name,
ibv_get_device_name(context_arg->device)) == 0 ibv_get_device_name(context_arg->device))
&& cache->port_num == port_num) { && cache->port_num == port_num) {
break; break;
} }
@ -1365,15 +1323,15 @@ static int get_pathrecord_sl(struct ibv_context *context_arg,
/* if the destination lid SL value is not in the cache, go get it */ /* if the destination lid SL value is not in the cache, go get it */
if (SL_NOT_PRESENT == cache->sl_values[rem_lid]) { if (SL_NOT_PRESENT == cache->sl_values[rem_lid]) {
/* sag is first buffer, where we build the SA Get request to send */ /* sa_mad is first buffer, where we build the SA Get request to send */
sag = (struct ib_mad_sa *)(cache->send_recv_buffer); req_mad = (ib_sa_mad_t *)(cache->send_recv_buffer);
init_sa_mad(cache, sag, &swr, &ssge, lid, rem_lid); init_sa_mad(cache, req_mad, &swr, &ssge, lid, rem_lid);
/* sar is the receive buffer (40 byte GRH) */ /* resp_mad is the receive buffer (40 byte offset is for GRH) */
sar = (struct ib_mad_sa *)(cache->send_recv_buffer + sizeof(struct ib_mad_sa) + 40); resp_mad = (ib_sa_mad_t *)(cache->send_recv_buffer + MAD_BLOCK_SIZE + 40);
rc = get_pathrecord_info(cache, sag, sar, &swr, lid, rem_lid); rc = get_pathrecord_info(cache, req_mad, resp_mad, &swr, lid, rem_lid);
if (0 != rc) { if (0 != rc) {
return rc; return rc;
} }
@ -1382,3 +1340,4 @@ static int get_pathrecord_sl(struct ibv_context *context_arg,
/* now all we do is send back the value laying around */ /* now all we do is send back the value laying around */
return cache->sl_values[rem_lid]; return cache->sl_values[rem_lid];
} }
#endif