
Initial XRC support by Mellanox.

This commit was SVN r16787.
This commit is contained in:
Gleb Natapov 2007-11-28 07:18:59 +00:00
parent b49788c499
commit bd47da4699
17 changed files with 1761 additions and 90 deletions


@ -102,7 +102,7 @@ AC_DEFUN([OMPI_CHECK_OPENIB],[
AS_IF([test "$ompi_check_openib_happy" = "yes"],
[AC_CHECK_DECLS([IBV_EVENT_CLIENT_REREGISTER], [], [],
[#include <infiniband/verbs.h>])
AC_CHECK_FUNCS([ibv_get_device_list ibv_resize_cq])])
AC_CHECK_FUNCS([ibv_get_device_list ibv_resize_cq ibv_open_xrc_domain])])
CPPFLAGS="$ompi_check_openib_$1_save_CPPFLAGS"
LDFLAGS="$ompi_check_openib_$1_save_LDFLAGS"


@ -735,4 +735,22 @@ AC_DEFINE_UNQUOTED([OPAL_IDENT_STRING], ["$with_ident_string"],
[ident string for Open MPI])
AC_MSG_RESULT([$with_ident_string])
#
# ConnectX XRC support
#
AC_MSG_CHECKING([if ConnectX XRC support should be enabled])
AC_ARG_ENABLE([connectx-xrc],
[AC_HELP_STRING([--enable-connectx-xrc],
[Enable features required for ConnectX XRC support. If you don't have InfiniBand ConnectX adapters you may disable ConnectX XRC support. If you don't know which InfiniBand adapter is installed on your cluster, leave it enabled (default: enabled)])])
if test "$enable_connectx_xrc" = "no" ; then
AC_MSG_RESULT([no])
ompi_want_connectx_xrc=0
else
AC_MSG_RESULT([yes])
ompi_want_connectx_xrc=1
fi
AC_DEFINE_UNQUOTED([OMPI_ENABLE_CONNECTX_XRC_SUPPORT],
[$ompi_want_connectx_xrc],
[Enable features required for ConnectX XRC support])
])
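Note: as the help string above indicates, XRC support defaults to enabled; passing --disable-connectx-xrc at configure time sets ompi_want_connectx_xrc to 0, so OMPI_ENABLE_CONNECTX_XRC_SUPPORT is defined to 0 and the XRC code paths are compiled out.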


@ -49,10 +49,14 @@ sources = \
btl_openib_ini.h \
btl_openib_async.c \
btl_openib_async.h \
btl_openib_xrc.c \
btl_openib_xrc.h \
connect/base.h \
connect/btl_openib_connect_base.c \
connect/btl_openib_connect_oob.c \
connect/btl_openib_connect_oob.h \
connect/btl_openib_connect_xoob.c \
connect/btl_openib_connect_xoob.h \
connect/btl_openib_connect_rdma_cm.c \
connect/btl_openib_connect_rdma_cm.h \
connect/connect.h


@ -34,6 +34,7 @@
#include "btl_openib_frag.h"
#include "btl_openib_proc.h"
#include "btl_openib_endpoint.h"
#include "btl_openib_xrc.h"
#include "ompi/datatype/convertor.h"
#include "ompi/datatype/datatype.h"
#include "ompi/datatype/dt_arch.h"
@ -155,6 +156,17 @@ int mca_btl_openib_add_procs(
}
}
#if HAVE_XRC
if(MCA_BTL_XRC_ENABLED &&
NULL == mca_btl_openib_component.ib_addr_table.ht_table) {
if(OPAL_SUCCESS != opal_hash_table_init(
&mca_btl_openib_component.ib_addr_table, nprocs)) {
BTL_ERROR(("XRC internal error. Failed to allocate ib_table\n"));
return OMPI_ERROR;
}
}
#endif
for(i = 0; i < (int) nprocs; i++) {
struct ompi_proc_t* ompi_proc = ompi_procs[i];
mca_btl_openib_proc_t* ib_proc;
@ -210,6 +222,21 @@ int mca_btl_openib_add_procs(
endpoint->use_eager_rdma = openib_btl->hca->use_eager_rdma &
mca_btl_openib_component.use_eager_rdma;
endpoint->subnet_id = openib_btl->port_info.subnet_id;
#if HAVE_XRC
if (MCA_BTL_XRC_ENABLED) {
/* Pasha: now we need to push the subnet and lid to some global table in the component */
rc = mca_btl_openib_ib_address_add_new(
ib_proc->proc_ports[ib_proc->port_touse].subnet_id,
ib_proc->proc_ports[ib_proc->port_touse].lid, endpoint);
if (OMPI_SUCCESS != rc ) {
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
return OMPI_ERROR;
}
/* cache the remote LID on the endpoint */
endpoint->lid = ib_proc->proc_ports[ib_proc->port_touse].lid;
ib_proc->port_touse++;
}
#endif
rc = mca_btl_openib_proc_insert(ib_proc, endpoint);
if(rc != OMPI_SUCCESS) {
OBJ_RELEASE(endpoint);
@ -263,6 +290,26 @@ static int create_srq(mca_btl_openib_module_t *openib_btl)
return OMPI_ERROR;
}
}
#if HAVE_XRC
if(BTL_OPENIB_QP_TYPE_XRC(qp)) {
int prio = (mca_btl_openib_component.qp_infos[qp].size <=
mca_btl_openib_component.eager_limit) ?
BTL_OPENIB_HP_CQ : BTL_OPENIB_LP_CQ;
attr.attr.max_wr = mca_btl_openib_component.qp_infos[qp].rd_num +
mca_btl_openib_component.qp_infos[qp].u.xrc_qp.sd_max;
attr.attr.max_sge = mca_btl_openib_component.ib_sg_list_size;
openib_btl->qps[qp].u.xrc_qp.rd_posted = 0;
openib_btl->qps[qp].u.xrc_qp.xrc =
ibv_create_xrc_srq(openib_btl->hca->ib_pd,
openib_btl->hca->xrc_domain,
openib_btl->hca->ib_cq[prio],&attr);
if (NULL == openib_btl->qps[qp].u.xrc_qp.xrc) {
show_init_error(__FILE__, __LINE__, "ibv_create_srq",
ibv_get_device_name(openib_btl->hca->ib_dev));
return OMPI_ERROR;
}
}
#endif
}
return OMPI_SUCCESS;
@ -757,6 +804,16 @@ static int mca_btl_finalize_hca(struct mca_btl_openib_hca_t *hca)
BTL_VERBOSE(("Failed to release mpool"));
return OMPI_ERROR;
}
#if HAVE_XRC
if (MCA_BTL_XRC_ENABLED) {
if (OMPI_SUCCESS != mca_btl_openib_close_xrc_domain(hca)) {
BTL_ERROR(("XRC Internal error. Failed to close xrc domain"));
return OMPI_ERROR;
}
}
#endif
if (ibv_dealloc_pd(hca->ib_pd)) {
BTL_VERBOSE(("Warning! Failed to release PD"));
return OMPI_ERROR;
@ -823,20 +880,37 @@ int mca_btl_openib_finalize(struct mca_btl_base_module_t* btl)
}
/* Release SRQ resources */
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
if(BTL_OPENIB_QP_TYPE_SRQ(qp)){
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
switch (BTL_OPENIB_QP_TYPE(qp)) {
case MCA_BTL_OPENIB_SRQ_QP:
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
if (ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)){
BTL_VERBOSE(("Failed to close SRQ %d", qp));
rc = OMPI_ERROR;
}
/* Destroy free lists */
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
if (ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)){
BTL_VERBOSE(("Failed to close SRQ %d", qp));
return OMPI_ERROR;
}
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
break;
case MCA_BTL_OPENIB_XRC_QP:
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
&openib_btl->qps[qp].u.xrc_qp.pending_frags[0]);
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
&openib_btl->qps[qp].u.xrc_qp.pending_frags[1]);
if (ibv_destroy_srq(openib_btl->qps[qp].u.xrc_qp.xrc)) {
BTL_VERBOSE(("Failed to close SRQ %d", qp));
return OMPI_ERROR;
}
OBJ_DESTRUCT(&openib_btl->qps[qp].u.xrc_qp.pending_frags[0]);
OBJ_DESTRUCT(&openib_btl->qps[qp].u.xrc_qp.pending_frags[1]);
break;
case MCA_BTL_OPENIB_PP_QP:
/* Nothing to do */
break;
default:
BTL_VERBOSE(("Unknow qp type %d", qp));
break;
}
/* Destroy free lists */
OBJ_DESTRUCT(&openib_btl->qps[qp].send_free);
@ -962,6 +1036,10 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl,
(uint64_t)descriptor->des_src->seg_addr.pval;
to_com_frag(frag)->sg_entry.length = descriptor->des_src->seg_len;
to_com_frag(frag)->endpoint = ep;
#if HAVE_XRC
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp))
frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num;
#endif
descriptor->order = qp;
/* Setting opcode on a frag constructor isn't enough since prepare_src
@ -1037,7 +1115,11 @@ int mca_btl_openib_get(mca_btl_base_module_t* btl,
(uint64_t)descriptor->des_dst->seg_addr.pval;
to_com_frag(frag)->sg_entry.length = descriptor->des_dst->seg_len;
to_com_frag(frag)->endpoint = ep;
#if HAVE_XRC
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp))
frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num;
#endif
descriptor->order = qp;
if(ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr))
return OMPI_ERROR;


@ -49,6 +49,8 @@
BEGIN_C_DECLS
#define HAVE_XRC (defined(HAVE_IBV_OPEN_XRC_DOMAIN) && (1 == OMPI_ENABLE_CONNECTX_XRC_SUPPORT))
#define MCA_BTL_IB_LEAVE_PINNED 1
#define IB_DEFAULT_GID_PREFIX 0xfe80000000000000ll
@ -59,7 +61,8 @@ BEGIN_C_DECLS
typedef enum {
MCA_BTL_OPENIB_PP_QP,
MCA_BTL_OPENIB_SRQ_QP
MCA_BTL_OPENIB_SRQ_QP,
MCA_BTL_OPENIB_XRC_QP
} mca_btl_openib_qp_type_t;
struct mca_btl_openib_pp_qp_info_t {
@ -71,6 +74,10 @@ struct mca_btl_openib_srq_qp_info_t {
int32_t sd_max;
}; typedef struct mca_btl_openib_srq_qp_info_t mca_btl_openib_srq_qp_info_t;
struct mca_btl_openib_xrc_qp_info_t {
int32_t sd_max;
}; typedef struct mca_btl_openib_xrc_qp_info_t mca_btl_openib_xrc_qp_info_t;
struct mca_btl_openib_qp_info_t {
mca_btl_openib_qp_type_t type;
size_t size;
@ -79,6 +86,7 @@ struct mca_btl_openib_qp_info_t {
union {
mca_btl_openib_pp_qp_info_t pp_qp;
mca_btl_openib_srq_qp_info_t srq_qp;
mca_btl_openib_xrc_qp_info_t xrc_qp;
} u;
}; typedef struct mca_btl_openib_qp_info_t mca_btl_openib_qp_info_t;
@ -87,6 +95,8 @@ struct mca_btl_openib_qp_info_t {
(BTL_OPENIB_QP_TYPE(Q) == MCA_BTL_OPENIB_PP_QP)
#define BTL_OPENIB_QP_TYPE_SRQ(Q) \
(BTL_OPENIB_QP_TYPE(Q) == MCA_BTL_OPENIB_SRQ_QP)
#define BTL_OPENIB_QP_TYPE_XRC(Q) \
(BTL_OPENIB_QP_TYPE(Q) == MCA_BTL_OPENIB_XRC_QP)
struct mca_btl_openib_component_t {
mca_btl_base_component_1_0_1_t super; /**< base BTL component */
@ -129,8 +139,11 @@ struct mca_btl_openib_component_t {
uint8_t num_pp_qps; /**< number of pp qp's */
uint8_t num_srq_qps; /**< number of srq qp's */
uint8_t num_xrc_qps; /**< number of xrc qp's */
uint8_t num_qps; /**< total number of qp's */
opal_hash_table_t ib_addr_table; /**< used only for xrc.hash-table that
keeps table of all lids/subnets */
mca_btl_openib_qp_info_t* qp_infos;
size_t eager_limit; /**< Eager send limit of first fragment, in Bytes */
@ -208,18 +221,31 @@ struct mca_btl_openib_port_info_t {
uint8_t padding[4];
#endif
uint64_t subnet_id;
#if HAVE_XRC
uint16_t lid; /* used only in xrc */
#endif
};
typedef struct mca_btl_openib_port_info_t mca_btl_openib_port_info_t;
#if HAVE_XRC
#define MCA_BTL_OPENIB_LID_NTOH(hdr) (hdr).lid = ntohs((hdr).lid)
#define MCA_BTL_OPENIB_LID_HTON(hdr) (hdr).lid = htons((hdr).lid)
#else
#define MCA_BTL_OPENIB_LID_NTOH(hdr)
#define MCA_BTL_OPENIB_LID_HTON(hdr)
#endif
#define MCA_BTL_OPENIB_PORT_INFO_NTOH(hdr) \
do { \
(hdr).mtu = ntohl((hdr).mtu); \
(hdr).subnet_id = ntoh64((hdr).subnet_id); \
MCA_BTL_OPENIB_LID_NTOH(hdr); \
} while (0)
#define MCA_BTL_OPENIB_PORT_INFO_HTON(hdr) \
do { \
(hdr).mtu = htonl((hdr).mtu); \
(hdr).subnet_id = hton64((hdr).subnet_id); \
MCA_BTL_OPENIB_LID_HTON(hdr); \
} while (0)
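/* Note: the lid field (and the byte swapping added above) exists only when
 * HAVE_XRC is set, so the port info exchanged over OOB does not carry extra
 * bytes in non-XRC builds. */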
struct mca_btl_openib_hca_t {
@ -246,6 +272,10 @@ struct mca_btl_openib_hca_t {
#if OMPI_HAVE_THREADS
volatile bool got_fatal_event;
#endif
#if HAVE_XRC
struct ibv_xrc_domain *xrc_domain;
int xrc_fd;
#endif
};
typedef struct mca_btl_openib_hca_t mca_btl_openib_hca_t;
@ -261,12 +291,20 @@ struct mca_btl_openib_module_srq_qp_t {
opal_list_t pending_frags[2]; /**< list of high/low prio frags */
}; typedef struct mca_btl_openib_module_srq_qp_t mca_btl_openib_module_srq_qp_t;
struct mca_btl_openib_module_xrc_qp_t {
struct ibv_srq *xrc;
int32_t rd_posted;
int32_t sd_credits;
opal_list_t pending_frags[2];
}; typedef struct mca_btl_openib_module_xrc_qp_t mca_btl_openib_module_xrc_qp_t;
struct mca_btl_openib_module_qp_t {
ompi_free_list_t send_free; /**< free lists of send buffer descriptors */
ompi_free_list_t recv_free; /**< free lists of receive buffer descriptors */
union {
mca_btl_openib_module_pp_qp_t pp_qp;
mca_btl_openib_module_srq_qp_t srq_qp;
mca_btl_openib_module_xrc_qp_t xrc_qp;
} u;
}; typedef struct mca_btl_openib_module_qp_t mca_btl_openib_module_qp_t;
@ -565,6 +603,55 @@ static inline int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl,
return OMPI_SUCCESS;
}
/**
* Post receive descriptors to an XRC shared receive queue
*
* @param openib_btl (IN) BTL module
* @param additional (IN) Additional Bytes to reserve
* @param qp (IN) Index of the XRC queue to post to
* @return OMPI_SUCCESS or failure status
*/
static inline int mca_btl_openib_post_xrr(mca_btl_openib_module_t* openib_btl,
const int additional,
const int qp)
{
assert(BTL_OPENIB_QP_TYPE_XRC(qp));
OPAL_THREAD_LOCK(&openib_btl->ib_lock);
if(openib_btl->qps[qp].u.xrc_qp.rd_posted <=
mca_btl_openib_component.qp_infos[qp].rd_low + additional &&
openib_btl->qps[qp].u.xrc_qp.rd_posted <
mca_btl_openib_component.qp_infos[qp].rd_num) {
int rc;
int32_t i, num_post = mca_btl_openib_component.qp_infos[qp].rd_num -
openib_btl->qps[qp].u.xrc_qp.rd_posted;
struct ibv_recv_wr *bad_wr;
ompi_free_list_t *free_list;
free_list = &openib_btl->qps[qp].recv_free;
for(i = 0; i < num_post; i++) {
ompi_free_list_item_t* item;
OMPI_FREE_LIST_WAIT(free_list, item, rc);
to_base_frag(item)->base.order = qp;
to_com_frag(item)->endpoint = NULL;
if(ibv_post_srq_recv(openib_btl->qps[qp].u.xrc_qp.xrc,
&to_recv_frag(item)->rd_desc, &bad_wr)) {
BTL_ERROR(("error posting receive descriptors to shared "
"receive queue: %s", strerror(errno)));
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
return OMPI_ERROR;
}
}
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.xrc_qp.rd_posted, num_post);
}
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
return OMPI_SUCCESS;
}
#define BTL_OPENIB_RDMA_QP(QP) \
((QP) == mca_btl_openib_component.rdma_qp)


@ -59,6 +59,7 @@
#include "btl_openib_proc.h"
#include "btl_openib_ini.h"
#include "btl_openib_mca.h"
#include "btl_openib_xrc.h"
#if OMPI_HAVE_THREADS
#include "btl_openib_async.h"
#endif
@ -369,6 +370,13 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_hca_t *hca,
/* store the subnet for multi-nic support */
openib_btl->port_info.subnet_id = subnet_id;
openib_btl->port_info.mtu = hca->mtu;
#if HAVE_XRC
/* This code is protected with ifdef because we don't want to send
* extra bytes during OOB */
if(MCA_BTL_XRC_ENABLED) {
openib_btl->port_info.lid = lid;
}
#endif
openib_btl->ib_reg[MCA_BTL_TAG_BTL].cbfunc = btl_openib_control;
openib_btl->ib_reg[MCA_BTL_TAG_BTL].cbdata = NULL;
@ -510,6 +518,21 @@ static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev)
ret = OMPI_SUCCESS;
goto close_hca;
}
#if HAVE_XRC
/* If the user configured XRC QPs and this device does not support XRC,
* ignore this HCA. Another HCA may still provide XRC support.
*/
if (!(hca->ib_dev_attr.device_cap_flags & IBV_DEVICE_XRC) &&
mca_btl_openib_component.num_xrc_qps > 0) {
opal_show_help("help-mpi-btl-openib.txt",
"XRC on device without XRC support", true,
mca_btl_openib_component.num_xrc_qps,
ibv_get_device_name(ib_dev),
orte_system_info.nodename);
ret = OMPI_SUCCESS;
goto close_hca;
}
#endif
/* Load in vendor/part-specific HCA parameters. Note that even if
we don't find values for this vendor/part, "values" will be set
indicating that it does not have good values */
@ -581,6 +604,13 @@ static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev)
goto close_hca;
}
if (MCA_BTL_XRC_ENABLED) {
if (OMPI_SUCCESS != mca_btl_openib_open_xrc_domain(hca)) {
BTL_ERROR(("XRC Internal error. Failed to open xrc domain"));
goto dealloc_pd;
}
}
mpool_resources.reg_data = (void*)hca;
mpool_resources.sizeof_reg = sizeof(mca_btl_openib_reg_t);
mpool_resources.register_mem = openib_reg_mr;
@ -591,7 +621,7 @@ static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev)
if(NULL == hca->mpool){
BTL_ERROR(("error creating IB memory pool for %s errno says %s\n",
ibv_get_device_name(ib_dev), strerror(errno)));
goto dealloc_pd;
goto close_xrc_domain;
}
#if OMPI_ENABLE_PROGRESS_THREADS == 1
@ -682,6 +712,12 @@ mpool_destroy:
#endif
#endif
mca_mpool_base_module_destroy(hca->mpool);
close_xrc_domain:
if (MCA_BTL_XRC_ENABLED) {
if (OMPI_SUCCESS != mca_btl_openib_close_xrc_domain(hca)) {
BTL_ERROR(("XRC Internal error. Failed to close xrc domain"));
}
}
dealloc_pd:
ibv_dealloc_pd(hca->ib_pd);
close_hca:
@ -778,6 +814,15 @@ static int finish_btl_init(mca_btl_openib_module_t *openib_btl)
openib_btl->qps[qp].u.srq_qp.sd_credits =
mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max;
}
if(BTL_OPENIB_QP_TYPE_XRC(qp)) {
OBJ_CONSTRUCT(&openib_btl->qps[qp].u.xrc_qp.pending_frags[0],
opal_list_t);
OBJ_CONSTRUCT(&openib_btl->qps[qp].u.xrc_qp.pending_frags[1],
opal_list_t);
openib_btl->qps[qp].u.xrc_qp.sd_credits =
mca_btl_openib_component.qp_infos[qp].u.xrc_qp.sd_max;
}
init_data = malloc(sizeof(mca_btl_openib_frag_init_data_t));
/* Initialize pool of send fragments */
@ -876,6 +921,11 @@ btl_openib_component_init(int *num_btl_modules,
goto no_btls;
}
if(MCA_BTL_XRC_ENABLED) {
OBJ_CONSTRUCT(&mca_btl_openib_component.ib_addr_table,
opal_hash_table_t);
}
/* If we want fork support, try to enable it */
#ifdef HAVE_IBV_FORK_INIT
if (0 != mca_btl_openib_component.want_fork_support) {
@ -1053,6 +1103,9 @@ btl_openib_component_init(int *num_btl_modules,
/* If we fail early enough in the setup, we just modex around that
there are no openib BTL's in this process and return NULL. */
if (MCA_BTL_XRC_ENABLED)
OBJ_DESTRUCT(&mca_btl_openib_component.ib_addr_table);
mca_btl_openib_component.ib_num_btls = 0;
btl_openib_modex_send();
return NULL;
@ -1143,7 +1196,11 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
mca_btl_openib_module_t *btl = ep->endpoint_btl;
OPAL_THREAD_ADD32(&btl->qps[rqp].u.srq_qp.rd_posted, -1);
mca_btl_openib_post_srr(btl, 0, rqp);
} else {
} else if(BTL_OPENIB_QP_TYPE_XRC(rqp)) {
mca_btl_openib_module_t *btl = ep->endpoint_btl;
OPAL_THREAD_ADD32(&btl->qps[rqp].u.xrc_qp.rd_posted, -1);
mca_btl_openib_post_xrr(openib_btl, 0, rqp);
} else { /* PP QP */
if(OPAL_UNLIKELY(is_credit_msg))
OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.cm_received, 1);
else
@ -1349,13 +1406,22 @@ static void progress_pending_frags_srq(mca_btl_openib_module_t* openib_btl,
opal_list_item_t *frag;
int i;
assert(BTL_OPENIB_QP_TYPE_SRQ(qp));
assert(BTL_OPENIB_QP_TYPE_SRQ(qp) || BTL_OPENIB_QP_TYPE_XRC(qp));
for(i = 0; i < 2; i++) {
while(openib_btl->qps[qp].u.srq_qp.sd_credits > 0) {
opal_list_t *pending;
int32_t *sd_credits;
if (BTL_OPENIB_QP_TYPE_SRQ(qp)) {
pending = &openib_btl->qps[qp].u.srq_qp.pending_frags[i];
sd_credits = &openib_btl->qps[qp].u.srq_qp.sd_credits;
} else {
pending = &openib_btl->qps[qp].u.xrc_qp.pending_frags[i];
sd_credits = &openib_btl->qps[qp].u.xrc_qp.sd_credits;
}
while(*sd_credits > 0) {
OPAL_THREAD_LOCK(&openib_btl->ib_lock);
frag = opal_list_remove_first(
&openib_btl->qps[qp].u.srq_qp.pending_frags[i]);
frag = opal_list_remove_first(pending);
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
if(NULL == frag)
@ -1551,9 +1617,12 @@ static int btl_openib_module_progress(mca_btl_openib_hca_t* hca)
/* return send wqe */
qp_put_wqe(endpoint, qp);
if(IBV_WC_SEND == wc.opcode && BTL_OPENIB_QP_TYPE_SRQ(qp)) {
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits,
1);
if(IBV_WC_SEND == wc.opcode && !BTL_OPENIB_QP_TYPE_PP(qp)) {
int32_t *sd_credits = BTL_OPENIB_QP_TYPE_SRQ(qp) ?
&openib_btl->qps[qp].u.srq_qp.sd_credits :
&openib_btl->qps[qp].u.xrc_qp.sd_credits;
OPAL_THREAD_ADD32(sd_credits, 1);
/* new SRQ credit available. Try to progress pending frags*/
progress_pending_frags_srq(openib_btl, qp);
}


@ -13,6 +13,7 @@
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2006-2007 Mellanox Technologies, Inc. All rights reserved.
*
* $COPYRIGHT$
*
@ -42,6 +43,7 @@
#include "btl_openib_endpoint.h"
#include "btl_openib_proc.h"
#include "btl_openib_frag.h"
#include "btl_openib_xrc.h"
static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint);
static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint);
@ -93,14 +95,18 @@ static int post_send(mca_btl_openib_endpoint_t *ep,
sizeof(mca_btl_openib_footer_t);
sr_desc->wr.rdma.remote_addr -= sg->length;
} else {
if(BTL_OPENIB_QP_TYPE_SRQ(qp)) {
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
sr_desc->opcode = IBV_WR_SEND;
} else {
sr_desc->opcode = IBV_WR_SEND_WITH_IMM;
sr_desc->imm_data = ep->rem_info.rem_index;
} else {
sr_desc->opcode = IBV_WR_SEND;
}
}
#if HAVE_XRC
if(BTL_OPENIB_QP_TYPE_XRC(qp))
sr_desc->xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
#endif
assert(sg->addr == (uint64_t)frag->hdr);
return ibv_post_send(ep->qps[qp].qp->lcl_qp, sr_desc, &bad_wr);
@ -149,8 +155,9 @@ static int acquire_send_credit(mca_btl_openib_endpoint_t *endpoint,
(opal_list_item_t *)frag);
return OMPI_ERR_OUT_OF_RESOURCE;
}
} else {
if(OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, -1) < 0) {
} else if(BTL_OPENIB_QP_TYPE_SRQ(qp)) {
if(OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, -1) < 0)
{
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
OPAL_THREAD_LOCK(&openib_btl->ib_lock);
opal_list_append(&openib_btl->qps[qp].u.srq_qp.pending_frags[prio],
@ -158,6 +165,16 @@ static int acquire_send_credit(mca_btl_openib_endpoint_t *endpoint,
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
return OMPI_ERR_OUT_OF_RESOURCE;
}
} else { /* XRC QP */
if(OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.xrc_qp.sd_credits, -1) < 0)
{
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.xrc_qp.sd_credits, 1);
OPAL_THREAD_LOCK(&openib_btl->ib_lock);
opal_list_append(&openib_btl->qps[qp].u.xrc_qp.pending_frags[prio],
(opal_list_item_t *)frag);
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
return OMPI_ERR_OUT_OF_RESOURCE;
}
}
return OMPI_SUCCESS;
@ -241,9 +258,12 @@ int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint,
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits,
hdr->credits);
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
} else {
} else if(BTL_OPENIB_QP_TYPE_SRQ(qp)) {
mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
} else { /* XRC QP */
mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.xrc_qp.sd_credits, 1);
}
}
BTL_ERROR(("error posting send request error %d: %s\n",
@ -313,8 +333,27 @@ endpoint_init_qp_srq(mca_btl_openib_endpoint_qp_t *ep_qp, const int qp)
ep_qp->qp->sd_wqe = mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max;
}
static void endpoint_init_qp(mca_btl_openib_endpoint_qp_t *ep_qp, const int qp)
static void
endpoint_init_qp_xrc(mca_btl_openib_endpoint_qp_t *ep_qp, const int qp,
mca_btl_openib_qp_t *xrc_qp)
{
/* In XRC mode the QPs are used for sending only. We need just one
* send QP; all other QPs point to the first one */
if (0 == qp) {
ep_qp->qp = endpoint_alloc_qp();
/* number of available send WQEs */
ep_qp->qp->sd_wqe =
mca_btl_openib_component.qp_infos[qp].u.xrc_qp.sd_max;
} else {
ep_qp->qp = xrc_qp;
}
ep_qp->qp->users++;
}
static void endpoint_init_qp(mca_btl_base_endpoint_t *ep, const int qp)
{
mca_btl_openib_endpoint_qp_t *ep_qp = &ep->qps[qp];
ep_qp->rd_credit_send_lock = 0;
ep_qp->credit_frag = NULL;
@ -327,6 +366,9 @@ static void endpoint_init_qp(mca_btl_openib_endpoint_qp_t *ep_qp, const int qp)
case MCA_BTL_OPENIB_SRQ_QP:
endpoint_init_qp_srq(ep_qp, qp);
break;
case MCA_BTL_OPENIB_XRC_QP:
endpoint_init_qp_xrc(ep_qp, qp, ep->qps[0].qp);
break;
default:
BTL_ERROR(("Wrong QP type"));
break;
@ -339,15 +381,24 @@ static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
int qp;
/* setup qp structures */
if( mca_btl_openib_component.num_qps > 0 ) {
endpoint->qps = (mca_btl_openib_endpoint_qp_t*)
calloc(mca_btl_openib_component.num_qps,
sizeof(mca_btl_openib_endpoint_qp_t));
endpoint->qps = (mca_btl_openib_endpoint_qp_t*)
calloc(mca_btl_openib_component.num_qps,
sizeof(mca_btl_openib_endpoint_qp_t));
if (MCA_BTL_XRC_ENABLED) {
endpoint->rem_info.rem_qps = (mca_btl_openib_rem_qp_info_t*)
calloc(1, sizeof(mca_btl_openib_rem_qp_info_t));
endpoint->rem_info.rem_srqs = (mca_btl_openib_rem_srq_info_t*)
calloc(mca_btl_openib_component.num_xrc_qps,
sizeof(mca_btl_openib_rem_srq_info_t));
} else {
endpoint->rem_info.rem_qps = (mca_btl_openib_rem_qp_info_t*)
calloc(mca_btl_openib_component.num_qps,
sizeof(mca_btl_openib_rem_qp_info_t));
endpoint->rem_info.rem_srqs = NULL;
}
endpoint->ib_addr = NULL;
endpoint->xrc_recv_qp = NULL;
endpoint->endpoint_btl = 0;
endpoint->endpoint_proc = 0;
endpoint->endpoint_tstamp = 0.0;
@ -376,12 +427,8 @@ static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
endpoint->eager_rdma_remote.tokens = 0;
endpoint->eager_rdma_local.credits = 0;
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
endpoint_init_qp(&endpoint->qps[qp], qp);
/* setup rem_info */
endpoint->rem_info.rem_qps[qp].rem_qp_num = 0;
endpoint->rem_info.rem_qps[qp].rem_psn = 0;
}
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++)
endpoint_init_qp(endpoint, qp);
}
/*
@ -416,6 +463,12 @@ static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
/* Close opened QPs if we have them*/
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
if (BTL_OPENIB_QP_TYPE_XRC(qp) &&
endpoint != endpoint->ib_addr->ep_xrc_master) {
/* in the XRC case the QPs are released only on the master
* endpoint */
goto clean_endpoint;
}
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->qps[qp].pending_frags[0]);
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->qps[qp].pending_frags[1]);
OBJ_DESTRUCT(&endpoint->qps[qp].pending_frags[0]);
@ -441,6 +494,16 @@ static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
/* free the qps */
free(endpoint->qps);
clean_endpoint:
/* destroy recv qp */
if (NULL != endpoint->xrc_recv_qp && NULL != endpoint->xrc_recv_qp->qp) {
if(ibv_destroy_qp(endpoint->xrc_recv_qp->qp->lcl_qp)) {
BTL_ERROR(("Failed to destroy XRC recv QP:%d\n", qp));
}
free(endpoint->xrc_recv_qp->qp);
free(endpoint->xrc_recv_qp);
}
OBJ_DESTRUCT(&endpoint->endpoint_lock);
/* Clean pending lists */
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_lazy_frags);
@ -465,7 +528,9 @@ int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t *endpoint)
for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
if (BTL_OPENIB_QP_TYPE_SRQ(qp)) {
mca_btl_openib_post_srr(endpoint->endpoint_btl, 1, qp);
} else {
} else if(BTL_OPENIB_QP_TYPE_XRC(qp)) {
mca_btl_openib_post_xrr(endpoint->endpoint_btl, 1, qp);
} else { /* PP QP */
mca_btl_openib_endpoint_post_rr(endpoint, qp);
}
}
@ -479,8 +544,25 @@ int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t *endpoint)
*/
void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
{
opal_list_item_t *frag_item;
opal_list_item_t *frag_item, *ep_item;
mca_btl_openib_send_frag_t *frag;
mca_btl_openib_endpoint_t *ep;
bool master = false;
if (MCA_BTL_XRC_ENABLED) {
OPAL_THREAD_LOCK(&endpoint->ib_addr->addr_lock);
if (MCA_BTL_IB_ADDR_CONNECTED == endpoint->ib_addr->status) {
/* We are not xrc master */
/* set our qp pointer to master qp */
endpoint->qps = endpoint->ib_addr->ep_xrc_master->qps;
master = false;
} else {
/* I'm master of XRC */
endpoint->ib_addr->status = MCA_BTL_IB_ADDR_CONNECTED;
endpoint->ib_addr->ep_xrc_master = endpoint;
master = true;
}
}
endpoint->endpoint_state = MCA_BTL_IB_CONNECTED;
@ -502,6 +584,17 @@ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
* state then we restart them here */
mca_btl_openib_frag_progress_pending_put_get(endpoint,
mca_btl_openib_component.rdma_qp);
if(MCA_BTL_XRC_ENABLED) {
while(master && !opal_list_is_empty(&endpoint->ib_addr->pending_ep)) {
ep_item = opal_list_remove_first(&endpoint->ib_addr->pending_ep);
ep = (mca_btl_openib_endpoint_t *)ep_item;
if (OMPI_SUCCESS != ompi_btl_openib_connect.bcf_start_connect(ep)) {
BTL_ERROR(("Failed to connect pending endpoint\n"));
}
}
OPAL_THREAD_UNLOCK(&endpoint->ib_addr->addr_lock);
}
}
/*


@ -13,6 +13,7 @@
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -74,6 +75,11 @@ struct mca_btl_openib_rem_qp_info_t {
/* Remote processes port sequence number */
}; typedef struct mca_btl_openib_rem_qp_info_t mca_btl_openib_rem_qp_info_t;
struct mca_btl_openib_rem_srq_info_t {
/* Remote SRQ number */
uint32_t rem_srq_num;
}; typedef struct mca_btl_openib_rem_srq_info_t mca_btl_openib_rem_srq_info_t;
struct mca_btl_openib_rem_info_t {
uint16_t rem_lid;
/* Local identifier of the remote process */
@ -84,6 +90,8 @@ struct mca_btl_openib_rem_info_t {
uint32_t rem_index;
/* index of remote endpoint in endpoint array */
mca_btl_openib_rem_qp_info_t *rem_qps;
/* remote xrc_srq info , used only with xrc connections */
mca_btl_openib_rem_srq_info_t *rem_srqs;
}; typedef struct mca_btl_openib_rem_info_t mca_btl_openib_rem_info_t;
@ -112,6 +120,8 @@ struct mca_btl_openib_endpoint_srq_qp_t {
int32_t dummy;
}; typedef struct mca_btl_openib_endpoint_srq_qp_t mca_btl_openib_endpoint_srq_qp_t;
typedef struct mca_btl_openib_endpoint_srq_qp_t mca_btl_openib_endpoint_xrc_qp_t;
typedef struct mca_btl_openib_qp_t {
struct ibv_qp *lcl_qp;
uint32_t lcl_psn;
@ -130,6 +140,7 @@ typedef struct mca_btl_openib_endpoint_qp_t {
mca_btl_openib_send_control_frag_t *credit_frag;
union {
mca_btl_openib_endpoint_srq_qp_t srq_qp;
mca_btl_openib_endpoint_xrc_qp_t xrc_qp;
mca_btl_openib_endpoint_pp_qp_t pp_qp;
} u;
} mca_btl_openib_endpoint_qp_t;
@ -168,6 +179,7 @@ struct mca_btl_base_endpoint_t {
*/
mca_btl_openib_endpoint_qp_t *qps;
mca_btl_openib_endpoint_qp_t *xrc_recv_qp; /* in xrc we will use it as recv qp */
opal_list_t pending_get_frags; /**< list of pending rget ops */
opal_list_t pending_put_frags; /**< list of pending rput ops */
@ -182,6 +194,10 @@ struct mca_btl_base_endpoint_t {
uint64_t subnet_id; /**< subnet id of this endpoint*/
uint16_t lid; /**< used only for xrc. caching remote lid number.
Pasha: do we need to cache it here ?!!! */
struct ib_address_t *ib_addr; /**< used only for xrc; pointer to struct
that keeps remote port info */
int32_t eager_recv_count; /**< number of eager received */
mca_btl_openib_eager_rdma_remote_t eager_rdma_remote;


@ -479,11 +479,11 @@ static int mca_btl_openib_mca_setup_qps(void)
/* All the multi-qp stuff.. */
char *str;
char **queues, **params = NULL;
int num_pp_qps = 0, num_srq_qps = 0, qp = 0, ret = OMPI_ERROR;
int num_xrc_qps = 0, num_pp_qps = 0, num_srq_qps = 0, qp = 0;
char *default_qps = "P,128,256,128,16:S,1024,256,128,32:S,4096,256,128,32:S,65536,256,128,32";
uint32_t max_qp_size, max_size_needed;
int32_t min_freelist_size = 0;
int smallest_pp_qp = 0;
int smallest_pp_qp = 0, ret = OMPI_ERROR;
reg_string("receive_queues",
"Colon-delimited, coma delimited list of receive queues: P,4096,8,6,4:P,32768,8,6,4",
@ -504,6 +504,14 @@ static int mca_btl_openib_mca_setup_qps(void)
smallest_pp_qp = qp;
} else if (0 == strncmp("S,", queues[qp], 2)) {
num_srq_qps++;
} else if (0 == strncmp("X,", queues[qp], 2)) {
#if HAVE_XRC
num_xrc_qps++;
#else
opal_show_help("help-mpi-btl-openib.txt", "No XRC support", true,
orte_system_info.nodename, str);
goto error;
#endif
} else {
opal_show_help("help-mpi-btl-openib.txt",
"invalid qp type in receive_queues", true,
@ -512,9 +520,23 @@ static int mca_btl_openib_mca_setup_qps(void)
}
qp++;
}
/* The current XRC implementation can't be used with other QP types - PP and SRQ */
if (num_xrc_qps > 0 && (num_pp_qps > 0 || num_srq_qps > 0)) {
opal_show_help("help-mpi-btl-openib.txt", "XRC with PP or SRQ", true,
orte_system_info.nodename, str);
goto error;
}
/* The current XRC implementation can't be used with btls_per_lid > 1 */
if (num_xrc_qps > 0 && mca_btl_openib_component.btls_per_lid > 1) {
opal_show_help("help-mpi-btl-openib.txt", "XRC with BTLs per LID", true,
orte_system_info.nodename, str, num_xrc_qps);
goto error;
}
mca_btl_openib_component.num_pp_qps = num_pp_qps;
mca_btl_openib_component.num_srq_qps = num_srq_qps;
mca_btl_openib_component.num_qps = num_pp_qps + num_srq_qps;
mca_btl_openib_component.num_xrc_qps = num_xrc_qps;
mca_btl_openib_component.num_qps = num_pp_qps + num_srq_qps + num_xrc_qps;
mca_btl_openib_component.qp_infos = (mca_btl_openib_qp_info_t*)
malloc(sizeof(mca_btl_openib_qp_info_t) *
@ -591,6 +613,25 @@ static int mca_btl_openib_mca_setup_qps(void)
min_freelist_size =
mca_btl_openib_component.qp_infos[qp].rd_num;
}
} else if(params[0][0] =='X') {
if(count < 3 || count > 5) {
opal_show_help("help-mpi-btl-openib.txt",
"invalid xrc specification", true,
orte_system_info.nodename, queues[qp]);
goto error;
}
mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0);
mca_btl_openib_component.qp_infos[qp].rd_num = atoi_param(P(2), 16);
tmp = mca_btl_openib_component.qp_infos[qp].rd_num >> 1;
mca_btl_openib_component.qp_infos[qp].rd_low = atoi_param(P(3), tmp);
tmp = mca_btl_openib_component.qp_infos[qp].rd_low >> 2;
mca_btl_openib_component.qp_infos[qp].u.xrc_qp.sd_max =
atoi_param(P(4), tmp);
BTL_VERBOSE(("xrc: rd_num is %d\trd_low is %d\tsd_max is %d\n",
mca_btl_openib_component.qp_infos[qp].rd_num,
mca_btl_openib_component.qp_infos[qp].rd_low,
mca_btl_openib_component.qp_infos[qp].u.xrc_qp.sd_max));
mca_btl_openib_component.qp_infos[qp].type = MCA_BTL_OPENIB_XRC_QP;
}
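/* Illustrative example (hypothetical values): a receive_queues entry of
 * "X,65536,256,128,32" gives size = 65536, rd_num = 256, rd_low = 128 and
 * sd_max = 32; if the optional fields are omitted, rd_num defaults to 16,
 * rd_low to rd_num/2 and sd_max to rd_low/4, as computed above. */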
if (mca_btl_openib_component.qp_infos[qp].rd_num <=
@ -647,7 +688,10 @@ static int mca_btl_openib_mca_setup_qps(void)
ompi_btl_openib_connect_base_open();
ret = MPI_SUCCESS;
if ( OMPI_SUCCESS != ompi_btl_openib_connect_base_open())
goto error;
ret = OMPI_SUCCESS;
error:
if(params) {
qp = 0;


@ -121,6 +121,8 @@ mca_btl_openib_proc_t* mca_btl_openib_proc_create(ompi_proc_t* ompi_proc)
/* Initialize number of peer */
module_proc->proc_endpoint_count = 0;
module_proc->proc_ompi = ompi_proc;
/* Initialize next port to use; used only for XRC */
module_proc->port_touse = 0;
/* build a unique identifier (of arbitrary
* size) to represent the proc */


@ -50,6 +50,10 @@ struct mca_btl_openib_proc_t {
size_t proc_port_count;
/**< number of ports published by endpoint */
size_t port_touse;
/**< the index of port that will be used for new endpoint; used only for
* xrc qp */
struct mca_btl_base_endpoint_t **proc_endpoints;
/**< array of endpoints that have been created to access this proc */

ompi/mca/btl/openib/btl_openib_xrc.c (new file, 212 lines)

@ -0,0 +1,212 @@
/*
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <infiniband/verbs.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <unistd.h>
#include "opal/util/output.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/btl/base/base.h"
#include "btl_openib_xrc.h"
#include "btl_openib.h"
#if HAVE_XRC
#define SIZE_OF2(A,B) (sizeof(A) + sizeof(B))
static void ib_address_constructor(ib_address_t *ib_addr);
static void ib_address_destructor(ib_address_t *ib_addr);
OBJ_CLASS_INSTANCE(ib_address_t,
opal_list_item_t,
ib_address_constructor,
ib_address_destructor);
/* This func. opens XRC domain */
int mca_btl_openib_open_xrc_domain(struct mca_btl_openib_hca_t *hca)
{
int len;
char *xrc_file_name;
len = asprintf(&xrc_file_name, "%s"OPAL_PATH_SEP"openib_xrc_domain",
orte_process_info.job_session_dir);
if (0 > len) {
BTL_ERROR(("Failed to allocate memomry for XRC file name\n",
strerror(errno)));
return OMPI_ERROR;
}
hca->xrc_fd = open(xrc_file_name, O_CREAT, S_IRUSR|S_IWUSR);
if (0 > hca->xrc_fd) {
BTL_ERROR(("Failed to open XRC domain file %s, errno says %s\n",
xrc_file_name, strerror(errno)));
free(xrc_file_name);
return OMPI_ERROR;
}
hca->xrc_domain = ibv_open_xrc_domain(hca->ib_dev_context, hca->xrc_fd, O_CREAT);
if (NULL == hca->xrc_domain) {
BTL_ERROR(("Failed to open XRC domain\n"));
close(hca->xrc_fd);
free(xrc_file_name);
return OMPI_ERROR;
}
return OMPI_SUCCESS;
}
/* This func. closes XRC domain */
int mca_btl_openib_close_xrc_domain(struct mca_btl_openib_hca_t *hca)
{
if (ibv_close_xrc_domain(hca->xrc_domain)) {
BTL_ERROR(("Failed to close XRC domain, errno says %s\n",
hca->xrc_fd, strerror(errno)));
return OMPI_ERROR;
}
/* do we need to check exit status */
if (close(hca->xrc_fd)) {
BTL_ERROR(("Failed to close XRC file descriptor %s, errno says %s\n",
hca->xrc_fd, strerror(errno)));
return OMPI_ERROR;
}
return OMPI_SUCCESS;
}
static void ib_address_constructor(ib_address_t *ib_addr)
{
ib_addr->key = NULL;
ib_addr->subnet_id = 0;
ib_addr->lid = 0;
ib_addr->status = MCA_BTL_IB_ADDR_CLOSED;
ib_addr->ep_xrc_master = NULL;
OBJ_CONSTRUCT(&ib_addr->addr_lock, opal_mutex_t);
OBJ_CONSTRUCT(&ib_addr->pending_ep, opal_list_t);
}
static void ib_address_destructor(ib_address_t *ib_addr)
{
if (NULL != ib_addr->key) {
free(ib_addr->key);
}
OBJ_DESTRUCT(&ib_addr->addr_lock);
OBJ_DESTRUCT(&ib_addr->pending_ep);
}
static int ib_address_init(ib_address_t *ib_addr, uint64_t s_id, uint16_t lid)
{
ib_addr->key = malloc(SIZE_OF2(s_id,lid));
if (NULL == ib_addr->key) {
BTL_ERROR(("Failed to allocate memory for key\n"));
return OMPI_ERROR;
}
memset(ib_addr->key, 0, SIZE_OF2(s_id,lid));
/* creating the key */
memcpy(ib_addr->key, &lid, sizeof(lid));
memcpy((void*)((char*)ib_addr->key + sizeof(lid)), &s_id, sizeof(s_id));
/* caching lid and subnet id */
ib_addr->subnet_id = s_id;
ib_addr->lid = lid;
return OMPI_SUCCESS;
}
/* Create a new entry in the hash table for subnet_id and lid and
* update the endpoint pointer.
* Before calling this function you need to protect with
*/
int mca_btl_openib_ib_address_add_new (uint64_t s_id, uint16_t lid, mca_btl_openib_endpoint_t *ep)
{
void *tmp;
int ret = OMPI_SUCCESS;
struct ib_address_t *ib_addr = OBJ_NEW(ib_address_t);
ret = ib_address_init(ib_addr, s_id, lid);
if (OMPI_SUCCESS != ret ) {
BTL_ERROR(("XRC Internal error. Failed to init ib_addr\n"));
OBJ_DESTRUCT(ib_addr);
return ret;
}
/* is it already in the table ?*/
OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock);
if (OPAL_SUCCESS != opal_hash_table_get_value_ptr(&mca_btl_openib_component.ib_addr_table,
ib_addr->key,
SIZE_OF2(s_id,lid), &tmp)) {
/* It is a new one, let's put it in the table */
ret = opal_hash_table_set_value_ptr(&mca_btl_openib_component.ib_addr_table,
ib_addr->key, SIZE_OF2(s_id,lid), (void*)ib_addr);
if (OPAL_SUCCESS != ret) {
BTL_ERROR(("XRC Internal error."
" Failed to add element to mca_btl_openib_component.ib_addr_table\n"));
OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);
OBJ_DESTRUCT(ib_addr);
return ret;
}
/* opal_list_append(&mca_btl_openib_component.ib_addr_list,(opal_list_item_t*)ib_addr); */
/* update the endpoint with pointer to ib address */
ep->ib_addr = ib_addr;
} else {
/* so we have this one in the table, just add the pointer to the endpoint */
ep->ib_addr = (ib_address_t *)tmp;
assert(lid == ep->ib_addr->lid && s_id == ep->ib_addr->subnet_id);
OBJ_DESTRUCT(ib_addr);
}
OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);
return ret;
}
/* This one is not really used, but it is needed for debugging */
int mca_btl_openib_ib_address_status(uint64_t s_id, uint16_t lid)
{
void *key,*tmp;
int status;
struct ib_address_t *ib_addr;
/* build the key */
key = malloc(SIZE_OF2(s_id,lid));
if (NULL == key) {
BTL_ERROR(("Failed to allocate memory for temporary key\n"));
return OMPI_ERROR;
}
memcpy(key, &lid, sizeof(lid));
memcpy((void*)((char*)key + sizeof(lid)), &s_id, sizeof(s_id));
/* let's get the status */
OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock);
status = opal_hash_table_get_value_ptr(&mca_btl_openib_component.ib_addr_table,
key,
SIZE_OF2(s_id,lid), &tmp);
if (OPAL_SUCCESS == status) {
/* check the status of this key */
ib_addr = (struct ib_address_t*)tmp;
/* debug stuff */
if (ib_addr->subnet_id != s_id || ib_addr->lid != lid) {
BTL_ERROR(("XRC Internal error. Was searching for %d, %d but found %d, %d",
s_id, lid, ib_addr->subnet_id, ib_addr->lid));
}
status = ib_addr->status;
OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);
free(key);
return status;
}
/* All lids and subnets should be in the list.
* If we got here, something went wrong */
OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);
BTL_ERROR(("XRC Internal error. Failed to locate element with subnet %d and lid %d\n",
s_id, lid));
free(key);
return OMPI_ERROR;
}
#endif

ompi/mca/btl/openib/btl_openib_xrc.h (new file, 46 lines)

@ -0,0 +1,46 @@
/*
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* @file
*/
#ifndef MCA_BTL_OPENIB_XRC_H
#define MCA_BTL_OPENIB_XRC_H
#include "btl_openib.h"
#include "btl_openib_endpoint.h"
#if HAVE_XRC
#define MCA_BTL_XRC_ENABLED (mca_btl_openib_component.num_xrc_qps)
#else
#define MCA_BTL_XRC_ENABLED 0
#endif
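/* Note: MCA_BTL_XRC_ENABLED is non-zero only when the build provides XRC
 * support (HAVE_XRC) and the user requested at least one X queue via the
 * receive_queues parameter (num_xrc_qps > 0). */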
typedef enum {
MCA_BTL_IB_ADDR_CONNECTING = 100,
MCA_BTL_IB_ADDR_CONNECTED,
MCA_BTL_IB_ADDR_CLOSED
} mca_btl_openib_ib_addr_state_t;
struct ib_address_t {
opal_list_item_t super;
void *key; /* the key with size 80bit - [subnet(64) LID(16bit)] */
uint64_t subnet_id; /* caching subnet_id */
uint16_t lid; /* caching lid */
opal_list_t pending_ep; /* list of endpoints that use this ib_address */
mca_btl_openib_endpoint_t *ep_xrc_master; /* pointer to endpoint that keeps the xrc connection */
opal_mutex_t addr_lock; /* protection */
mca_btl_openib_ib_addr_state_t status; /* ib port status */
};
typedef struct ib_address_t ib_address_t;
int mca_btl_openib_open_xrc_domain(struct mca_btl_openib_hca_t *hca);
int mca_btl_openib_close_xrc_domain(struct mca_btl_openib_hca_t *hca);
int mca_btl_openib_ib_address_add_new (uint64_t s_id, uint16_t lid, mca_btl_openib_endpoint_t *ep);
int mca_btl_openib_ib_address_status(uint64_t s_id, uint16_t lid);
#endif


@ -1,5 +1,6 @@
/*
* Copyright (c) 2007 Cisco, Inc. All rights reserved.
* Copyright (c) 2007 Mellanox Technologies, Inc. All rights reserved.
*
* $COPYRIGHT$
*
@ -13,9 +14,11 @@
#include "btl_openib.h"
#include "connect/base.h"
#include "connect/btl_openib_connect_oob.h"
#include "connect/btl_openib_connect_xoob.h"
#include "connect/btl_openib_connect_rdma_cm.h"
#include "opal/util/argv.h"
#include "opal/util/show_help.h"
/*
* Global variable with the selected function pointers in it
@ -30,6 +33,7 @@ ompi_btl_openib_connect_base_funcs_t ompi_btl_openib_connect = {
*/
static ompi_btl_openib_connect_base_funcs_t *all[] = {
&ompi_btl_openib_connect_oob,
&ompi_btl_openib_connect_xoob,
&ompi_btl_openib_connect_rdma_cm,
NULL
};
@ -63,6 +67,26 @@ int ompi_btl_openib_connect_base_open(void)
b, false, false,
"oob", &param);
/* For XRC QPs we must use the XOOB connection manager */
if (mca_btl_openib_component.num_xrc_qps > 0 && 0 == strcmp("oob", param)) {
opal_show_help("help-mpi-btl-openib.txt",
"XRC with OOB", true,
orte_system_info.nodename,
mca_btl_openib_component.num_xrc_qps);
return OMPI_ERROR;
}
/* XOOB connection manager may be used only with XRC qps */
if ((mca_btl_openib_component.num_srq_qps > 0 || mca_btl_openib_component.num_pp_qps > 0)
&& 0 == strcmp("xoob", param)) {
opal_show_help("help-mpi-btl-openib.txt",
"SRQ or PP with XOOB", true,
orte_system_info.nodename,
mca_btl_openib_component.num_srq_qps,
mca_btl_openib_component.num_pp_qps);
return OMPI_ERROR;
}
/* Call the open function on all the connect modules */
for (i = 0; NULL != all[i]; ++i) {
if (NULL != all[i]->bcf_open) {


@ -0,0 +1,872 @@
/*
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "orte/mca/ns/base/base.h"
#include "orte/mca/oob/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/dss/dss.h"
#include "btl_openib.h"
#include "btl_openib_endpoint.h"
#include "btl_openib_proc.h"
#include "btl_openib_xrc.h"
#include "connect/connect.h"
static int xoob_init(void);
static int xoob_start_connect(mca_btl_base_endpoint_t *e);
static int xoob_finalize(void);
/*
* The "module" struct -- the top-level function pointers for the xoob
* connection scheme.
*/
ompi_btl_openib_connect_base_funcs_t ompi_btl_openib_connect_xoob = {
"xoob",
/* No need for "open" */
NULL,
/* Init */
xoob_init,
/* Connect */
xoob_start_connect,
/* Finalize */
xoob_finalize,
};
#if HAVE_XRC
typedef enum {
SEND,
RECV
} xoob_qp_type;
typedef enum {
ENDPOINT_XOOB_CONNECT_REQUEST,
ENDPOINT_XOOB_CONNECT_RESPONSE,
ENDPOINT_XOOB_CONNECT_XRC_REQUEST,
ENDPOINT_XOOB_CONNECT_XRC_RESPONSE
} connect_message_type_t;
#define XOOB_TAG (ORTE_RML_TAG_DYNAMIC - 1)
#define XOOB_SET_REMOTE_INFO(EP, INFO) \
do { \
/* copy the rem_info stuff */ \
EP.rem_lid = INFO.rem_lid; \
EP.rem_subnet_id = INFO.rem_subnet_id; \
EP.rem_mtu = INFO.rem_mtu; \
EP.rem_index = INFO.rem_index; \
memcpy((void*)EP.rem_qps, (void*)INFO.rem_qps, \
sizeof(mca_btl_openib_rem_qp_info_t)); \
/* copy the rem_info stuff */ \
memcpy((void*)EP.rem_srqs, (void*)INFO.rem_srqs, \
sizeof(mca_btl_openib_rem_srq_info_t) * \
mca_btl_openib_component.num_xrc_qps); \
} while (0)
/* remote data processing */
static mca_btl_openib_endpoint_t* xoob_find_endpoint(orte_process_name_t* process_name,
uint64_t subnet_id, uint16_t lid, uint8_t message_type);
/* send/recv connection data */
static int xoob_reply_first_connect(mca_btl_openib_endpoint_t *endpoint,
mca_btl_openib_rem_info_t *rem_info);
static int xoob_send_connect_data(mca_btl_base_endpoint_t* endpoint,
uint8_t message_type);
static int xoob_receive_connect_data(mca_btl_openib_rem_info_t *info, uint16_t *lid,
uint8_t *message_type, orte_buffer_t* buffer);
/* functions that take care of qp creation */
static int xoob_qp_create(mca_btl_base_endpoint_t* endpoint, xoob_qp_type type);
/* static int xoob_qp_connect(mca_btl_openib_endpoint_t *endpoint, xoob_qp_type type); */
static int xoob_qp_connect(mca_btl_openib_endpoint_t *endpoint, xoob_qp_type type, mca_btl_openib_rem_info_t *rem_info);
static void xoob_rml_send_cb(int status, orte_process_name_t* endpoint,
orte_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata);
static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
orte_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata);
static int init_rem_info(mca_btl_openib_rem_info_t *rem_info);
static void free_rem_info(mca_btl_openib_rem_info_t *rem_info);
/*
* Init function. Post non-blocking RML receive to accept incoming
* connection requests.
*/
static int xoob_init(void)
{
int rc;
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
XOOB_TAG,
ORTE_RML_PERSISTENT,
xoob_rml_recv_cb,
NULL);
return (ORTE_SUCCESS == rc) ? OMPI_SUCCESS : rc;
}
/*
* Connect function. Start initiation of connections to a remote
* peer. We send our Queue Pair information over the RML/OOB
* communication mechanism. On completion of our send, a send
* completion handler is called.
*/
static int xoob_start_connect(mca_btl_base_endpoint_t *endpoint)
{
int rc = OMPI_SUCCESS;
OPAL_THREAD_LOCK(&endpoint->ib_addr->addr_lock);
switch (endpoint->ib_addr->status) {
case MCA_BTL_IB_ADDR_CLOSED:
BTL_VERBOSE(("XOOB. The IB addr: sid %d lid %d"
"in MCA_BTL_IB_ADDR_CLOSED status,"
" sending ENDPOINT_XOOB_CONNECT_REQUEST\n",
endpoint->ib_addr->subnet_id,endpoint->ib_addr->lid));
if (OMPI_SUCCESS != (rc = xoob_qp_create(endpoint, SEND))) {
break;
}
/* Send connection info over to remote endpoint */
endpoint->endpoint_state = MCA_BTL_IB_CONNECTING;
endpoint->ib_addr->status = MCA_BTL_IB_ADDR_CONNECTING;
if (OMPI_SUCCESS !=
(rc = xoob_send_connect_data(endpoint, ENDPOINT_XOOB_CONNECT_REQUEST))) {
BTL_ERROR(("error sending connect request, error code %d", rc));
}
break;
case MCA_BTL_IB_ADDR_CONNECTING:
BTL_VERBOSE(("XOOB. The IB addr: sid %d lid %d"
"in MCA_BTL_IB_ADDR_CONNECTING status,"
" Subscribing to this address\n",
endpoint->ib_addr->subnet_id,endpoint->ib_addr->lid));
/* somebody is already connecting to this machine, let's wait */
opal_list_append(&endpoint->ib_addr->pending_ep, (opal_list_item_t*)endpoint);
endpoint->endpoint_state = MCA_BTL_IB_CONNECTING;
break;
case MCA_BTL_IB_ADDR_CONNECTED:
/* so we have the send qp, we just need the receive side.
* Send request for SRQ numbers */
BTL_VERBOSE(("XOOB. The IB addr: sid %d lid %d"
"in MCA_BTL_IB_ADDR_CONNECTED status,"
" sending ENDPOINT_XOOB_CONNECT_XRC_REQUEST\n",
endpoint->ib_addr->subnet_id,endpoint->ib_addr->lid));
endpoint->endpoint_state = MCA_BTL_IB_CONNECTING;
if (OMPI_SUCCESS !=
(rc = xoob_send_connect_data(endpoint, ENDPOINT_XOOB_CONNECT_XRC_REQUEST))) {
BTL_ERROR(("error sending xrc connect request, error code %d", rc));
}
break;
default :
BTL_ERROR(("XOOB: Invalid endpoint status %d", endpoint->ib_addr->status));
}
OPAL_THREAD_UNLOCK(&endpoint->ib_addr->addr_lock);
return rc;
}
/*
* Finalize function. Cleanup RML non-blocking receive.
*/
static int xoob_finalize(void)
{
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, XOOB_TAG);
return OMPI_SUCCESS;
}
/*
* Reply to a `start - connect' message
*/
static int xoob_reply_first_connect(mca_btl_openib_endpoint_t *endpoint,
mca_btl_openib_rem_info_t *rem_info)
{
int rc;
BTL_VERBOSE(("Initialized QPs, LID = %d",
((mca_btl_openib_module_t*)endpoint->endpoint_btl)->lid));
/* Create local QP's and post receive resources */
if (OMPI_SUCCESS != (rc = xoob_qp_create(endpoint, RECV))) {
return rc;
}
/* Connect to remote endpoint qp's */
if (OMPI_SUCCESS != (rc = xoob_qp_connect(endpoint, RECV, rem_info))) {
return rc;
}
if (OMPI_SUCCESS !=
(rc = xoob_send_connect_data(endpoint, ENDPOINT_XOOB_CONNECT_RESPONSE))) {
BTL_ERROR(("error in endpoint send connect request error code is %d",
rc));
return rc;
}
return OMPI_SUCCESS;
}
/*
* Create the local side of all the qp's. The remote sides will be
* connected later.
*/
static int xoob_qp_create(mca_btl_base_endpoint_t* endpoint, xoob_qp_type type)
{
int prio = BTL_OPENIB_LP_CQ; /* pasha - on which CQ do we want to put the send completion ?! */
mca_btl_openib_endpoint_qp_t * ep_qp;
mca_btl_openib_module_t *openib_btl =
(mca_btl_openib_module_t*)endpoint->endpoint_btl;
/* Prepare QP structs */
if (SEND == type) {
BTL_VERBOSE(("XOOB. Creating Send QP\n"));
ep_qp = endpoint->qps;
} else {
BTL_VERBOSE(("XOOB. Creating Recv QP\n"));
assert(NULL == endpoint->xrc_recv_qp);
endpoint->xrc_recv_qp =
(mca_btl_openib_endpoint_qp_t*)
malloc(sizeof(mca_btl_openib_endpoint_qp_t));
if (NULL == endpoint->xrc_recv_qp) {
BTL_ERROR(("XOOB. Failed to allocate memory for QP\n"));
return OMPI_ERROR;
}
endpoint->xrc_recv_qp->qp = (struct mca_btl_openib_qp_t*)
calloc(1, sizeof(struct mca_btl_openib_qp_t));
if (NULL == endpoint->xrc_recv_qp->qp) {
BTL_ERROR(("XOOB. Failed to allocate memory for QP data\n"));
return OMPI_ERROR;
}
ep_qp = endpoint->xrc_recv_qp;
}
/* Create the Queue Pair */
{
struct ibv_qp* my_qp;
struct ibv_qp_init_attr qp_init_attr;
struct ibv_qp_attr attr;
memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr));
memset(&attr, 0, sizeof(struct ibv_qp_attr));
qp_init_attr.send_cq =
qp_init_attr.recv_cq = openib_btl->hca->ib_cq[prio];
qp_init_attr.cap.max_recv_wr =
mca_btl_openib_component.qp_infos->rd_num;
/* reserve additional wr for eager rdma credit management */
qp_init_attr.cap.max_send_wr =
mca_btl_openib_component.qp_infos->u.xrc_qp.sd_max +
(mca_btl_openib_component.use_eager_rdma ?
mca_btl_openib_component.max_eager_rdma : 0);
qp_init_attr.cap.max_send_sge = mca_btl_openib_component.ib_sg_list_size;
/* this one is ignored by driver */
qp_init_attr.cap.max_recv_sge = mca_btl_openib_component.ib_sg_list_size;
qp_init_attr.qp_type = IBV_QPT_XRC;
qp_init_attr.xrc_domain = openib_btl->hca->xrc_domain;
my_qp = ibv_create_qp(openib_btl->hca->ib_pd, &qp_init_attr);
if (NULL == my_qp) {
BTL_ERROR(("error creating qp errno says %s", strerror(errno)));
return OMPI_ERROR;
}
ep_qp->qp->lcl_qp = my_qp;
openib_btl->ib_inline_max = qp_init_attr.cap.max_inline_data;
attr.qp_state = IBV_QPS_INIT;
attr.pkey_index = openib_btl->pkey_index;
attr.port_num = openib_btl->port_num;
attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
if (ibv_modify_qp(ep_qp->qp->lcl_qp,
&attr,
IBV_QP_STATE |
IBV_QP_PKEY_INDEX |
IBV_QP_PORT |
IBV_QP_ACCESS_FLAGS )) {
BTL_ERROR(("error modifying qp to INIT errno says %s", strerror(errno)));
return OMPI_ERROR;
}
}
/* Setup meta data on the endpoint */
ep_qp->qp->lcl_psn = lrand48() & 0xffffff;
ep_qp->credit_frag = NULL;
openib_btl->hca->cq_users[prio]++;
/* Now that all the qp's are created locally, post some receive
buffers, setup credits, etc. */
return mca_btl_openib_endpoint_post_recvs(endpoint);
}
/*
* Connect the local ends of qp to the remote side
*/
static int xoob_qp_connect(mca_btl_openib_endpoint_t *endpoint, xoob_qp_type type, mca_btl_openib_rem_info_t *rem_info)
{
struct ibv_qp* qp;
struct ibv_qp_attr attr;
mca_btl_openib_endpoint_qp_t * ep_qp; /* endpoint qp */
mca_btl_openib_module_t* openib_btl =
(mca_btl_openib_module_t*)endpoint->endpoint_btl;
if (SEND == type) {
BTL_VERBOSE(("XOOB. Connecting Send QP\n"));
assert(NULL != endpoint->qps);
ep_qp = endpoint->qps;
} else {
BTL_VERBOSE(("XOOB. Connecting Recv QP\n"));
assert(NULL != endpoint->xrc_recv_qp);
ep_qp = endpoint->xrc_recv_qp;
}
qp = ep_qp->qp->lcl_qp;
memset(&attr, 0, sizeof(attr));
attr.qp_state = IBV_QPS_RTR;
attr.path_mtu = (openib_btl->hca->mtu < rem_info->rem_mtu) ?
openib_btl->hca->mtu : rem_info->rem_mtu;
attr.dest_qp_num = rem_info->rem_qps->rem_qp_num;
attr.rq_psn = rem_info->rem_qps->rem_psn;
attr.max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops;
attr.min_rnr_timer = mca_btl_openib_component.ib_min_rnr_timer;
attr.ah_attr.is_global = 0;
attr.ah_attr.dlid = rem_info->rem_lid;
attr.ah_attr.sl = mca_btl_openib_component.ib_service_level;
attr.ah_attr.src_path_bits = openib_btl->src_path_bits;
attr.ah_attr.port_num = openib_btl->port_num;
attr.ah_attr.static_rate = 0;
if (mca_btl_openib_component.verbose) {
BTL_VERBOSE(("Set MTU to IBV value %d (%s bytes)", attr.path_mtu,
(attr.path_mtu == IBV_MTU_256) ? "256" :
(attr.path_mtu == IBV_MTU_512) ? "512" :
(attr.path_mtu == IBV_MTU_1024) ? "1024" :
(attr.path_mtu == IBV_MTU_2048) ? "2048" :
(attr.path_mtu == IBV_MTU_4096) ? "4096" :
"unknown (!)"));
}
if (ibv_modify_qp(qp, &attr,
IBV_QP_STATE |
IBV_QP_AV |
IBV_QP_PATH_MTU |
IBV_QP_DEST_QPN |
IBV_QP_RQ_PSN |
IBV_QP_MAX_DEST_RD_ATOMIC |
IBV_QP_MIN_RNR_TIMER)) {
BTL_ERROR(("error modifing QP to RTR errno says %s",
strerror(errno)));
return OMPI_ERROR;
}
attr.qp_state = IBV_QPS_RTS;
attr.timeout = mca_btl_openib_component.ib_timeout;
attr.retry_cnt = mca_btl_openib_component.ib_retry_count;
attr.rnr_retry = mca_btl_openib_component.ib_rnr_retry;
attr.sq_psn = ep_qp->qp->lcl_psn;
attr.max_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops;
if (ibv_modify_qp(qp, &attr,
IBV_QP_STATE |
IBV_QP_TIMEOUT |
IBV_QP_RETRY_CNT |
IBV_QP_RNR_RETRY |
IBV_QP_SQ_PSN |
IBV_QP_MAX_QP_RD_ATOMIC)) {
BTL_ERROR(("error modifying QP to RTS errno says %s",
strerror(errno)));
return OMPI_ERROR;
}
return OMPI_SUCCESS;
}
/* Receive connect information from the remote endpoint */
static int xoob_receive_connect_data(mca_btl_openib_rem_info_t *info, uint16_t *lid,
uint8_t *message_type, orte_buffer_t* buffer)
{
int cnt = 1, rc, srq;
/* Recv standard header */
BTL_VERBOSE(("unpacking %d of %d\n", cnt, ORTE_UINT8));
rc = orte_dss.unpack(buffer, message_type, &cnt, ORTE_UINT8);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return OMPI_ERROR;
}
BTL_VERBOSE(("XOOB Recv unpack Message type = %d", *message_type));
BTL_VERBOSE(("unpacking %d of %d\n", cnt, ORTE_UINT64));
rc = orte_dss.unpack(buffer, &info->rem_subnet_id, &cnt, ORTE_UINT64);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return OMPI_ERROR;
}
BTL_VERBOSE(("XOOB Recv unpack sid = %d", info->rem_subnet_id));
BTL_VERBOSE(("unpacking %d of %d\n", cnt, ORTE_UINT16));
rc = orte_dss.unpack(buffer, &info->rem_lid, &cnt, ORTE_UINT16);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return OMPI_ERROR;
}
BTL_VERBOSE(("XOOB Recv unpack lid = %d", info->rem_lid));
/* So far we have received the standard header; now we continue to receive data for
* different packet types
*/
if (ENDPOINT_XOOB_CONNECT_REQUEST == *message_type ||
ENDPOINT_XOOB_CONNECT_RESPONSE == *message_type) {
BTL_VERBOSE(("unpacking %d of %d\n", cnt, ORTE_UINT32));
rc = orte_dss.unpack(buffer, &info->rem_qps->rem_qp_num, &cnt,
ORTE_UINT32);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return OMPI_ERROR;
}
BTL_VERBOSE(("XOOB Recv unpack remote qp = %d", info->rem_qps->rem_qp_num));
BTL_VERBOSE(("unpacking %d of %d\n", cnt, ORTE_UINT32));
rc = orte_dss.unpack(buffer, &info->rem_qps->rem_psn, &cnt,
ORTE_UINT32);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return OMPI_ERROR;
}
BTL_VERBOSE(("XOOB Recv unpack remote psn = %d", info->rem_qps->rem_psn));
BTL_VERBOSE(("unpacking %d of %d\n", cnt, ORTE_UINT32));
rc = orte_dss.unpack(buffer, &info->rem_mtu, &cnt, ORTE_UINT32);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return OMPI_ERROR;
}
BTL_VERBOSE(("XOOB Recv unpack remote mtu = %d", info->rem_mtu));
}
if (ENDPOINT_XOOB_CONNECT_REQUEST == *message_type ||
ENDPOINT_XOOB_CONNECT_XRC_REQUEST == *message_type) {
/* unpack requested lid info */
BTL_VERBOSE(("unpacking %d of %d\n", cnt, ORTE_UINT16));
rc = orte_dss.unpack(buffer, lid, &cnt, ORTE_UINT16);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return OMPI_ERROR;
}
BTL_VERBOSE(("XOOB Recv unpack requested lid = %d", *lid));
}
if (ENDPOINT_XOOB_CONNECT_RESPONSE == *message_type ||
ENDPOINT_XOOB_CONNECT_XRC_RESPONSE == *message_type) {
BTL_VERBOSE(("unpacking %d of %d\n", cnt, ORTE_UINT32));
rc = orte_dss.unpack(buffer, &info->rem_index, &cnt, ORTE_UINT32);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return OMPI_ERROR;
}
BTL_VERBOSE(("XOOB Recv unpack remote index = %d", info->rem_index));
for (srq = 0; srq < mca_btl_openib_component.num_xrc_qps; srq++) {
BTL_VERBOSE(("unpacking %d of %d\n", cnt, ORTE_UINT32));
rc = orte_dss.unpack(buffer, &info->rem_srqs[srq].rem_srq_num, &cnt, ORTE_UINT32);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return OMPI_ERROR;
}
BTL_VERBOSE(("XOOB Recv unpack remote index srq num[%d]= %d", srq, info->rem_srqs[srq].rem_srq_num));
}
}
return OMPI_SUCCESS;
}
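The unpacking above implies a simple wire layout: every XOOB message begins with a fixed header (message type, subnet id, sender LID), followed by fields that depend on the message type. The struct below is only an illustration of that layout with hypothetical field names; the actual code packs and unpacks each field individually through orte_dss rather than sending a struct.

#include <stdint.h>

/* Illustrative layout only; not part of the patch. */
struct xoob_msg_sketch {
    /* standard header, present in every message */
    uint8_t  message_type;     /* one of the ENDPOINT_XOOB_CONNECT_* values */
    uint64_t subnet_id;        /* sender's subnet id */
    uint16_t lid;              /* sender's LID */

    /* CONNECT_REQUEST and CONNECT_RESPONSE only */
    uint32_t qp_num;           /* the single QP being exchanged */
    uint32_t psn;
    uint32_t mtu;

    /* CONNECT_REQUEST and CONNECT_XRC_REQUEST only */
    uint16_t requested_lid;    /* LID of the btl the sender wants to reach */

    /* CONNECT_RESPONSE and CONNECT_XRC_RESPONSE only */
    uint32_t rem_index;        /* responder's endpoint index */
    /* ...followed by num_xrc_qps SRQ numbers (uint32_t each) */
};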
/*
* send connect information to remote endpoint
*/
static int xoob_send_connect_data(mca_btl_base_endpoint_t* endpoint,
uint8_t message_type)
{
orte_buffer_t* buffer = OBJ_NEW(orte_buffer_t);
int rc, srq, prio;
if (NULL == buffer) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* Build the standard header that we use in all messages:
* - Message type
* - Our subnet id
* - Our LID
*/
/* pack the info in the send buffer */
BTL_VERBOSE(("XOOB Send pack Message type = %d", message_type));
BTL_VERBOSE(("packing %d of %d\n", 1, ORTE_UINT8));
rc = orte_dss.pack(buffer, &message_type, 1, ORTE_UINT8);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
BTL_VERBOSE(("XOOB Send pack sid = %d", endpoint->subnet_id));
BTL_VERBOSE(("packing %d of %d\n", 1, ORTE_UINT64));
rc = orte_dss.pack(buffer, &endpoint->subnet_id, 1, ORTE_UINT64);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
BTL_VERBOSE(("XOOB Send pack lid = %d", endpoint->endpoint_btl->lid));
BTL_VERBOSE(("packing %d of %d\n", 1, ORTE_UINT16));
rc = orte_dss.pack(buffer, &endpoint->endpoint_btl->lid, 1, ORTE_UINT16);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* Now we append to the standard header the additional information
* that is required for a full (open qp, etc.) connect request and response:
* - qp_num of first qp
* - psn of first qp
* - MTU
*/
if (ENDPOINT_XOOB_CONNECT_REQUEST == message_type ||
ENDPOINT_XOOB_CONNECT_RESPONSE == message_type) {
struct mca_btl_openib_qp_t *qp;
if (ENDPOINT_XOOB_CONNECT_REQUEST == message_type) {
qp = endpoint->qps->qp;
} else {
qp = endpoint->xrc_recv_qp->qp;
}
/* stuff all the QP info into the buffer */
/* we need to send only one QP */
BTL_VERBOSE(("XOOB Send pack qp num = %d", qp->lcl_qp->qp_num));
BTL_VERBOSE(("packing %d of %d\n", 1, ORTE_UINT32));
rc = orte_dss.pack(buffer, &qp->lcl_qp->qp_num,
1, ORTE_UINT32);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
BTL_VERBOSE(("XOOB Send pack lpsn = %d", qp->lcl_psn));
BTL_VERBOSE(("packing %d of %d\n", 1, ORTE_UINT32));
rc = orte_dss.pack(buffer, &qp->lcl_psn, 1,
ORTE_UINT32);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
BTL_VERBOSE(("XOOB Send pack mtu = %d", endpoint->endpoint_btl->hca->mtu));
BTL_VERBOSE(("packing %d of %d\n", 1, ORTE_UINT32));
rc = orte_dss.pack(buffer, &endpoint->endpoint_btl->hca->mtu, 1,
ORTE_UINT32);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* We append to the header above the additional information
* that is required for full & XRC connect requests:
* - The lid of the btl on the remote side that we want to connect to
*/
if (ENDPOINT_XOOB_CONNECT_REQUEST == message_type ||
ENDPOINT_XOOB_CONNECT_XRC_REQUEST == message_type) {
/* when sending a request we add the remote lid that we want to connect to */
BTL_VERBOSE(("XOOB Send pack remote lid = %d", endpoint->ib_addr->lid));
BTL_VERBOSE(("packing %d of %d\n", 1, ORTE_UINT16));
rc = orte_dss.pack(buffer, &endpoint->ib_addr->lid, 1, ORTE_UINT16);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* We append to the header above the additional information
* that is required for full & XRC connect responses:
* - index of our endpoint
* - array of xrc-srq numbers
*/
if (ENDPOINT_XOOB_CONNECT_RESPONSE == message_type ||
ENDPOINT_XOOB_CONNECT_XRC_RESPONSE == message_type) {
/* we need to send the endpoint index for immediate send */
BTL_VERBOSE(("XOOB Send pack index = %d", endpoint->index));
BTL_VERBOSE(("packing %d of %d\n", 1, ORTE_UINT32));
rc = orte_dss.pack(buffer, &endpoint->index, 1, ORTE_UINT32);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* on response we add all SRQ numbers */
for (srq = 0; srq < mca_btl_openib_component.num_xrc_qps; srq++) {
BTL_VERBOSE(("XOOB Send pack srq[%d] num = %d", srq, endpoint->endpoint_btl->qps[srq].u.xrc_qp.xrc->xrc_srq_num));
BTL_VERBOSE(("packing %d of %d\n", 1, ORTE_UINT32));
rc = orte_dss.pack(buffer, &endpoint->endpoint_btl->qps[srq].u.xrc_qp.xrc->xrc_srq_num,
1, ORTE_UINT32);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
prio = (mca_btl_openib_component.qp_infos[srq].size <=
mca_btl_openib_component.eager_limit) ?
BTL_OPENIB_HP_CQ : BTL_OPENIB_LP_CQ;
endpoint->endpoint_btl->hca->cq_users[prio]++;
}
}
/* send to remote endpoint */
rc = orte_rml.send_buffer_nb(&endpoint->endpoint_proc->proc_guid,
buffer, XOOB_TAG, 0,
xoob_rml_send_cb, NULL);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
BTL_VERBOSE(("XOOB Send QP Info, LID = %d, SUBNET = %016x\n, Message type = %d",
endpoint->endpoint_btl->lid,
endpoint->subnet_id,
message_type));
return OMPI_SUCCESS;
}
/*
* Callback when we have finished RML sending the connect data to a
* remote peer
*/
static void xoob_rml_send_cb(int status, orte_process_name_t* endpoint,
orte_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata)
{
OBJ_RELEASE(buffer);
}
static mca_btl_openib_endpoint_t* xoob_find_endpoint(orte_process_name_t* process_name,
uint64_t subnet_id, uint16_t lid, uint8_t message_type)
{
size_t i;
mca_btl_openib_proc_t *ib_proc;
mca_btl_openib_endpoint_t *ib_endpoint = NULL;
bool found = false;
BTL_VERBOSE(("XOOB. searching for ep and proc with follow parameters:"
"jobid %d, vpid %d, sid %d, lid %d",
process_name->jobid, process_name->vpid, subnet_id, lid));
/* find ibproc */
for (ib_proc = (mca_btl_openib_proc_t*)
opal_list_get_first(&mca_btl_openib_component.ib_procs);
ib_proc != (mca_btl_openib_proc_t*)
opal_list_get_end(&mca_btl_openib_component.ib_procs);
ib_proc = (mca_btl_openib_proc_t*)opal_list_get_next(ib_proc)) {
if (orte_ns.compare_fields(ORTE_NS_CMP_ALL,
&ib_proc->proc_guid, process_name) == ORTE_EQUAL) {
found = true;
break;
}
}
/* we found our ib_proc, lets find endpoint now */
if (found) {
for (i = 0; i < ib_proc->proc_endpoint_count; i++) {
ib_endpoint = ib_proc->proc_endpoints[i];
/* we need to check a different
* lid for different message types */
if (ENDPOINT_XOOB_CONNECT_RESPONSE == message_type ||
ENDPOINT_XOOB_CONNECT_XRC_RESPONSE == message_type) {
/* response message */
if (ib_endpoint->subnet_id == subnet_id &&
ib_endpoint->ib_addr->lid == lid) {
break; /* Found one */
}
} else {
/* request message */
if (ib_endpoint->subnet_id == subnet_id &&
ib_endpoint->endpoint_btl->lid == lid) {
break; /* Found one */
}
}
}
if (NULL == ib_endpoint) {
BTL_ERROR(("can't find suitable endpoint for this peer\n"));
}
} else {
BTL_ERROR(("can't find suitable endpoint for this peer\n"));
}
return ib_endpoint;
}
/*
* Non-blocking RML recv callback. Read the incoming QP and other info;
* if this endpoint is trying to connect, reply with our QP info,
* otherwise try to modify the QPs and establish a reliable connection
*/
static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
orte_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata)
{
int rc;
uint8_t message_type;
uint16_t requested_lid = 0;
mca_btl_openib_rem_info_t rem_info;
mca_btl_openib_endpoint_t *ib_endpoint = NULL;
if ( OMPI_SUCCESS != init_rem_info(&rem_info)) {
return;
}
/* Get data. */
if ( OMPI_SUCCESS != xoob_receive_connect_data(&rem_info, &requested_lid, &message_type, buffer)) {
BTL_ERROR(("XOOB. Failed to read data\n"));
return;
}
/* Processing message */
switch (message_type) {
case ENDPOINT_XOOB_CONNECT_REQUEST:
BTL_VERBOSE(("Received ENDPOINT_XOOB_CONNECT_REQUEST: lid %d, sid %d\n",
rem_info.rem_lid,
rem_info.rem_subnet_id));
ib_endpoint = xoob_find_endpoint(process_name,rem_info.rem_subnet_id,
requested_lid, message_type);
if ( NULL == ib_endpoint) {
BTL_ERROR(("XOOB. Got ENDPOINT_XOOB_CONNECT_REQUEST."
" Failed to find endpoint with subnet %d and LID %d",
rem_info.rem_subnet_id,requested_lid));
return;
}
OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
/* we should create qp and send the info + srq to requestor */
rc = xoob_reply_first_connect(ib_endpoint, &rem_info);
if (OMPI_SUCCESS != rc) {
BTL_ERROR(("error in endpoint reply start connect"));
return;
}
/* enable polling for this btl */
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
break;
case ENDPOINT_XOOB_CONNECT_XRC_REQUEST:
/* Pasha: we don't need the remote lid here, do we? */
BTL_VERBOSE(("Received ENDPOINT_XOOB_CONNECT_XRC_REQUEST: lid %d, sid %d\n",
rem_info.rem_lid,
rem_info.rem_subnet_id));
ib_endpoint = xoob_find_endpoint(process_name,rem_info.rem_subnet_id,
requested_lid, message_type);
if ( NULL == ib_endpoint) {
BTL_ERROR(("XOOB. Got ENDPOINT_XOOB_CONNECT_XRC_REQUEST."
" Failed to find endpoint with subnet %d and LID %d",
rem_info.rem_subnet_id,requested_lid));
return;
}
OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
rc = xoob_send_connect_data(ib_endpoint, ENDPOINT_XOOB_CONNECT_XRC_RESPONSE);
if (OMPI_SUCCESS != rc) {
BTL_ERROR(("error in endpoint reply start connect"));
return;
}
/* enable polling for this btl */
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
break;
case ENDPOINT_XOOB_CONNECT_RESPONSE:
BTL_VERBOSE(("Received ENDPOINT_XOOB_CONNECT_RESPONSE: lid %d, sid %d\n",
rem_info.rem_lid,
rem_info.rem_subnet_id));
ib_endpoint = xoob_find_endpoint(process_name, rem_info.rem_subnet_id,
rem_info.rem_lid, message_type);
if ( NULL == ib_endpoint) {
BTL_ERROR(("Xoob. Got ENDPOINT_XOOB_CONNECT_RESPONSE."
" Failed to find endpoint with subnet %d and LID %d",
rem_info.rem_subnet_id,rem_info.rem_lid));
return;
}
OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
/* we got all the data qp+srq. switch the endpoint to connect mode */
XOOB_SET_REMOTE_INFO(ib_endpoint->rem_info, rem_info);
BTL_VERBOSE(("rem_info: lid %d, sid %d ep %d %d",
rem_info.rem_lid,
rem_info.rem_subnet_id,ib_endpoint->rem_info.rem_lid,ib_endpoint->rem_info.rem_subnet_id));
if (OMPI_SUCCESS != xoob_qp_connect(ib_endpoint, SEND, &rem_info)) {
BTL_ERROR(("XOOB: Failed to connect endpoint\n"));
return;
}
mca_btl_openib_endpoint_connected(ib_endpoint);
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
break;
case ENDPOINT_XOOB_CONNECT_XRC_RESPONSE:
BTL_VERBOSE(("Received ENDPOINT_XOOB_CONNECT_XRC_REQUEST: lid %d, sid %d\n",
rem_info.rem_lid,
rem_info.rem_subnet_id));
ib_endpoint = xoob_find_endpoint(process_name, rem_info.rem_subnet_id,
rem_info.rem_lid, message_type);
if ( NULL == ib_endpoint) {
BTL_ERROR(("XOOB. Got ENDPOINT_XOOB_CONNECT_XRC_RESPONSE."
" Failed to find endpoint with subnet %d and LID %d",
rem_info.rem_subnet_id,rem_info.rem_lid));
return;
}
OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
/* we got srq numbers on our request */
XOOB_SET_REMOTE_INFO(ib_endpoint->rem_info, rem_info);
mca_btl_openib_endpoint_connected(ib_endpoint);
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
break;
default :
BTL_ERROR(("XOOB: Invalid message type %d", message_type));
}
free_rem_info(&rem_info);
}
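Taken together with the send path above, this callback implements a four-message handshake. The self-contained sketch below (all names are local to the example, not part of the patch) summarizes which action each incoming message type triggers on the receiving side, mirroring the switch statement above.

#include <stdio.h>

enum xoob_msg_sketch {
    XOOB_REQUEST,        /* first connection to this remote LID: full QP info */
    XOOB_XRC_REQUEST,    /* later endpoints that share the existing XRC QP */
    XOOB_RESPONSE,       /* reply carrying full QP info plus SRQ numbers */
    XOOB_XRC_RESPONSE    /* reply carrying only endpoint index and SRQ numbers */
};

static const char *action_for(enum xoob_msg_sketch m)
{
    switch (m) {
    case XOOB_REQUEST:
        return "create receive QP and reply with CONNECT_RESPONSE";
    case XOOB_XRC_REQUEST:
        return "reuse existing receive QP and reply with CONNECT_XRC_RESPONSE";
    case XOOB_RESPONSE:
        return "connect the send QP and mark the endpoint connected";
    case XOOB_XRC_RESPONSE:
        return "store the remote SRQ numbers and mark the endpoint connected";
    }
    return "invalid message type";
}

int main(void)
{
    printf("XRC request -> %s\n", action_for(XOOB_XRC_REQUEST));
    return 0;
}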
static int init_rem_info(mca_btl_openib_rem_info_t *rem_info)
{
rem_info->rem_qps = (mca_btl_openib_rem_qp_info_t*)malloc(sizeof(mca_btl_openib_rem_qp_info_t));
if (NULL == rem_info->rem_qps) {
BTL_ERROR(("XOOB. Failed to allocate memory for remote QP data\n"));
return OMPI_ERROR;
}
rem_info->rem_srqs = (mca_btl_openib_rem_srq_info_t*)malloc(sizeof(mca_btl_openib_rem_srq_info_t) *
mca_btl_openib_component.num_xrc_qps);
if (NULL == rem_info->rem_srqs) {
BTL_ERROR(("XOOB. Failed to allocate memory for remote SRQ data\n"));
return OMPI_ERROR;
}
return OMPI_SUCCESS;
}
static void free_rem_info(mca_btl_openib_rem_info_t *rem_info)
{
if (NULL != rem_info->rem_qps) {
free(rem_info->rem_qps);
}
if (NULL != rem_info->rem_srqs) {
free(rem_info->rem_srqs);
}
}
#else
/* If XRC support was disabled at compile time we just print a message and return an error */
static int xoob_init(void)
{
printf("xoob init\n");
return OMPI_ERR_NOT_IMPLEMENTED;
}
static int xoob_start_connect(mca_btl_base_endpoint_t *e)
{
printf("xoob start connect\n");
return OMPI_ERR_NOT_IMPLEMENTED;
}
static int xoob_finalize(void)
{
printf("xoob finalize\n");
return OMPI_ERR_NOT_IMPLEMENTED;
}
#endif

View file

@ -0,0 +1,18 @@
/*
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef BTL_OPENIB_CONNECT_XOOB_H
#define BTL_OPENIB_CONNECT_XOOB_H
#include "connect/connect.h"
extern ompi_btl_openib_connect_base_funcs_t ompi_btl_openib_connect_xoob;
#endif

View file

@ -22,17 +22,17 @@
# (the openib BTL).
#
[ini file:file not found]
The Open MPI OpenIB BTL component was unable to find or read an INI
file that was requested via the btl_openib_hca_param_files MCA
parameter. Please check this file and/or modify the
The Open MPI OpenFabrics (openib) BTL component was unable to find or
read an INI file that was requested via the btl_openib_hca_param_files
MCA parameter. Please check this file and/or modify the
btl_openib_hca_param_files MCA parameter:
%s
#
[ini file:not in a section]
In parsing OpenIB BTL parameter file, values were found that were not
in a valid INI section. These values will be ignored. Please
re-check this file:
In parsing the OpenFabrics (openib) BTL parameter file, values were
found that were not in a valid INI section. These values will be
ignored. Please re-check this file:
%s
@ -41,9 +41,9 @@ At line %d, near the following text:
%s
#
[ini file:unexpected token]
In parsing OpenIB BTL parameter file, unexpected tokens were found
(this may cause significant portions of the INI file to be ignored).
Please re-check this file:
In parsing the OpenFabrics (openib) BTL parameter file, unexpected
tokens were found (this may cause significant portions of the INI file
to be ignored). Please re-check this file:
%s
@ -52,10 +52,10 @@ At line %d, near the following text:
%s
#
[ini file:expected equals]
In parsing OpenIB BTL parameter file, unexpected tokens were found
(this may cause significant portions of the INI file to be ignored).
An equals sign ("=") was expected but was not found. Please re-check
this file:
In parsing the OpenFabrics (openib) BTL parameter file, unexpected
tokens were found (this may cause significant portions of the INI file
to be ignored). An equals sign ("=") was expected but was not found.
Please re-check this file:
%s
@ -64,9 +64,10 @@ At line %d, near the following text:
%s
#
[ini file:expected newline]
In parsing OpenIB BTL parameter file, unexpected tokens were found
(this may cause significant portions of the INI file to be ignored).
A newline was expected but was not found. Please re-check this file:
In parsing the OpenFabrics (openib) BTL parameter file, unexpected
tokens were found (this may cause significant portions of the INI file
to be ignored). A newline was expected but was not found. Please
re-check this file:
%s
@ -75,8 +76,8 @@ At line %d, near the following text:
%s
#
[ini file:unknown field]
In parsing OpenIB BTL parameter file, an unrecognized field name was
found. Please re-check this file:
In parsing the OpenFabrics (openib) BTL parameter file, an
unrecognized field name was found. Please re-check this file:
%s
@ -102,10 +103,11 @@ NOTE: You can turn off this warning by setting the MCA parameter
btl_openib_warn_no_hca_params_found to 0.
#
[init-fail-no-mem]
The OpenIB BTL failed to initialize while trying to allocate some
locked memory. This typically can indicate that the memlock limits
are set too low. For most HPC installations, the memlock limits
should be set to "unlimited". The failure occured here:
The OpenFabrics (openib) BTL failed to initialize while trying to
allocate some locked memory. This typically can indicate that the
memlock limits are set too low. For most HPC installations, the
memlock limits should be set to "unlimited". The failure occurred
here:
Host: %s
OMPI source: %s:%d
@ -120,9 +122,10 @@ helpful:
http://www.open-mpi.org/faq/?category=openfabrics#ib-locked-pages
#
[init-fail-create-q]
The OpenIB BTL failed to initialize while trying to create an internal
queue. This typically indicates a failed OpenFabrics installation or
faulty hardware. The failure occured here:
The OpenFabrics (openib) BTL failed to initialize while trying to
create an internal queue. This typically indicates a failed
OpenFabrics installation or faulty hardware. The failure occurred
here:
Host: %s
OMPI source: %s:%d
@ -185,13 +188,14 @@ NOTE: You can turn off this warning by setting the MCA parameter
btl_openib_warn_default_gid_prefix to 0.
#
[ibv_fork requested but not supported]
WARNING: fork() support was requested for the openib BTL, but it is
not supported on the host %s. Deactivating the openib BTL.
WARNING: fork() support was requested for the OpenFabrics (openib)
BTL, but it is not supported on the host %s. Deactivating the
OpenFabrics BTL.
#
[ibv_fork_init fail]
WARNING: fork() support was requested for the openib BTL, but the
library call ibv_fork_init() failed on the host %s.
Deactivating the openib BTL.
WARNING: fork() support was requested for the OpenFabrics (openib)
BTL, but the library call ibv_fork_init() failed on the host %s.
Deactivating the OpenFabrics BTL.
#
[wrong buffer alignment]
Wrong buffer alignment %d configured on host '%s'. Should be bigger
@ -251,15 +255,15 @@ Resolution: %s
#
[no qps in receive_queues]
WARNING: No queue pairs were defined in the btl_openib_receive_queues
MCA parameter. At least one queue pair must be defined. The openib
BTL will therefore be deactivated for this run.
MCA parameter. At least one queue pair must be defined. The
OpenFabrics (openib) BTL will therefore be deactivated for this run.
Host: %s
#
[invalid qp type in receive_queues]
WARNING: An invalid queue pair type was specified in the
btl_openib_receive_queues MCA parameter. The openib BTL will be
deactivated for this run.
btl_openib_receive_queues MCA parameter. The OpenFabrics (openib) BTL
will be deactivated for this run.
Valid queue pair types are "P" for per-peer and "S" for shared receive
queue.
@ -270,8 +274,8 @@ Bad specification: %s
#
[invalid pp qp specification]
WARNING: An invalid per-peer receive queue specification was detected
as part of the btl_openib_receive_queues MCA parameter. The openib
BTL will therefore be deactivated for this run.
as part of the btl_openib_receive_queues MCA parameter. The
OpenFabrics (openib) BTL will therefore be deactivated for this run.
Per-peer receive queues require between 1 and 5 parameters:
@ -297,8 +301,8 @@ Bad queue specification: %s
#
[invalid srq specification]
WARNING: An invalid shared receive queue specification was detected as
part of the btl_openib_receive_queues MCA parameter. The openib BTL
will therefore be deactivated for this run.
part of the btl_openib_receive_queues MCA parameter. The OpenFabrics
(openib) BTL will therefore be deactivated for this run.
Shared receive queues can take between 2 and 4 parameters:
@ -322,8 +326,8 @@ Bad queue specification: %s
[rd_num must be > rd_low]
WARNING: The number of buffers for a queue pair specified via the
btl_openib_receive_queues MCA parameter must be greater than the low
buffer count watermark. The openib BTL will therefore be deactivated
for this run.
buffer count watermark. The OpenFabrics (openib) BTL will therefore
be deactivated for this run.
Host: %s
Bad queue specification: %s
@ -333,8 +337,8 @@ WARNING: The largest queue pair buffer size specified in the
btl_openib_receive_queues MCA parameter is smaller than the maximum
send size (i.e., the btl_openib_max_send_size MCA parameter), meaning
that no queue is large enough to receive the largest possible incoming
message fragment. The openib BTL will therefore be deactivated for
this run.
message fragment. The OpenFabrics (openib) BTL will therefore be
deactivated for this run.
Host: %s
Largest buffer size: %d
@ -364,3 +368,79 @@ bit larger than this for performance reasons.
Host: %s
Specified freelist size: %d
Minimum required freelist size: %d
#
[XRC with PP or SRQ]
WARNING: An invalid queue pair type was specified in the
btl_openib_receive_queues MCA parameter. The OpenFabrics (openib) BTL
will be deactivated for this run.
Note that XRC ("X") queue pairs cannot be used with per-peer ("P") and
SRQ ("S") queue pairs. This restriction may be removed in future
versions of Open MPI.
Host: %s
btl_openib_receive_queues: %s
#
[XRC with BTLs per LID]
WARNING: An invalid queue pair type was specified in the
btl_openib_receive_queues MCA parameter. The OpenFabrics (openib) BTL
will be deactivated for this run.
XRC ("X") queue pairs can not be used when (btls_per_lid > 1). This
restriction may be removed in future versions of Open MPI.
Host: %s
btl_openib_receive_queues: %s
btls_per_lid: %d
#
[XRC on device without XRC support]
WARNING: You configured the OpenFabrics (openib) BTL to run with %d
XRC queues. The device %s does not have XRC capabilities; the
OpenFabrics BTL will ignore this device. If no devices are found with
XRC capabilities, the OpenFabrics BTL will be disabled.
Host: %s
#
[No XRC support]
WARNING: The Open MPI build was compiled without XRC support, but XRC
("X") queues were specified in the btl_openib_receive_queues MCA
parameter. The OpenFabrics (openib) BTL will therefore be deactivated
for this run.
Host: %s
btl_openib_receive_queues: %s
#
[XRC with OOB]
WARNING: You must use the "xoob" OpenFabrics (openib) connection
manager when XRC ("X") queues are specified in the
btl_openib_receive_queues MCA parameter. Either remove the X queues
from btl_openib_receive_queues or ensure to use the "xoob" connection
manager by setting btl_openib_connect to "xoob". The OpenFabrics BTL
will therefore be deactivated for this run.
Host: %s
<<<PASHA>>>: it's probably worthwhile to show the values of the
receive_queues and connect MCA params here, not the number of X
queues.
<<<JEFF>>>>: A: This check we do in "connect" module and we don't have
the receive_queues params string there.
<<PASHA>>: But MCA parameters are globals... From the user
perspective, a help message should show exactly what is wrong. The
number of wrong things is not helpful.
num_xrc_qps: %d
#
[SRQ or PP with XOOB]
WARNING: You cannot specify per-peer ("P") or SRQ ("S") queues
in the btl_openib_receive_queues MCA parameter and also use the "xoob"
connection manager. The "xoob" connection manager is for XRC ("X")
queues only; you must use one of the other connection managers for
per-peer/SRQ queues (such as "oob").
Host: %s
<<<PASHA>>> Same comment as above.
<<<JEFF>>>>: A: This check we do in "connect" module and we don't have
the receive_queues
<<<PASHA>>>: Same somment as above. :-)
params string there.
num_srq_qps: %d
num_pp_qps: %d
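For reference, an all-XRC value of btl_openib_receive_queues uses the same comma-separated entry format as the "P" and "S" examples above, e.g. X,128,256,192,128:X,2048,256,128,32 (the sizes here are purely illustrative), while a value that mixes "X" entries with "P" or "S" entries, such as X,128,256:P,65536,256, is rejected with the messages described above.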