1
1

oob/ud: remove as it has bitrotted

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
Nathan Hjelm 2018-06-19 13:06:26 -06:00
родитель 733cac864a
Коммит 845516ca11
20 изменённых файлов: 0 добавлений и 5043 удалений

Просмотреть файл

@ -1,67 +0,0 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2017 IBM Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Pick up the verbs include paths discovered by configure.
AM_CPPFLAGS = $(orte_oob_ud_CPPFLAGS)

# Install the show_help message catalog with the other ORTE data files.
dist_ortedata_DATA = help-oob-ud.txt

# Sources shared by both the DSO and the static builds of this component.
sources = \
        oob_ud_component.h \
        oob_ud_component.c \
        oob_ud.c \
        oob_ud.h \
        oob_ud_event.c \
        oob_ud_peer.c \
        oob_ud_peer.h \
        oob_ud_ping.c \
        oob_ud_ping.h \
        oob_ud_qp.c \
        oob_ud_qp.h \
        oob_ud_recv.c \
        oob_ud_req.c \
        oob_ud_req.h \
        oob_ud_send.c \
        oob_ud_send.h

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_oob_ud_DSO
component_noinst =
component_install = mca_oob_ud.la
else
component_noinst = libmca_oob_ud.la
component_install =
endif

# DSO build: installed as a loadable module in the ORTE MCA directory,
# linked against libopen-rte, the verbs libs, and the common/verbs glue.
mcacomponentdir = $(ortelibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_oob_ud_la_SOURCES = $(sources)
mca_oob_ud_la_LDFLAGS = -module -avoid-version $(orte_oob_ud_LDFLAGS)
mca_oob_ud_la_LIBADD = $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \
        $(orte_oob_ud_LIBS) \
        $(OPAL_TOP_BUILDDIR)/opal/mca/common/verbs/lib@OPAL_LIB_PREFIX@mca_common_verbs.la

# Static build: convenience library slurped into libopen-rte itself.
noinst_LTLIBRARIES = $(component_noinst)
libmca_oob_ud_la_SOURCES = $(sources)
libmca_oob_ud_la_LDFLAGS = -module -avoid-version $(orte_oob_ud_LDFLAGS)
libmca_oob_ud_la_LIBADD = $(orte_oob_ud_LIBS)

Просмотреть файл

@ -1,63 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2012 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_oob_ud_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Decide whether the oob/ud component can be built: requires a usable
# libibverbs installation. Expands $1 on success, $2 on failure; an explicit
# --with-verbs that cannot be satisfied aborts configure.
AC_DEFUN([MCA_orte_oob_ud_CONFIG],[
    # We need to know if we have verbs support
    AC_REQUIRE([OPAL_CHECK_VERBS_DIR])

    AC_CONFIG_FILES([orte/mca/oob/ud/Makefile])

    # JMS Still have problems with AC_ARG ENABLE not yet having been
    # called or CHECK_WITHDIR'ed.

    # Save the user's flags; the probe below may modify them.
    orte_oob_ud_check_save_CPPFLAGS=$CPPFLAGS
    orte_oob_ud_check_save_LDFLAGS=$LDFLAGS
    orte_oob_ud_check_save_LIBS=$LIBS

    # Probe for the verbs header and the ibv_open_device symbol in the
    # user-specified (or default) install location.
    OPAL_CHECK_PACKAGE([orte_oob_ud],
                       [infiniband/verbs.h],
                       [ibverbs],
                       [ibv_open_device],
                       [],
                       [$opal_verbs_dir],
                       [$opal_verbs_libdir],
                       [orte_oob_ud_check_happy=yes],
                       [orte_oob_ud_check_happy=no])

    # Restore the caller's flags regardless of the probe outcome.
    CPPFLAGS=$orte_oob_ud_check_save_CPPFLAGS
    LDFLAGS=$orte_oob_ud_check_save_LDFLAGS
    LIBS=$orte_oob_ud_check_save_LIBS

    # Build only if verbs was found and the user did not disable it.
    AS_IF([test "$orte_oob_ud_check_happy" = "yes" && test "$opal_want_verbs" != "no"],
          [$1],
          [AS_IF([test "$opal_want_verbs" = "yes"],
                 [AC_MSG_WARN([--with-verbs specified, but cannot build this component])
                  AC_MSG_ERROR([Cannot continue])
                 ])
           $2])

    # substitute in the things needed to build this component
    AC_SUBST([orte_oob_ud_CFLAGS])
    AC_SUBST([orte_oob_ud_CPPFLAGS])
    AC_SUBST([orte_oob_ud_LDFLAGS])
    AC_SUBST([orte_oob_ud_LIBS])
])dnl

Просмотреть файл

@ -1,121 +0,0 @@
# -*- text -*-
#
# Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2006 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# 2015 Mellanox Technologies, Inc.
# All rights reserved.
# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
[no-devices-error]
Open MPI has detected a failure in a basic verbs function call. This
is unusual, and may indicate that something is malfunctioning on this
system.
Your job will continue, but Open MPI will ignore the "ud" oob component
in this run.
Verbs function: ibv_get_device_list()
Error: %s
Hostname: %s
Please contact your system administrator.
#
[no-ports-usable]
Open MPI has detected that there are UD-capable Verbs devices on your
system, but none of them were able to be setup properly. This may
indicate a problem on this system.
Your job will continue, but Open MPI will ignore the "ud" oob component
in this run.
Hostname: %s
#
[reg-mr-failed]
Failed to register memory region (MR):
Hostname: %s
Address: %x
Length: %lu
Error: %s
#
[notify-cq-failed]
Failed to request completion notification on a completion queue (CQ):
Hostname: %s
Error: %s
#
[create-cq-failed]
Failed to create a completion queue (CQ):
Hostname: %s
Requested CQE: %d
Error: %s
Check the CQE attribute.
#
[create-qp-failed]
Failed to create a queue pair (QP):
Hostname: %s
Requested max number of outstanding WRs in the SQ: %u
Requested max number of outstanding WRs in the RQ: %u
Requested max number of SGEs in a WR in the SQ: %u
Requested max number of SGEs in a WR in the RQ: %u
Requested max number of data that can be posted inline to the SQ: %u
Error: %s
Check requested attributes.
#
[poll-cq-failed]
Failed to poll the CQ for work completions:
Hostname: %s
Number of entries: %d
Error: %s
#
[poll-cq-failed-wc]
Failed to poll the CQ for work completions:
Hostname: %s
Number of entries: %d
Entry ID : %d
WC status: %d
#
[post-send-failed]
Failed to post a list of work requests (WRs) to a send queue:
Hostname: %s
Error: %s
#
[post-recv-failed]
Failed to post a list of work requests (WRs) to a receive queue:
Hostname: %s
Error: %s
#
[modify-qp-failed]
Failed to modify the attributes of a queue pair (QP):
Hostname: %s
Mask for QP attributes to be modified: %d
Error: %s
#
[destroy-qp-failed]
Failed to destroy a queue pair (QP):
Hostname: %s
Error: %s
#

Просмотреть файл

@ -1,279 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* 2014 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "orte_config.h"
#include "orte/types.h"
#include "opal/types.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "orte/mca/routed/routed.h"
#include "oob_ud.h"
#include "oob_ud_send.h"
/* NOTE: min() evaluates its arguments more than once -- do not pass
 * expressions with side effects */
#define min(a,b) ((a) < (b) ? (a) : (b))

/* module entry points (defined below in this file) */
static int mca_oob_ud_module_init (void);
static void mca_oob_ud_module_fini (mca_oob_ud_peer_t **peer);
static int mca_oob_ud_set_addr (const orte_process_name_t *name, const char *uri);
static void mca_oob_ud_send_nb(orte_rml_send_t *msg);
static void mca_oob_ud_ping(const orte_process_name_t *proc);

/* the oob/ud module instance: function table handed to the oob framework */
mca_oob_ud_module_t mca_oob_ud_module = {
    {
        mca_oob_ud_module_init,
        mca_oob_ud_module_fini,
        mca_oob_ud_set_addr,
        mca_oob_ud_ping,
        mca_oob_ud_send_nb
    }
};
/* Non-blocking send entry point: logs the request and defers the actual
 * transmission to mca_oob_ud_process_send_nb via the module's event base. */
static void mca_oob_ud_send_nb(orte_rml_send_t *msg) {
    opal_output_verbose(2, orte_oob_base_framework.framework_output,
                        "%s oob:ud:send_nb to peer %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&msg->dst));

    /* push this into our event base for processing */
    ORTE_ACTIVATE_UD_POST_SEND(msg, mca_oob_ud_process_send_nb);
}
/* Ping entry point: defers the actual ping of @proc to
 * mca_oob_ud_process_ping via the module's event base. */
static void mca_oob_ud_ping(const orte_process_name_t *proc) {
    opal_output_verbose(2, orte_oob_base_framework.framework_output,
                        "%s oob:ud:ping proc %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc));

    /* push this into our event base for processing */
    ORTE_ACTIVATE_UD_PING(proc, mca_oob_ud_process_ping);
}
/* uri must be at least 27 bytes in size */
void mca_oob_ud_port_get_uri (mca_oob_ud_port_t *port, char *uri)
{
    /* contact-info format: ud://<listen qp number>.<lid>.<port number> */
    sprintf (uri, "ud://%u.%u.%u", port->listen_qp.ib_qp->qp_num,
             port->lid, port->port_num);
}
/**
 * Record (or clear) the contact address for a peer process.
 *
 * A NULL uri releases any existing peer object for @name; otherwise a peer
 * is created from the uri, or an already-known peer is updated in place.
 * The resulting (possibly NULL) peer pointer is stored in the module's
 * peer table under @name.
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_BAD_PARAM on an unparseable uri, or the
 * error from updating an existing peer.
 */
static int mca_oob_ud_set_addr (const orte_process_name_t *name, const char *uri)
{
    mca_oob_ud_peer_t *peer = NULL;
    int rc;

    opal_output_verbose(5, orte_oob_base_framework.framework_output,
                        "%s oob:ud:set_addr: setting location for peer %s from %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(name), uri);

    /* lookup failure is fine -- peer simply stays NULL */
    (void) mca_oob_ud_peer_lookup (name, &peer);

    if (NULL == uri) {
        /* clearing the address: drop our reference on any existing peer */
        if (NULL != peer) {
            mca_oob_ud_peer_release (peer);
        }
        peer = NULL;
    } else if (NULL == peer) {
        /* first contact info seen for this process name */
        peer = mca_oob_ud_peer_from_uri (uri);
        if (NULL == peer) {
            return ORTE_ERR_BAD_PARAM;
        }
    } else {
        /* refresh an already-known peer's address in place */
        rc = mca_oob_ud_peer_update_with_uri (peer, uri);
        if (ORTE_SUCCESS != rc) {
            return rc;
        }
    }

    if (NULL != peer) {
        peer->peer_name = *name;
        peer->needs_notification = true;
    }

    /* NOTE(review): on the clear path a NULL value is stored, leaving the
     * key in the table with a NULL peer -- confirm lookups handle that */
    opal_proc_table_set_value(&mca_oob_ud_module.peers,
                              *name, (void *)peer);

    return ORTE_SUCCESS;
}
/**
 * Post a single receive work request (buffer slot @msg_num) on the port's
 * listen QP. Two SGEs are used: one for the Global Routing Header that UD
 * delivers ahead of the payload, and one MTU-sized payload buffer. The
 * slot index is encoded into wr_id alongside the MCA_OOB_UD_RECV_WR tag so
 * the completion handler can recover it.
 */
int mca_oob_ud_port_post_one_recv (mca_oob_ud_port_t *port, int msg_num)
{
    char *grh_buf = port->grh_buf.ptr + msg_num * sizeof (struct ibv_grh);
    char *msg_buf = port->msg_buf.ptr + msg_num * port->mtu;
    struct ibv_recv_wr wr;
    struct ibv_sge sge[2];

    /* GRH */
    mca_oob_ud_fill_sge(sge, grh_buf, sizeof (struct ibv_grh), port->grh_buf.mr->lkey);

    /* message */
    mca_oob_ud_fill_sge(sge + 1, msg_buf, port->mtu, port->msg_buf.mr->lkey);

    mca_oob_ud_fill_recv_wr (&wr, sge, 2);
    wr.wr_id = MCA_OOB_UD_RECV_WR | (uint64_t)msg_num;

    return mca_oob_ud_qp_post_recv (&port->listen_qp, &wr);
}
static bool module_has_been_inited = false;

/**
 * One-time module setup: construct and size the peer table.
 * Safe to call more than once; only the first call does any work.
 * Always returns ORTE_SUCCESS.
 */
static int mca_oob_ud_module_init (void)
{
    /* protect against repeat inits */
    if (!module_has_been_inited) {
        module_has_been_inited = true;

        /* table mapping process names -> mca_oob_ud_peer_t pointers */
        OBJ_CONSTRUCT(&mca_oob_ud_module.peers, opal_proc_table_t);
        opal_proc_table_init (&mca_oob_ud_module.peers, 16, 1024);
    }

    return ORTE_SUCCESS;
}
/**
 * Module teardown: release every peer stored in the peer table, then empty
 * and destruct the table itself. @peer is used as scratch storage for the
 * table iteration (required by the fini signature); on return it holds the
 * last value read.
 */
static void mca_oob_ud_module_fini (mca_oob_ud_peer_t **peer)
{
    opal_process_name_t key;
    void *node1, *node2;
    int rc;

    /* iterate all entries; values may be NULL (cleared addresses) */
    rc = opal_proc_table_get_first_key (&mca_oob_ud_module.peers, &key,
                                        (void **) peer, &node1, &node2);
    if (OPAL_SUCCESS == rc) {
        do {
            if (NULL != *peer) {
                mca_oob_ud_peer_release (*peer);
            }
            rc = opal_proc_table_get_next_key (&mca_oob_ud_module.peers, &key,
                                               (void **) peer, node1, &node1, node2, &node2);
        } while (OPAL_SUCCESS == rc);
    }

    opal_proc_table_remove_all(&mca_oob_ud_module.peers);
    OBJ_DESTRUCT(&mca_oob_ud_module.peers);

    return;
}
/**
 * Register the memory behind an iovec array for verbs use and compute how
 * many SGEs, WRs, and payload bytes a send of this data will need when it
 * is fragmented into mtu-sized packets.
 *
 * @param iov        array of count iovecs to send
 * @param ib_mr      array (>= count entries) of memory regions; entries
 *                   that are already non-NULL are assumed registered
 * @param mtu        packet payload size in bytes
 * @param sge_countp [out] total scatter/gather entries required
 * @param wr_countp  [out] number of work requests (packets) required
 * @param data_lenp  [out] total payload bytes across all iovecs
 *
 * @return ORTE_SUCCESS, or ORTE_ERR_OUT_OF_RESOURCE if ibv_reg_mr fails
 */
int mca_oob_ud_register_iov (struct iovec *iov, int count, struct ibv_mr **ib_mr,
                             struct ibv_pd *ib_pd, unsigned int mtu, int *sge_countp,
                             int *wr_countp, int *data_lenp)
{
    int data_len, iov_index, sge_count;
    /* bytes of the current packet already accounted for; carried across
     * iovecs so packets can span iovec boundaries */
    unsigned int packet_size = 0;

    opal_output_verbose (80, orte_oob_base_framework.framework_output,
                         "%s oob:ud:register_iov registering memory", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    *wr_countp = 0;
    *data_lenp = 0;
    *sge_countp = 0;

    for (iov_index = 0, data_len = 0, sge_count = 0 ; iov_index < count ; ++iov_index) {
        unsigned int iov_left = iov[iov_index].iov_len;

        data_len += iov_left;

        /* each iovec needs at least one SGE ... */
        sge_count++;

        /* ... plus one extra SGE every time a packet boundary falls inside
         * this iovec and data remains to spill into the next packet */
        do {
            unsigned int to_trans = min (iov_left, mtu - packet_size);

            packet_size = (to_trans < iov_left) ? 0 : packet_size + to_trans;
            iov_left -= to_trans;

            if (0 == packet_size && iov_left) {
                sge_count++;
            }
        } while (iov_left);

        /* register buffers */
        if (NULL == ib_mr[iov_index]) {
            ib_mr[iov_index] = ibv_reg_mr (ib_pd,
                                           iov[iov_index].iov_base,
                                           iov[iov_index].iov_len,
                                           IBV_ACCESS_LOCAL_WRITE |
                                           IBV_ACCESS_REMOTE_WRITE);
            if (NULL == ib_mr[iov_index]) {
                /* Ruh-roh */
                orte_show_help("help-oob-ud.txt", "reg-mr-failed", true,
                               orte_process_info.nodename, iov[iov_index].iov_base,
                               iov[iov_index].iov_len,strerror(errno));
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
    }

    /* one WR per mtu-sized packet, rounding up */
    *wr_countp = (data_len + mtu - 1) / mtu;
    *sge_countp = sge_count;
    *data_lenp = data_len;

    return ORTE_SUCCESS;
}
/**
 * Register a single contiguous buffer for verbs use and compute the SGE and
 * WR counts needed to send it in mtu-sized packets. Contiguous-buffer
 * counterpart of mca_oob_ud_register_iov(); same packet/SGE accounting.
 *
 * A non-NULL *ib_mr_buf on entry means the buffer is already registered.
 *
 * @return ORTE_SUCCESS, or ORTE_ERR_OUT_OF_RESOURCE if ibv_reg_mr fails
 */
int mca_oob_ud_register_buf (char *buf, int size, struct ibv_mr **ib_mr_buf,
                             struct ibv_pd *ib_pd, unsigned int mtu, int *sge_countp, int *wr_countp)
{
    int sge_count = 0;
    unsigned int packet_size = 0;

    opal_output_verbose (80, orte_oob_base_framework.framework_output,
                         "%s oob:ud:mca_oob_ud_register_buf registering memory", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    *wr_countp = 0;
    *sge_countp = 0;

    unsigned int iov_left = size;

    /* at least one SGE, plus one more each time a packet boundary splits
     * the buffer with data left over */
    sge_count++;
    do {
        unsigned int to_trans = min (iov_left, mtu - packet_size);

        packet_size = (to_trans < iov_left) ? 0 : packet_size + to_trans;
        iov_left -= to_trans;

        if (0 == packet_size && iov_left) {
            sge_count++;
        }
    } while (iov_left);

    /* register buffers */
    if (NULL == *ib_mr_buf) {
        *ib_mr_buf = ibv_reg_mr (ib_pd, buf, size,
                                 IBV_ACCESS_LOCAL_WRITE |
                                 IBV_ACCESS_REMOTE_WRITE);
        if (NULL == *ib_mr_buf) {
            orte_show_help("help-oob-ud.txt", "reg-mr-failed", true,
                           orte_process_info.nodename, buf, size, strerror(errno));
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
    }

    /* one WR per mtu-sized packet, rounding up */
    *wr_countp = (size + mtu - 1) / mtu;
    *sge_countp = sge_count;

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,207 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
*
* 2014 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#if !defined(MCA_OOB_UD_H)
#define MCA_OOB_UD_H
#include "orte_config.h"
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#include <math.h>
#include <infiniband/verbs.h>
#include "opal/types.h"
#include "orte/types.h"
#include "opal/mca/base/base.h"
#include "opal/class/opal_free_list.h"
#include "opal/class/opal_hash_table.h"
#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"
#include "opal/threads/threads.h"
#include "opal/mca/timer/base/base.h"
#include "opal/include/opal_stdint.h"
#include "opal/mca/memchecker/base/base.h"
#include "orte/mca/oob/oob.h"
#include "orte/mca/oob/base/base.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "oob_ud_qp.h"
#include "oob_ud_peer.h"
#include "oob_ud_req.h"
/* Use for valgrind checks*/
#ifdef HAVE_VALGRIND
#include <valgrind/memcheck.h>
#else
#define VALGRIND_MAKE_MEM_DEFINED(addr,len)
#endif
BEGIN_C_DECLS

/* tags OR'd into ibv wr_id values so completions can be classified as
 * sends or receives (low bits carry the buffer slot index for receives) */
enum {
    MCA_OOB_UD_SEND_WR = 0x10000000,
    MCA_OOB_UD_RECV_WR = 0x20000000
};

/* debug verbosity levels */
enum {
    MCA_OOB_UD_DEBUG_NONE,
    MCA_OOB_UD_DEBUG_ALL
};
/** Initialize a signaled UD send work request addressed to @peer. */
static inline void mca_oob_ud_fill_send_wr (struct ibv_send_wr *wr, struct ibv_sge *sge,
                                            int num_sge, const mca_oob_ud_peer_t *peer)
{
    wr->wr_id = MCA_OOB_UD_SEND_WR;
    wr->next = NULL;
    wr->sg_list = sge;
    wr->num_sge = num_sge;
    wr->opcode = IBV_WR_SEND;
    /* signaled so a completion is generated for every send */
    wr->send_flags = IBV_SEND_SIGNALED;

    /* UD sends carry the destination address handle/QPN/QKey in the WR */
    wr->wr.ud.ah = peer->peer_ah;
    wr->wr.ud.remote_qpn = peer->peer_qpn;
    wr->wr.ud.remote_qkey = peer->peer_qkey;
}
/** Initialize a receive work request over the given SGE list. */
static inline void mca_oob_ud_fill_recv_wr (struct ibv_recv_wr *wr, struct ibv_sge *sge,
                                            int num_sge)
{
    /* callers may overwrite wr_id afterwards (e.g. to encode a slot index) */
    wr->wr_id = MCA_OOB_UD_RECV_WR;
    wr->next = NULL;
    wr->sg_list = sge;
    wr->num_sge = num_sge;
}
/** Fill a verbs scatter/gather entry from a buffer, length, and local key. */
static inline void mca_oob_ud_fill_sge (struct ibv_sge *sge, void *addr,
                                        uint32_t length, uint32_t lkey)
{
    /* ibv_sge.addr is a 64-bit field; convert through uintptr_t so the
     * pointer-to-integer conversion is well-defined on 32-bit hosts
     * (a direct cast of void* to uint64_t is implementation-defined) */
    sge->addr = (uint64_t)(uintptr_t) addr;
    sge->length = length;
    sge->lkey = lkey;
}
/** One InfiniBand device (HCA) usable by this component. */
struct mca_oob_ud_device_t {
    opal_list_item_t super;
    /* attributes as returned by ibv_query_device */
    struct ibv_device_attr attr;
    struct ibv_context *ib_context;
    /* completion channel shared by this device's CQs */
    struct ibv_comp_channel *ib_channel;
    struct ibv_pd *ib_pd;
    /* event used by the start/stop_monitor functions below */
    opal_event_t event;
    /* list of mca_oob_ud_port_t set up on this device */
    opal_list_t ports;
};
typedef struct mca_oob_ud_device_t mca_oob_ud_device_t;
OBJ_CLASS_DECLARATION(mca_oob_ud_device_t);
/* events: begin/end monitoring a device's completion channel */
void mca_oob_ud_event_start_monitor (mca_oob_ud_device_t *device);
void mca_oob_ud_event_stop_monitor (mca_oob_ud_device_t *device);

/** A registered memory region together with its backing allocation. */
struct mca_oob_ud_reg_mem_t {
    char *ptr;
    size_t len;
    struct ibv_mr *mr;
};
typedef struct mca_oob_ud_reg_mem_t mca_oob_ud_reg_mem_t;

/** State for one active IB port: listen QP, pools, and addressing info. */
struct mca_oob_ud_port_t {
    opal_list_item_t super;
    /* owning device */
    mca_oob_ud_device_t *device;
    /* QP on which incoming messages are received */
    mca_oob_ud_qp_t listen_qp;
    opal_free_list_t data_qps;
    opal_free_list_t free_msgs;
    /* active MTU in bytes (see mca_oob_ud_port_setup) */
    int mtu;
    uint16_t lid;
    uint8_t port_num;
    /** current send buffer index. used by init function for free_msgs member */
    int send_buffer_index;
    /* pre-registered receive buffers: GRH headers and message payloads */
    mca_oob_ud_reg_mem_t grh_buf;
    mca_oob_ud_reg_mem_t msg_buf;
};
typedef struct mca_oob_ud_port_t mca_oob_ud_port_t;
OBJ_CLASS_DECLARATION(mca_oob_ud_port_t);

int mca_oob_ud_port_post_one_recv (mca_oob_ud_port_t *port, int msg_num);
void mca_oob_ud_port_get_uri (mca_oob_ud_port_t *port, char *uri);
/* Module definition */
typedef int (*mca_oob_ud_module_init_fn_t)(void);
typedef void (*mca_oob_ud_module_fini_fn_t)(mca_oob_ud_peer_t **peer);
typedef int (*mca_oob_ud_set_addr_fn_t)(const orte_process_name_t *name, const char *uri);
typedef void (*mca_oob_ud_ping_fn_t)(const orte_process_name_t *proc);
typedef void (*mca_oob_ud_send_nb_fn_t)(orte_rml_send_t *msg);
typedef int (*mca_oob_ud_recv_nb_fn_t)(orte_process_name_t* peer,
                                       orte_rml_send_t *msg);
typedef int (*mca_oob_ud_recv_cancel_fn_t)(orte_process_name_t *name, int tag);

/** Function table exported by the module. */
typedef struct {
    mca_oob_ud_module_api_t api;
    opal_event_base_t *ev_base; /* event base for the module progress thread */
    bool ev_active;
    opal_thread_t progress_thread;
    opal_proc_table_t peers; // connection addresses for peers
} mca_oob_ud_module_t;
ORTE_MODULE_DECLSPEC extern mca_oob_ud_module_t mca_oob_ud_module;

/* event-base callbacks */
int mca_oob_ud_process_ping(int fd, short args, void *cbdata);
int mca_oob_ud_process_send_nb(int fd, short args, void *cbdata);

/* send */
int mca_oob_ud_send_try (mca_oob_ud_req_t *send_req);
int mca_oob_ud_send_complete (mca_oob_ud_req_t *send_req, int rc);

/* recv */
int mca_oob_ud_recv_nb(orte_process_name_t* peer,
                       orte_rml_send_t *msg);
int mca_oob_ud_recv_cancel(orte_process_name_t* name, int tag);
int mca_oob_ud_recv_complete (mca_oob_ud_req_t *recv_req);
int mca_oob_ud_recv_try (mca_oob_ud_req_t *recv_req);
int mca_oob_ud_recv_match_send (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer,
                                mca_oob_ud_msg_hdr_t *msg_hdr, mca_oob_ud_req_t **reqp);
int mca_oob_ud_get_recv_req (const orte_process_name_t name, const int tag, mca_oob_ud_req_t **reqp, bool iovec_used);

/* memory registration helpers (implemented in oob_ud.c) */
int mca_oob_ud_register_iov (struct iovec *iov, int count, struct ibv_mr **ib_mr,
                             struct ibv_pd *ib_pd, unsigned int mtu, int *sge_countp,
                             int *wr_countp, int *data_lenp);
int mca_oob_ud_register_buf (char *buf, int size, struct ibv_mr **ib_mr_buf,
                             struct ibv_pd *ib_pd, unsigned int mtu, int *sge_countp, int *wr_countp);

void mca_oob_ud_event_queue_completed (mca_oob_ud_req_t *req);

END_C_DECLS

#endif

Просмотреть файл

@ -1,789 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* 2014 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "orte_config.h"
#include "orte/types.h"
#include "opal/types.h"
#include "opal/align.h"
#include "opal/util/sys_limits.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "oob_ud_component.h"
#include "opal/mca/common/verbs/common_verbs.h"
/* component interface functions */
static int mca_oob_ud_component_open (void);
static int mca_oob_ud_component_close (void);
static int mca_oob_ud_component_register (void);
static int mca_oob_ud_component_available(void);
static int mca_oob_ud_component_startup(void);
static int mca_oob_ud_component_send_nb(orte_rml_send_t *msg);
static void mca_oob_ud_component_shutdown(void);
static char* mca_oob_ud_component_get_addr(void);
static int mca_oob_ud_component_set_addr(orte_process_name_t *peer, char **uris);
static bool mca_oob_ud_component_is_reachable(char *routed, orte_process_name_t *peer);
#if OPAL_ENABLE_FT_CR == 1
static int mca_oob_ud_component_ft_event(int state);
#endif // OPAL_ENABLE_FT_CR

/* device/port setup and teardown helpers */
static int mca_oob_ud_listen_create (mca_oob_ud_port_t *port);
static int mca_oob_ud_listen_destroy (mca_oob_ud_port_t *port);
static int mca_oob_ud_port_alloc_buffers (mca_oob_ud_port_t *port);
static inline int mca_oob_ud_port_recv_start (mca_oob_ud_port_t *port);
static inline int mca_oob_ud_alloc_reg_mem (struct ibv_pd *pd, mca_oob_ud_reg_mem_t *reg_mem,
                                            const int buffer_len);
static inline void mca_oob_ud_free_reg_mem (mca_oob_ud_reg_mem_t *reg_mem);
static void mca_oob_ud_cancel_all_in_list (opal_list_t *list);
static void mca_oob_ud_empty_list (opal_list_t *list);

/* OBJ class constructors/destructors */
static void mca_oob_ud_port_construct (mca_oob_ud_port_t *port);
static void mca_oob_ud_port_destruct (mca_oob_ud_port_t *port);
static void mca_oob_ud_device_construct (mca_oob_ud_device_t *device);
static void mca_oob_ud_device_destruct (mca_oob_ud_device_t *device);
/*
 * Struct of function pointers and all that to let us be initialized.
 * The base component section is consumed by the MCA framework; the rest
 * are the oob-specific entry points.
 */
mca_oob_ud_component_t mca_oob_ud_component = {
    {
        .oob_base = {
            MCA_OOB_BASE_VERSION_2_0_0,
            .mca_component_name = "ud",
            MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION,
                                  ORTE_RELEASE_VERSION),
            .mca_open_component = mca_oob_ud_component_open,
            .mca_close_component = mca_oob_ud_component_close,
            .mca_register_component_params = mca_oob_ud_component_register,
        },
        .oob_data = {
            /* The component is checkpoint ready */
            MCA_BASE_METADATA_PARAM_CHECKPOINT
        },
        .priority = 0, //set the priority so that we will select this component only if someone directs to do so
        .available = mca_oob_ud_component_available, //available
        .startup = mca_oob_ud_component_startup, //startup
        .shutdown = mca_oob_ud_component_shutdown, //shutdown
        .send_nb = mca_oob_ud_component_send_nb, //send_nb
        .get_addr = mca_oob_ud_component_get_addr,
        .set_addr = mca_oob_ud_component_set_addr,
        .is_reachable = mca_oob_ud_component_is_reachable, //is_reachable
#if OPAL_ENABLE_FT_CR == 1
        .ft_event = mca_oob_ud_component_ft_event,
#endif // OPAL_ENABLE_FT_CR
    },
};
/* Construct the component-global lists and locks. Paired with
 * mca_oob_ud_component_close(). */
static int mca_oob_ud_component_open (void)
{
    OBJ_CONSTRUCT(&mca_oob_ud_component.ud_devices, opal_list_t);
    OBJ_CONSTRUCT(&mca_oob_ud_component.ud_active_sends, opal_list_t);
    OBJ_CONSTRUCT(&mca_oob_ud_component.ud_active_recvs, opal_list_t);
    OBJ_CONSTRUCT(&mca_oob_ud_component.ud_event_queued_reqs, opal_list_t);
    OBJ_CONSTRUCT(&mca_oob_ud_component.ud_event_processing_msgs, opal_list_t);
    OBJ_CONSTRUCT(&mca_oob_ud_component.ud_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&mca_oob_ud_component.ud_match_lock, opal_mutex_t);

    return ORTE_SUCCESS;
}
/* Destruct the component-global lists and locks built in component_open. */
static int mca_oob_ud_component_close (void)
{
    opal_output_verbose(5, orte_oob_base_framework.framework_output,
                        "%s oob:ud:component_close entering",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    OBJ_DESTRUCT(&mca_oob_ud_component.ud_devices);
    OBJ_DESTRUCT(&mca_oob_ud_component.ud_active_sends);
    OBJ_DESTRUCT(&mca_oob_ud_component.ud_active_recvs);
    OBJ_DESTRUCT(&mca_oob_ud_component.ud_event_queued_reqs);
    OBJ_DESTRUCT(&mca_oob_ud_component.ud_lock);
    OBJ_DESTRUCT(&mca_oob_ud_component.ud_match_lock);
    /* NOTE(review): this list is destructed after the locks, unlike the
     * construction order in component_open -- confirm nothing depends on
     * the ordering */
    OBJ_DESTRUCT(&mca_oob_ud_component.ud_event_processing_msgs);

    return ORTE_SUCCESS;
}
/**
 * Register this component's MCA parameters (QP pool sizes, buffer counts,
 * retry/timeout tuning, and QP creation attributes).
 *
 * Fix: the descriptions of the max_*_sge and max_*_wr parameters were
 * swapped (SGE params were described as "outstanding WRs" and vice versa);
 * they now match the parameter semantics and the create-qp-failed help
 * text (WR = work request, SGE = scatter/gather element).
 */
static int mca_oob_ud_component_register (void)
{
    mca_base_component_t *component = &mca_oob_ud_component.super.oob_base;

    mca_oob_ud_component.ud_min_qp = 8;

    (void) mca_base_component_var_register (component, "min_qp", "Minimum number of UD queue pairs "
                                            "to allocate (default: 8)", MCA_BASE_VAR_TYPE_INT, NULL,
                                            0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
                                            MCA_BASE_VAR_SCOPE_LOCAL, &mca_oob_ud_component.ud_min_qp);

    mca_oob_ud_component.ud_max_qp = 32;

    (void) mca_base_component_var_register (component, "max_qp", "Maximum number of UD queue pairs "
                                            "to allocate (default: 32)", MCA_BASE_VAR_TYPE_INT, NULL,
                                            0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
                                            MCA_BASE_VAR_SCOPE_LOCAL, &mca_oob_ud_component.ud_max_qp);

    mca_oob_ud_component.ud_recv_buffer_count = 512;

    (void) mca_base_component_var_register (component, "recv_buffers", "Number of MTU sized recv "
                                            "buffers to post (default: 512)", MCA_BASE_VAR_TYPE_INT, NULL,
                                            0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
                                            MCA_BASE_VAR_SCOPE_LOCAL, &mca_oob_ud_component.ud_recv_buffer_count);

    mca_oob_ud_component.ud_send_buffer_count = 512;

    (void) mca_base_component_var_register (component, "send_buffers", "Number of MTU sized send "
                                            "buffers to allocate (default: 512)", MCA_BASE_VAR_TYPE_INT, NULL,
                                            0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
                                            MCA_BASE_VAR_SCOPE_LOCAL, &mca_oob_ud_component.ud_send_buffer_count);

    mca_oob_ud_component.ud_max_retries = 5;
    (void)mca_base_component_var_register(component, "peer_retries",
                                          "Number of times to try shutting down a connection before giving up",
                                          MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                          OPAL_INFO_LVL_9,
                                          MCA_BASE_VAR_SCOPE_LOCAL,
                                          &mca_oob_ud_component.ud_max_retries);

    mca_oob_ud_component.ud_timeout_usec = 800000;
    (void)mca_base_component_var_register(component, "peer_timeout",
                                          "Timeout in microseconds between retransmission of data",
                                          MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                          OPAL_INFO_LVL_9,
                                          MCA_BASE_VAR_SCOPE_LOCAL,
                                          &mca_oob_ud_component.ud_timeout_usec);

    /* QP creation attributes -- descriptions corrected to match semantics */
    mca_oob_ud_component.ud_qp_max_send_sge = 1;
    (void)mca_base_component_var_register(component, "max_send_sge",
                                          "Requested max number of scatter/gather (s/g) elements in a WR in the SQ",
                                          MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                          OPAL_INFO_LVL_9,
                                          MCA_BASE_VAR_SCOPE_LOCAL,
                                          &mca_oob_ud_component.ud_qp_max_send_sge);

    mca_oob_ud_component.ud_qp_max_recv_sge = 2;
    (void)mca_base_component_var_register(component, "max_recv_sge",
                                          "Requested max number of scatter/gather (s/g) elements in a WR in the RQ",
                                          MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                          OPAL_INFO_LVL_9,
                                          MCA_BASE_VAR_SCOPE_LOCAL,
                                          &mca_oob_ud_component.ud_qp_max_recv_sge);

    mca_oob_ud_component.ud_qp_max_send_wr = 4096;
    (void)mca_base_component_var_register(component, "max_send_wr",
                                          "Requested max number of outstanding WRs in the SQ",
                                          MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                          OPAL_INFO_LVL_9,
                                          MCA_BASE_VAR_SCOPE_LOCAL,
                                          &mca_oob_ud_component.ud_qp_max_send_wr);

    mca_oob_ud_component.ud_qp_max_recv_wr = 4096;
    (void)mca_base_component_var_register(component, "max_recv_wr",
                                          "Requested max number of outstanding WRs in the RQ",
                                          MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                          OPAL_INFO_LVL_9,
                                          MCA_BASE_VAR_SCOPE_LOCAL,
                                          &mca_oob_ud_component.ud_qp_max_recv_wr);

    mca_oob_ud_component.ud_qp_max_inline_data = 0;
    (void)mca_base_component_var_register(component, "max_inline_data",
                                          "Requested max number of data (bytes) that can be posted inline to the SQ",
                                          MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                          OPAL_INFO_LVL_9,
                                          MCA_BASE_VAR_SCOPE_LOCAL,
                                          &mca_oob_ud_component.ud_qp_max_inline_data);

    return ORTE_SUCCESS;
}
/* Called by the oob base to ask whether this component can run; always
 * says yes and records the event base the module should use. */
static int mca_oob_ud_component_available(void) {
    opal_output_verbose(5, orte_oob_base_framework.framework_output,
                        "oob:ud: component_available called");

    /* set the module event base - this is where we would spin off a separate
     * progress thread if so desired */
    mca_oob_ud_module.ev_base = orte_event_base;

    return ORTE_SUCCESS;
}
/* maps enum ibv_mtu (IBV_MTU_256 = 1 .. IBV_MTU_4096 = 5) to bytes */
static int port_mtus[] = {0, 256, 512, 1024, 2048, 4096};

/**
 * Query one IB port and cache its lid and active MTU in bytes.
 * Returns ORTE_ERROR if the port is not active or has no lid.
 */
static inline int mca_oob_ud_port_setup (mca_oob_ud_port_t *port)
{
    int rc;
    struct ibv_port_attr port_attr;

    rc = ibv_query_port (port->device->ib_context, port->port_num, &port_attr);
    if (0 != rc || IBV_PORT_ACTIVE != port_attr.state || 0 == port_attr.lid) {
        /* skip this port */
        return ORTE_ERROR;
    }

    port->lid = port_attr.lid;

    /* fall back to 2048 bytes if the reported MTU is beyond the table */
    port->mtu = port_attr.active_mtu > IBV_MTU_4096 ? 2048 : port_mtus[port_attr.active_mtu];

    opal_output_verbose(5, orte_oob_base_framework.framework_output,
                        "%s oob:ud:port_setup found port: num = %u, lid = %u, mtu = %u",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        port->port_num, port->lid, port->mtu);

    /* rc is 0 (success) here */
    return rc;
}
/**
 * Open an IB device, allocate its completion channel and protection domain,
 * and set up the first usable (active, lid-assigned) port on it.
 *
 * NOTE(review): on error paths the context/channel/pd opened so far are not
 * released here -- presumably the caller's OBJ_RELEASE(device) destructor
 * cleans them up; confirm against mca_oob_ud_device_destruct.
 */
static inline int mca_oob_ud_device_setup (mca_oob_ud_device_t *device,
                                           struct ibv_device *ib_device)
{
    int rc, port_num;

    opal_output_verbose(5, orte_oob_base_framework.framework_output,
                        "%s oob:ud:device_setup attempting to setup ib device %p",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) ib_device);

    device->ib_context = ibv_open_device (ib_device);
    if (NULL == device->ib_context) {
        opal_output_verbose(5, orte_oob_base_framework.framework_output,
                            "%s oob:ud:device_setup error opening device. errno = %d",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
        return ORTE_ERROR;
    }

    rc = ibv_query_device (device->ib_context, &device->attr);
    if (0 != rc) {
        opal_output_verbose(5, orte_oob_base_framework.framework_output,
                            "%s oob:ud:device_setup error querying device. errno = %d",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
        return ORTE_ERROR;
    }

    device->ib_channel = ibv_create_comp_channel (device->ib_context);
    if (NULL == device->ib_channel) {
        opal_output_verbose(5, orte_oob_base_framework.framework_output,
                            "%s oob:ud:device_setup error completing completion channel."
                            "errno = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
        return ORTE_ERROR;
    }

    device->ib_pd = ibv_alloc_pd (device->ib_context);
    if (NULL == device->ib_pd) {
        opal_output_verbose(5, orte_oob_base_framework.framework_output,
                            "%s oob:ud:device_setup error allocating protection domain."
                            "errno = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
        return ORTE_ERROR;
    }

    /* IB ports are numbered from 1; keep only the first usable port (note
     * the break below -- only one port per device is used) */
    for (port_num = 1 ; port_num <= device->attr.phys_port_cnt ; ++port_num) {
        mca_oob_ud_port_t *port = OBJ_NEW(mca_oob_ud_port_t);
        if (NULL == port) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }

        port->device = device;
        port->port_num = port_num;

        rc = mca_oob_ud_port_setup (port);
        if (ORTE_SUCCESS != rc) {
            OBJ_RELEASE(port);
            continue;
        }

        opal_list_append (&device->ports, (opal_list_item_t *) port);

        break;
    }

    if (0 == opal_list_get_size(&device->ports)) {
        opal_output_verbose(5, orte_oob_base_framework.framework_output,
                            "%s oob:ud:device_setup could not init device. no usable "
                            "ports present", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return ORTE_ERROR;
    }

    return ORTE_SUCCESS;
}
/* mca_oob_ud_component_startup:
 *
 * Open a usable verbs device/port and begin monitoring it for completion
 * events.  Returns ORTE_ERR_NOT_FOUND when no usable device or port exists
 * (the component is then simply not selected); ORTE_ERROR on hard failure.
 */
static int mca_oob_ud_component_startup(void)
{
    struct ibv_device **devices;
    int num_devices, i, rc;
    opal_list_item_t *item, *item2;
    bool found_one = false;

    /* If fork support is requested, try to enable it */
    rc = opal_common_verbs_fork_test();
    if (OPAL_SUCCESS != rc) {
        opal_output_verbose(5, orte_oob_base_framework.framework_output,
                            "%s oob:ud:device_setup failed in ibv_fork_init. errno = %d",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
        return ORTE_ERROR;
    }

    /* If there are no devices, it is not an error; we just won't use
       this component. */
    devices = ibv_get_device_list (&num_devices);
    if (NULL == devices) {
        return ORTE_ERR_NOT_FOUND;
    }
    if (0 == num_devices) {
        ibv_free_device_list(devices);
        return ORTE_ERR_NOT_FOUND;
    }

    for (i = 0 ; i < num_devices ; ++i) {
        mca_oob_ud_device_t *device = OBJ_NEW(mca_oob_ud_device_t);
        if (NULL == device) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            /* BUG FIX: the array returned by ibv_get_device_list() was
               previously leaked on this error path */
            ibv_free_device_list (devices);
            return ORTE_ERROR;
        }
        rc = mca_oob_ud_device_setup (device, devices[i]);
        if (ORTE_SUCCESS != rc) {
            OBJ_RELEASE(device);
            continue;
        }
        opal_list_append (&mca_oob_ud_component.ud_devices,
                          (opal_list_item_t *) device);
        /* NTH: support only 1 device for now */
        break;
    }

    ibv_free_device_list (devices);

    /* If no usable devices are found, then just ignore this component
       in this run */
    if (0 == opal_list_get_size (&mca_oob_ud_component.ud_devices)) {
        return ORTE_ERR_NOT_FOUND;
    }

    opal_output_verbose(5, orte_oob_base_framework.framework_output,
                        "%s oob:ud:init initializing oob/openib. # of devices = %u",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (unsigned int) opal_list_get_size (&mca_oob_ud_component.ud_devices));

    for (item = opal_list_get_first (&mca_oob_ud_component.ud_devices);
         item != opal_list_get_end (&mca_oob_ud_component.ud_devices);
         item = opal_list_get_next (item)) {
        mca_oob_ud_device_t *device = (mca_oob_ud_device_t *) item;

        /* start monitoring the device for completions */
        for (item2 = opal_list_get_first (&device->ports) ;
             item2 != opal_list_get_end (&device->ports) ;
             item2 = opal_list_get_next (item2)) {
            mca_oob_ud_port_t *port = (mca_oob_ud_port_t *) item2;

            rc = mca_oob_ud_listen_create (port);
            if (0 != rc) {
                continue;
            }
            rc = mca_oob_ud_port_alloc_buffers (port);
            if (ORTE_SUCCESS != rc) {
                mca_oob_ud_listen_destroy (port);
                continue;
            }
            rc = opal_free_list_init (&port->data_qps,
                                      sizeof (mca_oob_ud_qp_t), 8,
                                      OBJ_CLASS(mca_oob_ud_qp_t), 0, 0,
                                      mca_oob_ud_component.ud_min_qp,
                                      mca_oob_ud_component.ud_max_qp,
                                      2, NULL, 0, NULL, NULL, NULL);
            if (OPAL_SUCCESS != rc) {
                mca_oob_ud_listen_destroy (port);
                continue;
            }
            rc = mca_oob_ud_port_recv_start (port);
            if (ORTE_SUCCESS != rc) {
                mca_oob_ud_listen_destroy (port);
                continue;
            }
            /* NTH: only supports one port for now */
            found_one = true;
            /* NTH: since we only support one port start monitoring now */
            mca_oob_ud_event_start_monitor (device);
            break;
        }
    }

    if (!found_one) {
        orte_show_help("help-oob-ud.txt", "no-ports-usable", true,
                       orte_process_info.nodename);
        return ORTE_ERR_NOT_FOUND;
    }

    /* have to call the module init here so we can test for available qpair */
    if ((NULL != mca_oob_ud_module.api.init) && (ORTE_SUCCESS != (rc = mca_oob_ud_module.api.init()))){
        return ORTE_ERROR;
    }
    return ORTE_SUCCESS;
}
/* mca_oob_ud_component_shutdown:
 *
 * Tear the component down: say goodbye to the parent (if any), abort all
 * in-flight requests, finalize the module, stop device event monitoring and
 * release the devices.
 */
static void mca_oob_ud_component_shutdown(void)
{
    /* BUG FIX: initialize peer.  If the parent-vpid branch below is not
       taken (or the lookup fails), peer was previously passed to
       api.finalize() uninitialized. */
    mca_oob_ud_peer_t *peer = NULL;
    opal_list_item_t *item;

    opal_output_verbose(5, orte_oob_base_framework.framework_output,
                        "%s oob:ud:fini entering",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_lock);
    OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_match_lock);

    /* tell our parent we are going away */
    if (ORTE_VPID_INVALID != ORTE_PROC_MY_PARENT->vpid) {
        if (ORTE_SUCCESS == mca_oob_ud_peer_lookup (ORTE_PROC_MY_PARENT, &peer) && NULL != peer) {
            mca_oob_ud_peer_handle_end (peer);
        }
    }

    /* abort active receives */
    mca_oob_ud_cancel_all_in_list (&mca_oob_ud_component.ud_active_recvs);
    mca_oob_ud_cancel_all_in_list (&mca_oob_ud_component.ud_active_sends);
    mca_oob_ud_empty_list (&mca_oob_ud_component.ud_event_queued_reqs);
    OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_match_lock);

    if (NULL != mca_oob_ud_module.api.finalize) {
        mca_oob_ud_module.api.finalize(&peer);
    }

    /* stop monitoring all devices, then drop them */
    for (item = opal_list_get_first (&mca_oob_ud_component.ud_devices);
         item != opal_list_get_end (&mca_oob_ud_component.ud_devices);
         item = opal_list_get_next (item)) {
        mca_oob_ud_event_stop_monitor ((mca_oob_ud_device_t *) item);
    }

    mca_oob_ud_empty_list (&mca_oob_ud_component.ud_devices);
    OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_lock);
}
static char* mca_oob_ud_component_get_addr(void) {
/* NTH: qp_num - 32 bits (10), lid - 16 bits (5), port - 8 bits (3) + ud:// + 3 .'s + \0 = 27 chars */
char *contact_info = (char *) calloc(opal_list_get_size(&mca_oob_ud_component.ud_devices) * 27, 1);
char *ptr = contact_info;
opal_list_item_t *item, *port_item;
*ptr = 0;
for (item = opal_list_get_first (&mca_oob_ud_component.ud_devices) ;
item != opal_list_get_end (&mca_oob_ud_component.ud_devices) ;
item = opal_list_get_next (item)) {
mca_oob_ud_device_t *device = (mca_oob_ud_device_t *) item;
for (port_item = opal_list_get_first (&device->ports);
port_item != opal_list_get_end (&device->ports);
port_item = opal_list_get_next (port_item)) {
if (ptr != contact_info) {
ptr += sprintf (ptr, ";");
}
mca_oob_ud_port_get_uri ((mca_oob_ud_port_t *) port_item, ptr);
ptr += strlen (ptr);
}
}
opal_output_verbose(5, orte_oob_base_framework.framework_output,
"%s oob:ud:get_addr contact information: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), contact_info);
return contact_info;
}
/* Hand a non-blocking send off to the active module.  Fails only when the
 * module has no send_nb entry point. */
static int mca_oob_ud_component_send_nb(orte_rml_send_t *msg) {
    if (NULL == mca_oob_ud_module.api.send_nb) {
        return ORTE_ERROR;
    }
    mca_oob_ud_module.api.send_nb(msg);
    return ORTE_SUCCESS;
}
/* Walk the NULL-terminated uris array and feed every "ud:"-prefixed entry to
 * the module's set_addr hook.  Stops (and reports) on the first failure. */
static int mca_oob_ud_component_set_addr(orte_process_name_t *peer, char **uris)
{
    int rc = ORTE_SUCCESS;

    OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_lock);
    for (int idx = 0; ORTE_SUCCESS == rc && NULL != uris[idx]; idx++) {
        /* only URIs belonging to this transport are of interest */
        if (0 != strncmp(uris[idx], "ud:", 3)) {
            continue;
        }
        if (NULL != mca_oob_ud_module.api.set_addr) {
            rc = mca_oob_ud_module.api.set_addr(peer, uris[idx]);
        }
    }
    OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_lock);
    return rc;
}
#if OPAL_ENABLE_FT_CR == 1
/* Fault-tolerance / checkpoint-restart event hook.  This component takes no
 * action on FT state transitions; the state argument is deliberately ignored. */
static int mca_oob_ud_component_ft_event(int state) {
(void) state;
return ORTE_SUCCESS;
}
#endif // OPAL_ENABLE_FT_CR
/* Allocate and register the GRH and message buffers for a port, then build
 * the free list of pre-registered send messages.  Partially allocated
 * buffers are cleaned up by the port destructor on failure. */
static int mca_oob_ud_port_alloc_buffers (mca_oob_ud_port_t *port) {
    const int total_buffer_count = mca_oob_ud_component.ud_recv_buffer_count +
        mca_oob_ud_component.ud_send_buffer_count;
    int ret;

    /* one GRH slot per receive buffer */
    ret = mca_oob_ud_alloc_reg_mem (port->device->ib_pd, &port->grh_buf,
                                    mca_oob_ud_component.ud_recv_buffer_count * sizeof (struct ibv_grh));
    if (ORTE_SUCCESS == ret) {
        /* one MTU-sized slot per send and receive buffer */
        ret = mca_oob_ud_alloc_reg_mem (port->device->ib_pd, &port->msg_buf,
                                        total_buffer_count * port->mtu);
    }
    if (ORTE_SUCCESS == ret) {
        port->send_buffer_index = 0;
        ret = opal_free_list_init (&port->free_msgs, sizeof (mca_oob_ud_msg_t), 8,
                                   OBJ_CLASS(mca_oob_ud_msg_t), 0, 0,
                                   mca_oob_ud_component.ud_send_buffer_count,
                                   mca_oob_ud_component.ud_send_buffer_count, 0,
                                   NULL, 0, NULL, mca_oob_ud_msg_init, port);
    }
    return ret;
}
/* A peer is reachable iff the routed framework can name a next hop to it. */
static bool mca_oob_ud_component_is_reachable(char *routed, orte_process_name_t *peer_name)
{
    orte_process_name_t next_hop = orte_routed.get_route(routed, peer_name);
    const bool have_route = (ORTE_JOBID_INVALID != next_hop.jobid) &&
                            (ORTE_VPID_INVALID != next_hop.vpid);

    if (!have_route) {
        ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
    }
    return have_route;
}
/* Object constructor: zero every field that follows the base-class header,
 * then construct the embedded sub-objects. */
static void mca_oob_ud_port_construct (mca_oob_ud_port_t *port)
{
/* zero only the derived part; 'super' was constructed by the OBJ system */
memset((char *) port + sizeof (port->super), 0, sizeof (*port) - sizeof (port->super));
OBJ_CONSTRUCT(&port->data_qps, opal_free_list_t);
OBJ_CONSTRUCT(&port->free_msgs, opal_free_list_t);
/* NOTE(review): listen_qp is constructed as a plain free-list item here and
 * later initialized via mca_oob_ud_qp_init() -- confirm this matches the
 * mca_oob_ud_qp_t class hierarchy */
OBJ_CONSTRUCT(&port->listen_qp, opal_free_list_item_t);
}
/* Object destructor.  The listen QP is destroyed before the registered
 * memory it posts receives into is released. */
static void mca_oob_ud_port_destruct (mca_oob_ud_port_t *port)
{
(void) mca_oob_ud_listen_destroy (port);
OBJ_DESTRUCT(&port->data_qps);
OBJ_DESTRUCT(&port->free_msgs);
mca_oob_ud_free_reg_mem (&port->grh_buf);
mca_oob_ud_free_reg_mem (&port->msg_buf);
}
/* Class instance for mca_oob_ud_port_t (a listable object). */
OBJ_CLASS_INSTANCE(mca_oob_ud_port_t, opal_list_item_t,
mca_oob_ud_port_construct,
mca_oob_ud_port_destruct);
/* Create the listen queue pair for a port on the device's completion
 * channel (not transitioned to RTS here; see mca_oob_ud_port_recv_start). */
static int mca_oob_ud_listen_create (mca_oob_ud_port_t *port) {
    return mca_oob_ud_qp_init (&port->listen_qp, port,
                               port->device->ib_channel, NULL, false);
}
/* mca_oob_ud_listen_destroy:
 *
 * Destroy the listen queue pair associated with a port.  Safe to call with
 * a NULL port or when the listen QP was never created.
 */
static int mca_oob_ud_listen_destroy (mca_oob_ud_port_t *port)
{
if (NULL == port || NULL == port->listen_qp.ib_qp) {
return ORTE_SUCCESS;
}
OBJ_DESTRUCT(&port->listen_qp);
return ORTE_SUCCESS;
}
/* Bring the listen QP to RTS, post the configured number of receive
 * buffers, and arm completion notifications on the receive CQ. */
static inline int mca_oob_ud_port_recv_start (mca_oob_ud_port_t *port)
{
    int rc = mca_oob_ud_qp_to_rts (&port->listen_qp);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    opal_output_verbose(5, orte_oob_base_framework.framework_output,
                        "%s oob:ud:port_recv_start posting "
                        "%d message buffers", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        mca_oob_ud_component.ud_recv_buffer_count);

    for (int buf_index = 0 ; buf_index < mca_oob_ud_component.ud_recv_buffer_count ; ++buf_index) {
        rc = mca_oob_ud_port_post_one_recv (port, buf_index);
        if (ORTE_SUCCESS != rc) {
            return rc;
        }
    }

    if (0 != ibv_req_notify_cq (port->listen_qp.ib_recv_cq, 0)) {
        orte_show_help("help-oob-ud.txt", "notify-cq-failed", true,
                       orte_process_info.nodename, strerror(errno));
        return ORTE_ERROR;
    }
    return ORTE_SUCCESS;
}
/* mca_oob_ud_alloc_reg_mem:
 *
 * Allocate a page-aligned, zeroed buffer of at least buffer_len bytes and
 * register it with the given protection domain.  On error the partially
 * initialized descriptor is left for mca_oob_ud_free_reg_mem() to clean up.
 */
static inline int mca_oob_ud_alloc_reg_mem (struct ibv_pd *pd, mca_oob_ud_reg_mem_t *reg_mem,
                                            const int buffer_len)
{
    size_t buffer_len_aligned, page_size;

    reg_mem->len = buffer_len;
    reg_mem->ptr = NULL;
    reg_mem->mr  = NULL;

    /* The allocated buffer should be a multiple of page size.
       If ibv_fork_init() has been invoked the pages are marked MADV_DONTFORK.
       If we only partially use a page, any data allocated on the remainder of
       the page will be inaccessible to the child process */
    page_size = opal_getpagesize();
    buffer_len_aligned = OPAL_ALIGN(buffer_len, page_size, size_t);

    opal_output_verbose(5, orte_oob_base_framework.framework_output,
                        "%s oob:ud:alloc_reg_mem allocing and registering %d bytes of memory with pd %p",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), buffer_len, (void *) pd);

    /* BUG FIX: check posix_memalign()'s return code.  On failure the
       contents of the output pointer are unspecified, so the previous
       NULL test on reg_mem->ptr was not a reliable error check. */
    if (0 != posix_memalign ((void **)&reg_mem->ptr, page_size, buffer_len_aligned)) {
        reg_mem->ptr = NULL;
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    memset (reg_mem->ptr, 0, buffer_len);

    reg_mem->mr = ibv_reg_mr (pd, reg_mem->ptr, buffer_len, IBV_ACCESS_LOCAL_WRITE);
    if (NULL == reg_mem->mr) {
        orte_show_help("help-oob-ud.txt", "reg-mr-failed", true,
                       orte_process_info.nodename, reg_mem->ptr, buffer_len, strerror(errno));
        return ORTE_ERROR;
    }
    return ORTE_SUCCESS;
}
/* Deregister and release a buffer obtained from mca_oob_ud_alloc_reg_mem().
 * The descriptor is cleared afterwards so a repeated call is harmless. */
static inline void mca_oob_ud_free_reg_mem (mca_oob_ud_reg_mem_t *reg_mem)
{
    if (NULL != reg_mem->mr) {
        (void) ibv_dereg_mr (reg_mem->mr);
    }
    free (reg_mem->ptr);  /* free(NULL) is a no-op */
    memset (reg_mem, 0, sizeof (*reg_mem));
}
/* Detach every request on the list and abort it. */
static void mca_oob_ud_cancel_all_in_list (opal_list_t *list)
{
    mca_oob_ud_req_t *req;

    while (NULL != (req = (mca_oob_ud_req_t *) opal_list_remove_first (list))) {
        req->req_list = NULL;
        mca_oob_ud_req_abort (req);
    }
}
/* Drain a list, dropping one reference on each removed item. */
static void mca_oob_ud_empty_list (opal_list_t *list)
{
    for (opal_list_item_t *entry = opal_list_remove_first (list);
         NULL != entry;
         entry = opal_list_remove_first (list)) {
        OBJ_RELEASE(entry);
    }
}
/* Object constructor: zero everything after the base-class header and set
 * up the list of ports. */
static void mca_oob_ud_device_construct (mca_oob_ud_device_t *device)
{
    char *derived_part = (char *) device + sizeof (device->super);
    memset (derived_part, 0, sizeof (*device) - sizeof (device->super));
    OBJ_CONSTRUCT(&device->ports, opal_list_t);
}
/* Object destructor: release ports first (they hold QPs/MRs on this device),
 * then the verbs resources in reverse order of acquisition: protection
 * domain, completion channel, device context. */
static void mca_oob_ud_device_destruct (mca_oob_ud_device_t *device)
{
opal_list_item_t *item;
while (NULL != (item = opal_list_remove_first (&device->ports))) {
OBJ_RELEASE(item);
}
if (device->ib_pd) {
(void) ibv_dealloc_pd (device->ib_pd);
}
if (device->ib_channel) {
(void) ibv_destroy_comp_channel (device->ib_channel);
}
if (device->ib_context) {
(void) ibv_close_device (device->ib_context);
}
OBJ_DESTRUCT(&device->ports);
/* scrub the structure so stale handles are never reused */
memset (device, 0, sizeof (mca_oob_ud_device_t));
}
/* Class instance for mca_oob_ud_device_t (a listable object). */
OBJ_CLASS_INSTANCE(mca_oob_ud_device_t, opal_list_item_t,
mca_oob_ud_device_construct,
mca_oob_ud_device_destruct);
/* Trivial classes: no constructor/destructor required. */
OBJ_CLASS_INSTANCE(mca_oob_ud_msg_op_t,
opal_object_t,
NULL, NULL);
OBJ_CLASS_INSTANCE(mca_oob_ud_ping_t,
opal_object_t,
NULL, NULL);

Просмотреть файл

@ -1,68 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#if !defined(MCA_OOB_UD_COMPONENT_H)
#define MCA_OOB_UD_COMPONENT_H
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#include "opal/class/opal_bitmap.h"
#include "opal/class/opal_list.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/mca/oob/oob.h"
#include "orte/mca/routed/routed.h"
#include "oob_ud.h"
#include "oob_ud_send.h"
#include "oob_ud_ping.h"
/**
 * OOB UD Component
 */
typedef struct {
mca_oob_base_component_t super; /**< base OOB component */
opal_list_t ud_devices; /**< opened verbs devices (mca_oob_ud_device_t) */
opal_list_t ud_active_recvs; /**< receive requests in progress */
opal_list_t ud_active_sends; /**< send requests in progress */
opal_list_t ud_event_queued_reqs; /**< requests queued for completion dispatch */
opal_list_t ud_event_processing_msgs; /**< received messages being ordered/matched */
opal_event_t ud_complete_event; /**< one-shot timer driving completion dispatch */
opal_mutex_t ud_lock; /**< protects component-wide state */
int ud_min_qp; /**< lower bound for the per-port data-QP free list */
int ud_max_qp; /**< upper bound for the per-port data-QP free list */
int ud_recv_buffer_count;
int ud_send_buffer_count;
opal_mutex_t ud_match_lock; /**< protects the matching/queued-request lists */
int ud_max_retries; /**< max number of retries before declaring peer gone */
int ud_timeout_usec; /**< timeout in microsecond between peer retries */
int ud_qp_max_send_sge;
int ud_qp_max_recv_sge;
int ud_qp_max_send_wr;
int ud_qp_max_recv_wr;
int ud_qp_max_inline_data;
} mca_oob_ud_component_t;
ORTE_MODULE_DECLSPEC extern mca_oob_ud_component_t mca_oob_ud_component;
#endif //MCA_OOB_UD_COMPONENT_H

Просмотреть файл

@ -1,606 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights
* reserved.
* 2014 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "oob_ud_component.h"
#define min(a,b) ((a) < (b) ? (a) : (b))
static int mca_oob_ud_event_send_ack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg);
static int mca_oob_ud_event_send_nack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg);
static int mca_oob_ud_event_handle_ack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer,
mca_oob_ud_msg_hdr_t *msg_hdr);
static int mca_oob_ud_event_handle_nack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer,
mca_oob_ud_msg_hdr_t *msg_hdr);
static int mca_oob_ud_event_handle_completion (mca_oob_ud_port_t *port, mca_oob_ud_msg_hdr_t *msg);
static int mca_oob_ud_event_handle_data_ok (mca_oob_ud_port_t *port, mca_oob_ud_msg_hdr_t *msg);
static int mca_oob_ud_event_handle_req (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg_hdr);
static int mca_oob_ud_event_handle_rep (mca_oob_ud_port_t *port, mca_oob_ud_msg_hdr_t *msg);
static int mca_oob_ud_event_handle_end (mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg_hdr);
static void *mca_oob_ud_event_dispatch(int fd, int flags, void *context);
static void *mca_oob_ud_complete_dispatch(int fd, int flags, void *context);
static void mca_oob_ud_stop_events(mca_oob_ud_device_t *device);
/* Like opal_list_get_first() but yields NULL for an empty list. */
static inline opal_list_item_t *mca_oob_ud_list_get_first (opal_list_t *list)
{
    if (0 == opal_list_get_size (list)) {
        return NULL;
    }
    return opal_list_get_first (list);
}
/* Like opal_list_get_next() but yields NULL once the end sentinel is hit. */
static inline opal_list_item_t *mca_oob_ud_list_get_next (opal_list_t *list, opal_list_item_t *item)
{
    opal_list_item_t *successor = opal_list_get_next (item);
    if (opal_list_get_end (list) == successor) {
        return NULL;
    }
    return successor;
}
static bool event_started = false;
static bool event_completed_set = false;
/* Begin watching the device's completion-channel fd for CQ events.
 * NOTE(review): event_started is a file-global flag, so only the first
 * device passed here is ever monitored -- consistent with the "support only
 * 1 device" comment in the component startup path, but verify if multi-device
 * support is ever added. */
void mca_oob_ud_event_start_monitor (mca_oob_ud_device_t *device)
{
if (!event_started) {
opal_event_set (orte_event_base, &device->event, device->ib_channel->fd,
OPAL_EV_READ, mca_oob_ud_event_dispatch, (void *) device);
opal_event_add (&device->event, NULL);
event_started = true;
}
}
/* Stop watching the device's completion channel and flush its ports'
 * outstanding receives (see mca_oob_ud_stop_events). */
void mca_oob_ud_event_stop_monitor (mca_oob_ud_device_t *device)
{
if (event_started) {
opal_event_del (&device->event);
mca_oob_ud_stop_events (device);
event_started = false;
}
}
/* Bookkeeping for one received message while it is being ordered and
 * dispatched by mca_oob_ud_process_messages(). */
struct mca_oob_ud_msg_item_t {
opal_list_item_t super;
mca_oob_ud_msg_hdr_t *hdr; /* points into the port's registered recv buffer */
mca_oob_ud_port_t *port; /* port the message arrived on */
mca_oob_ud_peer_t *peer; /* sending peer */
int msg_num; /* index of the receive buffer holding the message */
};
typedef struct mca_oob_ud_msg_item_t mca_oob_ud_msg_item_t;
OBJ_CLASS_DECLARATION(mca_oob_ud_msg_item_t);
/* Constructor: zero all fields following the base list item. */
static void mca_oob_ud_msg_item_construct (mca_oob_ud_msg_item_t *item)
{
memset ((char *) item + sizeof (item->super), 0, sizeof (*item) - sizeof (item->super));
}
/* Destructor: releasing an item returns its receive buffer to the QP. */
static void mca_oob_ud_msg_item_destruct (mca_oob_ud_msg_item_t *item)
{
if (item->hdr) {
/* repost the receive request */
mca_oob_ud_port_post_one_recv (item->port, item->msg_num);
}
}
OBJ_CLASS_INSTANCE(mca_oob_ud_msg_item_t, opal_list_item_t,
mca_oob_ud_msg_item_construct,
mca_oob_ud_msg_item_destruct);
/* Sort comparator for opal_list_sort(): order message items by peer pointer
 * first, then by message id within the same peer.  Never reports equality. */
static int mca_oob_ud_msg_item_cmp (opal_list_item_t **a, opal_list_item_t **b)
{
    const mca_oob_ud_msg_item_t *lhs = *((mca_oob_ud_msg_item_t **) a);
    const mca_oob_ud_msg_item_t *rhs = *((mca_oob_ud_msg_item_t **) b);

    if (lhs->peer != rhs->peer) {
        return (lhs->peer > rhs->peer) ? 1 : -1;
    }
    return (lhs->hdr->msg_id > rhs->hdr->msg_id) ? 1 : -1;
}
/* mca_oob_ud_process_messages:
 *
 * Drain up to 40 work completions from the CQ, turn valid receives into
 * msg items, sort them by (peer, msg id), ACK/NACK each peer, drop
 * out-of-order messages, then dispatch the in-order remainder by type.
 * Returns the number of completions polled, or a negative verbs error. */
static int mca_oob_ud_process_messages (struct ibv_cq *event_cq, mca_oob_ud_port_t *port)
{
mca_oob_ud_msg_item_t *msg_item, *next_item;
opal_list_t *processing_msgs = &mca_oob_ud_component.ud_event_processing_msgs;
mca_oob_ud_peer_t *peer;
mca_oob_ud_msg_hdr_t *msg_hdr;
int msg_num, i, count;
struct ibv_wc wc[40];
bool peer_nacked;
count = ibv_poll_cq (event_cq, 40, wc);
if (count < 0)
return count;
/* acknowlege the events */
/* NOTE(review): ibv_ack_cq_events() acknowledges completion-channel events
 * obtained via ibv_get_cq_event(), not polled work completions; passing the
 * poll count here looks suspect.  Verify against the ibv_get_cq_event()
 * call in mca_oob_ud_event_dispatch() before changing. */
ibv_ack_cq_events (event_cq, count);
for (i = 0 ; i < count ; ++i) {
/* low bits of wr_id carry the receive-buffer index for this port */
msg_num = (int)(wc[i].wr_id & (~MCA_OOB_UD_RECV_WR));
msg_hdr = (mca_oob_ud_msg_hdr_t *) (port->msg_buf.ptr + msg_num * port->mtu);
VALGRIND_MAKE_MEM_DEFINED(msg_hdr, wc[i].byte_len);
if (!(wc[i].wr_id & MCA_OOB_UD_RECV_WR) || IBV_WC_SUCCESS != wc[i].status) {
/* not a receive completion, or a failed one: just repost the buffer */
mca_oob_ud_port_post_one_recv (port, msg_num);
continue;
}
peer = mca_oob_ud_get_peer (port, &msg_hdr->ra.name, wc[i].src_qp, msg_hdr->ra.qkey,
wc[i].slid, msg_hdr->ra.port_num);
if (peer) {
if (MCA_OOB_UD_MSG_ACK != msg_hdr->msg_type && MCA_OOB_UD_MSG_NACK != msg_hdr->msg_type &&
MCA_OOB_UD_MSG_END != msg_hdr->msg_type) {
/* data-bearing message: defer it for ordered processing below.
 * NOTE(review): OBJ_NEW result is used unchecked here. */
mca_oob_ud_msg_item_t *msg_item = OBJ_NEW(mca_oob_ud_msg_item_t);
msg_item->msg_num = msg_num;
msg_item->hdr = msg_hdr;
msg_item->port = port;
msg_item->peer = peer;
opal_list_append (processing_msgs, (opal_list_item_t *) msg_item);
} else {
/* control messages are handled immediately and the buffer reposted */
if (MCA_OOB_UD_MSG_ACK == msg_hdr->msg_type) {
(void) mca_oob_ud_event_handle_ack (port, peer, msg_hdr);
} else if (MCA_OOB_UD_MSG_NACK == msg_hdr->msg_type) {
(void) mca_oob_ud_event_handle_nack (port, peer, msg_hdr);
} else {
mca_oob_ud_event_handle_end (peer, msg_hdr);
}
mca_oob_ud_port_post_one_recv (port, msg_num);
}
} else {
opal_output_verbose(10, orte_oob_base_framework.framework_output,
"%s oob:ud:process_message got a null peer for message id %"
PRIu64, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr->msg_id);
mca_oob_ud_port_post_one_recv (port, msg_num);
}
}
/* Sort messages by peer then id */
opal_list_sort (processing_msgs, mca_oob_ud_msg_item_cmp);
/* Send ACKs/NACKs and throw away out-of-order messages */
msg_item = (mca_oob_ud_msg_item_t *) mca_oob_ud_list_get_first (processing_msgs);
for (peer = NULL, peer_nacked = false ; NULL != msg_item ; msg_item = next_item) {
if (peer != msg_item->peer) {
/* first message from a new peer in this sorted run */
peer_nacked = false;
}
peer = msg_item->peer;
next_item = (mca_oob_ud_msg_item_t *) mca_oob_ud_list_get_next (processing_msgs,
(opal_list_item_t *)msg_item);
if (false == peer_nacked) {
if (msg_item->hdr->msg_id > peer->peer_expected_id) {
/* gap detected: nack once per peer */
(void) mca_oob_ud_event_send_nack (msg_item->port, peer, msg_item->hdr);
peer_nacked = true;
} else if (NULL == next_item || (next_item->peer != msg_item->peer)) {
/* last in-order message from this peer: ack it */
(void) mca_oob_ud_event_send_ack (msg_item->port, msg_item->peer, msg_item->hdr);
}
}
if (msg_item->hdr->msg_id != peer->peer_expected_id) {
/* duplicate or out-of-order: drop (release reposts the buffer) */
opal_list_remove_item (processing_msgs, (opal_list_item_t *) msg_item);
OBJ_RELEASE(msg_item);
} else {
peer->peer_expected_id++;
}
}
/* Process remaining messages */
while (NULL !=
(msg_item = (mca_oob_ud_msg_item_t *) opal_list_remove_first (processing_msgs))) {
switch (msg_item->hdr->msg_type) {
case MCA_OOB_UD_MSG_REQUEST:
mca_oob_ud_event_handle_req (port, msg_item->peer, msg_item->hdr);
break;
case MCA_OOB_UD_MSG_REPLY:
mca_oob_ud_event_handle_rep (port, msg_item->hdr);
break;
case MCA_OOB_UD_MSG_COMPLETE:
mca_oob_ud_event_handle_completion (port, msg_item->hdr);
break;
case MCA_OOB_UD_MSG_DATA_OK:
mca_oob_ud_event_handle_data_ok (port, msg_item->hdr);
break;
case MCA_OOB_UD_MSG_END:
/* NOTE(review): uses the loop-carried 'peer' from the ACK pass above,
 * not msg_item->peer -- confirm they are always the same here */
mca_oob_ud_event_handle_end (peer, msg_item->hdr);
break;
default:
/* do nothing */
break;
}
OBJ_RELEASE(msg_item);
}
return count;
}
/* mca_oob_ud_event_handle_ack:
 *
 * An ACK from a peer covers every in-flight message with id <= the acked
 * id: pop those off the peer's flying list and mark them complete, then
 * restart the retransmit timer.
 */
static int mca_oob_ud_event_handle_ack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer,
                                        mca_oob_ud_msg_hdr_t *msg_hdr)
{
    mca_oob_ud_msg_t *msg;

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                        "%s oob:ud:event_handle_ack got ack for msg id %" PRIu64
                        " from peer %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr->msg_id,
                        ORTE_NAME_PRINT(&peer->peer_name));

    OPAL_THREAD_LOCK(&peer->peer_lock);
    mca_oob_ud_peer_stop_timer (peer);

    /* CLEANUP: removed a dead initial mca_oob_ud_list_get_first() call whose
       result was immediately overwritten by the loop condition below */
    while (NULL != (msg = (mca_oob_ud_msg_t *) mca_oob_ud_list_get_first (&peer->peer_flying_messages))) {
        if (msg->hdr->msg_id > msg_hdr->msg_id) {
            break;  /* not covered by this ack */
        }
        msg = (mca_oob_ud_msg_t *)opal_list_remove_first (&peer->peer_flying_messages);
        (void) mca_oob_ud_msg_status_update (msg, MCA_OOB_UD_MSG_STATUS_COMPLETE);
    }

    mca_oob_ud_peer_start_timer (peer);
    OPAL_THREAD_UNLOCK(&peer->peer_lock);
    return ORTE_SUCCESS;
}
/* mca_oob_ud_event_handle_nack:
 *
 * A NACK carries the peer's next expected id: everything before it was
 * received (mark complete), everything from it onward must be reposted. */
static int mca_oob_ud_event_handle_nack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer,
mca_oob_ud_msg_hdr_t *msg_hdr)
{
mca_oob_ud_msg_t *msg;
opal_output_verbose(10, orte_oob_base_framework.framework_output,
"%s oob:ud:event_handle_nack got nack for msg id %" PRIu64
" from peer %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr->msg_id,
ORTE_NAME_PRINT(&peer->peer_name));
OPAL_THREAD_LOCK(&peer->peer_lock);
mca_oob_ud_peer_stop_timer (peer);
/* complete everything the peer implicitly acknowledged (id < nacked id) */
while (NULL !=
(msg = (mca_oob_ud_msg_t *) mca_oob_ud_list_get_first (&peer->peer_flying_messages))) {
if (msg->hdr->msg_id >= msg_hdr->msg_id) {
break;
}
(void) opal_list_remove_first (&peer->peer_flying_messages);
(void) mca_oob_ud_msg_status_update (msg, MCA_OOB_UD_MSG_STATUS_COMPLETE);
}
/* repost remaining messages */
mca_oob_ud_peer_post_all (peer);
/* reset and start the timer */
mca_oob_ud_peer_reset_timer (peer);
mca_oob_ud_peer_start_timer (peer);
OPAL_THREAD_UNLOCK(&peer->peer_lock);
return ORTE_SUCCESS;
}
/* The peer sent an END message: it is shutting down cleanly, so mark it
 * lost.  The header carries no further payload for this message type. */
static int mca_oob_ud_event_handle_end (mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg_hdr)
{
opal_output_verbose(10, orte_oob_base_framework.framework_output,
"%s oob:ud:event_handle_end got end message from peer %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer->peer_name));
mca_oob_ud_peer_lost (peer);
return ORTE_SUCCESS;
}
/* mca_oob_ud_event_send_ack:
 *
 * Send an ACK for msg_hdr back to its sender.  The already-registered
 * receive buffer is mutated in place for the send and restored afterwards,
 * so the save/mutate/post/restore order below must not change. */
static int mca_oob_ud_event_send_ack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg_hdr)
{
mca_oob_ud_msg_hdr_t tmp_hdr;
int rc = ORTE_SUCCESS;
struct ibv_send_wr wr;
struct ibv_sge sge;
opal_output_verbose(10, orte_oob_base_framework.framework_output,
"%s oob:ud:event_send_ack sending ack for message id %"
PRIu64 " peer = %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr->msg_id,
ORTE_NAME_PRINT(&peer->peer_name));
/* reuse registered buffer to send ack (just need to change the type/return address) */
memcpy (&tmp_hdr, msg_hdr, sizeof (tmp_hdr));
msg_hdr->msg_type = MCA_OOB_UD_MSG_ACK;
/* set return address */
msg_hdr->ra.qkey = 0;
msg_hdr->ra.name = *ORTE_PROC_MY_NAME;
msg_hdr->ra.port_num = port->port_num;
mca_oob_ud_fill_sge (&sge, msg_hdr, sizeof (*msg_hdr), port->msg_buf.mr->lkey);
mca_oob_ud_fill_send_wr (&wr, &sge, 1, peer);
/* NOTE(review): post appears synchronous with respect to buffer reuse here;
 * the header is restored immediately below -- confirm the QP copies/inlines
 * the data before returning */
rc = mca_oob_ud_qp_post_send (&port->listen_qp, &wr, 1);
if (ORTE_SUCCESS != rc) {
opal_output (0, "oob:ud:event_send_ack error posting ack!");
return rc;
}
/* restore the original header so the buffer can be reposted unchanged */
memcpy (msg_hdr, &tmp_hdr, sizeof (tmp_hdr));
return ORTE_SUCCESS;
}
/* mca_oob_ud_event_send_nack:
 *
 * Send a NACK carrying the peer's next expected message id.  As in
 * mca_oob_ud_event_send_ack(), the registered receive buffer is mutated in
 * place for the send and restored afterwards.
 */
static int mca_oob_ud_event_send_nack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg_hdr)
{
    mca_oob_ud_msg_hdr_t tmp_hdr;
    int rc = ORTE_SUCCESS;
    struct ibv_send_wr wr;
    struct ibv_sge sge;

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                        "%s oob:ud:event_send_nack sending nack for message id %"
                        PRIu64 " peer = %s. msg_id = %" PRIu64, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        peer->peer_expected_id, ORTE_NAME_PRINT(&peer->peer_name), msg_hdr->msg_id);

    /* reuse registered buffer to send the nack (just need to change the type/return address) */
    memcpy (&tmp_hdr, msg_hdr, sizeof (tmp_hdr));
    msg_hdr->msg_type = MCA_OOB_UD_MSG_NACK;

    /* set return address */
    msg_hdr->ra.qkey = 0;
    msg_hdr->ra.name = *ORTE_PROC_MY_NAME;
    msg_hdr->ra.port_num = port->port_num;
    /* tell the peer which id we expect next */
    msg_hdr->msg_id = peer->peer_expected_id;

    mca_oob_ud_fill_sge (&sge, msg_hdr, sizeof (*msg_hdr), port->msg_buf.mr->lkey);
    mca_oob_ud_fill_send_wr (&wr, &sge, 1, peer);

    rc = mca_oob_ud_qp_post_send (&port->listen_qp, &wr, 1);
    if (ORTE_SUCCESS != rc) {
        /* BUG FIX: this log line previously identified itself as
           event_send_ack, making failures impossible to attribute */
        opal_output (0, "oob:ud:event_send_nack error posting nack!");
        return rc;
    }

    /* restore the original header so the buffer can be reposted unchanged */
    memcpy (msg_hdr, &tmp_hdr, sizeof (tmp_hdr));
    return ORTE_SUCCESS;
}
/* Queue a request for completion processing and, if not already pending,
 * arm the zero-delay timer event that drives mca_oob_ud_complete_dispatch(). */
void mca_oob_ud_event_queue_completed (mca_oob_ud_req_t *req)
{
struct timeval now = {0, 0};
mca_oob_ud_req_append_to_list (req, &mca_oob_ud_component.ud_event_queued_reqs);
/* only (re)arm when the event was never set or is not currently pending */
if (!(event_completed_set) ||
!(opal_event_evtimer_pending (&mca_oob_ud_component.ud_complete_event, &now))) {
event_completed_set = true;
opal_event_evtimer_set (orte_event_base, &mca_oob_ud_component.ud_complete_event,
mca_oob_ud_complete_dispatch, NULL);
opal_event_add (&mca_oob_ud_component.ud_complete_event, &now);
}
}
/* A COMPLETE message refers (via msg_lcl_ctx) to one of our active receive
 * requests: mark it complete and queue it for dispatch.  Duplicates (request
 * no longer on the active list) are logged and ignored. */
static int mca_oob_ud_event_handle_completion (mca_oob_ud_port_t *port, mca_oob_ud_msg_hdr_t *msg_hdr)
{
mca_oob_ud_req_t *recv_req = msg_hdr->msg_lcl_ctx;
bool brc;
if (NULL == recv_req) {
opal_output(0, "%s oob:ud:event_handle_completion msg_hdr->msg_lcl_ctx is NULL",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
return ORTE_ERROR;
}
brc = mca_oob_ud_req_is_in_list (recv_req, &mca_oob_ud_component.ud_active_recvs);
if (false == brc) {
/* duplicate completion message? */
opal_output_verbose(0, orte_oob_base_framework.framework_output,
"%s oob:ud:event_handle_completion apparent duplicate completion. "
"request %p. req list = %p", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) recv_req,
(void *) recv_req->req_list);
return ORTE_SUCCESS;
}
recv_req->state = MCA_OOB_UD_REQ_COMPLETE;
mca_oob_ud_event_queue_completed (recv_req);
return ORTE_SUCCESS;
}
/* A DATA_OK message refers (via msg_lcl_ctx) to one of our active send
 * requests: mark it complete and queue it for dispatch.  Duplicates are
 * logged and ignored. */
static int mca_oob_ud_event_handle_data_ok (mca_oob_ud_port_t *port, mca_oob_ud_msg_hdr_t *msg_hdr)
{
    mca_oob_ud_req_t *send_req = msg_hdr->msg_lcl_ctx;

    if (NULL == send_req) {
        /* ack! */
        return ORTE_ERROR;
    }

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                        "%s oob:ud:event_handle_data_ok got data ok message for request %p",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) send_req);

    if (!mca_oob_ud_req_is_in_list (send_req, &mca_oob_ud_component.ud_active_sends)) {
        opal_output_verbose(0, orte_oob_base_framework.framework_output,
                            "%s oob:ud:event_handle_data_ok apparent duplicate data ok. "
                            "request %p. req list = %p", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) send_req,
                            (void *) send_req->req_list);
        /* duplicate data ok message? */
        return ORTE_SUCCESS;
    }

    send_req->state = MCA_OOB_UD_REQ_COMPLETE;
    mca_oob_ud_event_queue_completed (send_req);
    return ORTE_SUCCESS;
}
/* An incoming REQUEST: try to match it against a posted receive; on a match
 * queue the receive request for completion processing. */
static int mca_oob_ud_event_handle_req (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg_hdr)
{
    mca_oob_ud_req_t *matched_req = NULL;
    const int rc = mca_oob_ud_recv_match_send (port, peer, msg_hdr, &matched_req);

    if (ORTE_SUCCESS == rc) {
        mca_oob_ud_event_queue_completed (matched_req);
    }
    return rc;
}
/* mca_oob_ud_event_handle_rep:
 *
 * A REPLY carries the remote side's negotiated MTU, data length, context and
 * QP number for one of our active sends; record them and queue the send for
 * completion processing.
 */
static int mca_oob_ud_event_handle_rep (mca_oob_ud_port_t *port, mca_oob_ud_msg_hdr_t *msg_hdr)
{
    mca_oob_ud_req_t *send_req = (mca_oob_ud_req_t *) msg_hdr->msg_lcl_ctx;
    bool brc;

    /* BUG FIX: guard against a NULL local context before dereferencing,
       mirroring the check in mca_oob_ud_event_handle_data_ok() */
    if (NULL == send_req) {
        return ORTE_ERROR;
    }

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                        "%s oob:ud:event_handle_rep got reply for request %p",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) send_req);

    brc = mca_oob_ud_req_is_in_list (send_req, &mca_oob_ud_component.ud_active_sends);
    if (false == brc) {
        opal_output_verbose(0, orte_oob_base_framework.framework_output,
                            "%s oob:ud:event_handle_rep no send matches reply",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        /* duplicate reply message? */
        return ORTE_SUCCESS;
    }

    /* negotiate the smaller of the two MTUs and record the remote context */
    send_req->req_mtu = min(send_req->req_mtu, msg_hdr->msg_data.rep.mtu);
    send_req->req_rem_data_len = msg_hdr->msg_data.rep.data_len;
    send_req->req_rem_ctx = msg_hdr->msg_rem_ctx;
    send_req->req_rem_qpn = msg_hdr->msg_data.rep.qpn;

    mca_oob_ud_event_queue_completed (send_req);
    return ORTE_SUCCESS;
}
static void *mca_oob_ud_event_dispatch(int fd, int flags, void *context)
{
int rc;
mca_oob_ud_device_t *device = (mca_oob_ud_device_t *) context;
mca_oob_ud_port_t *port = NULL;
struct ibv_cq *event_cq = NULL;
void *event_context = NULL;
do {
rc = ibv_get_cq_event (device->ib_channel, &event_cq, &event_context);
} while (rc && errno == EINTR);
if (NULL == event_cq) {
/* re-arm the event */
opal_output (0, "%s oob:ud:event_dispatch re-arm the event",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
opal_event_add (&port->device->event, NULL);
return NULL;
}
port = (mca_oob_ud_port_t *) event_context;
rc = mca_oob_ud_process_messages (event_cq, port);
if (rc < 0) {
opal_output (0, "%s oob:ud:event_dispatch error processing messages",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
return NULL;
}
if (ibv_req_notify_cq(event_cq, 0)) {
opal_output (0, "%s oob:ud:event_dispatch error asking for cq notifications",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
/* re-arm the event */
opal_event_add (&port->device->event, NULL);
return NULL;
}
/* mca_oob_ud_complete_dispatch:
 *
 * Timer callback: drain the queue of requests flagged for processing.  The
 * match lock is held only while manipulating the queue and deliberately
 * dropped around each request's (potentially re-entrant) processing. */
static void *mca_oob_ud_complete_dispatch(int fd, int flags, void *context)
{
mca_oob_ud_req_t *req;
OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_match_lock);
while (NULL !=
(req = (mca_oob_ud_req_t *) opal_list_remove_first (&mca_oob_ud_component.ud_event_queued_reqs))) {
/* release the lock while processing; re-acquired before the next pop */
OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_match_lock);
opal_output_verbose(10, orte_oob_base_framework.framework_output,
"%s oob:ud:event_process processing request %p",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) req);
req->req_list = NULL;
switch (req->type) {
case MCA_OOB_UD_REQ_RECV:
/* completed receives are finished; others are (re)tried */
if (req->state == MCA_OOB_UD_REQ_COMPLETE) {
mca_oob_ud_recv_complete (req);
} else {
mca_oob_ud_req_append_to_list (req, &mca_oob_ud_component.ud_active_recvs);
mca_oob_ud_recv_try (req);
}
break;
case MCA_OOB_UD_REQ_SEND:
if (req->state == MCA_OOB_UD_REQ_COMPLETE) {
mca_oob_ud_send_complete (req, ORTE_SUCCESS);
} else {
mca_oob_ud_req_append_to_list (req, &mca_oob_ud_component.ud_active_sends);
mca_oob_ud_send_try (req);
}
break;
default:
break;
}
OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_match_lock);
}
return NULL;
}
/* Flush outstanding receives on every port of the device by resetting each
 * listen QP; called while tearing down event monitoring. */
static void mca_oob_ud_stop_events (mca_oob_ud_device_t *device)
{
    opal_list_item_t *port_item;

    opal_output_verbose(5, orte_oob_base_framework.framework_output,
                        "%s oob:ud:stop_events stopping event processing",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    for (port_item = opal_list_get_first (&device->ports) ;
         port_item != opal_list_get_end (&device->ports) ;
         port_item = opal_list_get_next (port_item)) {
        /* flush all receives */
        mca_oob_ud_qp_to_reset (&((mca_oob_ud_port_t *) port_item)->listen_qp);
    }

    opal_output_verbose(5, orte_oob_base_framework.framework_output,
                        "%s oob:ud:stop_events events stopped",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}

Просмотреть файл

@ -1,398 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* 2014 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "oob_ud_peer.h"
#include "oob_ud_component.h"
#include "opal/include/opal_stdint.h"
#include "orte/util/name_fns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/state/state.h"
#include "orte/mca/routed/routed.h"
static void mca_oob_ud_peer_construct (mca_oob_ud_peer_t *peer);
static void mca_oob_ud_peer_destruct (mca_oob_ud_peer_t *peer);
OBJ_CLASS_INSTANCE(mca_oob_ud_peer_t, opal_object_t,
mca_oob_ud_peer_construct,
mca_oob_ud_peer_destruct);
/*
 * Look up a cached peer by process name.
 *
 * @param[in]  name  process name to look up
 * @param[out] peer  cached peer, or NULL if not found
 *
 * @return ORTE_SUCCESS if found, ORTE_ERR_UNREACH otherwise
 */
int mca_oob_ud_peer_lookup (const orte_process_name_t *name, mca_oob_ud_peer_t **peer) {
    *peer = NULL;

    if (OPAL_SUCCESS != opal_proc_table_get_value (&mca_oob_ud_module.peers,
                                                   *name, (void **) peer)) {
        return ORTE_ERR_UNREACH;
    }

    return ORTE_SUCCESS;
}
/*
 * Parse a "ud://<qpn>.<lid>.<port>" contact URI into its components.
 * NOTE: all three fields must be present for the parse to succeed.
 */
static inline int mca_oob_ud_parse_uri (const char *uri, uint32_t *qp_num,
                                        uint16_t *lid, uint16_t *port_num)
{
    if (3 != sscanf (uri, "ud://%u.%hu.%hu", qp_num, lid, port_num)) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    return ORTE_SUCCESS;
}
/*
 * Update an existing peer from a contact URI.
 *
 * Parses the URI, refreshes the peer's addressing fields and (re)creates
 * the verbs address handle if the lid/port changed or none exists yet.
 *
 * @param[in] peer  peer object to update (owned by the caller)
 * @param[in] uri   "ud://<qpn>.<lid>.<port>" contact string
 *
 * @return ORTE_SUCCESS on success, ORTE_ERR_BAD_PARAM on a malformed URI,
 *         ORTE_ERROR if no device could create an address handle
 */
int mca_oob_ud_peer_update_with_uri (mca_oob_ud_peer_t *peer, const char *uri)
{
    opal_list_item_t *item;
    struct ibv_ah_attr ah_attr;
    mca_oob_ud_device_t *device;
    uint32_t qp_num;
    /* NTH: port is 16-bit here because C90 does not support hh in sscanf */
    uint16_t lid, port_num;
    int rc;

    rc = mca_oob_ud_parse_uri (uri, &qp_num, &lid, &port_num);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    /* address changed -- drop the stale address handle so a fresh one is
     * created below */
    if (peer->peer_lid != lid || peer->peer_port != port_num) {
        if (NULL != peer->peer_ah) {
            (void) ibv_destroy_ah (peer->peer_ah);
            peer->peer_ah = NULL;
        }
    }

    peer->peer_qpn  = qp_num;
    peer->peer_qkey = 0; /* NTH: todo -- add qkey support if needed */
    peer->peer_lid  = lid;
    peer->peer_port = port_num;

    if (NULL == peer->peer_ah) {
        memset (&ah_attr, 0, sizeof (ah_attr));
        ah_attr.dlid     = lid;
        ah_attr.port_num = port_num;

        /* try each local device until one can reach this lid/port */
        for (item = opal_list_get_first (&mca_oob_ud_component.ud_devices);
             item != opal_list_get_end (&mca_oob_ud_component.ud_devices);
             item = opal_list_get_next (item)) {
            device = (mca_oob_ud_device_t *)item;

            /* try to create an address handle using this device */
            peer->peer_ah = ibv_create_ah (device->ib_pd, &ah_attr);
            if (NULL != peer->peer_ah) {
                peer->peer_context = (void *) item;
                break;
            }
        }

        if (NULL == peer->peer_ah) {
            /* BUG FIX: previously this called free(peer). The peer is an
             * OBJ_NEW()-allocated opal object owned by the caller, and callers
             * such as mca_oob_ud_peer_from_uri OBJ_RELEASE it when this
             * function fails -- freeing it here corrupted the object and
             * caused a double free. Just report the failure. */
            return ORTE_ERROR;
        }
    }

    return ORTE_SUCCESS;
}
/*
 * Return the cached peer for a process name, or create and cache a new one
 * from the addressing information carried in a received message header.
 *
 * @param[in] port      local port the message arrived on
 * @param[in] name      process name of the remote peer
 * @param[in] qpn,qkey  remote queue pair number and qkey
 * @param[in] lid       remote lid
 * @param[in] port_num  remote HCA port number
 *
 * @return the peer, or NULL on allocation/address-handle failure
 */
mca_oob_ud_peer_t *mca_oob_ud_get_peer (struct mca_oob_ud_port_t *port,
                                        orte_process_name_t *name,
                                        uint32_t qpn, uint32_t qkey,
                                        uint16_t lid, uint8_t port_num)
{
    struct ibv_ah_attr ah_attr;
    mca_oob_ud_peer_t *peer;
    int rc;

    rc = mca_oob_ud_peer_lookup (name, &peer);
    if (ORTE_SUCCESS == rc) {
        opal_output_verbose(20, orte_oob_base_framework.framework_output,
                             "%s oob:ud:peer_from_msg_hdr using cached peer",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return peer;
    }

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                         "%s oob:ud:peer_from_msg_hdr creating peer from return address",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    peer = OBJ_NEW(mca_oob_ud_peer_t);
    if (NULL == peer) {
        return NULL;
    }

    peer->peer_qpn  = qpn;
    peer->peer_qkey = qkey;
    peer->peer_name = *name;
    peer->peer_lid  = lid;
    peer->peer_port = port_num;

    memset (&ah_attr, 0, sizeof (ah_attr));
    ah_attr.dlid     = peer->peer_lid;
    ah_attr.port_num = peer->peer_port;

    peer->peer_ah = ibv_create_ah (port->device->ib_pd, &ah_attr);
    if (NULL == peer->peer_ah) {
        /* BUG FIX: the peer was allocated with OBJ_NEW, so it must be
         * released with OBJ_RELEASE (runs the destructor and frees the
         * object), not free(). */
        OBJ_RELEASE(peer);
        return NULL;
    }

    peer->peer_context = port->device;

    /* cache the new peer for subsequent lookups */
    OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_lock);
    opal_proc_table_set_value(&mca_oob_ud_module.peers,
                              *name, (void *) peer);
    OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_lock);

    return peer;
}
/*
 * Allocate a new peer object and populate it from a contact URI.
 *
 * @return the new peer, or NULL on allocation or parse failure
 */
mca_oob_ud_peer_t *mca_oob_ud_peer_from_uri (const char *uri)
{
    mca_oob_ud_peer_t *new_peer = OBJ_NEW(mca_oob_ud_peer_t);

    if (NULL == new_peer) {
        return NULL;
    }

    if (ORTE_SUCCESS != mca_oob_ud_peer_update_with_uri (new_peer, uri)) {
        OBJ_RELEASE (new_peer);
        return NULL;
    }

    return new_peer;
}
/* opal object constructor: zero every field that follows the object header,
 * then initialize per-peer state. The memset MUST run before OBJ_CONSTRUCT
 * of the embedded list. */
static void mca_oob_ud_peer_construct (mca_oob_ud_peer_t *peer)
{
    /* zero everything after the opal_object_t base (super) */
    memset ((char *) peer + sizeof (peer->super), 0, sizeof (*peer) - sizeof (peer->super));
    OBJ_CONSTRUCT(&peer->peer_flying_messages, opal_list_t);
    /* first id sent is 1 (mca_oob_ud_peer_post_msg pre-increments peer_next_id) */
    peer->peer_expected_id = 1;
}
/*
 * Notify a peer that this process is shutting down by sending an
 * MCA_OOB_UD_MSG_END message. Best-effort: all failures simply skip the
 * notification.
 *
 * @param[in] peer  peer to notify (may be NULL)
 */
void mca_oob_ud_peer_handle_end (mca_oob_ud_peer_t *peer)
{
    mca_oob_ud_port_t *port = NULL;
    mca_oob_ud_msg_t *msg = NULL;
    int rc;

    /* BUG FIX: peer was dereferenced for the verbose output below BEFORE the
     * NULL check inside the loop, crashing on a NULL peer. Check first. */
    if (NULL == peer) {
        return;
    }

    opal_output_verbose(5, orte_oob_base_framework.framework_output,
                        "%s oob:ud:peer_handle_end telling peer %s i am going away",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&peer->peer_name));

    do {
        /* tell the peer that we are deleting them */
        if (NULL == peer->peer_context || false == peer->peer_available ||
            false == peer->needs_notification) {
            opal_output_verbose(5, orte_oob_base_framework.framework_output,
                                "%s oob:ud:peer_handle_end don't need to tell %s i am going away",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&peer->peer_name));
            break;
        }

        port = (mca_oob_ud_port_t *) opal_list_get_first (&((mca_oob_ud_device_t *)peer->peer_context)->ports);
        if (NULL == port) {
            opal_output_verbose(5, orte_oob_base_framework.framework_output,
                                "%s oob:ud:peer_handle_end can't tell %s i am going away (no port)",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&peer->peer_name));
            break;
        }

        rc = mca_oob_ud_msg_get (port, NULL, &port->listen_qp, peer, true, &msg);
        if (ORTE_SUCCESS != rc) {
            opal_output_verbose(5, orte_oob_base_framework.framework_output,
                                "%s oob:ud:peer_handle_end can't tell %s i am going away (no message buffer)",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&peer->peer_name));
            break;
        }

        /* shorten the retransmit window -- we are shutting down and will not
         * wait long for an ack */
        peer->peer_timer.tries = 2;
        peer->peer_timer.value.tv_usec = 500000;

        msg->hdr->msg_type = MCA_OOB_UD_MSG_END;

        rc = mca_oob_ud_qp_post_send (&port->listen_qp, &msg->wr, 1);
        if (ORTE_SUCCESS != rc) {
            opal_output_verbose(5, orte_oob_base_framework.framework_output,
                                "%s oob:ud:peer_handle_end can't tell %s i am going away (send failed)",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&peer->peer_name));
            break;
        }
    } while (0);

    if (NULL != msg) {
        mca_oob_ud_msg_return (msg);
    }
}
/*
 * Mark a peer as unreachable (at most once) and notify the errmgr framework.
 */
void mca_oob_ud_peer_lost (mca_oob_ud_peer_t *peer)
{
    OPAL_THREAD_LOCK(&peer->peer_lock);

    if (peer->peer_available) {
        peer->peer_available = false;

        opal_output_verbose(10, orte_oob_base_framework.framework_output,
                            "%s oob:ud:peer_lost lost connectivity to peer %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer->peer_name));

        /* inform the ERRMGR framework that we have lost a connection so
         * it can decide if this is important, what to do about it, etc.
         */
        ORTE_ACTIVATE_PROC_STATE(&peer->peer_name, ORTE_PROC_STATE_COMM_FAILED);
    }

    OPAL_THREAD_UNLOCK(&peer->peer_lock);
}
/* Drop one reference on the peer object; the class destructor tears down
 * the verbs address handle when the count reaches zero. */
void mca_oob_ud_peer_release (mca_oob_ud_peer_t *peer)
{
    OBJ_RELEASE(peer);
}
/* opal object destructor: destroy the verbs address handle if one was
 * created (errors from ibv_destroy_ah are deliberately ignored) */
static void mca_oob_ud_peer_destruct (mca_oob_ud_peer_t *peer)
{
    if (NULL != peer->peer_ah) {
        (void) ibv_destroy_ah (peer->peer_ah);
    }
}
/*
 * Retransmit-timer callback: either repost all in-flight messages and rearm
 * the timer, or -- when the retry budget is exhausted -- fail every in-flight
 * message with ORTE_ERR_TIMEOUT and declare the peer lost.
 *
 * @param[in] fd,event  unused libevent arguments
 * @param[in] ctx       the mca_oob_ud_peer_t this timer belongs to
 */
static void mca_oob_ud_peer_msg_timeout (int fd, short event, void *ctx)
{
    mca_oob_ud_peer_t *peer = (mca_oob_ud_peer_t *) ctx;
    mca_oob_ud_msg_t *msg;

    OPAL_THREAD_LOCK(&peer->peer_lock);

    if (false == peer->peer_timer.active) {
        /* BUG FIX: this early return previously left peer_lock held,
         * deadlocking the next caller. */
        OPAL_THREAD_UNLOCK(&peer->peer_lock);
        return;
    }

    peer->peer_timer.active = false;

    /* BUG FIX: the list head was previously read before taking the lock and
     * dereferenced unconditionally; on an empty list that dereferenced the
     * list sentinel. Read it under the lock and bail out if nothing is in
     * flight. */
    if (0 == opal_list_get_size (&peer->peer_flying_messages)) {
        OPAL_THREAD_UNLOCK(&peer->peer_lock);
        return;
    }
    msg = (mca_oob_ud_msg_t *) opal_list_get_first (&peer->peer_flying_messages);

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                        "%s oob:ud:peer_msg_timeout timeout sending to peer %s. first message = %" PRIu64 " which has length %d" ,
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&peer->peer_name), msg->hdr->msg_id, msg->wr.sg_list[0].length);

    if (peer->peer_timer.tries == 0) {
        /* out of retries: fail everything still in flight */
        opal_list_item_t *item;

        while (NULL != (item = opal_list_remove_first (&peer->peer_flying_messages))) {
            msg = (mca_oob_ud_msg_t *) item;

            mca_oob_ud_msg_status_update (msg, MCA_OOB_UD_MSG_STATUS_TIMEOUT);
            if (msg->req) {
                mca_oob_ud_req_complete (msg->req, ORTE_ERR_TIMEOUT);
            }
        }

        OPAL_THREAD_UNLOCK(&peer->peer_lock);
        /* peer_lost takes peer_lock itself, so call it unlocked */
        mca_oob_ud_peer_lost (peer);
        return;
    }

    peer->peer_timer.tries--;
    mca_oob_ud_peer_post_all (peer);
    mca_oob_ud_peer_start_timer (peer);

    OPAL_THREAD_UNLOCK(&peer->peer_lock);
}
/*
 * Assign the next sequence id to a message, post its send work request, and
 * track it on the peer's in-flight list until it is acked. Arms the
 * retransmit timer if it is not already running.
 */
int mca_oob_ud_peer_post_msg (mca_oob_ud_peer_t *peer, mca_oob_ud_msg_t *msg)
{
    int rc;

    msg->hdr->msg_id = ++peer->peer_next_id;

    if (ORTE_SUCCESS != (rc = mca_oob_ud_qp_post_send (msg->qp, &msg->wr, 1))) {
        return rc;
    }

    opal_list_append (&peer->peer_flying_messages, (opal_list_item_t *) msg);

    if (!peer->peer_timer.active) {
        mca_oob_ud_peer_reset_timer (peer);
        mca_oob_ud_peer_start_timer (peer);
    }

    return ORTE_SUCCESS;
}
/* Disarm the peer's retransmit timer if it is currently armed. */
void mca_oob_ud_peer_stop_timer (mca_oob_ud_peer_t *peer)
{
    if (!peer->peer_timer.active) {
        return;
    }

    peer->peer_timer.active = false;
    opal_event_evtimer_del (&peer->peer_timer.event);
}
/* Reload the peer's retry budget and timeout interval from the component
 * parameters (ud_max_retries / ud_timeout_usec). */
void mca_oob_ud_peer_reset_timer (mca_oob_ud_peer_t *peer)
{
    peer->peer_timer.tries = mca_oob_ud_component.ud_max_retries;
    /* split the microsecond timeout into the sec/usec fields of a timeval */
    peer->peer_timer.value.tv_sec  = mca_oob_ud_component.ud_timeout_usec / 1000000;
    peer->peer_timer.value.tv_usec = mca_oob_ud_component.ud_timeout_usec % 1000000;
}
/*
 * Arm the peer's retransmit timer, but only when it is not already armed and
 * at least one message is still awaiting an ack.
 */
void mca_oob_ud_peer_start_timer (mca_oob_ud_peer_t *peer)
{
    if (peer->peer_timer.active ||
        0 == opal_list_get_size (&peer->peer_flying_messages)) {
        return;
    }

    peer->peer_timer.active = true;
    opal_event_evtimer_set (orte_event_base, &peer->peer_timer.event,
                            mca_oob_ud_peer_msg_timeout, (void *) peer);
    opal_event_evtimer_add (&peer->peer_timer.event, &peer->peer_timer.value);
}
/*
 * Repost the send work request of every message still in flight to this
 * peer (used by the retransmit timer). Individual post failures are
 * deliberately ignored; the timer will fire again.
 */
void mca_oob_ud_peer_post_all (mca_oob_ud_peer_t *peer)
{
    opal_list_item_t *it;

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                        "%s oob:ud:peer_post_all reposting all messages for peer %p",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) peer);

    for (it = opal_list_get_first (&peer->peer_flying_messages) ;
         it != opal_list_get_end (&peer->peer_flying_messages) ;
         it = opal_list_get_next (it)) {
        mca_oob_ud_msg_t *flying = (mca_oob_ud_msg_t *) it;

        (void) mca_oob_ud_qp_post_send (flying->qp, &flying->wr, 1);
    }
}

Просмотреть файл

@ -1,97 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* 2014 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#if !defined(MCA_OOB_UD_PEER_H)
#define MCA_OOB_UD_PEER_H
#include "orte_config.h"
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#include "orte/types.h"
#include "opal/mca/base/base.h"
#include "opal/class/opal_free_list.h"
#include "opal/class/opal_hash_table.h"
#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"
#include "opal/threads/threads.h"
#include "opal/mca/timer/base/base.h"
#include "orte/mca/oob/oob.h"
#include "orte/mca/oob/base/base.h"
#include <infiniband/verbs.h>
struct mca_oob_ud_msg_hdr_t;
struct mca_oob_ud_port_t;
/* State kept for each remote process we exchange UD messages with. */
struct mca_oob_ud_peer_t {
    opal_object_t super;                /* opal object base (reference counted) */
    void *peer_context;                 /* owning mca_oob_ud_device_t, stored as a list item */
    struct ibv_ah *peer_ah;             /* verbs address handle used to reach this peer */
    uint32_t peer_qpn;                  /* peer's queue pair number */
    uint32_t peer_qkey;                 /* peer's qkey (always 0 -- qkey support is a TODO in oob_ud_peer.c) */
    uint64_t peer_next_id;              /* id of the last message posted (pre-incremented on send) */
    uint64_t peer_expected_id;          /* starts at 1; presumably the next id expected from the peer -- confirm in recv path */
    orte_process_name_t peer_name;      /* ORTE process name of the peer */
    uint16_t peer_lid;                  /* peer's InfiniBand lid */
    uint8_t peer_port;                  /* peer's HCA port number */
    bool peer_available;                /* cleared by mca_oob_ud_peer_lost on connectivity loss */
    bool needs_notification;            /* send MCA_OOB_UD_MSG_END to this peer at shutdown */
    opal_list_t peer_flying_messages;   /* messages posted but not yet acked */
    opal_mutex_t peer_lock;             /* protects peer_available, the flying list, and the timer */
    struct {
        int tries;                      /* retransmit attempts remaining */
        opal_event_t event;             /* libevent timer for retransmission */
        struct timeval value;           /* retransmit interval */
        bool active;                    /* true while the timer is armed */
    } peer_timer;                       /* retransmission timer state */
};
typedef struct mca_oob_ud_peer_t mca_oob_ud_peer_t;
OBJ_CLASS_DECLARATION(mca_oob_ud_peer_t);
int mca_oob_ud_peer_lookup (const orte_process_name_t *name, mca_oob_ud_peer_t **peer);
int mca_oob_ud_peer_update_with_uri (mca_oob_ud_peer_t *peer, const char *uri);
mca_oob_ud_peer_t *mca_oob_ud_peer_from_uri (const char *uri);
mca_oob_ud_peer_t *mca_oob_ud_get_peer (struct mca_oob_ud_port_t *port,
orte_process_name_t *name,
uint32_t qpn, uint32_t qkey,
uint16_t lid, uint8_t port_num);
void mca_oob_ud_peer_lost (mca_oob_ud_peer_t *peer);
void mca_oob_ud_peer_release (mca_oob_ud_peer_t *peer);
struct mca_oob_ud_msg_t;
int mca_oob_ud_peer_post_msg (mca_oob_ud_peer_t *peer, struct mca_oob_ud_msg_t *msg);
void mca_oob_ud_peer_start_timer (mca_oob_ud_peer_t *peer);
void mca_oob_ud_peer_stop_timer (mca_oob_ud_peer_t *peer);
void mca_oob_ud_peer_reset_timer (mca_oob_ud_peer_t *peer);
void mca_oob_ud_peer_post_all (mca_oob_ud_peer_t *peer);
void mca_oob_ud_peer_handle_end (mca_oob_ud_peer_t *peer);
#endif

Просмотреть файл

@ -1,70 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* 2014 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "opal/mca/event/event.h"
#include "opal/opal_socket_errno.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "oob_ud_ping.h"
/*
 * Event callback that pings a peer by sending an MCA_OOB_UD_MSG_PING and
 * waiting for the ack.
 *
 * @param[in] fd,args  unused libevent arguments
 * @param[in] cbdata   the mca_oob_ud_ping_t describing which peer to ping
 *
 * @return ORTE status of the ping exchange
 */
int mca_oob_ud_process_ping(int fd, short args, void *cbdata)
{
    mca_oob_ud_ping_t *op = (mca_oob_ud_ping_t*)cbdata;
    orte_process_name_t* name = &op->peer;
    mca_oob_ud_peer_t *peer;
    mca_oob_ud_port_t *port;
    mca_oob_ud_msg_t *msg = NULL;
    int rc;

    opal_output_verbose (2, orte_oob_base_framework.framework_output,
                         "%s oob:ud:ping attempting to ping %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(name));

    rc = mca_oob_ud_peer_lookup(name, &peer);
    if (rc != ORTE_SUCCESS) {
        return rc;
    }

    /* NTH: TODO -- get a random port? */
    port = (mca_oob_ud_port_t *) opal_list_get_first (&((mca_oob_ud_device_t *)peer->peer_context)->ports);

    do {
        rc = mca_oob_ud_msg_get (port, NULL, &port->listen_qp, peer, true, &msg);
        if (ORTE_SUCCESS != rc) {
            break;
        }

        msg->hdr->msg_type = MCA_OOB_UD_MSG_PING;

        rc = mca_oob_ud_msg_post_send (msg);
        if (ORTE_SUCCESS != rc) {
            /* BUG FIX: the post-send return code was previously discarded
             * (immediately overwritten by msg_wait), so a failed send was
             * silently waited on. Bail out instead. */
            break;
        }

        /* wait for ack */
        rc = mca_oob_ud_msg_wait (msg);

        opal_output_verbose (2, orte_oob_base_framework.framework_output,
                             "%s oob:ud:ping result to %s -> %s: %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(name), rc);
    } while (0);

    if (NULL != msg) {
        mca_oob_ud_msg_return(msg);
    }

    mca_oob_ud_peer_release (peer);

    return rc;
}

Просмотреть файл

@ -1,39 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* 2014 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#ifndef _MCA_OOB_UD_PING_H_
#define _MCA_OOB_UD_PING_H_
#include "oob_ud_component.h"
typedef struct {
opal_object_t super;
opal_event_t ev;
orte_process_name_t peer;
} mca_oob_ud_ping_t;
OBJ_CLASS_DECLARATION(mca_oob_ud_ping_t);
/* Schedule a one-shot ping to process name (p); cbfunc runs in the module's
 * event base with an mca_oob_ud_ping_t as cbdata.
 * BUG FIX: the do/while wrapper previously ended in `while(0);` -- the extra
 * semicolon makes `if (x) ORTE_ACTIVATE_UD_PING(p, cb); else ...` a syntax
 * error. A statement-like macro must end in `while(0)` with no semicolon. */
#define ORTE_ACTIVATE_UD_PING(p, cbfunc)                                \
    do {                                                                \
        mca_oob_ud_ping_t *pop;                                         \
        pop = OBJ_NEW(mca_oob_ud_ping_t);                               \
        pop->peer.jobid = (p)->jobid;                                   \
        pop->peer.vpid = (p)->vpid;                                     \
        opal_event_set(mca_oob_ud_module.ev_base, &pop->ev, -1,         \
                       OPAL_EV_WRITE, (cbfunc), pop);                   \
        opal_event_set_priority(&pop->ev, ORTE_MSG_PRI);                \
        opal_event_active(&pop->ev, OPAL_EV_WRITE, 1);                  \
    } while(0)
#endif /* _MCA_OOB_UD_PING_H_ */

Просмотреть файл

@ -1,321 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
* reserved.
* 2014 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "oob_ud_component.h"
#include "oob_ud_qp.h"
#include "oob_ud.h"
#include "orte/util/show_help.h"
static void mca_oob_ud_qp_constructor (mca_oob_ud_qp_t *qp);
static void mca_oob_ud_qp_destructor (mca_oob_ud_qp_t *qp);
OBJ_CLASS_INSTANCE(mca_oob_ud_qp_t, opal_free_list_item_t,
mca_oob_ud_qp_constructor,
mca_oob_ud_qp_destructor);
static inline int mca_oob_ud_qp_process_send_completions (mca_oob_ud_qp_t *qp,
int num_completions);
/* Drain every completion from a CQ that has no completion channel attached
 * (i.e. a CQ that is busy-polled rather than event driven).
 * BUG FIX: the macro previously ended with `} while (0); \` -- a stray
 * semicolon (breaks if/else usage) plus a dangling line continuation that
 * spliced the following source line into the macro definition. */
#define MCA_OOB_UD_CLEAR_CQ(cq)                         \
    do {                                                \
        if (NULL == (cq)->channel) {                    \
            struct ibv_wc wc;                           \
            while (ibv_poll_cq ((cq), 1, &wc));         \
        }                                               \
    } while (0)
/*
 * Create the completion queue(s) and UD queue pair for a port.
 *
 * @param[in] qp            qp wrapper to initialize
 * @param[in] port          port the QP is created on
 * @param[in] recv_channel  completion channel for the recv CQ (NULL = polled)
 * @param[in] send_channel  completion channel for the send CQ (NULL = polled)
 * @param[in] onecq         if true, share a single CQ for send and recv
 *
 * @return ORTE_SUCCESS, or ORTE_ERROR if CQ/QP creation fails
 *
 * NOTE(review): on a send-CQ creation failure the already-created recv CQ is
 * left in qp->ib_recv_cq; presumably the object destructor releases it --
 * confirm against mca_oob_ud_qp_destructor.
 */
int mca_oob_ud_qp_init (mca_oob_ud_qp_t *qp, struct mca_oob_ud_port_t *port,
                        struct ibv_comp_channel *recv_channel,
                        struct ibv_comp_channel *send_channel, bool onecq)
{
    struct ibv_qp_init_attr init_attr;
    /* cap the CQ depth at 16384 entries or the device maximum */
    int max_cqe = min(port->device->attr.max_cqe, 16384);
    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                        "%s oob:ud:qp_init creating UD QP on port %d",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), port->port_num);
    /* create a UD queue pair */
    memset(&init_attr, 0, sizeof(init_attr));
    init_attr.qp_type = IBV_QPT_UD;
    qp->ib_recv_cq = ibv_create_cq (port->device->ib_context, max_cqe,
                                    port, recv_channel, 0);
    if (NULL == qp->ib_recv_cq) {
        orte_show_help("help-oob-ud.txt", "create-cq-failed", true,
                       orte_process_info.nodename, max_cqe, strerror(errno));
        return ORTE_ERROR;
    }
    if (false == onecq) {
        /* separate send CQ requested */
        qp->ib_send_cq = ibv_create_cq (port->device->ib_context, max_cqe,
                                        port, send_channel, 0);
        if (NULL == qp->ib_send_cq) {
            orte_show_help("help-oob-ud.txt", "create-cq-failed", true,
                           orte_process_info.nodename, max_cqe, strerror(errno));
            return ORTE_ERROR;
        }
    } else {
        /* single CQ shared by send and receive */
        qp->ib_send_cq = qp->ib_recv_cq;
    }
    init_attr.send_cq = qp->ib_send_cq;
    init_attr.recv_cq = qp->ib_recv_cq;
    /* NOTE(review): capacities are clamped against the FIRST device on the
     * component list, not necessarily port->device -- confirm this is
     * intentional for multi-device systems */
    mca_oob_ud_device_t *device = (mca_oob_ud_device_t *) opal_list_get_first (&mca_oob_ud_component.ud_devices);
    opal_output_verbose(80, orte_oob_base_framework.framework_output,
                        "%s oob:ud:qp_init create queue pair for device: device->attr.max_sge = %d, device->attr.max_qp_wr = %d",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), device->attr.max_sge, device->attr.max_qp_wr);
    init_attr.cap.max_send_sge    = mca_oob_ud_component.ud_qp_max_send_sge;
    init_attr.cap.max_recv_sge    = mca_oob_ud_component.ud_qp_max_recv_sge; /* GRH, data */
    init_attr.cap.max_inline_data = mca_oob_ud_component.ud_qp_max_inline_data;
    init_attr.cap.max_recv_wr     = min(mca_oob_ud_component.ud_qp_max_recv_wr, device->attr.max_qp_wr);
    init_attr.cap.max_send_wr     = min(mca_oob_ud_component.ud_qp_max_send_wr, device->attr.max_qp_wr);
    qp->ib_qp = ibv_create_qp (port->device->ib_pd, &init_attr);
    if (NULL == qp->ib_qp) {
        orte_show_help("help-oob-ud.txt", "create-qp-failed", true,
                       orte_process_info.nodename, init_attr.cap.max_send_sge, init_attr.cap.max_recv_sge,
                       init_attr.cap.max_send_wr, init_attr.cap.max_recv_wr, init_attr.cap.max_inline_data,
                       strerror(errno));
        return ORTE_ERROR;
    }
    /* end: create the UD queue pair */
    qp->port = port;
    return ORTE_SUCCESS;
}
/*
 * Move a QP to the RESET state, flushing all outstanding work requests.
 * Transitions through ERR first so posted WRs complete with a flush status,
 * then drains any polled CQs before entering RESET.
 */
int mca_oob_ud_qp_to_reset (mca_oob_ud_qp_t *qp)
{
    struct ibv_qp_attr attr;

    /* move the QP into the ERR state */
    memset(&attr, 0, sizeof(attr));
    attr.qp_state = IBV_QPS_ERR;
    if (ibv_modify_qp(qp->ib_qp, &attr, IBV_QP_STATE)) {
        orte_show_help("help-oob-ud.txt", "modify-qp-failed", true,
                       orte_process_info.nodename, IBV_QP_STATE, strerror(errno));
        return ORTE_ERROR;
    }

    /* poll thread/event will clear failed work requests */
    MCA_OOB_UD_CLEAR_CQ(qp->ib_send_cq);
    MCA_OOB_UD_CLEAR_CQ(qp->ib_recv_cq);

    /* move the QP into the RESET state */
    memset(&attr, 0, sizeof(attr));
    attr.qp_state = IBV_QPS_RESET;
    if (ibv_modify_qp(qp->ib_qp, &attr, IBV_QP_STATE)) {
        orte_show_help("help-oob-ud.txt", "modify-qp-failed", true,
                       orte_process_info.nodename, IBV_QP_STATE, strerror(errno));
        return ORTE_ERROR;
    }

    return ORTE_SUCCESS;
}
/*
 * Bring a QP up through INIT -> RTR -> RTS so it can send and receive.
 *
 * @return ORTE_SUCCESS, or ORTE_ERROR if any state transition fails
 */
int mca_oob_ud_qp_to_rts (mca_oob_ud_qp_t *qp)
{
    struct mca_oob_ud_port_t *port = qp->port;
    int attr_mask;
    struct ibv_qp_attr attr;

    /* move the QP into the INIT state */
    memset(&attr, 0, sizeof(attr));
    attr.qp_state   = IBV_QPS_INIT;
    attr.pkey_index = 0; /* NTH: might need to modify the pkey index later */
    attr.port_num   = port->port_num;
    attr.qkey       = 0;

    attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY;

    if (0 != ibv_modify_qp(qp->ib_qp, &attr, attr_mask)) {
        orte_show_help("help-oob-ud.txt", "modify-qp-failed", true,
                       orte_process_info.nodename, attr_mask, strerror(errno));
        return ORTE_ERROR;
    }

    /* Move QP to RTR */
    attr.qp_state = IBV_QPS_RTR;
    /* BUG FIX: the RTR failure message previously reported the stale INIT
     * attr_mask even though only IBV_QP_STATE is passed here. Keep the mask
     * variable in sync with what is actually used. */
    attr_mask = IBV_QP_STATE;
    if (0 != ibv_modify_qp(qp->ib_qp, &attr, attr_mask)) {
        orte_show_help("help-oob-ud.txt", "modify-qp-failed", true,
                       orte_process_info.nodename, attr_mask, strerror(errno));
        return ORTE_ERROR;
    }

    /* Setup attributes */
    memset(&attr, 0, sizeof(attr));
    attr.qp_state = IBV_QPS_RTS;
    attr.sq_psn   = 0;
    attr_mask = IBV_QP_STATE | IBV_QP_SQ_PSN;
    if (0 != ibv_modify_qp(qp->ib_qp, &attr, attr_mask)) {
        orte_show_help("help-oob-ud.txt", "modify-qp-failed", true,
                       orte_process_info.nodename, attr_mask, strerror(errno));
        return ORTE_ERROR;
    }

    return ORTE_SUCCESS;
}
/*
 * Purge all work requests on a QP by cycling it down to RESET and back up
 * to RTS.
 */
int mca_oob_ud_qp_purge (mca_oob_ud_qp_t *qp)
{
    int rc = mca_oob_ud_qp_to_reset (qp);

    return (ORTE_SUCCESS == rc) ? mca_oob_ud_qp_to_rts (qp) : rc;
}
/* opal object constructor: zero every field that follows the free-list
 * item header (leaves ib_qp/ib_*_cq NULL so lazy init can detect them) */
static void mca_oob_ud_qp_constructor (mca_oob_ud_qp_t *qp)
{
    memset ((char *)qp + sizeof(qp->super), 0, sizeof (*qp) - sizeof (qp->super));
}
/* opal object destructor: tear down the QP and its completion queue(s).
 * The QP is flushed via RESET before destruction; when send and recv share
 * one CQ (onecq init) it is destroyed only once. */
static void mca_oob_ud_qp_destructor (mca_oob_ud_qp_t *qp)
{
    int rc;
    if (NULL != qp->ib_qp) {
        /* clear qp and move to reset */
        (void) mca_oob_ud_qp_to_reset (qp);
        /* destroy qp */
        rc = ibv_destroy_qp (qp->ib_qp);
        if (0 != rc) {
            orte_show_help("help-oob-ud.txt", "destroy-qp-failed", true,
                           orte_process_info.nodename, strerror(errno));
        }
    }
    if (NULL != qp->ib_send_cq) {
        (void) ibv_destroy_cq (qp->ib_send_cq);
    }
    /* avoid a double destroy when the CQs are shared */
    if (NULL != qp->ib_recv_cq && qp->ib_recv_cq != qp->ib_send_cq) {
        (void) ibv_destroy_cq (qp->ib_recv_cq);
    }
}
/*
 * Busy-poll the send CQ until the expected number of completions has been
 * reaped. Returns ORTE_ERROR if polling fails or any completion carries a
 * non-success status, ORTE_SUCCESS otherwise.
 */
static inline int mca_oob_ud_qp_process_send_completions (mca_oob_ud_qp_t *qp,
                                                          int num_completions)
{
    struct ibv_wc wc[1];
    int reaped, polled, i, rc;

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                        "%s oob:ud:qp_process_send_completions polling for %d completions",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        num_completions);

    rc = ORTE_SUCCESS;

    for (reaped = 0 ; reaped < num_completions ; reaped += polled) {
        polled = ibv_poll_cq (qp->ib_send_cq, 1, wc);
        if (polled < 0) {
            orte_show_help("help-oob-ud.txt", "poll-cq-failed", true,
                           orte_process_info.nodename, 1, strerror(errno));
            return ORTE_ERROR;
        }

        for (i = 0 ; i < polled ; ++i) {
            if (IBV_WC_SUCCESS != wc[i].status) {
                orte_show_help("help-oob-ud.txt", "poll-cq-failed-wc", true,
                               orte_process_info.nodename, 1, i, wc[i].status);
                rc = ORTE_ERROR;
            }
        }
    }

    return rc;
}
/*
 * Post a chain of send work requests and reap the expected number of
 * send completions.
 */
int mca_oob_ud_qp_post_send (mca_oob_ud_qp_t *qp, struct ibv_send_wr *wr,
                             int num_completions) {
    struct ibv_send_wr *bad_wr;

    if (0 != ibv_post_send (qp->ib_qp, wr, &bad_wr)) {
        orte_show_help("help-oob-ud.txt", "post-send-failed", true,
                       orte_process_info.nodename, strerror(errno));
        return ORTE_ERROR;
    }

    return mca_oob_ud_qp_process_send_completions (qp, num_completions);
}
/* Post a chain of receive work requests to the QP. */
int mca_oob_ud_qp_post_recv (mca_oob_ud_qp_t *qp, struct ibv_recv_wr *wr) {
    struct ibv_recv_wr *bad_wr;

    if (0 != ibv_post_recv (qp->ib_qp, wr, &bad_wr)) {
        orte_show_help("help-oob-ud.txt", "post-recv-failed", true,
                       orte_process_info.nodename, strerror(errno));
        return ORTE_ERROR;
    }

    return ORTE_SUCCESS;
}
/*
 * Acquire a data QP from the port's free list, lazily creating and bringing
 * it to RTS on first use.
 *
 * @param[in]  port    port whose data-QP pool to draw from
 * @param[out] qp_ptr  acquired QP on success
 *
 * @return ORTE_SUCCESS, ORTE_ERR_TEMP_OUT_OF_RESOURCE if the pool is empty,
 *         or the error from QP initialization
 */
int mca_oob_ud_qp_data_aquire (struct mca_oob_ud_port_t *port, mca_oob_ud_qp_t **qp_ptr) {
    int rc = ORTE_SUCCESS;
    opal_free_list_item_t *item;

    do {
        item = opal_free_list_get_st (&port->data_qps);
        if (NULL == item) {
            /* BUG FIX: rc was previously logged before it was assigned, so
             * the message always reported 0 (ORTE_SUCCESS). Assign first. */
            rc = ORTE_ERR_TEMP_OUT_OF_RESOURCE;
            opal_output_verbose(5, orte_oob_base_framework.framework_output,
                                "%s oob:ud:qp_data_aquire error allocating new data qp. error = %d",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc);
            break;
        }

        *qp_ptr = (mca_oob_ud_qp_t *) item;

        /* first use of this list item: create the QP and bring it to RTS */
        if (NULL == (*qp_ptr)->ib_qp) {
            rc = mca_oob_ud_qp_init (*qp_ptr, port, NULL, NULL, true);
            if (ORTE_SUCCESS != rc) {
                break;
            }

            rc = mca_oob_ud_qp_to_rts (*qp_ptr);
        }
    } while (0);

    return rc;
}
/*
 * Purge a data QP and return it to its port's free list for reuse.
 */
int mca_oob_ud_qp_data_release (mca_oob_ud_qp_t *qp) {
    int rc = mca_oob_ud_qp_purge (qp);

    if (ORTE_SUCCESS == rc) {
        opal_free_list_return_st (&qp->port->data_qps, &qp->super);
    }

    return rc;
}

Просмотреть файл

@ -1,73 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* 2014 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#if !defined(MCA_OOB_UD_QP_H)
#define MCA_OOB_UD_QP_H
#include "orte_config.h"
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#include "orte/types.h"
#include "opal/mca/base/base.h"
#include "opal/class/opal_free_list.h"
#include "opal/class/opal_hash_table.h"
#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"
#include "opal/threads/threads.h"
#include "opal/mca/timer/base/base.h"
#include "orte/mca/oob/oob.h"
#include "orte/mca/oob/base/base.h"
#include <infiniband/verbs.h>
/* role of a queue pair within the component */
enum mca_oob_ud_qp_type_t {
    MCA_OOB_UD_QP_DATA,     /* per-transfer data QP (pooled on the port) */
    MCA_OOB_UD_QP_LISTEN    /* the port's listen QP for control messages */
};
struct mca_oob_ud_port_t;
/* wrapper tying a verbs QP to its CQs and owning port; lives on a free list */
struct mca_oob_ud_qp_t {
    opal_free_list_item_t super;          /* free-list item base */
    enum mca_oob_ud_qp_type_t type;       /* data or listen QP */
    struct ibv_qp *ib_qp;                 /* verbs QP; NULL until lazily created */
    struct mca_oob_ud_port_t *port;       /* owning port (set by qp_init) */
    struct ibv_cq *ib_send_cq, *ib_recv_cq; /* may alias when created with onecq=true */
};
typedef struct mca_oob_ud_qp_t mca_oob_ud_qp_t;
OBJ_CLASS_DECLARATION(mca_oob_ud_qp_t);
int mca_oob_ud_qp_init (mca_oob_ud_qp_t *qp, struct mca_oob_ud_port_t *port,
struct ibv_comp_channel *recv_channel,
struct ibv_comp_channel *send_channel, bool onecq);
int mca_oob_ud_qp_to_reset (mca_oob_ud_qp_t *qp);
int mca_oob_ud_qp_to_rts (mca_oob_ud_qp_t *qp);
int mca_oob_ud_qp_purge (mca_oob_ud_qp_t *qp);
int mca_oob_ud_qp_post_send (mca_oob_ud_qp_t *qp, struct ibv_send_wr *wr, int num_completions);
int mca_oob_ud_qp_post_recv (mca_oob_ud_qp_t *qp, struct ibv_recv_wr *wr);
int mca_oob_ud_qp_data_aquire (struct mca_oob_ud_port_t *port, mca_oob_ud_qp_t **qp_ptr);
int mca_oob_ud_qp_data_release (mca_oob_ud_qp_t *qp);
#endif

Просмотреть файл

@ -1,539 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* 2014 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "orte_config.h"
#include "orte/types.h"
#include "opal/types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "math.h"
#include "oob_ud_component.h"
#define min(a,b) ((a) < (b) ? (a) : (b))
/* Find a posted receive request matching (name, tag) on the given list.
 * NOTE(review): despite the comment this block used to carry ("Caller MUST
 * hold the matching lock"), the function acquires ud_match_lock itself --
 * callers must NOT hold it or this will self-deadlock with a non-recursive
 * mutex. On success the request is detached from its current list via
 * mca_oob_ud_req_append_to_list(req, NULL) -- presumably that call with a
 * NULL list unlinks the request; confirm in oob_ud_req.c. */
static inline int mca_oob_ud_find_recv (opal_list_t *list, const orte_process_name_t name,
                                        const int tag, mca_oob_ud_req_t **req)
{
    opal_list_item_t *item;
    int rc = ORTE_ERR_NOT_FOUND;
    *req = NULL;
    OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_match_lock);
    /* linear scan: match on both originating process name and tag */
    for (item = opal_list_get_first (list) ; item != opal_list_get_end (list) ;
         item = opal_list_get_next (item)) {
        mca_oob_ud_req_t *recv_req = (mca_oob_ud_req_t *) item;
        opal_output_verbose(15, orte_oob_base_framework.framework_output,
                            "%s oob:ud:find_recv matching against "
                            "peer: %s, tag: %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&recv_req->req_origin), recv_req->req_tag);
        if (OPAL_EQUAL == opal_dss.compare (&name, &recv_req->req_origin, ORTE_NAME) &&
            tag == recv_req->req_tag) {
            *req = recv_req;
            rc = ORTE_SUCCESS;
            break;
        }
    }
    opal_output_verbose(15, orte_oob_base_framework.framework_output,
                        "%s oob:ud:find_recv %sfound",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_SUCCESS != rc ? "not " : "");
    OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_match_lock);
    if (ORTE_SUCCESS == rc) {
        mca_oob_ud_req_append_to_list (*req, NULL);
    }
    return rc;
}
/*
 * Create a receive request for an unexpected message from (name, tag).
 *
 * @param[in]  name        originating process name
 * @param[in]  tag         message tag
 * @param[out] reqp        newly allocated request
 * @param[in]  iovec_used  true to receive into an iovec, false into a buffer
 *
 * @return ORTE_SUCCESS, or ORTE_ERR_OUT_OF_RESOURCE on allocation failure
 */
int mca_oob_ud_get_recv_req (const orte_process_name_t name, const int tag,
                             mca_oob_ud_req_t **reqp, bool iovec_used) {
    mca_oob_ud_req_t *req;

    opal_output_verbose(15, orte_oob_base_framework.framework_output,
                         "%s oob:ud:get_recv_req create receive request against: %s, tag: %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&name), tag);

    *reqp = req = OBJ_NEW(mca_oob_ud_req_t);
    /* BUG FIX: the OBJ_NEW result was previously dereferenced without a NULL
     * check, crashing on allocation failure. */
    if (NULL == req) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    req->req_origin  = name;
    req->req_tag     = tag;
    req->req_channel = ORTE_RML_INVALID_CHANNEL_NUM;
    req->req_seq_num = 0;

    /* this receive was not expected */
    req->type = MCA_OOB_UD_REQ_RECV;

    /* let mca_oob_ud_recv_alloc alloc memory for the receive */
    if (iovec_used) {
        req->req_data.iov.uiov = calloc (1, sizeof (struct iovec));
        req->req_data_type = MCA_OOB_UD_REQ_IOV;
    } else {
        req->req_data_type = MCA_OOB_UD_REQ_BUF;
    }
    req->req_data.iov.count = 1;

    return ORTE_SUCCESS;
}
/* Convenience wrapper: search the active-receive list for a request
 * matching (name, tag). See mca_oob_ud_find_recv for locking behavior. */
static inline int mca_oob_ud_find_active_recv (const orte_process_name_t name, const int tag,
                                               mca_oob_ud_req_t **req) {
    opal_output_verbose(15, orte_oob_base_framework.framework_output,
                         "%s oob:ud:recv_match active receive request "
                         "against: %s, tag: %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&name), tag);
    return mca_oob_ud_find_recv (&mca_oob_ud_component.ud_active_recvs, name, tag, req);
}
/* libevent timer callback: retry the pending receive request passed as
 * the callback data (fd/event arguments are unused). */
static void mca_oob_ud_recv_try_to (int fd, short event, void *data)
{
    (void) mca_oob_ud_recv_try ((mca_oob_ud_req_t *) data);
}
int mca_oob_ud_recv_try (mca_oob_ud_req_t *recv_req)
{
int rc, data_len;
int wr_count, sge_count, wr_index, sge_index, iov_index;
unsigned int iov_left, iov_offset, packet_size;
const unsigned int mtu = recv_req->req_mtu;
struct timeval aquire_timeout = {0, 500000};
mca_oob_ud_msg_t *rep_msg = NULL;
opal_output_verbose(10, orte_oob_base_framework.framework_output,
"%s oob:ud:recv_try receiving from %s. recv_req = %p. rem ctx = %p",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&recv_req->req_peer->peer_name),
(void *)recv_req, (void *)recv_req->req_rem_ctx);
do {
if (NULL == recv_req->req_qp) {
rc = mca_oob_ud_qp_data_aquire (recv_req->req_port, &recv_req->req_qp);
if (ORTE_SUCCESS != rc) {
break;
}
}
(void) mca_oob_ud_qp_purge (recv_req->req_qp);
rc = mca_oob_ud_msg_get (recv_req->req_port, recv_req, &recv_req->req_port->listen_qp,
recv_req->req_peer, NULL, &rep_msg);
if (ORTE_SUCCESS != rc) {
break;
}
if (MCA_OOB_UD_REQ_IOV == recv_req->req_data_type) {
if (NULL == recv_req->req_data.iov.mr) {
/* allocate space for memory registers */
recv_req->req_data.iov.mr = (struct ibv_mr **) calloc (recv_req->req_data.iov.count, sizeof (struct ibv_mr *));
if (NULL == recv_req->req_data.iov.mr) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
break;
}
}
rc = mca_oob_ud_register_iov (recv_req->req_data.iov.uiov, recv_req->req_data.iov.count,
recv_req->req_data.iov.mr, recv_req->req_port->device->ib_pd,
mtu, &sge_count, &wr_count, &data_len);
if (ORTE_SUCCESS != rc) {
break;
}
} else {
data_len = recv_req->req_data.buf.size;
rc = mca_oob_ud_register_buf (recv_req->req_data.buf.p, recv_req->req_data.buf.size,
&recv_req->req_data.buf.mr, recv_req->req_port->device->ib_pd,
mtu, &sge_count, &wr_count);
if (ORTE_SUCCESS != rc) {
break;
}
}
data_len = min(data_len, recv_req->req_rem_data_len);
if (data_len < recv_req->req_rem_data_len) {
opal_output_verbose(5, orte_oob_base_framework.framework_output,
"%s oob:ud:recv_try receive buffers are not big. this is probably an error condition."
"data_len = %d, recv_req->req_rem_data_len = %d.",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), data_len, recv_req->req_rem_data_len);
rc = ORTE_ERR_BAD_PARAM;
break;
}
wr_count = (data_len + mtu - 1) / mtu;
sge_count += wr_count;
opal_output_verbose(5, orte_oob_base_framework.framework_output,
"%s oob:ud:recv_try receiving %d bytes in %d "
"work requests, %d sges", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), data_len,
wr_count, sge_count);
recv_req->req_packet_count = wr_count;
if (NULL == recv_req->req_wr.recv) {
/* allocate work requests */
recv_req->req_wr.recv = (struct ibv_recv_wr *) calloc (wr_count, sizeof (struct ibv_recv_wr));
if (NULL == recv_req->req_wr.recv) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
break;
}
}
if (NULL == recv_req->req_sge) {
/* allocate scatter-gather lists. we need more to hold the grh */
recv_req->req_sge = (struct ibv_sge *) calloc (sge_count, sizeof (struct ibv_sge));
if (NULL == recv_req->req_sge) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
break;
}
}
if (NULL == recv_req->req_grh) {
/* allocate grh buffers */
recv_req->req_grh = (struct ibv_grh *) calloc (wr_count, sizeof (struct ibv_grh));
if (NULL == recv_req->req_grh) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
break;
}
}
if (NULL == recv_req->req_grh_mr) {
/* register grh buffers */
recv_req->req_grh_mr = ibv_reg_mr (recv_req->req_port->device->ib_pd, recv_req->req_grh,
wr_count * sizeof (struct ibv_grh),
IBV_ACCESS_LOCAL_WRITE);
if (NULL == recv_req->req_grh_mr) {
orte_show_help("help-oob-ud.txt", "reg-mr-failed", true,
orte_process_info.nodename, recv_req->req_grh,
wr_count * sizeof (struct ibv_grh), strerror(errno));
/* could not register memory */
rc = ORTE_ERR_OUT_OF_RESOURCE;
break;
}
}
rc = ORTE_SUCCESS;
if (MCA_OOB_UD_REQ_IOV == recv_req->req_data_type) {
iov_left = recv_req->req_data.iov.uiov[0].iov_len;
iov_offset = 0;
iov_index = 0;
opal_output_verbose(5, orte_oob_base_framework.framework_output,
"%s oob:ud: recv_req->req_data.iov.uiov[0].iov_len = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)recv_req->req_data.iov.uiov[0].iov_len);
for (wr_index = 0, sge_index = 0 ; wr_index < wr_count ; ++wr_index) {
int sge_first = sge_index;
packet_size = 0;
/* grh */
mca_oob_ud_fill_sge(recv_req->req_sge + sge_index++,
recv_req->req_grh + wr_index,
sizeof (struct ibv_grh),
recv_req->req_grh_mr->lkey);
do {
int to_recv = min (iov_left, mtu - packet_size);
mca_oob_ud_fill_sge(recv_req->req_sge + sge_index++,
(char *)recv_req->req_data.iov.uiov[iov_index].iov_base + iov_offset,
to_recv, recv_req->req_data.iov.mr[iov_index]->lkey);
iov_offset += to_recv;
iov_left -= to_recv;
packet_size += to_recv;
if (0 == iov_left) {
iov_index++;
iov_offset = 0;
if (iov_index < recv_req->req_data.iov.count) {
iov_left = recv_req->req_data.iov.uiov[iov_index].iov_len;
}
}
} while ((packet_size < mtu) && (iov_left > 0));
mca_oob_ud_fill_recv_wr(recv_req->req_wr.recv + wr_index,
recv_req->req_sge + sge_first,
sge_index - sge_first);
if (wr_index + 1 < wr_count) {
recv_req->req_wr.recv[wr_index].next = recv_req->req_wr.recv + wr_index + 1;
}
}
} else {
unsigned int buffer_left = recv_req->req_data.buf.size;
unsigned int buffer_offset = 0;
opal_output_verbose(5, orte_oob_base_framework.framework_output,
"%s oob:ud:recv_try recv_req->req_data.buf.size = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_req->req_data.buf.size);
for (wr_index = 0, sge_index = 0 ; wr_index < wr_count ; ++wr_index) {
int sge_first = sge_index;
packet_size = 0;
/* grh */
mca_oob_ud_fill_sge(recv_req->req_sge + sge_index++,
recv_req->req_grh + wr_index,
sizeof (struct ibv_grh),
recv_req->req_grh_mr->lkey);
do {
int to_recv = min (buffer_left, mtu - packet_size);
mca_oob_ud_fill_sge(recv_req->req_sge + sge_index++,
(char *)recv_req->req_data.buf.p + buffer_offset,
to_recv, recv_req->req_data.buf.mr->lkey);
buffer_offset += to_recv;
buffer_left -= to_recv;
packet_size += to_recv;
} while ((packet_size < mtu) && (buffer_left > 0));
mca_oob_ud_fill_recv_wr(recv_req->req_wr.recv + wr_index,
recv_req->req_sge + sge_first,
sge_index - sge_first);
if (wr_index + 1 < wr_count) {
recv_req->req_wr.recv[wr_index].next = recv_req->req_wr.recv + wr_index + 1;
}
}
}
rc = mca_oob_ud_qp_post_recv (recv_req->req_qp, recv_req->req_wr.recv);
if (ORTE_SUCCESS != rc) {
break;
}
opal_output_verbose(5, orte_oob_base_framework.framework_output,
"%s oob:ud:recv_try posting reply message",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
/* ok, we have a data queue pair */
rep_msg->hdr->msg_type = MCA_OOB_UD_MSG_REPLY;
rep_msg->hdr->msg_lcl_ctx = recv_req->req_rem_ctx;
rep_msg->hdr->msg_rem_ctx = recv_req;
rep_msg->hdr->msg_data.rep.qpn = recv_req->req_qp->ib_qp->qp_num;
rep_msg->hdr->msg_data.rep.data_len = data_len;
rep_msg->hdr->msg_data.rep.mtu = mtu;
rc = mca_oob_ud_msg_post_send (rep_msg);
/* post send already returned the message */
rep_msg = NULL;
} while (0);
if (ORTE_ERR_TEMP_OUT_OF_RESOURCE == rc) {
mca_oob_ud_req_timer_set (recv_req, &aquire_timeout, 1, mca_oob_ud_recv_try_to);
rc = ORTE_SUCCESS;
}
if (ORTE_SUCCESS != rc) {
/* bad stuff happened */
mca_oob_ud_req_complete (recv_req, rc);
if (mca_oob_ud_req_is_in_list(recv_req, &mca_oob_ud_component.ud_active_recvs)) {
opal_list_remove_item (&mca_oob_ud_component.ud_active_recvs, (opal_list_item_t *) recv_req);
}
OBJ_RELEASE(recv_req);
return rc;
}
recv_req->state = MCA_OOB_UD_REQ_ACTIVE;
return rc;
}
/*
 * Finish a receive request once its data has (supposedly) arrived.
 *
 * For a non-eager receive: drain the data QP's receive completion queue,
 * checking that every expected packet completed successfully and in order
 * (the sender stamps each packet's imm_data with its sequence number).  If
 * anything is missing or out of order the request is re-armed and retried
 * via mca_oob_ud_recv_try().  On success a DATA_OK control message is sent
 * back to the peer before the payload is handed up.
 *
 * Eager receives skip all of the above — their payload arrived inline.
 *
 * Returns ORTE_SUCCESS, or the error from allocating/posting the DATA_OK
 * message.
 */
int mca_oob_ud_recv_complete (mca_oob_ud_req_t *recv_req)
{
    mca_oob_ud_msg_t *dataok;
    int i, j, rc = ORTE_SUCCESS;
    uint32_t expected;
    bool error = false, out_of_order = false;
#if defined(HAVE_VALGRIND)
    int iov_index;
#endif
    opal_output_verbose(5, orte_oob_base_framework.framework_output,
                        "%s oob:ud:recv_complete req = %p",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) recv_req);
    if (false == recv_req->req_is_eager) {
        /* poll the recv CQ in batches of 10 until we have accounted for
         * every posted packet or the CQ runs dry */
        for (i = 0, expected = 0 ; i < recv_req->req_packet_count ; ) {
            struct ibv_wc wc[10];
            rc = ibv_poll_cq (recv_req->req_qp->ib_recv_cq, 10, wc);
            for (j = 0 ; j < rc ; ++j) {
                /* imm_data carries the sender's packet sequence number */
                if (wc[j].imm_data != expected) {
                    out_of_order = true;
                }
                if (IBV_WC_SUCCESS != wc[j].status) {
                    error = true;
                }
                opal_output_verbose(5, orte_oob_base_framework.framework_output,
                                    "%s oob:ud:recv_complete wc status = %d. imm data = %u. len = %d",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), wc[j].status, wc[j].imm_data,
                                    wc[j].byte_len);
                expected++;
            }
            if (rc <= 0) {
                /* CQ empty (0) or poll error (<0): stop counting */
                break;
            }
            i += rc;
        }
        if (i != recv_req->req_packet_count || error || out_of_order) {
            /* retry: mark pending and repost the whole receive */
            recv_req->state = MCA_OOB_UD_REQ_PENDING;
            opal_output_verbose(5, orte_oob_base_framework.framework_output,
                                "%s oob:ud:recv_complete receive incomplete. error: %d, "
                                "out_of_order: %d packets: %d/%d. rc = %d, errno = %d.",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), error, out_of_order, i,
                                recv_req->req_packet_count, rc, errno);
            mca_oob_ud_recv_try (recv_req);
            return ORTE_SUCCESS;
        }
        opal_output_verbose(10, orte_oob_base_framework.framework_output,
                            "%s oob:ud:recv_complete data received ok!",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        /* send data ok and wait for ack */
        rc = mca_oob_ud_msg_get (recv_req->req_port, recv_req, &recv_req->req_port->listen_qp,
                                 recv_req->req_peer, false, &dataok);
        if (ORTE_SUCCESS != rc) {
            return rc;
        }
        dataok->hdr->msg_type = MCA_OOB_UD_MSG_DATA_OK;
        dataok->hdr->msg_lcl_ctx = recv_req->req_rem_ctx;
        rc = mca_oob_ud_msg_post_send (dataok);
        if (ORTE_SUCCESS != rc) {
            return rc;
        }
    }
#if defined(HAVE_VALGRIND)
    /* NOTE(review): req_count/req_uiov do not match the current
     * mca_oob_ud_req_t layout (req_data.iov.count/uiov) — this block looks
     * bitrotted; confirm before enabling valgrind builds */
    for (iov_index = 0 ; iov_index < recv_req->req_count ; ++iov_index) {
        VALGRIND_MAKE_MEM_DEFINED(recv_req->req_uiov[iov_index].iov_base,
                                  recv_req->req_uiov[iov_index].iov_len);
    }
#endif
    mca_oob_ud_req_complete (recv_req, rc);
    return ORTE_SUCCESS;
}
/*
 * Match an incoming send request (REQUEST message) against a local receive,
 * adopt the negotiated parameters from the header, and — when the payload
 * was sent eagerly inline — copy it straight into the user's buffers.
 *
 * On success *reqp holds the (possibly already COMPLETE) receive request;
 * on failure *reqp is NULL and the error code is returned.
 */
int mca_oob_ud_recv_match_send (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg_hdr,
                                mca_oob_ud_req_t **reqp)
{
    /* eager payload, if any, immediately follows the header in the packet */
    char *data = (msg_hdr->msg_data.req.data_follows ? (char *)(msg_hdr + 1) : NULL);
    mca_oob_ud_req_t *req;
    int rc, i;

    *reqp = NULL;

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                        "%s oob:ud:recv_incoming_send matching incoming "
                        "send from peer %s with tag %d (data_follows = %d, data = %p, iovec_use = %d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&msg_hdr->msg_origin), msg_hdr->msg_data.req.tag,
                        msg_hdr->msg_data.req.data_follows, (void *)data, msg_hdr->msg_data.req.data_iovec_used);

    rc = mca_oob_ud_get_recv_req (msg_hdr->msg_origin, msg_hdr->msg_data.req.tag, &req, msg_hdr->msg_data.req.data_iovec_used);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* adopt the parameters negotiated in the request header */
    req->req_rem_ctx      = msg_hdr->msg_rem_ctx;
    req->req_port         = port;
    req->req_mtu          = min(port->mtu, msg_hdr->msg_data.req.mtu);
    req->req_origin       = msg_hdr->msg_origin;
    req->req_target       = msg_hdr->msg_target;
    req->req_rem_data_len = msg_hdr->msg_data.req.data_len;
    req->req_channel      = msg_hdr->msg_channel;
    req->req_seq_num      = msg_hdr->msg_seq_num;

    do {
        rc = mca_oob_ud_recv_alloc (req);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            /* fix: only the iovec variant owns a uiov array to free here;
             * the buffer variant's pointer overlays it in the req_data
             * union (same guard mca_oob_ud_send_self applies) */
            if (MCA_OOB_UD_REQ_IOV == req->req_data_type) {
                free (req->req_data.iov.uiov);
            }
            OBJ_RELEASE(req);
            req = NULL;
            break;
        }

        req->req_peer = peer;
        OBJ_RETAIN(req->req_peer);

        if (NULL == data) {
            /* non-eager: the payload will arrive later over a data QP */
            req->state = MCA_OOB_UD_REQ_ACTIVE;
            opal_output_verbose(10, orte_oob_base_framework.framework_output,
                                "%s oob:ud:recv_incoming_send request still active",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            break;
        }

        opal_output_verbose(10, orte_oob_base_framework.framework_output,
                            "%s oob:ud:recv_incoming_send send was eager",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

        req->req_is_eager = true;

        if (msg_hdr->msg_data.req.data_iovec_used) {
            for (i = 0 ; i < req->req_data.iov.count; ++i) {
                memcpy (req->req_data.iov.uiov[i].iov_base, data, req->req_data.iov.uiov[i].iov_len);
                data += req->req_data.iov.uiov[i].iov_len;
            }
        } else {
            memcpy(req->req_data.buf.p, data, msg_hdr->msg_data.req.data_len);
        }

        req->state = MCA_OOB_UD_REQ_COMPLETE;
    } while (0);

    *reqp = req;
    return rc;
}

Просмотреть файл

@ -1,420 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
* reserved.
* 2014 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "oob_ud_component.h"
#include "oob_ud_req.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
/* constructor/destructor pairs for the request and message classes
 * (OBJ_CLASS_INSTANCE wires them into the OPAL object system) */
static void mca_oob_ud_req_constuct (mca_oob_ud_req_t *req);
static void mca_oob_ud_req_destruct (mca_oob_ud_req_t *req);
OBJ_CLASS_INSTANCE(mca_oob_ud_req_t, opal_list_item_t, mca_oob_ud_req_constuct,
                   mca_oob_ud_req_destruct);
static void mca_oob_ud_msg_destruct (mca_oob_ud_msg_t *msg);
static void mca_oob_ud_msg_construct (mca_oob_ud_msg_t *msg);
OBJ_CLASS_INSTANCE(mca_oob_ud_msg_t, opal_free_list_item_t,
                   mca_oob_ud_msg_construct,
                   mca_oob_ud_msg_destruct);
/* constructor: zero every request field that follows the list-item base */
static void mca_oob_ud_req_constuct (mca_oob_ud_req_t *req)
{
    char *body = (char *) req + sizeof (req->super);
    size_t body_size = sizeof (*req) - sizeof (req->super);

    memset (body, 0, body_size);
}
/* destructor: release the peer reference and all verbs resources that
 * the request still owns */
static void mca_oob_ud_req_destruct (mca_oob_ud_req_t *req)
{
    int i;   /* used by MCA_OOB_UD_REQ_DEREG_MR below */

    if (NULL != req->req_peer) {
        OBJ_RELEASE(req->req_peer);
    }

    /* free(NULL) is a no-op, so no guards needed on the plain buffers */
    free (req->req_wr.send);
    free (req->req_grh);
    free (req->req_sge);

    if (NULL != req->req_grh_mr) {
        (void) ibv_dereg_mr (req->req_grh_mr);
    }

    /* deregister any user-data memory regions */
    MCA_OOB_UD_REQ_DEREG_MR(req);
}
/* arm a one-shot retry timer on the ORTE event base; the request itself
 * is passed as the callback context.
 * NOTE(review): max_tries is accepted but not used here — confirm whether
 * retry counting happens in the callbacks. */
void mca_oob_ud_req_timer_set (mca_oob_ud_req_t *req, const struct timeval *timeout,
                               int max_tries, void (*cb)(evutil_socket_t, short, void *))
{
    opal_event_evtimer_set (orte_event_base, &req->timer.event, cb, (void *) req);
    req->timer.value = *timeout;
    opal_event_evtimer_add (&req->timer.event, &req->timer.value);
}
/*
 * Pull a control message off the port's free list and initialize it for a
 * send: bind it to req/peer/qp, zero its header, describe the header to the
 * HCA, and stamp our return address.  Retains @peer when non-NULL.
 *
 * Returns ORTE_SUCCESS with *msgp set, or ORTE_ERROR when no buffer is
 * available.
 */
int mca_oob_ud_msg_get (struct mca_oob_ud_port_t *port, mca_oob_ud_req_t *req,
                        mca_oob_ud_qp_t *qp, mca_oob_ud_peer_t *peer, bool persist,
                        mca_oob_ud_msg_t **msgp)
{
    opal_free_list_item_t *item = opal_free_list_wait_st (&port->free_msgs);
    mca_oob_ud_msg_t *msg;

    if (NULL == item) {
        opal_output_verbose(5, orte_oob_base_framework.framework_output,
                            "%s oob:ud:msg_get error getting message buffer",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return ORTE_ERROR;
    }

    msg = (mca_oob_ud_msg_t *) item;

    msg->persist = persist;
    msg->req     = req;
    msg->peer    = peer;
    msg->qp      = qp;

    if (NULL != peer) {
        OBJ_RETAIN(peer);
    }

    /* clean header, then describe it to the HCA */
    memset (msg->hdr, 0, sizeof (*msg->hdr));
    mca_oob_ud_fill_sge (&msg->sge, msg->hdr, port->mtu, msg->mr->lkey);
    mca_oob_ud_fill_send_wr (&msg->wr, &msg->sge, 1, peer);

    /* set return address */
    msg->hdr->ra.name     = *ORTE_PROC_MY_NAME;
    msg->hdr->ra.qkey     = 0;
    msg->hdr->ra.port_num = port->port_num;

    *msgp = msg;

    return ORTE_SUCCESS;
}
/* opal_free_list item initializer: point this message at its slot in the
 * port's registered message buffer.  Send slots start after the receive
 * buffers, hence the ud_recv_buffer_count offset. */
int mca_oob_ud_msg_init (opal_free_list_item_t *item, void *context) {
    mca_oob_ud_port_t *port = (mca_oob_ud_port_t *) context;
    mca_oob_ud_msg_t *msg   = (mca_oob_ud_msg_t *) item;
    int slot = port->send_buffer_index++ + mca_oob_ud_component.ud_recv_buffer_count;

    msg->port = port;
    msg->hdr  = (mca_oob_ud_msg_hdr_t *) (port->msg_buf.ptr + slot * port->mtu);
    msg->mr   = port->msg_buf.mr;

    return ORTE_SUCCESS;
}
/* recycle a message: drop the peer reference taken by mca_oob_ud_msg_get,
 * scrub its per-use state, and put it back on the owning port's free list */
void mca_oob_ud_msg_return (mca_oob_ud_msg_t *msg)
{
    opal_free_list_t *free_msgs = &msg->port->free_msgs;

    if (NULL != msg->peer) {
        mca_oob_ud_peer_release (msg->peer);
    }

    msg->peer   = NULL;
    msg->cbfunc = NULL;
    msg->qp     = NULL;
    msg->req    = NULL;

    opal_free_list_return_st (free_msgs, &msg->super);
}
/* constructor: zero everything past the free-list-item base, then build
 * the lock/condition pair used to wait on status changes */
static void mca_oob_ud_msg_construct (mca_oob_ud_msg_t *msg)
{
    char *body = (char *) msg + sizeof (msg->super);
    size_t body_size = sizeof (*msg) - sizeof (msg->super);

    memset (body, 0, body_size);

    OBJ_CONSTRUCT(&msg->status_changed, opal_condition_t);
    OBJ_CONSTRUCT(&msg->lock, opal_mutex_t);
}
/* destructor: tear down the synchronization objects; a message destroyed
 * while still bound to a peer drops that reference */
static void mca_oob_ud_msg_destruct (mca_oob_ud_msg_t *msg)
{
    OBJ_DESTRUCT(&msg->status_changed);
    OBJ_DESTRUCT(&msg->lock);

    if (msg->peer) {
        mca_oob_ud_peer_release (msg->peer);
    }
}
/*
 * Post a control message.  ACK/NACK messages bypass the peer's flow
 * control and go straight to the QP; everything else goes through
 * mca_oob_ud_peer_post_msg().  A failed non-persistent message is marked
 * ERROR and recycled here.
 *
 * Fix: on that failure path mca_oob_ud_msg_return() sets msg->peer = NULL,
 * but the original code then logged via the recycled msg and unlocked
 * &msg->peer->peer_lock — a use-after-release followed by a NULL
 * dereference.  The peer pointer is now cached up front and the log is
 * emitted before the message can be recycled.
 */
int mca_oob_ud_msg_post_send (mca_oob_ud_msg_t *msg)
{
    mca_oob_ud_peer_t *peer = msg->peer;
    int rc = ORTE_SUCCESS;

    msg->status = MCA_OOB_UD_MSG_STATUS_POSTED;

    OPAL_THREAD_LOCK(&peer->peer_lock);

    if (MCA_OOB_UD_MSG_ACK == msg->hdr->msg_type ||
        MCA_OOB_UD_MSG_NACK == msg->hdr->msg_type) {
        rc = mca_oob_ud_qp_post_send (msg->qp, &msg->wr, 1);
    } else {
        rc = mca_oob_ud_peer_post_msg (peer, msg);
    }

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                        "%s oob:ud:msg_post_send posted send for msg %p with id %" PRIu64,
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) msg, msg->hdr->msg_id);

    if (ORTE_SUCCESS != rc && false == msg->persist) {
        msg->status = MCA_OOB_UD_MSG_STATUS_ERROR;
        /* msg must not be touched after this call */
        mca_oob_ud_msg_return (msg);
    }

    OPAL_THREAD_UNLOCK(&peer->peer_lock);

    return rc;
}
/*
 * Transition @msg to @status: mark the peer available on COMPLETE, invoke
 * the message's completion callback with the mapped ORTE code, wake any
 * thread blocked in mca_oob_ud_msg_wait(), and recycle non-persistent
 * messages.  A repeated status is ignored.
 *
 * Always returns ORTE_SUCCESS; the per-status result is delivered through
 * the callback instead.
 */
int mca_oob_ud_msg_status_update (mca_oob_ud_msg_t *msg, mca_oob_ud_status_t status)
{
    int rc;
    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                        "%s oob:ud:msg_status_update setting status of msg %p to %d",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) msg, (int) status);
    OPAL_THREAD_LOCK(&msg->lock);
    if (status != msg->status) {
        if (MCA_OOB_UD_MSG_STATUS_COMPLETE == status) {
            /* the peer acked something, so it is reachable again */
            opal_output_verbose(10, orte_oob_base_framework.framework_output,
                                "%s oob:ud:msg_status_update setting peer %s as available",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&msg->peer->peer_name));
            msg->peer->peer_available = true;
        }
        /* map the message status onto an ORTE return code for the callback */
        switch (status) {
        case MCA_OOB_UD_MSG_STATUS_TIMEOUT:
            rc = ORTE_ERR_TIMEOUT;
            break;
        case MCA_OOB_UD_MSG_STATUS_COMPLETE:
            rc = ORTE_SUCCESS;
            break;
        case MCA_OOB_UD_MSG_STATUS_ERROR:
        default:
            rc = ORTE_ERROR;
        }
        /* callback runs with msg->lock held — callbacks must not re-lock it */
        if (msg->cbfunc) {
            msg->cbfunc (msg, rc);
        }
        /* signal status change */
        msg->status = status;
        opal_condition_signal (&msg->status_changed);
        OPAL_THREAD_UNLOCK(&msg->lock);
        /* non-persistent messages are single-use: recycle after unlock */
        if (false == msg->persist) {
            mca_oob_ud_msg_return (msg);
        }
        return ORTE_SUCCESS;
    }
    OPAL_THREAD_UNLOCK(&msg->lock);
    return ORTE_SUCCESS;
}
/* detach a finished request from any owner list, shed the resources it
 * still holds, and drop the final object reference */
static void mca_oob_ud_req_return (mca_oob_ud_req_t *req)
{
    opal_output_verbose(15, orte_oob_base_framework.framework_output,
                        "%s oob:ud:req_return returning req %p",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) req);

    /* appending to NULL just unlinks from the current owner */
    mca_oob_ud_req_append_to_list (req, NULL);

    if (NULL != req->req_peer) {
        mca_oob_ud_peer_release (req->req_peer);
        req->req_peer = NULL;
    }

    /* free(NULL) is harmless, so no guards required */
    free (req->req_wr.send);
    req->req_wr.send = NULL;

    free (req->req_sge);
    req->req_sge = NULL;

    OBJ_RELEASE(req);
}
/* Flatten the request's iovec payload into one newly allocated buffer.
 * Returns the buffer (caller owns/frees it; NULL on allocation failure)
 * and stores the total byte count in *lenp.
 *
 * Fix: the buffer is sized from the sum of the iov_len fields.  The
 * original call sites allocated count * sizeof(struct iovec) bytes and
 * then copied the full payload into it — a heap overflow whenever the
 * payload exceeded sizeof(struct iovec) bytes per entry. */
static char *mca_oob_ud_req_flatten_iov (mca_oob_ud_req_t *req, int *lenp)
{
    int i, datalen = 0;
    char *data;

    for (i = 0 ; i < req->req_data.iov.count ; ++i) {
        datalen += req->req_data.iov.uiov[i].iov_len;
    }

    /* allocate at least one byte so an empty payload still yields a
     * distinct, freeable pointer */
    data = (char *) calloc (datalen > 0 ? datalen : 1, 1);
    if (NULL != data) {
        datalen = 0;
        for (i = 0 ; i < req->req_data.iov.count ; ++i) {
            memcpy (&data[datalen], req->req_data.iov.uiov[i].iov_base,
                    req->req_data.iov.uiov[i].iov_len);
            datalen += req->req_data.iov.uiov[i].iov_len;
        }
    }

    *lenp = datalen;
    return data;
}

/*
 * Complete a request with status @rc: give back the data QP, deregister
 * user memory, deliver the payload (receives) or the status (sends), and
 * recycle the request.
 *
 * Receives targeted at this process are posted to the RML; receives for
 * other processes are re-wrapped as orte_rml_send_t and pushed back into
 * the OOB for routing.
 */
void mca_oob_ud_req_complete (mca_oob_ud_req_t *req, int rc)
{
    int i;   /* used by MCA_OOB_UD_REQ_DEREG_MR */

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                        "%s oob:ud:req_complete %s request %p completed with status %d",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (req->type == MCA_OOB_UD_REQ_SEND) ? "SEND":"RECV", (void *) req, rc);

    if (NULL != req->req_qp) {
        (void) mca_oob_ud_qp_data_release (req->req_qp);
        req->req_qp = NULL;
    }

    /* deregister memory *before* handing it to the callback */
    MCA_OOB_UD_REQ_DEREG_MR(req);

    switch (req->type) {
    case MCA_OOB_UD_REQ_SEND:
        /* transit (routed) sends have no local RML message to update */
        if (req->req_data_type != MCA_OOB_UD_REQ_TR) {
            req->rml_msg->status = rc;
        }
        break;
    case MCA_OOB_UD_REQ_RECV:
        if ((req->req_target.jobid == ORTE_PROC_MY_NAME->jobid) &&
            (req->req_target.vpid == ORTE_PROC_MY_NAME->vpid)) {
            /* the message is for us — hand it to the RML */
            opal_output_verbose(1, orte_oob_base_framework.framework_output,
                                "%s DELIVERING TO RML",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            if (MCA_OOB_UD_REQ_IOV == req->req_data_type) {
                int datalen;
                char *data = mca_oob_ud_req_flatten_iov (req, &datalen);
                if (NULL != data) {
                    ORTE_RML_POST_MESSAGE(&req->req_origin, req->req_tag, req->req_seq_num, data, datalen);
                    free(data);
                }
            } else {
                ORTE_RML_POST_MESSAGE(&req->req_origin, req->req_tag, req->req_seq_num,
                                      req->req_data.buf.p, req->req_data.buf.size);
            }
        } else {
            /* we are only a hop — promote the message back to the OOB for
             * routing toward its real target */
            opal_output_verbose(1, orte_oob_base_framework.framework_output,
                                "%s UD PROMOTING ROUTED MESSAGE FOR %s TO OOB",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&req->req_target));
            orte_rml_send_t *snd = OBJ_NEW(orte_rml_send_t);
            snd->dst = req->req_target;
            snd->origin = req->req_origin;
            snd->tag = req->req_tag;
            snd->seq_num = req->req_seq_num;
            if (MCA_OOB_UD_REQ_IOV == req->req_data_type) {
                int datalen;
                snd->data = mca_oob_ud_req_flatten_iov (req, &datalen);
                snd->count = datalen;
            } else {
                char *data = (char *)calloc(req->req_data.buf.size, sizeof(char));
                if (NULL != data) {
                    memcpy (data, req->req_data.buf.p, req->req_data.buf.size);
                }
                snd->data = data;
                snd->count = req->req_data.buf.size;
            }
            snd->cbfunc.iov = NULL;
            snd->cbdata = NULL;
            /* activate the OOB send state */
            ORTE_OOB_SEND(snd);
        }
        break;
    default:
        break;
    }

    mca_oob_ud_req_return (req);
}
/* atomically move a request onto @list (NULL just unlinks it from its
 * current owner).  req->req_list always tracks the owning list. */
void mca_oob_ud_req_append_to_list (mca_oob_ud_req_t *req, opal_list_t *list)
{
    opal_list_t *current = NULL;

    OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_match_lock);

    current = req->req_list;
    if (NULL != current) {
        opal_list_remove_item (current, (opal_list_item_t *) req);
    }

    if (NULL != list) {
        opal_list_append (list, (opal_list_item_t *) req);
    }

    req->req_list = list;

    OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_match_lock);
}
/* linear scan of @list (under the match lock) for @req; true if present */
bool mca_oob_ud_req_is_in_list (mca_oob_ud_req_t *req, opal_list_t *list)
{
    opal_list_item_t *cursor;
    bool found = false;

    OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_match_lock);

    for (cursor = opal_list_get_first (list) ;
         !found && cursor != opal_list_get_end (list) ;
         cursor = opal_list_get_next (cursor)) {
        found = (cursor == (opal_list_item_t *) req);
    }

    OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_match_lock);

    return found;
}
/* abort a request: give back its data QP (if any) and complete it with
 * an "interrupted" status, which also frees it */
void mca_oob_ud_req_abort (mca_oob_ud_req_t *req)
{
    /* caller should have removed this request from any owner list */
    req->req_list = NULL;

    if (NULL != req->req_qp) {
        mca_oob_ud_qp_data_release (req->req_qp);
        req->req_qp = NULL;
    }

    /* free up request resources */
    mca_oob_ud_req_complete (req, ORTE_ERR_INTERUPTED);
}
/*
 * Block the caller until @msg leaves the POSTED state, then map its final
 * status to an ORTE return code.
 *
 * NOTE(review): msg->status is re-read after msg->lock is released; this
 * relies on the message not being recycled concurrently (i.e. on callers
 * waiting only on persistent messages) — confirm before reuse.
 */
int mca_oob_ud_msg_wait (mca_oob_ud_msg_t *msg)
{
    OPAL_THREAD_LOCK(&msg->lock);
    /* wait for ack */
    while (MCA_OOB_UD_MSG_STATUS_POSTED == msg->status) {
        opal_condition_wait (&msg->status_changed, &msg->lock);
    }
    OPAL_THREAD_UNLOCK(&msg->lock);
    switch (msg->status) {
    case MCA_OOB_UD_MSG_STATUS_TIMEOUT:
        return ORTE_ERR_TIMEOUT;
    case MCA_OOB_UD_MSG_STATUS_COMPLETE:
        return ORTE_SUCCESS;
    case MCA_OOB_UD_MSG_STATUS_ERROR:
    default:
        return ORTE_ERROR;
    }
}

Просмотреть файл

@ -1,281 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
* reserved.
* 2014 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#if !defined(MCA_OOB_UD_REQ_H)
#define MCA_OOB_UD_REQ_H
#include "oob_ud_peer.h"
#include "orte_config.h"
#include "orte/types.h"
#include "opal/threads/condition.h"
#include "opal/mca/event/event.h"
#include "opal/class/opal_free_list.h"
#include "orte/mca/rml/rml.h"
#include <infiniband/verbs.h>
#include "oob_ud_qp.h"
struct mca_oob_ud_peer_t;

/* request direction */
enum mca_oob_ud_req_type_t {
    MCA_OOB_UD_REQ_RECV,
    MCA_OOB_UD_REQ_SEND
};
typedef enum mca_oob_ud_req_type_t mca_oob_ud_req_type_t;

/* request lifecycle state */
enum mca_oob_ud_req_state_t {
    MCA_OOB_UD_REQ_ACTIVE,
    MCA_OOB_UD_REQ_PENDING,
    MCA_OOB_UD_REQ_COMPLETE
};
typedef enum mca_oob_ud_req_state_t mca_oob_ud_req_state_t;

/* how the request's payload is described: an iovec array, a contiguous
 * buffer, or a routed (transit) message carried on behalf of another proc */
enum mca_oob_ud_req_data_type_t {
    MCA_OOB_UD_REQ_IOV,
    MCA_OOB_UD_REQ_BUF,
    MCA_OOB_UD_REQ_TR
};
typedef enum mca_oob_ud_req_data_type_t mca_oob_ud_req_data_type_t;

/* wire message types exchanged over the listen QP.
 * NOTE(review): values start at 37 — presumably to make stray packets
 * easier to spot on the wire; confirm */
enum mca_oob_ud_msg_type_t {
    MCA_OOB_UD_MSG_REQUEST  = 37,
    MCA_OOB_UD_MSG_REPLY    = 38,
    MCA_OOB_UD_MSG_COMPLETE = 39,
    MCA_OOB_UD_MSG_PING     = 40,
    MCA_OOB_UD_MSG_ACK      = 41,
    MCA_OOB_UD_MSG_NACK     = 42,
    MCA_OOB_UD_MSG_DATA_OK  = 43,
    MCA_OOB_UD_MSG_END      = 44
};
typedef enum mca_oob_ud_msg_type_t mca_oob_ud_msg_type_t;
/* on-the-wire header placed at the start of every control message */
struct mca_oob_ud_msg_hdr_t {
    mca_oob_ud_msg_type_t msg_type;
    /* opaque request/response context pointers echoed between the peers:
     * msg_rem_ctx identifies the sender's object, msg_lcl_ctx the one the
     * receiver handed out earlier */
    void *msg_rem_ctx;
    void *msg_lcl_ctx;
    /* originating process and final destination (may differ when routed) */
    orte_process_name_t msg_origin;
    orte_process_name_t msg_target;
    int msg_channel;
    int msg_seq_num;
    uint64_t msg_id;
    struct {
        /* the receiver can get the qpn and lid from the work completion */
        uint32_t qkey;
        orte_process_name_t name;
        uint8_t port_num;
    } ra;
    union {
        /* REQUEST message payload description */
        struct {
            int tag;
            int data_len;
            int mtu;
            bool data_follows;     /* payload sent eagerly after this header */
            bool data_iovec_used;  /* receiver should scatter into iovecs */
        } req;
        /* REPLY message: where to send the bulk data */
        struct {
            uint32_t qpn;
            int data_len;
            int tag;
            int mtu;
        } rep;
    } msg_data;
};
typedef struct mca_oob_ud_msg_hdr_t mca_oob_ud_msg_hdr_t;
/* a send or receive request tracked by the UD OOB component */
struct mca_oob_ud_req_t {
    opal_list_item_t super;
    mca_oob_ud_req_type_t type;
    mca_oob_ud_req_state_t state;
    /* verbs work requests: send WRs for sends, recv WRs for receives */
    union {
        struct ibv_send_wr *send;
        struct ibv_recv_wr *recv;
    } req_wr;
    /* storage for ib grh */
    struct ibv_grh *req_grh;
    struct ibv_mr *req_grh_mr;
    struct ibv_sge *req_sge;
    /* negotiated mtu */
    int req_mtu;
    uint32_t req_rem_qpn;
    int req_rem_data_len;
    int req_packet_count;
    struct mca_oob_ud_peer_t *req_peer;
    struct mca_oob_ud_port_t *req_port;
    struct mca_oob_ud_qp_t *req_qp;
    /* remote context (request or response) */
    void *req_rem_ctx;
    /* retry timer */
    struct {
        opal_event_t event;
        struct timeval value;
    } timer;
    /* user request */
    orte_process_name_t req_target;
    orte_process_name_t req_origin;
    mca_oob_ud_req_data_type_t req_data_type;
    /* payload: interpret the member matching req_data_type
     * (TR payloads also use the buf member) */
    union {
        struct {
            struct ibv_mr **mr;   /* one registration per iovec */
            struct iovec *uiov;
            int count;
        }iov;
        struct {
            struct ibv_mr *mr;
            char *p;
            int size;
        }buf;
    }req_data;
    int req_tag;
    int req_channel;
    int req_seq_num;
    int req_rc;
    void *req_cbdata;
    /* what list is this request in */
    opal_list_t *req_list;
    bool req_is_eager;
    orte_rml_send_t *rml_msg;
};
typedef struct mca_oob_ud_req_t mca_oob_ud_req_t;
OBJ_CLASS_DECLARATION(mca_oob_ud_req_t);
/* lifecycle status of a posted control message */
enum mca_oob_ud_status_t {
    /* message posted */
    MCA_OOB_UD_MSG_STATUS_POSTED,
    /* remote side receive the message (ack'd) */
    MCA_OOB_UD_MSG_STATUS_COMPLETE,
    /* request message timed out */
    MCA_OOB_UD_MSG_STATUS_TIMEOUT,
    /* other failure */
    MCA_OOB_UD_MSG_STATUS_ERROR
};
typedef enum mca_oob_ud_status_t mca_oob_ud_status_t;

/* a single control message drawn from a port's free list; the header
 * lives in the port's registered message buffer */
struct mca_oob_ud_msg_t {
    opal_free_list_item_t super;
    struct ibv_send_wr wr;
    struct ibv_sge sge;
    mca_oob_ud_msg_hdr_t *hdr;
    struct ibv_mr *mr;
    /* qp this request was sent over */
    struct mca_oob_ud_qp_t *qp;
    struct mca_oob_ud_port_t *port;
    /* lock + condition guard/announce status transitions */
    opal_mutex_t lock;
    opal_condition_t status_changed;
    mca_oob_ud_status_t status;
    /* persistent messages survive failure/completion; non-persistent ones
     * are recycled automatically */
    bool persist;
    mca_oob_ud_req_t *req;
    /* completion callback, invoked with the mapped ORTE return code */
    void (*cbfunc) (struct mca_oob_ud_msg_t *, int);
    struct mca_oob_ud_peer_t *peer;
};
typedef struct mca_oob_ud_msg_t mca_oob_ud_msg_t;
OBJ_CLASS_DECLARATION(mca_oob_ud_msg_t);
/*
 * Allocate receive-side storage for @recv_req sized from the remote data
 * length.  For iovec receives every user iovec keeps its length except the
 * last, which absorbs whatever remains of the remote payload and gets a
 * freshly allocated base.  For buffer receives a single zeroed buffer of
 * the full remote length is allocated.
 *
 * Returns ORTE_SUCCESS or ORTE_ERROR on allocation failure / bad count.
 */
static inline int mca_oob_ud_recv_alloc (mca_oob_ud_req_t *recv_req)
{
    int iov_index;
    size_t alloc_size = recv_req->req_rem_data_len;

    if (MCA_OOB_UD_REQ_IOV == recv_req->req_data_type) {
        /* fix: the original indexed uiov[count - 1] unconditionally,
         * accessing out of bounds when count == 0 */
        if (recv_req->req_data.iov.count < 1) {
            return ORTE_ERROR;
        }

        for (iov_index = 0 ; iov_index < recv_req->req_data.iov.count - 1 ; ++iov_index) {
            alloc_size -= recv_req->req_data.iov.uiov[iov_index].iov_len;
        }

        recv_req->req_data.iov.uiov[iov_index].iov_len = alloc_size;
        recv_req->req_data.iov.uiov[iov_index].iov_base = calloc (alloc_size, 1);
        if (NULL == recv_req->req_data.iov.uiov[iov_index].iov_base) {
            return ORTE_ERROR;
        }
    } else {
        recv_req->req_data.buf.p = (char *)calloc(recv_req->req_rem_data_len, sizeof(char));
        if (NULL == recv_req->req_data.buf.p) {
            return ORTE_ERROR;
        }
        recv_req->req_data.buf.size = recv_req->req_rem_data_len;
    }

    return ORTE_SUCCESS;
}
/* Deregister (and, for the iovec case, free) the memory regions attached
 * to req's user data, NULLing each pointer afterwards.
 * NB: the iovec branch expands to code that uses a local variable `i`,
 * which every caller must declare. */
#define MCA_OOB_UD_REQ_DEREG_MR(req)                                    \
    if (MCA_OOB_UD_REQ_IOV == req->req_data_type) {                     \
        if (req->req_data.iov.mr) {                                     \
            for (i = 0 ; i < req->req_data.iov.count ; ++i) {           \
                if (req->req_data.iov.mr[i]) {                          \
                    (void) ibv_dereg_mr (req->req_data.iov.mr[i]);      \
                    req->req_data.iov.mr[i] = NULL;                     \
                }                                                       \
            }                                                           \
            free (req->req_data.iov.mr);                                \
            req->req_data.iov.mr = NULL;                                \
        }                                                               \
    } else {                                                            \
        if (req->req_data.buf.mr) {                                     \
            (void) ibv_dereg_mr (req->req_data.buf.mr);                 \
            req->req_data.buf.mr = NULL;                                \
        }                                                               \
    }
/* control-message pool management (see oob_ud_req.c for semantics) */
int mca_oob_ud_msg_get (struct mca_oob_ud_port_t *port, mca_oob_ud_req_t *req,
                        mca_oob_ud_qp_t *qp, mca_oob_ud_peer_t *peer, bool persist,
                        mca_oob_ud_msg_t **msgp);
int mca_oob_ud_msg_init (opal_free_list_item_t *item, void *context);
void mca_oob_ud_msg_return (mca_oob_ud_msg_t *msg);
/* request retry timer */
void mca_oob_ud_req_timer_set (mca_oob_ud_req_t *req, const struct timeval *timeout,
                               int max_tries, void (*cb)(evutil_socket_t, short, void *));
/* message posting / waiting / status transitions */
int mca_oob_ud_msg_post_send (mca_oob_ud_msg_t *msg);
int mca_oob_ud_msg_wait (mca_oob_ud_msg_t *msg);
int mca_oob_ud_msg_status_update (mca_oob_ud_msg_t *msg, mca_oob_ud_status_t status);
/* request completion and list bookkeeping */
void mca_oob_ud_req_complete (mca_oob_ud_req_t *req, int rc);
void mca_oob_ud_req_abort (mca_oob_ud_req_t *req);
void mca_oob_ud_req_append_to_list (mca_oob_ud_req_t *req, opal_list_t *list);
bool mca_oob_ud_req_is_in_list (mca_oob_ud_req_t *req, opal_list_t *list);
#endif

Просмотреть файл

@ -1,543 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* 2014 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "oob_ud_send.h"
#include "orte/mca/errmgr/errmgr.h"
/* completion callback for eager request messages: forward the final
 * status to the owning send request */
static void mca_oob_ud_send_cb (mca_oob_ud_msg_t *msg, int rc)
{
    mca_oob_ud_send_complete (msg->req, rc);
}
/*
 * Deliver a message to ourselves without touching the fabric: match it
 * against a local receive request, copy the payload, and queue the recv
 * completion.  Returns the payload size on success or an ORTE error code.
 */
static int mca_oob_ud_send_self (orte_rml_send_t *msg)
{
    unsigned int srco, dsto;
    mca_oob_ud_req_t *req;
    int srci, dsti;
    int rc, size;

    MCA_OOB_UD_IOV_SIZE(msg, size);

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                        "%s mca_oob_ud_send_self: sending %d bytes to myself",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), size);

    rc = mca_oob_ud_get_recv_req (*ORTE_PROC_MY_NAME, msg->tag, &req, (msg->iov != NULL) ? true : false);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    req->req_rem_data_len = size;
    req->req_is_eager     = true;

    rc = mca_oob_ud_recv_alloc (req);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        if (MCA_OOB_UD_REQ_IOV == req->req_data_type) {
            free (req->req_data.iov.uiov);
        }
        OBJ_RELEASE(req);
        return rc;
    }

    srci = dsti = 0;
    srco = dsto = 0;

    if (msg->iov != NULL) {
        req->req_data_type = MCA_OOB_UD_REQ_IOV;   /* hoisted: loop-invariant */

        /* scatter the source iovecs into the receive request's iovecs.
         * fix: the loop bounds were swapped — srci walks msg->iov
         * (msg->count entries) and dsti walks req->req_data.iov.uiov
         * (req->req_data.iov.count entries), but the original tested each
         * index against the *other* array's count, reading past the end
         * when the counts differ.  Also changed do/while to while so an
         * empty iovec list copies nothing instead of touching iov[0]. */
        while (srci < msg->count && dsti < req->req_data.iov.count) {
            size_t copy = min(msg->iov[srci].iov_len - srco,
                              req->req_data.iov.uiov[dsti].iov_len - dsto);

            memmove ((unsigned char *) req->req_data.iov.uiov[dsti].iov_base + dsto,
                     (unsigned char *) msg->iov[srci].iov_base + srco, copy);

            srco += copy;
            if (srco == msg->iov[srci].iov_len) {
                srci++;
                srco = 0;
            }

            dsto += copy;
            if (dsto == req->req_data.iov.uiov[dsti].iov_len) {
                dsti++;
                dsto = 0;
            }
        }
    } else {
        req->req_data_type = MCA_OOB_UD_REQ_BUF;
        opal_buffer_t *buffer;
        buffer = OBJ_NEW(opal_buffer_t);
        if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(buffer, msg->buffer))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(buffer);
            return rc;
        }
        if (OPAL_SUCCESS != (rc = opal_dss.unload(buffer, (void **)&req->req_data.buf.p, &req->req_data.buf.size)))
        {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(buffer);
            free(req->req_data.buf.p);
            return rc;
        }
        OBJ_RELEASE(buffer);
    }

    req->state = MCA_OOB_UD_REQ_COMPLETE;

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                        "%s mca_oob_ud_send_self: complete. calling callbacks",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* queue up recv callback */
    mca_oob_ud_event_queue_completed (req);

    /* NOTE(review): req is a recv request — confirm get_recv_req populates
     * rml_msg before relying on this store */
    req->rml_msg->status = ORTE_SUCCESS;

    return size;
}
int mca_oob_ud_process_send_nb(int fd, short args, void *cbdata)
{
mca_oob_ud_msg_op_t *op = (mca_oob_ud_msg_op_t*)cbdata;
orte_process_name_t hop;
mca_oob_ud_peer_t *peer;
mca_oob_ud_port_t *port;
mca_oob_ud_msg_t *req_msg;
mca_oob_ud_req_t *send_req;
bool send_eager = false;
char *pack_ptr;
int rc, size, i;
if (OPAL_EQUAL == orte_util_compare_name_fields
(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, &op->msg->dst)) {
return mca_oob_ud_send_self (op->msg);
}
/* if we have a route to this peer, then we can reach it */
hop = orte_routed.get_route(NULL, &op->msg->dst);
if (ORTE_JOBID_INVALID == hop.jobid ||
ORTE_VPID_INVALID == hop.vpid) {
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
return ORTE_ERR_UNREACH;
}
rc = mca_oob_ud_peer_lookup (&hop, &peer);
if(ORTE_SUCCESS != rc || NULL == peer) {
ORTE_ERROR_LOG((NULL == peer) ? ORTE_ERR_UNREACH : rc);
return (NULL == peer) ? ORTE_ERR_UNREACH : rc;
}
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s oob:ud:send_nb to pear %s via hop %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&op->msg->dst), ORTE_NAME_PRINT(&hop));
/* NTH: TODO -- get a random port? */
port = (mca_oob_ud_port_t *) opal_list_get_first (&((mca_oob_ud_device_t *)peer->peer_context)->ports);
send_req = OBJ_NEW(mca_oob_ud_req_t);
if (!send_req) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* fill in request */
send_req->req_target = op->msg->dst;
send_req->req_origin = op->msg->origin;
send_req->req_tag = op->msg->tag;
send_req->req_seq_num = op->msg->seq_num;
if (op->msg->data != NULL) {
size = op->msg->count;
send_req->req_data_type = MCA_OOB_UD_REQ_TR;
send_req->req_data.buf.p = (char *)calloc(size, sizeof(char));
memcpy(send_req->req_data.buf.p, op->msg->data, op->msg->count);
send_req->req_data.buf.size = op->msg->count;
} else {
MCA_OOB_UD_IOV_SIZE(op->msg, size);
if (op->msg->iov != NULL) {
send_req->req_data_type = MCA_OOB_UD_REQ_IOV;
send_req->req_data.iov.uiov = op->msg->iov;
send_req->req_data.iov.count = op->msg->count;
} else {
send_req->req_data_type = MCA_OOB_UD_REQ_BUF;
opal_buffer_t *buffer;
buffer = OBJ_NEW(opal_buffer_t);
if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(buffer, op->msg->buffer))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buffer);
return rc;
}
if (OPAL_SUCCESS != (rc = opal_dss.unload(buffer, (void **)&send_req->req_data.buf.p, &send_req->req_data.buf.size)))
{
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buffer);
free(send_req->req_data.buf.p);
return rc;
}
OBJ_RELEASE(buffer);
}
}
send_req->rml_msg = op->msg;
send_req->req_cbdata = op->msg->cbdata;
send_req->req_peer = peer;
send_req->req_mtu = port->mtu;
send_req->req_port = port;
send_req->req_rc = 0;
send_req->state = MCA_OOB_UD_REQ_PENDING;
send_req->type = MCA_OOB_UD_REQ_SEND;
OBJ_RETAIN(peer);
if (size + sizeof (mca_oob_ud_msg_hdr_t) <= (unsigned int)port->mtu) {
send_eager = true;
}
rc = mca_oob_ud_msg_get (port, send_req, &port->listen_qp, peer, false, &req_msg);
if (ORTE_SUCCESS != rc) {
OBJ_RELEASE (send_req);
return rc;
}
/* fill in message header */
req_msg->hdr->msg_type = MCA_OOB_UD_MSG_REQUEST;
req_msg->hdr->msg_rem_ctx = send_req;
req_msg->hdr->msg_origin = op->msg->origin;
req_msg->hdr->msg_target = op->msg->dst;
req_msg->hdr->msg_seq_num = op->msg->seq_num;
req_msg->hdr->msg_data.req.data_len = size;
req_msg->hdr->msg_data.req.mtu = port->mtu;
req_msg->hdr->msg_data.req.tag = op->msg->tag;
if (MCA_OOB_UD_REQ_IOV == send_req->req_data_type) {
opal_output_verbose(10, orte_oob_base_framework.framework_output,
"%s-%s send_nb: tag %d size %lu. msg: %p. peer = %p. req = %p."
"count = %d. uiov = %p.\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&op->msg->dst),
op->msg->tag, (unsigned long)size,
(void *) req_msg,
(void *) peer, (void *) send_req,
send_req->req_data.iov.count, (void *) send_req->req_data.iov.uiov);
} else {
opal_output_verbose(10, orte_oob_base_framework.framework_output,
"%s-%s send_nb: tag %d size %lu. msg: %p. peer = %p. req = %p."
"buffer = %p.\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&op->msg->dst),
op->msg->tag, (unsigned long)size,
(void *) req_msg,
(void *) peer, (void *) send_req, (void *) send_req->req_data.buf.p);
}
if (!send_eager) {
mca_oob_ud_req_append_to_list (send_req, &mca_oob_ud_component.ud_active_sends);
/* send request */
return mca_oob_ud_msg_post_send (req_msg);
}
pack_ptr = (char *)(req_msg->hdr + 1);
if (op->msg->iov != NULL) {
for (i = 0 ; i < op->msg->count ; ++i) {
memcpy (pack_ptr, op->msg->iov[i].iov_base, op->msg->iov[i].iov_len);
pack_ptr += op->msg->iov[i].iov_len;
}
} else {
memcpy(pack_ptr, send_req->req_data.buf.p, send_req->req_data.buf.size);
}
send_req->req_list = NULL;
req_msg->hdr->msg_data.req.data_follows = true;
req_msg->cbfunc = mca_oob_ud_send_cb;
req_msg->req = send_req;
do {
/* send request */
rc = mca_oob_ud_msg_post_send (req_msg);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
break;
}
} while (0);
return rc;
}
static void mca_oob_ud_send_try_to (int fd, short event, void *ctx)
{
OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_match_lock);
(void) mca_oob_ud_send_try ((mca_oob_ud_req_t *) ctx);
OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_match_lock);
}
/**
 * Post (or re-post) the data portion of a send request on a UD data QP.
 *
 * Flow: lazily acquire a data QP, drain stale completions, obtain a
 * completion message, register the user's iovec or contiguous buffer with
 * the HCA, split the payload into MTU-sized IBV_WR_SEND_WITH_IMM work
 * requests (imm_data carries the packet sequence number so the receiver
 * can reorder), post the data, then post a MCA_OOB_UD_MSG_COMPLETE
 * message from the same QP so it is guaranteed to arrive after the data.
 *
 * @param[in] send_req  send request to (re)start
 *
 * @return ORTE_SUCCESS on success, or when the attempt has been
 *         rescheduled after a temporary resource shortage. On any other
 *         error the request is completed with that error code, which is
 *         also returned.
 */
int mca_oob_ud_send_try (mca_oob_ud_req_t *send_req) {
    int wr_index, wr_count, sge_count, sge_index, iov_index;
    unsigned int iov_left, iov_offset, packet_size;
    const unsigned int mtu = send_req->req_mtu;
    /* retry delay applied when QP or message resources are exhausted */
    const struct timeval acquire_timeout = {0, 500000};
    mca_oob_ud_msg_t *com_msg;
    int data_len;
    int rc = ORTE_SUCCESS;

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                         "%s oob:ud:send_try sending to %s, tag = %d, "
                         "req = %p",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&send_req->req_peer->peer_name),
                         send_req->req_tag, (void *) send_req);

    do {
        /* lazily acquire a data QP for this request */
        if (NULL == send_req->req_qp) {
            rc = mca_oob_ud_qp_data_aquire (send_req->req_port, &send_req->req_qp);
            if (ORTE_SUCCESS != rc) {
                break;
            }
        }

        /* drop completions left over from a previous attempt */
        (void) mca_oob_ud_qp_purge (send_req->req_qp);

        rc = mca_oob_ud_msg_get (send_req->req_port, send_req, send_req->req_qp, send_req->req_peer, false,
                                 &com_msg);
        if (ORTE_SUCCESS != rc) {
            break;
        }

        if (MCA_OOB_UD_REQ_IOV == send_req->req_data_type) {
            if (NULL == send_req->req_data.iov.mr) {
                /* allocate space for memory registers */
                send_req->req_data.iov.mr = (struct ibv_mr **) calloc (send_req->req_data.iov.count, sizeof (struct ibv_mr *));
                if (NULL == send_req->req_data.iov.mr) {
                    rc = ORTE_ERR_OUT_OF_RESOURCE;
                    ORTE_ERROR_LOG(rc);
                    break;
                }
            }

            rc = mca_oob_ud_register_iov (send_req->req_data.iov.uiov, send_req->req_data.iov.count,
                                          send_req->req_data.iov.mr, send_req->req_port->device->ib_pd,
                                          mtu, &sge_count, &wr_count, &data_len);
            if (ORTE_SUCCESS != rc) {
                break;
            }
        } else {
            data_len = send_req->req_data.buf.size;
            rc = mca_oob_ud_register_buf(send_req->req_data.buf.p, send_req->req_data.buf.size,
                                         &send_req->req_data.buf.mr, send_req->req_port->device->ib_pd,
                                         mtu, &sge_count, &wr_count);
            if (ORTE_SUCCESS != rc) {
                break;
            }
        }

        /* one work request per MTU-sized packet. NOTE(review): this
           overrides the wr_count produced by the register call above --
           presumably they agree; confirm before changing either. */
        wr_count = (data_len + mtu - 1) / mtu;

        if (MCA_OOB_UD_REQ_IOV == send_req->req_data_type) {
            opal_output_verbose(5, orte_oob_base_framework.framework_output,
                                 "%s oob:ud:send_try sending %d bytes in %d "
                                 "work requests, %d sges. uiov = %p.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), data_len,
                                 wr_count, sge_count, (void *) send_req->req_data.iov.uiov);
        } else {
            opal_output_verbose(5, orte_oob_base_framework.framework_output,
                                 "%s oob:ud:send_try sending %d bytes in %d "
                                 "work requests, %d sges. buf = %p", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), data_len,
                                 wr_count, sge_count, (void *) send_req->req_data.buf.p);
        }

        /* work request / sge arrays persist across retries; allocate once */
        if (wr_count && NULL == send_req->req_wr.send) {
            send_req->req_wr.send = (struct ibv_send_wr *) calloc (wr_count, sizeof (struct ibv_send_wr));
            if (NULL == send_req->req_wr.send) {
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                ORTE_ERROR_LOG(rc);
                break;
            }
        }

        if (wr_count && NULL == send_req->req_sge) {
            send_req->req_sge = (struct ibv_sge *) calloc (sge_count, sizeof (struct ibv_sge));
            if (NULL == send_req->req_sge) {
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                ORTE_ERROR_LOG(rc);
                break;
            }
        }

        if (MCA_OOB_UD_REQ_IOV == send_req->req_data_type) {
            opal_output_verbose(10, orte_oob_base_framework.framework_output,
                                 "%s oob:ud:send_try posting message using iovec",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

            /* walk the user iovec, packing up to one MTU of sges per WR;
               a single iovec entry may span several WRs and vice versa */
            iov_left   = send_req->req_data.iov.uiov[0].iov_len;
            iov_offset = 0;
            iov_index  = 0;

            for (wr_index = 0, sge_index = 0 ; wr_index < wr_count ; ++wr_index) {
                int sge_first = sge_index;

                packet_size = 0;

                do {
                    int to_send = min (iov_left, mtu - packet_size);

                    mca_oob_ud_fill_sge(send_req->req_sge + sge_index++,
                                        (char *)send_req->req_data.iov.uiov[iov_index].iov_base + iov_offset,
                                        to_send, send_req->req_data.iov.mr[iov_index]->lkey);

                    iov_offset  += to_send;
                    iov_left    -= to_send;
                    packet_size += to_send;

                    if (0 == iov_left) {
                        iov_index++;
                        iov_offset = 0;

                        if (iov_index < send_req->req_data.iov.count) {
                            iov_left = send_req->req_data.iov.uiov[iov_index].iov_len;
                        }
                    }
                } while ((packet_size < mtu) && (iov_left > 0));

                mca_oob_ud_fill_send_wr(send_req->req_wr.send + wr_index,
                                        send_req->req_sge + sge_first,
                                        sge_index - sge_first, send_req->req_peer);

                /* we don't care about completions for data */
                send_req->req_wr.send[wr_index].send_flags = IBV_SEND_SOLICITED;

                /* sequence number */
                send_req->req_wr.send[wr_index].imm_data = wr_index;
                send_req->req_wr.send[wr_index].wr.ud.remote_qpn = send_req->req_rem_qpn;
                send_req->req_wr.send[wr_index].opcode = IBV_WR_SEND_WITH_IMM;

                if (wr_index + 1 < wr_count) {
                    send_req->req_wr.send[wr_index].next = send_req->req_wr.send + wr_index + 1;
                }
            }
        } else {/* data is in a contiguous buffer */
            unsigned int buffer_offset = 0;
            unsigned int buffer_size = send_req->req_data.buf.size;

            opal_output_verbose(10, orte_oob_base_framework.framework_output,
                                 "%s oob:ud:send_try posting message using buffer",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

            for (wr_index = 0, sge_index = 0 ; wr_index < wr_count ; ++wr_index) {
                int sge_first = sge_index;

                packet_size = 0;

                do {
                    int to_send = min (buffer_size, mtu - packet_size);

                    mca_oob_ud_fill_sge(send_req->req_sge + sge_index++,
                                        (char *)send_req->req_data.buf.p + buffer_offset,
                                        to_send, send_req->req_data.buf.mr->lkey);

                    buffer_offset += to_send;
                    buffer_size   -= to_send;
                    packet_size   += to_send;
                } while ((packet_size < mtu) && (buffer_size > 0));

                mca_oob_ud_fill_send_wr(send_req->req_wr.send + wr_index,
                                        send_req->req_sge + sge_first,
                                        sge_index - sge_first, send_req->req_peer);

                /* we don't care about completions for data */
                send_req->req_wr.send[wr_index].send_flags = IBV_SEND_SOLICITED;

                /* sequence number */
                send_req->req_wr.send[wr_index].imm_data = wr_index;
                send_req->req_wr.send[wr_index].wr.ud.remote_qpn = send_req->req_rem_qpn;
                send_req->req_wr.send[wr_index].opcode = IBV_WR_SEND_WITH_IMM;

                opal_output_verbose(10, orte_oob_base_framework.framework_output,
                                     "%s oob:ud:send_try imm_data = %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), wr_index);

                if (wr_index + 1 < wr_count) {
                    send_req->req_wr.send[wr_index].next = send_req->req_wr.send + wr_index + 1;
                }
            }
        }

        /* send data */
        rc = mca_oob_ud_qp_post_send (send_req->req_qp, send_req->req_wr.send, 0);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            break;
        }

        opal_output_verbose(10, orte_oob_base_framework.framework_output,
                             "%s oob:ud:send_try posting completion message",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

        /* Fill in completion message. This message will go to the peers listen QP but
           must originate from our data qp to ensure that it is sent last. */
        com_msg->hdr->msg_type    = MCA_OOB_UD_MSG_COMPLETE;
        com_msg->hdr->msg_lcl_ctx = send_req->req_rem_ctx;
        com_msg->hdr->msg_rem_ctx = send_req;

        /* send message header */
        rc = mca_oob_ud_msg_post_send (com_msg);

        /* post_send already returned the message */
        com_msg = NULL;
    } while (0);

    if (ORTE_ERR_TEMP_OUT_OF_RESOURCE == rc) {
        /* set timer to retry post */
        mca_oob_ud_req_timer_set (send_req, &acquire_timeout, 1, mca_oob_ud_send_try_to);
        rc = ORTE_SUCCESS;
    }

    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        /* damn */
        return mca_oob_ud_send_complete (send_req, rc);
    }

    /* NOTE(review): the request is marked ACTIVE even when it was only
       rescheduled above -- confirm that is intended */
    send_req->state = MCA_OOB_UD_REQ_ACTIVE;

    return rc;
}
/* Finish a send request: run the generic request-completion machinery
 * and hand the caller back the same status code. */
int mca_oob_ud_send_complete (mca_oob_ud_req_t *send_req, int rc)
{
    const int status = rc;

    mca_oob_ud_req_complete (send_req, status);

    return status;
}

Просмотреть файл

@ -1,55 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2014 Mellanox Technologies, Inc.
 * All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 *
 */
#if !defined(MCA_OOB_UD_SEND_H)
#define MCA_OOB_UD_SEND_H

#include "oob_ud_component.h"

/* NOTE: evaluates its arguments more than once -- never pass expressions
 * with side effects (e.g. min(i++, j)) */
#define min(a,b) ((a) < (b) ? (a) : (b))

/* Compute the total payload size of an orte_rml_send_t into (size):
 * the sum of the iovec lengths when msg->iov is set, otherwise the
 * number of bytes packed into msg->buffer. The do { } while (0)
 * wrapper deliberately has no trailing semicolon so the macro expands
 * to a single statement and is safe in unbraced if/else bodies. */
#define MCA_OOB_UD_IOV_SIZE(msg, size)                       \
    do {                                                     \
        if (msg->iov != NULL) {                              \
            int i;                                           \
            for (i = 0, (size) = 0 ; i < (msg->count) ; ++i) { \
                (size) += (msg->iov)[i].iov_len;             \
            }                                                \
        } else {                                             \
            (size) = msg->buffer->bytes_used;                \
        }                                                    \
    } while (0)

/* Event-carrying wrapper used to hand an orte_rml_send_t to the
 * component's event base for asynchronous processing. */
typedef struct {
    opal_object_t super;   /* base object (must be first) */
    opal_event_t ev;       /* one-shot event that dispatches the send */
    orte_rml_send_t *msg;  /* message being sent */
} mca_oob_ud_msg_op_t;
OBJ_CLASS_DECLARATION(mca_oob_ud_msg_op_t);

/* Allocate a message op for (ms) and fire (cbfunc) on the module's
 * event base at message priority. */
#define ORTE_ACTIVATE_UD_POST_SEND(ms, cbfunc)                          \
    do {                                                                \
        mca_oob_ud_msg_op_t *mop;                                       \
        opal_output_verbose(5, orte_oob_base_framework.framework_output,\
                            "%s:[%s:%d] post send to %s",               \
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),         \
                            __FILE__, __LINE__,                         \
                            ORTE_NAME_PRINT(&((ms)->dst)));             \
        mop = OBJ_NEW(mca_oob_ud_msg_op_t);                             \
        mop->msg = (ms);                                                \
        opal_event_set(mca_oob_ud_module.ev_base, &mop->ev, -1,         \
                       OPAL_EV_WRITE, (cbfunc), mop);                   \
        opal_event_set_priority(&mop->ev, ORTE_MSG_PRI);                \
        opal_event_active(&mop->ev, OPAL_EV_WRITE, 1);                  \
    } while (0)

#endif

Просмотреть файл

@ -1,7 +0,0 @@
#
# owner/status file
# owner: institution that is responsible for this package
# status: e.g. active, maintenance, unmaintained
#
owner: MELLANOX
status: maintenance