
Merge ssh://ct-fe1/usr/projects/hpctools/hjelmn/ompi-trunk-git into HEAD

This commit was SVN r26344.
This commit is contained in:
Nathan Hjelm 2012-04-26 22:06:12 +00:00
parent e84f9ec8c3
commit e1e0d466e5
16 changed files with 4305 additions and 0 deletions

57
orte/mca/oob/ud/Makefile.am Normal file

@@ -0,0 +1,57 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
dist_pkgdata_DATA = help-oob-ud.txt
sources = \
oob_ud_component.c \
oob_ud_module.c \
oob_ud.h \
oob_ud_event.c \
oob_ud_peer.c \
oob_ud_peer.h \
oob_ud_ping.c \
oob_ud_qp.c \
oob_ud_qp.h \
oob_ud_recv.c \
oob_ud_req.c \
oob_ud_req.h \
oob_ud_send.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_oob_ud_DSO
component_noinst =
component_install = mca_oob_ud.la
else
component_noinst = libmca_oob_ud.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_oob_ud_la_SOURCES = $(sources)
mca_oob_ud_la_LDFLAGS = -module -avoid-version -libverbs
noinst_LTLIBRARIES = $(component_noinst)
libmca_oob_ud_la_SOURCES = $(sources)
libmca_oob_ud_la_LDFLAGS = -module -avoid-version

27
orte/mca/oob/ud/configure.m4 Normal file

@@ -0,0 +1,27 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_oob_ud_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_orte_oob_ud_CONFIG],[
AC_CONFIG_FILES([orte/mca/oob/ud/Makefile])
AC_CHECK_HEADER([infiniband/verbs.h])
AC_CHECK_LIB([ibverbs], [ibv_create_qp])
])dnl

18
orte/mca/oob/ud/help-oob-ud.txt Normal file

@@ -0,0 +1,18 @@
# -*- text -*-
#
# Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2006 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

218
orte/mca/oob/ud/oob_ud.h Normal file

@@ -0,0 +1,218 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#if !defined(MCA_OOB_UD_H)
#define MCA_OOB_UD_H
#include "orte_config.h"
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#include "opal/types.h"
#include "orte/types.h"
#include "opal/mca/base/base.h"
#include "opal/class/opal_free_list.h"
#include "opal/class/opal_hash_table.h"
#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"
#include "opal/threads/threads.h"
#include "opal/mca/timer/base/base.h"
#include "opal/include/opal_stdint.h"
#include "opal/mca/memchecker/base/base.h"
#include "orte/mca/oob/oob.h"
#include "orte/mca/oob/base/base.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "math.h"
#include <infiniband/verbs.h>
#include "oob_ud_qp.h"
#include "oob_ud_peer.h"
#include "oob_ud_req.h"
/* Used for valgrind checks */
#ifdef HAVE_VALGRIND
#include <valgrind/memcheck.h>
#else
#define VALGRIND_MAKE_MEM_DEFINED(addr,len)
#endif
BEGIN_C_DECLS
enum {
MCA_OOB_UD_SEND_WR = 0x10000000,
MCA_OOB_UD_RECV_WR = 0x20000000
};
enum {
MCA_OOB_UD_DEBUG_NONE,
MCA_OOB_UD_DEBUG_ALL
};
static inline void mca_oob_ud_fill_send_wr (struct ibv_send_wr *wr, struct ibv_sge *sge,
int num_sge, const mca_oob_ud_peer_t *peer)
{
wr->wr_id = MCA_OOB_UD_SEND_WR;
wr->next = NULL;
wr->sg_list = sge;
wr->num_sge = num_sge;
wr->opcode = IBV_WR_SEND;
wr->send_flags = IBV_SEND_SIGNALED;
wr->wr.ud.ah = peer->peer_ah;
wr->wr.ud.remote_qpn = peer->peer_qpn;
wr->wr.ud.remote_qkey = peer->peer_qkey;
}
static inline void mca_oob_ud_fill_recv_wr (struct ibv_recv_wr *wr, struct ibv_sge *sge,
int num_sge)
{
wr->wr_id = MCA_OOB_UD_RECV_WR;
wr->next = NULL;
wr->sg_list = sge;
wr->num_sge = num_sge;
}
static inline void mca_oob_ud_fill_sge (struct ibv_sge *sge, void *addr,
uint32_t length, uint32_t lkey)
{
sge->addr = (uint64_t)addr;
sge->length = length;
sge->lkey = lkey;
}
struct mca_oob_ud_device_t {
opal_list_item_t super;
struct ibv_context *ib_context;
struct ibv_comp_channel *ib_channel;
struct ibv_pd *ib_pd;
opal_event_t event;
opal_list_t ports;
};
typedef struct mca_oob_ud_device_t mca_oob_ud_device_t;
OBJ_CLASS_DECLARATION(mca_oob_ud_device_t);
/* events */
void mca_oob_ud_event_start_monitor (mca_oob_ud_device_t *device);
void mca_oob_ud_event_stop_monitor (mca_oob_ud_device_t *device);
struct mca_oob_ud_reg_mem_t {
char *ptr;
size_t len;
struct ibv_mr *mr;
};
typedef struct mca_oob_ud_reg_mem_t mca_oob_ud_reg_mem_t;
struct mca_oob_ud_port_t {
opal_list_item_t super;
mca_oob_ud_device_t *device;
mca_oob_ud_qp_t listen_qp;
opal_free_list_t data_qps;
opal_free_list_t free_msgs;
int mtu;
uint16_t lid;
uint8_t port_num;
mca_oob_ud_reg_mem_t grh_buf;
mca_oob_ud_reg_mem_t msg_buf;
};
typedef struct mca_oob_ud_port_t mca_oob_ud_port_t;
OBJ_CLASS_DECLARATION(mca_oob_ud_port_t);
int mca_oob_ud_port_post_one_recv (mca_oob_ud_port_t *port, int msg_num);
void mca_oob_ud_port_get_uri (mca_oob_ud_port_t *port, char *uri);
struct mca_oob_ud_component_t {
mca_oob_base_component_2_0_0_t super; /**< base OOB component */
opal_list_t ud_devices;
opal_list_t ud_pending_recvs;
opal_list_t ud_active_recvs;
opal_list_t ud_active_sends;
opal_list_t ud_unexpected_recvs;
opal_list_t ud_event_queued_reqs;
opal_list_t ud_event_processing_msgs;
opal_list_t ud_completed;
opal_event_t ud_complete_event;
opal_mutex_t ud_lock;
int ud_min_qp;
int ud_max_qp;
int ud_recv_buffer_count;
int ud_send_buffer_count;
opal_mutex_t ud_match_lock;
opal_hash_table_t ud_peers;
};
typedef struct mca_oob_ud_component_t mca_oob_ud_component_t;
ORTE_MODULE_DECLSPEC extern mca_oob_ud_component_t mca_oob_ud_component;
ORTE_MODULE_DECLSPEC extern mca_oob_t mca_oob_ud_module;
char *mca_oob_ud_get_addr (void);
int mca_oob_ud_set_addr (const orte_process_name_t *name, const char *uri);
int mca_oob_ud_ping(const orte_process_name_t* name, const char* uri,
const struct timeval *timeout);
int mca_oob_ud_send_nb(orte_process_name_t* target, orte_process_name_t* origin,
struct iovec* iov, int count, int tag, int flags,
orte_rml_callback_fn_t cbfunc, void* cbdata);
int mca_oob_ud_send_try (mca_oob_ud_req_t *send_req);
int mca_oob_ud_send_complete (mca_oob_ud_req_t *send_req, int rc);
/* recv */
int mca_oob_ud_recv_nb(orte_process_name_t* peer, struct iovec* iov, int count,
int tag, int flags, orte_rml_callback_fn_t cbfunc,
void* cbdata);
int mca_oob_ud_recv_cancel(orte_process_name_t* name, int tag);
int mca_oob_ud_recv_complete (mca_oob_ud_req_t *recv_req);
int mca_oob_ud_recv_try (mca_oob_ud_req_t *recv_req);
int mca_oob_ud_recv_match_send (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer,
mca_oob_ud_msg_hdr_t *msg_hdr, mca_oob_ud_req_t **reqp);
int mca_oob_ud_recv_match (mca_oob_ud_req_t *recv_req);
int mca_oob_ud_get_recv_req (const orte_process_name_t name, const int tag, mca_oob_ud_req_t **reqp);
int mca_oob_ud_ft_event(int state);
int mca_oob_ud_register_iov (struct iovec *iov, int count, struct ibv_mr **ib_mr,
struct ibv_pd *ib_pd, unsigned int mtu, int *sge_countp,
int *wr_countp, int *data_lenp);
void mca_oob_ud_event_queue_completed (mca_oob_ud_req_t *req);
END_C_DECLS
#endif

257
orte/mca/oob/ud/oob_ud_component.c Normal file

@@ -0,0 +1,257 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "orte_config.h"
#include "orte/types.h"
#include "opal/types.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/proc_info.h"
#include "oob_ud.h"
static int mca_oob_ud_component_open (void);
static int mca_oob_ud_component_close (void);
static int mca_oob_ud_component_register (void);
static mca_oob_t *mca_oob_ud_component_init (int *priority);
mca_oob_ud_component_t mca_oob_ud_component = {
{
{
MCA_OOB_BASE_VERSION_2_0_0,
"ud", /* MCA module name */
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
mca_oob_ud_component_open,
mca_oob_ud_component_close,
NULL, /* component query */
mca_oob_ud_component_register
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
mca_oob_ud_component_init
}
};
static int mca_oob_ud_component_open (void)
{
OBJ_CONSTRUCT(&mca_oob_ud_component.ud_devices, opal_list_t);
OBJ_CONSTRUCT(&mca_oob_ud_component.ud_active_sends, opal_list_t);
OBJ_CONSTRUCT(&mca_oob_ud_component.ud_pending_recvs, opal_list_t);
OBJ_CONSTRUCT(&mca_oob_ud_component.ud_active_recvs, opal_list_t);
OBJ_CONSTRUCT(&mca_oob_ud_component.ud_event_queued_reqs, opal_list_t);
OBJ_CONSTRUCT(&mca_oob_ud_component.ud_unexpected_recvs, opal_list_t);
OBJ_CONSTRUCT(&mca_oob_ud_component.ud_completed, opal_list_t);
OBJ_CONSTRUCT(&mca_oob_ud_component.ud_event_processing_msgs, opal_list_t);
OBJ_CONSTRUCT(&mca_oob_ud_component.ud_lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_oob_ud_component.ud_match_lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_oob_ud_component.ud_peers, opal_hash_table_t);
return ORTE_SUCCESS;
}
static int mca_oob_ud_component_close (void)
{
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:component_close entering",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
OBJ_DESTRUCT(&mca_oob_ud_component.ud_devices);
OBJ_DESTRUCT(&mca_oob_ud_component.ud_active_sends);
OBJ_DESTRUCT(&mca_oob_ud_component.ud_pending_recvs);
OBJ_DESTRUCT(&mca_oob_ud_component.ud_active_recvs);
OBJ_DESTRUCT(&mca_oob_ud_component.ud_event_queued_reqs);
OBJ_DESTRUCT(&mca_oob_ud_component.ud_unexpected_recvs);
OBJ_DESTRUCT(&mca_oob_ud_component.ud_lock);
OBJ_DESTRUCT(&mca_oob_ud_component.ud_match_lock);
OBJ_DESTRUCT(&mca_oob_ud_component.ud_peers);
OBJ_DESTRUCT(&mca_oob_ud_component.ud_completed);
OBJ_DESTRUCT(&mca_oob_ud_component.ud_event_processing_msgs);
return ORTE_SUCCESS;
}
static int mca_oob_ud_component_register (void)
{
mca_base_param_reg_int (&mca_oob_ud_component.super.oob_base,
"min_qp", "Minimum number of UD queue pairs "
"to allocate (default: 8)", false, false,
8, &mca_oob_ud_component.ud_min_qp);
mca_base_param_reg_int (&mca_oob_ud_component.super.oob_base,
"max_qp", "Maximum number of UD queue pairs "
"to allocate (default: 32)", false, false,
32, &mca_oob_ud_component.ud_max_qp);
mca_base_param_reg_int (&mca_oob_ud_component.super.oob_base,
"recv_buffers", "Number of MTU sized recv "
"buffers to post (default: 512)", false, false,
512, &mca_oob_ud_component.ud_recv_buffer_count);
mca_base_param_reg_int (&mca_oob_ud_component.super.oob_base,
"send_buffers", "Number of MTU sized sent "
"buffers to post (default: 512)", false, false,
512, &mca_oob_ud_component.ud_send_buffer_count);
return ORTE_SUCCESS;
}
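/* Note (not part of this commit): the parameters above are registered against
* the "ud" component of the oob framework, so with Open MPI's usual
* <framework>_<component>_<param> naming they should surface as oob_ud_min_qp,
* oob_ud_max_qp, oob_ud_recv_buffers and oob_ud_send_buffers, e.g.
* "mpirun -mca oob_ud_recv_buffers 1024 ..." to post more receive buffers. */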
static int port_mtus[] = {0, 256, 512, 1024, 2048, 4096};
static inline int mca_oob_ud_port_setup (mca_oob_ud_port_t *port)
{
int rc;
struct ibv_port_attr port_attr;
rc = ibv_query_port (port->device->ib_context, port->port_num, &port_attr);
if (0 != rc || IBV_PORT_ACTIVE != port_attr.state || 0 == port_attr.lid) {
/* skip this port */
return ORTE_ERROR;
}
port->lid = port_attr.lid;
port->mtu = port_attr.active_mtu > IBV_MTU_4096 ? 2048 : port_mtus[port_attr.active_mtu];
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:port_setup found port: num = %u,"
"lid = %u, mtu = %u", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
port->port_num, port->lid, port->mtu));
return rc;
}
static inline int mca_oob_ud_device_setup (mca_oob_ud_device_t *device,
struct ibv_device *ib_device)
{
int rc, port_num;
struct ibv_device_attr dev_attr;
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:device_setup attempting to setup ib device %p",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) ib_device));
device->ib_context = ibv_open_device (ib_device);
if (NULL == device->ib_context) {
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:device_setup error opening device. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno));
return ORTE_ERROR;
}
rc = ibv_query_device (device->ib_context, &dev_attr);
if (0 != rc) {
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:device_setup error querying device. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno));
return ORTE_ERROR;
}
device->ib_channel = ibv_create_comp_channel (device->ib_context);
if (NULL == device->ib_channel) {
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:device_setup error completing completion channel."
"errno = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno));
return ORTE_ERROR;
}
device->ib_pd = ibv_alloc_pd (device->ib_context);
if (NULL == device->ib_pd) {
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:device_setup error allocating protection domain."
"errno = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno));
return ORTE_ERROR;
}
for (port_num = 1 ; port_num <= dev_attr.phys_port_cnt ; ++port_num) {
mca_oob_ud_port_t *port = OBJ_NEW(mca_oob_ud_port_t);
if (NULL == port) {
opal_output (0, "oob:ud:device_setup malloc failure. errno = %d", errno);
return ORTE_ERR_OUT_OF_RESOURCE;
}
port->device = device;
port->port_num = port_num;
rc = mca_oob_ud_port_setup (port);
if (ORTE_SUCCESS != rc) {
OBJ_RELEASE(port);
continue;
}
opal_list_append (&device->ports, (opal_list_item_t *) port);
break;
}
if (0 == opal_list_get_size(&device->ports)) {
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:device_setup could not init device. no usable "
"ports present", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
return ORTE_ERROR;
}
return ORTE_SUCCESS;
}
static mca_oob_t *mca_oob_ud_component_init(int *priority)
{
struct ibv_device **devices;
int num_devices, i, rc;
*priority = 0;
opal_hash_table_init (&mca_oob_ud_component.ud_peers, 1024);
devices = ibv_get_device_list (&num_devices);
if (NULL == devices || 0 == num_devices) {
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:component_init no devices found",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
return NULL;
}
for (i = 0 ; i < num_devices ; ++i) {
mca_oob_ud_device_t *device = OBJ_NEW(mca_oob_ud_device_t);
if (NULL == device) {
opal_output (0, "oob:ud:component_init malloc failure. errno = %d",
errno);
return NULL;
}
rc = mca_oob_ud_device_setup (device, devices[i]);
if (ORTE_SUCCESS != rc) {
OBJ_RELEASE(device);
continue;
}
opal_list_append (&mca_oob_ud_component.ud_devices,
(opal_list_item_t *) device);
/* NTH: support only 1 device for now */
break;
}
ibv_free_device_list (devices);
if (0 == opal_list_get_size (&mca_oob_ud_component.ud_devices)) {
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:component_init no usable devices found.",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
return NULL;
}
return &mca_oob_ud_module;
}

588
orte/mca/oob/ud/oob_ud_event.c Normal file

@@ -0,0 +1,588 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "oob_ud.h"
#define min(a,b) ((a) < (b) ? (a) : (b))
static int mca_oob_ud_event_send_ack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg);
static int mca_oob_ud_event_send_nack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg);
static int mca_oob_ud_event_handle_ack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer,
mca_oob_ud_msg_hdr_t *msg_hdr);
static int mca_oob_ud_event_handle_nack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer,
mca_oob_ud_msg_hdr_t *msg_hdr);
static int mca_oob_ud_event_handle_completion (mca_oob_ud_port_t *port, mca_oob_ud_msg_hdr_t *msg);
static int mca_oob_ud_event_handle_data_ok (mca_oob_ud_port_t *port, mca_oob_ud_msg_hdr_t *msg);
static int mca_oob_ud_event_handle_req (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg_hdr);
static int mca_oob_ud_event_handle_rep (mca_oob_ud_port_t *port, mca_oob_ud_msg_hdr_t *msg);
static int mca_oob_ud_event_handle_end (mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg_hdr);
static void *mca_oob_ud_event_dispatch(int fd, int flags, void *context);
static void *mca_oob_ud_complete_dispatch(int fd, int flags, void *context);
static void mca_oob_ud_stop_events(mca_oob_ud_device_t *device);
static inline opal_list_item_t *mca_oob_ud_list_get_first (opal_list_t *list)
{
return (opal_list_get_size (list) == 0) ? NULL : opal_list_get_first (list);
}
static inline opal_list_item_t *mca_oob_ud_list_get_next (opal_list_t *list, opal_list_item_t *item)
{
opal_list_item_t *next = opal_list_get_next (item);
return (opal_list_get_end(list) == next) ? NULL : next;
}
static bool event_started = false;
void mca_oob_ud_event_start_monitor (mca_oob_ud_device_t *device)
{
if (!event_started) {
opal_progress_event_users_increment ();
opal_event_set (opal_event_base, &device->event, device->ib_channel->fd,
OPAL_EV_READ, mca_oob_ud_event_dispatch, (void *) device);
opal_event_add (&device->event, NULL);
event_started = true;
}
}
void mca_oob_ud_event_stop_monitor (mca_oob_ud_device_t *device)
{
if (event_started) {
opal_progress_event_users_decrement ();
opal_event_del (&device->event);
mca_oob_ud_stop_events (device);
event_started = false;
}
}
struct mca_oob_ud_msg_item_t {
opal_list_item_t super;
mca_oob_ud_msg_hdr_t *hdr;
mca_oob_ud_port_t *port;
mca_oob_ud_peer_t *peer;
int msg_num;
};
typedef struct mca_oob_ud_msg_item_t mca_oob_ud_msg_item_t;
OBJ_CLASS_DECLARATION(mca_oob_ud_msg_item_t);
static void mca_oob_ud_msg_item_construct (mca_oob_ud_msg_item_t *item)
{
memset ((char *) item + sizeof (item->super), 0, sizeof (*item) - sizeof (item->super));
}
static void mca_oob_ud_msg_item_destruct (mca_oob_ud_msg_item_t *item)
{
if (item->hdr) {
/* repost the receive request */
mca_oob_ud_port_post_one_recv (item->port, item->msg_num);
}
}
OBJ_CLASS_INSTANCE(mca_oob_ud_msg_item_t, opal_list_item_t,
mca_oob_ud_msg_item_construct,
mca_oob_ud_msg_item_destruct);
static int mca_oob_ud_msg_item_cmp (opal_list_item_t **a, opal_list_item_t **b)
{
mca_oob_ud_msg_item_t *aitem = *((mca_oob_ud_msg_item_t **) a);
mca_oob_ud_msg_item_t *bitem = *((mca_oob_ud_msg_item_t **) b);
if (aitem->peer == bitem->peer) {
return (aitem->hdr->msg_id > bitem->hdr->msg_id ? 1 : -1);
} else {
return (aitem->peer > bitem->peer) ? 1 : -1;
}
}
static int mca_oob_ud_process_messages (struct ibv_cq *event_cq, mca_oob_ud_port_t *port)
{
mca_oob_ud_msg_item_t *msg_item, *next_item;
opal_list_t *processing_msgs = &mca_oob_ud_component.ud_event_processing_msgs;
mca_oob_ud_peer_t *peer;
mca_oob_ud_msg_hdr_t *msg_hdr;
int msg_num, i, count;
struct ibv_wc wc[40];
bool peer_nacked;
count = ibv_poll_cq (event_cq, 40, wc);
if (count < 0)
return count;
/* acknowledge the events */
ibv_ack_cq_events (event_cq, count);
for (i = 0 ; i < count ; ++i) {
msg_num = (int)(wc[i].wr_id & (~MCA_OOB_UD_RECV_WR));
msg_hdr = (mca_oob_ud_msg_hdr_t *) (port->msg_buf.ptr + msg_num * port->mtu);
VALGRIND_MAKE_MEM_DEFINED(msg_hdr, wc[i].byte_len);
if (!(wc[i].wr_id & MCA_OOB_UD_RECV_WR) || IBV_WC_SUCCESS != wc[i].status) {
mca_oob_ud_port_post_one_recv (port, msg_num);
continue;
}
peer = mca_oob_ud_get_peer (port, &msg_hdr->ra.name, wc[i].src_qp, msg_hdr->ra.qkey,
wc[i].slid, msg_hdr->ra.port_num);
if (peer) {
if (MCA_OOB_UD_MSG_ACK != msg_hdr->msg_type && MCA_OOB_UD_MSG_NACK != msg_hdr->msg_type &&
MCA_OOB_UD_MSG_END != msg_hdr->msg_type) {
mca_oob_ud_msg_item_t *msg_item = OBJ_NEW(mca_oob_ud_msg_item_t);
msg_item->msg_num = msg_num;
msg_item->hdr = msg_hdr;
msg_item->port = port;
msg_item->peer = peer;
opal_list_append (processing_msgs, (opal_list_item_t *) msg_item);
} else {
if (MCA_OOB_UD_MSG_ACK == msg_hdr->msg_type) {
(void) mca_oob_ud_event_handle_ack (port, peer, msg_hdr);
} else if (MCA_OOB_UD_MSG_NACK == msg_hdr->msg_type) {
(void) mca_oob_ud_event_handle_nack (port, peer, msg_hdr);
} else {
mca_oob_ud_event_handle_end (peer, msg_hdr);
}
mca_oob_ud_port_post_one_recv (port, msg_num);
}
} else {
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:process_message got a null peer for message id %"
PRIu64, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr->msg_id));
mca_oob_ud_port_post_one_recv (port, msg_num);
}
}
/* Sort messages by peer then id */
opal_list_sort (processing_msgs, mca_oob_ud_msg_item_cmp);
/* Send ACKs/NACKs and throw away out-of-order messages */
msg_item = (mca_oob_ud_msg_item_t *) mca_oob_ud_list_get_first (processing_msgs);
for (peer = NULL, peer_nacked = false ; NULL != msg_item ; msg_item = next_item) {
if (peer != msg_item->peer) {
peer_nacked = false;
}
peer = msg_item->peer;
next_item = (mca_oob_ud_msg_item_t *) mca_oob_ud_list_get_next (processing_msgs,
(opal_list_item_t *)msg_item);
if (false == peer_nacked) {
if (msg_item->hdr->msg_id > peer->peer_expected_id) {
(void) mca_oob_ud_event_send_nack (msg_item->port, peer, msg_item->hdr);
peer_nacked = true;
} else if (NULL == next_item || (next_item->peer != msg_item->peer)) {
(void) mca_oob_ud_event_send_ack (msg_item->port, msg_item->peer, msg_item->hdr);
}
}
if (msg_item->hdr->msg_id != peer->peer_expected_id) {
opal_list_remove_item (processing_msgs, (opal_list_item_t *) msg_item);
OBJ_RELEASE(msg_item);
} else {
peer->peer_expected_id++;
}
}
/* Process remaining messages */
while (NULL !=
(msg_item = (mca_oob_ud_msg_item_t *) opal_list_remove_first (processing_msgs))) {
switch (msg_item->hdr->msg_type) {
case MCA_OOB_UD_MSG_REQUEST:
mca_oob_ud_event_handle_req (port, msg_item->peer, msg_item->hdr);
break;
case MCA_OOB_UD_MSG_REPLY:
mca_oob_ud_event_handle_rep (port, msg_item->hdr);
break;
case MCA_OOB_UD_MSG_COMPLETE:
mca_oob_ud_event_handle_completion (port, msg_item->hdr);
break;
case MCA_OOB_UD_MSG_DATA_OK:
mca_oob_ud_event_handle_data_ok (port, msg_item->hdr);
break;
case MCA_OOB_UD_MSG_END:
mca_oob_ud_event_handle_end (peer, msg_item->hdr);
break;
default:
/* do nothing */
break;
}
OBJ_RELEASE(msg_item);
}
return count;
}
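/* Worked example (illustrative, not part of the commit): suppose messages 4-7
* are in flight and the receiver, with peer_expected_id = 4, sees ids {4, 5, 7}
* in one batch because 6 was dropped. After sorting, 4 and 5 match the expected
* id and are kept (the expected id advances to 6); 7 is ahead of the window, so
* a NACK carrying msg_id 6 is sent, 7 is discarded and its receive buffer is
* reposted. On the sender, mca_oob_ud_event_handle_nack completes every
* in-flight message below id 6 and reposts the rest, so 6 and 7 go out again in
* order. Had {4, 5, 6} arrived intact, a single ACK for id 6 would have been
* sent instead. */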
static int mca_oob_ud_event_handle_ack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer,
mca_oob_ud_msg_hdr_t *msg_hdr)
{
mca_oob_ud_msg_t *msg;
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:event_handle_ack got ack for msg id %" PRIu64
" from peer %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr->msg_id,
ORTE_NAME_PRINT(&peer->peer_name)));
OPAL_THREAD_LOCK(&peer->peer_lock);
mca_oob_ud_peer_stop_timer (peer);
while (NULL !=
(msg = (mca_oob_ud_msg_t *) mca_oob_ud_list_get_first (&peer->peer_flying_messages))) {
if (msg->hdr->msg_id > msg_hdr->msg_id) {
break;
}
(void) opal_list_remove_first (&peer->peer_flying_messages);
(void) mca_oob_ud_msg_status_update (msg, MCA_OOB_UD_MSG_STATUS_COMPLETE);
}
mca_oob_ud_peer_start_timer (peer);
OPAL_THREAD_UNLOCK(&peer->peer_lock);
return ORTE_SUCCESS;
}
static int mca_oob_ud_event_handle_nack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer,
mca_oob_ud_msg_hdr_t *msg_hdr)
{
mca_oob_ud_msg_t *msg;
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:event_handle_nack got nack for msg id %" PRIu64
" from peer %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr->msg_id,
ORTE_NAME_PRINT(&peer->peer_name)));
OPAL_THREAD_LOCK(&peer->peer_lock);
mca_oob_ud_peer_stop_timer (peer);
while (NULL !=
(msg = (mca_oob_ud_msg_t *) mca_oob_ud_list_get_first (&peer->peer_flying_messages))) {
if (msg->hdr->msg_id >= msg_hdr->msg_id) {
break;
}
(void) opal_list_remove_first (&peer->peer_flying_messages);
(void) mca_oob_ud_msg_status_update (msg, MCA_OOB_UD_MSG_STATUS_COMPLETE);
}
/* repost remaining messages */
mca_oob_ud_peer_post_all (peer);
/* reset and start the timer */
mca_oob_ud_peer_reset_timer (peer);
mca_oob_ud_peer_start_timer (peer);
OPAL_THREAD_UNLOCK(&peer->peer_lock);
return ORTE_SUCCESS;
}
static int mca_oob_ud_event_handle_end (mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg_hdr)
{
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:event_handle_end got end message from peer %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer->peer_name)));
mca_oob_ud_peer_lost (peer);
return ORTE_SUCCESS;
}
static int mca_oob_ud_event_send_ack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg_hdr)
{
mca_oob_ud_msg_hdr_t tmp_hdr;
int rc = ORTE_SUCCESS;
struct ibv_send_wr wr;
struct ibv_sge sge;
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:event_send_ack sending ack for message id %"
PRIu64 " peer = %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr->msg_id,
ORTE_NAME_PRINT(&peer->peer_name)));
/* reuse registered buffer to send ack (just need to change the type/return address) */
memcpy (&tmp_hdr, msg_hdr, sizeof (tmp_hdr));
msg_hdr->msg_type = MCA_OOB_UD_MSG_ACK;
/* set return address */
msg_hdr->ra.qkey = 0;
msg_hdr->ra.name = *ORTE_PROC_MY_NAME;
msg_hdr->ra.port_num = port->port_num;
mca_oob_ud_fill_sge (&sge, msg_hdr, sizeof (*msg_hdr), port->msg_buf.mr->lkey);
mca_oob_ud_fill_send_wr (&wr, &sge, 1, peer);
rc = mca_oob_ud_qp_post_send (&port->listen_qp, &wr, 1);
if (ORTE_SUCCESS != rc) {
opal_output (0, "oob:ud:event_send_ack error posting ack!");
return rc;
}
memcpy (msg_hdr, &tmp_hdr, sizeof (tmp_hdr));
return ORTE_SUCCESS;
}
static int mca_oob_ud_event_send_nack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg_hdr)
{
mca_oob_ud_msg_hdr_t tmp_hdr;
int rc = ORTE_SUCCESS;
struct ibv_send_wr wr;
struct ibv_sge sge;
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:event_send_nack sending nack for message id %"
PRIu64 " peer = %s. msg_id = %" PRIu64, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
peer->peer_expected_id, ORTE_NAME_PRINT(&peer->peer_name), msg_hdr->msg_id));
/* reuse registered buffer to send the nack (just need to change the type/return address) */
memcpy (&tmp_hdr, msg_hdr, sizeof (tmp_hdr));
msg_hdr->msg_type = MCA_OOB_UD_MSG_NACK;
/* set return address */
msg_hdr->ra.qkey = 0;
msg_hdr->ra.name = *ORTE_PROC_MY_NAME;
msg_hdr->ra.port_num = port->port_num;
msg_hdr->msg_id = peer->peer_expected_id;
mca_oob_ud_fill_sge (&sge, msg_hdr, sizeof (*msg_hdr), port->msg_buf.mr->lkey);
mca_oob_ud_fill_send_wr (&wr, &sge, 1, peer);
rc = mca_oob_ud_qp_post_send (&port->listen_qp, &wr, 1);
if (ORTE_SUCCESS != rc) {
opal_output (0, "oob:ud:event_send_ack error posting nack!");
return rc;
}
memcpy (msg_hdr, &tmp_hdr, sizeof (tmp_hdr));
return ORTE_SUCCESS;
}
void mca_oob_ud_event_queue_completed (mca_oob_ud_req_t *req)
{
struct timeval now = {0, 0};
mca_oob_ud_req_append_to_list (req, &mca_oob_ud_component.ud_event_queued_reqs);
if (!opal_event_evtimer_pending (&mca_oob_ud_component.ud_complete_event, &now)) {
opal_event_evtimer_set (opal_event_base, &mca_oob_ud_component.ud_complete_event,
mca_oob_ud_complete_dispatch, NULL);
opal_event_add (&mca_oob_ud_component.ud_complete_event, &now);
}
}
static int mca_oob_ud_event_handle_completion (mca_oob_ud_port_t *port, mca_oob_ud_msg_hdr_t *msg_hdr)
{
mca_oob_ud_req_t *recv_req = msg_hdr->msg_lcl_ctx;
bool brc;
if (NULL == recv_req) {
return ORTE_ERROR;
}
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:event_handle_completion got "
"completion message for request %p", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(void *) recv_req));
brc = mca_oob_ud_req_is_in_list (recv_req, &mca_oob_ud_component.ud_active_recvs);
if (false == brc) {
/* duplicate completion message? */
OPAL_OUTPUT_VERBOSE((0, mca_oob_base_output, "%s oob:ud:event_handle_completion apparent duplicate completion. "
"request %p. req list = %p", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) recv_req,
(void *) recv_req->req_list));
return ORTE_SUCCESS;
}
recv_req->state = MCA_OOB_UD_REQ_COMPLETE;
mca_oob_ud_event_queue_completed (recv_req);
return ORTE_SUCCESS;
}
static int mca_oob_ud_event_handle_data_ok (mca_oob_ud_port_t *port, mca_oob_ud_msg_hdr_t *msg_hdr)
{
mca_oob_ud_req_t *send_req = msg_hdr->msg_lcl_ctx;
bool brc;
if (NULL == send_req) {
/* ack! */
return ORTE_ERROR;
}
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:event_handle_data_ok got data ok message for "
"request %p", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) send_req));
brc = mca_oob_ud_req_is_in_list (send_req, &mca_oob_ud_component.ud_active_sends);
if (false == brc) {
OPAL_OUTPUT_VERBOSE((0, mca_oob_base_output, "%s oob:ud:event_handle_data_ok apparent duplicate data ok. "
"request %p. req list = %p", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) send_req,
(void *) send_req->req_list));
/* duplicate data ok message? */
return ORTE_SUCCESS;
}
send_req->state = MCA_OOB_UD_REQ_COMPLETE;
mca_oob_ud_event_queue_completed (send_req);
return ORTE_SUCCESS;
}
static int mca_oob_ud_event_handle_req (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg_hdr)
{
mca_oob_ud_req_t *recv_req;
int rc;
rc = mca_oob_ud_recv_match_send (port, peer, msg_hdr, &recv_req);
if (ORTE_SUCCESS == rc) {
mca_oob_ud_event_queue_completed (recv_req);
}
return rc;
}
static int mca_oob_ud_event_handle_rep (mca_oob_ud_port_t *port, mca_oob_ud_msg_hdr_t *msg_hdr)
{
mca_oob_ud_req_t *send_req = (mca_oob_ud_req_t *) msg_hdr->msg_lcl_ctx;
bool brc;
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:event_handle_rep got reply for request %p",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) send_req));
brc = mca_oob_ud_req_is_in_list (send_req, &mca_oob_ud_component.ud_active_sends);
if (false == brc) {
OPAL_OUTPUT_VERBOSE((0, mca_oob_base_output, "%s oob:ud:event_handle_rep no send matches reply",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* duplicate reply message? */
return ORTE_SUCCESS;
}
send_req->req_mtu = min(send_req->req_mtu, msg_hdr->msg_data.rep.mtu);
send_req->req_rem_data_len = msg_hdr->msg_data.rep.data_len;
send_req->req_rem_ctx = msg_hdr->msg_rem_ctx;
send_req->req_rem_qpn = msg_hdr->msg_data.rep.qpn;
mca_oob_ud_event_queue_completed (send_req);
return ORTE_SUCCESS;
}
static void *mca_oob_ud_event_dispatch(int fd, int flags, void *context)
{
int rc;
mca_oob_ud_device_t *device = (mca_oob_ud_device_t *) context;
mca_oob_ud_port_t *port = NULL;
struct ibv_cq *event_cq = NULL;
void *event_context = NULL;
do {
rc = ibv_get_cq_event (device->ib_channel, &event_cq, &event_context);
} while (rc && errno == EINTR);
if (NULL == event_cq) {
/* re-arm the event */
opal_event_add (&device->event, NULL); /* port is not known yet in this path */
return NULL;
}
port = (mca_oob_ud_port_t *) event_context;
rc = mca_oob_ud_process_messages (event_cq, port);
if (rc < 0) {
opal_output (0, "%s oob:ud:event_dispatch error processing messages",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
return NULL;
}
if (ibv_req_notify_cq(event_cq, 0)) {
opal_output (0, "%s oob:ud:event_dispatch error asking for cq notifications",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
/* re-arm the event */
opal_event_add (&port->device->event, NULL);
return NULL;
}
static void *mca_oob_ud_complete_dispatch(int fd, int flags, void *context)
{
mca_oob_ud_req_t *req;
OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_match_lock);
while (NULL !=
(req = (mca_oob_ud_req_t *) opal_list_remove_first (&mca_oob_ud_component.ud_event_queued_reqs))) {
OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_match_lock);
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:event_process processing request %p",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) req));
req->req_list = NULL;
switch (req->type) {
case MCA_OOB_UD_REQ_RECV:
case MCA_OOB_UD_REQ_UNEX:
if (req->state == MCA_OOB_UD_REQ_COMPLETE) {
mca_oob_ud_recv_complete (req);
} else {
mca_oob_ud_req_append_to_list (req, &mca_oob_ud_component.ud_active_recvs);
mca_oob_ud_recv_try (req);
}
break;
case MCA_OOB_UD_REQ_SEND:
if (req->state == MCA_OOB_UD_REQ_COMPLETE) {
mca_oob_ud_send_complete (req, ORTE_SUCCESS);
} else {
mca_oob_ud_req_append_to_list (req, &mca_oob_ud_component.ud_active_sends);
mca_oob_ud_send_try (req);
}
break;
default:
break;
}
OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_match_lock);
}
return NULL;
}
static void mca_oob_ud_stop_events (mca_oob_ud_device_t *device)
{
opal_list_item_t *item;
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:stop_events stopping event processing",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
for (item = opal_list_get_first (&device->ports) ;
item != opal_list_get_end (&device->ports) ;
item = opal_list_get_next (item)) {
mca_oob_ud_port_t *port = (mca_oob_ud_port_t *) item;
/* flush all receives */
mca_oob_ud_qp_to_reset (&port->listen_qp);
}
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:stop_events events stopped",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
}

536
orte/mca/oob/ud/oob_ud_module.c Normal file

@@ -0,0 +1,536 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "orte_config.h"
#include "orte/types.h"
#include "opal/types.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/proc_info.h"
#include "orte/mca/routed/routed.h"
#include "oob_ud.h"
#define min(a,b) ((a) < (b) ? (a) : (b))
static int mca_oob_ud_module_init (void);
static int mca_oob_ud_module_fini (void);
mca_oob_t mca_oob_ud_module = {
mca_oob_ud_module_init,
mca_oob_ud_module_fini,
mca_oob_ud_get_addr,
mca_oob_ud_set_addr,
mca_oob_ud_ping,
mca_oob_ud_send_nb,
mca_oob_ud_recv_nb,
mca_oob_ud_recv_cancel,
mca_oob_ud_ft_event
};
void mca_oob_ud_device_construct (mca_oob_ud_device_t *device);
void mca_oob_ud_device_destruct (mca_oob_ud_device_t *device);
OBJ_CLASS_INSTANCE(mca_oob_ud_device_t, opal_list_item_t,
mca_oob_ud_device_construct,
mca_oob_ud_device_destruct);
void mca_oob_ud_port_construct (mca_oob_ud_port_t *port);
void mca_oob_ud_port_destruct (mca_oob_ud_port_t *port);
OBJ_CLASS_INSTANCE(mca_oob_ud_port_t, opal_list_item_t,
mca_oob_ud_port_construct,
mca_oob_ud_port_destruct);
/* uri must be at least 27 bytes in size */
void mca_oob_ud_port_get_uri (mca_oob_ud_port_t *port, char *uri)
{
sprintf (uri, "ud://%u.%u.%u", port->listen_qp.ib_qp->qp_num,
port->lid, port->port_num);
}
char *mca_oob_ud_get_addr (void)
{
/* NTH: qp_num - 32 bits (10), lid - 16 bits (5), port - 8 bits (3) + ud:// + 3 .'s + \0 = 27 chars */
char *contact_info = (char *) calloc(opal_list_get_size(&mca_oob_ud_component.ud_devices) * 27, 1);
char *ptr = contact_info;
opal_list_item_t *item, *port_item;
*ptr = 0;
for (item = opal_list_get_first (&mca_oob_ud_component.ud_devices) ;
item != opal_list_get_end (&mca_oob_ud_component.ud_devices) ;
item = opal_list_get_next (item)) {
mca_oob_ud_device_t *device = (mca_oob_ud_device_t *) item;
for (port_item = opal_list_get_first (&device->ports);
port_item != opal_list_get_end (&device->ports);
port_item = opal_list_get_next (port_item)) {
if (ptr != contact_info) {
ptr += sprintf (ptr, ";");
}
mca_oob_ud_port_get_uri ((mca_oob_ud_port_t *) port_item, ptr);
ptr += strlen (ptr);
}
}
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:get_addr contact information: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), contact_info));
return contact_info;
}
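/* Illustration (assumed values, not from the commit): a process with a single
* usable port might advertise "ud://4196.33.1", i.e. qp_num.lid.port_num as
* written by mca_oob_ud_port_get_uri above; entries for additional ports are
* joined with ';'. Each entry needs at most 25 characters ("ud://" + 10-digit
* qp_num + '.' + 5-digit lid + '.' + 3-digit port) plus a ';' or terminating
* NUL, which fits the 27 bytes per port allocated above. */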
int mca_oob_ud_set_addr (const orte_process_name_t *name, const char *uri)
{
mca_oob_ud_peer_t *peer = NULL;
int rc;
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:set_addr: setting location for peer %s from %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(name), uri));
OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_lock);
(void) mca_oob_ud_peer_lookup (name, &peer);
if (NULL == uri) {
if (NULL != peer) {
mca_oob_ud_peer_release (peer);
}
peer = NULL;
} else if (NULL == peer) {
peer = mca_oob_ud_peer_from_uri (uri);
if (NULL == peer) {
return ORTE_ERR_BAD_PARAM;
}
} else {
rc = mca_oob_ud_peer_update_with_uri (peer, uri);
if (ORTE_SUCCESS != rc) {
return rc;
}
}
if (NULL != peer) {
peer->peer_name = *name;
peer->needs_notification = true;
}
opal_hash_table_set_value_uint64(&mca_oob_ud_component.ud_peers,
orte_util_hash_name(name),
(void *)peer);
OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_lock);
return ORTE_SUCCESS;
}
static int mca_oob_ud_listen_create (mca_oob_ud_port_t *port) {
return mca_oob_ud_qp_init (&port->listen_qp, port, port->device->ib_channel, NULL, false);
}
/* mca_oob_ud_listen_destroy:
*
* Destroy the listen queue pair associated with a port.
*/
static int mca_oob_ud_listen_destroy (mca_oob_ud_port_t *port)
{
if (NULL == port || NULL == port->listen_qp.ib_qp) {
return ORTE_SUCCESS;
}
OBJ_DESTRUCT(&port->listen_qp);
return ORTE_SUCCESS;
}
static inline int mca_oob_ud_port_recv_start (mca_oob_ud_port_t *port)
{
int i, rc;
rc = mca_oob_ud_qp_to_rts (&port->listen_qp);
if (ORTE_SUCCESS != rc) {
return rc;
}
OPAL_OUTPUT_VERBOSE((1, mca_oob_base_output, "%s oob:ud:port_recv_start posting "
"%d message buffers", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
mca_oob_ud_component.ud_recv_buffer_count));
for (i = 0 ; i < mca_oob_ud_component.ud_recv_buffer_count ; ++i) {
rc = mca_oob_ud_port_post_one_recv (port, i);
if (ORTE_SUCCESS != rc) {
return rc;
}
}
rc = ibv_req_notify_cq (port->listen_qp.ib_recv_cq, 0);
if (0 != rc) {
opal_output (0, "%s oob:ud:port_recv_start error requesting completion"
"notifications. rc = %d, errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc, errno);
return ORTE_ERROR;
}
return ORTE_SUCCESS;
}
static inline int mca_oob_ud_alloc_reg_mem (struct ibv_pd *pd, mca_oob_ud_reg_mem_t *reg_mem,
const int buffer_len)
{
reg_mem->len = buffer_len;
reg_mem->ptr = NULL;
reg_mem->mr = NULL;
OPAL_OUTPUT_VERBOSE ((5, mca_oob_base_output, "%s oob:ud:alloc_reg_mem allocating and "
"registering %d bytes of memory with pd %p",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), buffer_len, (void *) pd));
posix_memalign ((void **)&reg_mem->ptr, sysconf(_SC_PAGESIZE), buffer_len);
if (NULL == reg_mem->ptr) {
opal_output (0, "%s oob:ud:alloc_reg_mem malloc failed! errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
return ORTE_ERR_OUT_OF_RESOURCE;
}
memset (reg_mem->ptr, 0, buffer_len);
reg_mem->mr = ibv_reg_mr (pd, reg_mem->ptr, buffer_len, IBV_ACCESS_LOCAL_WRITE);
if (NULL == reg_mem->mr) {
opal_output (0, "%s oob:ud:alloc_reg_mem failed to register memory. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
return ORTE_ERROR;
}
return ORTE_SUCCESS;
}
static inline void mca_oob_ud_free_reg_mem (mca_oob_ud_reg_mem_t *reg_mem)
{
if (reg_mem->mr) {
(void) ibv_dereg_mr (reg_mem->mr);
}
if (reg_mem->ptr) {
free (reg_mem->ptr);
}
memset (reg_mem, 0, sizeof (mca_oob_ud_reg_mem_t));
}
static int mca_oob_ud_port_alloc_buffers (mca_oob_ud_port_t *port) {
int total_buffer_count = mca_oob_ud_component.ud_recv_buffer_count +
mca_oob_ud_component.ud_send_buffer_count;
opal_list_item_t *item;
int rc, i;
rc = mca_oob_ud_alloc_reg_mem (port->device->ib_pd, &port->grh_buf,
mca_oob_ud_component.ud_recv_buffer_count * sizeof (struct ibv_grh));
if (ORTE_SUCCESS != rc) {
return rc;
}
rc = mca_oob_ud_alloc_reg_mem (port->device->ib_pd, &port->msg_buf,
total_buffer_count * port->mtu);
if (ORTE_SUCCESS != rc) {
return rc;
}
rc = opal_free_list_init (&port->free_msgs, sizeof (mca_oob_ud_msg_t),
OBJ_CLASS(mca_oob_ud_msg_t), mca_oob_ud_component.ud_send_buffer_count,
mca_oob_ud_component.ud_send_buffer_count, 0);
if (ORTE_SUCCESS != rc) {
return rc;
}
for (i = 0, item = opal_list_get_first (&port->free_msgs.super) ;
item != opal_list_get_end (&port->free_msgs.super) ;
item = opal_list_get_next (item), ++i) {
char *ptr = port->msg_buf.ptr + (i + mca_oob_ud_component.ud_recv_buffer_count) *
port->mtu;
mca_oob_ud_msg_init ((mca_oob_ud_msg_t *) item, port,
ptr, port->msg_buf.mr);
}
return rc;
}
int mca_oob_ud_port_post_one_recv (mca_oob_ud_port_t *port, int msg_num)
{
char *grh_buf = port->grh_buf.ptr + msg_num * sizeof (struct ibv_grh);
char *msg_buf = port->msg_buf.ptr + msg_num * port->mtu;
struct ibv_recv_wr wr;
struct ibv_sge sge[2];
/* GRH */
mca_oob_ud_fill_sge(sge, grh_buf, sizeof (struct ibv_grh), port->grh_buf.mr->lkey);
/* message */
mca_oob_ud_fill_sge(sge + 1, msg_buf, port->mtu, port->msg_buf.mr->lkey);
mca_oob_ud_fill_recv_wr (&wr, sge, 2);
wr.wr_id = MCA_OOB_UD_RECV_WR | (uint64_t)msg_num;
return mca_oob_ud_qp_post_recv (&port->listen_qp, &wr);
}
static int mca_oob_ud_module_init (void)
{
opal_list_item_t *item, *item2;
int rc;
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:init initializing oob/openib. # of devices = %u",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(unsigned int) opal_list_get_size (&mca_oob_ud_component.ud_devices)));
for (item = opal_list_get_first (&mca_oob_ud_component.ud_devices);
item != opal_list_get_end (&mca_oob_ud_component.ud_devices);
item = opal_list_get_next (item)) {
mca_oob_ud_device_t *device = (mca_oob_ud_device_t *) item;
/* start monitoring the device for completions */
for (item2 = opal_list_get_first (&device->ports) ;
item2 != opal_list_get_end (&device->ports) ;
item2 = opal_list_get_next (item2)) {
mca_oob_ud_port_t *port = (mca_oob_ud_port_t *) item2;
rc = mca_oob_ud_listen_create (port);
if (0 != rc) {
continue;
}
rc = mca_oob_ud_port_alloc_buffers (port);
if (ORTE_SUCCESS != rc) {
mca_oob_ud_listen_destroy (port);
continue;
}
rc = opal_free_list_init (&port->data_qps,
sizeof (mca_oob_ud_qp_t),
OBJ_CLASS(mca_oob_ud_qp_t),
mca_oob_ud_component.ud_min_qp,
mca_oob_ud_component.ud_max_qp,
2);
if (OPAL_SUCCESS != rc) {
mca_oob_ud_listen_destroy (port);
continue;
}
rc = mca_oob_ud_port_recv_start (port);
if (ORTE_SUCCESS != rc) {
mca_oob_ud_listen_destroy (port);
}
/* NTH: only supports one port for now */
break;
}
mca_oob_ud_event_start_monitor (device);
}
return ORTE_SUCCESS;
}
static void mca_oob_ud_cancel_all_in_list (opal_list_t *list)
{
opal_list_item_t *item;
while (NULL != (item = opal_list_remove_first (list))) {
((mca_oob_ud_req_t *)item)->req_list = NULL;
mca_oob_ud_req_abort ((mca_oob_ud_req_t *) item);
}
}
static void mca_oob_ud_empty_list (opal_list_t *list)
{
opal_list_item_t *item;
while (NULL != (item = opal_list_remove_first (list))) {
OBJ_RELEASE(item);
}
}
static int mca_oob_ud_module_fini (void)
{
mca_oob_ud_peer_t *peer;
opal_list_item_t *item;
uint64_t key;
void *node;
int rc;
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:fini entering",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_lock);
OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_match_lock);
if (ORTE_VPID_INVALID != ORTE_PROC_MY_PARENT->vpid) {
if (ORTE_SUCCESS == mca_oob_ud_peer_lookup (ORTE_PROC_MY_PARENT, &peer) && NULL != peer) {
mca_oob_ud_peer_handle_end (peer);
}
}
/* abort active receives */
mca_oob_ud_cancel_all_in_list (&mca_oob_ud_component.ud_active_recvs);
mca_oob_ud_cancel_all_in_list (&mca_oob_ud_component.ud_active_sends);
mca_oob_ud_cancel_all_in_list (&mca_oob_ud_component.ud_pending_recvs);
mca_oob_ud_empty_list (&mca_oob_ud_component.ud_unexpected_recvs);
mca_oob_ud_empty_list (&mca_oob_ud_component.ud_event_queued_reqs);
OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_match_lock);
rc = opal_hash_table_get_first_key_uint64 (&mca_oob_ud_component.ud_peers, &key,
(void **) &peer, &node);
if (OPAL_SUCCESS == rc) {
do {
if (NULL != peer) {
mca_oob_ud_peer_release (peer);
}
rc = opal_hash_table_get_next_key_uint64 (&mca_oob_ud_component.ud_peers, &key,
(void **) &peer, node, &node);
} while (OPAL_SUCCESS == rc);
}
opal_hash_table_remove_all (&mca_oob_ud_component.ud_peers);
for (item = opal_list_get_first (&mca_oob_ud_component.ud_devices);
item != opal_list_get_end (&mca_oob_ud_component.ud_devices);
item = opal_list_get_next (item)) {
mca_oob_ud_event_stop_monitor ((mca_oob_ud_device_t *) item);
}
mca_oob_ud_empty_list (&mca_oob_ud_component.ud_devices);
OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_lock);
return 0;
}
void mca_oob_ud_device_construct (mca_oob_ud_device_t *device)
{
memset((char *) device + sizeof (device->super), 0, sizeof (*device) - sizeof (device->super));
OBJ_CONSTRUCT(&device->ports, opal_list_t);
}
void mca_oob_ud_device_destruct (mca_oob_ud_device_t *device)
{
opal_list_item_t *item;
while (NULL != (item = opal_list_remove_first (&device->ports))) {
OBJ_RELEASE(item);
}
if (device->ib_pd) {
(void) ibv_dealloc_pd (device->ib_pd);
}
if (device->ib_channel) {
(void) ibv_destroy_comp_channel (device->ib_channel);
}
if (device->ib_context) {
(void) ibv_close_device (device->ib_context);
}
OBJ_DESTRUCT(&device->ports);
memset (device, 0, sizeof (mca_oob_ud_device_t));
}
void mca_oob_ud_port_construct (mca_oob_ud_port_t *port)
{
memset((char *) port + sizeof (port->super), 0, sizeof (*port) - sizeof (port->super));
OBJ_CONSTRUCT(&port->data_qps, opal_free_list_t);
OBJ_CONSTRUCT(&port->free_msgs, opal_free_list_t);
OBJ_CONSTRUCT(&port->listen_qp, opal_free_list_item_t);
}
void mca_oob_ud_port_destruct (mca_oob_ud_port_t *port)
{
(void) mca_oob_ud_listen_destroy (port);
OBJ_DESTRUCT(&port->data_qps);
OBJ_DESTRUCT(&port->free_msgs);
mca_oob_ud_free_reg_mem (&port->grh_buf);
mca_oob_ud_free_reg_mem (&port->msg_buf);
}
int mca_oob_ud_ft_event(int state) {
return ORTE_SUCCESS;
}
int mca_oob_ud_register_iov (struct iovec *iov, int count, struct ibv_mr **ib_mr,
struct ibv_pd *ib_pd, unsigned int mtu, int *sge_countp,
int *wr_countp, int *data_lenp)
{
int data_len, iov_index, sge_count;
unsigned int packet_size = 0;
*wr_countp = 0;
*data_lenp = 0;
*sge_countp = 0;
for (iov_index = 0, data_len = 0, sge_count = 0 ; iov_index < count ; ++iov_index) {
unsigned int iov_left = iov[iov_index].iov_len;
data_len += iov_left;
sge_count++;
do {
unsigned int to_trans = min (iov_left, mtu - packet_size);
packet_size = (to_trans < iov_left) ? 0 : packet_size + to_trans;
iov_left -= to_trans;
if (0 == packet_size && iov_left) {
sge_count++;
}
} while (iov_left);
/* register buffers */
if (NULL == ib_mr[iov_index]) {
ib_mr[iov_index] = ibv_reg_mr (ib_pd,
iov[iov_index].iov_base,
iov[iov_index].iov_len,
IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_WRITE);
if (NULL == ib_mr[iov_index]) {
/* Ruh-roh */
opal_output (0, "%s oob:ud:register_iov error registering memory. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
}
*wr_countp = (data_len + mtu - 1) / mtu;
*sge_countp = sge_count;
*data_lenp = data_len;
return ORTE_SUCCESS;
}
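/* Aside (illustrative sketch, not part of this commit): the sge/wr counting in
* mca_oob_ud_register_iov can be checked in isolation; the block below
* re-implements just that counting logic as a standalone program and is kept
* inside "#if 0" so it is never compiled into the component. */
#if 0
#include <stdio.h>
#include <sys/uio.h>
#define MIN_U(a,b) ((a) < (b) ? (a) : (b))
static void count_fragments (const struct iovec *iov, int count, unsigned int mtu,
int *sge_countp, int *wr_countp, int *data_lenp)
{
unsigned int packet_size = 0;
int data_len = 0, sge_count = 0, i;
for (i = 0 ; i < count ; ++i) {
unsigned int iov_left = iov[i].iov_len;
data_len += iov_left;
sge_count++;
do {
unsigned int to_trans = MIN_U(iov_left, mtu - packet_size);
/* a packet boundary falling inside this iovec costs an extra sge */
packet_size = (to_trans < iov_left) ? 0 : packet_size + to_trans;
iov_left -= to_trans;
if (0 == packet_size && iov_left) {
sge_count++;
}
} while (iov_left);
}
*wr_countp = (data_len + mtu - 1) / mtu; /* one work request per MTU-sized packet */
*sge_countp = sge_count;
*data_lenp = data_len;
}
int main (void)
{
char a[3000], b[1000];
struct iovec iov[2] = {{a, sizeof (a)}, {b, sizeof (b)}};
int sge, wr, len;
count_fragments (iov, 2, 2048, &sge, &wr, &len);
/* expected output: sge = 3, wr = 2, len = 4000 */
printf ("sge = %d, wr = %d, len = %d\n", sge, wr, len);
return 0;
}
#endif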

385
orte/mca/oob/ud/oob_ud_peer.c Normal file

@@ -0,0 +1,385 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "oob_ud_peer.h"
#include "oob_ud.h"
#include "opal/include/opal_stdint.h"
#include "orte/util/name_fns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/notifier/notifier.h"
#include "orte/mca/state/state.h"
#include "orte/mca/routed/routed.h"
static void mca_oob_ud_peer_construct (mca_oob_ud_peer_t *peer);
static void mca_oob_ud_peer_destruct (mca_oob_ud_peer_t *peer);
OBJ_CLASS_INSTANCE(mca_oob_ud_peer_t, opal_object_t,
mca_oob_ud_peer_construct,
mca_oob_ud_peer_destruct);
int mca_oob_ud_peer_lookup (const orte_process_name_t *name, mca_oob_ud_peer_t **peer) {
int rc;
*peer = NULL;
rc = opal_hash_table_get_value_uint64(&mca_oob_ud_component.ud_peers,
orte_util_hash_name(name), (void**)peer);
if (OPAL_SUCCESS != rc) {
return ORTE_ERR_UNREACH;
}
return ORTE_SUCCESS;
}
static inline int mca_oob_ud_parse_uri (const char *uri, uint32_t *qp_num,
uint16_t *lid, uint16_t *port_num)
{
int rc;
rc = sscanf (uri, "ud://%u.%hu.%hu", qp_num, lid, port_num);
if (3 != rc) {
opal_output (0, "%s oob:ud:parse_uri error parsing uri. expected 3 elements. got %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc);
return ORTE_ERR_BAD_PARAM;
}
return ORTE_SUCCESS;
}
int mca_oob_ud_peer_update_with_uri (mca_oob_ud_peer_t *peer, const char *uri)
{
opal_list_item_t *item;
struct ibv_ah_attr ah_attr;
mca_oob_ud_device_t *device;
uint32_t qp_num;
/* NTH: port is 16-bit here because C90 does not support hh in sscanf */
uint16_t lid, port_num;
int rc;
rc = mca_oob_ud_parse_uri (uri, &qp_num, &lid, &port_num);
if (ORTE_SUCCESS != rc) {
return rc;
}
if (peer->peer_lid != lid || peer->peer_port != port_num) {
if (NULL != peer->peer_ah) {
(void) ibv_destroy_ah (peer->peer_ah);
peer->peer_ah = NULL;
}
}
peer->peer_qpn = qp_num;
peer->peer_qkey = 0; /* NTH: todo -- add qkey support if needed */
peer->peer_lid = lid;
peer->peer_port = port_num;
if (NULL == peer->peer_ah) {
memset (&ah_attr, 0, sizeof (ah_attr));
ah_attr.dlid = lid;
ah_attr.port_num = port_num;
for (item = opal_list_get_first (&mca_oob_ud_component.ud_devices);
item != opal_list_get_end (&mca_oob_ud_component.ud_devices);
item = opal_list_get_next (item)) {
device = (mca_oob_ud_device_t *)item;
/* try to create an address handle using this device */
peer->peer_ah = ibv_create_ah (device->ib_pd, &ah_attr);
if (NULL != peer->peer_ah) {
peer->peer_context = (void *) item;
break;
}
}
if (NULL == peer->peer_ah) {
free (peer);
return ORTE_ERROR;
}
}
return ORTE_SUCCESS;
}
mca_oob_ud_peer_t *mca_oob_ud_get_peer (struct mca_oob_ud_port_t *port,
orte_process_name_t *name,
uint32_t qpn, uint32_t qkey,
uint16_t lid, uint8_t port_num)
{
struct ibv_ah_attr ah_attr;
mca_oob_ud_peer_t *peer;
int rc;
rc = mca_oob_ud_peer_lookup (name, &peer);
if (ORTE_SUCCESS == rc) {
OPAL_OUTPUT_VERBOSE((20, mca_oob_base_output, "%s oob:ud:peer_from_msg_hdr using "
"cached peer", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
return peer;
}
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:peer_from_msg_hdr creating "
"peer from return address", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
peer = OBJ_NEW(mca_oob_ud_peer_t);
if (NULL == peer) {
return NULL;
}
peer->peer_qpn = qpn;
peer->peer_qkey = qkey;
peer->peer_name = *name;
peer->peer_lid = lid;
peer->peer_port = port_num;
memset (&ah_attr, 0, sizeof (ah_attr));
ah_attr.dlid = peer->peer_lid;
ah_attr.port_num = peer->peer_port;
peer->peer_ah = ibv_create_ah (port->device->ib_pd, &ah_attr);
if (NULL == peer->peer_ah) {
free (peer);
return NULL;
}
peer->peer_context = port->device;
OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_lock);
opal_hash_table_set_value_uint64(&mca_oob_ud_component.ud_peers,
orte_util_hash_name(name),
(void *) peer);
OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_lock);
return peer;
}
mca_oob_ud_peer_t *mca_oob_ud_peer_from_uri (const char *uri)
{
mca_oob_ud_peer_t *peer;
int rc;
peer = OBJ_NEW(mca_oob_ud_peer_t);
if (NULL == peer) {
return NULL;
}
rc = mca_oob_ud_peer_update_with_uri (peer, uri);
if (ORTE_SUCCESS != rc) {
OBJ_RELEASE (peer);
peer = NULL;
}
return peer;
}
static void mca_oob_ud_peer_construct (mca_oob_ud_peer_t *peer)
{
memset ((char *) peer + sizeof (peer->super), 0, sizeof (*peer) - sizeof (peer->super));
OBJ_CONSTRUCT(&peer->peer_flying_messages, opal_list_t);
peer->peer_expected_id = 1;
}
void mca_oob_ud_peer_handle_end (mca_oob_ud_peer_t *peer)
{
mca_oob_ud_port_t *port = NULL;
mca_oob_ud_msg_t *msg = NULL;
int rc;
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:peer_handle_end telling peer %s i "
"am going away", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer->peer_name)));
do {
/* tell the peer that we are deleting them */
if (NULL == peer || NULL == peer->peer_context || false == peer->peer_available ||
false == peer->needs_notification) {
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:peer_handle_end don't need to tell %s i "
"am going away", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer->peer_name)));
break;
}
port = (mca_oob_ud_port_t *) opal_list_get_first (&((mca_oob_ud_device_t *)peer->peer_context)->ports);
if (NULL == port) {
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:peer_handle_end can't tell %s i "
"am going away (no port)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer->peer_name)));
break;
}
rc = mca_oob_ud_msg_get (port, NULL, &port->listen_qp, peer, true, &msg);
if (ORTE_SUCCESS != rc) {
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:peer_handle_end can't tell %s i "
"am going away (no message buffer)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer->peer_name)));
break;
}
peer->peer_timer.tries = 2;
peer->peer_timer.value.tv_usec = 500000;
msg->hdr->msg_type = MCA_OOB_UD_MSG_END;
rc = mca_oob_ud_qp_post_send (&port->listen_qp, &msg->wr, 1);
if (ORTE_SUCCESS != rc) {
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:peer_handle_end can't tell %s i "
"am going away (send failed)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer->peer_name)));
break;
}
} while (0);
if (NULL != msg) {
mca_oob_ud_msg_return (msg);
}
}
void mca_oob_ud_peer_lost (mca_oob_ud_peer_t *peer)
{
OPAL_THREAD_LOCK(&peer->peer_lock);
if (true == peer->peer_available) {
peer->peer_available = false;
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:peer_lost lost connectivity to peer "
"%s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer->peer_name)));
/* inform the ERRMGR framework that we have lost a connection so
* it can decide if this is important, what to do about it, etc.
*/
ORTE_ACTIVATE_PROC_STATE(&peer->peer_name, ORTE_PROC_STATE_COMM_FAILED);
}
OPAL_THREAD_UNLOCK(&peer->peer_lock);
}
void mca_oob_ud_peer_release (mca_oob_ud_peer_t *peer)
{
OBJ_RELEASE(peer);
}
static void mca_oob_ud_peer_destruct (mca_oob_ud_peer_t *peer)
{
if (NULL != peer->peer_ah) {
(void) ibv_destroy_ah (peer->peer_ah);
}
}
static void mca_oob_ud_peer_msg_timeout (int fd, short event, void *ctx)
{
mca_oob_ud_peer_t *peer = (mca_oob_ud_peer_t *) ctx;
mca_oob_ud_msg_t *msg = (mca_oob_ud_msg_t *) opal_list_get_first (&peer->peer_flying_messages);
OPAL_THREAD_LOCK(&peer->peer_lock);
if (false == peer->peer_timer.active) {
/* timer was stopped after this callback was queued; drop the lock and bail */
OPAL_THREAD_UNLOCK(&peer->peer_lock);
return;
}
peer->peer_timer.active = false;
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:peer_msg_timeout timeout sending to peer "
"%s. first message = %" PRIu64 " which has length %d" , ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer->peer_name), msg->hdr->msg_id, msg->wr.sg_list[0].length));
if (peer->peer_timer.tries == 0) {
opal_list_item_t *item;
while (NULL != (item = opal_list_remove_first (&peer->peer_flying_messages))) {
msg = (mca_oob_ud_msg_t *) item;
mca_oob_ud_msg_status_update (msg, MCA_OOB_UD_MSG_STATUS_TIMEOUT);
if (msg->req) {
mca_oob_ud_req_complete (msg->req, ORTE_ERR_TIMEOUT);
}
}
OPAL_THREAD_UNLOCK(&peer->peer_lock);
mca_oob_ud_peer_lost (peer);
return;
}
peer->peer_timer.tries--;
mca_oob_ud_peer_post_all (peer);
mca_oob_ud_peer_start_timer (peer);
OPAL_THREAD_UNLOCK(&peer->peer_lock);
}
int mca_oob_ud_peer_post_msg (mca_oob_ud_peer_t *peer, mca_oob_ud_msg_t *msg)
{
int rc;
msg->hdr->msg_id = ++peer->peer_next_id;
rc = mca_oob_ud_qp_post_send (msg->qp, &msg->wr, 1);
if (ORTE_SUCCESS != rc) {
return rc;
}
opal_list_append (&peer->peer_flying_messages, (opal_list_item_t *) msg);
if (false == peer->peer_timer.active) {
mca_oob_ud_peer_reset_timer (peer);
mca_oob_ud_peer_start_timer (peer);
}
return ORTE_SUCCESS;
}
void mca_oob_ud_peer_stop_timer (mca_oob_ud_peer_t *peer)
{
if (peer->peer_timer.active) {
peer->peer_timer.active = false;
opal_event_evtimer_del (&peer->peer_timer.event);
}
}
void mca_oob_ud_peer_reset_timer (mca_oob_ud_peer_t *peer)
{
/* NTH: XXX -- TODO -- make these mca variables */
peer->peer_timer.tries = 5;
peer->peer_timer.value.tv_sec = 0;
peer->peer_timer.value.tv_usec = 800000;
}
void mca_oob_ud_peer_start_timer (mca_oob_ud_peer_t *peer)
{
if (!peer->peer_timer.active && opal_list_get_size (&peer->peer_flying_messages)) {
peer->peer_timer.active = true;
opal_event_evtimer_set (opal_event_base, &peer->peer_timer.event,
mca_oob_ud_peer_msg_timeout, (void *) peer);
opal_event_evtimer_add (&peer->peer_timer.event, &peer->peer_timer.value);
}
}
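/* Repost every in-flight control message for this peer. Called from the
   retransmission timer when an acknowledgment has not arrived in time. */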
void mca_oob_ud_peer_post_all (mca_oob_ud_peer_t *peer)
{
opal_list_item_t *item;
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:peer_post_all reposting all messages for peer %p",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) peer));
for (item = opal_list_get_first (&peer->peer_flying_messages) ;
item != opal_list_get_end (&peer->peer_flying_messages) ;
item = opal_list_get_next (item)) {
mca_oob_ud_msg_t *msg = (mca_oob_ud_msg_t *) item;
(void) mca_oob_ud_qp_post_send (msg->qp, &msg->wr, 1);
}
}

95
orte/mca/oob/ud/oob_ud_peer.h Normal file
View file

@@ -0,0 +1,95 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#if !defined(MCA_OOB_UD_PEER_H)
#define MCA_OOB_UD_PEER_H
#include "orte_config.h"
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#include "orte/types.h"
#include "opal/mca/base/base.h"
#include "opal/class/opal_free_list.h"
#include "opal/class/opal_hash_table.h"
#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"
#include "opal/threads/threads.h"
#include "opal/mca/timer/base/base.h"
#include "orte/mca/oob/oob.h"
#include "orte/mca/oob/base/base.h"
#include <infiniband/verbs.h>
struct mca_oob_ud_msg_hdr_t;
struct mca_oob_ud_port_t;
struct mca_oob_ud_peer_t {
opal_object_t super;
void *peer_context;
struct ibv_ah *peer_ah;
uint32_t peer_qpn;
uint32_t peer_qkey;
uint64_t peer_next_id;
uint64_t peer_expected_id;
orte_process_name_t peer_name;
uint16_t peer_lid;
uint8_t peer_port;
bool peer_available;
bool needs_notification;
opal_list_t peer_flying_messages;
opal_mutex_t peer_lock;
struct {
int tries;
opal_event_t event;
struct timeval value;
bool active;
} peer_timer;
};
typedef struct mca_oob_ud_peer_t mca_oob_ud_peer_t;
OBJ_CLASS_DECLARATION(mca_oob_ud_peer_t);
int mca_oob_ud_peer_lookup (const orte_process_name_t *name, mca_oob_ud_peer_t **peer);
int mca_oob_ud_peer_update_with_uri (mca_oob_ud_peer_t *peer, const char *uri);
mca_oob_ud_peer_t *mca_oob_ud_peer_from_uri (const char *uri);
mca_oob_ud_peer_t *mca_oob_ud_get_peer (struct mca_oob_ud_port_t *port,
orte_process_name_t *name,
uint32_t qpn, uint32_t qkey,
uint16_t lid, uint8_t port_num);
void mca_oob_ud_peer_lost (mca_oob_ud_peer_t *peer);
void mca_oob_ud_peer_release (mca_oob_ud_peer_t *peer);
struct mca_oob_ud_msg_t;
int mca_oob_ud_peer_post_msg (mca_oob_ud_peer_t *peer, struct mca_oob_ud_msg_t *msg);
void mca_oob_ud_peer_start_timer (mca_oob_ud_peer_t *peer);
void mca_oob_ud_peer_stop_timer (mca_oob_ud_peer_t *peer);
void mca_oob_ud_peer_reset_timer (mca_oob_ud_peer_t *peer);
void mca_oob_ud_peer_post_all (mca_oob_ud_peer_t *peer);
void mca_oob_ud_peer_handle_end (mca_oob_ud_peer_t *peer);
#endif

67
orte/mca/oob/ud/oob_ud_ping.c Normal file
View file

@@ -0,0 +1,67 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "opal/mca/event/event.h"
#include "opal/opal_socket_errno.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "oob_ud.h"
int mca_oob_ud_ping(const orte_process_name_t* name, const char* uri,
const struct timeval *timeout) {
mca_oob_ud_peer_t *peer;
mca_oob_ud_port_t *port;
mca_oob_ud_msg_t *msg = NULL;
struct timeval half_timeout;
int rc;
opal_output (0, "attempting to ping %s with uri %s", ORTE_NAME_PRINT(name), uri);
peer = mca_oob_ud_peer_from_uri (uri);
if (NULL == peer) {
return ORTE_ERROR;
}
half_timeout.tv_sec = timeout->tv_sec / 2;
half_timeout.tv_usec = (timeout->tv_usec / 2 + (timeout->tv_sec % 2) * 500000) % 1000000;
half_timeout.tv_sec += (timeout->tv_usec / 2 + (timeout->tv_sec % 2) * 500000) / 1000000;
/* NTH: TODO -- get a random port? */
port = (mca_oob_ud_port_t *) opal_list_get_first (&((mca_oob_ud_device_t *)peer->peer_context)->ports);
do {
rc = mca_oob_ud_msg_get (port, NULL, &port->listen_qp, peer, true, &msg);
if (ORTE_SUCCESS != rc) {
break;
}
msg->hdr->msg_type = MCA_OOB_UD_MSG_PING;
        rc = mca_oob_ud_msg_post_send (msg);
        if (ORTE_SUCCESS != rc) {
            break;
        }
        /* wait for the ping to be acknowledged */
        rc = mca_oob_ud_msg_wait (msg);
opal_output (0, "ping result to %s -> %s: %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(name), rc);
} while (0);
if (NULL != msg) {
mca_oob_ud_msg_return(msg);
}
mca_oob_ud_peer_release (peer);
return rc;
}

296
orte/mca/oob/ud/oob_ud_qp.c Normal file
View file

@@ -0,0 +1,296 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "oob_ud_qp.h"
#include "oob_ud.h"
static void mca_oob_ud_qp_constructor (mca_oob_ud_qp_t *qp);
static void mca_oob_ud_qp_destructor (mca_oob_ud_qp_t *qp);
OBJ_CLASS_INSTANCE(mca_oob_ud_qp_t, opal_free_list_item_t,
mca_oob_ud_qp_constructor,
mca_oob_ud_qp_destructor);
static inline int mca_oob_ud_qp_process_send_completions (mca_oob_ud_qp_t *qp,
int num_completions);
#define MCA_OOB_UD_CLEAR_CQ(cq)                         \
    do {                                                \
        if (NULL == (cq)->channel) {                    \
            struct ibv_wc wc;                           \
            while (ibv_poll_cq ((cq), 1, &wc));         \
        }                                               \
    } while (0)
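/* Create an unreliable datagram (UD) queue pair on the given port. Separate send
   and receive completion queues are created unless onecq is true, in which case a
   single CQ is shared by both directions. */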
int mca_oob_ud_qp_init (mca_oob_ud_qp_t *qp, struct mca_oob_ud_port_t *port,
struct ibv_comp_channel *recv_channel,
struct ibv_comp_channel *send_channel, bool onecq)
{
struct ibv_qp_init_attr init_attr;
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:qp_init creating UD QP on port %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), port->port_num));
/* create a UD queue pair */
memset(&init_attr, 0, sizeof(init_attr));
init_attr.qp_type = IBV_QPT_UD;
qp->ib_recv_cq = ibv_create_cq (port->device->ib_context, 16384,
port, recv_channel, 0);
if (false == onecq) {
qp->ib_send_cq = ibv_create_cq (port->device->ib_context, 16384,
port, send_channel, 0);
} else {
qp->ib_send_cq = qp->ib_recv_cq;
}
init_attr.send_cq = qp->ib_send_cq;
init_attr.recv_cq = qp->ib_recv_cq;
init_attr.cap.max_send_sge = 32;
init_attr.cap.max_recv_sge = 32; /* GRH, data */
init_attr.cap.max_inline_data = 0; /* don't use inline data for now */
/* NTH: fix these */
init_attr.cap.max_recv_wr = 4096;
init_attr.cap.max_send_wr = 4096;
qp->ib_qp = ibv_create_qp (port->device->ib_pd, &init_attr);
if (NULL == qp->ib_qp) {
opal_output(0, "%s oob:ud:qp_init could not create queue pair. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
return ORTE_ERROR;
}
/* end: create the UD queue pair */
qp->port = port;
return ORTE_SUCCESS;
}
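/* Flush a QP by cycling it through the ERR and RESET states. Moving to ERR flushes
   any outstanding work requests; the resulting completions are drained before the
   QP is placed back in RESET. */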
int mca_oob_ud_qp_to_reset (mca_oob_ud_qp_t *qp)
{
struct ibv_qp_attr attr;
/* move the QP into the ERR state */
memset(&attr, 0, sizeof(attr));
attr.qp_state = IBV_QPS_ERR;
if (0 != ibv_modify_qp(qp->ib_qp, &attr, IBV_QP_STATE)) {
opal_output(0, "%s oob:ud:qp_to_reset error modifying qp to ERR. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
return ORTE_ERROR;
}
/* poll thread/event will clear failed work requests */
MCA_OOB_UD_CLEAR_CQ(qp->ib_send_cq);
MCA_OOB_UD_CLEAR_CQ(qp->ib_recv_cq);
/* move the QP into the RESET state */
memset(&attr, 0, sizeof(attr));
attr.qp_state = IBV_QPS_RESET;
if (0 != ibv_modify_qp(qp->ib_qp, &attr, IBV_QP_STATE)) {
opal_output(0, "%s oob:ud:qp_to_reset error modifying qp to RESET. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
return ORTE_ERROR;
}
return ORTE_SUCCESS;
}
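/* Bring a reset QP back to a usable state by walking it through the standard
   INIT -> RTR -> RTS transitions. */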
int mca_oob_ud_qp_to_rts (mca_oob_ud_qp_t *qp)
{
struct mca_oob_ud_port_t *port = qp->port;
int attr_mask;
struct ibv_qp_attr attr;
/* move the QP into the INIT state */
memset(&attr, 0, sizeof(attr));
attr.qp_state = IBV_QPS_INIT;
attr.pkey_index = 0; /* NTH: might need to modify the pkey index later */
attr.port_num = port->port_num;
attr.qkey = 0;
attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY;
if (0 != ibv_modify_qp(qp->ib_qp, &attr, attr_mask)) {
opal_output(0, "%s oob:ud:qp_to_reset error modifying qp to INIT. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
return ORTE_ERROR;
}
/* Move QP to RTR */
attr.qp_state = IBV_QPS_RTR;
if (0 != ibv_modify_qp(qp->ib_qp, &attr, IBV_QP_STATE)) {
opal_output(0, "%s oob:ud:qp_to_reset error modifying qp to RTR. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
return ORTE_ERROR;
}
/* Setup attributes */
memset(&attr, 0, sizeof(attr));
attr.qp_state = IBV_QPS_RTS;
attr.sq_psn = 0;
attr_mask = IBV_QP_STATE | IBV_QP_SQ_PSN;
if (0 != ibv_modify_qp(qp->ib_qp, &attr, attr_mask)) {
opal_output(0, "%s oob:ud:qp_to_reset error modifying qp to RTS. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
return ORTE_ERROR;
}
return ORTE_SUCCESS;
}
/* purge all work requests on a qp */
int mca_oob_ud_qp_purge (mca_oob_ud_qp_t *qp)
{
int rc;
rc = mca_oob_ud_qp_to_reset (qp);
if (ORTE_SUCCESS != rc) {
return rc;
}
return mca_oob_ud_qp_to_rts (qp);
}
static void mca_oob_ud_qp_constructor (mca_oob_ud_qp_t *qp)
{
memset ((char *)qp + sizeof(qp->super), 0, sizeof (*qp) - sizeof (qp->super));
}
static void mca_oob_ud_qp_destructor (mca_oob_ud_qp_t *qp)
{
int rc;
if (NULL != qp->ib_qp) {
/* clear qp and move to reset */
(void) mca_oob_ud_qp_to_reset (qp);
/* destroy qp */
rc = ibv_destroy_qp (qp->ib_qp);
if (0 != rc) {
opal_output (0, "IBV_DESTROY_QP FAILED! rc = %d, errno = %d", rc, errno);
}
}
if (NULL != qp->ib_send_cq) {
(void) ibv_destroy_cq (qp->ib_send_cq);
}
if (NULL != qp->ib_recv_cq && qp->ib_recv_cq != qp->ib_send_cq) {
(void) ibv_destroy_cq (qp->ib_recv_cq);
}
}
static inline int mca_oob_ud_qp_process_send_completions (mca_oob_ud_qp_t *qp,
int num_completions)
{
struct ibv_wc wc[1];
int count, rc, ret, i;
OPAL_OUTPUT_VERBOSE((20, mca_oob_base_output, "%s oob:ud:qp_process_send_completions polling "
"for %d completions", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
num_completions));
rc = ORTE_SUCCESS;
for (count = 0 ; count < num_completions ; ) {
ret = ibv_poll_cq (qp->ib_send_cq, 1, wc);
if (ret < 0) {
opal_output (0, "%s oob:ud:qp_process_send_completions error polling for completions. "
"errno = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
return ORTE_ERROR;
}
for (i = 0 ; i < ret ; ++i) {
if (IBV_WC_SUCCESS != wc[i].status) {
opal_output (0, "wc status = %d", wc[i].status);
rc = ORTE_ERROR;
}
}
count += ret;
}
return rc;
}
int mca_oob_ud_qp_post_send (mca_oob_ud_qp_t *qp, struct ibv_send_wr *wr,
int num_completions) {
struct ibv_send_wr *bad_wr;
int rc;
rc = ibv_post_send (qp->ib_qp, wr, &bad_wr);
if (0 != rc) {
opal_output (0, "%s oob:ud:qp_post_send ibv_post_send failed. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
return ORTE_ERROR;
}
return mca_oob_ud_qp_process_send_completions (qp, num_completions);
}
int mca_oob_ud_qp_post_recv (mca_oob_ud_qp_t *qp, struct ibv_recv_wr *wr) {
struct ibv_recv_wr *bad_wr;
int rc;
rc = ibv_post_recv (qp->ib_qp, wr, &bad_wr);
if (0 != rc) {
opal_output (0, "%s oob:ud:qp_post_recv ibv_post_send failed. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
return ORTE_ERROR;
}
return ORTE_SUCCESS;
}
int mca_oob_ud_qp_data_aquire (struct mca_oob_ud_port_t *port, mca_oob_ud_qp_t **qp_ptr) {
int rc;
opal_free_list_item_t *item;
do {
OPAL_FREE_LIST_GET(&port->data_qps, item, rc);
if (OPAL_SUCCESS != rc) {
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:qp_data_aquire error "
"allocating new data qp. error = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc));
break;
}
*qp_ptr = (mca_oob_ud_qp_t *) item;
if (NULL == (*qp_ptr)->ib_qp) {
rc = mca_oob_ud_qp_init (*qp_ptr, port, NULL, NULL, true);
if (ORTE_SUCCESS != rc) {
break;
}
rc = mca_oob_ud_qp_to_rts (*qp_ptr);
}
} while (0);
return rc;
}
int mca_oob_ud_qp_data_release (mca_oob_ud_qp_t *qp) {
int rc;
rc = mca_oob_ud_qp_purge (qp);
if (ORTE_SUCCESS != rc) {
return rc;
}
OPAL_FREE_LIST_RETURN(&qp->port->data_qps, qp);
return ORTE_SUCCESS;
}

71
orte/mca/oob/ud/oob_ud_qp.h Normal file
View file

@@ -0,0 +1,71 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#if !defined(MCA_OOB_UD_QP_H)
#define MCA_OOB_UD_QP_H
#include "orte_config.h"
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#include "orte/types.h"
#include "opal/mca/base/base.h"
#include "opal/class/opal_free_list.h"
#include "opal/class/opal_hash_table.h"
#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"
#include "opal/threads/threads.h"
#include "opal/mca/timer/base/base.h"
#include "orte/mca/oob/oob.h"
#include "orte/mca/oob/base/base.h"
#include <infiniband/verbs.h>
enum mca_oob_ud_qp_type_t {
MCA_OOB_UD_QP_DATA,
MCA_OOB_UD_QP_LISTEN
};
struct mca_oob_ud_port_t;
struct mca_oob_ud_qp_t {
opal_free_list_item_t super;
enum mca_oob_ud_qp_type_t type;
struct ibv_qp *ib_qp;
struct mca_oob_ud_port_t *port;
struct ibv_cq *ib_send_cq, *ib_recv_cq;
};
typedef struct mca_oob_ud_qp_t mca_oob_ud_qp_t;
OBJ_CLASS_DECLARATION(mca_oob_ud_qp_t);
int mca_oob_ud_qp_init (mca_oob_ud_qp_t *qp, struct mca_oob_ud_port_t *port,
struct ibv_comp_channel *recv_channel,
struct ibv_comp_channel *send_channel, bool onecq);
int mca_oob_ud_qp_to_reset (mca_oob_ud_qp_t *qp);
int mca_oob_ud_qp_to_rts (mca_oob_ud_qp_t *qp);
int mca_oob_ud_qp_purge (mca_oob_ud_qp_t *qp);
int mca_oob_ud_qp_post_send (mca_oob_ud_qp_t *qp, struct ibv_send_wr *wr, int num_completions);
int mca_oob_ud_qp_post_recv (mca_oob_ud_qp_t *qp, struct ibv_recv_wr *wr);
int mca_oob_ud_qp_data_aquire (struct mca_oob_ud_port_t *port, mca_oob_ud_qp_t **qp_ptr);
int mca_oob_ud_qp_data_release (mca_oob_ud_qp_t *qp);
#endif

638
orte/mca/oob/ud/oob_ud_recv.c Normal file
View file

@@ -0,0 +1,638 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "orte_config.h"
#include "orte/types.h"
#include "opal/types.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "math.h"
#include "oob_ud.h"
#define min(a,b) ((a) < (b) ? (a) : (b))
static int mca_oob_ud_recv_unex_complete (mca_oob_ud_req_t *req);
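/* Copy the data of an unexpected receive (src) into the user buffers of a matching
   posted receive (dst), complete dst, then free the unexpected buffer and release src. */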
static int mca_oob_ud_recv_copy (mca_oob_ud_req_t *dst, mca_oob_ud_req_t *src)
{
int rc, i;
dst->req_rem_data_len = src->req_rem_data_len;
rc = mca_oob_ud_recv_alloc (dst);
if (ORTE_SUCCESS == rc) {
unsigned char *dptr = src->req_uiov[0].iov_base;
for (i = 0 ; i < dst->req_count ; ++i) {
memcpy (dst->req_uiov[i].iov_base, dptr, dst->req_uiov[i].iov_len);
dptr += dst->req_uiov[i].iov_len;
}
}
mca_oob_ud_req_complete (dst, (ORTE_SUCCESS == rc) ? dst->req_rem_data_len : rc);
/* free io vector data */
free (src->req_uiov[0].iov_base);
free (src->req_uiov);
src->req_uiov = NULL;
OBJ_RELEASE(src);
return rc;
}
/* NOTE: the matching lock is taken and released inside this routine */
static inline int mca_oob_ud_find_recv (opal_list_t *list, const orte_process_name_t name,
const int tag, mca_oob_ud_req_t **req)
{
opal_list_item_t *item;
int rc = ORTE_ERR_NOT_FOUND;
*req = NULL;
OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_match_lock);
for (item = opal_list_get_first (list) ; item != opal_list_get_end (list) ;
item = opal_list_get_next (item)) {
mca_oob_ud_req_t *recv_req = (mca_oob_ud_req_t *) item;
OPAL_OUTPUT_VERBOSE((15, mca_oob_base_output, "%s oob:ud:find_recv matching against "
"peer: %s, tag: %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&recv_req->req_origin), recv_req->req_tag));
if (OPAL_EQUAL == opal_dss.compare (&name, &recv_req->req_origin, ORTE_NAME) &&
tag == recv_req->req_tag) {
*req = recv_req;
rc = ORTE_SUCCESS;
break;
}
}
OPAL_OUTPUT_VERBOSE((15, mca_oob_base_output, "%s oob:ud:find_recv %sfound",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_SUCCESS != rc ? "not " : ""));
OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_match_lock);
if (ORTE_SUCCESS == rc) {
mca_oob_ud_req_append_to_list (*req, NULL);
}
return rc;
}
static int mca_oob_ud_find_pending_recv (const orte_process_name_t name, const int tag,
mca_oob_ud_req_t **reqp) {
return mca_oob_ud_find_recv (&mca_oob_ud_component.ud_pending_recvs, name, tag, reqp);
}
int mca_oob_ud_get_recv_req (const orte_process_name_t name, const int tag,
mca_oob_ud_req_t **reqp) {
mca_oob_ud_req_t *req;
int rc;
OPAL_OUTPUT_VERBOSE((15, mca_oob_base_output, "%s oob:ud:get_recv_req pending receive request "
"against: %s, tag: %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&name), tag));
rc = mca_oob_ud_find_recv (&mca_oob_ud_component.ud_pending_recvs, name, tag, reqp);
if (ORTE_SUCCESS != rc) {
*reqp = req = OBJ_NEW(mca_oob_ud_req_t);
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:get_recv_req no matching receive. "
"created unexpected recv %p for tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(void *) req, tag));
req->req_origin = name;
req->req_tag = tag;
/* this receive was not expected */
req->type = MCA_OOB_UD_REQ_UNEX;
/* let mca_oob_ud_recv_alloc alloc memory for the receive */
req->req_uiov = calloc (1, sizeof (struct iovec));
req->req_flags = ORTE_RML_ALLOC;
req->req_count = 1;
} else {
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:get_recv_req recv %p matched",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) *reqp));
}
return ORTE_SUCCESS;
}
static inline int mca_oob_ud_find_active_recv (const orte_process_name_t name, const int tag,
mca_oob_ud_req_t **req) {
OPAL_OUTPUT_VERBOSE((15, mca_oob_base_output, "%s oob:ud:recv_match active receive request "
"against: %s, tag: %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&name), tag));
return mca_oob_ud_find_recv (&mca_oob_ud_component.ud_active_recvs, name, tag, req);
}
static inline int mca_oob_ud_find_unexpected_recv (const orte_process_name_t name, const int tag,
mca_oob_ud_req_t **req) {
OPAL_OUTPUT_VERBOSE((15, mca_oob_base_output, "%s oob:ud:recv_match unexpected receive request "
"against: %s, tag: %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&name), tag));
return mca_oob_ud_find_recv (&mca_oob_ud_component.ud_unexpected_recvs, name, tag, req);
}
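/* Match a newly posted receive against any unexpected receives that already arrived.
   If one is found the data is copied out immediately; otherwise the request is queued
   on the pending receive list until a matching send request comes in. */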
int mca_oob_ud_recv_match (mca_oob_ud_req_t *recv_req) {
mca_oob_ud_req_t *urecv;
int rc;
OPAL_OUTPUT_VERBOSE((15, mca_oob_base_output, "%s oob:ud:recv_match posting receive. req = %p ",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) recv_req));
rc = mca_oob_ud_find_unexpected_recv (recv_req->req_origin, recv_req->req_tag, &urecv);
OPAL_OUTPUT_VERBOSE((15, mca_oob_base_output, "%s oob:ud:recv_match posting receive. found = %p ",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) urecv));
if (ORTE_SUCCESS == rc) {
recv_req->state = MCA_OOB_UD_REQ_COMPLETE;
return mca_oob_ud_recv_copy (recv_req, urecv);
}
recv_req->state = MCA_OOB_UD_REQ_PENDING;
mca_oob_ud_req_append_to_list (recv_req, &mca_oob_ud_component.ud_pending_recvs);
return ORTE_SUCCESS;
}
/*
* Non-blocking version of mca_oob_recv().
*
* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive.
 * @param iov (IN) Array of iovecs describing user buffers and lengths.
 * @param count (IN) Number of elements in the iovec array.
 * @param tag (IN) User supplied tag for matching send/recv.
 * @param flags (IN) May be MCA_OOB_PEEK to return up to size bytes of the message without removing it from the queue.
* @param cbfunc (IN) Callback function on recv completion.
* @param cbdata (IN) User data that is passed to callback function.
* @return OMPI error code (<0) on error.
*/
int mca_oob_ud_recv_nb(
orte_process_name_t* peer,
struct iovec* iov,
int count,
int tag,
int flags,
orte_rml_callback_fn_t cbfunc,
void* cbdata)
{
mca_oob_ud_req_t *recv_req;
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:recv_nb posting recieve. peer = %s, "
"tag = %d, count = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(peer), tag, count));
/* validate params */
if(NULL == iov || 0 == count) {
return ORTE_ERR_BAD_PARAM;
}
recv_req = OBJ_NEW(mca_oob_ud_req_t);
if (NULL == recv_req) {
opal_output(0, "oob:ud:recv_nb malloc failed! errno = %d", errno);
return ORTE_ERR_OUT_OF_RESOURCE;
}
recv_req->req_origin = *peer;
recv_req->req_uiov = iov;
recv_req->req_count = count;
recv_req->req_tag = tag;
recv_req->req_flags = flags;
recv_req->req_cbfunc = cbfunc;
recv_req->req_cbdata = cbdata;
recv_req->req_rc = 0;
recv_req->req_peer = NULL;
recv_req->type = MCA_OOB_UD_REQ_RECV;
return mca_oob_ud_recv_match (recv_req);
}
int mca_oob_ud_recv_cancel(orte_process_name_t *name, int tag)
{
mca_oob_ud_req_t *recv_req;
bool matched = false;
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:recv_cancel canceling receive requests "
"with name = %s, tag = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(name), tag));
/* cancel pending receives */
while (ORTE_SUCCESS == mca_oob_ud_find_pending_recv (*name, tag, &recv_req)) {
mca_oob_ud_req_abort (recv_req);
matched = true;
}
/* cancel active receives */
while (ORTE_SUCCESS == mca_oob_ud_find_active_recv (*name, tag, &recv_req)) {
mca_oob_ud_req_abort (recv_req);
matched = true;
}
return matched ? ORTE_SUCCESS : ORTE_ERR_NOT_FOUND;
}
static void mca_oob_ud_recv_try_to (int fd, short event, void *data)
{
(void) mca_oob_ud_recv_try ((mca_oob_ud_req_t *) data);
}
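/* Start (or restart) the data phase of a receive: acquire a data QP, register the
   user iovecs, build one receive work request per MTU-sized packet (each with a
   leading GRH sge), post them, and send a REPLY control message telling the sender
   which QP and MTU to use. */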
int mca_oob_ud_recv_try (mca_oob_ud_req_t *recv_req)
{
int rc, data_len;
int wr_count, sge_count, wr_index, sge_index, iov_index;
unsigned int iov_left, iov_offset, packet_size;
const unsigned int mtu = recv_req->req_mtu;
struct timeval aquire_timeout = {0, 500000};
mca_oob_ud_msg_t *rep_msg = NULL;
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:recv_try receiving from %s. rem ctx = %p",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&recv_req->req_peer->peer_name),
recv_req->req_rem_ctx));
do {
if (NULL == recv_req->req_qp) {
rc = mca_oob_ud_qp_data_aquire (recv_req->req_port, &recv_req->req_qp);
if (ORTE_SUCCESS != rc) {
break;
}
}
(void) mca_oob_ud_qp_purge (recv_req->req_qp);
        rc = mca_oob_ud_msg_get (recv_req->req_port, recv_req, &recv_req->req_port->listen_qp,
                                 recv_req->req_peer, false, &rep_msg);
if (ORTE_SUCCESS != rc) {
break;
}
if (NULL == recv_req->req_mr) {
/* allocate space for memory registers */
recv_req->req_mr = (struct ibv_mr **) calloc (recv_req->req_count, sizeof (struct ibv_mr *));
if (NULL == recv_req->req_mr) {
opal_output (0, "%s oob:ud:recv_try error allocating space for memory registers. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
rc = ORTE_ERR_OUT_OF_RESOURCE;
break;
}
}
rc = mca_oob_ud_register_iov (recv_req->req_uiov, recv_req->req_count,
recv_req->req_mr, recv_req->req_port->device->ib_pd,
mtu, &sge_count, &wr_count, &data_len);
if (ORTE_SUCCESS != rc) {
break;
}
data_len = min(data_len, recv_req->req_rem_data_len);
if (data_len < recv_req->req_rem_data_len && !(recv_req->req_flags & ORTE_RML_TRUNC)) {
/* receive buffers are not big enough and ORTE_RML_TRUNC was not specified.
this is probably an error condition */
rc = ORTE_ERR_BAD_PARAM;
break;
}
wr_count = (data_len + mtu - 1) / mtu;
sge_count += wr_count;
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:recv_try receiving %d bytes in %d "
"work requests, %d sges", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), data_len,
wr_count, sge_count));
recv_req->req_packet_count = wr_count;
if (NULL == recv_req->req_wr.recv) {
/* allocate work requests */
recv_req->req_wr.recv = (struct ibv_recv_wr *) calloc (wr_count, sizeof (struct ibv_recv_wr));
if (NULL == recv_req->req_wr.recv) {
opal_output (0, "%s oob:ud:recv_try error allocating work requests. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
rc = ORTE_ERR_OUT_OF_RESOURCE;
break;
}
}
if (NULL == recv_req->req_sge) {
/* allocate scatter-gather lists. we need more to hold the grh */
recv_req->req_sge = (struct ibv_sge *) calloc (sge_count, sizeof (struct ibv_sge));
if (NULL == recv_req->req_sge) {
opal_output (0, "%s oob:ud:recv_try error allocating sges. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
rc = ORTE_ERR_OUT_OF_RESOURCE;
break;
}
}
if (NULL == recv_req->req_grh) {
/* allocate grh buffers */
recv_req->req_grh = (struct ibv_grh *) calloc (wr_count, sizeof (struct ibv_grh));
if (NULL == recv_req->req_grh) {
opal_output (0, "%s oob:ud:recv_try error allocating space for GRHs. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
rc = ORTE_ERR_OUT_OF_RESOURCE;
break;
}
}
if (NULL == recv_req->req_grh_mr) {
/* register grh buffers */
recv_req->req_grh_mr = ibv_reg_mr (recv_req->req_port->device->ib_pd, recv_req->req_grh,
wr_count * sizeof (struct ibv_grh),
IBV_ACCESS_LOCAL_WRITE);
if (NULL == recv_req->req_grh_mr) {
opal_output (0, "%s oob:ud:recv_try error allocating registering GRH memory. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
/* could not register memory */
rc = ORTE_ERR_OUT_OF_RESOURCE;
break;
}
}
rc = ORTE_SUCCESS;
iov_left = recv_req->req_uiov[0].iov_len;
iov_offset = 0;
iov_index = 0;
for (wr_index = 0, sge_index = 0 ; wr_index < wr_count ; ++wr_index) {
int sge_first = sge_index;
packet_size = 0;
            if (sge_index >= sge_count) {
                opal_output (0, "%s oob:ud:recv_try sge count exceeded while filling GRH sge. this "
                             "should not happen", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            }
/* grh */
mca_oob_ud_fill_sge(recv_req->req_sge + sge_index++,
recv_req->req_grh + wr_index,
sizeof (struct ibv_grh),
recv_req->req_grh_mr->lkey);
do {
int to_recv = min (iov_left, mtu - packet_size);
                if (sge_index >= sge_count) {
                    /* this should never happen; abort so the problem is caught immediately */
                    opal_output (0, "%s oob:ud:recv_try sge count exceeded while filling data sges",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                    abort ();
                }
mca_oob_ud_fill_sge(recv_req->req_sge + sge_index++,
(char *)recv_req->req_uiov[iov_index].iov_base + iov_offset,
to_recv, recv_req->req_mr[iov_index]->lkey);
iov_offset += to_recv;
iov_left -= to_recv;
packet_size += to_recv;
if (0 == iov_left) {
iov_index++;
iov_offset = 0;
if (iov_index < recv_req->req_count) {
iov_left = recv_req->req_uiov[iov_index].iov_len;
}
}
} while ((packet_size < mtu) && (iov_left > 0));
mca_oob_ud_fill_recv_wr(recv_req->req_wr.recv + wr_index,
recv_req->req_sge + sge_first,
sge_index - sge_first);
if (wr_index + 1 < wr_count) {
recv_req->req_wr.recv[wr_index].next = recv_req->req_wr.recv + wr_index + 1;
}
}
rc = mca_oob_ud_qp_post_recv (recv_req->req_qp, recv_req->req_wr.recv);
if (ORTE_SUCCESS != rc) {
break;
}
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:recv_try posting reply message",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* ok, we have a data queue pair */
rep_msg->hdr->msg_type = MCA_OOB_UD_MSG_REPLY;
rep_msg->hdr->msg_lcl_ctx = recv_req->req_rem_ctx;
rep_msg->hdr->msg_rem_ctx = recv_req;
rep_msg->hdr->msg_data.rep.qpn = recv_req->req_qp->ib_qp->qp_num;
rep_msg->hdr->msg_data.rep.data_len = data_len;
rep_msg->hdr->msg_data.rep.mtu = mtu;
rc = mca_oob_ud_msg_post_send (rep_msg);
/* post send already returned the message */
rep_msg = NULL;
} while (0);
if (ORTE_ERR_TEMP_OUT_OF_RESOURCE == rc) {
mca_oob_ud_req_timer_set (recv_req, &aquire_timeout, 1, mca_oob_ud_recv_try_to);
rc = ORTE_SUCCESS;
}
if (ORTE_SUCCESS != rc) {
/* bad stuff happened */
if (MCA_OOB_UD_REQ_UNEX != recv_req->type) {
mca_oob_ud_req_complete (recv_req, rc);
}
OBJ_RELEASE(recv_req);
return rc;
}
recv_req->state = MCA_OOB_UD_REQ_ACTIVE;
return rc;
}
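/* Called once all data for a receive should have arrived. For non-eager receives the
   data QP is polled for the expected number of completions; if any are missing,
   errored, or out of order the receive is retried, otherwise a DATA_OK control
   message is sent back to the sender. Eager data arrives with the request itself,
   so no polling is needed in that case. */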
int mca_oob_ud_recv_complete (mca_oob_ud_req_t *recv_req)
{
mca_oob_ud_msg_t *dataok;
int i, j, rc = ORTE_SUCCESS;
uint32_t expected;
bool error = false, out_of_order = false;
#if defined(HAVE_VALGRIND)
int iov_index;
#endif
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:recv_complete req = %p",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) recv_req));
if (false == recv_req->req_is_eager) {
for (i = 0, expected = 0 ; i < recv_req->req_packet_count ; ) {
struct ibv_wc wc[10];
rc = ibv_poll_cq (recv_req->req_qp->ib_recv_cq, 10, wc);
for (j = 0 ; j < rc ; ++j) {
if (wc[j].imm_data != expected) {
out_of_order = true;
}
if (IBV_WC_SUCCESS != wc[j].status) {
error = true;
}
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:recv_complete wc status = %d. imm data = %d. "
"len = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), wc[j].status, wc[j].imm_data,
wc[j].byte_len));
expected++;
}
if (rc <= 0) {
break;
}
i += rc;
}
if (i != recv_req->req_packet_count || error || out_of_order) {
/* retry */
recv_req->state = MCA_OOB_UD_REQ_PENDING;
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:recv_complete receive incomplete. error: %d, "
"out_of_order: %d packets: %d/%d. rc = %d, errno = %d. flags = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), error, out_of_order, i,
recv_req->req_packet_count, rc, errno, recv_req->req_flags));
mca_oob_ud_recv_try (recv_req);
return ORTE_SUCCESS;
}
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:recv_complete data received ok!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* send data ok and wait for ack */
rc = mca_oob_ud_msg_get (recv_req->req_port, recv_req, &recv_req->req_port->listen_qp,
recv_req->req_peer, false, &dataok);
if (ORTE_SUCCESS != rc) {
return rc;
}
dataok->hdr->msg_type = MCA_OOB_UD_MSG_DATA_OK;
dataok->hdr->msg_lcl_ctx = recv_req->req_rem_ctx;
rc = mca_oob_ud_msg_post_send (dataok);
if (ORTE_SUCCESS != rc) {
return rc;
}
}
#if defined(HAVE_VALGRIND)
for (iov_index = 0 ; iov_index < recv_req->req_count ; ++iov_index) {
VALGRIND_MAKE_MEM_DEFINED(recv_req->req_uiov[iov_index].iov_base,
recv_req->req_uiov[iov_index].iov_len);
}
#endif
if (MCA_OOB_UD_REQ_UNEX != recv_req->type) {
mca_oob_ud_req_complete (recv_req, (ORTE_SUCCESS == rc) ? recv_req->req_rem_data_len : rc);
} else {
mca_oob_ud_recv_unex_complete (recv_req);
}
return ORTE_SUCCESS;
}
static int mca_oob_ud_recv_unex_complete (mca_oob_ud_req_t *req)
{
mca_oob_ud_req_t *recv_req;
int rc;
rc = mca_oob_ud_find_pending_recv (req->req_origin, req->req_tag, &recv_req);
if (ORTE_SUCCESS == rc) {
return mca_oob_ud_recv_copy (recv_req, req);
}
mca_oob_ud_req_append_to_list (req, &mca_oob_ud_component.ud_unexpected_recvs);
return ORTE_SUCCESS;
}
int mca_oob_ud_recv_match_send (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg_hdr,
mca_oob_ud_req_t **reqp)
{
char *data = (msg_hdr->msg_data.req.data_follows ? (char *)(msg_hdr + 1) : NULL);
mca_oob_ud_req_t *req;
int rc, i;
*reqp = NULL;
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:recv_incoming_send matching incoming "
"send from peer %s with tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&msg_hdr->msg_origin), msg_hdr->msg_data.req.tag));
rc = mca_oob_ud_get_recv_req (msg_hdr->msg_origin, msg_hdr->msg_data.req.tag, &req);
if (ORTE_SUCCESS != rc) {
return rc;
}
req->req_rem_ctx = msg_hdr->msg_rem_ctx;
req->req_port = port;
req->req_mtu = min(port->mtu, msg_hdr->msg_data.req.mtu);
req->req_target = msg_hdr->ra.name;
req->req_rem_data_len = msg_hdr->msg_data.req.data_len;
do {
rc = mca_oob_ud_recv_alloc (req);
if (ORTE_SUCCESS != rc) {
opal_output (0, "%s oob:ud:recv_start malloc failed!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
if (MCA_OOB_UD_REQ_UNEX == req->type) {
free (req->req_uiov);
OBJ_RELEASE(req);
}
req = NULL;
break;
}
req->req_peer = peer;
OBJ_RETAIN(req->req_peer);
if (NULL == data) {
req->state = MCA_OOB_UD_REQ_ACTIVE;
break;
}
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:recv_incoming_send send was eager",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
req->req_is_eager = true;
for (i = 0 ; i < req->req_count; ++i) {
memcpy (req->req_uiov[i].iov_base, data, req->req_uiov[i].iov_len);
data += req->req_uiov[i].iov_len;
}
req->state = MCA_OOB_UD_REQ_COMPLETE;
} while (0);
*reqp = req;
return rc;
}

422
orte/mca/oob/ud/oob_ud_req.c Normal file
View file

@@ -0,0 +1,422 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "oob_ud.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
static void mca_oob_ud_req_constuct (mca_oob_ud_req_t *req);
static void mca_oob_ud_req_destruct (mca_oob_ud_req_t *req);
OBJ_CLASS_INSTANCE(mca_oob_ud_req_t, opal_list_item_t, mca_oob_ud_req_constuct,
mca_oob_ud_req_destruct);
static void mca_oob_ud_msg_destruct (mca_oob_ud_msg_t *msg);
static void mca_oob_ud_msg_construct (mca_oob_ud_msg_t *msg);
OBJ_CLASS_INSTANCE(mca_oob_ud_msg_t, opal_free_list_item_t,
mca_oob_ud_msg_construct,
mca_oob_ud_msg_destruct);
static void mca_oob_ud_req_constuct (mca_oob_ud_req_t *req)
{
memset ((char *)req + sizeof (req->super), 0, sizeof (*req) - sizeof (req->super));
}
static void mca_oob_ud_req_destruct (mca_oob_ud_req_t *req)
{
int i;
if (req->req_peer) {
OBJ_RELEASE(req->req_peer);
}
if (req->req_wr.send) {
free (req->req_wr.send);
}
if (req->req_grh_mr) {
(void) ibv_dereg_mr (req->req_grh_mr);
}
if (req->req_grh) {
free (req->req_grh);
}
if (req->req_sge) {
free (req->req_sge);
}
if (req->req_mr) {
for (i = 0 ; i < req->req_count ; ++i) {
if (req->req_mr[i]) {
(void) ibv_dereg_mr (req->req_mr[i]);
}
}
/* these should have already been deregistered */
free (req->req_mr);
}
}
void mca_oob_ud_req_timer_set (mca_oob_ud_req_t *req, const struct timeval *timeout,
int max_tries, void (*cb)(evutil_socket_t, short, void *))
{
opal_event_evtimer_set (opal_event_base, &req->timer.event, cb, (void *) req);
req->timer.value.tv_sec = timeout->tv_sec;
req->timer.value.tv_usec = timeout->tv_usec;
opal_event_evtimer_add (&req->timer.event, &req->timer.value);
}
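/* Grab a pre-registered control message from the port's free list and initialize its
   header, scatter-gather entry, work request, and return address for the given peer
   and QP. */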
int mca_oob_ud_msg_get (struct mca_oob_ud_port_t *port, mca_oob_ud_req_t *req,
mca_oob_ud_qp_t *qp, mca_oob_ud_peer_t *peer, bool persist,
mca_oob_ud_msg_t **msgp)
{
opal_free_list_item_t *item;
opal_free_list_t *list = &port->free_msgs;
int rc;
OPAL_FREE_LIST_WAIT(list, item, rc);
if (OPAL_SUCCESS != rc) {
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:msg_get error getting message "
"buffer. rc = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc));
return ORTE_ERROR;
}
*msgp = (mca_oob_ud_msg_t *) item;
(*msgp)->persist = persist;
(*msgp)->req = req;
(*msgp)->peer = peer;
(*msgp)->qp = qp;
if (NULL != peer) {
OBJ_RETAIN(peer);
}
memset ((*msgp)->hdr, 0, sizeof (*((*msgp)->hdr)));
mca_oob_ud_fill_sge (&(*msgp)->sge, (*msgp)->hdr, 2048, (*msgp)->mr->lkey);
mca_oob_ud_fill_send_wr (&(*msgp)->wr, &(*msgp)->sge, 1, peer);
/* set return address */
(*msgp)->hdr->ra.name = *ORTE_PROC_MY_NAME;
(*msgp)->hdr->ra.qkey = 0;
(*msgp)->hdr->ra.port_num = port->port_num;
return ORTE_SUCCESS;
}
int mca_oob_ud_msg_init (mca_oob_ud_msg_t *msg, struct mca_oob_ud_port_t *port,
char *buf, struct ibv_mr *mr)
{
msg->port = port;
msg->hdr = (mca_oob_ud_msg_hdr_t *) buf;
msg->mr = mr;
return ORTE_SUCCESS;
}
void mca_oob_ud_msg_return (mca_oob_ud_msg_t *msg)
{
opal_free_list_t *list = &msg->port->free_msgs;
if (NULL != msg->peer) {
mca_oob_ud_peer_release (msg->peer);
}
msg->peer = NULL;
msg->cbfunc = NULL;
msg->qp = NULL;
msg->req = NULL;
OPAL_FREE_LIST_RETURN(list, msg);
}
static void mca_oob_ud_msg_construct (mca_oob_ud_msg_t *msg)
{
memset ((char *)msg + sizeof (msg->super), 0, sizeof (*msg) - sizeof (msg->super));
OBJ_CONSTRUCT(&msg->status_changed, opal_condition_t);
OBJ_CONSTRUCT(&msg->lock, opal_mutex_t);
}
static void mca_oob_ud_msg_destruct (mca_oob_ud_msg_t *msg)
{
OBJ_DESTRUCT(&msg->status_changed);
OBJ_DESTRUCT(&msg->lock);
if (NULL != msg->peer) {
mca_oob_ud_peer_release (msg->peer);
}
}
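/* Post a control message. ACKs and NACKs are posted directly on the QP and are never
   retransmitted; all other message types go through the peer so they are tracked for
   acknowledgment and retransmission. */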
int mca_oob_ud_msg_post_send (mca_oob_ud_msg_t *msg)
{
int rc = ORTE_SUCCESS;
msg->status = MCA_OOB_UD_MSG_STATUS_POSTED;
OPAL_THREAD_LOCK(&msg->peer->peer_lock);
if (MCA_OOB_UD_MSG_ACK == msg->hdr->msg_type ||
MCA_OOB_UD_MSG_NACK == msg->hdr->msg_type) {
rc = mca_oob_ud_qp_post_send (msg->qp, &msg->wr, 1);
} else {
rc = mca_oob_ud_peer_post_msg (msg->peer, msg);
}
    OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:msg_post_send posted send for msg %p with id %" PRIu64
                         ". rc = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) msg, msg->hdr->msg_id, rc));
    OPAL_THREAD_UNLOCK(&msg->peer->peer_lock);
    if (ORTE_SUCCESS != rc && false == msg->persist) {
        /* the send failed and the message is not persistent, so return it now.
           this must happen after the unlock above because returning the message
           drops its reference to the peer. */
        msg->status = MCA_OOB_UD_MSG_STATUS_ERROR;
        mca_oob_ud_msg_return (msg);
    }
    return rc;
}
int mca_oob_ud_msg_status_update (mca_oob_ud_msg_t *msg, mca_oob_ud_status_t status)
{
int rc;
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:msg_status_update setting status of msg %p "
"to %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) msg, (int) status));
OPAL_THREAD_LOCK(&msg->lock);
if (status != msg->status) {
if (MCA_OOB_UD_MSG_STATUS_COMPLETE == status) {
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:msg_status_update setting peer %s as "
"available", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&msg->peer->peer_name)));
msg->peer->peer_available = true;
}
switch (status) {
case MCA_OOB_UD_MSG_STATUS_TIMEOUT:
rc = ORTE_ERR_TIMEOUT;
break;
case MCA_OOB_UD_MSG_STATUS_COMPLETE:
rc = ORTE_SUCCESS;
break;
case MCA_OOB_UD_MSG_STATUS_ERROR:
default:
rc = ORTE_ERROR;
}
if (msg->cbfunc) {
msg->cbfunc (msg, rc);
}
/* signal status change */
msg->status = status;
opal_condition_signal (&msg->status_changed);
OPAL_THREAD_UNLOCK(&msg->lock);
if (false == msg->persist) {
mca_oob_ud_msg_return (msg);
}
return ORTE_SUCCESS;
}
OPAL_THREAD_UNLOCK(&msg->lock);
return ORTE_SUCCESS;
}
static void mca_oob_ud_req_return (mca_oob_ud_req_t *req)
{
OPAL_OUTPUT_VERBOSE((15, mca_oob_base_output, "%s oob:ud:req_return returning req %p",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) req));
mca_oob_ud_req_append_to_list (req, NULL);
if (NULL != req->req_peer) {
mca_oob_ud_peer_release (req->req_peer);
req->req_peer = NULL;
}
if (NULL != req->req_wr.send) {
free (req->req_wr.send);
req->req_wr.send = NULL;
}
if (NULL != req->req_sge) {
free (req->req_sge);
req->req_sge = NULL;
}
if (ORTE_RML_PERSISTENT & req->req_flags) {
if (ORTE_RML_ALLOC & req->req_flags) {
int iov_index = req->req_count - 1;
            /* NTH: caller took possession of the buffer */
if (req->req_uiov[iov_index].iov_base) {
req->req_uiov[iov_index].iov_base = NULL;
req->req_uiov[iov_index].iov_len = 0;
}
}
mca_oob_ud_recv_match (req);
} else {
OBJ_RELEASE(req);
}
}
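/* Finish a request: release its data QP, deregister the user memory, and invoke the
   user callback. Unless the request allows recursive callbacks, completed requests
   are queued on the ud_completed list and drained here so callbacks never nest. */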
void mca_oob_ud_req_complete (mca_oob_ud_req_t *req, int rc)
{
int size, i;
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:req_complete request %p completed with status %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) req, rc));
if (NULL != req->req_qp) {
(void) mca_oob_ud_qp_data_release (req->req_qp);
req->req_qp = NULL;
}
/* deregister memory *before* handing it to the callback */
if (req->req_mr) {
for (i = 0 ; i < req->req_count ; ++i) {
if (req->req_mr[i]) {
(void) ibv_dereg_mr (req->req_mr[i]);
req->req_mr[i] = NULL;
}
}
}
if (req->req_cbfunc) {
req->req_rc = rc;
if ((req->req_flags & ORTE_RML_FLAG_RECURSIVE_CALLBACK) == 0) {
OPAL_THREAD_LOCK (&mca_oob_ud_component.ud_lock);
mca_oob_ud_req_append_to_list (req, &mca_oob_ud_component.ud_completed);
size = opal_list_get_size (&mca_oob_ud_component.ud_completed);
OPAL_THREAD_UNLOCK (&mca_oob_ud_component.ud_lock);
if (size > 1) {
return;
}
}
req->req_cbfunc (req->req_rc, &req->req_target, req->req_uiov, req->req_count,
req->req_tag, req->req_cbdata);
if ((req->req_flags & ORTE_RML_FLAG_RECURSIVE_CALLBACK) == 0) {
opal_list_item_t* item;
OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_lock);
mca_oob_ud_req_return (req);
while(NULL !=
(item = opal_list_remove_first(&mca_oob_ud_component.ud_completed))) {
req = (mca_oob_ud_req_t *) item;
req->req_list = NULL;
OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_lock);
req->req_cbfunc (req->req_rc, &req->req_target, req->req_uiov, req->req_count,
req->req_tag, req->req_cbdata);
OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_lock);
mca_oob_ud_req_return (req);
}
OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_lock);
} else {
mca_oob_ud_req_return (req);
}
} else {
mca_oob_ud_req_return (req);
}
}
void mca_oob_ud_req_append_to_list (mca_oob_ud_req_t *req, opal_list_t *list)
{
OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_match_lock);
if (NULL != req->req_list) {
opal_list_remove_item (req->req_list, (opal_list_item_t *) req);
}
if (NULL != list) {
opal_list_append (list, (opal_list_item_t *) req);
}
req->req_list = list;
OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_match_lock);
}
bool mca_oob_ud_req_is_in_list (mca_oob_ud_req_t *req, opal_list_t *list)
{
opal_list_item_t *item;
bool rc = false;
OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_match_lock);
for (item = opal_list_get_first (list) ;
item != opal_list_get_end (list) ;
item = opal_list_get_next (item)) {
if (item == (opal_list_item_t *) req) {
rc = true;
break;
}
}
OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_match_lock);
return rc;
}
void mca_oob_ud_req_abort (mca_oob_ud_req_t *req)
{
/* caller should have removed this request from any owner list */
req->req_list = NULL;
if (NULL != req->req_qp) {
mca_oob_ud_qp_data_release (req->req_qp);
req->req_qp = NULL;
}
/* don't call the callback */
req->req_cbfunc = NULL;
/* make sure the request is freed */
req->req_flags = 0;
/* free up request resources */
mca_oob_ud_req_complete (req, ORTE_ERR_INTERUPTED);
}
int mca_oob_ud_msg_wait (mca_oob_ud_msg_t *msg)
{
OPAL_THREAD_LOCK(&msg->lock);
/* wait for ack */
while (MCA_OOB_UD_MSG_STATUS_POSTED == msg->status) {
opal_condition_wait (&msg->status_changed, &msg->lock);
}
OPAL_THREAD_UNLOCK(&msg->lock);
switch (msg->status) {
case MCA_OOB_UD_MSG_STATUS_TIMEOUT:
return ORTE_ERR_TIMEOUT;
case MCA_OOB_UD_MSG_STATUS_COMPLETE:
return ORTE_SUCCESS;
case MCA_OOB_UD_MSG_STATUS_ERROR:
default:
return ORTE_ERROR;
}
}

231
orte/mca/oob/ud/oob_ud_req.h Normal file
View file

@@ -0,0 +1,231 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#if !defined(MCA_OOB_UD_REQ_H)
#define MCA_OOB_UD_REQ_H
#include "orte_config.h"
#include "orte/types.h"
#include "opal/threads/condition.h"
#include "opal/mca/event/event.h"
#include "orte/mca/rml/rml.h"
#include <infiniband/verbs.h>
#include "oob_ud_qp.h"
struct mca_oob_ud_peer_t;
enum mca_oob_ud_req_type_t {
MCA_OOB_UD_REQ_RECV,
MCA_OOB_UD_REQ_SEND,
MCA_OOB_UD_REQ_UNEX
};
typedef enum mca_oob_ud_req_type_t mca_oob_ud_req_type_t;
enum mca_oob_ud_req_state_t {
MCA_OOB_UD_REQ_ACTIVE,
MCA_OOB_UD_REQ_PENDING,
MCA_OOB_UD_REQ_COMPLETE
};
typedef enum mca_oob_ud_req_state_t mca_oob_ud_req_state_t;
struct mca_oob_ud_req_t {
opal_list_item_t super;
mca_oob_ud_req_type_t type;
mca_oob_ud_req_state_t state;
union {
struct ibv_send_wr *send;
struct ibv_recv_wr *recv;
} req_wr;
/* storage for ib grh */
struct ibv_grh *req_grh;
struct ibv_mr *req_grh_mr;
/* memory register for iovec memory */
struct ibv_mr **req_mr;
struct ibv_sge *req_sge;
/* negotiated mtu */
int req_mtu;
uint32_t req_rem_qpn;
int req_rem_data_len;
int req_packet_count;
struct mca_oob_ud_peer_t *req_peer;
struct mca_oob_ud_port_t *req_port;
struct mca_oob_ud_qp_t *req_qp;
/* remote context (request or response) */
void *req_rem_ctx;
/* retry timer */
struct {
opal_event_t event;
struct timeval value;
} timer;
/* user request */
orte_process_name_t req_target;
orte_process_name_t req_origin;
struct iovec *req_uiov;
int req_count;
int req_tag;
int req_flags;
int req_rc;
orte_rml_callback_fn_t req_cbfunc;
void *req_cbdata;
/* what list is this request in */
opal_list_t *req_list;
bool req_is_eager;
};
typedef struct mca_oob_ud_req_t mca_oob_ud_req_t;
OBJ_CLASS_DECLARATION(mca_oob_ud_req_t);
enum mca_oob_ud_status_t {
/* message posted */
MCA_OOB_UD_MSG_STATUS_POSTED,
/* remote side receive the message (ack'd) */
MCA_OOB_UD_MSG_STATUS_COMPLETE,
/* request message timed out */
MCA_OOB_UD_MSG_STATUS_TIMEOUT,
/* other failure */
MCA_OOB_UD_MSG_STATUS_ERROR
};
typedef enum mca_oob_ud_status_t mca_oob_ud_status_t;
enum mca_oob_ud_msg_type_t {
MCA_OOB_UD_MSG_REQUEST = 37,
MCA_OOB_UD_MSG_REPLY = 38,
MCA_OOB_UD_MSG_COMPLETE = 39,
MCA_OOB_UD_MSG_PING = 40,
MCA_OOB_UD_MSG_ACK = 41,
MCA_OOB_UD_MSG_NACK = 42,
MCA_OOB_UD_MSG_DATA_OK = 43,
MCA_OOB_UD_MSG_END = 44
};
typedef enum mca_oob_ud_msg_type_t mca_oob_ud_msg_type_t;
struct mca_oob_ud_msg_hdr_t {
mca_oob_ud_msg_type_t msg_type;
void *msg_rem_ctx;
void *msg_lcl_ctx;
orte_process_name_t msg_origin;
uint64_t msg_id;
struct {
/* the receiver can get the qpn and lid from the work completion */
uint32_t qkey;
orte_process_name_t name;
uint8_t port_num;
} ra;
union {
struct {
int tag;
int data_len;
int mtu;
bool data_follows;
} req;
struct {
uint32_t qpn;
int data_len;
int tag;
int mtu;
} rep;
} msg_data;
};
typedef struct mca_oob_ud_msg_hdr_t mca_oob_ud_msg_hdr_t;
struct mca_oob_ud_msg_t {
opal_free_list_item_t super;
struct ibv_send_wr wr;
struct ibv_sge sge;
mca_oob_ud_msg_hdr_t *hdr;
struct ibv_mr *mr;
/* qp this request was sent over */
struct mca_oob_ud_qp_t *qp;
struct mca_oob_ud_port_t *port;
opal_mutex_t lock;
opal_condition_t status_changed;
mca_oob_ud_status_t status;
bool persist;
mca_oob_ud_req_t *req;
void (*cbfunc) (struct mca_oob_ud_msg_t *, int);
struct mca_oob_ud_peer_t *peer;
};
typedef struct mca_oob_ud_msg_t mca_oob_ud_msg_t;
OBJ_CLASS_DECLARATION(mca_oob_ud_msg_t);
static inline int mca_oob_ud_recv_alloc (mca_oob_ud_req_t *recv_req)
{
int iov_index;
if (recv_req->req_flags & ORTE_RML_ALLOC) {
size_t alloc_size = recv_req->req_rem_data_len;
for (iov_index = 0 ; iov_index < recv_req->req_count - 1 ; ++iov_index) {
alloc_size -= recv_req->req_uiov[iov_index].iov_len;
}
recv_req->req_uiov[iov_index].iov_len = alloc_size;
recv_req->req_uiov[iov_index].iov_base = calloc (alloc_size, 1);
if (NULL == recv_req->req_uiov[iov_index].iov_base) {
return ORTE_ERROR;
}
}
return ORTE_SUCCESS;
}
int mca_oob_ud_msg_get (struct mca_oob_ud_port_t *port, mca_oob_ud_req_t *req,
mca_oob_ud_qp_t *qp, mca_oob_ud_peer_t *peer, bool persist,
mca_oob_ud_msg_t **msgp);
int mca_oob_ud_msg_init (mca_oob_ud_msg_t *msg, struct mca_oob_ud_port_t *port,
char *buf, struct ibv_mr *mr);
void mca_oob_ud_msg_return (mca_oob_ud_msg_t *msg);
void mca_oob_ud_req_timer_set (mca_oob_ud_req_t *req, const struct timeval *timeout,
int max_tries, void (*cb)(evutil_socket_t, short, void *));
int mca_oob_ud_msg_post_send (mca_oob_ud_msg_t *msg);
int mca_oob_ud_msg_wait (mca_oob_ud_msg_t *msg);
int mca_oob_ud_msg_status_update (mca_oob_ud_msg_t *msg, mca_oob_ud_status_t status);
void mca_oob_ud_req_complete (mca_oob_ud_req_t *req, int rc);
void mca_oob_ud_req_abort (mca_oob_ud_req_t *req);
void mca_oob_ud_req_append_to_list (mca_oob_ud_req_t *req, opal_list_t *list);
bool mca_oob_ud_req_is_in_list (mca_oob_ud_req_t *req, opal_list_t *list);
#endif

399
orte/mca/oob/ud/oob_ud_send.c Normal file
View file

@@ -0,0 +1,399 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "oob_ud.h"
#define min(a,b) ((a) < (b) ? (a) : (b))
#define MCA_OOB_UD_IOV_SIZE(iovec, count, size) \
do { \
int i; \
for (i = 0, (size) = 0 ; i < (count) ; ++i) { \
(size) += (iovec)[i].iov_len; \
} \
} while (0);
static void mca_oob_ud_send_cb (mca_oob_ud_msg_t *msg, int rc)
{
mca_oob_ud_send_complete (msg->req, ORTE_SUCCESS == rc ? msg->hdr->msg_data.req.data_len : rc);
}
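/* Handle a send addressed to our own process: match (or create) the local receive
   request, copy the iovecs directly, queue the receive completion, and invoke the
   send callback immediately. */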
static int mca_oob_ud_send_self (struct iovec* iov, int count, int tag,
int flags, orte_rml_callback_fn_t cbfunc,
void* cbdata)
{
unsigned int srco, dsto;
mca_oob_ud_req_t *req;
int srci, dsti;
int rc, size;
MCA_OOB_UD_IOV_SIZE(iov, count, size);
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s mca_oob_ud_send_self: sending %d bytes to myself",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), size));
rc = mca_oob_ud_get_recv_req (*ORTE_PROC_MY_NAME, tag, &req);
if (ORTE_SUCCESS != rc) {
return rc;
}
req->req_rem_data_len = size;
req->req_is_eager = true;
rc = mca_oob_ud_recv_alloc (req);
if (ORTE_SUCCESS != rc) {
opal_output (0, "%s oob:ud:recv_start malloc failed!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
if (MCA_OOB_UD_REQ_UNEX == req->type) {
free (req->req_uiov);
OBJ_RELEASE(req);
}
return rc;
}
srci = dsti = 0;
srco = dsto = 0;
do {
size_t copy = min(iov[srci].iov_len - srco,
req->req_uiov[dsti].iov_len - dsto);
memmove ((unsigned char *) req->req_uiov[dsti].iov_base + dsto,
(unsigned char *) iov[srci].iov_base + srco, copy);
srco += copy;
if (srco == iov[srci].iov_len) {
srci++;
srco = 0;
}
dsto += copy;
if (dsto == req->req_uiov[dsti].iov_len) {
dsti++;
dsto = 0;
}
} while (srci < req->req_count && dsti < count);
req->state = MCA_OOB_UD_REQ_COMPLETE;
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s mca_oob_ud_send_self: complete. calling callbacks",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* queue up recv callback */
mca_oob_ud_event_queue_completed (req);
if (NULL != cbfunc) {
cbfunc (ORTE_SUCCESS, ORTE_PROC_MY_NAME,
iov, count, tag, cbdata);
}
return size;
}
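/* Entry point for non-blocking sends. Messages that fit in a single MTU (including
   the message header) are packed into the REQUEST control message and sent eagerly.
   Larger messages post a REQUEST and sit on the active send list; the data itself is
   streamed later by mca_oob_ud_send_try once the receiver has supplied a data QP. */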
int mca_oob_ud_send_nb(orte_process_name_t* target,
orte_process_name_t* origin,
struct iovec* iov, int count, int tag,
int flags, orte_rml_callback_fn_t cbfunc,
void* cbdata)
{
mca_oob_ud_peer_t *peer;
mca_oob_ud_port_t *port;
mca_oob_ud_msg_t *req_msg;
mca_oob_ud_req_t *send_req;
bool send_eager = false;
char *pack_ptr;
int rc, size, i;
if (OPAL_EQUAL == orte_util_compare_name_fields
(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, target)) {
return mca_oob_ud_send_self (iov, count, tag, flags,
cbfunc, cbdata);
}
rc = mca_oob_ud_peer_lookup (target, &peer);
if(ORTE_SUCCESS != rc || NULL == peer) {
opal_output (0, "%s oob:ud:send_nb peer %s not found",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(target));
return (NULL == peer) ? ORTE_ERR_UNREACH : rc;
}
/* NTH: TODO -- get a random port? */
port = (mca_oob_ud_port_t *) opal_list_get_first (&((mca_oob_ud_device_t *)peer->peer_context)->ports);
send_req = OBJ_NEW(mca_oob_ud_req_t);
if (!send_req) {
opal_output(0, "oob:ud:send_nb malloc failed! errno = %d", errno);
return ORTE_ERR_OUT_OF_RESOURCE;
}
MCA_OOB_UD_IOV_SIZE(iov, count, size);
/* fill in request */
send_req->req_target = *target;
send_req->req_origin = *origin;
send_req->req_uiov = iov;
send_req->req_count = count;
send_req->req_tag = tag;
send_req->req_flags = flags;
send_req->req_cbfunc = cbfunc;
send_req->req_cbdata = cbdata;
send_req->req_peer = peer;
send_req->req_mtu = port->mtu;
send_req->req_port = port;
send_req->req_rc = 0;
send_req->state = MCA_OOB_UD_REQ_PENDING;
send_req->type = MCA_OOB_UD_REQ_SEND;
OBJ_RETAIN(peer);
if (size + sizeof (mca_oob_ud_msg_hdr_t) <= (unsigned int)port->mtu) {
send_eager = true;
}
rc = mca_oob_ud_msg_get (port, send_req, &port->listen_qp, peer, false, &req_msg);
if (ORTE_SUCCESS != rc) {
OBJ_RELEASE (send_req);
return rc;
}
/* fill in message header */
req_msg->hdr->msg_type = MCA_OOB_UD_MSG_REQUEST;
req_msg->hdr->msg_rem_ctx = send_req;
req_msg->hdr->msg_origin = *origin;
req_msg->hdr->msg_data.req.data_len = size;
req_msg->hdr->msg_data.req.mtu = port->mtu;
req_msg->hdr->msg_data.req.tag = tag;
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s-%s mca_oob_ud_send_nb: tag %d size %lu. msg: %p. peer = %p. req = %p."
"count = %d. uiov = %p. flags = %d\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(target),
tag, (unsigned long)size,
(void *) req_msg,
(void *) peer, (void *) send_req, count, (void *) iov, flags));
if (!send_eager) {
mca_oob_ud_req_append_to_list (send_req, &mca_oob_ud_component.ud_active_sends);
/* send request */
return mca_oob_ud_msg_post_send (req_msg);
}
pack_ptr = (char *)(req_msg->hdr + 1);
for (i = 0 ; i < count ; ++i) {
memcpy (pack_ptr, iov[i].iov_base, iov[i].iov_len);
pack_ptr += iov[i].iov_len;
}
send_req->req_list = NULL;
req_msg->hdr->msg_data.req.data_follows = true;
req_msg->cbfunc = mca_oob_ud_send_cb;
req_msg->req = send_req;
do {
/* send request */
rc = mca_oob_ud_msg_post_send (req_msg);
if (ORTE_SUCCESS != rc) {
opal_output (0, "msg send failed with status = %d", rc);
break;
}
} while (0);
return rc;
}
static void mca_oob_ud_send_try_to (int fd, short event, void *ctx)
{
OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_match_lock);
(void) mca_oob_ud_send_try ((mca_oob_ud_req_t *) ctx);
OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_match_lock);
}
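/* Data phase of a non-eager send: acquire a data QP, register the user iovecs, split
   the data into MTU-sized SEND_WITH_IMM work requests numbered via the immediate
   data, post them toward the receiver's data QP, and follow with a COMPLETE control
   message so the receiver knows to poll for the packets. */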
int mca_oob_ud_send_try (mca_oob_ud_req_t *send_req) {
int wr_index, wr_count, sge_count, sge_index, iov_index;
unsigned int iov_left, iov_offset, packet_size;
const unsigned int mtu = send_req->req_mtu;
const struct timeval aquire_timeout = {0, 500000};
mca_oob_ud_msg_t *com_msg;
int data_len, rc;
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:send_try sending to %s, tag = %d, "
"count = %d. req = %p",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&send_req->req_peer->peer_name),
send_req->req_tag, send_req->req_count,
(void *) send_req));
do {
if (NULL == send_req->req_qp) {
rc = mca_oob_ud_qp_data_aquire (send_req->req_port, &send_req->req_qp);
if (ORTE_SUCCESS != rc) {
break;
}
}
(void) mca_oob_ud_qp_purge (send_req->req_qp);
rc = mca_oob_ud_msg_get (send_req->req_port, send_req, send_req->req_qp, send_req->req_peer, false,
&com_msg);
if (ORTE_SUCCESS != rc) {
break;
}
if (NULL == send_req->req_mr) {
/* allocate space for memory registers */
send_req->req_mr = (struct ibv_mr **) calloc (send_req->req_count, sizeof (struct ibv_mr *));
if (NULL == send_req->req_mr) {
opal_output (0, "%s oob:ud:send_try error allocating space for memory registers. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
rc = ORTE_ERR_OUT_OF_RESOURCE;
break;
}
}
rc = mca_oob_ud_register_iov (send_req->req_uiov, send_req->req_count,
send_req->req_mr, send_req->req_port->device->ib_pd,
mtu, &sge_count, &wr_count, &data_len);
if (ORTE_SUCCESS != rc) {
break;
}
wr_count = (data_len + mtu - 1) / mtu;
OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:send_try sending %d bytes in %d "
"work requests, %d sges. uiov = %p", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), data_len,
wr_count, sge_count, (void *) send_req->req_uiov));
if (wr_count && NULL == send_req->req_wr.send) {
send_req->req_wr.send = (struct ibv_send_wr *) calloc (wr_count, sizeof (struct ibv_send_wr));
if (NULL == send_req->req_wr.send) {
opal_output (0, "%s oob:ud:send_try error allocating work requests. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
rc = ORTE_ERR_OUT_OF_RESOURCE;
break;
}
}
if (wr_count && NULL == send_req->req_sge) {
send_req->req_sge = (struct ibv_sge *) calloc (sge_count, sizeof (struct ibv_sge));
if (NULL == send_req->req_sge) {
opal_output (0, "%s oob:ud:send_try error allocating sges. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
rc = ORTE_ERR_OUT_OF_RESOURCE;
break;
}
}
iov_left = send_req->req_uiov[0].iov_len;
iov_offset = 0;
iov_index = 0;
for (wr_index = 0, sge_index = 0 ; wr_index < wr_count ; ++wr_index) {
int sge_first = sge_index;
packet_size = 0;
do {
int to_send = min (iov_left, mtu - packet_size);
mca_oob_ud_fill_sge(send_req->req_sge + sge_index++,
(char *)send_req->req_uiov[iov_index].iov_base + iov_offset,
to_send, send_req->req_mr[iov_index]->lkey);
iov_offset += to_send;
iov_left -= to_send;
packet_size += to_send;
if (0 == iov_left) {
iov_index++;
iov_offset = 0;
if (iov_index < send_req->req_count) {
iov_left = send_req->req_uiov[iov_index].iov_len;
}
}
} while ((packet_size < mtu) && (iov_left > 0));
mca_oob_ud_fill_send_wr(send_req->req_wr.send + wr_index,
send_req->req_sge + sge_first,
sge_index - sge_first, send_req->req_peer);
/* we don't care about completions for data */
send_req->req_wr.send[wr_index].send_flags = IBV_SEND_SOLICITED;
/* sequence number */
send_req->req_wr.send[wr_index].imm_data = wr_index;
send_req->req_wr.send[wr_index].wr.ud.remote_qpn = send_req->req_rem_qpn;
send_req->req_wr.send[wr_index].opcode = IBV_WR_SEND_WITH_IMM;
if (wr_index + 1 < wr_count) {
send_req->req_wr.send[wr_index].next = send_req->req_wr.send + wr_index + 1;
}
}
/* send data */
rc = mca_oob_ud_qp_post_send (send_req->req_qp, send_req->req_wr.send, 0);
if (ORTE_SUCCESS != rc) {
opal_output (0, "error posting send!");
break;
}
OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:send_try posting completion message",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* Fill in the completion message. This message will go to the peer's listen QP but
           must originate from our data qp to ensure that it is sent last. */
com_msg->hdr->msg_type = MCA_OOB_UD_MSG_COMPLETE;
com_msg->hdr->msg_lcl_ctx = send_req->req_rem_ctx;
com_msg->hdr->msg_rem_ctx = send_req;
/* send message header */
rc = mca_oob_ud_msg_post_send (com_msg);
/* post_send already returned the message */
com_msg = NULL;
} while (0);
if (ORTE_ERR_TEMP_OUT_OF_RESOURCE == rc) {
/* set timer to retry post */
mca_oob_ud_req_timer_set (send_req, &aquire_timeout, 1, mca_oob_ud_send_try_to);
rc = ORTE_SUCCESS;
}
if (ORTE_SUCCESS != rc) {
opal_output (0, "send error! rc = %d", rc);
/* damn */
return mca_oob_ud_send_complete (send_req, rc);
}
send_req->state = MCA_OOB_UD_REQ_ACTIVE;
return rc;
}
int mca_oob_ud_send_complete (mca_oob_ud_req_t *send_req, int rc)
{
mca_oob_ud_req_complete (send_req, (ORTE_SUCCESS == rc) ? send_req->req_rem_data_len : rc);
return rc;
}