diff --git a/orte/mca/oob/ud/Makefile.am b/orte/mca/oob/ud/Makefile.am deleted file mode 100644 index 27f3bab832..0000000000 --- a/orte/mca/oob/ud/Makefile.am +++ /dev/null @@ -1,67 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2017 IBM Corporation. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -AM_CPPFLAGS = $(orte_oob_ud_CPPFLAGS) - -dist_ortedata_DATA = help-oob-ud.txt - -sources = \ - oob_ud_component.h \ - oob_ud_component.c \ - oob_ud.c \ - oob_ud.h \ - oob_ud_event.c \ - oob_ud_peer.c \ - oob_ud_peer.h \ - oob_ud_ping.c \ - oob_ud_ping.h \ - oob_ud_qp.c \ - oob_ud_qp.h \ - oob_ud_recv.c \ - oob_ud_req.c \ - oob_ud_req.h \ - oob_ud_send.c \ - oob_ud_send.h - -# Make the output library in this directory, and name it either -# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la -# (for static builds).
- -if MCA_BUILD_orte_oob_ud_DSO -component_noinst = -component_install = mca_oob_ud.la -else -component_noinst = libmca_oob_ud.la -component_install = -endif - -mcacomponentdir = $(ortelibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_oob_ud_la_SOURCES = $(sources) -mca_oob_ud_la_LDFLAGS = -module -avoid-version $(orte_oob_ud_LDFLAGS) -mca_oob_ud_la_LIBADD = $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \ - $(orte_oob_ud_LIBS) \ - $(OPAL_TOP_BUILDDIR)/opal/mca/common/verbs/lib@OPAL_LIB_PREFIX@mca_common_verbs.la - -noinst_LTLIBRARIES = $(component_noinst) -libmca_oob_ud_la_SOURCES = $(sources) -libmca_oob_ud_la_LDFLAGS = -module -avoid-version $(orte_oob_ud_LDFLAGS) -libmca_oob_ud_la_LIBADD = $(orte_oob_ud_LIBS) - diff --git a/orte/mca/oob/ud/configure.m4 b/orte/mca/oob/ud/configure.m4 deleted file mode 100644 index 70b4edbc24..0000000000 --- a/orte/mca/oob/ud/configure.m4 +++ /dev/null @@ -1,63 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# MCA_orte_oob_ud_CONFIG([action-if-found], [action-if-not-found]) -# ---------------------------------------------------------------- -AC_DEFUN([MCA_orte_oob_ud_CONFIG],[ - # We need to know if we have verbs support - AC_REQUIRE([OPAL_CHECK_VERBS_DIR]) - - AC_CONFIG_FILES([orte/mca/oob/ud/Makefile]) - - # JMS Still have problems with AC_ARG ENABLE not yet having been - # called or CHECK_WITHDIR'ed.
- - orte_oob_ud_check_save_CPPFLAGS=$CPPFLAGS - orte_oob_ud_check_save_LDFLAGS=$LDFLAGS - orte_oob_ud_check_save_LIBS=$LIBS - - OPAL_CHECK_PACKAGE([orte_oob_ud], - [infiniband/verbs.h], - [ibverbs], - [ibv_open_device], - [], - [$opal_verbs_dir], - [$opal_verbs_libdir], - [orte_oob_ud_check_happy=yes], - [orte_oob_ud_check_happy=no]) - - CPPFLAGS=$orte_oob_ud_check_save_CPPFLAGS - LDFLAGS=$orte_oob_ud_check_save_LDFLAGS - LIBS=$orte_oob_ud_check_save_LIBS - - AS_IF([test "$orte_oob_ud_check_happy" = "yes" && test "$opal_want_verbs" != "no"], - [$1], - [AS_IF([test "$opal_want_verbs" = "yes"], - [AC_MSG_WARN([--with-verbs specified, but cannot build this component]) - AC_MSG_ERROR([Cannot continue]) - ]) - $2]) - - # substitute in the things needed to build this component - AC_SUBST([orte_oob_ud_CFLAGS]) - AC_SUBST([orte_oob_ud_CPPFLAGS]) - AC_SUBST([orte_oob_ud_LDFLAGS]) - AC_SUBST([orte_oob_ud_LIBS]) -])dnl diff --git a/orte/mca/oob/ud/help-oob-ud.txt b/orte/mca/oob/ud/help-oob-ud.txt deleted file mode 100644 index 9e9b671845..0000000000 --- a/orte/mca/oob/ud/help-oob-ud.txt +++ /dev/null @@ -1,121 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2006 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# 2015 Mellanox Technologies, Inc. -# All rights reserved. -# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -[no-devices-error] -Open MPI has detected a failure in a basic verbs function call. 
This -is unusual, and may indicate that something is malfunctioning on this -system. - -Your job will continue, but Open MPI will ignore the "ud" oob component -in this run. - -Verbs function: ibv_get_device_list() -Error: %s -Hostname: %s - -Please contact your system administrator. -# -[no-ports-usable] -Open MPI has detected that there are UD-capable Verbs devices on your -system, but none of them were able to be set up properly. This may -indicate a problem on this system. - -Your job will continue, but Open MPI will ignore the "ud" oob component -in this run. - -Hostname: %s -# -[reg-mr-failed] -Failed to register memory region (MR): - -Hostname: %s -Address: %x -Length: %lu -Error: %s -# -[notify-cq-failed] -Failed to request completion notification on a completion queue (CQ): - -Hostname: %s -Error: %s -# -[create-cq-failed] -Failed to create a completion queue (CQ): - -Hostname: %s -Requested CQE: %d -Error: %s - -Check the CQE attribute. -# -[create-qp-failed] -Failed to create a queue pair (QP): - -Hostname: %s -Requested max number of outstanding WRs in the SQ: %u -Requested max number of outstanding WRs in the RQ: %u -Requested max number of SGEs in a WR in the SQ: %u -Requested max number of SGEs in a WR in the RQ: %u -Requested max amount of data (bytes) that can be posted inline to the SQ: %u -Error: %s - -Check requested attributes.
-# -[poll-cq-failed] -Failed to poll the CQ cq for work completions: - -Hostname: %s -Number of entries: %d -Error: %s -# -[poll-cq-failed-wc] -Failed to poll the CQ cq for work completions: - -Hostname: %s -Number of entries: %d -Entry ID : %d -WC status: %d -# -[post-send-failed] -Failed to post a list of work requests (WRs) to a send queue: - -Hostname: %s -Error: %s -# -[post-recv-failed] -Failed to post a list of work requests (WRs) to a receive queue: - -Hostname: %s -Error: %s -# -[modify-qp-failed] -Failed to modify the attributes of a queue pair (QP): - -Hostname: %s -Mask for QP attributes to be modified: %d -Error: %s -# -[destroy-qp-failed] -Failed to destroy a queue pair (QP): - -Hostname: %s -Error: %s -# diff --git a/orte/mca/oob/ud/oob_ud.c b/orte/mca/oob/ud/oob_ud.c deleted file mode 100644 index 247ef4a275..0000000000 --- a/orte/mca/oob/ud/oob_ud.c +++ /dev/null @@ -1,279 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * 2014 Mellanox Technologies, Inc. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#include "orte_config.h" -#include "orte/types.h" -#include "opal/types.h" - -#include "orte/util/name_fns.h" -#include "orte/runtime/orte_globals.h" -#include "orte/util/proc_info.h" -#include "orte/util/show_help.h" - -#include "orte/mca/routed/routed.h" - -#include "oob_ud.h" -#include "oob_ud_send.h" - -#define min(a,b) ((a) < (b) ? 
(a) : (b)) - -static int mca_oob_ud_module_init (void); -static void mca_oob_ud_module_fini (mca_oob_ud_peer_t **peer); -static int mca_oob_ud_set_addr (const orte_process_name_t *name, const char *uri); -static void mca_oob_ud_send_nb(orte_rml_send_t *msg); -static void mca_oob_ud_ping(const orte_process_name_t *proc); - -mca_oob_ud_module_t mca_oob_ud_module = { - { - mca_oob_ud_module_init, - mca_oob_ud_module_fini, - - mca_oob_ud_set_addr, - - mca_oob_ud_ping, - - mca_oob_ud_send_nb - } -}; - -static void mca_oob_ud_send_nb(orte_rml_send_t *msg) { - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s oob:ud:send_nb to peer %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&msg->dst)); - - /* push this into our event base for processing */ - ORTE_ACTIVATE_UD_POST_SEND(msg, mca_oob_ud_process_send_nb); -} - -static void mca_oob_ud_ping(const orte_process_name_t *proc) { - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s oob:ud:ping proc %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc)); - - /* push this into our event base for processing */ - ORTE_ACTIVATE_UD_PING(proc, mca_oob_ud_process_ping); -} - -/* uri must be at least 27 bytes in size */ -void mca_oob_ud_port_get_uri (mca_oob_ud_port_t *port, char *uri) -{ - sprintf (uri, "ud://%u.%u.%u", port->listen_qp.ib_qp->qp_num, - port->lid, port->port_num); -} - -static int mca_oob_ud_set_addr (const orte_process_name_t *name, const char *uri) -{ - mca_oob_ud_peer_t *peer = NULL; - int rc; - - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:set_addr: setting location for peer %s from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(name), uri); - - (void) mca_oob_ud_peer_lookup (name, &peer); - - if (NULL == uri) { - if (NULL != peer) { - mca_oob_ud_peer_release (peer); - } - - peer = NULL; - } else if (NULL == peer) { - peer = mca_oob_ud_peer_from_uri (uri); - if (NULL == peer) { - return 
ORTE_ERR_BAD_PARAM; - } - } else { - rc = mca_oob_ud_peer_update_with_uri (peer, uri); - - if (ORTE_SUCCESS != rc) { - return rc; - } - } - - if (NULL != peer) { - peer->peer_name = *name; - peer->needs_notification = true; - } - - opal_proc_table_set_value(&mca_oob_ud_module.peers, - *name, (void *)peer); - - return ORTE_SUCCESS; -} - -int mca_oob_ud_port_post_one_recv (mca_oob_ud_port_t *port, int msg_num) -{ - char *grh_buf = port->grh_buf.ptr + msg_num * sizeof (struct ibv_grh); - char *msg_buf = port->msg_buf.ptr + msg_num * port->mtu; - struct ibv_recv_wr wr; - struct ibv_sge sge[2]; - - /* GRH */ - mca_oob_ud_fill_sge(sge, grh_buf, sizeof (struct ibv_grh), port->grh_buf.mr->lkey); - - /* message */ - mca_oob_ud_fill_sge(sge + 1, msg_buf, port->mtu, port->msg_buf.mr->lkey); - - mca_oob_ud_fill_recv_wr (&wr, sge, 2); - wr.wr_id = MCA_OOB_UD_RECV_WR | (uint64_t)msg_num; - - return mca_oob_ud_qp_post_recv (&port->listen_qp, &wr); -} - -static bool module_has_been_inited = false; - -static int mca_oob_ud_module_init (void) -{ - /* protect against repeat inits */ - if (module_has_been_inited) { - return ORTE_SUCCESS; - } - module_has_been_inited = true; - - OBJ_CONSTRUCT(&mca_oob_ud_module.peers, opal_proc_table_t); - opal_proc_table_init (&mca_oob_ud_module.peers, 16, 1024); - - return ORTE_SUCCESS; -} - -static void mca_oob_ud_module_fini (mca_oob_ud_peer_t **peer) -{ - opal_process_name_t key; - void *node1, *node2; - int rc; - - rc = opal_proc_table_get_first_key (&mca_oob_ud_module.peers, &key, - (void **) peer, &node1, &node2); - if (OPAL_SUCCESS == rc) { - do { - if (NULL != *peer) { - mca_oob_ud_peer_release (*peer); - } - rc = opal_proc_table_get_next_key (&mca_oob_ud_module.peers, &key, - (void **) peer, node1, &node1, node2, &node2); - } while (OPAL_SUCCESS == rc); - } - - opal_proc_table_remove_all(&mca_oob_ud_module.peers); - - OBJ_DESTRUCT(&mca_oob_ud_module.peers); - - return; -} - -int mca_oob_ud_register_iov (struct iovec *iov, int count, struct 
ibv_mr **ib_mr, - struct ibv_pd *ib_pd, unsigned int mtu, int *sge_countp, - int *wr_countp, int *data_lenp) -{ - int data_len, iov_index, sge_count; - unsigned int packet_size = 0; - - opal_output_verbose (80, orte_oob_base_framework.framework_output, - "%s oob:ud:register_iov registering memory", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - *wr_countp = 0; - *data_lenp = 0; - *sge_countp = 0; - - for (iov_index = 0, data_len = 0, sge_count = 0 ; iov_index < count ; ++iov_index) { - unsigned int iov_left = iov[iov_index].iov_len; - - data_len += iov_left; - - sge_count++; - - do { - unsigned int to_trans = min (iov_left, mtu - packet_size); - - packet_size = (to_trans < iov_left) ? 0 : packet_size + to_trans; - iov_left -= to_trans; - - if (0 == packet_size && iov_left) { - sge_count++; - } - } while (iov_left); - - /* register buffers */ - if (NULL == ib_mr[iov_index]) { - ib_mr[iov_index] = ibv_reg_mr (ib_pd, - iov[iov_index].iov_base, - iov[iov_index].iov_len, - IBV_ACCESS_LOCAL_WRITE | - IBV_ACCESS_REMOTE_WRITE); - if (NULL == ib_mr[iov_index]) { - /* Ruh-roh */ - orte_show_help("help-oob-ud.txt", "reg-mr-failed", true, - orte_process_info.nodename, iov[iov_index].iov_base, - iov[iov_index].iov_len,strerror(errno)); - return ORTE_ERR_OUT_OF_RESOURCE; - } - } - } - - *wr_countp = (data_len + mtu - 1) / mtu; - *sge_countp = sge_count; - *data_lenp = data_len; - - return ORTE_SUCCESS; -} - -int mca_oob_ud_register_buf (char *buf, int size, struct ibv_mr **ib_mr_buf, - struct ibv_pd *ib_pd, unsigned int mtu, int *sge_countp, int *wr_countp) -{ - int sge_count = 0; - unsigned int packet_size = 0; - - opal_output_verbose (80, orte_oob_base_framework.framework_output, - "%s oob:ud:mca_oob_ud_register_buf registering memory", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - *wr_countp = 0; - *sge_countp = 0; - - unsigned int iov_left = size; - - sge_count++; - - do { - unsigned int to_trans = min (iov_left, mtu - packet_size); - - packet_size = (to_trans < iov_left) ? 
0 : packet_size + to_trans; - iov_left -= to_trans; - - if (0 == packet_size && iov_left) { - sge_count++; - } - } while (iov_left); - - /* register buffers */ - if (NULL == *ib_mr_buf) { - *ib_mr_buf = ibv_reg_mr (ib_pd, buf, size, - IBV_ACCESS_LOCAL_WRITE | - IBV_ACCESS_REMOTE_WRITE); - if (NULL == *ib_mr_buf) { - orte_show_help("help-oob-ud.txt", "reg-mr-failed", true, - orte_process_info.nodename, buf, size, strerror(errno)); - return ORTE_ERR_OUT_OF_RESOURCE; - } - } - - *wr_countp = (size + mtu - 1) / mtu; - *sge_countp = sge_count; - - return ORTE_SUCCESS; -} diff --git a/orte/mca/oob/ud/oob_ud.h b/orte/mca/oob/ud/oob_ud.h deleted file mode 100644 index 43cd3ad18f..0000000000 --- a/orte/mca/oob/ud/oob_ud.h +++ /dev/null @@ -1,207 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * - * 2014 Mellanox Technologies, Inc. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#if !defined(MCA_OOB_UD_H) -#define MCA_OOB_UD_H - -#include "orte_config.h" - -#ifdef HAVE_SYS_TYPES_H -#include -#endif -#include -#include - -#include "opal/types.h" -#include "orte/types.h" - -#include "opal/mca/base/base.h" -#include "opal/class/opal_free_list.h" -#include "opal/class/opal_hash_table.h" -#include "opal/threads/mutex.h" -#include "opal/threads/condition.h" -#include "opal/threads/threads.h" -#include "opal/mca/timer/base/base.h" -#include "opal/include/opal_stdint.h" -#include "opal/mca/memchecker/base/base.h" - -#include "orte/mca/oob/oob.h" -#include "orte/mca/oob/base/base.h" -#include "orte/util/name_fns.h" - -#include "orte/runtime/orte_globals.h" - -#include "oob_ud_qp.h" -#include "oob_ud_peer.h" -#include "oob_ud_req.h" - -/* Use for valgrind checks*/ -#ifdef HAVE_VALGRIND -#include -#else -#define VALGRIND_MAKE_MEM_DEFINED(addr,len) -#endif - -BEGIN_C_DECLS - -enum { - MCA_OOB_UD_SEND_WR = 0x10000000, - MCA_OOB_UD_RECV_WR = 0x20000000 -}; - -enum { - MCA_OOB_UD_DEBUG_NONE, - MCA_OOB_UD_DEBUG_ALL -}; - -static inline void mca_oob_ud_fill_send_wr (struct ibv_send_wr *wr, struct ibv_sge *sge, - int num_sge, const mca_oob_ud_peer_t *peer) -{ - wr->wr_id = MCA_OOB_UD_SEND_WR; - wr->next = NULL; - wr->sg_list = sge; - wr->num_sge = num_sge; - wr->opcode = IBV_WR_SEND; - wr->send_flags = IBV_SEND_SIGNALED; - - wr->wr.ud.ah = peer->peer_ah; - wr->wr.ud.remote_qpn = peer->peer_qpn; - wr->wr.ud.remote_qkey = peer->peer_qkey; -} - -static inline void mca_oob_ud_fill_recv_wr (struct ibv_recv_wr *wr, struct ibv_sge *sge, - int num_sge) -{ - wr->wr_id = MCA_OOB_UD_RECV_WR; - wr->next = NULL; - wr->sg_list = sge; - wr->num_sge = num_sge; -} - -static inline void mca_oob_ud_fill_sge (struct ibv_sge *sge, void *addr, - uint32_t length, uint32_t lkey) -{ - sge->addr = (uint64_t)addr; - sge->length = length; - sge->lkey = lkey; -} - - -struct mca_oob_ud_device_t { - 
opal_list_item_t super; - struct ibv_device_attr attr; - struct ibv_context *ib_context; - struct ibv_comp_channel *ib_channel; - struct ibv_pd *ib_pd; - - opal_event_t event; - - opal_list_t ports; -}; - -typedef struct mca_oob_ud_device_t mca_oob_ud_device_t; -OBJ_CLASS_DECLARATION(mca_oob_ud_device_t); - -/* events */ -void mca_oob_ud_event_start_monitor (mca_oob_ud_device_t *device); -void mca_oob_ud_event_stop_monitor (mca_oob_ud_device_t *device); - -struct mca_oob_ud_reg_mem_t { - char *ptr; - size_t len; - struct ibv_mr *mr; -}; -typedef struct mca_oob_ud_reg_mem_t mca_oob_ud_reg_mem_t; - -struct mca_oob_ud_port_t { - opal_list_item_t super; - mca_oob_ud_device_t *device; - mca_oob_ud_qp_t listen_qp; - opal_free_list_t data_qps; - opal_free_list_t free_msgs; - int mtu; - uint16_t lid; - uint8_t port_num; - /** current send buffer index. used by init function for free_msgs member */ - int send_buffer_index; - - mca_oob_ud_reg_mem_t grh_buf; - mca_oob_ud_reg_mem_t msg_buf; -}; - -typedef struct mca_oob_ud_port_t mca_oob_ud_port_t; -OBJ_CLASS_DECLARATION(mca_oob_ud_port_t); - - -int mca_oob_ud_port_post_one_recv (mca_oob_ud_port_t *port, int msg_num); - -void mca_oob_ud_port_get_uri (mca_oob_ud_port_t *port, char *uri); - -/* Module definition */ -typedef int (*mca_oob_ud_module_init_fn_t)(void); -typedef void (*mca_oob_ud_module_fini_fn_t)(mca_oob_ud_peer_t **peer); -typedef int (*mca_oob_ud_set_addr_fn_t)(const orte_process_name_t *name, const char *uri); -typedef void (*mca_oob_ud_ping_fn_t)(const orte_process_name_t *proc); -typedef void (*mca_oob_ud_send_nb_fn_t)(orte_rml_send_t *msg); -typedef int (*mca_oob_ud_recv_nb_fn_t)(orte_process_name_t* peer, - orte_rml_send_t *msg); -typedef int (*mca_oob_ud_recv_cancel_fn_t)(orte_process_name_t *name, int tag); - -typedef struct { - mca_oob_ud_module_init_fn_t init; - mca_oob_ud_module_fini_fn_t finalize; - mca_oob_ud_set_addr_fn_t set_addr; - mca_oob_ud_ping_fn_t ping; - mca_oob_ud_send_nb_fn_t send_nb; -} 
mca_oob_ud_module_api_t; - -typedef struct { - mca_oob_ud_module_api_t api; - opal_event_base_t *ev_base; /* event base for the module progress thread */ - bool ev_active; - opal_thread_t progress_thread; - opal_proc_table_t peers; // connection addresses for peers -} mca_oob_ud_module_t; - -ORTE_MODULE_DECLSPEC extern mca_oob_ud_module_t mca_oob_ud_module; - -int mca_oob_ud_process_ping(int fd, short args, void *cbdata); -int mca_oob_ud_process_send_nb(int fd, short args, void *cbdata); -int mca_oob_ud_send_try (mca_oob_ud_req_t *send_req); -int mca_oob_ud_send_complete (mca_oob_ud_req_t *send_req, int rc); - -/* recv */ -int mca_oob_ud_recv_nb(orte_process_name_t* peer, - orte_rml_send_t *msg); -int mca_oob_ud_recv_cancel(orte_process_name_t* name, int tag); - -int mca_oob_ud_recv_complete (mca_oob_ud_req_t *recv_req); -int mca_oob_ud_recv_try (mca_oob_ud_req_t *recv_req); -int mca_oob_ud_recv_match_send (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, - mca_oob_ud_msg_hdr_t *msg_hdr, mca_oob_ud_req_t **reqp); -int mca_oob_ud_get_recv_req (const orte_process_name_t name, const int tag, mca_oob_ud_req_t **reqp, bool iovec_used); - -int mca_oob_ud_register_iov (struct iovec *iov, int count, struct ibv_mr **ib_mr, - struct ibv_pd *ib_pd, unsigned int mtu, int *sge_countp, - int *wr_countp, int *data_lenp); -int mca_oob_ud_register_buf (char *buf, int size, struct ibv_mr **ib_mr_buf, - struct ibv_pd *ib_pd, unsigned int mtu, int *sge_countp, int *wr_countp); -void mca_oob_ud_event_queue_completed (mca_oob_ud_req_t *req); - -END_C_DECLS - -#endif diff --git a/orte/mca/oob/ud/oob_ud_component.c b/orte/mca/oob/ud/oob_ud_component.c deleted file mode 100644 index 13fb8622c4..0000000000 --- a/orte/mca/oob/ud/oob_ud_component.c +++ /dev/null @@ -1,789 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights - * reserved. 
- * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * 2014 Mellanox Technologies, Inc. - * All rights reserved. - * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015-2016 Intel, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#include "orte_config.h" -#include "orte/types.h" -#include "opal/types.h" -#include "opal/align.h" -#include "opal/util/sys_limits.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/runtime/orte_globals.h" -#include "orte/util/name_fns.h" -#include "orte/util/proc_info.h" -#include "orte/util/show_help.h" - -#include "oob_ud_component.h" - -#include "opal/mca/common/verbs/common_verbs.h" - -static int mca_oob_ud_component_open (void); -static int mca_oob_ud_component_close (void); -static int mca_oob_ud_component_register (void); -static int mca_oob_ud_component_available(void); -static int mca_oob_ud_component_startup(void); -static int mca_oob_ud_component_send_nb(orte_rml_send_t *msg); -static void mca_oob_ud_component_shutdown(void); -static char* mca_oob_ud_component_get_addr(void); -static int mca_oob_ud_component_set_addr(orte_process_name_t *peer, char **uris); -static bool mca_oob_ud_component_is_reachable(char *routed, orte_process_name_t *peer); -#if OPAL_ENABLE_FT_CR == 1 -static int mca_oob_ud_component_ft_event(int state); -#endif // OPAL_ENABLE_FT_CR - -static int mca_oob_ud_listen_create (mca_oob_ud_port_t *port); -static int mca_oob_ud_listen_destroy (mca_oob_ud_port_t *port); -static int mca_oob_ud_port_alloc_buffers (mca_oob_ud_port_t *port); -static inline int mca_oob_ud_port_recv_start (mca_oob_ud_port_t *port); -static inline int mca_oob_ud_alloc_reg_mem (struct ibv_pd *pd, mca_oob_ud_reg_mem_t *reg_mem, - const int buffer_len); -static inline void mca_oob_ud_free_reg_mem (mca_oob_ud_reg_mem_t *reg_mem); -static void mca_oob_ud_cancel_all_in_list 
(opal_list_t *list); -static void mca_oob_ud_empty_list (opal_list_t *list); -static void mca_oob_ud_port_construct (mca_oob_ud_port_t *port); -static void mca_oob_ud_port_destruct (mca_oob_ud_port_t *port); -static void mca_oob_ud_device_construct (mca_oob_ud_device_t *device); -static void mca_oob_ud_device_destruct (mca_oob_ud_device_t *device); - -/* - * Struct of function pointers and all that to let us be initialized - */ -mca_oob_ud_component_t mca_oob_ud_component = { - { - .oob_base = { - MCA_OOB_BASE_VERSION_2_0_0, - .mca_component_name = "ud", - MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION, - ORTE_RELEASE_VERSION), - .mca_open_component = mca_oob_ud_component_open, - .mca_close_component = mca_oob_ud_component_close, - .mca_register_component_params = mca_oob_ud_component_register, - }, - .oob_data = { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - }, - .priority = 0, //set the priority so that we will select this component only if someone directs to do so - .available = mca_oob_ud_component_available, //available - .startup = mca_oob_ud_component_startup, //startup - .shutdown = mca_oob_ud_component_shutdown, //shutdown - .send_nb = mca_oob_ud_component_send_nb, //send_nb - .get_addr = mca_oob_ud_component_get_addr, - .set_addr = mca_oob_ud_component_set_addr, - .is_reachable = mca_oob_ud_component_is_reachable, //is_reachable -#if OPAL_ENABLE_FT_CR == 1 - .ft_event = mca_oob_ud_component_ft_event, -#endif // OPAL_ENABLE_FT_CR - }, -}; - -static int mca_oob_ud_component_open (void) -{ - OBJ_CONSTRUCT(&mca_oob_ud_component.ud_devices, opal_list_t); - OBJ_CONSTRUCT(&mca_oob_ud_component.ud_active_sends, opal_list_t); - - OBJ_CONSTRUCT(&mca_oob_ud_component.ud_active_recvs, opal_list_t); - OBJ_CONSTRUCT(&mca_oob_ud_component.ud_event_queued_reqs, opal_list_t); - - OBJ_CONSTRUCT(&mca_oob_ud_component.ud_event_processing_msgs, opal_list_t); - - OBJ_CONSTRUCT(&mca_oob_ud_component.ud_lock, 
opal_mutex_t); - OBJ_CONSTRUCT(&mca_oob_ud_component.ud_match_lock, opal_mutex_t); - - return ORTE_SUCCESS; -} - -static int mca_oob_ud_component_close (void) -{ - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:component_close entering", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - OBJ_DESTRUCT(&mca_oob_ud_component.ud_devices); - OBJ_DESTRUCT(&mca_oob_ud_component.ud_active_sends); - - OBJ_DESTRUCT(&mca_oob_ud_component.ud_active_recvs); - OBJ_DESTRUCT(&mca_oob_ud_component.ud_event_queued_reqs); - - OBJ_DESTRUCT(&mca_oob_ud_component.ud_lock); - OBJ_DESTRUCT(&mca_oob_ud_component.ud_match_lock); - - OBJ_DESTRUCT(&mca_oob_ud_component.ud_event_processing_msgs); - - return ORTE_SUCCESS; -} - -static int mca_oob_ud_component_register (void) -{ - mca_base_component_t *component = &mca_oob_ud_component.super.oob_base; - - mca_oob_ud_component.ud_min_qp = 8; - - (void) mca_base_component_var_register (component, "min_qp", "Minimum number of UD queue pairs " - "to allocate (default: 8)", MCA_BASE_VAR_TYPE_INT, NULL, - 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_LOCAL, &mca_oob_ud_component.ud_min_qp); - - mca_oob_ud_component.ud_max_qp = 32; - (void) mca_base_component_var_register (component, "max_qp", "Maximum number of UD queue pairs " - "to allocate (default: 32)", MCA_BASE_VAR_TYPE_INT, NULL, - 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_LOCAL, &mca_oob_ud_component.ud_max_qp); - - mca_oob_ud_component.ud_recv_buffer_count = 512; - (void) mca_base_component_var_register (component, "recv_buffers", "Number of MTU sized recv " - "buffers to post (default: 512)", MCA_BASE_VAR_TYPE_INT, NULL, - 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_LOCAL, &mca_oob_ud_component.ud_recv_buffer_count); - - mca_oob_ud_component.ud_send_buffer_count = 512; - (void) mca_base_component_var_register (component, "send_buffers", "Number of MTU sized send " - "buffers to allocate 
(default: 512)", MCA_BASE_VAR_TYPE_INT, NULL, - 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_LOCAL, &mca_oob_ud_component.ud_send_buffer_count); - - mca_oob_ud_component.ud_max_retries = 5; - (void)mca_base_component_var_register(component, "peer_retries", - "Number of times to try shutting down a connection before giving up", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_LOCAL, - &mca_oob_ud_component.ud_max_retries); - - mca_oob_ud_component.ud_timeout_usec = 800000; - (void)mca_base_component_var_register(component, "peer_timeout", - "Timeout in microseconds between retransmission of data", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_LOCAL, - &mca_oob_ud_component.ud_timeout_usec); - - - mca_oob_ud_component.ud_qp_max_send_sge = 1; - (void)mca_base_component_var_register(component, "max_send_sge", - "Requested max number of outstanding WRs in the SQ", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_LOCAL, - &mca_oob_ud_component.ud_qp_max_send_sge); - - mca_oob_ud_component.ud_qp_max_recv_sge = 2; - (void)mca_base_component_var_register(component, "max_recv_sge", - "Requested max number of outstanding WRs in the RQ", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_LOCAL, - &mca_oob_ud_component.ud_qp_max_recv_sge); - - - mca_oob_ud_component.ud_qp_max_send_wr = 4096; - (void)mca_base_component_var_register(component, "max_send_wr", - "Requested max number of scatter/gather (s/g) elements in a WR in the SQ", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_LOCAL, - &mca_oob_ud_component.ud_qp_max_send_wr); - - mca_oob_ud_component.ud_qp_max_recv_wr = 4096; - (void)mca_base_component_var_register(component, "max_recv_wr", - "Requested max number of scatter/gather (s/g) elements in a WR in the RQ", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_LOCAL, - 
&mca_oob_ud_component.ud_qp_max_recv_wr); - - mca_oob_ud_component.ud_qp_max_inline_data = 0; - (void)mca_base_component_var_register(component, "max_inline_data", - "Requested max number of data (bytes) that can be posted inline to the SQ", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_LOCAL, - &mca_oob_ud_component.ud_qp_max_inline_data); - return ORTE_SUCCESS; -} - -static int mca_oob_ud_component_available(void) { - - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "oob:ud: component_available called"); - - /* set the module event base - this is where we would spin off a separate - * progress thread if so desired */ - mca_oob_ud_module.ev_base = orte_event_base; - - return ORTE_SUCCESS; -} - -static int port_mtus[] = {0, 256, 512, 1024, 2048, 4096}; - -static inline int mca_oob_ud_port_setup (mca_oob_ud_port_t *port) -{ - int rc; - struct ibv_port_attr port_attr; - - rc = ibv_query_port (port->device->ib_context, port->port_num, &port_attr); - if (0 != rc || IBV_PORT_ACTIVE != port_attr.state || 0 == port_attr.lid) { - /* skip this port */ - return ORTE_ERROR; - } - - port->lid = port_attr.lid; - port->mtu = port_attr.active_mtu > IBV_MTU_4096 ? 
2048 : port_mtus[port_attr.active_mtu]; - - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:port_setup found port: num = %u, lid = %u, mtu = %u", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - port->port_num, port->lid, port->mtu); - - return rc; -} - -static inline int mca_oob_ud_device_setup (mca_oob_ud_device_t *device, - struct ibv_device *ib_device) -{ - int rc, port_num; - - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:device_setup attempting to setup ib device %p", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) ib_device); - - - device->ib_context = ibv_open_device (ib_device); - if (NULL == device->ib_context) { - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:device_setup error opening device. errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); - return ORTE_ERROR; - } - - rc = ibv_query_device (device->ib_context, &device->attr); - if (0 != rc) { - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:device_setup error querying device. errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); - return ORTE_ERROR; - } - - device->ib_channel = ibv_create_comp_channel (device->ib_context); - if (NULL == device->ib_channel) { - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:device_setup error completing completion channel." - "errno = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); - return ORTE_ERROR; - } - - device->ib_pd = ibv_alloc_pd (device->ib_context); - if (NULL == device->ib_pd) { - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:device_setup error allocating protection domain." 
- "errno = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); - return ORTE_ERROR; - } - - for (port_num = 1 ; port_num <= device->attr.phys_port_cnt ; ++port_num) { - mca_oob_ud_port_t *port = OBJ_NEW(mca_oob_ud_port_t); - - if (NULL == port) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - port->device = device; - port->port_num = port_num; - - rc = mca_oob_ud_port_setup (port); - if (ORTE_SUCCESS != rc) { - OBJ_RELEASE(port); - continue; - } - - opal_list_append (&device->ports, (opal_list_item_t *) port); - - break; - } - - if (0 == opal_list_get_size(&device->ports)) { - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:device_setup could not init device. no usable " - "ports present", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return ORTE_ERROR; - } - - return ORTE_SUCCESS; -} - -static int mca_oob_ud_component_startup(void) -{ - struct ibv_device **devices; - int num_devices, i, rc; - opal_list_item_t *item, *item2; - bool found_one = false; - - /* If fork support is requested, try to enable it */ - rc = opal_common_verbs_fork_test(); - if (OPAL_SUCCESS != rc) { - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:device_setup failed in ibv_fork_init. errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); - return ORTE_ERROR; - } - - /* If there are no devices, it is not an error; we just won't use - this component. 
*/ - devices = ibv_get_device_list (&num_devices); - if (NULL == devices) { - return ORTE_ERR_NOT_FOUND; - } - if (0 == num_devices) { - ibv_free_device_list(devices); - return ORTE_ERR_NOT_FOUND; - } - - for (i = 0 ; i < num_devices ; ++i) { - mca_oob_ud_device_t *device = OBJ_NEW(mca_oob_ud_device_t); - - if (NULL == device) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERROR; - } - - rc = mca_oob_ud_device_setup (device, devices[i]); - if (ORTE_SUCCESS != rc) { - OBJ_RELEASE(device); - continue; - } - - opal_list_append (&mca_oob_ud_component.ud_devices, - (opal_list_item_t *) device); - - /* NTH: support only 1 device for now */ - break; - } - - ibv_free_device_list (devices); - - /* If no usable devices are found, then just ignore this component - in this run */ - if (0 == opal_list_get_size (&mca_oob_ud_component.ud_devices)) { - return ORTE_ERR_NOT_FOUND; - } - - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:init initializing oob/openib. 
# of devices = %u", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (unsigned int) opal_list_get_size (&mca_oob_ud_component.ud_devices)); - - for (item = opal_list_get_first (&mca_oob_ud_component.ud_devices); - item != opal_list_get_end (&mca_oob_ud_component.ud_devices); - item = opal_list_get_next (item)) { - mca_oob_ud_device_t *device = (mca_oob_ud_device_t *) item; - - /* start monitoring the device for completions */ - for (item2 = opal_list_get_first (&device->ports) ; - item2 != opal_list_get_end (&device->ports) ; - item2 = opal_list_get_next (item2)) { - mca_oob_ud_port_t *port = (mca_oob_ud_port_t *) item2; - - rc = mca_oob_ud_listen_create (port); - if (0 != rc) { - continue; - } - - rc = mca_oob_ud_port_alloc_buffers (port); - if (ORTE_SUCCESS != rc) { - mca_oob_ud_listen_destroy (port); - continue; - } - - rc = opal_free_list_init (&port->data_qps, - sizeof (mca_oob_ud_qp_t), 8, - OBJ_CLASS(mca_oob_ud_qp_t), 0, 0, - mca_oob_ud_component.ud_min_qp, - mca_oob_ud_component.ud_max_qp, - 2, NULL, 0, NULL, NULL, NULL); - if (OPAL_SUCCESS != rc) { - mca_oob_ud_listen_destroy (port); - continue; - } - - rc = mca_oob_ud_port_recv_start (port); - if (ORTE_SUCCESS != rc) { - mca_oob_ud_listen_destroy (port); - continue; - } - - /* NTH: only supports one port for now */ - found_one = true; - - /* NTH: since we only support one port start monitoring now */ - mca_oob_ud_event_start_monitor (device); - - break; - } - } - - if (!found_one) { - orte_show_help("help-oob-ud.txt", "no-ports-usable", true, - orte_process_info.nodename); - return ORTE_ERR_NOT_FOUND; - } - - /* have to call the module init here so we can test for available qpair */ - if ((NULL != mca_oob_ud_module.api.init) && (ORTE_SUCCESS != (rc = mca_oob_ud_module.api.init()))){ - return ORTE_ERROR; - } - - return ORTE_SUCCESS; -} - -static void mca_oob_ud_component_shutdown(void) -{ - mca_oob_ud_peer_t *peer; - opal_list_item_t *item; - - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s 
oob:ud:fini entering", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_lock); - OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_match_lock); - - if (ORTE_VPID_INVALID != ORTE_PROC_MY_PARENT->vpid) { - if (ORTE_SUCCESS == mca_oob_ud_peer_lookup (ORTE_PROC_MY_PARENT, &peer) && NULL != peer) { - mca_oob_ud_peer_handle_end (peer); - } - } - - /* abort active receives */ - mca_oob_ud_cancel_all_in_list (&mca_oob_ud_component.ud_active_recvs); - mca_oob_ud_cancel_all_in_list (&mca_oob_ud_component.ud_active_sends); - - mca_oob_ud_empty_list (&mca_oob_ud_component.ud_event_queued_reqs); - - OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_match_lock); - - if (NULL != mca_oob_ud_module.api.finalize) { - mca_oob_ud_module.api.finalize(&peer); - } - - for (item = opal_list_get_first (&mca_oob_ud_component.ud_devices); - item != opal_list_get_end (&mca_oob_ud_component.ud_devices); - item = opal_list_get_next (item)) { - mca_oob_ud_event_stop_monitor ((mca_oob_ud_device_t *) item); - } - - mca_oob_ud_empty_list (&mca_oob_ud_component.ud_devices); - OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_lock); -} - -static char* mca_oob_ud_component_get_addr(void) { - /* NTH: qp_num - 32 bits (10), lid - 16 bits (5), port - 8 bits (3) + ud:// + 3 .'s + \0 = 27 chars */ - char *contact_info = (char *) calloc(opal_list_get_size(&mca_oob_ud_component.ud_devices) * 27, 1); - char *ptr = contact_info; - opal_list_item_t *item, *port_item; - *ptr = 0; - - for (item = opal_list_get_first (&mca_oob_ud_component.ud_devices) ; - item != opal_list_get_end (&mca_oob_ud_component.ud_devices) ; - item = opal_list_get_next (item)) { - - mca_oob_ud_device_t *device = (mca_oob_ud_device_t *) item; - - for (port_item = opal_list_get_first (&device->ports); - port_item != opal_list_get_end (&device->ports); - port_item = opal_list_get_next (port_item)) { - - if (ptr != contact_info) { - ptr += sprintf (ptr, ";"); - } - - mca_oob_ud_port_get_uri ((mca_oob_ud_port_t *) port_item, 
ptr); - ptr += strlen (ptr); - } - } - - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:get_addr contact information: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), contact_info); - - return contact_info; -} - -static int mca_oob_ud_component_send_nb(orte_rml_send_t *msg) { - if (NULL != mca_oob_ud_module.api.send_nb) { - mca_oob_ud_module.api.send_nb(msg); - return ORTE_SUCCESS; - } - return ORTE_ERROR; -} - -static int mca_oob_ud_component_set_addr(orte_process_name_t *peer, char **uris) -{ - int rc; - - OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_lock); - - for (int i = 0; NULL != uris[i]; i++) { - if (0 == strncmp(uris[i], "ud:", 3)) { - if (NULL != mca_oob_ud_module.api.set_addr) { - if (ORTE_SUCCESS != (rc = mca_oob_ud_module.api.set_addr(peer, uris[i]))) { - OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_lock); - return rc; - } - } - } - } - - OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_lock); - - return ORTE_SUCCESS; -} - -#if OPAL_ENABLE_FT_CR == 1 -static int mca_oob_ud_component_ft_event(int state) { - (void) state; - return ORTE_SUCCESS; -} -#endif // OPAL_ENABLE_FT_CR - -static int mca_oob_ud_port_alloc_buffers (mca_oob_ud_port_t *port) { - int total_buffer_count = mca_oob_ud_component.ud_recv_buffer_count + - mca_oob_ud_component.ud_send_buffer_count; - int rc; - - rc = mca_oob_ud_alloc_reg_mem (port->device->ib_pd, &port->grh_buf, - mca_oob_ud_component.ud_recv_buffer_count * sizeof (struct ibv_grh)); - if (ORTE_SUCCESS != rc) { - return rc; - } - - - rc = mca_oob_ud_alloc_reg_mem (port->device->ib_pd, &port->msg_buf, - total_buffer_count * port->mtu); - if (ORTE_SUCCESS != rc) { - return rc; - } - - port->send_buffer_index = 0; - rc = opal_free_list_init (&port->free_msgs, sizeof (mca_oob_ud_msg_t), 8, - OBJ_CLASS(mca_oob_ud_msg_t), 0, 0, mca_oob_ud_component.ud_send_buffer_count, - mca_oob_ud_component.ud_send_buffer_count, 0, NULL, 0, NULL, mca_oob_ud_msg_init, - port); - if (ORTE_SUCCESS != rc) { - return rc; - } - - return 
rc; -} - -static bool mca_oob_ud_component_is_reachable(char *routed, orte_process_name_t *peer_name) -{ - orte_process_name_t hop; - - /* if we have a route to this peer, then we can reach it */ - hop = orte_routed.get_route(routed, peer_name); - if (ORTE_JOBID_INVALID == hop.jobid || - ORTE_VPID_INVALID == hop.vpid) { - ORTE_ERROR_LOG(ORTE_ERR_UNREACH); - return false; - } - return true; -} - -static void mca_oob_ud_port_construct (mca_oob_ud_port_t *port) -{ - memset((char *) port + sizeof (port->super), 0, sizeof (*port) - sizeof (port->super)); - - OBJ_CONSTRUCT(&port->data_qps, opal_free_list_t); - OBJ_CONSTRUCT(&port->free_msgs, opal_free_list_t); - OBJ_CONSTRUCT(&port->listen_qp, opal_free_list_item_t); -} - -static void mca_oob_ud_port_destruct (mca_oob_ud_port_t *port) -{ - (void) mca_oob_ud_listen_destroy (port); - OBJ_DESTRUCT(&port->data_qps); - OBJ_DESTRUCT(&port->free_msgs); - - mca_oob_ud_free_reg_mem (&port->grh_buf); - mca_oob_ud_free_reg_mem (&port->msg_buf); -} - -OBJ_CLASS_INSTANCE(mca_oob_ud_port_t, opal_list_item_t, - mca_oob_ud_port_construct, - mca_oob_ud_port_destruct); - -static int mca_oob_ud_listen_create (mca_oob_ud_port_t *port) { - return mca_oob_ud_qp_init (&port->listen_qp, port, port->device->ib_channel, NULL, false); -} - -/* mca_oob_ud_listen_destroy: - * - * Destory the listen queue pair associated with a port. 
- */ -static int mca_oob_ud_listen_destroy (mca_oob_ud_port_t *port) -{ - if (NULL == port || NULL == port->listen_qp.ib_qp) { - return ORTE_SUCCESS; - } - - OBJ_DESTRUCT(&port->listen_qp); - - return ORTE_SUCCESS; -} - -static inline int mca_oob_ud_port_recv_start (mca_oob_ud_port_t *port) -{ - int i, rc; - - rc = mca_oob_ud_qp_to_rts (&port->listen_qp); - if (ORTE_SUCCESS != rc) { - return rc; - } - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:port_recv_start posting " - "%d message buffers", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - mca_oob_ud_component.ud_recv_buffer_count); - - for (i = 0 ; i < mca_oob_ud_component.ud_recv_buffer_count ; ++i) { - rc = mca_oob_ud_port_post_one_recv (port, i); - if (ORTE_SUCCESS != rc) { - return rc; - } - } - - rc = ibv_req_notify_cq (port->listen_qp.ib_recv_cq, 0); - if (0 != rc) { - orte_show_help("help-oob-ud.txt", "notify-cq-failed", true, - orte_process_info.nodename, strerror(errno)); - return ORTE_ERROR; - } - - return ORTE_SUCCESS; -} - -static inline int mca_oob_ud_alloc_reg_mem (struct ibv_pd *pd, mca_oob_ud_reg_mem_t *reg_mem, - const int buffer_len) -{ - size_t buffer_len_aligned, page_size; - reg_mem->len = buffer_len; - reg_mem->ptr = NULL; - reg_mem->mr = NULL; - /* The allocated buffer should be a multiple of page size. - If ibv_fork_init() has been invoked the pages are marked MADV_DONTFORK. 
- If we only partially use a page, any data allocated on the remainder of - the page will be inaccessible to the child process */ - page_size = opal_getpagesize(); - buffer_len_aligned = OPAL_ALIGN(buffer_len, page_size, size_t); - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:alloc_reg_mem allocing and registering %d bytes of memory with pd %p", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), buffer_len, (void *) pd); - - posix_memalign ((void **)®_mem->ptr, page_size, buffer_len_aligned); - if (NULL == reg_mem->ptr) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - memset (reg_mem->ptr, 0, buffer_len); - - reg_mem->mr = ibv_reg_mr (pd, reg_mem->ptr, buffer_len, IBV_ACCESS_LOCAL_WRITE); - if (NULL == reg_mem->mr) { - orte_show_help("help-oob-ud.txt", "reg-mr-failed", true, - orte_process_info.nodename, reg_mem->ptr, buffer_len, strerror(errno)); - return ORTE_ERROR; - } - - return ORTE_SUCCESS; -} - -static inline void mca_oob_ud_free_reg_mem (mca_oob_ud_reg_mem_t *reg_mem) -{ - if (reg_mem->mr) { - (void) ibv_dereg_mr (reg_mem->mr); - } - - if (reg_mem->ptr) { - free (reg_mem->ptr); - } - - memset (reg_mem, 0, sizeof (mca_oob_ud_reg_mem_t)); -} - -static void mca_oob_ud_cancel_all_in_list (opal_list_t *list) -{ - opal_list_item_t *item; - - while (NULL != (item = opal_list_remove_first (list))) { - ((mca_oob_ud_req_t *)item)->req_list = NULL; - mca_oob_ud_req_abort ((mca_oob_ud_req_t *) item); - } -} - -static void mca_oob_ud_empty_list (opal_list_t *list) -{ - opal_list_item_t *item; - - while (NULL != (item = opal_list_remove_first (list))) { - OBJ_RELEASE(item); - } -} - -static void mca_oob_ud_device_construct (mca_oob_ud_device_t *device) -{ - memset((char *) device + sizeof (device->super), 0, sizeof (*device) - sizeof (device->super)); - - OBJ_CONSTRUCT(&device->ports, opal_list_t); -} - -static void mca_oob_ud_device_destruct (mca_oob_ud_device_t *device) -{ - opal_list_item_t *item; - - while 
(NULL != (item = opal_list_remove_first (&device->ports))) { - OBJ_RELEASE(item); - } - - if (device->ib_pd) { - (void) ibv_dealloc_pd (device->ib_pd); - } - - if (device->ib_channel) { - (void) ibv_destroy_comp_channel (device->ib_channel); - } - - if (device->ib_context) { - (void) ibv_close_device (device->ib_context); - } - - OBJ_DESTRUCT(&device->ports); - - memset (device, 0, sizeof (mca_oob_ud_device_t)); -} - -OBJ_CLASS_INSTANCE(mca_oob_ud_device_t, opal_list_item_t, - mca_oob_ud_device_construct, - mca_oob_ud_device_destruct); - -OBJ_CLASS_INSTANCE(mca_oob_ud_msg_op_t, - opal_object_t, - NULL, NULL); - -OBJ_CLASS_INSTANCE(mca_oob_ud_ping_t, - opal_object_t, - NULL, NULL); diff --git a/orte/mca/oob/ud/oob_ud_component.h b/orte/mca/oob/ud/oob_ud_component.h deleted file mode 100644 index 2c6aac1469..0000000000 --- a/orte/mca/oob/ud/oob_ud_component.h +++ /dev/null @@ -1,68 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2014 Mellanox Technologies, Inc. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -#if !defined(MCA_OOB_UD_COMPONENT_H) -#define MCA_OOB_UD_COMPONENT_H - -#ifdef HAVE_SYS_TIME_H -#include -#endif - -#include "opal/class/opal_bitmap.h" -#include "opal/class/opal_list.h" -#include "opal/class/opal_pointer_array.h" - -#include "orte/mca/oob/oob.h" -#include "orte/mca/routed/routed.h" -#include "oob_ud.h" -#include "oob_ud_send.h" -#include "oob_ud_ping.h" - -/** - * OOB UD Component - */ - - /** - * OOB USOCK Component - */ -typedef struct { - mca_oob_base_component_t super; /**< base OOB component */ - - opal_list_t ud_devices; - opal_list_t ud_active_recvs; - opal_list_t ud_active_sends; - opal_list_t ud_event_queued_reqs; - opal_list_t ud_event_processing_msgs; - - opal_event_t ud_complete_event; - - opal_mutex_t ud_lock; - - int ud_min_qp; - int ud_max_qp; - - int ud_recv_buffer_count; - int ud_send_buffer_count; - - opal_mutex_t ud_match_lock; - - int ud_max_retries; /**< max number of retries before declaring peer gone */ - int ud_timeout_usec; /**< timeout in microsecond between peer retries */ - - int ud_qp_max_send_sge; - int ud_qp_max_recv_sge; - int ud_qp_max_send_wr; - int ud_qp_max_recv_wr; - int ud_qp_max_inline_data; -} mca_oob_ud_component_t; - -ORTE_MODULE_DECLSPEC extern mca_oob_ud_component_t mca_oob_ud_component; - -#endif //MCA_OOB_UD_COMPONENT_H diff --git a/orte/mca/oob/ud/oob_ud_event.c b/orte/mca/oob/ud/oob_ud_event.c deleted file mode 100644 index a7374f8413..0000000000 --- a/orte/mca/oob/ud/oob_ud_event.c +++ /dev/null @@ -1,606 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights - * reserved. - * 2014 Mellanox Technologies, Inc. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#include "oob_ud_component.h" - -#define min(a,b) ((a) < (b) ? 
(a) : (b)) - -static int mca_oob_ud_event_send_ack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg); -static int mca_oob_ud_event_send_nack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg); - -static int mca_oob_ud_event_handle_ack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, - mca_oob_ud_msg_hdr_t *msg_hdr); -static int mca_oob_ud_event_handle_nack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, - mca_oob_ud_msg_hdr_t *msg_hdr); - -static int mca_oob_ud_event_handle_completion (mca_oob_ud_port_t *port, mca_oob_ud_msg_hdr_t *msg); -static int mca_oob_ud_event_handle_data_ok (mca_oob_ud_port_t *port, mca_oob_ud_msg_hdr_t *msg); -static int mca_oob_ud_event_handle_req (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg_hdr); -static int mca_oob_ud_event_handle_rep (mca_oob_ud_port_t *port, mca_oob_ud_msg_hdr_t *msg); -static int mca_oob_ud_event_handle_end (mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg_hdr); - -static void *mca_oob_ud_event_dispatch(int fd, int flags, void *context); -static void *mca_oob_ud_complete_dispatch(int fd, int flags, void *context); - -static void mca_oob_ud_stop_events(mca_oob_ud_device_t *device); - -static inline opal_list_item_t *mca_oob_ud_list_get_first (opal_list_t *list) -{ - return (opal_list_get_size (list) == 0) ? NULL : opal_list_get_first (list); -} - -static inline opal_list_item_t *mca_oob_ud_list_get_next (opal_list_t *list, opal_list_item_t *item) -{ - opal_list_item_t *next = opal_list_get_next (item); - - return (opal_list_get_end(list) == next) ? 
NULL : next; -} - -static bool event_started = false; -static bool event_completed_set = false; - -void mca_oob_ud_event_start_monitor (mca_oob_ud_device_t *device) -{ - if (!event_started) { - opal_event_set (orte_event_base, &device->event, device->ib_channel->fd, - OPAL_EV_READ, mca_oob_ud_event_dispatch, (void *) device); - opal_event_add (&device->event, NULL); - event_started = true; - } -} - -void mca_oob_ud_event_stop_monitor (mca_oob_ud_device_t *device) -{ - if (event_started) { - opal_event_del (&device->event); - mca_oob_ud_stop_events (device); - event_started = false; - } -} - -struct mca_oob_ud_msg_item_t { - opal_list_item_t super; - - mca_oob_ud_msg_hdr_t *hdr; - mca_oob_ud_port_t *port; - mca_oob_ud_peer_t *peer; - int msg_num; -}; -typedef struct mca_oob_ud_msg_item_t mca_oob_ud_msg_item_t; -OBJ_CLASS_DECLARATION(mca_oob_ud_msg_item_t); - -static void mca_oob_ud_msg_item_construct (mca_oob_ud_msg_item_t *item) -{ - memset ((char *) item + sizeof (item->super), 0, sizeof (*item) - sizeof (item->super)); -} - -static void mca_oob_ud_msg_item_destruct (mca_oob_ud_msg_item_t *item) -{ - if (item->hdr) { - /* repost the receive request */ - mca_oob_ud_port_post_one_recv (item->port, item->msg_num); - } -} - -OBJ_CLASS_INSTANCE(mca_oob_ud_msg_item_t, opal_list_item_t, - mca_oob_ud_msg_item_construct, - mca_oob_ud_msg_item_destruct); - -static int mca_oob_ud_msg_item_cmp (opal_list_item_t **a, opal_list_item_t **b) -{ - mca_oob_ud_msg_item_t *aitem = *((mca_oob_ud_msg_item_t **) a); - mca_oob_ud_msg_item_t *bitem = *((mca_oob_ud_msg_item_t **) b); - - if (aitem->peer == bitem->peer) { - return (aitem->hdr->msg_id > bitem->hdr->msg_id ? 1 : -1); - } else { - return (aitem->peer > bitem->peer) ? 
1 : -1; - } -} - -static int mca_oob_ud_process_messages (struct ibv_cq *event_cq, mca_oob_ud_port_t *port) -{ - mca_oob_ud_msg_item_t *msg_item, *next_item; - opal_list_t *processing_msgs = &mca_oob_ud_component.ud_event_processing_msgs; - mca_oob_ud_peer_t *peer; - mca_oob_ud_msg_hdr_t *msg_hdr; - int msg_num, i, count; - struct ibv_wc wc[40]; - bool peer_nacked; - - count = ibv_poll_cq (event_cq, 40, wc); - if (count < 0) - return count; - - /* acknowlege the events */ - ibv_ack_cq_events (event_cq, count); - - for (i = 0 ; i < count ; ++i) { - msg_num = (int)(wc[i].wr_id & (~MCA_OOB_UD_RECV_WR)); - msg_hdr = (mca_oob_ud_msg_hdr_t *) (port->msg_buf.ptr + msg_num * port->mtu); - - VALGRIND_MAKE_MEM_DEFINED(msg_hdr, wc[i].byte_len); - - if (!(wc[i].wr_id & MCA_OOB_UD_RECV_WR) || IBV_WC_SUCCESS != wc[i].status) { - mca_oob_ud_port_post_one_recv (port, msg_num); - continue; - } - - peer = mca_oob_ud_get_peer (port, &msg_hdr->ra.name, wc[i].src_qp, msg_hdr->ra.qkey, - wc[i].slid, msg_hdr->ra.port_num); - - if (peer) { - if (MCA_OOB_UD_MSG_ACK != msg_hdr->msg_type && MCA_OOB_UD_MSG_NACK != msg_hdr->msg_type && - MCA_OOB_UD_MSG_END != msg_hdr->msg_type) { - mca_oob_ud_msg_item_t *msg_item = OBJ_NEW(mca_oob_ud_msg_item_t); - - msg_item->msg_num = msg_num; - msg_item->hdr = msg_hdr; - msg_item->port = port; - msg_item->peer = peer; - - opal_list_append (processing_msgs, (opal_list_item_t *) msg_item); - } else { - if (MCA_OOB_UD_MSG_ACK == msg_hdr->msg_type) { - (void) mca_oob_ud_event_handle_ack (port, peer, msg_hdr); - } else if (MCA_OOB_UD_MSG_NACK == msg_hdr->msg_type) { - (void) mca_oob_ud_event_handle_nack (port, peer, msg_hdr); - } else { - mca_oob_ud_event_handle_end (peer, msg_hdr); - } - - mca_oob_ud_port_post_one_recv (port, msg_num); - } - } else { - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:process_message got a null peer for message id %" - PRIu64, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr->msg_id); - 
mca_oob_ud_port_post_one_recv (port, msg_num); - } - } - - /* Sort messages by peer then id */ - opal_list_sort (processing_msgs, mca_oob_ud_msg_item_cmp); - - /* Send ACKs/NACKs and throw away out-of-order messages */ - msg_item = (mca_oob_ud_msg_item_t *) mca_oob_ud_list_get_first (processing_msgs); - - for (peer = NULL, peer_nacked = false ; NULL != msg_item ; msg_item = next_item) { - if (peer != msg_item->peer) { - peer_nacked = false; - } - - peer = msg_item->peer; - - next_item = (mca_oob_ud_msg_item_t *) mca_oob_ud_list_get_next (processing_msgs, - (opal_list_item_t *)msg_item); - - if (false == peer_nacked) { - if (msg_item->hdr->msg_id > peer->peer_expected_id) { - (void) mca_oob_ud_event_send_nack (msg_item->port, peer, msg_item->hdr); - peer_nacked = true; - } else if (NULL == next_item || (next_item->peer != msg_item->peer)) { - (void) mca_oob_ud_event_send_ack (msg_item->port, msg_item->peer, msg_item->hdr); - } - } - - if (msg_item->hdr->msg_id != peer->peer_expected_id) { - opal_list_remove_item (processing_msgs, (opal_list_item_t *) msg_item); - OBJ_RELEASE(msg_item); - } else { - peer->peer_expected_id++; - } - } - - /* Process remaining messages */ - while (NULL != - (msg_item = (mca_oob_ud_msg_item_t *) opal_list_remove_first (processing_msgs))) { - - switch (msg_item->hdr->msg_type) { - case MCA_OOB_UD_MSG_REQUEST: - mca_oob_ud_event_handle_req (port, msg_item->peer, msg_item->hdr); - break; - case MCA_OOB_UD_MSG_REPLY: - mca_oob_ud_event_handle_rep (port, msg_item->hdr); - break; - case MCA_OOB_UD_MSG_COMPLETE: - mca_oob_ud_event_handle_completion (port, msg_item->hdr); - break; - case MCA_OOB_UD_MSG_DATA_OK: - mca_oob_ud_event_handle_data_ok (port, msg_item->hdr); - break; - case MCA_OOB_UD_MSG_END: - mca_oob_ud_event_handle_end (peer, msg_item->hdr); - break; - default: - /* do nothing */ - break; - } - - OBJ_RELEASE(msg_item); - } - - return count; -} - -static int mca_oob_ud_event_handle_ack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t 
*peer, - mca_oob_ud_msg_hdr_t *msg_hdr) -{ - mca_oob_ud_msg_t *msg; - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:event_handle_ack got ack for msg id %" PRIu64 - " from peer %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr->msg_id, - ORTE_NAME_PRINT(&peer->peer_name)); - - OPAL_THREAD_LOCK(&peer->peer_lock); - - mca_oob_ud_peer_stop_timer (peer); - - msg = (mca_oob_ud_msg_t *) mca_oob_ud_list_get_first (&peer->peer_flying_messages); - - while (NULL != (msg = (mca_oob_ud_msg_t *) mca_oob_ud_list_get_first (&peer->peer_flying_messages))) { - if (msg->hdr->msg_id > msg_hdr->msg_id) { - break; - } - - msg = (mca_oob_ud_msg_t *)opal_list_remove_first (&peer->peer_flying_messages); - (void) mca_oob_ud_msg_status_update (msg, MCA_OOB_UD_MSG_STATUS_COMPLETE); - } - - mca_oob_ud_peer_start_timer (peer); - - OPAL_THREAD_UNLOCK(&peer->peer_lock); - - return ORTE_SUCCESS; -} - -static int mca_oob_ud_event_handle_nack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, - mca_oob_ud_msg_hdr_t *msg_hdr) -{ - mca_oob_ud_msg_t *msg; - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:event_handle_nack got nack for msg id %" PRIu64 - " from peer %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr->msg_id, - ORTE_NAME_PRINT(&peer->peer_name)); - - OPAL_THREAD_LOCK(&peer->peer_lock); - - mca_oob_ud_peer_stop_timer (peer); - - while (NULL != - (msg = (mca_oob_ud_msg_t *) mca_oob_ud_list_get_first (&peer->peer_flying_messages))) { - if (msg->hdr->msg_id >= msg_hdr->msg_id) { - break; - } - - (void) opal_list_remove_first (&peer->peer_flying_messages); - (void) mca_oob_ud_msg_status_update (msg, MCA_OOB_UD_MSG_STATUS_COMPLETE); - } - - /* repost remaining messages */ - mca_oob_ud_peer_post_all (peer); - - /* reset and start the timer */ - mca_oob_ud_peer_reset_timer (peer); - mca_oob_ud_peer_start_timer (peer); - - OPAL_THREAD_UNLOCK(&peer->peer_lock); - - return ORTE_SUCCESS; -} - -static int mca_oob_ud_event_handle_end 
(mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg_hdr) -{ - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:event_handle_end got end message from peer %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer->peer_name)); - - mca_oob_ud_peer_lost (peer); - - return ORTE_SUCCESS; -} - -static int mca_oob_ud_event_send_ack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg_hdr) -{ - mca_oob_ud_msg_hdr_t tmp_hdr; - int rc = ORTE_SUCCESS; - struct ibv_send_wr wr; - struct ibv_sge sge; - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:event_send_ack sending ack for message id %" - PRIu64 " peer = %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr->msg_id, - ORTE_NAME_PRINT(&peer->peer_name)); - - /* reuse registered buffer to send ack (just need to change the type/return address) */ - memcpy (&tmp_hdr, msg_hdr, sizeof (tmp_hdr)); - - msg_hdr->msg_type = MCA_OOB_UD_MSG_ACK; - - /* set return address */ - msg_hdr->ra.qkey = 0; - msg_hdr->ra.name = *ORTE_PROC_MY_NAME; - msg_hdr->ra.port_num = port->port_num; - - mca_oob_ud_fill_sge (&sge, msg_hdr, sizeof (*msg_hdr), port->msg_buf.mr->lkey); - mca_oob_ud_fill_send_wr (&wr, &sge, 1, peer); - - rc = mca_oob_ud_qp_post_send (&port->listen_qp, &wr, 1); - if (ORTE_SUCCESS != rc) { - opal_output (0, "oob:ud:event_send_ack error posting ack!"); - return rc; - } - - memcpy (msg_hdr, &tmp_hdr, sizeof (tmp_hdr)); - - return ORTE_SUCCESS; -} - -static int mca_oob_ud_event_send_nack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg_hdr) -{ - mca_oob_ud_msg_hdr_t tmp_hdr; - int rc = ORTE_SUCCESS; - struct ibv_send_wr wr; - struct ibv_sge sge; - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:event_send_nack sending nack for message id %" - PRIu64 " peer = %s. 
msg_id = %" PRIu64, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - peer->peer_expected_id, ORTE_NAME_PRINT(&peer->peer_name), msg_hdr->msg_id); - - /* reuse registered buffer to send the nack (just need to change the type/return address) */ - memcpy (&tmp_hdr, msg_hdr, sizeof (tmp_hdr)); - - msg_hdr->msg_type = MCA_OOB_UD_MSG_NACK; - - /* set return address */ - msg_hdr->ra.qkey = 0; - msg_hdr->ra.name = *ORTE_PROC_MY_NAME; - msg_hdr->ra.port_num = port->port_num; - - msg_hdr->msg_id = peer->peer_expected_id; - - mca_oob_ud_fill_sge (&sge, msg_hdr, sizeof (*msg_hdr), port->msg_buf.mr->lkey); - mca_oob_ud_fill_send_wr (&wr, &sge, 1, peer); - - rc = mca_oob_ud_qp_post_send (&port->listen_qp, &wr, 1); - if (ORTE_SUCCESS != rc) { - opal_output (0, "oob:ud:event_send_ack error posting nack!"); - return rc; - } - - memcpy (msg_hdr, &tmp_hdr, sizeof (tmp_hdr)); - - return ORTE_SUCCESS; -} - -void mca_oob_ud_event_queue_completed (mca_oob_ud_req_t *req) -{ - struct timeval now = {0, 0}; - - mca_oob_ud_req_append_to_list (req, &mca_oob_ud_component.ud_event_queued_reqs); - - if (!(event_completed_set) || - !(opal_event_evtimer_pending (&mca_oob_ud_component.ud_complete_event, &now))) { - event_completed_set = true; - opal_event_evtimer_set (orte_event_base, &mca_oob_ud_component.ud_complete_event, - mca_oob_ud_complete_dispatch, NULL); - opal_event_add (&mca_oob_ud_component.ud_complete_event, &now); - } -} - -static int mca_oob_ud_event_handle_completion (mca_oob_ud_port_t *port, mca_oob_ud_msg_hdr_t *msg_hdr) -{ - mca_oob_ud_req_t *recv_req = msg_hdr->msg_lcl_ctx; - bool brc; - - if (NULL == recv_req) { - opal_output(0, "%s oob:ud:event_handle_completion msg_hdr->msg_lcl_ctx is NULL", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return ORTE_ERROR; - } - - brc = mca_oob_ud_req_is_in_list (recv_req, &mca_oob_ud_component.ud_active_recvs); - if (false == brc) { - /* duplicate completion message? 
*/ - opal_output_verbose(0, orte_oob_base_framework.framework_output, - "%s oob:ud:event_handle_completion apparent duplicate completion. " - "request %p. req list = %p", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) recv_req, - (void *) recv_req->req_list); - return ORTE_SUCCESS; - } - - recv_req->state = MCA_OOB_UD_REQ_COMPLETE; - mca_oob_ud_event_queue_completed (recv_req); - - return ORTE_SUCCESS; -} - -static int mca_oob_ud_event_handle_data_ok (mca_oob_ud_port_t *port, mca_oob_ud_msg_hdr_t *msg_hdr) -{ - mca_oob_ud_req_t *send_req = msg_hdr->msg_lcl_ctx; - bool brc; - - if (NULL == send_req) { - /* ack! */ - return ORTE_ERROR; - } - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:event_handle_data_ok got data ok message for request %p", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) send_req); - - brc = mca_oob_ud_req_is_in_list (send_req, &mca_oob_ud_component.ud_active_sends); - if (false == brc) { - opal_output_verbose(0, orte_oob_base_framework.framework_output, - "%s oob:ud:event_handle_data_ok apparent duplicate data ok. " - "request %p. req list = %p", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) send_req, - (void *) send_req->req_list); - /* duplicate data ok message? 
*/ - return ORTE_SUCCESS; - } - - send_req->state = MCA_OOB_UD_REQ_COMPLETE; - mca_oob_ud_event_queue_completed (send_req); - - return ORTE_SUCCESS; -} - -static int mca_oob_ud_event_handle_req (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg_hdr) -{ - mca_oob_ud_req_t *recv_req; - int rc; - - rc = mca_oob_ud_recv_match_send (port, peer, msg_hdr, &recv_req); - if (ORTE_SUCCESS == rc) { - mca_oob_ud_event_queue_completed (recv_req); - } - - return rc; -} - -static int mca_oob_ud_event_handle_rep (mca_oob_ud_port_t *port, mca_oob_ud_msg_hdr_t *msg_hdr) -{ - mca_oob_ud_req_t *send_req = (mca_oob_ud_req_t *) msg_hdr->msg_lcl_ctx; - bool brc; - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:event_handle_rep got reply for request %p", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) send_req); - - brc = mca_oob_ud_req_is_in_list (send_req, &mca_oob_ud_component.ud_active_sends); - if (false == brc) { - opal_output_verbose(0, orte_oob_base_framework.framework_output, - "%s oob:ud:event_handle_rep no send matches reply", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - /* duplicate reply message? 
*/ - return ORTE_SUCCESS; - } - - send_req->req_mtu = min(send_req->req_mtu, msg_hdr->msg_data.rep.mtu); - send_req->req_rem_data_len = msg_hdr->msg_data.rep.data_len; - send_req->req_rem_ctx = msg_hdr->msg_rem_ctx; - send_req->req_rem_qpn = msg_hdr->msg_data.rep.qpn; - - mca_oob_ud_event_queue_completed (send_req); - - return ORTE_SUCCESS; -} - -static void *mca_oob_ud_event_dispatch(int fd, int flags, void *context) -{ - int rc; - mca_oob_ud_device_t *device = (mca_oob_ud_device_t *) context; - mca_oob_ud_port_t *port = NULL; - struct ibv_cq *event_cq = NULL; - void *event_context = NULL; - - do { - rc = ibv_get_cq_event (device->ib_channel, &event_cq, &event_context); - } while (rc && errno == EINTR); - - if (NULL == event_cq) { - /* re-arm the event */ - opal_output (0, "%s oob:ud:event_dispatch re-arm the event", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - opal_event_add (&port->device->event, NULL); - - return NULL; - } - - port = (mca_oob_ud_port_t *) event_context; - - rc = mca_oob_ud_process_messages (event_cq, port); - if (rc < 0) { - opal_output (0, "%s oob:ud:event_dispatch error processing messages", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return NULL; - } - - if (ibv_req_notify_cq(event_cq, 0)) { - opal_output (0, "%s oob:ud:event_dispatch error asking for cq notifications", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - } - - /* re-arm the event */ - opal_event_add (&port->device->event, NULL); - - return NULL; -} - -static void *mca_oob_ud_complete_dispatch(int fd, int flags, void *context) -{ - mca_oob_ud_req_t *req; - - OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_match_lock); - while (NULL != - (req = (mca_oob_ud_req_t *) opal_list_remove_first (&mca_oob_ud_component.ud_event_queued_reqs))) { - OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_match_lock); - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:event_process processing request %p", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) req); - - req->req_list = NULL; - - switch 
(req->type) { - case MCA_OOB_UD_REQ_RECV: - if (req->state == MCA_OOB_UD_REQ_COMPLETE) { - mca_oob_ud_recv_complete (req); - } else { - mca_oob_ud_req_append_to_list (req, &mca_oob_ud_component.ud_active_recvs); - mca_oob_ud_recv_try (req); - } - break; - case MCA_OOB_UD_REQ_SEND: - if (req->state == MCA_OOB_UD_REQ_COMPLETE) { - mca_oob_ud_send_complete (req, ORTE_SUCCESS); - } else { - mca_oob_ud_req_append_to_list (req, &mca_oob_ud_component.ud_active_sends); - mca_oob_ud_send_try (req); - } - break; - default: - break; - } - - OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_match_lock); - } - - return NULL; -} - -static void mca_oob_ud_stop_events (mca_oob_ud_device_t *device) -{ - opal_list_item_t *item; - - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:stop_events stopping event processing", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - for (item = opal_list_get_first (&device->ports) ; - item != opal_list_get_end (&device->ports) ; - item = opal_list_get_next (item)) { - mca_oob_ud_port_t *port = (mca_oob_ud_port_t *) item; - - /* flush all receives */ - mca_oob_ud_qp_to_reset (&port->listen_qp); - } - - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:stop_events events stopped", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); -} diff --git a/orte/mca/oob/ud/oob_ud_peer.c b/orte/mca/oob/ud/oob_ud_peer.c deleted file mode 100644 index bda4b9a1e0..0000000000 --- a/orte/mca/oob/ud/oob_ud_peer.c +++ /dev/null @@ -1,398 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * 2014 Mellanox Technologies, Inc. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#include "oob_ud_peer.h" -#include "oob_ud_component.h" - -#include "opal/include/opal_stdint.h" - -#include "orte/util/name_fns.h" -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/ess/ess.h" -#include "orte/mca/state/state.h" -#include "orte/mca/routed/routed.h" - -static void mca_oob_ud_peer_construct (mca_oob_ud_peer_t *peer); -static void mca_oob_ud_peer_destruct (mca_oob_ud_peer_t *peer); - -OBJ_CLASS_INSTANCE(mca_oob_ud_peer_t, opal_object_t, - mca_oob_ud_peer_construct, - mca_oob_ud_peer_destruct); - - -int mca_oob_ud_peer_lookup (const orte_process_name_t *name, mca_oob_ud_peer_t **peer) { - int rc; - - *peer = NULL; - - rc = opal_proc_table_get_value(&mca_oob_ud_module.peers, - *name, (void**)peer); - if (OPAL_SUCCESS != rc) { - return ORTE_ERR_UNREACH; - } - - return ORTE_SUCCESS; -} - -static inline int mca_oob_ud_parse_uri (const char *uri, uint32_t *qp_num, - uint16_t *lid, uint16_t *port_num) -{ - int rc; - - rc = sscanf (uri, "ud://%u.%hu.%hu", qp_num, lid, port_num); - if (3 != rc) { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - - return ORTE_SUCCESS; -} - -int mca_oob_ud_peer_update_with_uri (mca_oob_ud_peer_t *peer, const char *uri) -{ - opal_list_item_t *item; - struct ibv_ah_attr ah_attr; - mca_oob_ud_device_t *device; - uint32_t qp_num; - /* NTH: port is 16-bit here because C90 does not support hh in sscanf */ - uint16_t lid, port_num; - int rc; - - rc = mca_oob_ud_parse_uri (uri, &qp_num, &lid, &port_num); - if (ORTE_SUCCESS != rc) { - return rc; - } - - if (peer->peer_lid != lid || peer->peer_port != port_num) { - if (NULL != peer->peer_ah) { - (void) ibv_destroy_ah (peer->peer_ah); - peer->peer_ah = NULL; - } - } - - peer->peer_qpn = qp_num; - peer->peer_qkey = 0; /* NTH: todo -- add qkey support if needed */ - peer->peer_lid = lid; - peer->peer_port = port_num; - - if (NULL == peer->peer_ah) { - memset (&ah_attr, 0, 
sizeof (ah_attr)); - ah_attr.dlid = lid; - ah_attr.port_num = port_num; - - for (item = opal_list_get_first (&mca_oob_ud_component.ud_devices); - item != opal_list_get_end (&mca_oob_ud_component.ud_devices); - item = opal_list_get_next (item)) { - device = (mca_oob_ud_device_t *)item; - - /* try to create an address handle using this device */ - peer->peer_ah = ibv_create_ah (device->ib_pd, &ah_attr); - if (NULL != peer->peer_ah) { - peer->peer_context = (void *) item; - break; - } - } - - if (NULL == peer->peer_ah) { - free (peer); - return ORTE_ERROR; - } - } - - return ORTE_SUCCESS; -} - -mca_oob_ud_peer_t *mca_oob_ud_get_peer (struct mca_oob_ud_port_t *port, - orte_process_name_t *name, - uint32_t qpn, uint32_t qkey, - uint16_t lid, uint8_t port_num) -{ - struct ibv_ah_attr ah_attr; - mca_oob_ud_peer_t *peer; - int rc; - - rc = mca_oob_ud_peer_lookup (name, &peer); - if (ORTE_SUCCESS == rc) { - opal_output_verbose(20, orte_oob_base_framework.framework_output, - "%s oob:ud:peer_from_msg_hdr using cached peer", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - return peer; - } - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:peer_from_msg_hdr creating peer from return address", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - peer = OBJ_NEW(mca_oob_ud_peer_t); - if (NULL == peer) { - return NULL; - } - - peer->peer_qpn = qpn; - peer->peer_qkey = qkey; - peer->peer_name = *name; - peer->peer_lid = lid; - peer->peer_port = port_num; - - memset (&ah_attr, 0, sizeof (ah_attr)); - ah_attr.dlid = peer->peer_lid; - ah_attr.port_num = peer->peer_port; - - peer->peer_ah = ibv_create_ah (port->device->ib_pd, &ah_attr); - if (NULL == peer->peer_ah) { - free (peer); - return NULL; - } - - peer->peer_context = port->device; - - OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_lock); - - opal_proc_table_set_value(&mca_oob_ud_module.peers, - *name, (void *) peer); - - OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_lock); - - return peer; -} - -mca_oob_ud_peer_t 
*mca_oob_ud_peer_from_uri (const char *uri) -{ - mca_oob_ud_peer_t *peer; - int rc; - - peer = OBJ_NEW(mca_oob_ud_peer_t); - if (NULL == peer) { - return NULL; - } - - rc = mca_oob_ud_peer_update_with_uri (peer, uri); - if (ORTE_SUCCESS != rc) { - OBJ_RELEASE (peer); - peer = NULL; - } - - return peer; -} - -static void mca_oob_ud_peer_construct (mca_oob_ud_peer_t *peer) -{ - memset ((char *) peer + sizeof (peer->super), 0, sizeof (*peer) - sizeof (peer->super)); - OBJ_CONSTRUCT(&peer->peer_flying_messages, opal_list_t); - - peer->peer_expected_id = 1; -} - -void mca_oob_ud_peer_handle_end (mca_oob_ud_peer_t *peer) -{ - mca_oob_ud_port_t *port = NULL; - mca_oob_ud_msg_t *msg = NULL; - int rc; - - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:peer_handle_end telling peer %s i am going away", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->peer_name)); - - do { - /* tell the peer that we are deleting them */ - if (NULL == peer || NULL == peer->peer_context || false == peer->peer_available || - false == peer->needs_notification) { - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:peer_handle_end don't need to tell %s i am going away", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->peer_name)); - break; - } - - port = (mca_oob_ud_port_t *) opal_list_get_first (&((mca_oob_ud_device_t *)peer->peer_context)->ports); - if (NULL == port) { - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:peer_handle_end can't tell %s i am going away (no port)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->peer_name)); - break; - } - - rc = mca_oob_ud_msg_get (port, NULL, &port->listen_qp, peer, true, &msg); - if (ORTE_SUCCESS != rc) { - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:peer_handle_end can't tell %s i am going away (no message buffer)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - 
ORTE_NAME_PRINT(&peer->peer_name)); - break; - } - - peer->peer_timer.tries = 2; - peer->peer_timer.value.tv_usec = 500000; - - msg->hdr->msg_type = MCA_OOB_UD_MSG_END; - - rc = mca_oob_ud_qp_post_send (&port->listen_qp, &msg->wr, 1); - if (ORTE_SUCCESS != rc) { - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:peer_handle_end can't tell %s i am going away (send failed)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->peer_name)); - break; - } - } while (0); - - if (NULL != msg) { - mca_oob_ud_msg_return (msg); - } -} - -void mca_oob_ud_peer_lost (mca_oob_ud_peer_t *peer) -{ - OPAL_THREAD_LOCK(&peer->peer_lock); - - if (true == peer->peer_available) { - peer->peer_available = false; - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:peer_lost lost connectivity to peer %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer->peer_name)); - - /* inform the ERRMGR framework that we have lost a connection so - * it can decide if this is important, what to do about it, etc. - */ - ORTE_ACTIVATE_PROC_STATE(&peer->peer_name, ORTE_PROC_STATE_COMM_FAILED); - } - - OPAL_THREAD_UNLOCK(&peer->peer_lock); -} - -void mca_oob_ud_peer_release (mca_oob_ud_peer_t *peer) -{ - OBJ_RELEASE(peer); -} - -static void mca_oob_ud_peer_destruct (mca_oob_ud_peer_t *peer) -{ - - if (NULL != peer->peer_ah) { - (void) ibv_destroy_ah (peer->peer_ah); - } -} - -static void mca_oob_ud_peer_msg_timeout (int fd, short event, void *ctx) -{ - mca_oob_ud_peer_t *peer = (mca_oob_ud_peer_t *) ctx; - mca_oob_ud_msg_t *msg = (mca_oob_ud_msg_t *) opal_list_get_first (&peer->peer_flying_messages); - - OPAL_THREAD_LOCK(&peer->peer_lock); - - if (false == peer->peer_timer.active) { - return; - } - - peer->peer_timer.active = false; - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:peer_msg_timeout timeout sending to peer %s. 
first message = %" PRIu64 " which has length %d" , - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->peer_name), msg->hdr->msg_id, msg->wr.sg_list[0].length); - - if (peer->peer_timer.tries == 0) { - opal_list_item_t *item; - - - while (NULL != (item = opal_list_remove_first (&peer->peer_flying_messages))) { - msg = (mca_oob_ud_msg_t *) item; - - mca_oob_ud_msg_status_update (msg, MCA_OOB_UD_MSG_STATUS_TIMEOUT); - if (msg->req) { - mca_oob_ud_req_complete (msg->req, ORTE_ERR_TIMEOUT); - } - } - - OPAL_THREAD_UNLOCK(&peer->peer_lock); - mca_oob_ud_peer_lost (peer); - return; - } - - peer->peer_timer.tries--; - mca_oob_ud_peer_post_all (peer); - mca_oob_ud_peer_start_timer (peer); - - OPAL_THREAD_UNLOCK(&peer->peer_lock); -} - -int mca_oob_ud_peer_post_msg (mca_oob_ud_peer_t *peer, mca_oob_ud_msg_t *msg) -{ - int rc; - - msg->hdr->msg_id = ++peer->peer_next_id; - - rc = mca_oob_ud_qp_post_send (msg->qp, &msg->wr, 1); - if (ORTE_SUCCESS != rc) { - return rc; - } - - opal_list_append (&peer->peer_flying_messages, (opal_list_item_t *) msg); - - if (false == peer->peer_timer.active) { - mca_oob_ud_peer_reset_timer (peer); - mca_oob_ud_peer_start_timer (peer); - } - - return ORTE_SUCCESS; -} - -void mca_oob_ud_peer_stop_timer (mca_oob_ud_peer_t *peer) -{ - if (peer->peer_timer.active) { - peer->peer_timer.active = false; - opal_event_evtimer_del (&peer->peer_timer.event); - } -} - -void mca_oob_ud_peer_reset_timer (mca_oob_ud_peer_t *peer) -{ - peer->peer_timer.tries = mca_oob_ud_component.ud_max_retries; - - peer->peer_timer.value.tv_sec = mca_oob_ud_component.ud_timeout_usec / 1000000; - peer->peer_timer.value.tv_usec = mca_oob_ud_component.ud_timeout_usec % 1000000; -} - -void mca_oob_ud_peer_start_timer (mca_oob_ud_peer_t *peer) -{ - if (!peer->peer_timer.active && opal_list_get_size (&peer->peer_flying_messages)) { - peer->peer_timer.active = true; - - opal_event_evtimer_set (orte_event_base, &peer->peer_timer.event, - mca_oob_ud_peer_msg_timeout, (void 
*) peer); - opal_event_evtimer_add (&peer->peer_timer.event, &peer->peer_timer.value); - } -} - -void mca_oob_ud_peer_post_all (mca_oob_ud_peer_t *peer) -{ - opal_list_item_t *item; - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:peer_post_all reposting all messages for peer %p", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) peer); - - for (item = opal_list_get_first (&peer->peer_flying_messages) ; - item != opal_list_get_end (&peer->peer_flying_messages) ; - item = opal_list_get_next (item)) { - mca_oob_ud_msg_t *msg = (mca_oob_ud_msg_t *) item; - (void) mca_oob_ud_qp_post_send (msg->qp, &msg->wr, 1); - } -} diff --git a/orte/mca/oob/ud/oob_ud_peer.h b/orte/mca/oob/ud/oob_ud_peer.h deleted file mode 100644 index 9b72f2c896..0000000000 --- a/orte/mca/oob/ud/oob_ud_peer.h +++ /dev/null @@ -1,97 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights - * reserved. - * 2014 Mellanox Technologies, Inc. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#if !defined(MCA_OOB_UD_PEER_H) -#define MCA_OOB_UD_PEER_H - -#include "orte_config.h" - -#ifdef HAVE_SYS_TYPES_H -#include -#endif - -#include "orte/types.h" - -#include "opal/mca/base/base.h" -#include "opal/class/opal_free_list.h" -#include "opal/class/opal_hash_table.h" -#include "opal/threads/mutex.h" -#include "opal/threads/condition.h" -#include "opal/threads/threads.h" -#include "opal/mca/timer/base/base.h" - -#include "orte/mca/oob/oob.h" -#include "orte/mca/oob/base/base.h" - -#include - -struct mca_oob_ud_msg_hdr_t; -struct mca_oob_ud_port_t; - -struct mca_oob_ud_peer_t { - opal_object_t super; - - void *peer_context; - struct ibv_ah *peer_ah; - uint32_t peer_qpn; - uint32_t peer_qkey; - uint64_t peer_next_id; - uint64_t peer_expected_id; - orte_process_name_t peer_name; - uint16_t peer_lid; - uint8_t peer_port; - bool peer_available; - bool needs_notification; - - opal_list_t peer_flying_messages; - opal_mutex_t peer_lock; - - struct { - int tries; - opal_event_t event; - struct timeval value; - bool active; - } peer_timer; -}; -typedef struct mca_oob_ud_peer_t mca_oob_ud_peer_t; -OBJ_CLASS_DECLARATION(mca_oob_ud_peer_t); - - -int mca_oob_ud_peer_lookup (const orte_process_name_t *name, mca_oob_ud_peer_t **peer); - -int mca_oob_ud_peer_update_with_uri (mca_oob_ud_peer_t *peer, const char *uri); - -mca_oob_ud_peer_t *mca_oob_ud_peer_from_uri (const char *uri); - -mca_oob_ud_peer_t *mca_oob_ud_get_peer (struct mca_oob_ud_port_t *port, - orte_process_name_t *name, - uint32_t qpn, uint32_t qkey, - uint16_t lid, uint8_t port_num); - -void mca_oob_ud_peer_lost (mca_oob_ud_peer_t *peer); -void mca_oob_ud_peer_release (mca_oob_ud_peer_t *peer); - -struct mca_oob_ud_msg_t; - -int mca_oob_ud_peer_post_msg (mca_oob_ud_peer_t *peer, struct mca_oob_ud_msg_t *msg); - -void mca_oob_ud_peer_start_timer (mca_oob_ud_peer_t *peer); -void mca_oob_ud_peer_stop_timer (mca_oob_ud_peer_t 
*peer); -void mca_oob_ud_peer_reset_timer (mca_oob_ud_peer_t *peer); - -void mca_oob_ud_peer_post_all (mca_oob_ud_peer_t *peer); -void mca_oob_ud_peer_handle_end (mca_oob_ud_peer_t *peer); - -#endif - diff --git a/orte/mca/oob/ud/oob_ud_ping.c b/orte/mca/oob/ud/oob_ud_ping.c deleted file mode 100644 index 50d4cc0093..0000000000 --- a/orte/mca/oob/ud/oob_ud_ping.c +++ /dev/null @@ -1,70 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights - * reserved. - * 2014 Mellanox Technologies, Inc. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#include "opal/mca/event/event.h" -#include "opal/opal_socket_errno.h" - -#include "orte/util/name_fns.h" -#include "orte/runtime/orte_globals.h" - -#include "oob_ud_ping.h" - -int mca_oob_ud_process_ping(int fd, short args, void *cbdata) -{ - mca_oob_ud_ping_t *op = (mca_oob_ud_ping_t*)cbdata; - - orte_process_name_t* name = &op->peer; - mca_oob_ud_peer_t *peer; - mca_oob_ud_port_t *port; - mca_oob_ud_msg_t *msg = NULL; - int rc; - - opal_output_verbose (2, orte_oob_base_framework.framework_output, - "%s oob:ud:ping attempting to ping %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(name)); - - rc = mca_oob_ud_peer_lookup(name, &peer); - if (rc != ORTE_SUCCESS) { - return rc; - } - - /* NTH: TODO -- get a random port? 
*/ - port = (mca_oob_ud_port_t *) opal_list_get_first (&((mca_oob_ud_device_t *)peer->peer_context)->ports); - - do { - rc = mca_oob_ud_msg_get (port, NULL, &port->listen_qp, peer, true, &msg); - if (ORTE_SUCCESS != rc) { - break; - } - - msg->hdr->msg_type = MCA_OOB_UD_MSG_PING; - - rc = mca_oob_ud_msg_post_send (msg); - - /* wait for ack */ - rc = mca_oob_ud_msg_wait (msg); - - opal_output_verbose (2, orte_oob_base_framework.framework_output, - "%s oob:ud:ping result to %s -> %s: %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(name), rc); - } while (0); - - if (NULL != msg) { - mca_oob_ud_msg_return(msg); - } - - mca_oob_ud_peer_release (peer); - - return rc; -} diff --git a/orte/mca/oob/ud/oob_ud_ping.h b/orte/mca/oob/ud/oob_ud_ping.h deleted file mode 100644 index 1b49422b04..0000000000 --- a/orte/mca/oob/ud/oob_ud_ping.h +++ /dev/null @@ -1,39 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights - * reserved. - * 2014 Mellanox Technologies, Inc. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#ifndef _MCA_OOB_UD_PING_H_ -#define _MCA_OOB_UD_PING_H_ - -#include "oob_ud_component.h" - -typedef struct { - opal_object_t super; - opal_event_t ev; - orte_process_name_t peer; -} mca_oob_ud_ping_t; -OBJ_CLASS_DECLARATION(mca_oob_ud_ping_t); - -#define ORTE_ACTIVATE_UD_PING(p, cbfunc) \ - do { \ - mca_oob_ud_ping_t *pop; \ - pop = OBJ_NEW(mca_oob_ud_ping_t); \ - pop->peer.jobid = (p)->jobid; \ - pop->peer.vpid = (p)->vpid; \ - opal_event_set(mca_oob_ud_module.ev_base, &pop->ev, -1, \ - OPAL_EV_WRITE, (cbfunc), pop); \ - opal_event_set_priority(&pop->ev, ORTE_MSG_PRI); \ - opal_event_active(&pop->ev, OPAL_EV_WRITE, 1); \ - } while(0); - -#endif /* _MCA_OOB_UD_PING_H_ */ diff --git a/orte/mca/oob/ud/oob_ud_qp.c b/orte/mca/oob/ud/oob_ud_qp.c deleted file mode 100644 index 1fde08c4fa..0000000000 --- a/orte/mca/oob/ud/oob_ud_qp.c +++ /dev/null @@ -1,321 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights - * reserved. - * 2014 Mellanox Technologies, Inc. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#include "oob_ud_component.h" -#include "oob_ud_qp.h" -#include "oob_ud.h" -#include "orte/util/show_help.h" - -static void mca_oob_ud_qp_constructor (mca_oob_ud_qp_t *qp); -static void mca_oob_ud_qp_destructor (mca_oob_ud_qp_t *qp); - -OBJ_CLASS_INSTANCE(mca_oob_ud_qp_t, opal_free_list_item_t, - mca_oob_ud_qp_constructor, - mca_oob_ud_qp_destructor); - -static inline int mca_oob_ud_qp_process_send_completions (mca_oob_ud_qp_t *qp, - int num_completions); - -#define MCA_OOB_UD_CLEAR_CQ(cq) \ - do { \ - if (NULL == (cq)->channel) { \ - struct ibv_wc wc; \ - while (ibv_poll_cq ((cq), 1, &wc)); \ - } \ - } while (0); \ - -int mca_oob_ud_qp_init (mca_oob_ud_qp_t *qp, struct mca_oob_ud_port_t *port, - struct ibv_comp_channel *recv_channel, - struct ibv_comp_channel *send_channel, bool onecq) -{ - struct ibv_qp_init_attr init_attr; - int max_cqe = min(port->device->attr.max_cqe, 16384); - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:qp_init creating UD QP on port %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), port->port_num); - - /* create a UD queue pair */ - memset(&init_attr, 0, sizeof(init_attr)); - - init_attr.qp_type = IBV_QPT_UD; - - qp->ib_recv_cq = ibv_create_cq (port->device->ib_context, max_cqe, - port, recv_channel, 0); - if (NULL == qp->ib_recv_cq) { - orte_show_help("help-oob-ud.txt", "create-cq-failed", true, - orte_process_info.nodename, max_cqe, strerror(errno)); - return ORTE_ERROR; - } - if (false == onecq) { - qp->ib_send_cq = ibv_create_cq (port->device->ib_context, max_cqe, - port, send_channel, 0); - if (NULL == qp->ib_send_cq) { - orte_show_help("help-oob-ud.txt", "create-cq-failed", true, - orte_process_info.nodename, max_cqe, strerror(errno)); - return ORTE_ERROR; - } - } else { - qp->ib_send_cq = qp->ib_recv_cq; - } - - init_attr.send_cq = qp->ib_send_cq; - init_attr.recv_cq = qp->ib_recv_cq; - - mca_oob_ud_device_t *device = 
(mca_oob_ud_device_t *) opal_list_get_first (&mca_oob_ud_component.ud_devices); - opal_output_verbose(80, orte_oob_base_framework.framework_output, - "%s oob:ud:qp_init create queue pair for device: device->attr.max_sge = %d, device->attr.max_qp_wr = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), device->attr.max_sge, device->attr.max_qp_wr); - - init_attr.cap.max_send_sge = mca_oob_ud_component.ud_qp_max_send_sge; - init_attr.cap.max_recv_sge = mca_oob_ud_component.ud_qp_max_recv_sge; /* GRH, data */ - init_attr.cap.max_inline_data = mca_oob_ud_component.ud_qp_max_inline_data; - init_attr.cap.max_recv_wr = min(mca_oob_ud_component.ud_qp_max_recv_wr, device->attr.max_qp_wr); - init_attr.cap.max_send_wr = min(mca_oob_ud_component.ud_qp_max_send_wr, device->attr.max_qp_wr); - - qp->ib_qp = ibv_create_qp (port->device->ib_pd, &init_attr); - if (NULL == qp->ib_qp) { - orte_show_help("help-oob-ud.txt", "create-qp-failed", true, - orte_process_info.nodename, init_attr.cap.max_send_sge, init_attr.cap.max_recv_sge, - init_attr.cap.max_send_wr, init_attr.cap.max_recv_wr, init_attr.cap.max_inline_data, - strerror(errno)); - return ORTE_ERROR; - } - /* end: create the UD queue pair */ - - qp->port = port; - - return ORTE_SUCCESS; -} - -int mca_oob_ud_qp_to_reset (mca_oob_ud_qp_t *qp) -{ - struct ibv_qp_attr attr; - - /* move the QP into the ERR state */ - memset(&attr, 0, sizeof(attr)); - attr.qp_state = IBV_QPS_ERR; - - if (0 != ibv_modify_qp(qp->ib_qp, &attr, IBV_QP_STATE)) { - orte_show_help("help-oob-ud.txt", "modify-qp-failed", true, - orte_process_info.nodename, IBV_QP_STATE, strerror(errno)); - return ORTE_ERROR; - } - - /* poll thread/event will clear failed work requests */ - MCA_OOB_UD_CLEAR_CQ(qp->ib_send_cq); - MCA_OOB_UD_CLEAR_CQ(qp->ib_recv_cq); - - /* move the QP into the RESET state */ - memset(&attr, 0, sizeof(attr)); - attr.qp_state = IBV_QPS_RESET; - - if (0 != ibv_modify_qp(qp->ib_qp, &attr, IBV_QP_STATE)) { - orte_show_help("help-oob-ud.txt", 
"modify-qp-failed", true, - orte_process_info.nodename, IBV_QP_STATE, strerror(errno)); - return ORTE_ERROR; - } - - return ORTE_SUCCESS; -} - -int mca_oob_ud_qp_to_rts (mca_oob_ud_qp_t *qp) -{ - struct mca_oob_ud_port_t *port = qp->port; - int attr_mask; - struct ibv_qp_attr attr; - - /* move the QP into the INIT state */ - memset(&attr, 0, sizeof(attr)); - attr.qp_state = IBV_QPS_INIT; - attr.pkey_index = 0; /* NTH: might need to modify the pkey index later */ - attr.port_num = port->port_num; - attr.qkey = 0; - - attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY; - - if (0 != ibv_modify_qp(qp->ib_qp, &attr, attr_mask)) { - orte_show_help("help-oob-ud.txt", "modify-qp-failed", true, - orte_process_info.nodename, attr_mask, strerror(errno)); - return ORTE_ERROR; - } - - /* Move QP to RTR */ - attr.qp_state = IBV_QPS_RTR; - - if (0 != ibv_modify_qp(qp->ib_qp, &attr, IBV_QP_STATE)) { - orte_show_help("help-oob-ud.txt", "modify-qp-failed", true, - orte_process_info.nodename, attr_mask, strerror(errno)); - return ORTE_ERROR; - } - - /* Setup attributes */ - memset(&attr, 0, sizeof(attr)); - attr.qp_state = IBV_QPS_RTS; - attr.sq_psn = 0; - attr_mask = IBV_QP_STATE | IBV_QP_SQ_PSN; - - if (0 != ibv_modify_qp(qp->ib_qp, &attr, attr_mask)) { - orte_show_help("help-oob-ud.txt", "modify-qp-failed", true, - orte_process_info.nodename, attr_mask, strerror(errno)); - return ORTE_ERROR; - } - - return ORTE_SUCCESS; -} - -/* purge all work requests on a qp */ -int mca_oob_ud_qp_purge (mca_oob_ud_qp_t *qp) -{ - int rc; - - rc = mca_oob_ud_qp_to_reset (qp); - if (ORTE_SUCCESS != rc) { - return rc; - } - - return mca_oob_ud_qp_to_rts (qp); -} - -static void mca_oob_ud_qp_constructor (mca_oob_ud_qp_t *qp) -{ - memset ((char *)qp + sizeof(qp->super), 0, sizeof (*qp) - sizeof (qp->super)); -} - -static void mca_oob_ud_qp_destructor (mca_oob_ud_qp_t *qp) -{ - int rc; - - if (NULL != qp->ib_qp) { - /* clear qp and move to reset */ - (void) mca_oob_ud_qp_to_reset 
(qp); - - /* destroy qp */ - rc = ibv_destroy_qp (qp->ib_qp); - if (0 != rc) { - orte_show_help("help-oob-ud.txt", "destroy-qp-failed", true, - orte_process_info.nodename, strerror(errno)); - } - } - - if (NULL != qp->ib_send_cq) { - (void) ibv_destroy_cq (qp->ib_send_cq); - } - - if (NULL != qp->ib_recv_cq && qp->ib_recv_cq != qp->ib_send_cq) { - (void) ibv_destroy_cq (qp->ib_recv_cq); - } -} - -static inline int mca_oob_ud_qp_process_send_completions (mca_oob_ud_qp_t *qp, - int num_completions) -{ - struct ibv_wc wc[1]; - int count, rc, ret, i; - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:qp_process_send_completions polling for %d completions", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - num_completions); - - rc = ORTE_SUCCESS; - - for (count = 0 ; count < num_completions ; ) { - ret = ibv_poll_cq (qp->ib_send_cq, 1, wc); - if (ret < 0) { - orte_show_help("help-oob-ud.txt", "poll-cq-failed", true, - orte_process_info.nodename, 1, strerror(errno)); - return ORTE_ERROR; - } - for (i = 0 ; i < ret ; ++i) { - if (IBV_WC_SUCCESS != wc[i].status) { - orte_show_help("help-oob-ud.txt", "poll-cq-failed-wc", true, - orte_process_info.nodename, 1, i, wc[i].status); - rc = ORTE_ERROR; - } - } - count += ret; - } - - return rc; -} - -int mca_oob_ud_qp_post_send (mca_oob_ud_qp_t *qp, struct ibv_send_wr *wr, - int num_completions) { - struct ibv_send_wr *bad_wr; - int rc; - - rc = ibv_post_send (qp->ib_qp, wr, &bad_wr); - if (0 != rc) { - orte_show_help("help-oob-ud.txt", "post-send-failed", true, - orte_process_info.nodename, strerror(errno)); - return ORTE_ERROR; - } - return mca_oob_ud_qp_process_send_completions (qp, num_completions); -} - -int mca_oob_ud_qp_post_recv (mca_oob_ud_qp_t *qp, struct ibv_recv_wr *wr) { - - struct ibv_recv_wr *bad_wr; - int rc; - - rc = ibv_post_recv (qp->ib_qp, wr, &bad_wr); - if (0 != rc) { - orte_show_help("help-oob-ud.txt", "post-recv-failed", true, - orte_process_info.nodename, strerror(errno)); - return 
ORTE_ERROR; - } - return ORTE_SUCCESS; -} - -int mca_oob_ud_qp_data_aquire (struct mca_oob_ud_port_t *port, mca_oob_ud_qp_t **qp_ptr) { - int rc = ORTE_SUCCESS; - opal_free_list_item_t *item; - - do { - item = opal_free_list_get_st (&port->data_qps); - if (NULL == item) { - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:qp_data_aquire error allocating new data qp. error = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc); - rc = ORTE_ERR_TEMP_OUT_OF_RESOURCE; - break; - } - - *qp_ptr = (mca_oob_ud_qp_t *) item; - - if (NULL == (*qp_ptr)->ib_qp) { - rc = mca_oob_ud_qp_init (*qp_ptr, port, NULL, NULL, true); - if (ORTE_SUCCESS != rc) { - break; - } - - rc = mca_oob_ud_qp_to_rts (*qp_ptr); - } - } while (0); - - return rc; -} - -int mca_oob_ud_qp_data_release (mca_oob_ud_qp_t *qp) { - int rc; - rc = mca_oob_ud_qp_purge (qp); - if (ORTE_SUCCESS != rc) { - return rc; - } - - opal_free_list_return_st (&qp->port->data_qps, &qp->super); - - return ORTE_SUCCESS; -} diff --git a/orte/mca/oob/ud/oob_ud_qp.h b/orte/mca/oob/ud/oob_ud_qp.h deleted file mode 100644 index 9354ee226b..0000000000 --- a/orte/mca/oob/ud/oob_ud_qp.h +++ /dev/null @@ -1,73 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights - * reserved. - * 2014 Mellanox Technologies, Inc. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#if !defined(MCA_OOB_UD_QP_H) -#define MCA_OOB_UD_QP_H - -#include "orte_config.h" - -#ifdef HAVE_SYS_TYPES_H -#include -#endif - -#include "orte/types.h" - -#include "opal/mca/base/base.h" -#include "opal/class/opal_free_list.h" -#include "opal/class/opal_hash_table.h" -#include "opal/threads/mutex.h" -#include "opal/threads/condition.h" -#include "opal/threads/threads.h" -#include "opal/mca/timer/base/base.h" - -#include "orte/mca/oob/oob.h" -#include "orte/mca/oob/base/base.h" - -#include - - -enum mca_oob_ud_qp_type_t { - MCA_OOB_UD_QP_DATA, - MCA_OOB_UD_QP_LISTEN -}; - -struct mca_oob_ud_port_t; - -struct mca_oob_ud_qp_t { - opal_free_list_item_t super; - enum mca_oob_ud_qp_type_t type; - - struct ibv_qp *ib_qp; - struct mca_oob_ud_port_t *port; - - struct ibv_cq *ib_send_cq, *ib_recv_cq; -}; -typedef struct mca_oob_ud_qp_t mca_oob_ud_qp_t; -OBJ_CLASS_DECLARATION(mca_oob_ud_qp_t); - -int mca_oob_ud_qp_init (mca_oob_ud_qp_t *qp, struct mca_oob_ud_port_t *port, - struct ibv_comp_channel *recv_channel, - struct ibv_comp_channel *send_channel, bool onecq); - -int mca_oob_ud_qp_to_reset (mca_oob_ud_qp_t *qp); -int mca_oob_ud_qp_to_rts (mca_oob_ud_qp_t *qp); -int mca_oob_ud_qp_purge (mca_oob_ud_qp_t *qp); - -int mca_oob_ud_qp_post_send (mca_oob_ud_qp_t *qp, struct ibv_send_wr *wr, int num_completions); -int mca_oob_ud_qp_post_recv (mca_oob_ud_qp_t *qp, struct ibv_recv_wr *wr); - -int mca_oob_ud_qp_data_aquire (struct mca_oob_ud_port_t *port, mca_oob_ud_qp_t **qp_ptr); -int mca_oob_ud_qp_data_release (mca_oob_ud_qp_t *qp); - -#endif diff --git a/orte/mca/oob/ud/oob_ud_recv.c b/orte/mca/oob/ud/oob_ud_recv.c deleted file mode 100644 index fb1e4ef491..0000000000 --- a/orte/mca/oob/ud/oob_ud_recv.c +++ /dev/null @@ -1,539 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights - * reserved. 
- * 2014 Mellanox Technologies, Inc. - * All rights reserved. - * Copyright (c) 2015 Intel, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#include "orte_config.h" -#include "orte/types.h" -#include "opal/types.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/runtime/orte_globals.h" -#include "orte/util/name_fns.h" -#include "orte/util/show_help.h" - -#include "math.h" - -#include "oob_ud_component.h" - -#define min(a,b) ((a) < (b) ? (a) : (b)) - -/* Caller MUST hold the matching lock before calling */ -static inline int mca_oob_ud_find_recv (opal_list_t *list, const orte_process_name_t name, - const int tag, mca_oob_ud_req_t **req) -{ - opal_list_item_t *item; - int rc = ORTE_ERR_NOT_FOUND; - - *req = NULL; - - OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_match_lock); - - for (item = opal_list_get_first (list) ; item != opal_list_get_end (list) ; - item = opal_list_get_next (item)) { - mca_oob_ud_req_t *recv_req = (mca_oob_ud_req_t *) item; - - opal_output_verbose(15, orte_oob_base_framework.framework_output, - "%s oob:ud:find_recv matching against " - "peer: %s, tag: %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&recv_req->req_origin), recv_req->req_tag); - - if (OPAL_EQUAL == opal_dss.compare (&name, &recv_req->req_origin, ORTE_NAME) && - tag == recv_req->req_tag) { - *req = recv_req; - rc = ORTE_SUCCESS; - break; - } - } - - opal_output_verbose(15, orte_oob_base_framework.framework_output, - "%s oob:ud:find_recv %sfound", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_SUCCESS != rc ? 
"not " : ""); - - - OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_match_lock); - - if (ORTE_SUCCESS == rc) { - mca_oob_ud_req_append_to_list (*req, NULL); - } - - return rc; -} - -int mca_oob_ud_get_recv_req (const orte_process_name_t name, const int tag, - mca_oob_ud_req_t **reqp, bool iovec_used) { - mca_oob_ud_req_t *req; - - opal_output_verbose(15, orte_oob_base_framework.framework_output, - "%s oob:ud:get_recv_req create receive request against: %s, tag: %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name), tag); - - *reqp = req = OBJ_NEW(mca_oob_ud_req_t); - - req->req_origin = name; - req->req_tag = tag; - req->req_channel = ORTE_RML_INVALID_CHANNEL_NUM; - req->req_seq_num = 0; - /* this receive was not expected */ - req->type = MCA_OOB_UD_REQ_RECV; - - /* let mca_oob_ud_recv_alloc alloc memory for the receive */ - if (iovec_used) { - req->req_data.iov.uiov = calloc (1, sizeof (struct iovec)); - req->req_data_type = MCA_OOB_UD_REQ_IOV; - } else { - req->req_data_type = MCA_OOB_UD_REQ_BUF; - } - req->req_data.iov.count = 1; - - return ORTE_SUCCESS; -} - -static inline int mca_oob_ud_find_active_recv (const orte_process_name_t name, const int tag, - mca_oob_ud_req_t **req) { - opal_output_verbose(15, orte_oob_base_framework.framework_output, - "%s oob:ud:recv_match active receive request " - "against: %s, tag: %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name), tag); - - return mca_oob_ud_find_recv (&mca_oob_ud_component.ud_active_recvs, name, tag, req); -} - -static void mca_oob_ud_recv_try_to (int fd, short event, void *data) -{ - (void) mca_oob_ud_recv_try ((mca_oob_ud_req_t *) data); -} - -int mca_oob_ud_recv_try (mca_oob_ud_req_t *recv_req) -{ - int rc, data_len; - int wr_count, sge_count, wr_index, sge_index, iov_index; - unsigned int iov_left, iov_offset, packet_size; - const unsigned int mtu = recv_req->req_mtu; - struct timeval aquire_timeout = {0, 500000}; - mca_oob_ud_msg_t *rep_msg = NULL; - - opal_output_verbose(10, 
orte_oob_base_framework.framework_output, - "%s oob:ud:recv_try receiving from %s. recv_req = %p. rem ctx = %p", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&recv_req->req_peer->peer_name), - (void *)recv_req, (void *)recv_req->req_rem_ctx); - - do { - if (NULL == recv_req->req_qp) { - rc = mca_oob_ud_qp_data_aquire (recv_req->req_port, &recv_req->req_qp); - if (ORTE_SUCCESS != rc) { - break; - } - } - - (void) mca_oob_ud_qp_purge (recv_req->req_qp); - - rc = mca_oob_ud_msg_get (recv_req->req_port, recv_req, &recv_req->req_port->listen_qp, - recv_req->req_peer, NULL, &rep_msg); - if (ORTE_SUCCESS != rc) { - break; - } - - if (MCA_OOB_UD_REQ_IOV == recv_req->req_data_type) { - if (NULL == recv_req->req_data.iov.mr) { - /* allocate space for memory registers */ - recv_req->req_data.iov.mr = (struct ibv_mr **) calloc (recv_req->req_data.iov.count, sizeof (struct ibv_mr *)); - if (NULL == recv_req->req_data.iov.mr) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - rc = ORTE_ERR_OUT_OF_RESOURCE; - break; - } - } - - rc = mca_oob_ud_register_iov (recv_req->req_data.iov.uiov, recv_req->req_data.iov.count, - recv_req->req_data.iov.mr, recv_req->req_port->device->ib_pd, - mtu, &sge_count, &wr_count, &data_len); - - if (ORTE_SUCCESS != rc) { - break; - } - } else { - data_len = recv_req->req_data.buf.size; - rc = mca_oob_ud_register_buf (recv_req->req_data.buf.p, recv_req->req_data.buf.size, - &recv_req->req_data.buf.mr, recv_req->req_port->device->ib_pd, - mtu, &sge_count, &wr_count); - - if (ORTE_SUCCESS != rc) { - break; - } - } - - data_len = min(data_len, recv_req->req_rem_data_len); - if (data_len < recv_req->req_rem_data_len) { - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:recv_try receive buffers are not big. this is probably an error condition." 
- "data_len = %d, recv_req->req_rem_data_len = %d.", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), data_len, recv_req->req_rem_data_len); - rc = ORTE_ERR_BAD_PARAM; - break; - } - - wr_count = (data_len + mtu - 1) / mtu; - sge_count += wr_count; - - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:recv_try receiving %d bytes in %d " - "work requests, %d sges", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), data_len, - wr_count, sge_count); - - recv_req->req_packet_count = wr_count; - - if (NULL == recv_req->req_wr.recv) { - /* allocate work requests */ - recv_req->req_wr.recv = (struct ibv_recv_wr *) calloc (wr_count, sizeof (struct ibv_recv_wr)); - if (NULL == recv_req->req_wr.recv) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - rc = ORTE_ERR_OUT_OF_RESOURCE; - break; - } - } - - if (NULL == recv_req->req_sge) { - /* allocate scatter-gather lists. we need more to hold the grh */ - recv_req->req_sge = (struct ibv_sge *) calloc (sge_count, sizeof (struct ibv_sge)); - if (NULL == recv_req->req_sge) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - rc = ORTE_ERR_OUT_OF_RESOURCE; - break; - } - } - - if (NULL == recv_req->req_grh) { - /* allocate grh buffers */ - recv_req->req_grh = (struct ibv_grh *) calloc (wr_count, sizeof (struct ibv_grh)); - if (NULL == recv_req->req_grh) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - rc = ORTE_ERR_OUT_OF_RESOURCE; - break; - } - } - - if (NULL == recv_req->req_grh_mr) { - /* register grh buffers */ - recv_req->req_grh_mr = ibv_reg_mr (recv_req->req_port->device->ib_pd, recv_req->req_grh, - wr_count * sizeof (struct ibv_grh), - IBV_ACCESS_LOCAL_WRITE); - if (NULL == recv_req->req_grh_mr) { - orte_show_help("help-oob-ud.txt", "reg-mr-failed", true, - orte_process_info.nodename, recv_req->req_grh, - wr_count * sizeof (struct ibv_grh), strerror(errno)); - /* could not register memory */ - rc = ORTE_ERR_OUT_OF_RESOURCE; - break; - } - } - - rc = ORTE_SUCCESS; - - if (MCA_OOB_UD_REQ_IOV == recv_req->req_data_type) { - 
iov_left = recv_req->req_data.iov.uiov[0].iov_len; - iov_offset = 0; - iov_index = 0; - - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud: recv_req->req_data.iov.uiov[0].iov_len = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)recv_req->req_data.iov.uiov[0].iov_len); - - for (wr_index = 0, sge_index = 0 ; wr_index < wr_count ; ++wr_index) { - int sge_first = sge_index; - - packet_size = 0; - - /* grh */ - mca_oob_ud_fill_sge(recv_req->req_sge + sge_index++, - recv_req->req_grh + wr_index, - sizeof (struct ibv_grh), - recv_req->req_grh_mr->lkey); - - do { - int to_recv = min (iov_left, mtu - packet_size); - - mca_oob_ud_fill_sge(recv_req->req_sge + sge_index++, - (char *)recv_req->req_data.iov.uiov[iov_index].iov_base + iov_offset, - to_recv, recv_req->req_data.iov.mr[iov_index]->lkey); - - iov_offset += to_recv; - iov_left -= to_recv; - packet_size += to_recv; - - if (0 == iov_left) { - iov_index++; - iov_offset = 0; - - if (iov_index < recv_req->req_data.iov.count) { - iov_left = recv_req->req_data.iov.uiov[iov_index].iov_len; - } - } - } while ((packet_size < mtu) && (iov_left > 0)); - - mca_oob_ud_fill_recv_wr(recv_req->req_wr.recv + wr_index, - recv_req->req_sge + sge_first, - sge_index - sge_first); - - if (wr_index + 1 < wr_count) { - recv_req->req_wr.recv[wr_index].next = recv_req->req_wr.recv + wr_index + 1; - } - } - } else { - unsigned int buffer_left = recv_req->req_data.buf.size; - unsigned int buffer_offset = 0; - - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:recv_try recv_req->req_data.buf.size = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_req->req_data.buf.size); - - for (wr_index = 0, sge_index = 0 ; wr_index < wr_count ; ++wr_index) { - int sge_first = sge_index; - - packet_size = 0; - - /* grh */ - mca_oob_ud_fill_sge(recv_req->req_sge + sge_index++, - recv_req->req_grh + wr_index, - sizeof (struct ibv_grh), - recv_req->req_grh_mr->lkey); - - do { - int to_recv = min 
(buffer_left, mtu - packet_size); - - mca_oob_ud_fill_sge(recv_req->req_sge + sge_index++, - (char *)recv_req->req_data.buf.p + buffer_offset, - to_recv, recv_req->req_data.buf.mr->lkey); - - buffer_offset += to_recv; - buffer_left -= to_recv; - packet_size += to_recv; - } while ((packet_size < mtu) && (buffer_left > 0)); - - mca_oob_ud_fill_recv_wr(recv_req->req_wr.recv + wr_index, - recv_req->req_sge + sge_first, - sge_index - sge_first); - - if (wr_index + 1 < wr_count) { - recv_req->req_wr.recv[wr_index].next = recv_req->req_wr.recv + wr_index + 1; - } - } - } - - rc = mca_oob_ud_qp_post_recv (recv_req->req_qp, recv_req->req_wr.recv); - if (ORTE_SUCCESS != rc) { - break; - } - - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:recv_try posting reply message", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - /* ok, we have a data queue pair */ - rep_msg->hdr->msg_type = MCA_OOB_UD_MSG_REPLY; - rep_msg->hdr->msg_lcl_ctx = recv_req->req_rem_ctx; - rep_msg->hdr->msg_rem_ctx = recv_req; - - rep_msg->hdr->msg_data.rep.qpn = recv_req->req_qp->ib_qp->qp_num; - rep_msg->hdr->msg_data.rep.data_len = data_len; - rep_msg->hdr->msg_data.rep.mtu = mtu; - - rc = mca_oob_ud_msg_post_send (rep_msg); - - /* post send already returned the message */ - rep_msg = NULL; - } while (0); - - if (ORTE_ERR_TEMP_OUT_OF_RESOURCE == rc) { - mca_oob_ud_req_timer_set (recv_req, &aquire_timeout, 1, mca_oob_ud_recv_try_to); - rc = ORTE_SUCCESS; - } - - if (ORTE_SUCCESS != rc) { - /* bad stuff happened */ - mca_oob_ud_req_complete (recv_req, rc); - - if (mca_oob_ud_req_is_in_list(recv_req, &mca_oob_ud_component.ud_active_recvs)) { - opal_list_remove_item (&mca_oob_ud_component.ud_active_recvs, (opal_list_item_t *) recv_req); - } - OBJ_RELEASE(recv_req); - return rc; - } - - recv_req->state = MCA_OOB_UD_REQ_ACTIVE; - - return rc; -} - -int mca_oob_ud_recv_complete (mca_oob_ud_req_t *recv_req) -{ - mca_oob_ud_msg_t *dataok; - int i, j, rc = ORTE_SUCCESS; - uint32_t expected; 
- bool error = false, out_of_order = false; -#if defined(HAVE_VALGRIND) - int iov_index; -#endif - - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:recv_complete req = %p", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) recv_req); - - if (false == recv_req->req_is_eager) { - for (i = 0, expected = 0 ; i < recv_req->req_packet_count ; ) { - struct ibv_wc wc[10]; - - rc = ibv_poll_cq (recv_req->req_qp->ib_recv_cq, 10, wc); - for (j = 0 ; j < rc ; ++j) { - if (wc[j].imm_data != expected) { - out_of_order = true; - } - if (IBV_WC_SUCCESS != wc[j].status) { - error = true; - } - - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:recv_complete wc status = %d. imm data = %u. len = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), wc[j].status, wc[j].imm_data, - wc[j].byte_len); - - expected++; - } - - if (rc <= 0) { - break; - } - - i += rc; - } - - if (i != recv_req->req_packet_count || error || out_of_order) { - /* retry */ - recv_req->state = MCA_OOB_UD_REQ_PENDING; - - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:recv_complete receive incomplete. error: %d, " - "out_of_order: %d packets: %d/%d. 
rc = %d, errno = %d.", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), error, out_of_order, i, - recv_req->req_packet_count, rc, errno); - mca_oob_ud_recv_try (recv_req); - - return ORTE_SUCCESS; - } - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:recv_complete data received ok!", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - /* send data ok and wait for ack */ - rc = mca_oob_ud_msg_get (recv_req->req_port, recv_req, &recv_req->req_port->listen_qp, - recv_req->req_peer, false, &dataok); - if (ORTE_SUCCESS != rc) { - return rc; - } - - dataok->hdr->msg_type = MCA_OOB_UD_MSG_DATA_OK; - dataok->hdr->msg_lcl_ctx = recv_req->req_rem_ctx; - - rc = mca_oob_ud_msg_post_send (dataok); - if (ORTE_SUCCESS != rc) { - return rc; - } - } - -#if defined(HAVE_VALGRIND) - for (iov_index = 0 ; iov_index < recv_req->req_count ; ++iov_index) { - VALGRIND_MAKE_MEM_DEFINED(recv_req->req_uiov[iov_index].iov_base, - recv_req->req_uiov[iov_index].iov_len); - } -#endif - - mca_oob_ud_req_complete (recv_req, rc); - - return ORTE_SUCCESS; -} - -int mca_oob_ud_recv_match_send (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg_hdr, - mca_oob_ud_req_t **reqp) -{ - char *data = (msg_hdr->msg_data.req.data_follows ? 
(char *)(msg_hdr + 1) : NULL); - mca_oob_ud_req_t *req; - int rc, i; - - *reqp = NULL; - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:recv_incoming_send matching incoming " - "send from peer %s with tag %d (data_follows = %d, data = %p, iovec_use = %d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&msg_hdr->msg_origin), msg_hdr->msg_data.req.tag, - msg_hdr->msg_data.req.data_follows, (void *)data, msg_hdr->msg_data.req.data_iovec_used); - - rc = mca_oob_ud_get_recv_req (msg_hdr->msg_origin, msg_hdr->msg_data.req.tag, &req, msg_hdr->msg_data.req.data_iovec_used); - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - return rc; - } - - req->req_rem_ctx = msg_hdr->msg_rem_ctx; - req->req_port = port; - req->req_mtu = min(port->mtu, msg_hdr->msg_data.req.mtu); - req->req_origin = msg_hdr->msg_origin; - req->req_target = msg_hdr->msg_target; - req->req_rem_data_len = msg_hdr->msg_data.req.data_len; - req->req_channel = msg_hdr->msg_channel; - req->req_seq_num = msg_hdr->msg_seq_num; - - do { - rc = mca_oob_ud_recv_alloc (req); - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - free (req->req_data.iov.uiov); - OBJ_RELEASE(req); - req = NULL; - break; - } - req->req_peer = peer; - OBJ_RETAIN(req->req_peer); - - if (NULL == data) { - req->state = MCA_OOB_UD_REQ_ACTIVE; - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:recv_incoming_send request still active", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - break; - } - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:recv_incoming_send send was eager", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - req->req_is_eager = true; - - if (msg_hdr->msg_data.req.data_iovec_used) { - for (i = 0 ; i < req->req_data.iov.count; ++i) { - memcpy (req->req_data.iov.uiov[i].iov_base, data, req->req_data.iov.uiov[i].iov_len); - data += req->req_data.iov.uiov[i].iov_len; - } - } else { - memcpy(req->req_data.buf.p, data, 
msg_hdr->msg_data.req.data_len); - } - - req->state = MCA_OOB_UD_REQ_COMPLETE; - } while (0); - - *reqp = req; - - return rc; -} diff --git a/orte/mca/oob/ud/oob_ud_req.c b/orte/mca/oob/ud/oob_ud_req.c deleted file mode 100644 index 4e804a1e9e..0000000000 --- a/orte/mca/oob/ud/oob_ud_req.c +++ /dev/null @@ -1,420 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights - * reserved. - * 2014 Mellanox Technologies, Inc. - * All rights reserved. - * Copyright (c) 2015 Intel, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#include "oob_ud_component.h" -#include "oob_ud_req.h" - -#include "orte/util/name_fns.h" -#include "orte/runtime/orte_globals.h" - -static void mca_oob_ud_req_constuct (mca_oob_ud_req_t *req); -static void mca_oob_ud_req_destruct (mca_oob_ud_req_t *req); - -OBJ_CLASS_INSTANCE(mca_oob_ud_req_t, opal_list_item_t, mca_oob_ud_req_constuct, - mca_oob_ud_req_destruct); - -static void mca_oob_ud_msg_destruct (mca_oob_ud_msg_t *msg); -static void mca_oob_ud_msg_construct (mca_oob_ud_msg_t *msg); - -OBJ_CLASS_INSTANCE(mca_oob_ud_msg_t, opal_free_list_item_t, - mca_oob_ud_msg_construct, - mca_oob_ud_msg_destruct); - -static void mca_oob_ud_req_constuct (mca_oob_ud_req_t *req) -{ - memset ((char *)req + sizeof (req->super), 0, sizeof (*req) - sizeof (req->super)); -} - -static void mca_oob_ud_req_destruct (mca_oob_ud_req_t *req) -{ - int i; - - if (req->req_peer) { - OBJ_RELEASE(req->req_peer); - } - - if (req->req_wr.send) { - free (req->req_wr.send); - } - - if (req->req_grh_mr) { - (void) ibv_dereg_mr (req->req_grh_mr); - } - - if (req->req_grh) { - free (req->req_grh); - } - - if (req->req_sge) { - free (req->req_sge); - } - - MCA_OOB_UD_REQ_DEREG_MR(req); -} - -void mca_oob_ud_req_timer_set (mca_oob_ud_req_t *req, const struct timeval *timeout, - int max_tries, void (*cb)(evutil_socket_t, short, void 
*)) -{ - opal_event_evtimer_set (orte_event_base, &req->timer.event, cb, (void *) req); - req->timer.value.tv_sec = timeout->tv_sec; - req->timer.value.tv_usec = timeout->tv_usec; - opal_event_evtimer_add (&req->timer.event, &req->timer.value); -} - -int mca_oob_ud_msg_get (struct mca_oob_ud_port_t *port, mca_oob_ud_req_t *req, - mca_oob_ud_qp_t *qp, mca_oob_ud_peer_t *peer, bool persist, - mca_oob_ud_msg_t **msgp) -{ - opal_free_list_item_t *item; - opal_free_list_t *list = &port->free_msgs; - - item = opal_free_list_wait_st (list); - if (NULL == item) { - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:msg_get error getting message buffer", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return ORTE_ERROR; - } - - *msgp = (mca_oob_ud_msg_t *) item; - - (*msgp)->persist = persist; - (*msgp)->req = req; - (*msgp)->peer = peer; - (*msgp)->qp = qp; - - if (NULL != peer) { - OBJ_RETAIN(peer); - } - - memset ((*msgp)->hdr, 0, sizeof (*((*msgp)->hdr))); - - mca_oob_ud_fill_sge (&(*msgp)->sge, (*msgp)->hdr, port->mtu, (*msgp)->mr->lkey); - mca_oob_ud_fill_send_wr (&(*msgp)->wr, &(*msgp)->sge, 1, peer); - - /* set return address */ - (*msgp)->hdr->ra.name = *ORTE_PROC_MY_NAME; - (*msgp)->hdr->ra.qkey = 0; - (*msgp)->hdr->ra.port_num = port->port_num; - - return ORTE_SUCCESS; -} - -int mca_oob_ud_msg_init (opal_free_list_item_t *item, void *context) { - mca_oob_ud_port_t *port = (mca_oob_ud_port_t *) context; - int buffer_id = port->send_buffer_index++ + mca_oob_ud_component.ud_recv_buffer_count; - char *buf = port->msg_buf.ptr + buffer_id * port->mtu; - mca_oob_ud_msg_t *msg = (mca_oob_ud_msg_t *) item; - - msg->port = port; - msg->hdr = (mca_oob_ud_msg_hdr_t *) buf; - msg->mr = port->msg_buf.mr; - - return ORTE_SUCCESS; -} - -void mca_oob_ud_msg_return (mca_oob_ud_msg_t *msg) -{ - opal_free_list_t *list = &msg->port->free_msgs; - - if (NULL != msg->peer) { - mca_oob_ud_peer_release (msg->peer); - } - - msg->peer = NULL; - msg->cbfunc = NULL; - 
msg->qp = NULL; - msg->req = NULL; - - opal_free_list_return_st (list, &msg->super); -} - -static void mca_oob_ud_msg_construct (mca_oob_ud_msg_t *msg) -{ - memset ((char *)msg + sizeof (msg->super), 0, sizeof (*msg) - sizeof (msg->super)); - - OBJ_CONSTRUCT(&msg->status_changed, opal_condition_t); - OBJ_CONSTRUCT(&msg->lock, opal_mutex_t); -} - -static void mca_oob_ud_msg_destruct (mca_oob_ud_msg_t *msg) -{ - OBJ_DESTRUCT(&msg->status_changed); - OBJ_DESTRUCT(&msg->lock); - - if (NULL != msg->peer) { - mca_oob_ud_peer_release (msg->peer); - } -} - -int mca_oob_ud_msg_post_send (mca_oob_ud_msg_t *msg) -{ - int rc = ORTE_SUCCESS; - - msg->status = MCA_OOB_UD_MSG_STATUS_POSTED; - - OPAL_THREAD_LOCK(&msg->peer->peer_lock); - - if (MCA_OOB_UD_MSG_ACK == msg->hdr->msg_type || - MCA_OOB_UD_MSG_NACK == msg->hdr->msg_type) { - rc = mca_oob_ud_qp_post_send (msg->qp, &msg->wr, 1); - } else { - rc = mca_oob_ud_peer_post_msg (msg->peer, msg); - } - - if (ORTE_SUCCESS != rc && false == msg->persist) { - msg->status = MCA_OOB_UD_MSG_STATUS_ERROR; - mca_oob_ud_msg_return (msg); - } - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:msg_post_send posted send for msg %p with id %" PRIu64, - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) msg, msg->hdr->msg_id); - - OPAL_THREAD_UNLOCK(&msg->peer->peer_lock); - - return rc; -} - -int mca_oob_ud_msg_status_update (mca_oob_ud_msg_t *msg, mca_oob_ud_status_t status) -{ - int rc; - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:msg_status_update setting status of msg %p to %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) msg, (int) status); - - OPAL_THREAD_LOCK(&msg->lock); - - if (status != msg->status) { - if (MCA_OOB_UD_MSG_STATUS_COMPLETE == status) { - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:msg_status_update setting peer %s as available", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&msg->peer->peer_name)); - - 
msg->peer->peer_available = true; - } - - switch (status) { - case MCA_OOB_UD_MSG_STATUS_TIMEOUT: - rc = ORTE_ERR_TIMEOUT; - break; - case MCA_OOB_UD_MSG_STATUS_COMPLETE: - rc = ORTE_SUCCESS; - break; - case MCA_OOB_UD_MSG_STATUS_ERROR: - default: - rc = ORTE_ERROR; - } - - if (msg->cbfunc) { - msg->cbfunc (msg, rc); - } - - /* signal status change */ - msg->status = status; - opal_condition_signal (&msg->status_changed); - - OPAL_THREAD_UNLOCK(&msg->lock); - - if (false == msg->persist) { - mca_oob_ud_msg_return (msg); - } - - return ORTE_SUCCESS; - } - - OPAL_THREAD_UNLOCK(&msg->lock); - - return ORTE_SUCCESS; -} - -static void mca_oob_ud_req_return (mca_oob_ud_req_t *req) -{ - opal_output_verbose(15, orte_oob_base_framework.framework_output, - "%s oob:ud:req_return returning req %p", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) req); - - mca_oob_ud_req_append_to_list (req, NULL); - - if (NULL != req->req_peer) { - mca_oob_ud_peer_release (req->req_peer); - req->req_peer = NULL; - } - - if (NULL != req->req_wr.send) { - free (req->req_wr.send); - req->req_wr.send = NULL; - } - - if (NULL != req->req_sge) { - free (req->req_sge); - req->req_sge = NULL; - } - - OBJ_RELEASE(req); -} - -void mca_oob_ud_req_complete (mca_oob_ud_req_t *req, int rc) -{ - int i; - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:req_complete %s request %p completed with status %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (req->type == MCA_OOB_UD_REQ_SEND) ? 
"SEND":"RECV", (void *) req, rc); - - if (NULL != req->req_qp) { - (void) mca_oob_ud_qp_data_release (req->req_qp); - req->req_qp = NULL; - } - - /* deregister memory *before* handing it to the callback */ - MCA_OOB_UD_REQ_DEREG_MR(req); - - switch (req->type) { - case MCA_OOB_UD_REQ_SEND: - if (req->req_data_type != MCA_OOB_UD_REQ_TR) { - req->rml_msg->status = rc; - } - break; - case MCA_OOB_UD_REQ_RECV: - if ((req->req_target.jobid == ORTE_PROC_MY_NAME->jobid) && - (req->req_target.vpid == ORTE_PROC_MY_NAME->vpid)) { - opal_output_verbose(1, orte_oob_base_framework.framework_output, - "%s DELIVERING TO RML", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - if (MCA_OOB_UD_REQ_IOV == req->req_data_type) { - char *data = (char *)calloc(req->req_data.iov.count, sizeof(struct iovec)); - int datalen = 0; - for (i = 0 ; i < req->req_data.iov.count; ++i) { - memcpy (&data[datalen], req->req_data.iov.uiov[i].iov_base, req->req_data.iov.uiov[i].iov_len); - datalen += req->req_data.iov.uiov[i].iov_len; - } - ORTE_RML_POST_MESSAGE(&req->req_origin, req->req_tag, req->req_seq_num, data, datalen); - free(data); - } else { - ORTE_RML_POST_MESSAGE(&req->req_origin, req->req_tag, req->req_seq_num, - req->req_data.buf.p, req->req_data.buf.size); - } - } else { - opal_output_verbose(1, orte_oob_base_framework.framework_output, - "%s UD PROMOTING ROUTED MESSAGE FOR %s TO OOB", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&req->req_target)); - - orte_rml_send_t *snd = OBJ_NEW(orte_rml_send_t); - snd->dst = req->req_target; - snd->origin = req->req_origin; - snd->tag = req->req_tag; - snd->seq_num = req->req_seq_num; - if (MCA_OOB_UD_REQ_IOV == req->req_data_type) { - char *data = (char *)calloc(req->req_data.iov.count, sizeof(struct iovec)); - int datalen = 0; - for (i = 0 ; i < req->req_data.iov.count; ++i) { - memcpy (&data[datalen], req->req_data.iov.uiov[i].iov_base, req->req_data.iov.uiov[i].iov_len); - datalen += req->req_data.iov.uiov[i].iov_len; - } - snd->data = data; - 
snd->count = datalen; - } else { - char *data = (char *)calloc(req->req_data.buf.size, sizeof(char)); - memcpy (data, req->req_data.buf.p, req->req_data.buf.size); - snd->data = data; - snd->count = req->req_data.buf.size; - } - snd->cbfunc.iov = NULL; - snd->cbdata = NULL; - /* activate the OOB send state */ - ORTE_OOB_SEND(snd); - } - break; - default: - break; - } - - mca_oob_ud_req_return (req); -} - -void mca_oob_ud_req_append_to_list (mca_oob_ud_req_t *req, opal_list_t *list) -{ - OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_match_lock); - - if (NULL != req->req_list) { - opal_list_remove_item (req->req_list, (opal_list_item_t *) req); - } - - if (NULL != list) { - opal_list_append (list, (opal_list_item_t *) req); - } - - req->req_list = list; - - OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_match_lock); -} - -bool mca_oob_ud_req_is_in_list (mca_oob_ud_req_t *req, opal_list_t *list) -{ - opal_list_item_t *item; - bool rc = false; - - OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_match_lock); - - for (item = opal_list_get_first (list) ; - item != opal_list_get_end (list) ; - item = opal_list_get_next (item)) { - if (item == (opal_list_item_t *) req) { - rc = true; - break; - } - } - - OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_match_lock); - - return rc; -} - -void mca_oob_ud_req_abort (mca_oob_ud_req_t *req) -{ - /* caller should have removed this request from any owner list */ - req->req_list = NULL; - - if (NULL != req->req_qp) { - mca_oob_ud_qp_data_release (req->req_qp); - req->req_qp = NULL; - } - - /* free up request resources */ - mca_oob_ud_req_complete (req, ORTE_ERR_INTERUPTED); -} - -int mca_oob_ud_msg_wait (mca_oob_ud_msg_t *msg) -{ - OPAL_THREAD_LOCK(&msg->lock); - /* wait for ack */ - while (MCA_OOB_UD_MSG_STATUS_POSTED == msg->status) { - opal_condition_wait (&msg->status_changed, &msg->lock); - } - OPAL_THREAD_UNLOCK(&msg->lock); - - switch (msg->status) { - case MCA_OOB_UD_MSG_STATUS_TIMEOUT: - return ORTE_ERR_TIMEOUT; - case 
MCA_OOB_UD_MSG_STATUS_COMPLETE: - return ORTE_SUCCESS; - case MCA_OOB_UD_MSG_STATUS_ERROR: - default: - return ORTE_ERROR; - } -} diff --git a/orte/mca/oob/ud/oob_ud_req.h b/orte/mca/oob/ud/oob_ud_req.h deleted file mode 100644 index 6764401782..0000000000 --- a/orte/mca/oob/ud/oob_ud_req.h +++ /dev/null @@ -1,281 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights - * reserved. - * 2014 Mellanox Technologies, Inc. - * All rights reserved. - * Copyright (c) 2015 Intel, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#if !defined(MCA_OOB_UD_REQ_H) -#define MCA_OOB_UD_REQ_H - -#include "oob_ud_peer.h" - -#include "orte_config.h" -#include "orte/types.h" -#include "opal/threads/condition.h" -#include "opal/mca/event/event.h" -#include "opal/class/opal_free_list.h" -#include "orte/mca/rml/rml.h" - -#include - -#include "oob_ud_qp.h" - -struct mca_oob_ud_peer_t; - -enum mca_oob_ud_req_type_t { - MCA_OOB_UD_REQ_RECV, - MCA_OOB_UD_REQ_SEND -}; -typedef enum mca_oob_ud_req_type_t mca_oob_ud_req_type_t; - -enum mca_oob_ud_req_state_t { - MCA_OOB_UD_REQ_ACTIVE, - MCA_OOB_UD_REQ_PENDING, - MCA_OOB_UD_REQ_COMPLETE -}; -typedef enum mca_oob_ud_req_state_t mca_oob_ud_req_state_t; - -enum mca_oob_ud_req_data_type_t { - MCA_OOB_UD_REQ_IOV, - MCA_OOB_UD_REQ_BUF, - MCA_OOB_UD_REQ_TR -}; -typedef enum mca_oob_ud_req_data_type_t mca_oob_ud_req_data_type_t; - -enum mca_oob_ud_msg_type_t { - MCA_OOB_UD_MSG_REQUEST = 37, - MCA_OOB_UD_MSG_REPLY = 38, - MCA_OOB_UD_MSG_COMPLETE = 39, - MCA_OOB_UD_MSG_PING = 40, - MCA_OOB_UD_MSG_ACK = 41, - MCA_OOB_UD_MSG_NACK = 42, - MCA_OOB_UD_MSG_DATA_OK = 43, - MCA_OOB_UD_MSG_END = 44 -}; -typedef enum mca_oob_ud_msg_type_t mca_oob_ud_msg_type_t; - -struct mca_oob_ud_msg_hdr_t { - mca_oob_ud_msg_type_t msg_type; - - void *msg_rem_ctx; - void *msg_lcl_ctx; - - orte_process_name_t msg_origin; - 
orte_process_name_t msg_target; - int msg_channel; - int msg_seq_num; - - uint64_t msg_id; - - struct { - /* the receiver can get the qpn and lid from the work completion */ - uint32_t qkey; - orte_process_name_t name; - uint8_t port_num; - } ra; - - union { - struct { - int tag; - int data_len; - int mtu; - bool data_follows; - bool data_iovec_used; - } req; - struct { - uint32_t qpn; - int data_len; - int tag; - int mtu; - } rep; - } msg_data; -}; -typedef struct mca_oob_ud_msg_hdr_t mca_oob_ud_msg_hdr_t; - -struct mca_oob_ud_req_t { - opal_list_item_t super; - - mca_oob_ud_req_type_t type; - mca_oob_ud_req_state_t state; - - union { - struct ibv_send_wr *send; - struct ibv_recv_wr *recv; - } req_wr; - - /* storage for ib grh */ - struct ibv_grh *req_grh; - struct ibv_mr *req_grh_mr; - - struct ibv_sge *req_sge; - - /* negotiated mtu */ - int req_mtu; - uint32_t req_rem_qpn; - int req_rem_data_len; - - int req_packet_count; - - struct mca_oob_ud_peer_t *req_peer; - struct mca_oob_ud_port_t *req_port; - struct mca_oob_ud_qp_t *req_qp; - - /* remote context (request or response) */ - void *req_rem_ctx; - - /* retry timer */ - struct { - opal_event_t event; - struct timeval value; - } timer; - - /* user request */ - orte_process_name_t req_target; - orte_process_name_t req_origin; - - mca_oob_ud_req_data_type_t req_data_type; - union { - struct { - struct ibv_mr **mr; - struct iovec *uiov; - int count; - }iov; - struct { - struct ibv_mr *mr; - char *p; - int size; - }buf; - }req_data; - - int req_tag; - int req_channel; - int req_seq_num; - int req_rc; - - void *req_cbdata; - - /* what list is this request in */ - opal_list_t *req_list; - - bool req_is_eager; - - orte_rml_send_t *rml_msg; -}; - -typedef struct mca_oob_ud_req_t mca_oob_ud_req_t; -OBJ_CLASS_DECLARATION(mca_oob_ud_req_t); - -enum mca_oob_ud_status_t { - /* message posted */ - MCA_OOB_UD_MSG_STATUS_POSTED, - /* remote side receive the message (ack'd) */ - MCA_OOB_UD_MSG_STATUS_COMPLETE, - /* request 
message timed out */ - MCA_OOB_UD_MSG_STATUS_TIMEOUT, - /* other failure */ - MCA_OOB_UD_MSG_STATUS_ERROR -}; -typedef enum mca_oob_ud_status_t mca_oob_ud_status_t; - -struct mca_oob_ud_msg_t { - opal_free_list_item_t super; - - struct ibv_send_wr wr; - struct ibv_sge sge; - mca_oob_ud_msg_hdr_t *hdr; - struct ibv_mr *mr; - - /* qp this request was sent over */ - struct mca_oob_ud_qp_t *qp; - struct mca_oob_ud_port_t *port; - - opal_mutex_t lock; - opal_condition_t status_changed; - mca_oob_ud_status_t status; - - bool persist; - mca_oob_ud_req_t *req; - - void (*cbfunc) (struct mca_oob_ud_msg_t *, int); - - struct mca_oob_ud_peer_t *peer; -}; -typedef struct mca_oob_ud_msg_t mca_oob_ud_msg_t; -OBJ_CLASS_DECLARATION(mca_oob_ud_msg_t); - -static inline int mca_oob_ud_recv_alloc (mca_oob_ud_req_t *recv_req) -{ - int iov_index; - - size_t alloc_size = recv_req->req_rem_data_len; - if (MCA_OOB_UD_REQ_IOV == recv_req->req_data_type) { - for (iov_index = 0 ; iov_index < recv_req->req_data.iov.count - 1 ; ++iov_index) { - alloc_size -= recv_req->req_data.iov.uiov[iov_index].iov_len; - } - - recv_req->req_data.iov.uiov[iov_index].iov_len = alloc_size; - recv_req->req_data.iov.uiov[iov_index].iov_base = calloc (alloc_size, 1); - - if (NULL == recv_req->req_data.iov.uiov[iov_index].iov_base) { - return ORTE_ERROR; - } - } else { - recv_req->req_data.buf.p = (char *)calloc(recv_req->req_rem_data_len, sizeof(char)); - if (NULL == recv_req->req_data.buf.p) { - return ORTE_ERROR; - } - recv_req->req_data.buf.size = recv_req->req_rem_data_len; - } - return ORTE_SUCCESS; -} - -#define MCA_OOB_UD_REQ_DEREG_MR(req) \ - if (MCA_OOB_UD_REQ_IOV == req->req_data_type) { \ - if (req->req_data.iov.mr) { \ - for (i = 0 ; i < req->req_data.iov.count ; ++i) { \ - if (req->req_data.iov.mr[i]) { \ - (void) ibv_dereg_mr (req->req_data.iov.mr[i]); \ - req->req_data.iov.mr[i] = NULL; \ - } \ - } \ - free (req->req_data.iov.mr); \ - req->req_data.iov.mr = NULL; \ - } \ - } else { \ - if 
(req->req_data.buf.mr) { \ - (void) ibv_dereg_mr (req->req_data.buf.mr); \ - req->req_data.buf.mr = NULL; \ - } \ - } - -int mca_oob_ud_msg_get (struct mca_oob_ud_port_t *port, mca_oob_ud_req_t *req, - mca_oob_ud_qp_t *qp, mca_oob_ud_peer_t *peer, bool persist, - mca_oob_ud_msg_t **msgp); -int mca_oob_ud_msg_init (opal_free_list_item_t *item, void *context); -void mca_oob_ud_msg_return (mca_oob_ud_msg_t *msg); - - -void mca_oob_ud_req_timer_set (mca_oob_ud_req_t *req, const struct timeval *timeout, - int max_tries, void (*cb)(evutil_socket_t, short, void *)); - -int mca_oob_ud_msg_post_send (mca_oob_ud_msg_t *msg); -int mca_oob_ud_msg_wait (mca_oob_ud_msg_t *msg); - -int mca_oob_ud_msg_status_update (mca_oob_ud_msg_t *msg, mca_oob_ud_status_t status); - -void mca_oob_ud_req_complete (mca_oob_ud_req_t *req, int rc); -void mca_oob_ud_req_abort (mca_oob_ud_req_t *req); - -void mca_oob_ud_req_append_to_list (mca_oob_ud_req_t *req, opal_list_t *list); -bool mca_oob_ud_req_is_in_list (mca_oob_ud_req_t *req, opal_list_t *list); - -#endif diff --git a/orte/mca/oob/ud/oob_ud_send.c b/orte/mca/oob/ud/oob_ud_send.c deleted file mode 100644 index 584a336dbc..0000000000 --- a/orte/mca/oob/ud/oob_ud_send.c +++ /dev/null @@ -1,543 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights - * reserved. - * 2014 Mellanox Technologies, Inc. - * All rights reserved. - * Copyright (c) 2015-2016 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ -#include "oob_ud_send.h" -#include "orte/mca/errmgr/errmgr.h" - -static void mca_oob_ud_send_cb (mca_oob_ud_msg_t *msg, int rc) -{ - mca_oob_ud_send_complete (msg->req, rc); -} - -static int mca_oob_ud_send_self (orte_rml_send_t *msg) -{ - unsigned int srco, dsto; - mca_oob_ud_req_t *req; - int srci, dsti; - int rc, size; - - MCA_OOB_UD_IOV_SIZE(msg, size); - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s mca_oob_ud_send_self: sending %d bytes to myself", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), size); - - rc = mca_oob_ud_get_recv_req (*ORTE_PROC_MY_NAME, msg->tag, &req, (msg->iov != NULL) ? true : false); - if (ORTE_SUCCESS != rc) { - return rc; - } - - req->req_rem_data_len = size; - req->req_is_eager = true; - - rc = mca_oob_ud_recv_alloc (req); - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - if (MCA_OOB_UD_REQ_IOV == req->req_data_type) { - free (req->req_data.iov.uiov); - } - OBJ_RELEASE(req); - return rc; - } - - srci = dsti = 0; - srco = dsto = 0; - - if (msg->iov != NULL) { - do { - req->req_data_type = MCA_OOB_UD_REQ_IOV; - size_t copy = min(msg->iov[srci].iov_len - srco, - req->req_data.iov.uiov[dsti].iov_len - dsto); - - memmove ((unsigned char *) req->req_data.iov.uiov[dsti].iov_base + dsto, - (unsigned char *) msg->iov[srci].iov_base + srco, copy); - - srco += copy; - if (srco == msg->iov[srci].iov_len) { - srci++; - srco = 0; - } - - dsto += copy; - if (dsto == req->req_data.iov.uiov[dsti].iov_len) { - dsti++; - dsto = 0; - } - } while (srci < req->req_data.iov.count && dsti < msg->count); - } else { - req->req_data_type = MCA_OOB_UD_REQ_BUF; - - opal_buffer_t *buffer; - buffer = OBJ_NEW(opal_buffer_t); - - if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(buffer, msg->buffer))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(buffer); - return rc; - } - if (OPAL_SUCCESS != (rc = opal_dss.unload(buffer, (void **)&req->req_data.buf.p, 
&req->req_data.buf.size))) - { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(buffer); - free(req->req_data.buf.p); - return rc; - } - OBJ_RELEASE(buffer); - } - - req->state = MCA_OOB_UD_REQ_COMPLETE; - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s mca_oob_ud_send_self: complete. calling callbacks", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - /* queue up recv callback */ - mca_oob_ud_event_queue_completed (req); - - req->rml_msg->status = ORTE_SUCCESS; - - return size; -} - -int mca_oob_ud_process_send_nb(int fd, short args, void *cbdata) -{ - mca_oob_ud_msg_op_t *op = (mca_oob_ud_msg_op_t*)cbdata; - - orte_process_name_t hop; - mca_oob_ud_peer_t *peer; - mca_oob_ud_port_t *port; - mca_oob_ud_msg_t *req_msg; - mca_oob_ud_req_t *send_req; - bool send_eager = false; - char *pack_ptr; - int rc, size, i; - - if (OPAL_EQUAL == orte_util_compare_name_fields - (ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, &op->msg->dst)) { - return mca_oob_ud_send_self (op->msg); - } - - /* if we have a route to this peer, then we can reach it */ - hop = orte_routed.get_route(NULL, &op->msg->dst); - if (ORTE_JOBID_INVALID == hop.jobid || - ORTE_VPID_INVALID == hop.vpid) { - ORTE_ERROR_LOG(ORTE_ERR_UNREACH); - return ORTE_ERR_UNREACH; - } - - rc = mca_oob_ud_peer_lookup (&hop, &peer); - if(ORTE_SUCCESS != rc || NULL == peer) { - ORTE_ERROR_LOG((NULL == peer) ? ORTE_ERR_UNREACH : rc); - return (NULL == peer) ? ORTE_ERR_UNREACH : rc; - } - - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s oob:ud:send_nb to pear %s via hop %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&op->msg->dst), ORTE_NAME_PRINT(&hop)); - - /* NTH: TODO -- get a random port? 
*/ - port = (mca_oob_ud_port_t *) opal_list_get_first (&((mca_oob_ud_device_t *)peer->peer_context)->ports); - - send_req = OBJ_NEW(mca_oob_ud_req_t); - if (!send_req) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - /* fill in request */ - send_req->req_target = op->msg->dst; - send_req->req_origin = op->msg->origin; - send_req->req_tag = op->msg->tag; - send_req->req_seq_num = op->msg->seq_num; - - if (op->msg->data != NULL) { - size = op->msg->count; - - send_req->req_data_type = MCA_OOB_UD_REQ_TR; - - send_req->req_data.buf.p = (char *)calloc(size, sizeof(char)); - memcpy(send_req->req_data.buf.p, op->msg->data, op->msg->count); - send_req->req_data.buf.size = op->msg->count; - } else { - MCA_OOB_UD_IOV_SIZE(op->msg, size); - - if (op->msg->iov != NULL) { - send_req->req_data_type = MCA_OOB_UD_REQ_IOV; - send_req->req_data.iov.uiov = op->msg->iov; - send_req->req_data.iov.count = op->msg->count; - } else { - send_req->req_data_type = MCA_OOB_UD_REQ_BUF; - - opal_buffer_t *buffer; - buffer = OBJ_NEW(opal_buffer_t); - - if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(buffer, op->msg->buffer))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(buffer); - return rc; - } - - if (OPAL_SUCCESS != (rc = opal_dss.unload(buffer, (void **)&send_req->req_data.buf.p, &send_req->req_data.buf.size))) - { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(buffer); - free(send_req->req_data.buf.p); - return rc; - } - OBJ_RELEASE(buffer); - } - } - send_req->rml_msg = op->msg; - send_req->req_cbdata = op->msg->cbdata; - send_req->req_peer = peer; - send_req->req_mtu = port->mtu; - send_req->req_port = port; - send_req->req_rc = 0; - - send_req->state = MCA_OOB_UD_REQ_PENDING; - send_req->type = MCA_OOB_UD_REQ_SEND; - - OBJ_RETAIN(peer); - - if (size + sizeof (mca_oob_ud_msg_hdr_t) <= (unsigned int)port->mtu) { - send_eager = true; - } - - rc = mca_oob_ud_msg_get (port, send_req, &port->listen_qp, peer, false, &req_msg); - if (ORTE_SUCCESS != rc) { - OBJ_RELEASE 
(send_req); - return rc; - } - - /* fill in message header */ - req_msg->hdr->msg_type = MCA_OOB_UD_MSG_REQUEST; - req_msg->hdr->msg_rem_ctx = send_req; - - req_msg->hdr->msg_origin = op->msg->origin; - req_msg->hdr->msg_target = op->msg->dst; - req_msg->hdr->msg_seq_num = op->msg->seq_num; - - req_msg->hdr->msg_data.req.data_len = size; - req_msg->hdr->msg_data.req.mtu = port->mtu; - req_msg->hdr->msg_data.req.tag = op->msg->tag; - - if (MCA_OOB_UD_REQ_IOV == send_req->req_data_type) { - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s-%s send_nb: tag %d size %lu. msg: %p. peer = %p. req = %p." - "count = %d. uiov = %p.\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&op->msg->dst), - op->msg->tag, (unsigned long)size, - (void *) req_msg, - (void *) peer, (void *) send_req, - send_req->req_data.iov.count, (void *) send_req->req_data.iov.uiov); - } else { - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s-%s send_nb: tag %d size %lu. msg: %p. peer = %p. req = %p." 
- "buffer = %p.\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&op->msg->dst), - op->msg->tag, (unsigned long)size, - (void *) req_msg, - (void *) peer, (void *) send_req, (void *) send_req->req_data.buf.p); - } - - if (!send_eager) { - mca_oob_ud_req_append_to_list (send_req, &mca_oob_ud_component.ud_active_sends); - - /* send request */ - return mca_oob_ud_msg_post_send (req_msg); - } - - pack_ptr = (char *)(req_msg->hdr + 1); - - if (op->msg->iov != NULL) { - for (i = 0 ; i < op->msg->count ; ++i) { - memcpy (pack_ptr, op->msg->iov[i].iov_base, op->msg->iov[i].iov_len); - pack_ptr += op->msg->iov[i].iov_len; - } - } else { - memcpy(pack_ptr, send_req->req_data.buf.p, send_req->req_data.buf.size); - } - - send_req->req_list = NULL; - - req_msg->hdr->msg_data.req.data_follows = true; - - req_msg->cbfunc = mca_oob_ud_send_cb; - req_msg->req = send_req; - - do { - /* send request */ - rc = mca_oob_ud_msg_post_send (req_msg); - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - break; - } - } while (0); - - return rc; -} - -static void mca_oob_ud_send_try_to (int fd, short event, void *ctx) -{ - OPAL_THREAD_LOCK(&mca_oob_ud_component.ud_match_lock); - (void) mca_oob_ud_send_try ((mca_oob_ud_req_t *) ctx); - OPAL_THREAD_UNLOCK(&mca_oob_ud_component.ud_match_lock); -} - -int mca_oob_ud_send_try (mca_oob_ud_req_t *send_req) { - int wr_index, wr_count, sge_count, sge_index, iov_index; - unsigned int iov_left, iov_offset, packet_size; - const unsigned int mtu = send_req->req_mtu; - const struct timeval aquire_timeout = {0, 500000}; - mca_oob_ud_msg_t *com_msg; - int data_len; - int rc = ORTE_SUCCESS; - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:send_try sending to %s, tag = %d, " - "req = %p", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&send_req->req_peer->peer_name), - send_req->req_tag, (void *) send_req); - - do { - if (NULL == send_req->req_qp) { - rc = mca_oob_ud_qp_data_aquire (send_req->req_port, 
&send_req->req_qp); - if (ORTE_SUCCESS != rc) { - break; - } - } - - (void) mca_oob_ud_qp_purge (send_req->req_qp); - - rc = mca_oob_ud_msg_get (send_req->req_port, send_req, send_req->req_qp, send_req->req_peer, false, - &com_msg); - if (ORTE_SUCCESS != rc) { - break; - } - - if (MCA_OOB_UD_REQ_IOV == send_req->req_data_type) { - if (NULL == send_req->req_data.iov.mr) { - /* allocate space for memory registers */ - send_req->req_data.iov.mr = (struct ibv_mr **) calloc (send_req->req_data.iov.count, sizeof (struct ibv_mr *)); - if (NULL == send_req->req_data.iov.mr) { - rc = ORTE_ERR_OUT_OF_RESOURCE; - ORTE_ERROR_LOG(rc); - break; - } - } - - rc = mca_oob_ud_register_iov (send_req->req_data.iov.uiov, send_req->req_data.iov.count, - send_req->req_data.iov.mr, send_req->req_port->device->ib_pd, - mtu, &sge_count, &wr_count, &data_len); - - if (ORTE_SUCCESS != rc) { - break; - } - } else { - data_len = send_req->req_data.buf.size; - rc = mca_oob_ud_register_buf(send_req->req_data.buf.p, send_req->req_data.buf.size, - &send_req->req_data.buf.mr, send_req->req_port->device->ib_pd, - mtu, &sge_count, &wr_count); - - if (ORTE_SUCCESS != rc) { - break; - } - } - - wr_count = (data_len + mtu - 1) / mtu; - - if (data_len > 0) { - data_len = data_len + 0; - } - - if (MCA_OOB_UD_REQ_IOV == send_req->req_data_type) { - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:send_try sending %d bytes in %d " - "work requests, %d sges. uiov = %p.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), data_len, - wr_count, sge_count, (void *) send_req->req_data.iov.uiov); - } else { - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:send_try sending %d bytes in %d " - "work requests, %d sges. 
buf = %p", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), data_len, - wr_count, sge_count, (void *) send_req->req_data.buf.p); - } - - if (wr_count && NULL == send_req->req_wr.send) { - send_req->req_wr.send = (struct ibv_send_wr *) calloc (wr_count, sizeof (struct ibv_send_wr)); - if (NULL == send_req->req_wr.send) { - rc = ORTE_ERR_OUT_OF_RESOURCE; - ORTE_ERROR_LOG(rc); - break; - } - } - - if (wr_count && NULL == send_req->req_sge) { - send_req->req_sge = (struct ibv_sge *) calloc (sge_count, sizeof (struct ibv_sge)); - - if (NULL == send_req->req_sge) { - rc = ORTE_ERR_OUT_OF_RESOURCE; - ORTE_ERROR_LOG(rc); - break; - } - } - - if (MCA_OOB_UD_REQ_IOV == send_req->req_data_type) { - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:send_try posting message using iovec", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - iov_left = send_req->req_data.iov.uiov[0].iov_len; - iov_offset = 0; - iov_index = 0; - - for (wr_index = 0, sge_index = 0 ; wr_index < wr_count ; ++wr_index) { - int sge_first = sge_index; - - packet_size = 0; - - do { - int to_send = min (iov_left, mtu - packet_size); - - mca_oob_ud_fill_sge(send_req->req_sge + sge_index++, - (char *)send_req->req_data.iov.uiov[iov_index].iov_base + iov_offset, - to_send, send_req->req_data.iov.mr[iov_index]->lkey); - - iov_offset += to_send; - iov_left -= to_send; - packet_size += to_send; - - if (0 == iov_left) { - iov_index++; - iov_offset = 0; - - if (iov_index < send_req->req_data.iov.count) { - iov_left = send_req->req_data.iov.uiov[iov_index].iov_len; - } - } - } while ((packet_size < mtu) && (iov_left > 0)); - - mca_oob_ud_fill_send_wr(send_req->req_wr.send + wr_index, - send_req->req_sge + sge_first, - sge_index - sge_first, send_req->req_peer); - - /* we don't care about completions for data */ - send_req->req_wr.send[wr_index].send_flags = IBV_SEND_SOLICITED; - - /* sequence number */ - send_req->req_wr.send[wr_index].imm_data = wr_index; - send_req->req_wr.send[wr_index].wr.ud.remote_qpn = 
send_req->req_rem_qpn; - send_req->req_wr.send[wr_index].opcode = IBV_WR_SEND_WITH_IMM; - - if (wr_index + 1 < wr_count) { - send_req->req_wr.send[wr_index].next = send_req->req_wr.send + wr_index + 1; - } - } - } else {//data is in buffer - unsigned int buffer_offset = 0; - unsigned int buffer_size = send_req->req_data.buf.size; - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:send_try posting message using buffer", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - for (wr_index = 0, sge_index = 0 ; wr_index < wr_count ; ++wr_index) { - int sge_first = sge_index; - - packet_size = 0; - - do { - int to_send = min (buffer_size, mtu - packet_size); - - mca_oob_ud_fill_sge(send_req->req_sge + sge_index++, - (char *)send_req->req_data.buf.p + buffer_offset, - to_send, send_req->req_data.buf.mr->lkey); - - buffer_offset += to_send; - buffer_size -= to_send; - packet_size += to_send; - } while ((packet_size < mtu) && (buffer_size > 0)); - - mca_oob_ud_fill_send_wr(send_req->req_wr.send + wr_index, - send_req->req_sge + sge_first, - sge_index - sge_first, send_req->req_peer); - - /* we don't care about completions for data */ - send_req->req_wr.send[wr_index].send_flags = IBV_SEND_SOLICITED; - - /* sequence number */ - send_req->req_wr.send[wr_index].imm_data = wr_index; - send_req->req_wr.send[wr_index].wr.ud.remote_qpn = send_req->req_rem_qpn; - send_req->req_wr.send[wr_index].opcode = IBV_WR_SEND_WITH_IMM; - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:send_try imm_data = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), wr_index); - - if (wr_index + 1 < wr_count) { - send_req->req_wr.send[wr_index].next = send_req->req_wr.send + wr_index + 1; - } - } - } - - /* send data */ - rc = mca_oob_ud_qp_post_send (send_req->req_qp, send_req->req_wr.send, 0); - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - break; - } - - opal_output_verbose(10, orte_oob_base_framework.framework_output, - "%s oob:ud:send_try posting 
completion message", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - /* Fill in completion message. This message will go to the peers listen QP but - must originate from our data qp to ensure that it is sent last. */ - com_msg->hdr->msg_type = MCA_OOB_UD_MSG_COMPLETE; - com_msg->hdr->msg_lcl_ctx = send_req->req_rem_ctx; - com_msg->hdr->msg_rem_ctx = send_req; - - /* send message header */ - rc = mca_oob_ud_msg_post_send (com_msg); - - /* post_send already returned the message */ - com_msg = NULL; - } while (0); - - if (ORTE_ERR_TEMP_OUT_OF_RESOURCE == rc) { - /* set timer to retry post */ - mca_oob_ud_req_timer_set (send_req, &aquire_timeout, 1, mca_oob_ud_send_try_to); - rc = ORTE_SUCCESS; - } - - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - /* damn */ - return mca_oob_ud_send_complete (send_req, rc); - } - - send_req->state = MCA_OOB_UD_REQ_ACTIVE; - - return rc; -} - -int mca_oob_ud_send_complete (mca_oob_ud_req_t *send_req, int rc) -{ - mca_oob_ud_req_complete (send_req, rc); - - return rc; -} diff --git a/orte/mca/oob/ud/oob_ud_send.h b/orte/mca/oob/ud/oob_ud_send.h deleted file mode 100644 index 630693baba..0000000000 --- a/orte/mca/oob/ud/oob_ud_send.h +++ /dev/null @@ -1,55 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2014 Mellanox Technologies, Inc. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#if !defined(MCA_OOB_UD_SEND_H) -#define MCA_OOB_UD_SEND_H - -#include "oob_ud_component.h" - -#define min(a,b) ((a) < (b) ? 
(a) : (b)) - -#define MCA_OOB_UD_IOV_SIZE(msg, size) \ - do { \ - if (msg->iov != NULL) { \ - int i; \ - for (i = 0, (size) = 0 ; i < (msg->count) ; ++i) { \ - (size) += (msg->iov)[i].iov_len; \ - } \ - } else { \ - (size) = msg->buffer->bytes_used; \ - } \ - } while (0); - -/* State machine for processing message */ -typedef struct { - opal_object_t super; - opal_event_t ev; - orte_rml_send_t *msg; -} mca_oob_ud_msg_op_t; -OBJ_CLASS_DECLARATION(mca_oob_ud_msg_op_t); - -#define ORTE_ACTIVATE_UD_POST_SEND(ms, cbfunc) \ - do { \ - mca_oob_ud_msg_op_t *mop; \ - opal_output_verbose(5, orte_oob_base_framework.framework_output,\ - "%s:[%s:%d] post send to %s", \ - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \ - __FILE__, __LINE__, \ - ORTE_NAME_PRINT(&((ms)->dst))); \ - mop = OBJ_NEW(mca_oob_ud_msg_op_t); \ - mop->msg = (ms); \ - opal_event_set(mca_oob_ud_module.ev_base, &mop->ev, -1, \ - OPAL_EV_WRITE, (cbfunc), mop); \ - opal_event_set_priority(&mop->ev, ORTE_MSG_PRI); \ - opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \ - } while(0); -#endif diff --git a/orte/mca/oob/ud/owner.txt b/orte/mca/oob/ud/owner.txt deleted file mode 100644 index 6163d58e4c..0000000000 --- a/orte/mca/oob/ud/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: MELLANOX -status: maintenance