From 55915c388558028f6be1c04cc6f4b4573f2aff73 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Tue, 19 Feb 2019 10:27:47 -0700 Subject: [PATCH] rml/ofi: remove per discussion at the 2/19/19 devel-core meeting, remove rml/ofi from 4.0.x Signed-off-by: Howard Pritchard --- orte/mca/rml/ofi/Makefile.am | 53 -- orte/mca/rml/ofi/configure.m4 | 31 - orte/mca/rml/ofi/rml_ofi.h | 213 ----- orte/mca/rml/ofi/rml_ofi_component.c | 1191 -------------------------- orte/mca/rml/ofi/rml_ofi_request.h | 137 --- orte/mca/rml/ofi/rml_ofi_send.c | 1052 ----------------------- 6 files changed, 2677 deletions(-) delete mode 100644 orte/mca/rml/ofi/Makefile.am delete mode 100644 orte/mca/rml/ofi/configure.m4 delete mode 100644 orte/mca/rml/ofi/rml_ofi.h delete mode 100644 orte/mca/rml/ofi/rml_ofi_component.c delete mode 100644 orte/mca/rml/ofi/rml_ofi_request.h delete mode 100644 orte/mca/rml/ofi/rml_ofi_send.c diff --git a/orte/mca/rml/ofi/Makefile.am b/orte/mca/rml/ofi/Makefile.am deleted file mode 100644 index a6a4f90f0a..0000000000 --- a/orte/mca/rml/ofi/Makefile.am +++ /dev/null @@ -1,53 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2015-2017 Intel, Inc. All rights reserved. -# Copyright (c) 2017 Los Alamos National Security, LLC. All rights -# reserved. -# Copyright (c) 2017 IBM Corporation. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -AM_CPPFLAGS = $(opal_common_ofi_CPPFLAGS) - -sources = \ - rml_ofi.h \ - rml_ofi_request.h \ - rml_ofi_component.c \ - rml_ofi_send.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -if MCA_BUILD_orte_rml_ofi_DSO -component_noinst = -component_install = mca_rml_ofi.la -else -component_noinst = libmca_rml_ofi.la -component_install = -endif - -mcacomponentdir = $(ortelibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_rml_ofi_la_SOURCES = $(sources) -mca_rml_ofi_la_LDFLAGS = -module -avoid-version -mca_rml_ofi_la_LIBADD = $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \ - $(OPAL_TOP_BUILDDIR)/opal/mca/common/ofi/lib@OPAL_LIB_PREFIX@mca_common_ofi.la - -noinst_LTLIBRARIES = $(component_noinst) -libmca_rml_ofi_la_SOURCES = $(sources) -libmca_rml_ofi_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/rml/ofi/configure.m4 b/orte/mca/rml/ofi/configure.m4 deleted file mode 100644 index 35327c29d4..0000000000 --- a/orte/mca/rml/ofi/configure.m4 +++ /dev/null @@ -1,31 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2013-2014 Intel, Inc. All rights reserved -# -# Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2017 Los Alamos National Security, LLC. All rights -# reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# MCA_orte_rml_ofi_POST_CONFIG(will_build) -# ---------------------------------------- -# Only require the tag if we're actually going to be built - -# MCA_mtl_ofi_CONFIG([action-if-can-compile], -# [action-if-cant-compile]) -# ------------------------------------------------ -AC_DEFUN([MCA_orte_rml_ofi_CONFIG],[ - AC_CONFIG_FILES([orte/mca/rml/ofi/Makefile]) - - # ensure we already ran the common OFI libfabric config - AC_REQUIRE([MCA_opal_common_ofi_CONFIG]) - - AS_IF([test "$opal_common_ofi_happy" = "yes"], - [$1], - [$2]) -])dnl diff --git a/orte/mca/rml/ofi/rml_ofi.h b/orte/mca/rml/ofi/rml_ofi.h deleted file mode 100644 index 465d28c484..0000000000 --- a/orte/mca/rml/ofi/rml_ofi.h +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Copyright (c) 2015 Intel, Inc. All rights reserved - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_RML_OFI_RML_OFI_H -#define MCA_RML_OFI_RML_OFI_H - -#include "orte_config.h" - -#include "opal/dss/dss_types.h" -#include "opal/mca/event/event.h" -#include "opal/mca/pmix/pmix.h" -#include "orte/mca/rml/base/base.h" - -#include -#include -#include -#include -#include -#include - -#include "rml_ofi_request.h" - -/** the maximum open OFI ofi_prov - assuming system will have no more than 20 transports*/ -#define MAX_OFI_PROVIDERS 40 -#define RML_OFI_PROV_ID_INVALID 0xFF - -/** RML/OFI key values **/ -/* (char*) ofi socket address (type IN) of the node process is running on */ -#define OPAL_RML_OFI_FI_SOCKADDR_IN "rml.ofi.fisockaddrin" -/* (char*) ofi socket address (type PSM) of the node process is running on */ -#define OPAL_RML_OFI_FI_ADDR_PSMX "rml.ofi.fiaddrpsmx" - -// MULTI_BUF_SIZE_FACTOR defines how large the multi recv buffer will be. -// In order to use FI_MULTI_RECV feature efficiently, we need to have a -// large recv buffer so that we don't need to repost the buffer often to -// get the remaining data when the buffer is full -#define MULTI_BUF_SIZE_FACTOR 128 -#define MIN_MULTI_BUF_SIZE (1024 * 1024) - -#define OFIADDR "ofiaddr" - -#define CLOSE_FID(fd) \ - do { \ - int _ret = 0; \ - if (0 != (fd)) { \ - _ret = fi_close(&(fd)->fid); \ - fd = NULL; \ - if (0 != _ret) { \ - opal_output_verbose(10,orte_rml_base_framework.framework_output, \ - " %s - fi_close failed with error- %d", \ - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ret); \ - } \ - } \ - } while (0); - - -#define RML_OFI_RETRY_UNTIL_DONE(FUNC) \ - do { \ - do { \ - ret = FUNC; \ - if(OPAL_LIKELY(0 == ret)) {break;} \ - } while(-FI_EAGAIN == ret); \ - } while(0); - -BEGIN_C_DECLS - -struct orte_rml_ofi_module_t; - -/** This structure will hold the ep and all ofi objects for each transport -and also the corresponding fi_info -**/ -typedef struct { - - /** ofi provider ID **/ - uint8_t ofi_prov_id; - - /** fi_info for this transport */ - struct fi_info *fabric_info; - - /** Fabric Domain handle */ - struct fid_fabric *fabric; - - /** Access Domain handle */ - struct fid_domain *domain; - - /** Address vector handle */ - struct fid_av *av; - - /** Completion queue handle */ - struct fid_cq *cq; - - /** Endpoint to communicate on */ - struct fid_ep *ep; - - /** Endpoint name */ - char ep_name[FI_NAME_MAX]; - - /** Endpoint name length */ - size_t epnamelen; - - /** OFI memory region */ - struct fid_mr *mr_multi_recv; - - /** buffer for tx and rx */ - void *rxbuf; - - uint64_t rxbuf_size; - - /* event,fd associated with the cq */ - int fd; - - /*event associated with progress fn */ - opal_event_t progress_event; - bool progress_ev_active; - - struct fi_context rx_ctx1; - -} ofi_transport_ofi_prov_t; - - - struct orte_rml_ofi_module_t { - orte_rml_base_module_t api; - - /** current ofi transport id the component is using, this will be initialised - ** in the open_ofi_prov() call **/ - int cur_transport_id; - - /** Fabric info structure of all supported transports in system **/ - struct fi_info *fi_info_list; - - /** OFI ep and corr fi_info for all the transports (ofi_providers) **/ - ofi_transport_ofi_prov_t ofi_prov[MAX_OFI_PROVIDERS]; - - size_t min_ofi_recv_buf_sz; - - /** "Any source" address */ - fi_addr_t any_addr; - - /** number of ofi providers currently opened **/ - uint8_t ofi_prov_open_num; - - /** Unique message id for every message that is fragmented to be sent over OFI **/ - uint32_t cur_msgid; - - /* hashtable stores the peer addresses */ - opal_hash_table_t peers; - - opal_list_t recv_msg_queue_list; - opal_list_t queued_routing_messages; - opal_event_t *timer_event; - struct timeval timeout; -} ; -typedef struct orte_rml_ofi_module_t orte_rml_ofi_module_t; - -/* For every first send initiated to new peer - * select the peer provider, peer ep-addr, - * local provider and populate in orte_rml_ofi_peer_t instance. - * Insert this in hash table. - * */ -typedef struct { - opal_object_t super; - char* ofi_prov_name; /* peer (dest) provider chosen */ - void* ofi_ep; /* peer (dest) ep chosen */ - size_t ofi_ep_len; /* peer (dest) ep length */ - uint8_t src_prov_id; /* index of the local (src) provider used for this peer */ -} orte_rml_ofi_peer_t; -OBJ_CLASS_DECLARATION(orte_rml_ofi_peer_t); - -ORTE_MODULE_DECLSPEC extern orte_rml_component_t mca_rml_ofi_component; -extern orte_rml_ofi_module_t orte_rml_ofi; - -int orte_rml_ofi_send_buffer_nb(struct orte_rml_base_module_t *mod, - orte_process_name_t* peer, - struct opal_buffer_t* buffer, - orte_rml_tag_t tag, - orte_rml_buffer_callback_fn_t cbfunc, - void* cbdata); -int orte_rml_ofi_send_nb(struct orte_rml_base_module_t *mod, - orte_process_name_t* peer, - struct iovec* iov, - int count, - orte_rml_tag_t tag, - orte_rml_callback_fn_t cbfunc, - void* cbdata); - -/****************** INTERNAL OFI Functions*************/ -void free_ofi_prov_resources( int ofi_prov_id); -void print_provider_list_info (struct fi_info *fi ); -void print_provider_info (struct fi_info *cur_fi ); -int cq_progress_handler(int sd, short flags, void *cbdata); -int get_ofi_prov_id( opal_list_t *attributes); - -/** Send callback */ -int orte_rml_ofi_send_callback(struct fi_cq_data_entry *wc, - orte_rml_ofi_request_t*); - -/** Error callback */ -int orte_rml_ofi_error_callback(struct fi_cq_err_entry *error, - orte_rml_ofi_request_t*); - -/* OFI Recv handler */ -int orte_rml_ofi_recv_handler(struct fi_cq_data_entry *wc, uint8_t ofi_prov_id); - -bool user_override(void); -END_C_DECLS - -#endif diff --git a/orte/mca/rml/ofi/rml_ofi_component.c b/orte/mca/rml/ofi/rml_ofi_component.c deleted file mode 100644 index b0cc89b3e1..0000000000 --- a/orte/mca/rml/ofi/rml_ofi_component.c +++ /dev/null @@ -1,1191 +0,0 @@ -/* - * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include "opal/mca/base/base.h" -#include "opal/util/argv.h" -#include "opal/util/net.h" -#include "opal/util/output.h" -#include "opal/mca/backtrace/backtrace.h" -#include "opal/mca/event/event.h" - -#if OPAL_ENABLE_FT_CR == 1 -#include "orte/mca/rml/rml.h" -#include "orte/mca/state/state.h" -#endif -#include "orte/mca/rml/base/base.h" -#include "orte/mca/rml/rml_types.h" -#include "orte/mca/routed/routed.h" -#include "orte/mca/errmgr/errmgr.h" -#include "orte/util/name_fns.h" -#include "orte/runtime/orte_globals.h" - -#include "rml_ofi.h" - - -static int rml_ofi_component_open(void); -static int rml_ofi_component_close(void); -static int rml_ofi_component_register(void); - -static int rml_ofi_component_init(void); -static orte_rml_base_module_t* open_conduit(opal_list_t *attributes); -static orte_rml_pathway_t* query_transports(void); - -/** - * component definition - */ -orte_rml_component_t mca_rml_ofi_component = { - /* First, the mca_base_component_t struct containing meta - information about the component itself */ - - .base = { - ORTE_RML_BASE_VERSION_3_0_0, - - .mca_component_name = "ofi", - MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION, - ORTE_RELEASE_VERSION), - .mca_open_component = rml_ofi_component_open, - .mca_close_component = rml_ofi_component_close, - .mca_register_component_params = rml_ofi_component_register - }, - .data = { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - }, - .priority = 10, - .open_conduit = open_conduit, - .query_transports = query_transports, - .close_conduit = NULL -}; - -/* Local variables */ -orte_rml_ofi_module_t orte_rml_ofi = { - .api = { - .component = (struct orte_rml_component_t*)&mca_rml_ofi_component, - .ping = NULL, - .send_nb = orte_rml_ofi_send_nb, - .send_buffer_nb = orte_rml_ofi_send_buffer_nb, - .purge = NULL - } -}; - -/* Local variables */ -static bool init_done = false; -static char *ofi_transports_supported = NULL; -static char *initial_ofi_transports_supported = NULL; -static bool ofi_desired = false; -static bool routing_desired = false; - -/* return true if user override for choice of ofi provider */ -bool user_override(void) -{ - if( 0 == strcmp(initial_ofi_transports_supported, ofi_transports_supported ) ) - return false; - else - return true; -} - -static int -rml_ofi_component_open(void) -{ - /* Initialise endpoint and all queues */ - - orte_rml_ofi.fi_info_list = NULL; - orte_rml_ofi.min_ofi_recv_buf_sz = MIN_MULTI_BUF_SIZE; - orte_rml_ofi.cur_msgid = 1; - orte_rml_ofi.cur_transport_id = RML_OFI_PROV_ID_INVALID; - orte_rml_ofi.ofi_prov_open_num = 0; - OBJ_CONSTRUCT(&orte_rml_ofi.peers, opal_hash_table_t); - opal_hash_table_init(&orte_rml_ofi.peers, 128); - OBJ_CONSTRUCT(&orte_rml_ofi.recv_msg_queue_list, opal_list_t); - - for( uint8_t ofi_prov_id=0; ofi_prov_id < MAX_OFI_PROVIDERS ; ofi_prov_id++) { - orte_rml_ofi.ofi_prov[ofi_prov_id].fabric = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].domain = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].av = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].cq = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].ep = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].ep_name[0] = 0; - orte_rml_ofi.ofi_prov[ofi_prov_id].epnamelen = 0; - orte_rml_ofi.ofi_prov[ofi_prov_id].mr_multi_recv = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].rxbuf = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].rxbuf_size = 0; - orte_rml_ofi.ofi_prov[ofi_prov_id].progress_ev_active = false; - orte_rml_ofi.ofi_prov[ofi_prov_id].ofi_prov_id = RML_OFI_PROV_ID_INVALID; - } - - opal_output_verbose(10,orte_rml_base_framework.framework_output," from %s:%d rml_ofi_component_open()",__FILE__,__LINE__); - - if (!ORTE_PROC_IS_HNP && !ORTE_PROC_IS_DAEMON) { - return ORTE_ERROR; - } - if (!ofi_desired) { - return ORTE_ERROR; - } - return ORTE_SUCCESS; -} - - -void free_ofi_prov_resources( int ofi_prov_id) -{ - - int ret=0; - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %s - free_ofi_prov_resources() begin. OFI ofi_prov_id- %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ofi_prov_id); - if (orte_rml_ofi.ofi_prov[ofi_prov_id].ep) { - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %s - close ep",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - CLOSE_FID(orte_rml_ofi.ofi_prov[ofi_prov_id].ep); - } - if (orte_rml_ofi.ofi_prov[ofi_prov_id].mr_multi_recv) { - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %s - close mr_multi_recv",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - CLOSE_FID(orte_rml_ofi.ofi_prov[ofi_prov_id].mr_multi_recv); - } - if (orte_rml_ofi.ofi_prov[ofi_prov_id].cq) { - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %s - close cq",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - CLOSE_FID(orte_rml_ofi.ofi_prov[ofi_prov_id].cq); - } - if (orte_rml_ofi.ofi_prov[ofi_prov_id].av) { - CLOSE_FID(orte_rml_ofi.ofi_prov[ofi_prov_id].av); - } - if (orte_rml_ofi.ofi_prov[ofi_prov_id].domain) { - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %s - close domain",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - CLOSE_FID(orte_rml_ofi.ofi_prov[ofi_prov_id].domain); - } - if (orte_rml_ofi.ofi_prov[ofi_prov_id].fabric) { - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %s - close fabric",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - fi_close((fid_t)orte_rml_ofi.ofi_prov[ofi_prov_id].fabric); - } - if (orte_rml_ofi.ofi_prov[ofi_prov_id].rxbuf) { - free(orte_rml_ofi.ofi_prov[ofi_prov_id].rxbuf); - } - - orte_rml_ofi.ofi_prov[ofi_prov_id].fabric = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].domain = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].av = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].cq = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].ep = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].ep_name[0] = 0; - orte_rml_ofi.ofi_prov[ofi_prov_id].epnamelen = 0; - orte_rml_ofi.ofi_prov[ofi_prov_id].rxbuf = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].rxbuf_size = 0; - orte_rml_ofi.ofi_prov[ofi_prov_id].fabric_info = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].mr_multi_recv = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].ofi_prov_id = RML_OFI_PROV_ID_INVALID; - - - if( orte_rml_ofi.ofi_prov[ofi_prov_id].progress_ev_active) { - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %s - deleting progress event", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - opal_event_del( &orte_rml_ofi.ofi_prov[ofi_prov_id].progress_event); - } - - return; -} - - -static int -rml_ofi_component_close(void) -{ - - int rc; - opal_object_t *value; - uint64_t key; - void *node; - uint8_t ofi_prov_id; - - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %s - rml_ofi_component_close() -begin, total open OFI providers = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),orte_rml_ofi.ofi_prov_open_num); - - if (orte_rml_ofi.fi_info_list) { - (void) fi_freeinfo(orte_rml_ofi.fi_info_list); - } - - /* Close endpoint and all queues */ - for (ofi_prov_id=0; ofi_prov_id < orte_rml_ofi.ofi_prov_open_num; ofi_prov_id++) { - free_ofi_prov_resources(ofi_prov_id); - } - - /* release all peers from the hash table */ - rc = opal_hash_table_get_first_key_uint64(&orte_rml_ofi.peers, &key, - (void **)&value, &node); - while (OPAL_SUCCESS == rc) { - if (NULL != value) { - OBJ_RELEASE(value); - } - rc = opal_hash_table_get_next_key_uint64 (&orte_rml_ofi.peers, &key, - (void **) &value, node, &node); - } - OBJ_DESTRUCT(&orte_rml_ofi.peers); - OPAL_LIST_DESTRUCT(&orte_rml_ofi.recv_msg_queue_list); - - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %s - rml_ofi_component_close() end",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return ORTE_SUCCESS; -} - -static int rml_ofi_component_register(void) -{ - mca_base_component_t *component = &mca_rml_ofi_component.base; - - initial_ofi_transports_supported = "fabric,ethernet"; - ofi_transports_supported = strdup(initial_ofi_transports_supported); - mca_base_component_var_register(component, "transports", - "Comma-delimited list of transports to support (default=\"fabric,ethernet\"", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_2, - MCA_BASE_VAR_SCOPE_LOCAL, - &ofi_transports_supported); - - - ofi_desired = false; - mca_base_component_var_register(component, "desired", - "Use OFI for coll conduit", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, - OPAL_INFO_LVL_2, - MCA_BASE_VAR_SCOPE_LOCAL, - &ofi_desired); - - routing_desired = false; - mca_base_component_var_register(component, "routing", - "Route OFI messages", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, - OPAL_INFO_LVL_2, - MCA_BASE_VAR_SCOPE_LOCAL, - &routing_desired); - - return ORTE_SUCCESS; -} - -void print_provider_info (struct fi_info *cur_fi ) -{ - //Display all the details in the fi_info structure - opal_output_verbose(1,orte_rml_base_framework.framework_output, - " %s - Print_provider_info() ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " Provider name : %s",cur_fi->fabric_attr->prov_name); - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " Protocol : %s",fi_tostr(&cur_fi->ep_attr->protocol,FI_TYPE_PROTOCOL)); - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " EP Type : %s",fi_tostr(&cur_fi->ep_attr->type,FI_TYPE_EP_TYPE)); - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " address_format : %s",fi_tostr(&cur_fi->addr_format,FI_TYPE_ADDR_FORMAT)); -} - -void print_provider_list_info (struct fi_info *fi ) -{ - struct fi_info *cur_fi = fi; - int fi_count = 0; - //Display all the details in the fi_info structure - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %s - Print_provider_list_info() ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - while( NULL != cur_fi ) { - fi_count++; - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %d.\n",fi_count); - print_provider_info( cur_fi); - cur_fi = cur_fi->next; - } - opal_output_verbose(10,orte_rml_base_framework.framework_output, - "Total # of providers supported is %d\n",fi_count); -} - -/* - * This returns all the supported transports in the system that support endpoint type RDM (reliable datagram) - * The providers returned is a list of type opal_valut_t holding opal_list_t - */ -static orte_rml_pathway_t* query_transports(void) -{ - - opal_output_verbose(10,orte_rml_base_framework.framework_output, - "%s:%d OFI Query Interface not implemented",__FILE__,__LINE__); - return NULL; -} - - -/** - ofi_prov [in]: the ofi ofi_prov_id that triggered the progress fn - **/ -static int orte_rml_ofi_progress(ofi_transport_ofi_prov_t* prov) -{ - ssize_t ret; - int count=0; /* number of messages read and processed */ - struct fi_cq_data_entry wc = { 0 }; - struct fi_cq_err_entry error = { 0 }; - orte_rml_ofi_request_t *ofi_req; - - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s orte_rml_ofi_progress called for OFI ofi_provid %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - prov->ofi_prov_id); - /** - * Read the work completions from the CQ. - * From the completion's op_context, we get the associated OFI request. - * Call the request's callback. - */ - while (true) { - /* Read the cq - that triggered the libevent to call this progress fn. */ - ret = fi_cq_read(prov->cq, (void *)&wc, 1); - if (0 < ret) { - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s cq read for OFI ofi_provid %d - wc.flags = %llx", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - prov->ofi_prov_id, (long long unsigned int)wc.flags); - count++; - // check the flags to see if this is a send-completion or receive - if ( wc.flags & FI_SEND ) - { - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s Send completion received on OFI provider id %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - prov->ofi_prov_id); - if (NULL != wc.op_context) { - /* get the context from the wc and call the message handler */ - ofi_req = TO_OFI_REQ(wc.op_context); - assert(ofi_req); - ret = orte_rml_ofi_send_callback(&wc, ofi_req); - if (ORTE_SUCCESS != ret) { - opal_output(orte_rml_base_framework.framework_output, - "Error returned by OFI send callback handler when a send completion was received on OFI prov: %zd", - ret); - } - } - } else if ( (wc.flags & FI_RECV) && (wc.flags & FI_MULTI_RECV) ) { - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s Received message on OFI ofi_prov_id %d - but buffer is consumed, need to repost", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - prov->ofi_prov_id); - // reposting buffer - ret = fi_recv(orte_rml_ofi.ofi_prov[prov->ofi_prov_id].ep, - orte_rml_ofi.ofi_prov[prov->ofi_prov_id].rxbuf, - orte_rml_ofi.ofi_prov[prov->ofi_prov_id].rxbuf_size, - fi_mr_desc(orte_rml_ofi.ofi_prov[prov->ofi_prov_id].mr_multi_recv), - 0,&(prov->rx_ctx1)); - // call the receive message handler that will call the rml_base - ret = orte_rml_ofi_recv_handler(&wc, prov->ofi_prov_id); - if (ORTE_SUCCESS != ret) { - opal_output(orte_rml_base_framework.framework_output, - "Error returned by OFI Recv handler when handling the received message on the prov: %zd", - ret); - } - } else if ( wc.flags & FI_RECV ) { - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s Received message on OFI provider id %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - prov->ofi_prov_id); - // call the receive message handler that will call the rml_base - ret = orte_rml_ofi_recv_handler(&wc, prov->ofi_prov_id); - if (ORTE_SUCCESS != ret) { - opal_output(orte_rml_base_framework.framework_output, - "Error returned by OFI Recv handler when handling the received message on the OFI prov: %zd", - ret); - } - } else if ( wc.flags & FI_MULTI_RECV ) { - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s Received buffer overrun message on OFI provider id %d - need to repost", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - prov->ofi_prov_id); - // reposting buffer - ret = fi_recv(orte_rml_ofi.ofi_prov[prov->ofi_prov_id].ep, - orte_rml_ofi.ofi_prov[prov->ofi_prov_id].rxbuf, - orte_rml_ofi.ofi_prov[prov->ofi_prov_id].rxbuf_size, - fi_mr_desc(orte_rml_ofi.ofi_prov[prov->ofi_prov_id].mr_multi_recv), - 0,&(prov->rx_ctx1)); - if (ORTE_SUCCESS != ret) { - opal_output(orte_rml_base_framework.framework_output, - "Error returned by OFI when reposting buffer on the OFI prov: %zd", - ret); - } - }else { - opal_output_verbose(1,orte_rml_base_framework.framework_output, - "CQ has unhandled completion event with FLAG wc.flags = 0x%llx", - (long long unsigned int)wc.flags); - } - } else if (ret == -FI_EAVAIL) { - /** - * An error occured and is being reported via the CQ. - * Read the error and forward it to the upper layer. - */ - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s cq_read for OFI provider id %d returned error 0x%zx <%s>", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - prov->ofi_prov_id, ret, - fi_strerror((int) -ret) ); - ret = fi_cq_readerr(prov->cq,&error,0); - if (0 > ret) { - opal_output_verbose(1,orte_rml_base_framework.framework_output, - "Error returned from fi_cq_readerr: %zd", ret); - } - assert(error.op_context); - /* get the context from wc and call the error handler */ - ofi_req = TO_OFI_REQ(error.op_context); - assert(ofi_req); - ret = orte_rml_ofi_error_callback(&error, ofi_req); - if (ORTE_SUCCESS != ret) { - opal_output_verbose(1,orte_rml_base_framework.framework_output, - "Error returned by request error callback: %zd", - ret); - } - break; - } else if (ret == -FI_EAGAIN){ - /** - * The CQ is empty. Return. - */ - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s Empty cq for OFI provider id %d,exiting from ofi_progress()", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - prov->ofi_prov_id ); - break; - } else { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s cq_read for OFI provider id %d returned error 0x%zx <%s>", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - prov->ofi_prov_id, ret, - fi_strerror((int) -ret) ); - break; - } - } - return count; -} - - -/* - * call the ofi_progress() fn to read the cq - * - */ -int cq_progress_handler(int sd, short flags, void *cbdata) -{ - ofi_transport_ofi_prov_t* prov = (ofi_transport_ofi_prov_t*)cbdata; - int count; - - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s cq_progress_handler called for OFI Provider id %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - prov->ofi_prov_id); - - /* call the progress fn to read the cq and process the message - * for the ofi provider */ - count = orte_rml_ofi_progress(prov); - return count; -} - - -/* - * Returns the number of ofi-providers available - */ -static int rml_ofi_component_init(void) -{ - int ret, fi_version; - struct fi_info *hints, *fabric_info; - struct fi_cq_attr cq_attr = {0}; - struct fi_av_attr av_attr = {0}; - uint8_t cur_ofi_prov; - opal_buffer_t modex, entry, *eptr; - - opal_output_verbose(10,orte_rml_base_framework.framework_output, - "%s - Entering rml_ofi_component_init()",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - - if (init_done) { - return orte_rml_ofi.ofi_prov_open_num; - } - - - /** - * Hints to filter providers - * See man fi_getinfo for a list of all filters - * mode: Select capabilities MTL is prepared to support. - * In this case, MTL will pass in context into communication calls - * ep_type: reliable datagram operation - * caps: Capabilities required from the provider. - * Tag matching is specified to implement MPI semantics. - * msg_order: Guarantee that messages with same tag are ordered. - */ - - hints = fi_allocinfo(); - if (!hints) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: Could not allocate fi_info\n", - __FILE__, __LINE__); - return orte_rml_ofi.ofi_prov_open_num; - } - - /** - * Refine filter for additional capabilities - * endpoint type : Reliable datagram - * threading: Disable locking - * control_progress: enable async progress - */ - hints->mode = FI_CONTEXT; - hints->ep_attr->type = FI_EP_RDM; /* Reliable datagram */ - - hints->domain_attr->threading = FI_THREAD_UNSPEC; - hints->domain_attr->control_progress = FI_PROGRESS_AUTO; - hints->domain_attr->data_progress = FI_PROGRESS_AUTO; - hints->domain_attr->av_type = FI_AV_MAP; - - /** - * FI_VERSION provides binary backward and forward compatibility support - * Specify the version of OFI is coded to, the provider will select struct - * layouts that are compatible with this version. - */ - fi_version = FI_VERSION(1, 3); - - /** - * fi_getinfo: returns information about fabric services for reaching a - * remote node or service. this does not necessarily allocate resources. - * Pass NULL for name/service because we want a list of providers supported. - */ - ret = fi_getinfo(fi_version, /* OFI version requested */ - NULL, /* Optional name or fabric to resolve */ - NULL, /* Optional service name or port to request */ - 0ULL, /* Optional flag */ - hints, /* In: Hints to filter providers */ - &orte_rml_ofi.fi_info_list); /* Out: List of matching providers */ - if (0 != ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_getinfo failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - fi_freeinfo(hints); - return ORTE_ERROR; - } - - /* added for debug purpose - Print the provider info - print_transports_query(); - print_provider_list_info(orte_rml_ofi.fi_info_list); - */ - - /* create a buffer for constructing our modex blob */ - OBJ_CONSTRUCT(&modex, opal_buffer_t); - - /** create the OFI objects for each transport in the system - * (fi_info_list) and store it in the ofi_prov array **/ - orte_rml_ofi.ofi_prov_open_num = 0; // start the ofi_prov_id from 0 - for(fabric_info = orte_rml_ofi.fi_info_list; - NULL != fabric_info && orte_rml_ofi.ofi_prov_open_num < MAX_OFI_PROVIDERS; - fabric_info = fabric_info->next) - { - opal_output_verbose(10,orte_rml_base_framework.framework_output, - "%s:%d beginning to add endpoint for OFI_provider_id=%d ",__FILE__,__LINE__, - orte_rml_ofi.ofi_prov_open_num); - print_provider_info(fabric_info); - cur_ofi_prov = orte_rml_ofi.ofi_prov_open_num; - orte_rml_ofi.ofi_prov[cur_ofi_prov].ofi_prov_id = orte_rml_ofi.ofi_prov_open_num ; - orte_rml_ofi.ofi_prov[cur_ofi_prov].fabric_info = fabric_info; - - // set FI_MULTI_RECV flag for all recv operations - fabric_info->rx_attr->op_flags = FI_MULTI_RECV; - /** - * Open fabric - * The getinfo struct returns a fabric attribute struct that can be used to - * instantiate the virtual or physical network. This opens a "fabric - * provider". See man fi_fabric for details. - */ - - ret = fi_fabric(fabric_info->fabric_attr, /* In: Fabric attributes */ - &orte_rml_ofi.ofi_prov[cur_ofi_prov].fabric, /* Out: Fabric handle */ - NULL); /* Optional context for fabric events */ - if (0 != ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_fabric failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - orte_rml_ofi.ofi_prov[cur_ofi_prov].fabric = NULL; - /* abort this current transport, but check if next transport can be opened */ - continue; - } - - - /** - * Create the access domain, which is the physical or virtual network or - * hardware port/collection of ports. Returns a domain object that can be - * used to create endpoints. See man fi_domain for details. - */ - ret = fi_domain(orte_rml_ofi.ofi_prov[cur_ofi_prov].fabric, /* In: Fabric object */ - fabric_info, /* In: Provider */ - &orte_rml_ofi.ofi_prov[cur_ofi_prov].domain, /* Out: Domain oject */ - NULL); /* Optional context for domain events */ - if (0 != ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_domain failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - orte_rml_ofi.ofi_prov[cur_ofi_prov].domain = NULL; - /* abort this current transport, but check if next transport can be opened */ - continue; - } - - /** - * Create a transport level communication endpoint. To use the endpoint, - * it must be bound to completion counters or event queues and enabled, - * and the resources consumed by it, such as address vectors, counters, - * completion queues, etc. - * see man fi_endpoint for more details. - */ - ret = fi_endpoint(orte_rml_ofi.ofi_prov[cur_ofi_prov].domain, /* In: Domain object */ - fabric_info, /* In: Provider */ - &orte_rml_ofi.ofi_prov[cur_ofi_prov].ep, /* Out: Endpoint object */ - NULL); /* Optional context */ - if (0 != ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_endpoint failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /* abort this current transport, but check if next transport can be opened */ - continue; - } - - /** - * Save the maximum inject size. - */ - //orte_rml_ofi.max_inject_size = prov->tx_attr->inject_size; - - /** - * Create the objects that will be bound to the endpoint. - * The objects include: - * - completion queue for events - * - address vector of other endpoint addresses - * - dynamic memory-spanning memory region - */ - cq_attr.format = FI_CQ_FORMAT_DATA; - cq_attr.wait_obj = FI_WAIT_FD; - cq_attr.wait_cond = FI_CQ_COND_NONE; - ret = fi_cq_open(orte_rml_ofi.ofi_prov[cur_ofi_prov].domain, - &cq_attr, &orte_rml_ofi.ofi_prov[cur_ofi_prov].cq, NULL); - if (ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_cq_open failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /* abort this current transport, but check if next transport can be opened */ - continue; - } - - /** - * The remote fi_addr will be stored in the ofi_endpoint struct. - * So, we use the AV in "map" mode. - */ - av_attr.type = FI_AV_MAP; - ret = fi_av_open(orte_rml_ofi.ofi_prov[cur_ofi_prov].domain, - &av_attr, &orte_rml_ofi.ofi_prov[cur_ofi_prov].av, NULL); - if (ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_av_open failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /* abort this current transport, but check if next transport can be opened */ - continue; - } - - /** - * Bind the CQ and AV to the endpoint object. - */ - ret = fi_ep_bind(orte_rml_ofi.ofi_prov[cur_ofi_prov].ep, - (fid_t)orte_rml_ofi.ofi_prov[cur_ofi_prov].cq, - FI_SEND | FI_RECV); - if (0 != ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_bind CQ-EP failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /* abort this current transport, but check if next transport can be opened */ - continue; - } - - ret = fi_ep_bind(orte_rml_ofi.ofi_prov[cur_ofi_prov].ep, - (fid_t)orte_rml_ofi.ofi_prov[cur_ofi_prov].av, - 0); - if (0 != ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_bind AV-EP failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /* abort this current transport, but check if next transport can be opened */ - continue; - } - - /** - * Enable the endpoint for communication - * This commits the bind operations. - */ - ret = fi_enable(orte_rml_ofi.ofi_prov[cur_ofi_prov].ep); - if (0 != ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_enable failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /* abort this current transport, but check if next transport can be opened */ - continue; - } - opal_output_verbose(10,orte_rml_base_framework.framework_output, - "%s:%d ep enabled for ofi_prov_id - %d ",__FILE__,__LINE__, - orte_rml_ofi.ofi_prov[cur_ofi_prov].ofi_prov_id); - - - /** - * Get our address and publish it with modex. - **/ - orte_rml_ofi.ofi_prov[cur_ofi_prov].epnamelen = sizeof (orte_rml_ofi.ofi_prov[cur_ofi_prov].ep_name); - ret = fi_getname((fid_t)orte_rml_ofi.ofi_prov[cur_ofi_prov].ep, - &orte_rml_ofi.ofi_prov[cur_ofi_prov].ep_name[0], - &orte_rml_ofi.ofi_prov[cur_ofi_prov].epnamelen); - if (ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_getname failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /* abort this current transport, but check if next transport can be opened */ - continue; - } - - /* create the modex entry for this provider */ - OBJ_CONSTRUCT(&entry, opal_buffer_t); - /* pack the provider's name */ - if (OPAL_SUCCESS != (ret = opal_dss.pack(&entry, &(orte_rml_ofi.ofi_prov[cur_ofi_prov].fabric_info->fabric_attr->prov_name), 1, OPAL_STRING))) { - OBJ_DESTRUCT(&entry); - free_ofi_prov_resources(cur_ofi_prov); - continue; - } - /* pack the provider's local index */ - if (OPAL_SUCCESS != (ret = opal_dss.pack(&entry, &cur_ofi_prov, 1, OPAL_UINT8))) { - OBJ_DESTRUCT(&entry); - free_ofi_prov_resources(cur_ofi_prov); - continue; - } - /* pack the size of the provider's connection blob */ - if (OPAL_SUCCESS != (ret = opal_dss.pack(&entry, &orte_rml_ofi.ofi_prov[cur_ofi_prov].epnamelen, 1, OPAL_SIZE))) { - OBJ_DESTRUCT(&entry); - free_ofi_prov_resources(cur_ofi_prov); - continue; - } - /* pack the blob itself */ - if (OPAL_SUCCESS != (ret = opal_dss.pack(&entry, orte_rml_ofi.ofi_prov[cur_ofi_prov].ep_name, - orte_rml_ofi.ofi_prov[cur_ofi_prov].epnamelen, OPAL_BYTE))) { - OBJ_DESTRUCT(&entry); - free_ofi_prov_resources(cur_ofi_prov); - continue; - } - /* add this entry to the overall modex object */ - eptr = &entry; - if (OPAL_SUCCESS != (ret = opal_dss.pack(&modex, &eptr, 1, OPAL_BUFFER))) { - OBJ_DESTRUCT(&entry); - free_ofi_prov_resources(cur_ofi_prov); - continue; - } - OBJ_DESTRUCT(&entry); - - /*print debug information on opal_modex_string */ - switch ( orte_rml_ofi.ofi_prov[cur_ofi_prov].fabric_info->addr_format) { - case FI_SOCKADDR_IN : - opal_output_verbose(1,orte_rml_base_framework.framework_output, - "%s:%d In FI_SOCKADDR_IN. ",__FILE__,__LINE__); - /* Address is of type sockaddr_in (IPv4) */ - opal_output_verbose(1,orte_rml_base_framework.framework_output, - "%s sending Opal modex string for ofi prov_id %d, epnamelen = %lu ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - cur_ofi_prov, orte_rml_ofi.ofi_prov[cur_ofi_prov].epnamelen); - /*[debug] - print the sockaddr - port and s_addr */ - struct sockaddr_in* ep_sockaddr = (struct sockaddr_in*)orte_rml_ofi.ofi_prov[cur_ofi_prov].ep_name; - opal_output_verbose(1,orte_rml_base_framework.framework_output, - "%s port = 0x%x, InternetAddr = 0x%s ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ntohs(ep_sockaddr->sin_port), inet_ntoa(ep_sockaddr->sin_addr)); - break; - } - - /** - * Set the ANY_SRC address. - */ - orte_rml_ofi.any_addr = FI_ADDR_UNSPEC; - - /** - * Allocate tx,rx buffers and Post a multi-RECV buffer for each endpoint - **/ - //[TODO later] For now not considering ep_attr prefix_size (add this later) - orte_rml_ofi.ofi_prov[cur_ofi_prov].rxbuf_size = MIN_MULTI_BUF_SIZE * MULTI_BUF_SIZE_FACTOR; - orte_rml_ofi.ofi_prov[cur_ofi_prov].rxbuf = malloc(orte_rml_ofi.ofi_prov[cur_ofi_prov].rxbuf_size); - - ret = fi_mr_reg(orte_rml_ofi.ofi_prov[cur_ofi_prov].domain, - orte_rml_ofi.ofi_prov[cur_ofi_prov].rxbuf, - orte_rml_ofi.ofi_prov[cur_ofi_prov].rxbuf_size, - FI_RECV, 0, 0, 0, &orte_rml_ofi.ofi_prov[cur_ofi_prov].mr_multi_recv, - &orte_rml_ofi.ofi_prov[cur_ofi_prov].rx_ctx1); - if (ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_mr_reg failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /* abort this current transport, but check if next transport can be opened */ - continue; - } - - ret = fi_setopt(&orte_rml_ofi.ofi_prov[cur_ofi_prov].ep->fid, FI_OPT_ENDPOINT, FI_OPT_MIN_MULTI_RECV, - &orte_rml_ofi.min_ofi_recv_buf_sz, sizeof(orte_rml_ofi.min_ofi_recv_buf_sz) ); - if (ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_setopt failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /* abort this current transport, but check if next transport can be opened */ - continue; - } - - ret = fi_recv(orte_rml_ofi.ofi_prov[cur_ofi_prov].ep, - orte_rml_ofi.ofi_prov[cur_ofi_prov].rxbuf, - orte_rml_ofi.ofi_prov[cur_ofi_prov].rxbuf_size, - fi_mr_desc(orte_rml_ofi.ofi_prov[cur_ofi_prov].mr_multi_recv), - 0,&orte_rml_ofi.ofi_prov[cur_ofi_prov].rx_ctx1); - if (ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_recv failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /* abort this current transport, but check if next transport can be opened */ - continue; - } - /** - * get the fd and register the progress fn - **/ - ret = fi_control(&orte_rml_ofi.ofi_prov[cur_ofi_prov].cq->fid, FI_GETWAIT, - (void *) &orte_rml_ofi.ofi_prov[cur_ofi_prov].fd); - if (0 != ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_control failed to get fd: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /* abort this current transport, but check if next transport can be opened */ - continue; - } - - /* - create the event that will wait on the fd*/ - /* use the opal_event_set to do a libevent set on the fd - * so when something is available to read, the cq_porgress_handler - * will be called */ - opal_event_set(orte_event_base, - &orte_rml_ofi.ofi_prov[cur_ofi_prov].progress_event, - orte_rml_ofi.ofi_prov[cur_ofi_prov].fd, - OPAL_EV_READ|OPAL_EV_PERSIST, - cq_progress_handler, - &orte_rml_ofi.ofi_prov[cur_ofi_prov]); - opal_event_add(&orte_rml_ofi.ofi_prov[cur_ofi_prov].progress_event, 0); - orte_rml_ofi.ofi_prov[cur_ofi_prov].progress_ev_active = true; - - /** update the number of ofi_provs in the ofi_prov[] array **/ - opal_output_verbose(10,orte_rml_base_framework.framework_output, - "%s:%d ofi_prov id - %d created ",__FILE__,__LINE__,orte_rml_ofi.ofi_prov_open_num); - orte_rml_ofi.ofi_prov_open_num++; - } - if (fabric_info != NULL && orte_rml_ofi.ofi_prov_open_num >= MAX_OFI_PROVIDERS ) { - opal_output_verbose(1,orte_rml_base_framework.framework_output, - "%s:%d fi_getinfo list not fully parsed as MAX_OFI_PROVIDERS - %d reached ",__FILE__,__LINE__,orte_rml_ofi.ofi_prov_open_num); - } - - /** - * Free providers info since it's not needed anymore. - */ - fi_freeinfo(hints); - hints = NULL; - /* check if at least one ofi_prov was successfully opened */ - if (0 < orte_rml_ofi.ofi_prov_open_num) { - uint8_t *data; - int32_t sz; - - opal_output_verbose(10,orte_rml_base_framework.framework_output, - "%s:%d ofi providers openened=%d returning orte_rml_ofi.api", - __FILE__,__LINE__,orte_rml_ofi.ofi_prov_open_num); - - OBJ_CONSTRUCT(&orte_rml_ofi.recv_msg_queue_list,opal_list_t); - /* post the modex object */ - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s calling OPAL_MODEX_SEND_STRING for RML/OFI ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - ret = opal_dss.unload(&modex, (void**)(&data), &sz); - OBJ_DESTRUCT(&modex); - if (OPAL_SUCCESS != ret) { - ORTE_ERROR_LOG(ret); - return ret; - } - OPAL_MODEX_SEND_STRING(ret, OPAL_PMIX_GLOBAL, - "rml.ofi", data, sz); - free(data); - if (OPAL_SUCCESS != ret) { - ORTE_ERROR_LOG(ret); - return ret; - } - } else { - opal_output_verbose(1,orte_rml_base_framework.framework_output, - "%s:%d Failed to open any OFI Providers",__FILE__,__LINE__); - } - - return orte_rml_ofi.ofi_prov_open_num; -} - -/* return : the ofi_prov_id that corresponds to the transport requested by the attributes - if transport is not found RML_OFI_PROV_ID_INVALID is returned. - @[in]attributes : the attributes passed in to open_conduit reg the transport requested -*/ -int get_ofi_prov_id(opal_list_t *attributes) -{ - int ofi_prov_id = RML_OFI_PROV_ID_INVALID, prov_num=0; - char **providers = NULL, *provider; - struct fi_info *cur_fi; - char *comp_attrib = NULL; - char **comps; - int i; - bool choose_fabric= false; - - /* check the list of attributes in below order - * Attribute should have ORTE_RML_TRANSPORT_ATTRIB key - * with values "ethernet" or "fabric". "fabric" is higher priority. - * (or) ORTE_RML_OFI_PROV_NAME key with values "socket" or "OPA" - * if both above attributes are missing return failure - */ - //if (orte_get_attribute(attributes, ORTE_RML_TRANSPORT_ATTRIB, (void**)&transport, OPAL_STRING) ) { - - if (orte_get_attribute(attributes, ORTE_RML_TRANSPORT_TYPE, (void**)&comp_attrib, OPAL_STRING) && - NULL != comp_attrib) { - comps = opal_argv_split(comp_attrib, ','); - for (i=0; NULL != comps[i]; i++) { - if (NULL != strstr(ofi_transports_supported, comps[i])) { - if (0 == strcmp(comps[i], "ethernet")) { - opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - user requested opening conduit using OFI ethernet/sockets provider", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - opal_argv_append_nosize(&providers, "sockets"); - } else if (0 == strcmp(comps[i], "fabric")) { - opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - user requested opening conduit using OFI fabric provider", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - opal_argv_prepend_nosize(&providers, "fabric"); /* fabric is higher priority so prepend it */ - } - } - } - } - /* if from the transport we don't know which provider we want, then check for the ORTE_RML_OFI_PROV_NAME_ATTRIB */ - if (NULL == providers) { - if (orte_get_attribute(attributes, ORTE_RML_PROVIDER_ATTRIB, (void**)&provider, OPAL_STRING)) { - opal_argv_append_nosize(&providers, provider); - } else { - ofi_prov_id = RML_OFI_PROV_ID_INVALID; - } - } - if (NULL != providers) { - /* go down the list of preferences in order */ - for (i=0; NULL != providers[i] && RML_OFI_PROV_ID_INVALID == ofi_prov_id; i++) { - // if generic transport "fabric" is requested then choose first available non-socket provider - if (0 == strcmp(providers[i],"fabric")) - choose_fabric=true; - else - choose_fabric=false; - // loop the orte_rml_ofi.ofi_provs[] and see if someone matches - for (prov_num = 0; prov_num < orte_rml_ofi.ofi_prov_open_num; prov_num++ ) { - cur_fi = orte_rml_ofi.ofi_prov[prov_num].fabric_info; - if (choose_fabric) { - opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - get_ofi_prov_id() -> comparing sockets != %s to choose first available fabric provider", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - cur_fi->fabric_attr->prov_name); - if (0 != strcmp("sockets", cur_fi->fabric_attr->prov_name)) { - ofi_prov_id = prov_num; - opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - Choosing provider %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - cur_fi->fabric_attr->prov_name); - break; - } - } else { - opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - get_ofi_prov_id() -> comparing %s = %s ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - providers[i], cur_fi->fabric_attr->prov_name); - if (0 == strcmp(providers[i], cur_fi->fabric_attr->prov_name)) { - ofi_prov_id = prov_num; - opal_output_verbose(20,orte_rml_base_framework.framework_output, "%s - Choosing provider %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - cur_fi->fabric_attr->prov_name); - break; - } - } - } - } - } - opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - get_ofi_prov_id(), returning ofi_prov_id=%d ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ofi_prov_id); - return ofi_prov_id; -} - -/* - * Allocate a new module and initialise ofi_prov information - * for the requested provider and return the module * - */ -static orte_rml_base_module_t* make_module( int ofi_prov_id) -{ - orte_rml_ofi_module_t *mod = NULL; - - opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - rml_ofi make_module() begin ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - if (RML_OFI_PROV_ID_INVALID == ofi_prov_id) { - opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - open_conduit did not select any ofi provider, returning NULL ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return NULL; - } - - - /* create a new module */ - mod = (orte_rml_ofi_module_t*)calloc(1,sizeof(orte_rml_ofi_module_t)); - if (NULL == mod) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return NULL; - } - /* copy the APIs over to it and the OFI provider information */ - memcpy(mod, &orte_rml_ofi, sizeof(orte_rml_ofi_module_t)); - /* setup the remaining data locations in mod, associate conduit with ofi provider selected*/ - mod->cur_transport_id = ofi_prov_id; - /* set the routed module */ - if (routing_desired) { - mod->api.routed = orte_routed.assign_module(NULL); - } else { - mod->api.routed = orte_routed.assign_module("direct"); - } - if (NULL == mod->api.routed) { - /* we can't work */ - opal_output_verbose(1,orte_rml_base_framework.framework_output, - "%s - Failed to get%srouted support, disqualifying ourselves", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - routing_desired ? " " : " direct "); - free(mod); - return NULL; - } - return (orte_rml_base_module_t*)mod; -} - - -/* Order of attributes honoring * -* ORTE_RML_INCLUDE_COMP_ATTRIB * -* ORTE_RML_EXCLUDE_COMP_ATTRIB * -* ORTE_RML_TRANSPORT_ATTRIB * -* ORTE_RML_PROVIDER_ATTRIB */ -static orte_rml_base_module_t* open_conduit(opal_list_t *attributes) -{ - char *comp_attrib = NULL; - char **comps; - int i; - orte_attribute_t *attr; - - opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - Entering rml_ofi_open_conduit()", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - /* Open all ofi endpoints */ - if (!init_done) { - rml_ofi_component_init(); - init_done = true; - } - - /* check if atleast 1 ofi provider is initialised */ - if ( 0 >= orte_rml_ofi.ofi_prov_open_num) { - opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - Init did not open any Ofi endpoints, returning NULL", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return NULL; - } - - /* someone may require this specific component, so look for "ofi" */ - if (orte_get_attribute(attributes, ORTE_RML_INCLUDE_COMP_ATTRIB, (void**)&comp_attrib, OPAL_STRING) && - NULL != comp_attrib) { - /* they specified specific components - could be multiple */ - comps = opal_argv_split(comp_attrib, ','); - for (i=0; NULL != comps[i]; i++) { - if (0 == strcmp(comps[i], "ofi")) { - /* we are a candidate, */ - opal_argv_free(comps); - return make_module(get_ofi_prov_id(attributes)); - } - } - /* we are not a candidate */ - opal_argv_free(comps); - return NULL; - } else if (orte_get_attribute(attributes, ORTE_RML_EXCLUDE_COMP_ATTRIB, (void**)&comp_attrib, OPAL_STRING) && - NULL != comp_attrib) { - /* see if we are on the list */ - comps = opal_argv_split(comp_attrib, ','); - for (i=0; NULL != comps[i]; i++) { - if (0 == strcmp(comps[i], "ofi")) { - /* we cannot be a candidate */ - opal_argv_free(comps); - return NULL; - } - } - } - - if (orte_get_attribute(attributes, ORTE_RML_TRANSPORT_TYPE, (void**)&comp_attrib, OPAL_STRING) && - NULL != comp_attrib) { - opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - ORTE_RML_TRANSPORT_TYPE = %s ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), comp_attrib); - comps = opal_argv_split(comp_attrib, ','); - for (i=0; NULL != comps[i]; i++) { - if (NULL != strstr(ofi_transports_supported, comps[i])) { - /* we are a candidate, */ - opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - Opening conduit using OFI.. ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - opal_argv_free(comps); - return make_module(get_ofi_prov_id(attributes)); - } - } - opal_argv_free(comps); - } - - /* Alternatively, check the attributes to see if we qualify - we only handle - * "pt2pt" */ - OPAL_LIST_FOREACH(attr, attributes, orte_attribute_t) { - /* [TODO] add any additional attributes check here */ - - } - opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - ofi is not a candidate as per attributes, returning NULL", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - /* if we get here, we cannot handle it */ - return NULL; -} - -static void pr_cons(orte_rml_ofi_peer_t *ptr) -{ - ptr->ofi_prov_name = NULL; - ptr->ofi_ep = NULL; - ptr->ofi_ep_len = 0; - ptr->src_prov_id = RML_OFI_PROV_ID_INVALID; -} - -static void pr_des(orte_rml_ofi_peer_t *ptr) -{ - if ( NULL != ptr->ofi_prov_name) - free(ptr->ofi_prov_name); - if ( 0 < ptr->ofi_ep_len) - free( ptr->ofi_ep); -} - -OBJ_CLASS_INSTANCE(orte_rml_ofi_peer_t, - opal_object_t, - pr_cons, pr_des); diff --git a/orte/mca/rml/ofi/rml_ofi_request.h b/orte/mca/rml/ofi/rml_ofi_request.h deleted file mode 100644 index 54b8203ae8..0000000000 --- a/orte/mca/rml/ofi/rml_ofi_request.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2015 Intel, Inc. All rights reserved - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef ORTE_RML_OFI_REQUEST_H -#define ORTE_RML_OFI_REQUEST_H - - -#define TO_OFI_REQ(_ptr_ctx) \ - container_of((_ptr_ctx), orte_rml_ofi_request_t, ctx) - -typedef enum { - ORTE_RML_OFI_SEND, - ORTE_RML_OFI_RECV, - ORTE_RML_OFI_ACK, - ORTE_RML_OFI_PROBE -} orte_rml_ofi_request_type_t; -/* orte_rml_ofi_msg_header_t contains the header information for the message being sent. -The header and data is passed on to the destination. The destination will re-construct the -orte_rml_sent_t struct once it receives this header and data.This header has the required information -to construct the orte_rml_sent_t struct and also if the message is split into packets, -then the packet information - total number of packets and the current packet number. -*/ -struct orte_rml_ofi_msg_header_t{ - opal_process_name_t origin; // originator process id from the send message - opal_process_name_t dst; // Destination process id from the send message - uint32_t seq_num; // seq_num from the send message - orte_rml_tag_t tag; // tag from the send message - uint32_t msgid; // unique msgid added by ofi plugin to keep track of fragmented msgs - uint32_t tot_pkts; // total packets this msg will be fragmented into by ofi plugin - uint32_t cur_pkt_num; // current packet number - }; -typedef struct orte_rml_ofi_msg_header_t orte_rml_ofi_msg_header_t; - -/* -orte_rml_ofi_pkts_t defines the packets in the message. Each packet contains header information -and the data. Create a list of packets to hold the entire message. -*/ -typedef struct { - //list_item_t - opal_list_item_t super; - /* header + data size */ - size_t pkt_size; - //header + data - void *data; -}orte_rml_ofi_send_pkt_t; -OBJ_CLASS_DECLARATION(orte_rml_ofi_send_pkt_t); - -/* -orte_rml_ofi_recv_pkt_t defines the packets in the receiving end of message. -Each packet contains the packet number and the data. -Create a list of packets to hold the entire message. -*/ -typedef struct { - //list_item_t - opal_list_item_t super; - /* current packet number */ - uint32_t cur_pkt_num; - /*data size */ - size_t pkt_size; - //data - void *data; -}orte_rml_ofi_recv_pkt_t; -OBJ_CLASS_DECLARATION(orte_rml_ofi_recv_pkt_t); - -/* -orte_rml_ofi_request_t holds the send request (orte_rml_send_t) -*/ -typedef struct { - opal_object_t super; - - /** OFI context */ - struct fi_context ctx; - - orte_rml_send_t *send; - - /** OFI provider_id the request will use - this is - * the reference to element into the orte_rml_ofi.ofi_prov[] **/ - uint8_t ofi_prov_id; - - /** OFI Request type */ - orte_rml_ofi_request_type_t type; - - /** Completion count used by blocking and/or synchronous operations */ - volatile int completion_count; - - /** Reference to the RML used to lookup */ - /* source of an ANY_SOURCE Recv */ - struct orte_rml_base_module_t* rml; - - /** header being sent **/ - orte_rml_ofi_msg_header_t hdr; - - /** Pack buffer */ - void *data_blob; - - /** Pack buffer size */ - size_t length; - - /** Header and data in a list of Packets orte_rml_ofi_send_pkt_t */ - opal_list_t pkt_list; - -} orte_rml_ofi_request_t; -OBJ_CLASS_DECLARATION(orte_rml_ofi_request_t); - - -/* This will hold all the pckts received at the destination. -Each entry will be indexed by [sender,msgid] and will have -all the packets for that msgid and sender. -*/ -typedef struct { - - opal_list_item_t super; //list_item_t - uint32_t msgid; // unique msgid added by ofi plugin to keep track of fragmented msgs - opal_process_name_t sender; // originator process id from the send message - uint32_t tot_pkts; // total packets this msg will be fragmented into by ofi plugin - uint32_t pkt_recd; // current packet number - opal_list_t pkt_list; // list holding Packets in this msg of type orte_rml_ofi_recv_pkt_t -} ofi_recv_msg_queue_t; -OBJ_CLASS_DECLARATION( ofi_recv_msg_queue_t); - -/* define an object for transferring send requests to the event lib */ -typedef struct { - opal_object_t super; - opal_event_t ev; - orte_rml_send_t send; - /* ofi provider id */ - int ofi_prov_id; -} ofi_send_request_t; -OBJ_CLASS_DECLARATION(ofi_send_request_t); - -#endif diff --git a/orte/mca/rml/ofi/rml_ofi_send.c b/orte/mca/rml/ofi/rml_ofi_send.c deleted file mode 100644 index 99a143c925..0000000000 --- a/orte/mca/rml/ofi/rml_ofi_send.c +++ /dev/null @@ -1,1052 +0,0 @@ -/* - * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. - * Copyright (c) 2017 Los Alamos National Security, LLC. All rights - * reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" - -#include "opal/dss/dss_types.h" -#include "opal/util/net.h" -#include "opal/util/output.h" -#include "opal/mca/event/event.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/rml/base/base.h" -#include "orte/mca/rml/rml_types.h" - -#include -#include -#include -#include -#include -#include - -#include "rml_ofi.h" - -static void ofi_req_cons(orte_rml_ofi_request_t *ptr) -{ - OBJ_CONSTRUCT(&ptr->pkt_list, opal_list_t); -} -static void ofi_req_des(orte_rml_ofi_request_t *ptr) -{ - OPAL_LIST_DESTRUCT(&ptr->pkt_list); -} -OBJ_CLASS_INSTANCE(orte_rml_ofi_request_t, - opal_object_t, - ofi_req_cons, ofi_req_des); - - -static void ofi_send_req_cons(ofi_send_request_t *ptr) -{ - OBJ_CONSTRUCT(&ptr->send, orte_rml_send_t); -} -OBJ_CLASS_INSTANCE(ofi_send_request_t, - opal_object_t, - ofi_send_req_cons, NULL); - -OBJ_CLASS_INSTANCE(orte_rml_ofi_send_pkt_t, - opal_list_item_t, - NULL, NULL); - -OBJ_CLASS_INSTANCE(orte_rml_ofi_recv_pkt_t, - opal_list_item_t, - NULL, NULL); - - -static void ofi_recv_msg_queue_cons(ofi_recv_msg_queue_t *ptr) -{ - ptr->msgid = 0; - ptr->tot_pkts = 1; - ptr->pkt_recd = 0; - OBJ_CONSTRUCT(&ptr->pkt_list, opal_list_t); -} -static void ofi_recv_msg_queue_des(ofi_recv_msg_queue_t *ptr) -{ - OPAL_LIST_DESTRUCT(&ptr->pkt_list); -} -OBJ_CLASS_INSTANCE(ofi_recv_msg_queue_t, - opal_list_item_t, - ofi_recv_msg_queue_cons, ofi_recv_msg_queue_des); - -static void send_self_exe(int fd, short args, void* data) -{ - orte_self_send_xfer_t *xfer = (orte_self_send_xfer_t*)data; - - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s rml_send_to_self ofi callback executing for tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), xfer->tag); - - /* execute the send callback function - note that - * send-to-self always returns a SUCCESS status - */ - if (NULL != xfer->iov) { - if (NULL != xfer->cbfunc.iov) { - /* non-blocking iovec send */ - xfer->cbfunc.iov(ORTE_SUCCESS, ORTE_PROC_MY_NAME, xfer->iov, xfer->count, - xfer->tag, xfer->cbdata); - } - } else if (NULL != xfer->buffer) { - if (NULL != xfer->cbfunc.buffer) { - /* non-blocking buffer send */ - xfer->cbfunc.buffer(ORTE_SUCCESS, ORTE_PROC_MY_NAME, xfer->buffer, - xfer->tag, xfer->cbdata); - } - } else { - /* should never happen */ - abort(); - } - - /* cleanup the memory */ - OBJ_RELEASE(xfer); -} - -/** Send callback */ -/* [Desc] This is called from the progress fn when a send completion -** is received in the cq -** wc [in] : the completion queue data entry -** ofi_send_req [in]: ofi send request with the send msg and callback -*/ -int orte_rml_ofi_send_callback(struct fi_cq_data_entry *wc, - orte_rml_ofi_request_t* ofi_req) -{ - orte_rml_ofi_send_pkt_t *ofi_send_pkt, *next; - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s orte_rml_ofi_send_callback called, completion count = %d, msgid = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ofi_req->completion_count, ofi_req->hdr.msgid); - assert(ofi_req->completion_count > 0); - ofi_req->completion_count--; - if ( 0 == ofi_req->completion_count ) { - // call the callback fn of the sender - ofi_req->send->status = ORTE_SUCCESS; - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s calling ORTE_RML_SEND_COMPLETE macro for msgid = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ofi_req->hdr.msgid); - ORTE_RML_SEND_COMPLETE(ofi_req->send); - OPAL_LIST_FOREACH_SAFE(ofi_send_pkt, next, &ofi_req->pkt_list, orte_rml_ofi_send_pkt_t) { - free( ofi_send_pkt->data); - ofi_send_pkt->pkt_size=0; - opal_list_remove_item(&ofi_req->pkt_list, &ofi_send_pkt->super); - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Removed pkt from list ",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); - OBJ_RELEASE(ofi_send_pkt); - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Released packet ",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); - } - free(ofi_req->data_blob); - OBJ_RELEASE(ofi_req); - } - - // [TODO] need to check for error before returning success - return ORTE_SUCCESS; -} - -/** Error callback */ -/* [Desc] This is called from the progress fn when a send completion -** is received in the cq -** wc [in] : the completion queue data entry -** ofi_send_req [in]: ofi send request with the send msg and callback -*/ -int orte_rml_ofi_error_callback(struct fi_cq_err_entry *error, - orte_rml_ofi_request_t* ofi_req) -{ - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s orte_rml_ofi_error_callback called ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); - switch(error->err) { - default: - /* call the send-callback fn with error and return, also return failure status */ - ofi_req->send->status = ORTE_ERR_CONDUIT_SEND_FAIL; - ORTE_RML_SEND_COMPLETE(ofi_req->send); - } - return ORTE_SUCCESS; -} - -/** Recv handler */ -/* [Desc] This is called from the progress fn when a recv completion -** is received in the cq -** wc [in] : the completion queue data entry */ -int orte_rml_ofi_recv_handler(struct fi_cq_data_entry *wc, uint8_t ofi_prov_id) -{ - orte_rml_ofi_msg_header_t msg_hdr; - uint32_t msglen, datalen = 0; - char *data, *totdata, *nextpkt; - ofi_recv_msg_queue_t *recv_msg_queue, *new_msg; - orte_rml_ofi_recv_pkt_t *ofi_recv_pkt, *new_pkt, *next; - bool msg_in_queue = false; - - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s orte_rml_ofi_recv_handler called ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); - /*copy the header and data from buffer and pass it on - ** since this is the ofi_prov recv buffer don't want it to be released as - ** considering re-using it, so for now copying to newly allocated *data - ** the *data will be released by orte_rml_base functions */ - - memcpy(&msg_hdr,wc->buf,sizeof(orte_rml_ofi_msg_header_t)); - msglen = wc->len - sizeof(orte_rml_ofi_msg_header_t); - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Received packet -> msg id = %d wc->len = %lu, msglen = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr.msgid, wc->len, msglen ); - data = (char *)malloc(msglen); - memcpy(data,((char *)wc->buf+sizeof(orte_rml_ofi_msg_header_t)),msglen); - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s header info of received packet -> cur_pkt_num = %d, tot_pkts = %d ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr.cur_pkt_num, msg_hdr.tot_pkts ); - /* To accomodate message bigger than recv buffer size, - check if current message is in multiple blocks and append them before sending it to RML */ - if ( msg_hdr.tot_pkts == 1) { - /* Since OFI is point-to-point, no need to check if the intended destination is me - send to RML */ - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Posting Recv for msgid %d, from peer - %s , Tag = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr.msgid, ORTE_NAME_PRINT(&msg_hdr.origin),msg_hdr.tag ); - ORTE_RML_POST_MESSAGE(&msg_hdr.origin, msg_hdr.tag, msg_hdr.seq_num,data,msglen); - } else { - msg_in_queue = false; - new_pkt = OBJ_NEW(orte_rml_ofi_recv_pkt_t); - new_pkt->cur_pkt_num = msg_hdr.cur_pkt_num; - new_pkt->pkt_size = msglen; - new_pkt->data = data; - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Just beofe checking if this message-pkt is already in queue. msgid-%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr.msgid ); - /* check if the queue has the [msgid,sender] entry */ - OPAL_LIST_FOREACH(recv_msg_queue, &orte_rml_ofi.recv_msg_queue_list, ofi_recv_msg_queue_t) { - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Checking msgid-%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg_queue->msgid ); - if( (recv_msg_queue->msgid == msg_hdr.msgid) && (recv_msg_queue->sender.jobid == msg_hdr.origin.jobid) - && (recv_msg_queue->sender.vpid == msg_hdr.origin.vpid) ) { - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Found Msg entry in queue for msgid %d, sender jobid=%d, sender vpid=%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg_queue->msgid, recv_msg_queue->sender.jobid, recv_msg_queue->sender.vpid); - msg_in_queue = true; - - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s msgid %d, tot_pkts=%d, opal_list_get_size()=%lu,total pkt_recd=%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg_queue->msgid, recv_msg_queue->tot_pkts, - opal_list_get_size(&recv_msg_queue->pkt_list), recv_msg_queue->pkt_recd ); - if( recv_msg_queue->tot_pkts == (recv_msg_queue->pkt_recd +1) ) { - /* all packets received for this message - post message to rml and remove this from queue */ - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s All packets recd for msgid %d, tot_pkts=%d, opal_list_get_size()=%lu,total pkt_recd=%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg_queue->msgid, recv_msg_queue->tot_pkts, - opal_list_get_size(&recv_msg_queue->pkt_list), recv_msg_queue->pkt_recd ); - totdata = NULL; - datalen = 0; - OPAL_LIST_FOREACH(ofi_recv_pkt, &recv_msg_queue->pkt_list, orte_rml_ofi_recv_pkt_t) { - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Adding data for packet %d, pktlength = %lu, cumulative datalen so far = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ofi_recv_pkt->cur_pkt_num, ofi_recv_pkt->pkt_size, datalen ); - if (0 == datalen) { - if (NULL != totdata) { - free(totdata); - } - totdata = (char *)malloc(ofi_recv_pkt->pkt_size); - if( totdata == NULL) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s Error: malloc failed for msgid %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),recv_msg_queue->msgid ); - return 1; //[TODO: error-handling needs to be implemented - } - memcpy(totdata,ofi_recv_pkt->data,ofi_recv_pkt->pkt_size); - - } else { - totdata = realloc(totdata,datalen+ofi_recv_pkt->pkt_size); - if (NULL != totdata ) { - memcpy((totdata+datalen),ofi_recv_pkt->data,ofi_recv_pkt->pkt_size); - } else { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s Error: realloc failed for msgid %d, from sender jobid=%d, sender vpid=%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg_queue->msgid, recv_msg_queue->sender.jobid, - recv_msg_queue->sender.vpid); - return 1; //[TODO: error-handling needs to be implemented - } - } - datalen += ofi_recv_pkt->pkt_size; - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s packet %d done, datalen = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ofi_recv_pkt->cur_pkt_num,datalen); - } - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Adding leftover data recd, datalen = %d, new_pkt->pkt_size = %lu", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), datalen, new_pkt->pkt_size); - //add the last packet - totdata =realloc(totdata,datalen+new_pkt->pkt_size); - if( NULL != totdata ) { - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Realloc completed for leftover data recd, datalen = %d, new->pkt->pkt_size = %lu", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), datalen, new_pkt->pkt_size); - nextpkt = totdata+datalen; - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s totdata = %p,nextpkt = %p ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *)totdata, (void *)nextpkt); - memcpy(nextpkt,new_pkt->data,new_pkt->pkt_size); - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s memcpy completed for leftover data recd, datalen = %d, new->pkt->pkt_size = %lu", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), datalen, new_pkt->pkt_size); - datalen += new_pkt->pkt_size; - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Posting Recv for msgid %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr.msgid ); - ORTE_RML_POST_MESSAGE(&msg_hdr.origin, msg_hdr.tag, msg_hdr.seq_num,totdata,datalen);\ - - // free the pkts - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s msgid %d - posting recv completed, freeing packets", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr.msgid ); - OPAL_LIST_FOREACH_SAFE(ofi_recv_pkt, next, &recv_msg_queue->pkt_list, orte_rml_ofi_recv_pkt_t) { - free( ofi_recv_pkt->data); - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s freed data for packet %d",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ofi_recv_pkt->cur_pkt_num ); - ofi_recv_pkt->pkt_size=0; - opal_list_remove_item(&recv_msg_queue->pkt_list, &ofi_recv_pkt->super); - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Removed pkt from list ",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); - OBJ_RELEASE(ofi_recv_pkt); - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Released packet ",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); - } - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s freeing packets completed",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); - //free the msg from the queue-list - opal_list_remove_item(&orte_rml_ofi.recv_msg_queue_list,&recv_msg_queue->super); - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Successfully removed msg from queue", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); - OBJ_RELEASE(recv_msg_queue); - } else { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s Error: realloc failed for msgid %d, from sender jobid=%d, sender vpid=%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg_queue->msgid, recv_msg_queue->sender.jobid, - recv_msg_queue->sender.vpid); - return 1; //[TODO: error-handling needs to be implemented - } - } else { - /* add this packet to the msg in the queue ordered by cur_pkt_num */ - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Adding packet to list, msgid %d, pkt - %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg_queue->msgid, msg_hdr.cur_pkt_num ); - - bool pkt_added = false; - OPAL_LIST_FOREACH(ofi_recv_pkt, &recv_msg_queue->pkt_list, orte_rml_ofi_recv_pkt_t) { - if( msg_hdr.cur_pkt_num < ofi_recv_pkt->cur_pkt_num ) { - opal_list_insert_pos(&recv_msg_queue->pkt_list, (opal_list_item_t*)ofi_recv_pkt, &new_pkt->super); - recv_msg_queue->pkt_recd++; - pkt_added = true; - break; - } - } - if (!pkt_added) { - opal_list_append(&recv_msg_queue->pkt_list,&new_pkt->super); - recv_msg_queue->pkt_recd++; - } - } - } - break; //we found the msg or added it so exit out of the msg_queue loop - } - if( !msg_in_queue ) { - /*add to the queue as this is the first packet for [msgid,sender] */ - new_msg = OBJ_NEW(ofi_recv_msg_queue_t); - new_msg->msgid = msg_hdr.msgid; - new_msg->sender = msg_hdr.origin; - new_msg->tot_pkts = msg_hdr.tot_pkts; - new_msg->pkt_recd = 1; - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Adding first Msg entry in queue for msgid %d, sender jobid=%d, sender vpid=%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), new_msg->msgid, new_msg->sender.jobid, new_msg->sender.vpid); - opal_list_append(&new_msg->pkt_list, &new_pkt->super); - opal_list_append(&orte_rml_ofi.recv_msg_queue_list, &new_msg->super); - - } - } - return ORTE_SUCCESS; -} - -/* populate_peer_ofi_addr - * [Desc] This fn does a PMIx Modex recv on "rml.ofi" key - * to get the ofi address blob of all providers on the peer. - * Then it populates the array parameter peer_ofi_addr[] - * with providername, ofi_ep_name and ofi_ep_namelen - * [in] peer -> peer address - * [out] peer_ofi_addr[] -> array to hold the provider details on the peer - * [Return value] -> total providers on success. OPAL_ERROR if fails to load array. - */ -static int populate_peer_ofi_addr(orte_process_name_t *peer, orte_rml_ofi_peer_t *peer_ofi_addr ) -{ - - uint8_t *data; - int32_t sz, cnt; - opal_buffer_t modex, *entry; - char *prov_name; - uint8_t prov_num; - size_t entrysize; - uint8_t *bytes; - uint8_t tot_prov=0,cur_prov; - int ret = OPAL_ERROR; - - OPAL_MODEX_RECV_STRING(ret, "rml.ofi", peer, (void**)&data, &sz); - if (OPAL_SUCCESS != ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s rml:ofi::populate_peer_ofi_addr() Modex_Recv Failed for peer %s. ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer)); - return OPAL_ERROR; - } - - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s rml:ofi::populate_peer_ofi_addr() Modex_Recv Succeeded. ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - /* load the data into a buffer for unpacking */ - OBJ_CONSTRUCT(&modex, opal_buffer_t); - opal_dss.load(&modex, data, sz); - cnt = 1; - /* cycle thru the returned providers and see which one we want to use */ - for(cur_prov=0;OPAL_SUCCESS == (ret = opal_dss.unpack(&modex, &entry, &cnt, OPAL_BUFFER));cur_prov++) { - /* unpack the provider name */ - cnt = 1; - if (OPAL_SUCCESS != (ret = opal_dss.unpack(entry, &prov_name, &cnt, OPAL_STRING))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(entry); - break; - } - /* unpack the provider's index on the remote peer - note that there - * is no guarantee that the same provider has the same local index! */ - cnt = 1; - if (OPAL_SUCCESS != (ret = opal_dss.unpack(entry, &prov_num, &cnt, OPAL_UINT8))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(entry); - break; - } - /* unpack the size of their connection blob */ - cnt = 1; - if (OPAL_SUCCESS != (ret = opal_dss.unpack(entry, &entrysize, &cnt, OPAL_SIZE))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(entry); - break; - } - /* create the necessary space */ - bytes = (uint8_t*)malloc(entrysize); - /* unpack the connection blob */ - cnt = entrysize; - if (OPAL_SUCCESS != (ret = opal_dss.unpack(entry, bytes, &cnt, OPAL_BYTE))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(entry); - break; - } - /* done with the buffer */ - OBJ_RELEASE(entry); - peer_ofi_addr[cur_prov].ofi_prov_name = prov_name; - peer_ofi_addr[cur_prov].ofi_ep = bytes; - peer_ofi_addr[cur_prov].ofi_ep_len = entrysize; - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s rml:ofi:populate_peer_ofi_addr() Unpacked peer provider %s ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),peer_ofi_addr[cur_prov].ofi_prov_name); - } - OBJ_DESTRUCT(&modex); // releases the data returned by the modex_recv - tot_prov=cur_prov; - return tot_prov; -} - - -/* check_provider_in_peer(prov_name, peer_ofi_addr) - * [Desc] This fn checks for a match of prov_name in the peer_ofi_addr array - * and returns the index of the match or OPAL_ERROR if not found. - * The peer_ofi_addr array has all the ofi providers in peer. - * [in] prov_name -> The provider name we want to use to send this message to peer. - * [in] tot_prov -> total provider entries in array - * [in] peer_ofi_addr[] -> array of provider details on the peer - * [in] local_ofi_prov_idx -> the index of local provider we are comparing with - * (index into orte_rml_ofi.ofi_prov[] array. - * [Return value] -> index that matches provider on success. OPAL_ERROR if no match found. - */ -static int check_provider_in_peer( char *prov_name, int tot_prov, orte_rml_ofi_peer_t *peer_ofi_addr, int local_ofi_prov_idx ) -{ - int idx; - int ret = OPAL_ERROR; - - for( idx=0; idx < tot_prov; idx++) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s rml:ofi:check_provider_in_peer() checking peer provider %s to match %s ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),peer_ofi_addr[idx].ofi_prov_name,prov_name); - if ( 0 == strcmp(prov_name, peer_ofi_addr[idx].ofi_prov_name) ) { - /* we found a matching provider on peer */ - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s rml:ofi:check_provider_in_peer() matched provider %s ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),peer_ofi_addr[idx].ofi_prov_name); - if ( 0 == strcmp(prov_name, "sockets") ) { - /* check if the address is reachable */ - struct sockaddr_in *ep_sockaddr, *ep_sockaddr2; - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s rml:ofi:check_provider_in_peer() checking if sockets provider is reachable ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - ep_sockaddr = (struct sockaddr_in*)peer_ofi_addr[idx].ofi_ep; - ep_sockaddr2 = (struct sockaddr_in*)orte_rml_ofi.ofi_prov[local_ofi_prov_idx].ep_name; - if (opal_net_samenetwork((struct sockaddr*)ep_sockaddr, (struct sockaddr*)ep_sockaddr2, 24)) { - /* we found same ofi provider reachable via ethernet on peer so return this idx*/ - ret = idx; - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s rml:ofi:check_provider_in_peer() sockets provider is reachable ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - break; - } - } else { - ret = idx; - break; - } - } - } - return ret; -} - -static void send_msg(int fd, short args, void *cbdata) -{ - ofi_send_request_t *req = (ofi_send_request_t*)cbdata; - orte_process_name_t *peer = &(req->send.dst); - orte_rml_tag_t tag = req->send.tag; - char *dest_ep_name; - size_t dest_ep_namelen = 0; - int ret = OPAL_ERROR, rc; - uint32_t total_packets; - fi_addr_t dest_fi_addr; - orte_rml_send_t *snd; - orte_rml_ofi_request_t* ofi_send_req = OBJ_NEW( orte_rml_ofi_request_t ); - uint8_t ofi_prov_id = req->ofi_prov_id; - orte_rml_ofi_send_pkt_t* ofi_msg_pkt; - size_t datalen_per_pkt, hdrsize, data_in_pkt; // the length of data in per packet excluding the header size - orte_rml_ofi_peer_t* pr; - uint64_t ui64; - struct sockaddr_in* ep_sockaddr; - - snd = OBJ_NEW(orte_rml_send_t); - snd->dst = *peer; - snd->origin = *ORTE_PROC_MY_NAME; - snd->tag = tag; - if (NULL != req->send.iov) { - snd->iov = req->send.iov; - snd->count = req->send.count; - snd->cbfunc.iov = req->send.cbfunc.iov; - } else { - snd->buffer = req->send.buffer; - snd->cbfunc.buffer = req->send.cbfunc.buffer; - } - snd->cbdata = req->send.cbdata; - - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s send_msg_transport to peer %s at tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(peer), tag); - - /* get the peer address from our internal hash table */ - memcpy(&ui64, (char*)peer, sizeof(uint64_t)); - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s getting contact info for DAEMON peer %s from internal hash table", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer)); - if (OPAL_SUCCESS != (ret = opal_hash_table_get_value_uint64(&orte_rml_ofi.peers, - ui64, (void**)&pr) || NULL == pr)) { - orte_rml_ofi_peer_t peer_ofi_addr[MAX_OFI_PROVIDERS]; - int tot_peer_prov=0, peer_prov_id=ofi_prov_id; - bool peer_match_found=false; - - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s rml:ofi:Send peer OFI contact info not found in internal hash - checking modex", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - /* Do Modex_recv and populate the peer's providers and ofi ep address in peer_ofi_addr[] array */ - if( OPAL_ERROR == ( tot_peer_prov = populate_peer_ofi_addr( peer, peer_ofi_addr ))) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s rml:ofi::send_msg() Error when Populating peer ofi_addr array ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - snd->status = ORTE_ERR_ADDRESSEE_UNKNOWN; - ORTE_RML_SEND_COMPLETE(snd); - //OBJ_RELEASE( ofi_send_req); - return ; - } - /* decide the provider we want to use from the list of providers in peer as per below order. - * 1. if the user specified the transport for this conduit (even giving us a prioritized list of candidates), - * then the one we selected is the _only_ one we will use. If the remote peer has a matching endpoint, - * then we use it - otherwise, we error out - * 2. if the user did not specify a transport, then we look for matches against _all_ of - * our available transports, starting with fabric and then going to Ethernet, taking the first one that matches. - * 3. if we cannot find any match, then we error out - */ - if ( true == user_override() ) { - /*case 1. User has specified the provider, find a match in peer for the current selected provider or error out*/ - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s rml:ofi::send_msg() Case1. looking for a match for current provider", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - if( OPAL_ERROR == ( peer_prov_id = check_provider_in_peer( orte_rml_ofi.ofi_prov[ofi_prov_id].fabric_info->fabric_attr->prov_name, - tot_peer_prov, peer_ofi_addr, ofi_prov_id ) )) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s rml:ofi::send_msg() Peer is Unreachable - no common ofi provider ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - snd->status = ORTE_ERR_ADDRESSEE_UNKNOWN; - ORTE_RML_SEND_COMPLETE(snd); - //OBJ_RELEASE( ofi_send_req); - return ; - } - peer_match_found = true; - } else { - /* case 2. look for any matching fabric (other than ethernet) provider */ - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s rml:ofi::send_msg() Case 2 - looking for any match for fabric provider", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - for(int cur_prov_id=0; cur_prov_id < orte_rml_ofi.ofi_prov_open_num && !peer_match_found ; cur_prov_id++) { - if( 0 != strcmp( orte_rml_ofi.ofi_prov[cur_prov_id].fabric_info->fabric_attr->prov_name, "sockets" ) ) { - peer_prov_id = check_provider_in_peer( orte_rml_ofi.ofi_prov[cur_prov_id].fabric_info->fabric_attr->prov_name, - tot_peer_prov, peer_ofi_addr, cur_prov_id ); - if (OPAL_ERROR != peer_prov_id) { - peer_match_found = true; - ofi_prov_id = cur_prov_id; - } - } - } - /* if we haven't found a common provider for local node and peer to send message yet, check for ethernet */ - if(!peer_match_found) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s rml:ofi::send_msg() Case 2 - common fabric to peer not found,looking for ethernet provider", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - } - for(int cur_prov_id=0; cur_prov_id < orte_rml_ofi.ofi_prov_open_num && !peer_match_found ; cur_prov_id++) { - if( 0 == strcmp( orte_rml_ofi.ofi_prov[cur_prov_id].fabric_info->fabric_attr->prov_name, "sockets" ) ) { - peer_prov_id = check_provider_in_peer( orte_rml_ofi.ofi_prov[cur_prov_id].fabric_info->fabric_attr->prov_name, - tot_peer_prov, peer_ofi_addr, cur_prov_id ); - if (OPAL_ERROR != peer_prov_id) { - peer_match_found = true; - ofi_prov_id = cur_prov_id; - } - } - } - /* if we haven't found a common provider yet, then error out - case 3 */ - if ( !peer_match_found ) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s rml:ofi::send_msg() Peer is Unreachable - no common ofi provider ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - snd->status = ORTE_ERR_ADDRESSEE_UNKNOWN; - ORTE_RML_SEND_COMPLETE(snd); - //OBJ_RELEASE( ofi_send_req); - return ; - } - } - /* creating a copy of the chosen provider to put it in hashtable - * as the ofi_peer_addr array is local */ - pr = OBJ_NEW(orte_rml_ofi_peer_t); - pr->ofi_ep_len = peer_ofi_addr[peer_prov_id].ofi_ep_len; - pr->ofi_ep = malloc(pr->ofi_ep_len); - memcpy(pr->ofi_ep,peer_ofi_addr[peer_prov_id].ofi_ep,pr->ofi_ep_len); - pr->ofi_prov_name = strdup(peer_ofi_addr[peer_prov_id].ofi_prov_name); - pr->src_prov_id = ofi_prov_id; - if(OPAL_SUCCESS != - (rc = opal_hash_table_set_value_uint64(&orte_rml_ofi.peers, ui64, (void*)pr))) { - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s: ofi address insertion into hash table failed for peer %s ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(peer)); - ORTE_ERROR_LOG(rc); - } - dest_ep_name = pr->ofi_ep; - dest_ep_namelen = pr->ofi_ep_len; - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s rml:ofi: Peer ofi provider details added to hash table. Sending to provider %s on peer %s ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),pr->ofi_prov_name,ORTE_NAME_PRINT(peer)); - } else { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s rml:ofi: OFI peer contact info got from hash table", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - dest_ep_name = pr->ofi_ep; - dest_ep_namelen = pr->ofi_ep_len; - ofi_prov_id = pr->src_prov_id; - } - - //[Debug] printing additional info of IP - switch ( orte_rml_ofi.ofi_prov[ofi_prov_id].fabric_info->addr_format) - { - case FI_SOCKADDR_IN : - /* Address is of type sockaddr_in (IPv4) */ - /*[debug] - print the sockaddr - port and s_addr */ - ep_sockaddr = (struct sockaddr_in*)dest_ep_name; - opal_output_verbose(1,orte_rml_base_framework.framework_output, - "%s peer %s epnamelen is %lu, port = %d (or) 0x%x, InternetAddr = 0x%s ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ORTE_NAME_PRINT(peer), - (unsigned long)orte_rml_ofi.ofi_prov[ofi_prov_id].epnamelen,ntohs(ep_sockaddr->sin_port), - ntohs(ep_sockaddr->sin_port),inet_ntoa(ep_sockaddr->sin_addr)); - /*[end debug]*/ - break; - } - //[Debug] end debug - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s peer ep name obtained for %s. length=%lu", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(peer), dest_ep_namelen); - ret = fi_av_insert(orte_rml_ofi.ofi_prov[ofi_prov_id].av, dest_ep_name,1,&dest_fi_addr,0,NULL); - if( ret != 1) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s fi_av_insert failed in send_msg() returned %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ret ); - /* call the send-callback fn with error and return, also return failure status */ - snd->status = ORTE_ERR_ADDRESSEE_UNKNOWN; - - ORTE_RML_SEND_COMPLETE(snd); - - return; - } - ofi_send_req->send = snd; - ofi_send_req->completion_count = 1; - - /* [DESC] we want to send the pid,seqnum,tag in addition to the data - * copy all of this to header of message from the ofi_send_t* send - */ - ofi_send_req->hdr.dst = ofi_send_req->send->dst; - ofi_send_req->hdr.origin = ofi_send_req->send->origin; - ofi_send_req->hdr.seq_num = ofi_send_req->send->seq_num; - ofi_send_req->hdr.tag = ofi_send_req->send->tag; - - /* - * also insert ofi plugin specific header details - - * the unique msgid, for now initalise total_packets to 1 - */ - ofi_send_req->hdr.msgid = orte_rml_ofi.cur_msgid; - orte_rml_ofi.cur_msgid += 1; - total_packets = 1; - - /* copy the buffer/iov/data to the ofi_send_req->datablob and update ofi_send_req->length*/ - ofi_send_req->length = 0; - if( NULL != ofi_send_req->send->buffer) { - ofi_send_req->length = ofi_send_req->send->buffer->bytes_used; - ofi_send_req->data_blob = (char *)malloc(ofi_send_req->length); - memcpy(ofi_send_req->data_blob , - ofi_send_req->send->buffer->base_ptr, - ofi_send_req->send->buffer->bytes_used); - } else if ( NULL != ofi_send_req->send->iov) { - for (int i=0; i < ofi_send_req->send->count; i++) { - ofi_send_req->length += ofi_send_req->send->iov[i].iov_len; - } - ofi_send_req->data_blob = (char *)malloc(ofi_send_req->length); - int iovlen=0; - for (int i=0; i < ofi_send_req->send->count; i++) { - memcpy(((char *)ofi_send_req->data_blob + iovlen ), - ofi_send_req->send->iov[i].iov_base, - ofi_send_req->send->iov[i].iov_len); - iovlen += ofi_send_req->send->iov[i].iov_len; - } - } else { - //just send the data - ofi_send_req->length = ofi_send_req->send->count; - ofi_send_req->data_blob = (char *)malloc(ofi_send_req->length); - memcpy(ofi_send_req->data_blob , - ofi_send_req->send->data, - ofi_send_req->send->count); - } - - - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s Completed copying all data into ofi_send_req->data_blob, total data - %lu bytes", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ofi_send_req->length ); - - /* Each packet will have header information, so the data length in each packet is datalen_per_packet. - * check if the ofi_send_req->send->buffer->bytes_used is greater than the data per packet datalen_per_packet(recv buffer) - * if so fragment and add info to header and send it in a loop back-to-back */ - hdrsize = sizeof(orte_rml_ofi_msg_header_t); - datalen_per_pkt = MIN_MULTI_BUF_SIZE - hdrsize; - if (ofi_send_req->length > datalen_per_pkt ) - { - total_packets = ( ofi_send_req->length / datalen_per_pkt ) + 1 ; - } - ofi_send_req->hdr.tot_pkts = total_packets; - - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s datalen_per_pkt = %lu, ofi_send_req->length= %lu, total packets = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), datalen_per_pkt, ofi_send_req->length, total_packets ); - - /* in a loop send create and send the packets */ - for(size_t pkt_num=1,sent_data=0; sent_data < ofi_send_req->length; pkt_num++) { - ofi_send_req->hdr.cur_pkt_num = pkt_num; - /* create the packet */ - ofi_msg_pkt = OBJ_NEW(orte_rml_ofi_send_pkt_t); - data_in_pkt = ((ofi_send_req->length - sent_data) >= datalen_per_pkt) ? - datalen_per_pkt : (ofi_send_req->length - sent_data); - ofi_msg_pkt->pkt_size = hdrsize + data_in_pkt; - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s Packet %lu -> data_in_pkt= %lu, header_size= %lu, pkt_size=%lu", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), pkt_num,data_in_pkt,hdrsize,ofi_msg_pkt->pkt_size ); - /* copy the header and data for this pkt */ - ofi_msg_pkt->data = malloc( ofi_msg_pkt->pkt_size); - memcpy(ofi_msg_pkt->data, &ofi_send_req->hdr, hdrsize ); - memcpy( ( (char *)ofi_msg_pkt->data + hdrsize ), - ((char*)ofi_send_req->data_blob + sent_data), - data_in_pkt); - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s Copying header, data into packets completed", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); - /* add it to list */ - opal_list_append(&(ofi_send_req->pkt_list), &ofi_msg_pkt->super); - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s adding packet %lu to list done successful", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),pkt_num ); - sent_data += data_in_pkt; - } - - if( ofi_send_req->hdr.tot_pkts != ofi_send_req->hdr.cur_pkt_num ) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s Error: Total packets calculated [%d] does not match total created-%d pkts to peer %s with tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ofi_send_req->hdr.tot_pkts, ofi_send_req->hdr.cur_pkt_num, - ORTE_NAME_PRINT(peer), tag); - } - /* do the fi_send() for all the pkts */ - ofi_send_req->completion_count= ofi_send_req->hdr.tot_pkts; - OPAL_LIST_FOREACH(ofi_msg_pkt, &ofi_send_req->pkt_list, orte_rml_ofi_send_pkt_t) { - /* debug purpose - copying the header from packet to verify if it is correct */ - struct orte_rml_ofi_msg_header_t *cur_hdr; - cur_hdr = (struct orte_rml_ofi_msg_header_t* ) ofi_msg_pkt->data; - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Sending Pkt[%d] of total %d pkts for msgid:%d to peer %s with tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cur_hdr->cur_pkt_num, ofi_send_req->completion_count, - cur_hdr->msgid, ORTE_NAME_PRINT(peer), tag); - /* end debug*/ - - RML_OFI_RETRY_UNTIL_DONE(fi_send(orte_rml_ofi.ofi_prov[ofi_prov_id].ep, - ofi_msg_pkt->data, - ofi_msg_pkt->pkt_size, - fi_mr_desc(orte_rml_ofi.ofi_prov[ofi_prov_id].mr_multi_recv), - dest_fi_addr, - (void *)&ofi_send_req->ctx)); - - } - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s End of send_msg_transport. fi_send completed to peer %s with tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(peer), tag); - OBJ_RELEASE(req); -} - -int orte_rml_ofi_send_nb(struct orte_rml_base_module_t* mod, - orte_process_name_t* peer, - struct iovec* iov, - int count, - orte_rml_tag_t tag, - orte_rml_callback_fn_t cbfunc, - void* cbdata) -{ - orte_rml_recv_t *rcv; - int bytes; - orte_self_send_xfer_t *xfer; - int i; - char* ptr; - ofi_send_request_t *req; - orte_rml_ofi_module_t *ofi_mod = (orte_rml_ofi_module_t*)mod; - int ofi_prov_id = ofi_mod->cur_transport_id; - - - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s rml_ofi_send_transport to peer %s at tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(peer), tag); - - - if( (0 > ofi_prov_id) || ( ofi_prov_id >= orte_rml_ofi.ofi_prov_open_num ) ) { - /* Invalid ofi_prov ID provided */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - if (ORTE_RML_TAG_INVALID == tag) { - /* cannot send to an invalid tag */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - if (NULL == peer || - OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, peer)) { - /* cannot send to an invalid peer */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - - /* if this is a message to myself, then just post the message - * for receipt - no need to dive into the ofi send_msg() - */ - if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, peer, ORTE_PROC_MY_NAME)) { /* local delivery */ - OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, - "%s rml_send_iovec_to_self at tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tag)); - /* send to self is a tad tricky - we really don't want - * to track the send callback function throughout the recv - * process and execute it upon receipt as this would provide - * very different timing from a non-self message. Specifically, - * if we just retain a pointer to the incoming data - * and then execute the send callback prior to the receive, - * then the caller will think we are done with the data and - * can release it. So we have to copy the data in order to - * execute the send callback prior to receiving the message. - * - * In truth, this really is a better mimic of the non-self - * message behavior. If we actually pushed the message out - * on the wire and had it loop back, then we would receive - * a new block of data anyway. - */ - - /* setup the send callback */ - xfer = OBJ_NEW(orte_self_send_xfer_t); - xfer->iov = iov; - xfer->count = count; - xfer->cbfunc.iov = cbfunc; - xfer->tag = tag; - xfer->cbdata = cbdata; - /* setup the event for the send callback */ - opal_event_set(orte_event_base, &xfer->ev, -1, OPAL_EV_WRITE, send_self_exe, xfer); - opal_event_set_priority(&xfer->ev, ORTE_MSG_PRI); - opal_event_active(&xfer->ev, OPAL_EV_WRITE, 1); - - /* copy the message for the recv */ - rcv = OBJ_NEW(orte_rml_recv_t); - rcv->sender = *peer; - rcv->tag = tag; - /* get the total number of bytes in the iovec array */ - bytes = 0; - for (i = 0 ; i < count ; ++i) { - bytes += iov[i].iov_len; - } - /* get the required memory allocation */ - if (0 < bytes) { - rcv->iov.iov_base = (IOVBASE_TYPE*)malloc(bytes); - rcv->iov.iov_len = bytes; - /* transfer the bytes */ - ptr = (char*)rcv->iov.iov_base; - for (i = 0 ; i < count ; ++i) { - memcpy(ptr, iov[i].iov_base, iov[i].iov_len); - ptr += iov[i].iov_len; - } - } - /* post the message for receipt - since the send callback was posted - * first and has the same priority, it will execute first - */ - ORTE_RML_ACTIVATE_MESSAGE(rcv); - return ORTE_SUCCESS; - } - - /* get ourselves into an event to protect against - * race conditions and threads - */ - req = OBJ_NEW(ofi_send_request_t); - req->ofi_prov_id = ofi_prov_id; - req->send.dst = *peer; - req->send.iov = iov; - req->send.count = count; - req->send.tag = tag; - req->send.cbfunc.iov = cbfunc; - req->send.cbdata = cbdata; - - /* setup the event for the send callback */ - opal_event_set(orte_event_base, &req->ev, -1, OPAL_EV_WRITE, send_msg, req); - opal_event_set_priority(&req->ev, ORTE_MSG_PRI); - opal_event_active(&req->ev, OPAL_EV_WRITE, 1); - - return ORTE_SUCCESS; -} - - -int orte_rml_ofi_send_buffer_nb(struct orte_rml_base_module_t *mod, - orte_process_name_t* peer, - struct opal_buffer_t* buffer, - orte_rml_tag_t tag, - orte_rml_buffer_callback_fn_t cbfunc, - void* cbdata) -{ - orte_rml_recv_t *rcv; - orte_self_send_xfer_t *xfer; - ofi_send_request_t *req; - orte_rml_ofi_module_t *ofi_mod = (orte_rml_ofi_module_t*)mod; - int ofi_prov_id = ofi_mod->cur_transport_id; - - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s rml_ofi_send_buffer_transport to peer %s at tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(peer), tag); - - - if( (0 > ofi_prov_id) || ( ofi_prov_id >= orte_rml_ofi.ofi_prov_open_num ) ) { - /* Invalid ofi_prov ID provided */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - if (ORTE_RML_TAG_INVALID == tag) { - /* cannot send to an invalid tag */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - if (NULL == peer || - OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, peer)) { - /* cannot send to an invalid peer */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - /* if this is a message to myself, then just post the message - * for receipt - no need to dive into the oob - */ - if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, peer, ORTE_PROC_MY_NAME)) { /* local delivery */ - OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, - "%s rml_send_iovec_to_self at tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tag)); - /* send to self is a tad tricky - we really don't want - * to track the send callback function throughout the recv - * process and execute it upon receipt as this would provide - * very different timing from a non-self message. Specifically, - * if we just retain a pointer to the incoming data - * and then execute the send callback prior to the receive, - * then the caller will think we are done with the data and - * can release it. So we have to copy the data in order to - * execute the send callback prior to receiving the message. - * - * In truth, this really is a better mimic of the non-self - * message behavior. If we actually pushed the message out - * on the wire and had it loop back, then we would receive - * a new block of data anyway. - */ - - /* setup the send callback */ - xfer = OBJ_NEW(orte_self_send_xfer_t); - xfer->buffer = buffer; - xfer->cbfunc.buffer = cbfunc; - xfer->tag = tag; - xfer->cbdata = cbdata; - /* setup the event for the send callback */ - opal_event_set(orte_event_base, &xfer->ev, -1, OPAL_EV_WRITE, send_self_exe, xfer); - opal_event_set_priority(&xfer->ev, ORTE_MSG_PRI); - opal_event_active(&xfer->ev, OPAL_EV_WRITE, 1); - - /* copy the message for the recv */ - rcv = OBJ_NEW(orte_rml_recv_t); - rcv->sender = *peer; - rcv->tag = tag; - rcv->iov.iov_base = (IOVBASE_TYPE*)malloc(buffer->bytes_used); - memcpy(rcv->iov.iov_base, buffer->base_ptr, buffer->bytes_used); - rcv->iov.iov_len = buffer->bytes_used; - /* post the message for receipt - since the send callback was posted - * first and has the same priority, it will execute first - */ - ORTE_RML_ACTIVATE_MESSAGE(rcv); - return ORTE_SUCCESS; - } - - /* get ourselves into an event to protect against - * race conditions and threads - */ - req = OBJ_NEW(ofi_send_request_t); - req->ofi_prov_id = ofi_prov_id; - req->send.dst = *peer; - req->send.buffer = buffer; - req->send.tag = tag; - req->send.cbfunc.buffer = cbfunc; - req->send.cbdata = cbdata; - - /* setup the event for the send callback */ - opal_event_set(orte_event_base, &req->ev, -1, OPAL_EV_WRITE, send_msg, req); - opal_event_set_priority(&req->ev, ORTE_MSG_PRI); - opal_event_active(&req->ev, OPAL_EV_WRITE, 1); - - return ORTE_SUCCESS; -}