diff --git a/opal/mca/common/libfabric/libfabric/Makefile.am b/opal/mca/common/libfabric/libfabric/Makefile.am index 174c553aaf..cbd6d8f528 100644 --- a/opal/mca/common/libfabric/libfabric/Makefile.am +++ b/opal/mca/common/libfabric/libfabric/Makefile.am @@ -18,6 +18,10 @@ else !HAVE_LD_VERSION_SCRIPT libfabric_version_script = endif !HAVE_LD_VERSION_SCRIPT +rdmaincludedir = $(includedir)/rdma + +rdmainclude_HEADERS = + # internal utility functions shared by in-tree providers: common_srcs = \ src/common.c \ @@ -31,10 +35,12 @@ src_libfabric_la_SOURCES = \ include/fi_enosys.h \ include/fi_indexer.h \ include/fi_list.h \ + include/fi_log.h \ include/fi_rbuf.h \ include/prov.h \ src/fabric.c \ src/fi_tostr.c \ + src/log.c \ $(common_srcs) if HAVE_SOCKETS @@ -154,7 +160,7 @@ libusnic_direct_sources = \ _usnic_files = \ $(libusnic_direct_sources) \ - prov/usnic/src/fi_usnic.h \ + prov/usnic/src/fi_ext_usnic.h \ prov/usnic/src/usdf.h \ prov/usnic/src/usdf_av.c \ prov/usnic/src/usdf_av.h \ @@ -189,6 +195,9 @@ _usnic_cppflags = \ -DHAVE_LIBNL3=$(HAVE_LIBNL3) $(USNIC_LIBNL_CPPFLAGS) \ -I$(top_srcdir)/prov/usnic/src/usnic_direct +rdmainclude_HEADERS += \ + prov/usnic/src/fi_ext_usnic.h + if HAVE_USNIC_DL pkglib_LTLIBRARIES += libusnic-fi.la libusnic_fi_la_CPPFLAGS = $(AM_CPPFLAGS) $(_usnic_cppflags) @@ -242,9 +251,7 @@ src_libfabric_la_LDFLAGS = -version-info 1 -export-dynamic \ $(libfabric_version_script) src_libfabric_la_DEPENDENCIES = $(srcdir)/libfabric.map -rdmaincludedir = $(includedir)/rdma - -rdmainclude_HEADERS = \ +rdmainclude_HEADERS += \ $(top_srcdir)/include/rdma/fabric.h \ $(top_srcdir)/include/rdma/fi_atomic.h \ $(top_srcdir)/include/rdma/fi_cm.h \ diff --git a/opal/mca/common/libfabric/libfabric/README b/opal/mca/common/libfabric/libfabric/README index 1f2c7411c1..a2d2c93754 100644 --- a/opal/mca/common/libfabric/libfabric/README +++ b/opal/mca/common/libfabric/libfabric/README @@ -1,7 +1,7 @@ This README is for userspace RDMA fabric library. 
-Version Libfabric v0.0.2 -Released on 2015-01-24 +Version Libfabric v1.0.0-rc1 +Released on 2015-02-03 Building ======== diff --git a/opal/mca/common/libfabric/libfabric/config.h.in b/opal/mca/common/libfabric/libfabric/config.h.in index 77ddb25809..dc4271223e 100644 --- a/opal/mca/common/libfabric/libfabric/config.h.in +++ b/opal/mca/common/libfabric/libfabric/config.h.in @@ -1,5 +1,9 @@ /* config.h.in. Generated from configure.ac by autoheader. */ +/* defined to 1 if libfabric was configured with --enable-debug, 0 otherwise + */ +#undef ENABLE_DEBUG + /* Set to 1 to use c11 atomic functions */ #undef HAVE_ATOMICS diff --git a/opal/mca/common/libfabric/libfabric/config/distscript.pl b/opal/mca/common/libfabric/libfabric/config/distscript.pl index 10eb41d877..3a16df184a 100755 --- a/opal/mca/common/libfabric/libfabric/config/distscript.pl +++ b/opal/mca/common/libfabric/libfabric/config/distscript.pl @@ -32,6 +32,15 @@ sub subst { $copy =~ s/\@VERSION\@/Libfabric v$version/g; $copy =~ s/\@DATE\@/$today/g; + # Note that there appears to be a bug in some versions of Pandoc + # that will escape the appearance of @ in generated man pages + # (e.g., in the "@VERSION@" that appears in the man page version + # field). So rather than be clever in the regexp's above, do the + # simple/clear thing and repeat the same regexp's as above, but + # with double-escaped @'s. + $copy =~ s/\\\@VERSION\\\@/Libfabric v$version/g; + $copy =~ s/\\\@DATE\\\@/$today/g; + if ($copy ne $orig) { print "*** VERSION/DATE-ifying $file...\n"; open(OUT, ">$file") || die "Can't write to $file: $!"; diff --git a/opal/mca/common/libfabric/libfabric/configure.ac b/opal/mca/common/libfabric/libfabric/configure.ac index a8b6737e1a..39f5b2d412 100644 --- a/opal/mca/common/libfabric/libfabric/configure.ac +++ b/opal/mca/common/libfabric/libfabric/configure.ac @@ -1,7 +1,7 @@ dnl Process this file with autoconf to produce a configure script. 
AC_PREREQ(2.57) -AC_INIT([libfabric], [0.0.2], [ofiwg@lists.openfabrics.org]) +AC_INIT([libfabric], [1.0.0-rc1], [ofiwg@lists.openfabrics.org]) AC_CONFIG_SRCDIR([src/fabric.c]) AC_CONFIG_AUX_DIR(config) AC_CONFIG_MACRO_DIR(config) @@ -13,8 +13,13 @@ AC_ARG_ENABLE([debug], [AS_HELP_STRING([--enable-debug], [Enable debugging @<:@default=no@:>@]) ], - [CFLAGS="$CFLAGS -g -O0 -Wall"], - [enable_debug=no]) + [CFLAGS="$CFLAGS -g -O0 -Wall" + dbg=1], + [enable_debug=no + dbg=0]) + +AC_DEFINE_UNQUOTED([ENABLE_DEBUG],[$dbg], + [defined to 1 if libfabric was configured with --enable-debug, 0 otherwise]) dnl Fix autoconf's habit of adding -g -O2 by default AS_IF([test -z "$CFLAGS"], diff --git a/opal/mca/common/libfabric/libfabric/include/fi.h b/opal/mca/common/libfabric/libfabric/include/fi.h index ae8d0f6566..b3c10a6710 100644 --- a/opal/mca/common/libfabric/libfabric/include/fi.h +++ b/opal/mca/common/libfabric/libfabric/include/fi.h @@ -190,6 +190,11 @@ size_t fi_datatype_size(enum fi_datatype datatype); uint64_t fi_tag_bits(uint64_t mem_tag_format); uint64_t fi_tag_format(uint64_t tag_bits); +int fi_send_allowed(uint64_t caps); +int fi_recv_allowed(uint64_t caps); +int fi_rma_initiate_allowed(uint64_t caps); +int fi_rma_target_allowed(uint64_t caps); + #define RDMA_CONF_DIR SYSCONFDIR "/" RDMADIR #define FI_CONF_DIR RDMA_CONF_DIR "/fabric" diff --git a/opal/mca/common/libfabric/libfabric/include/fi_enosys.h b/opal/mca/common/libfabric/libfabric/include/fi_enosys.h index 019d98c1fe..3bf4cd0e20 100644 --- a/opal/mca/common/libfabric/libfabric/include/fi_enosys.h +++ b/opal/mca/common/libfabric/libfabric/include/fi_enosys.h @@ -235,6 +235,8 @@ static struct fi_ops_ep X = { .setopt = fi_no_setopt, .tx_ctx = fi_no_tx_ctx, .rx_ctx = fi_no_rx_ctx, + .rx_size_left = fi_no_rx_size_left, + .tx_size_left = fi_no_tx_size_left, }; */ int fi_no_enable(struct fid_ep *ep); @@ -243,12 +245,14 @@ int fi_no_getopt(fid_t fid, int level, int optname, void *optval, size_t *optlen); int 
fi_no_setopt(fid_t fid, int level, int optname, const void *optval, size_t optlen); -int fi_no_tx_ctx(struct fid_sep *sep, int index, +int fi_no_tx_ctx(struct fid_ep *sep, int index, struct fi_tx_attr *attr, struct fid_ep **tx_ep, void *context); -int fi_no_rx_ctx(struct fid_sep *sep, int index, +int fi_no_rx_ctx(struct fid_ep *sep, int index, struct fi_rx_attr *attr, struct fid_ep **rx_ep, void *context); +ssize_t fi_no_rx_size_left(struct fid_ep *ep); +ssize_t fi_no_tx_size_left(struct fid_ep *ep); /* static struct fi_ops_msg X = { @@ -262,8 +266,6 @@ static struct fi_ops_msg X = { .inject = fi_no_msg_inject, .senddata = fi_no_msg_senddata, .injectdata = fi_no_msg_injectdata, - .rx_size_left = fi_no_msg_rx_size_left, - .tx_size_left = fi_no_msg_tx_size_left, }; */ ssize_t fi_no_msg_recv(struct fid_ep *ep, void *buf, size_t len, void *desc, @@ -284,8 +286,6 @@ ssize_t fi_no_msg_senddata(struct fid_ep *ep, const void *buf, size_t len, void uint64_t data, fi_addr_t dest_addr, void *context); ssize_t fi_no_msg_injectdata(struct fid_ep *ep, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr); -ssize_t fi_no_msg_rx_size_left(struct fid_ep *ep); -ssize_t fi_no_msg_tx_size_left(struct fid_ep *ep); /* static struct fi_ops_wait X = { diff --git a/opal/mca/common/libfabric/libfabric/include/fi_log.h b/opal/mca/common/libfabric/libfabric/include/fi_log.h new file mode 100644 index 0000000000..f2cfa795bc --- /dev/null +++ b/opal/mca/common/libfabric/libfabric/include/fi_log.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2015, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#if !defined(FI_LOG_H) +#define FI_LOG_H + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +extern int fi_log_level; + +void fi_log_init(void); +void fi_warn_impl(const char *prov, const char *fmt, ...); +void fi_log_impl(int level, const char *prov, const char *fmt, ...); +void fi_debug_impl(const char *prov, const char *fmt, ...); + +/* Callers are responsible for including their own trailing "\n". Non-provider + * code should pass prov=NULL. + */ +#define FI_WARN(prov, ...) fi_warn_impl(prov, __VA_ARGS__) + +#define FI_LOG(level, prov, ...) \ + do { \ + if ((level) <= fi_log_level) \ + fi_log_impl(level, prov, __VA_ARGS__); \ + } while (0) + +#if ENABLE_DEBUG +# define FI_DEBUG(prov, ...) 
fi_debug_impl(prov, __VA_ARGS__) +#else +# define FI_DEBUG(prov, ...) do {} while (0) +#endif + +#endif /* !defined(FI_LOG_H) */ diff --git a/opal/mca/common/libfabric/libfabric/include/fi_rbuf.h b/opal/mca/common/libfabric/libfabric/include/fi_rbuf.h index c447bf2768..7907b80bac 100644 --- a/opal/mca/common/libfabric/libfabric/include/fi_rbuf.h +++ b/opal/mca/common/libfabric/libfabric/include/fi_rbuf.h @@ -277,4 +277,10 @@ static inline size_t rbfdsread(struct ringbuffd *rbfd, void *buf, size_t len, return ret; } +static inline size_t rbfdwait(struct ringbuffd *rbfd, int timeout) +{ + return fi_poll_fd(rbfd->fd[RB_READ_FD], timeout); +} + + #endif /* RBUF_H */ diff --git a/opal/mca/common/libfabric/libfabric/include/rdma/fabric.h b/opal/mca/common/libfabric/libfabric/include/rdma/fabric.h index 70ff46b9c3..8a3ea06f81 100644 --- a/opal/mca/common/libfabric/libfabric/include/rdma/fabric.h +++ b/opal/mca/common/libfabric/libfabric/include/rdma/fabric.h @@ -77,7 +77,6 @@ struct fid_cntr; struct fid_ep; struct fid_pep; struct fid_stx; -struct fid_sep; struct fid_mr; typedef struct fid *fid_t; @@ -96,7 +95,8 @@ typedef struct fid *fid_t; #define FI_MSG (1ULL << 1) #define FI_RMA (1ULL << 2) #define FI_TAGGED (1ULL << 3) -#define FI_ATOMICS (1ULL << 4) +#define FI_ATOMIC (1ULL << 4) +#define FI_ATOMICS FI_ATOMIC #define FI_DYNAMIC_MR (1ULL << 7) #define FI_NAMED_RX_CTX (1ULL << 8) #define FI_BUFFERED_RECV (1ULL << 9) @@ -121,6 +121,7 @@ typedef struct fid *fid_t; #define FI_WRITE (1ULL << 17) #define FI_RECV (1ULL << 18) #define FI_SEND (1ULL << 19) +#define FI_TRANSMIT FI_SEND #define FI_REMOTE_READ (1ULL << 20) #define FI_REMOTE_WRITE (1ULL << 21) @@ -133,6 +134,7 @@ typedef struct fid *fid_t; #define FI_MORE (1ULL << 29) #define FI_PEEK (1ULL << 30) #define FI_TRIGGER (1ULL << 31) +#define FI_FENCE (1ULL << 32) struct fi_ioc { @@ -220,6 +222,7 @@ enum { #define FI_LOCAL_MR (1ULL << 1) #define FI_PROV_MR_ATTR (1ULL << 2) #define FI_MSG_PREFIX (1ULL << 3) +#define 
FI_ASYNC_IOV (1ULL << 4) struct fi_tx_attr { uint64_t caps; @@ -230,6 +233,7 @@ struct fi_tx_attr { size_t inject_size; size_t size; size_t iov_limit; + size_t rma_iov_limit; }; struct fi_rx_attr { @@ -435,6 +439,7 @@ enum fi_type { FI_TYPE_AV_TYPE, FI_TYPE_ATOMIC_TYPE, FI_TYPE_ATOMIC_OP, + FI_TYPE_VERSION, }; char *fi_tostr(const void *data, enum fi_type datatype); diff --git a/opal/mca/common/libfabric/libfabric/include/rdma/fi_domain.h b/opal/mca/common/libfabric/libfabric/include/rdma/fi_domain.h index 8cc7827039..18fc22eae4 100644 --- a/opal/mca/common/libfabric/libfabric/include/rdma/fi_domain.h +++ b/opal/mca/common/libfabric/libfabric/include/rdma/fi_domain.h @@ -120,7 +120,7 @@ struct fi_ops_domain { int (*endpoint)(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context); int (*scalable_ep)(struct fid_domain *domain, struct fi_info *info, - struct fid_sep **sep, void *context); + struct fid_ep **sep, void *context); int (*cntr_open)(struct fid_domain *domain, struct fi_cntr_attr *attr, struct fid_cntr **cntr, void *context); int (*poll_open)(struct fid_domain *domain, struct fi_poll_attr *attr, @@ -223,6 +223,11 @@ static inline uint64_t fi_mr_key(struct fid_mr *mr) return mr->key; } +static inline int fi_mr_bind(struct fid_mr *mr, struct fid *bfid, uint64_t flags) +{ + return mr->fid.ops->bind(&mr->fid, bfid, flags); +} + static inline int fi_av_open(struct fid_domain *domain, struct fi_av_attr *attr, struct fid_av **av, void *context) diff --git a/opal/mca/common/libfabric/libfabric/include/rdma/fi_endpoint.h b/opal/mca/common/libfabric/libfabric/include/rdma/fi_endpoint.h index 8503cda30d..e028d1d68a 100644 --- a/opal/mca/common/libfabric/libfabric/include/rdma/fi_endpoint.h +++ b/opal/mca/common/libfabric/libfabric/include/rdma/fi_endpoint.h @@ -71,12 +71,14 @@ struct fi_ops_ep { void *optval, size_t *optlen); int (*setopt)(fid_t fid, int level, int optname, const void *optval, size_t optlen); - int (*tx_ctx)(struct fid_sep 
*sep, int index, + int (*tx_ctx)(struct fid_ep *sep, int index, struct fi_tx_attr *attr, struct fid_ep **tx_ep, void *context); - int (*rx_ctx)(struct fid_sep *sep, int index, + int (*rx_ctx)(struct fid_ep *sep, int index, struct fi_rx_attr *attr, struct fid_ep **rx_ep, void *context); + ssize_t (*rx_size_left)(struct fid_ep *ep); + ssize_t (*tx_size_left)(struct fid_ep *ep); }; struct fi_ops_msg { @@ -99,8 +101,6 @@ struct fi_ops_msg { uint64_t data, fi_addr_t dest_addr, void *context); ssize_t (*injectdata)(struct fid_ep *ep, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr); - ssize_t (*rx_size_left)(struct fid_ep *ep); - ssize_t (*tx_size_left)(struct fid_ep *ep); }; struct fi_ops_cm; @@ -139,12 +139,6 @@ struct fid_stx { struct fi_ops_ep *ops; }; -struct fid_sep { - struct fid fid; - struct fi_ops_ep *ops; - struct fi_ops_cm *cm; -}; - #ifndef FABRIC_DIRECT static inline int @@ -163,7 +157,7 @@ fi_endpoint(struct fid_domain *domain, struct fi_info *info, static inline int fi_scalable_ep(struct fid_domain *domain, struct fi_info *info, - struct fid_sep **sep, void *context) + struct fid_ep **sep, void *context) { return domain->ops->scalable_ep(domain, info, sep, context); } @@ -178,7 +172,7 @@ static inline int fi_pep_bind(struct fid_pep *pep, struct fid *bfid, uint64_t fl return pep->fid.ops->bind(&pep->fid, bfid, flags); } -static inline int fi_scalable_ep_bind(struct fid_sep *sep, struct fid *bfid, uint64_t flags) +static inline int fi_scalable_ep_bind(struct fid_ep *sep, struct fid *bfid, uint64_t flags) { return sep->fid.ops->bind(&sep->fid, bfid, flags); } @@ -211,17 +205,29 @@ fi_getopt(fid_t fid, int level, int optname, } static inline int -fi_tx_context(struct fid_sep *sep, int index, struct fi_tx_attr *attr, +fi_tx_context(struct fid_ep *ep, int index, struct fi_tx_attr *attr, struct fid_ep **tx_ep, void *context) { - return sep->ops->tx_ctx(sep, index, attr, tx_ep, context); + return ep->ops->tx_ctx(ep, index, attr, tx_ep, context); } 
static inline int -fi_rx_context(struct fid_sep *sep, int index, struct fi_rx_attr *attr, +fi_rx_context(struct fid_ep *ep, int index, struct fi_rx_attr *attr, struct fid_ep **rx_ep, void *context) { - return sep->ops->rx_ctx(sep, index, attr, rx_ep, context); + return ep->ops->rx_ctx(ep, index, attr, rx_ep, context); +} + +static inline ssize_t +fi_rx_size_left(struct fid_ep *ep) +{ + return ep->ops->rx_size_left(ep); +} + +static inline ssize_t +fi_tx_size_left(struct fid_ep *ep) +{ + return ep->ops->tx_size_left(ep); } static inline int @@ -298,18 +304,6 @@ fi_injectdata(struct fid_ep *ep, const void *buf, size_t len, return ep->msg->injectdata(ep, buf, len, data, dest_addr); } -static inline ssize_t -fi_rx_size_left(struct fid_ep *ep) -{ - return ep->msg->rx_size_left(ep); -} - -static inline ssize_t -fi_tx_size_left(struct fid_ep *ep) -{ - return ep->msg->tx_size_left(ep); -} - #else // FABRIC_DIRECT #include <rdma/fi_direct_endpoint.h> #endif diff --git a/opal/mca/common/libfabric/libfabric/include/rdma/fi_errno.h b/opal/mca/common/libfabric/libfabric/include/rdma/fi_errno.h index 637f33a499..a7acf5a1b3 100644 --- a/opal/mca/common/libfabric/libfabric/include/rdma/fi_errno.h +++ b/opal/mca/common/libfabric/libfabric/include/rdma/fi_errno.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2013-2014 Intel Corporation. All rights reserved. + * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -41,6 +42,8 @@ extern "C" { /* FI directly mapped errno values */ +#define FI_SUCCESS 0 + //#define FI_EPERM EPERM /* Operation not permitted */ #define FI_ENOENT ENOENT /* No such file or directory */ //#define FI_ESRCH ESRCH /* No such process */ @@ -183,6 +186,8 @@ extern "C" { #define FI_ENOEQ 261 /* Missing or unavailable event queue */ #define FI_EDOMAIN 262 /* Invalid resource domain */ #define FI_ENOCQ 263 /* Missing or unavailable completion queue */ +#define FI_ECRC 264 /* CRC error */ +#define FI_ETRUNC 265 /* Truncation error */ const char *fi_strerror(int errnum); diff --git a/opal/mca/common/libfabric/libfabric/include/rdma/fi_tagged.h b/opal/mca/common/libfabric/libfabric/include/rdma/fi_tagged.h index f4dd98af21..f6c35f98d9 100644 --- a/opal/mca/common/libfabric/libfabric/include/rdma/fi_tagged.h +++ b/opal/mca/common/libfabric/libfabric/include/rdma/fi_tagged.h @@ -43,6 +43,7 @@ extern "C" { #endif #define FI_CLAIM (1ULL << 0) +#define FI_DISCARD FI_CANCEL struct fi_msg_tagged { const struct iovec *msg_iov; diff --git a/opal/mca/common/libfabric/libfabric/libfabric.spec b/opal/mca/common/libfabric/libfabric/libfabric.spec index 389a9e334a..994d618ead 100644 --- a/opal/mca/common/libfabric/libfabric/libfabric.spec +++ b/opal/mca/common/libfabric/libfabric/libfabric.spec @@ -1,6 +1,6 @@ Name: libfabric -Version: 0.0.2 -Release: 1%{?dist} +Version: 1.0.0-rc1 +Release: 1.rc1%{?dist} Summary: User-space RDMA Fabric Interfaces Group: System Environment/Libraries License: GPLv2 or BSD diff --git a/opal/mca/common/libfabric/libfabric/libfabric.spec.in b/opal/mca/common/libfabric/libfabric/libfabric.spec.in index ef77e8ca5b..164d494e3b 100644 --- a/opal/mca/common/libfabric/libfabric/libfabric.spec.in +++ b/opal/mca/common/libfabric/libfabric/libfabric.spec.in @@ -1,6 +1,6 @@ Name: libfabric Version: @VERSION@ -Release: 1%{?dist} +Release: 1.rc1%{?dist} Summary: User-space RDMA Fabric Interfaces 
Group: System Environment/Libraries License: GPLv2 or BSD diff --git a/opal/mca/common/libfabric/libfabric/man/fabric.7 b/opal/mca/common/libfabric/libfabric/man/fabric.7 index 228f9521b3..deb8d6d692 100644 --- a/opal/mca/common/libfabric/libfabric/man/fabric.7 +++ b/opal/mca/common/libfabric/libfabric/man/fabric.7 @@ -1,4 +1,4 @@ -.TH fabric 7 "2014\-12\-15" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fabric 7 "2014\-12\-15" "Libfabric Programmer\[aq]s Manual" "Libfabric v1.0.0-rc1" .SH NAME .PP Fabric Interface Library diff --git a/opal/mca/common/libfabric/libfabric/man/fi_av.3 b/opal/mca/common/libfabric/libfabric/man/fi_av.3 index 71b8014039..2f35096fee 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_av.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_av.3 @@ -1,4 +1,4 @@ -.TH fi_av 3 "2014-11-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_av 3 "2015\-02\-01" "Libfabric Programmer\[aq]s Manual" "Libfabric v1.0.0-rc1" .SH NAME .PP fi_av - Address vector operations @@ -37,7 +37,7 @@ int\ fi_av_insertsym(struct\ fid_av\ *av,\ const\ char\ *node, \ \ \ \ size_t\ nodecnt,\ const\ char\ *service,\ size_t\ svccnt, \ \ \ \ fi_addr_t\ *fi_addr,\ uint64_t\ flags,\ void\ *context); -int\ fi_av_remove(struct\ fid_av\ *av,\ fi_addr_t\ fi_addr,\ size_t\ count, +int\ fi_av_remove(struct\ fid_av\ *av,\ fi_addr_t\ *fi_addr,\ size_t\ count, \ \ \ \ uint64_t\ flags); int\ fi_av_lookup(struct\ fid_av\ *av,\ fi_addr_t\ fi_addr, @@ -227,6 +227,11 @@ Note that any events queued on an event queue referencing the AV are left untouched. It is recommended that callers retrieve all events associated with the AV before closing it. +.PP +When closing the address vector, there must be no opened endpoints +associated with the AV. +If resources are still associated with the AV when attempting to close, +the call will return -FI_EBUSY. .SS fi_av_bind .PP Associates an event queue with the AV. 
diff --git a/opal/mca/common/libfabric/libfabric/man/fi_cm.3 b/opal/mca/common/libfabric/libfabric/man/fi_cm.3 index b46738fe40..add7e89d0a 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_cm.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_cm.3 @@ -1,4 +1,4 @@ -.TH fi_cm 3 "2015\-01\-01" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_cm 3 "2015\-01\-01" "Libfabric Programmer\[aq]s Manual" "Libfabric v1.0.0-rc1" .SH NAME .PP fi_cm - Connection management operations diff --git a/opal/mca/common/libfabric/libfabric/man/fi_cntr.3 b/opal/mca/common/libfabric/libfabric/man/fi_cntr.3 index 2a9af9f189..81c02995c8 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_cntr.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_cntr.3 @@ -1,4 +1,4 @@ -.TH fi_cntr 3 "2014\-12\-15" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_cntr 3 "2015\-01\-29" "Libfabric Programmer\[aq]s Manual" "Libfabric v1.0.0-rc1" .SH NAME .PP fi_cntr - Completion and event counter operations @@ -146,8 +146,11 @@ This field is ignored if wait_obj is not FI_WAIT_SET. .SS fi_close .PP The fi_close call releases all resources associated with a counter. -The counter must not be bound to any other resources prior to being -freed. +When closing the counter, there must be no opened endpoints, transmit +contexts, receive contexts or memory regions associated with the +counter. +If resources are still associated with the counter when attempting to +close, the call will return -FI_EBUSY. 
.SS fi_cntr_control .PP The fi_cntr_control call is used to access provider or implementation diff --git a/opal/mca/common/libfabric/libfabric/man/fi_cq.3 b/opal/mca/common/libfabric/libfabric/man/fi_cq.3 index d2ac0d06b6..8bf0e7af2a 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_cq.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_cq.3 @@ -1,10 +1,12 @@ -.TH fi_cq 3 "2014\-12\-15" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_cq 3 "2015\-02\-01" "Libfabric Programmer\[aq]s Manual" "Libfabric v1.0.0-rc1" .SH NAME .PP fi_cq - Completion queue operations .PP fi_cq_open / fi_close : Open/close a completion queue .PP +fi_control : Control CQ operation or attributes. +.PP fi_cq_read / fi_cq_readfrom / fi_cq_readerr : Read a completion from a completion queue .PP @@ -78,6 +80,10 @@ information. .PP \f[I]flags\f[] : Additional flags to apply to the operation .PP +\f[I]command\f[] : Command of control operation to perform on CQ. +.PP +\f[I]arg\f[] : Optional control argument +.PP \f[I]cond\f[] : Condition that must be met before a completion is generated .PP @@ -288,8 +294,12 @@ This field is ignored if wait_obj is not FI_WAIT_SET. .PP The fi_close call releases all resources associated with a completion queue. -The CQ must not be bound to any other resources prior to being closed. Any completions which remain on the CQ when it is closed are lost. +.PP +When closing the CQ, there must be no opened endpoints, transmit +contexts, or receive contexts associated with the CQ. +If resources are still associated with the CQ when attempting to close, +the call will return -FI_EBUSY. .SS fi_control .PP The fi_control call is used to access provider or implementation @@ -298,11 +308,13 @@ Access to the CQ should be serialized across all calls when fi_control is invoked, as it may redirect the implementation of CQ operations. The following control commands are usable with an CQ. 
.PP -*FI_GETWAIT (void *\f[I])\f[] : This command allows the user to retrieve +\f[I]FI_GETWAIT (void **)\f[] : This command allows the user to retrieve the low-level wait object associated with the CQ. The format of the wait-object is specified during CQ creation, through the CQ attributes. -See fi_eq.3 for addition details using control with FI_GETWAIT. +The fi_control arg parameter should be an address where a pointer to the +returned wait object will be written. +See fi_eq.3 for addition details using fi_control with FI_GETWAIT. .SS fi_cq_read / fi_cq_readfrom .PP The fi_cq_read and fi_cq_readfrom operations perform a non-blocking read diff --git a/opal/mca/common/libfabric/libfabric/man/fi_direct.7 b/opal/mca/common/libfabric/libfabric/man/fi_direct.7 index c53ad405e0..5b94739ce9 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_direct.7 +++ b/opal/mca/common/libfabric/libfabric/man/fi_direct.7 @@ -1,4 +1,4 @@ -.TH fi_direct 7 "2014\-11\-21" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_direct 7 "2014\-11\-21" "Libfabric Programmer\[aq]s Manual" "Libfabric v1.0.0-rc1" .SH NAME .PP Direct fabric provider access diff --git a/opal/mca/common/libfabric/libfabric/man/fi_domain.3 b/opal/mca/common/libfabric/libfabric/man/fi_domain.3 index 41e074eecb..ed243b50b5 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_domain.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_domain.3 @@ -1,4 +1,4 @@ -.TH fi_domain 3 "2015\-01\-12" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_domain 3 "2015\-02\-01" "Libfabric Programmer\[aq]s Manual" "Libfabric v1.0.0-rc1" .SH NAME .PP fi_domain - Open a fabric access domain @@ -81,8 +81,8 @@ then memory registration requests complete synchronously. .PP The fi_close call is used to release all resources associated with a domain or interface. -All items associated with the opened domain must be released prior to -calling fi_close. 
+All objects associated with the opened domain must be released prior to +calling fi_close, otherwise the call will return -FI_EBUSY. .SH DOMAIN ATTRIBUTES .PP The \f[C]fi_domain_attr\f[] structure defines the set of attributes @@ -238,9 +238,11 @@ allocated below the fabric interfaces. provider requires the use of an application thread to complete an asynchronous request. When manual progress is set, the provider will attempt to advance an -asynchronous operation forward when the application invokes any event -queue read or wait operation where the completion will be reported. -Progress also occurs when the application processes a poll or wait set. +asynchronous operation forward when the application attempts to wait on +or read an event queue, completion queue, or counter where the completed +operation will be reported. +Progress also occurs when the application processes a poll or wait set +that has been associated with the event or completion queue. .PP Only wait operations defined by the fabric interface will result in an operation progressing. diff --git a/opal/mca/common/libfabric/libfabric/man/fi_endpoint.3 b/opal/mca/common/libfabric/libfabric/man/fi_endpoint.3 index 244d0d2698..e8369767ee 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_endpoint.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_endpoint.3 @@ -1,4 +1,4 @@ -.TH fi_endpoint 3 "2015\-01\-16" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_endpoint 3 "2015\-02\-01" "Libfabric Programmer\[aq]s Manual" "Libfabric v1.0.0-rc1" .SH NAME .PP fi_endpoint - Fabric endpoint operations @@ -19,6 +19,11 @@ Associate a scalable endpoint with an address vector .RS .RE .TP +.B fi_pep_bind +Associate a passive endpoint with an event queue +.RS +.RE +.TP .B fi_enable Transitions an endpoint into an active state. .RS @@ -48,6 +53,12 @@ Get or set endpoint options. Open a transmit or receive context. 
.RS .RE +.TP +.B fi_rx_size_left / fi_tx_size_left +Query the lower bound on how many RX/TX operations may be posted without +an operation returning -FI_EAGAIN. +.RS +.RE .SH SYNOPSIS .IP .nf @@ -60,16 +71,16 @@ int\ fi_endpoint(struct\ fid_domain\ *domain,\ struct\ fi_info\ *info, \ \ \ \ struct\ fid_ep\ **ep,\ void\ *context); int\ fi_scalable_ep(struct\ fid_domain\ *domain,\ struct\ fi_info\ *info, -\ \ \ \ struct\ fid_sep\ **ep,\ void\ *context); +\ \ \ \ struct\ fid_ep\ **sep,\ void\ *context); int\ fi_passive_ep(struct\ fi_fabric\ *fabric,\ struct\ fi_info\ *info, \ \ \ \ struct\ fid_pep\ **pep,\ void\ *context); -int\ fi_tx_context(struct\ fid_ep\ *ep,\ int\ index, +int\ fi_tx_context(struct\ fid_ep\ *sep,\ int\ index, \ \ \ \ struct\ fi_tx_attr\ *attr,\ struct\ fid_ep\ **tx_ep, \ \ \ \ void\ *context); -int\ fi_rx_context(struct\ fid_ep\ *ep,\ int\ index, +int\ fi_rx_context(struct\ fid_ep\ *sep,\ int\ index, \ \ \ \ struct\ fi_rx_attr\ *attr,\ struct\ fid_ep\ **rx_ep, \ \ \ \ void\ *context); @@ -85,7 +96,9 @@ int\ fi_close(struct\ fid\ *ep); int\ fi_ep_bind(struct\ fid_ep\ *ep,\ struct\ fid\ *fid,\ uint64_t\ flags); -int\ fi_scalable_ep_bind(struct\ fid_sep\ *sep,\ struct\ fid\ *fid,\ uint64_t\ flags); +int\ fi_scalable_ep_bind(struct\ fid_ep\ *sep,\ struct\ fid\ *fid,\ uint64_t\ flags); + +int\ fi_pep_bind(struct\ fid_pep\ *pep,\ struct\ fid\ *fid,\ uint64_t\ flags); int\ fi_enable(struct\ fid_ep\ *ep); @@ -100,6 +113,10 @@ int\ fi_getopt(struct\ fid_\ *ep,\ int\ level,\ int\ optname, int\ fi_setopt(struct\ fid\ *ep,\ int\ level,\ int\ optname, \ \ \ \ const\ void\ *optval,\ size_t\ optlen); + +ssize_t\ fi_rx_size_left(struct\ fid_ep\ *ep); + +ssize_t\ fi_tx_size_left(struct\ fid_ep\ *ep); \f[] .fi .SH ARGUMENTS @@ -115,6 +132,8 @@ opened, obtained from fi_getinfo. .PP \f[I]sep\f[] : A scalable fabric endpoint. .PP +\f[I]pep\f[] : A passive fabric endpoint. +.PP \f[I]fid\f[] : Fabric identifier of an associated resource. 
.PP \f[I]context\f[] : Context associated with the endpoint or asynchronous @@ -142,9 +161,19 @@ incoming connection requests. Active endpoints belong to access domains and can perform data transfers. .PP -Data transfer interfaces are bound to active endpoints. Active endpoints may be connection-oriented or connectionless, and may provide data reliability. +The data transfer interfaces -- messages (fi_msg), tagged messages +(fi_tagged), RMA (fi_rma), and atomics (fi_atomic) -- are associated +with active endpoints. +In basic configurations, an active endpoint has transmit and receive +queues. +In general, operations that generate traffic on the fabric are posted to +the transmit queue. +This includes all RMA and atomic operations, along with sent messages +and sent tagged messages. +Operations that post buffers for receiving incoming data are submitted +to the receive queue. .PP Active endpoints are created in the disabled state. They must transition into an enabled state before accepting data @@ -188,6 +217,11 @@ fi_info connreq must reference the corresponding request. .SS fi_close .PP Closes an endpoint and release all resources associated with it. +.PP +When closing a scalable endpoint, there must be no opened transmit +contexts, or receive contexts associated with the scalable endpoint. +If resources are still associated with the scalable endpoint when +attempting to close, the call will return -FI_EBUSY. .SS fi_ep_bind .PP fi_ep_bind is used to associate an endpoint with hardware resources. @@ -207,9 +241,10 @@ This is specified using fi_ep_bind flags. The following flags may be used separately or OR\[aq]ed together when binding an endpoint to a completion domain CQ. .PP -\f[I]FI_SEND\f[] : Directs the completion of outbound data transfer +\f[I]FI_TRANSMIT\f[] : Directs the completion of outbound data transfer requests to the specified completion queue. This includes send message, RMA, and atomic operations. +The FI_SEND flag may be used interchangeably. 
.PP \f[I]FI_RECV\f[] : Directs the notification of inbound data transfers to the specified completion queue. @@ -285,6 +320,10 @@ successful RMA write or atomic operation is initiated from a remote endpoint that targets the given endpoint. .PP Connectionless endpoints must be bound to a single address vector. +If an endpoint is using a shared transmit and/or receive context, the +shared contexts must be bound to the endpoint. +CQs, counters, AV, and shared contexts must be bound to endpoints before +they are enabled. .SS fi_scalable_ep_bind .PP fi_scalable_ep_bind is used to associate a scalable endpoint with an @@ -314,6 +353,9 @@ The endpoint must have been configured to support cancelable operations Canceling an operation causes the fabric provider to search for the operation and, if it is still pending, complete it as having been canceled. +If multiple outstanding operations match the context parameter, only one +will be canceled. +In this case, the operation which is canceled is provider specific. The cancel operation will complete within a bounded period of time. .SS fi_alias .PP @@ -373,6 +415,26 @@ needed on receives posted after the value has been changed. It is recommended that applications that want to override the default MIN_MULTI_RECV value set this option before enabling the corresponding endpoint. +.SS fi_rx_size_left +.PP +The fi_rx_size_left call returns a lower bound on the number of receive +operations that may be posted to the given endpoint without that +operation returning -FI_EAGAIN. +Depending on the specific details of the subsequently posted receive +operations (e.g., number of iov entries, which receive function is +called, etc.) +, it may be possible to post more receive operations than originally +indicated by fi_rx_size_left. +.SS fi_tx_size_left +.PP +The fi_tx_size_left call returns a lower bound on the number of transmit +operations that may be posted to the given endpoint without that +operation returning -FI_EAGAIN. 
+Depending on the specific details of the subsequently posted transmit +operations (e.g., number of iov entries, which transmit function is +called, etc.) +, it may be possible to post more transmit operations than originally +indicated by fi_tx_size_left. .SH ENDPOINT ATTRIBUTES .PP The fi_ep_attr structure defines the set of attributes associated with @@ -755,6 +817,7 @@ struct\ fi_tx_attr\ { \ \ \ \ size_t\ \ \ \ inject_size; \ \ \ \ size_t\ \ \ \ size; \ \ \ \ size_t\ \ \ \ iov_limit; +\ \ \ \ size_t\ \ \ \ rma_iov_limit; }; \f[] .fi @@ -796,6 +859,17 @@ operation. .PP \f[I]iov_limit\f[] : This is the maximum number of IO vectors (scatter-gather elements) that a single posted operation may reference. +.PP +\f[I]rma_iov_limit\f[] : This is the maximum number of RMA IO vectors +(scatter-gather elements) that an RMA or atomic operation may reference. +The rma_iov_limit corresponds to the rma_iov_count values in RMA and +atomic operations. +See struct fi_msg_rma and struct fi_msg_atomic in fi_rma.3 and +fi_atomic.3, for additional details. +This limit applies to both the number of RMA IO vectors that may be +specified when initiating an operation from the local endpoint, as well +as the maximum number of IO vectors that may be carried in a single +request from a remote endpoint. .SS fi_rx_context .PP Receive contexts are independent receive queues for receiving incoming @@ -896,12 +970,14 @@ processing, with the potential cost of serializing access across multiple endpoints. Support for sharable contexts is domain specific. .PP -Conceptually, sharable contexts are transmit queues that may be accessed -by many endpoints. +Conceptually, sharable transmit contexts are transmit queues that may be +accessed by many endpoints. The use of a shared transmit context is mostly opaque to an application. Applications must allocate and bind shared transmit contexts to -endpoints, but otherwise transmit operations are posted directly to the -endpoint. 
+endpoints, but operations are posted directly to the endpoint. +Shared transmit contexts are not associated with completion queues or +counters. +Completed operations are posted to the CQs bound to the endpoint. An endpoint may only be associated with a single shared transmit context. .PP @@ -909,7 +985,13 @@ Unlike shared transmit contexts, applications interact directly with shared receive contexts. Users post receive buffers directly to a shared receive context, with the buffers usable by any endpoint bound to the shared receive context. -An endpoint may only be associated with a single receive context. +Shared receive contexts are not associated with completion queues or +counters. +Completed receive operations are posted to the CQs bound to the +endpoint. +An endpoint may only be associated with a single receive context, and +all connectionless endpoints associated with a shared receive context must +also share the same address vector. .PP Endpoints associated with a shared transmit context may use dedicated receive contexts, and vice-versa. diff --git a/opal/mca/common/libfabric/libfabric/man/fi_eq.3 b/opal/mca/common/libfabric/libfabric/man/fi_eq.3 index 89a4b7dd3a..b9cb12c21e 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_eq.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_eq.3 @@ -1,10 +1,12 @@ -.TH fi_eq 3 "2015\-01\-01" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_eq 3 "2015\-02\-01" "Libfabric Programmer\[aq]s Manual" "Libfabric v1.0.0-rc1" .SH NAME .PP fi_eq - Event queue operations .PP fi_eq_open / fi_close : Open/close an event queue .PP +fi_control : Control operation of EQ +.PP fi_eq_read / fi_eq_readerr : Read an event from an event queue .PP fi_eq_write : Writes an event to an event queue @@ -66,6 +68,10 @@ information. .PP \f[I]flags\f[] : Additional flags to apply to the operation .PP +\f[I]command\f[] : Command of control operation to perform on EQ. 
+.PP +\f[I]arg\f[] : Optional control argument +.PP \f[I]prov_errno\f[] : Provider specific error value .PP \f[I]err_data\f[] : Provider specific error data related to a completion @@ -160,8 +166,10 @@ This field is ignored if wait_obj is not FI_WAIT_SET. .SS fi_close .PP The fi_close call releases all resources associated with an event queue. -The EQ must not be bound to any other resources prior to being closed. Any events which remain on the EQ when it is closed are lost. +.PP +The EQ must not be bound to any other objects prior to being closed, +otherwise the call will return -FI_EBUSY. .SS fi_control .PP The fi_control call is used to access provider or implementation diff --git a/opal/mca/common/libfabric/libfabric/man/fi_errno.3 b/opal/mca/common/libfabric/libfabric/man/fi_errno.3 index 4113e6ca9b..35031831f9 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_errno.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_errno.3 @@ -1,4 +1,4 @@ -.TH fi_errno 3 "2015\-01\-08" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_errno 3 "2015\-01\-08" "Libfabric Programmer\[aq]s Manual" "Libfabric v1.0.0-rc1" .SH NAME .PP fi_errno - fabric errors diff --git a/opal/mca/common/libfabric/libfabric/man/fi_fabric.3 b/opal/mca/common/libfabric/libfabric/man/fi_fabric.3 index fe7666329d..15491c62fc 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_fabric.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_fabric.3 @@ -1,4 +1,4 @@ -.TH fi_fabric 3 "2015\-01\-08" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_fabric 3 "2015\-01\-24" "Libfabric Programmer\[aq]s Manual" "Libfabric v1.0.0-rc1" .SH NAME .PP fi_fabric - Fabric domain operations @@ -90,6 +90,10 @@ uint64_t flags .PP \f[I]FI_TYPE_MSG_ORDER\f[] : struct fi_ep_attr::msg_order field .PP +\f[I]FI_TYPE_VERSION\f[] : Returns the library version of libfabric in +string form. +The data parameter is ignored. 
+.PP fi_tostr() will return a pointer to an internal libfabric buffer that should not be modified, and will be overwritten the next time fi_tostr() is invoked. diff --git a/opal/mca/common/libfabric/libfabric/man/fi_getinfo.3 b/opal/mca/common/libfabric/libfabric/man/fi_getinfo.3 index 617dec154e..f5f2ac2549 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_getinfo.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_getinfo.3 @@ -1,4 +1,4 @@ -.TH fi_getinfo 3 "2015\-01\-20" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_getinfo 3 "2015\-02\-01" "Libfabric Programmer\[aq]s Manual" "Libfabric v1.0.0-rc1" .SH NAME .PP fi_getinfo / fi_freeinfo - Obtain / free fabric interface information @@ -250,11 +250,11 @@ send and receive tagged messages. Applications can use the FI_SEND and FI_RECV flags to optimize an endpoint as send-only or receive-only. .PP -\f[I]FI_ATOMICS\f[] : Specifies that the endpoint supports some set of +\f[I]FI_ATOMIC\f[] : Specifies that the endpoint supports some set of atomic operations. Endpoints supporting this capability support operations defined by struct fi_ops_atomic. -In the absence of any relevant flags, FI_ATOMICS implies the ability to +In the absence of any relevant flags, FI_ATOMIC implies the ability to initiate and be the target of remote atomic reads and writes. Applications can use the FI_READ, FI_WRITE, FI_REMOTE_READ, and FI_REMOTE_WRITE flags to restrict the types of atomic operations @@ -306,11 +306,11 @@ data, which may adversely affect performance. .PP \f[I]FI_READ\f[] : Indicates that the user requires an endpoint capable of initiating reads against remote memory regions. -Remote reads include some RMA and atomic operations. +This flag requires that FI_RMA and/or FI_ATOMIC be set. .PP \f[I]FI_WRITE\f[] : Indicates that the user requires an endpoint capable of initiating writes against remote memory regions. -Remote writes include some RMA and most atomic operations. 
+This flag requires that FI_RMA and/or FI_ATOMIC be set. .PP \f[I]FI_SEND\f[] : Indicates that the user requires an endpoint capable of sending message data transfers. @@ -324,12 +324,11 @@ message functionality. .PP \f[I]FI_REMOTE_READ\f[] : Indicates that the user requires an endpoint capable of receiving read memory operations from remote endpoints. -Remote read operations include some RMA and atomic operations. +This flag requires that FI_RMA and/or FI_ATOMIC be set. .PP \f[I]FI_REMOTE_WRITE\f[] : Indicates that the user requires an endpoint capable of receiving write memory operations from remote endpoints. -Remote write operations include some RMA operations and most atomic -operations. +This flag requires that FI_RMA and/or FI_ATOMIC be set. .PP \f[I]FI_REMOTE_CQ_DATA\f[] : Applications may include a small message with a data transfer that is placed directly into a remote event queue @@ -362,6 +361,15 @@ assumption that fi_cancel will not be used by the application. triggered operations. Endpoints support this capability must meet the usage model as described by fi_trigger.3. +.PP +\f[I]FI_FENCE\f[] : Indicates that the endpoint supports the FI_FENCE +flag on data transfer operations. +Support requires tracking that all previous transmit requests to a +specified remote endpoint complete prior to initiating the fenced +operation. +Fenced operations are often used to enforce ordering between operations +that are not otherwise guaranteed by the underlying provider or +protocol. .SH MODE .PP The operational mode bits are used to convey requirements that an @@ -449,6 +457,18 @@ associated with a registration request, and the resulting memory region will start at a base address of 0. Applications can request that providers select MR attributes by forcing this bit set after fi_getinfo returns. +.PP +\f[I]FI_ASYNC_IOV\f[] : Applications can reference multiple data buffers +as part of a single transmit operation through the use of IO vectors +(SGEs). 
+Typically, the contents of an IO vector are copied by the provider into +an internal buffer area, or directly to the underlying hardware. +However, when a large number of IOV entries are supported, IOV buffering +may have a negative impact on performance and memory consumption. +The FI_ASYNC_IOV mode indicates that the application must provide the +buffering needed for the IO vectors. +When set, an application must not modify an IO vector until the +associated operation has completed. .SH ENDPOINT TYPES .PP \f[I]FI_EP_UNSPEC\f[] : The type of endpoint is not specified. diff --git a/opal/mca/common/libfabric/libfabric/man/fi_mr.3 b/opal/mca/common/libfabric/libfabric/man/fi_mr.3 index 5c479edb4a..3cf8c77b66 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_mr.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_mr.3 @@ -1,4 +1,4 @@ -.TH fi_mr 3 "2014\-12\-19" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_mr 3 "2015\-01\-29" "Libfabric Programmer\[aq]s Manual" "Libfabric v1.0.0-rc1" .SH NAME .PP fi_mr - Memory region operations @@ -14,8 +14,8 @@ memory region fi_mr_key : Return the remote key needed to access a registered memory region .PP -fi_mr_bind : Associate a registered memory region with an event -collector. +fi_mr_bind : Associate a registered memory region with a completion +queue or counter. .SH SYNOPSIS .IP .nf @@ -39,7 +39,7 @@ void\ *\ fi_mr_desc(struct\ fid_mr\ *mr); uint64_t\ fi_mr_key(struct\ fid_mr\ *mr); -int\ fi_mr_bind(struct\ fid_mr\ *mr,\ struct\ fid\ *ec,\ uint64_t\ flags); +int\ fi_mr_bind(struct\ fid_mr\ *mr,\ struct\ fid\ *bfid,\ uint64_t\ flags); \f[] .fi .SH ARGUMENTS @@ -48,7 +48,7 @@ int\ fi_mr_bind(struct\ fid_mr\ *mr,\ struct\ fid\ *ec,\ uint64_t\ flags); .PP \f[I]mr\f[] : Memory region .PP -\f[I]ec\f[] : Event queue or counter +\f[I]bfid\f[] : Fabric identifier of an associated resource. .PP \f[I]context\f[] : User specified context associated with the memory region. 
@@ -236,6 +236,11 @@ Fi_close is used to release all resources associated with a registering a memory region. Once unregistered, further access to the registered memory is not guaranteed. +.PP +When closing the MR, there must be no opened endpoints or counters +associated with the MR. +If resources are still associated with the MR when attempting to close, +the call will return -FI_EBUSY. .SS fi_mr_desc / fi_mr_key .PP The local memory descriptor and remote protection key associated with a @@ -244,14 +249,14 @@ The memory registration must have completed successfully before invoking these calls. .SS fi_mr_bind .PP -The fi_mr_bind function associates a memory region with an event counter -or queue, for providers that support the generation of events based on -fabric operations. +The fi_mr_bind function associates a memory region with a counter, for +providers that support the generation of completions based on fabric +operations. The type of events tracked against the memory region is based on the bitwise OR of the following flags. .PP -\f[I]FI_WRITE\f[] : Generates an event whenever a remote RMA write or -atomic operation modify the memory region. +\f[I]FI_REMOTE_WRITE\f[] : Generates an event whenever a remote RMA +write or atomic operation modifies the memory region. 
.SH FLAGS .PP The following flags are usable with fi_mr_reg, fi_mr_regv, diff --git a/opal/mca/common/libfabric/libfabric/man/fi_msg.3 b/opal/mca/common/libfabric/libfabric/man/fi_msg.3 index 5d44b89200..4fa4ac0a22 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_msg.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_msg.3 @@ -1,4 +1,4 @@ -.TH fi_msg 3 "2015\-01\-23" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_msg 3 "2015\-01\-28" "Libfabric Programmer\[aq]s Manual" "Libfabric v1.0.0-rc1" .SH NAME .PP fi_msg - Message data transfer operations @@ -42,10 +42,6 @@ ssize_t\ fi_inject(struct\ fid_ep\ *ep,\ void\ *buf,\ size_t\ len, ssize_t\ fi_senddata(struct\ fid_ep\ *ep,\ void\ *buf,\ size_t\ len, \ \ \ \ void\ *desc,\ uint64_t\ data,\ fi_addr_t\ dest_addr,\ void\ *context); - -ssize_t\ fi_rx_size_left(struct\ fid_ep\ *ep); - -ssize_t\ fi_tx_size_left(struct\ fid_ep\ *ep); \f[] .fi .SH ARGUMENTS @@ -101,10 +97,6 @@ asynchronously. Users should not touch the posted data buffer(s) until the receive operation has completed. .PP -The "size_left" functions -- fi_rx_size_left, fi_tx_size_left -- return -a lower bound on the number of receive/send operations that may be -posted to the given endpoint without returning -FI_EAGAIN. -.PP Completed message operations are reported to the user through one or more event collectors associated with the endpoint. Users provide context which are associated with each operation, and is @@ -181,26 +173,6 @@ The fi_recvmsg call supports posting buffers over both connected and unconnected endpoints, with the ability to control the receive operation per call through the use of flags. The fi_recvmsg function takes a struct fi_msg as input. -.SS fi_rx_size_left -.PP -The fi_rx_size_left call returns a lower bound on the number of receive -operations that may be posted to the given endpoint without that -operation returning -FI_EAGAIN. 
-Depending on the specific details of the subsequently posted receive -operations (e.g., number of iov entries, which receive function is -called, etc.) -, it may be possible to post more receive operations than originally -indicated by fi_rx_size_left. -.SS fi_tx_size_left -.PP -The fi_tx_size_left call returns a lower bound on the number of send -operations that may be posted to the given endpoint without that -operation returning -FI_EAGAIN. -Depending on the specific details of the subsequently posted send -operations (e.g., number of iov entries, which send function is called, -etc.) -, it may be possible to post more send operations than originally -indicated by fi_tx_size_left. .SH FLAGS .PP The fi_recvmsg and fi_sendmsg calls allow the user to specify flags @@ -254,6 +226,11 @@ FI_OPT_MIN_MULTI_RECV). \f[I]FI_REMOTE_COMPLETE\f[] : Applies to fi_sendmsg. Indicates that a completion should not be generated until the operation has completed on the remote side. +.PP +\f[I]FI_FENCE\f[] : Applies to transmits. +Indicates that the requested operation, also known as the fenced +operation, be deferred until all previous operations targeting the same +target endpoint have completed. .SH RETURN VALUE .PP Returns 0 on success. diff --git a/opal/mca/common/libfabric/libfabric/man/fi_poll.3 b/opal/mca/common/libfabric/libfabric/man/fi_poll.3 index fa50cbf23d..0ee3c548e6 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_poll.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_poll.3 @@ -1,4 +1,4 @@ -.TH fi_poll 3 "2015\-01\-06" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_poll 3 "2015\-01\-29" "Libfabric Programmer\[aq]s Manual" "Libfabric v1.0.0-rc1" .SH NAME .PP fi_poll - Polling and wait set operations @@ -82,7 +82,7 @@ The use of this field is reserved and must be set to 0 by the caller. .PP The fi_close call releases all resources associated with a poll set. The poll set must not be associated with any other resources prior to -being closed. 
+being closed, otherwise the call will return -FI_EBUSY. .SS fi_poll_add .PP Associates an event queue or counter with a poll set. @@ -150,7 +150,7 @@ The use of this field is reserved and must be set to 0 by the caller. .PP The fi_close call releases all resources associated with a wait set. The wait set must not be bound to any other opened resources prior to -being closed. +being closed, otherwise the call will return -FI_EBUSY. .SS fi_wait .PP Waits on a wait set until one or more of its underlying wait objects is diff --git a/opal/mca/common/libfabric/libfabric/man/fi_rma.3 b/opal/mca/common/libfabric/libfabric/man/fi_rma.3 index 4e71fa130c..361ebc569a 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_rma.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_rma.3 @@ -1,4 +1,4 @@ -.TH fi_rma 3 "2015\-01\-07" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_rma 3 "2015\-01\-28" "Libfabric Programmer\[aq]s Manual" "Libfabric v1.0.0-rc1" .SH NAME .PP fi_rma - Remote memory access operations @@ -231,6 +231,10 @@ data into a local buffer and transfer out of that buffer. \f[I]FI_REMOTE_COMPLETE\f[] : Applies to fi_writemsg. Indicates that a completion should not be generated until the operation has completed on the remote side. +.PP +\f[I]FI_FENCE\f[] : Indicates that the requested operation, also known +as the fenced operation, be deferred until all previous operations +targeting the same target endpoint have completed. .SH RETURN VALUE .PP Returns 0 on success. 
diff --git a/opal/mca/common/libfabric/libfabric/man/fi_rx_size_left.3 b/opal/mca/common/libfabric/libfabric/man/fi_rx_size_left.3 index 7c0724c989..06fd8cbc37 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_rx_size_left.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_rx_size_left.3 @@ -1 +1 @@ -.so man3/fi_msg.3 +.so man3/fi_endpoint.3 diff --git a/opal/mca/common/libfabric/libfabric/man/fi_tagged.3 b/opal/mca/common/libfabric/libfabric/man/fi_tagged.3 index 8841599b2d..3c775432a4 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_tagged.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_tagged.3 @@ -1,4 +1,4 @@ -.TH fi_tagged 3 "2015\-01\-06" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_tagged 3 "2015\-02\-01" "Libfabric Programmer\[aq]s Manual" "Libfabric v1.0.0-rc1" .SH NAME .PP fi_tagged - Tagged data transfer operations @@ -224,6 +224,10 @@ When set, FI_CLAIM indicates that when a search successfully finds a matching message, the message is claimed by caller. Subsequent searches cannot find the same message, although they may match other messages that have the same tag. +.PP +An application can request that a buffered message be discarded by using +the FI_DISCARD flag as part of the search. +When set, FI_DISCARD indicates that any matching message be dropped. .SH FLAGS .PP The fi_trecvmsg and fi_tsendmsg calls allow the user to specify flags @@ -265,12 +269,21 @@ data into a local buffer and transfer out of that buffer. Indicates that a completion should not be generated until the operation has completed on the remote side. .PP +\f[I]FI_FENCE\f[] : Applies to transmits. +Indicates that the requested operation, also known as the fenced +operation, be deferred until all previous operations targeting the same +target endpoint have completed. +.PP The following flags may be used with fi_tsearch. .PP \f[I]FI_CLAIM\f[] : Indicates that when a search successfully finds a matching message, the message is claimed by caller. 
Subsequent searches cannot find the same message, although they may match other messages that have the same tag. +.PP +\f[I]FI_DISCARD\f[] : Indicates that if a search successfully finds a +matching message, that the message is discarded by the provider, as the +data is not needed by the application. .SH RETURN VALUE .PP The tagged send and receive calls return 0 on success. diff --git a/opal/mca/common/libfabric/libfabric/man/fi_trigger.3 b/opal/mca/common/libfabric/libfabric/man/fi_trigger.3 index caa3570a59..209ea0af9e 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_trigger.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_trigger.3 @@ -1,4 +1,4 @@ -.TH fi_trigger 3 "2015\-01\-01" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_trigger 3 "2015\-01\-01" "Libfabric Programmer\[aq]s Manual" "Libfabric v1.0.0-rc1" .SH NAME .PP fi_trigger - Triggered operations diff --git a/opal/mca/common/libfabric/libfabric/man/fi_tx_size_left.3 b/opal/mca/common/libfabric/libfabric/man/fi_tx_size_left.3 index 7c0724c989..06fd8cbc37 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_tx_size_left.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_tx_size_left.3 @@ -1 +1 @@ -.so man3/fi_msg.3 +.so man3/fi_endpoint.3 diff --git a/opal/mca/common/libfabric/libfabric/man/fi_version.3 b/opal/mca/common/libfabric/libfabric/man/fi_version.3 index 7ae1945953..104495abd0 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_version.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_version.3 @@ -1,4 +1,4 @@ -.TH fi_version 3 "2015\-01\-08" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_version 3 "2015\-01\-08" "Libfabric Programmer\[aq]s Manual" "Libfabric v1.0.0-rc1" .SH NAME .PP fi_version - Version of the library interfaces diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx.h b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx.h index cfa550122a..9dabf6037b 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx.h +++ 
b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx.h @@ -40,62 +40,6 @@ extern "C" { #define PSM_PFX "libfabric:psm" -#define PSMX_FREE_LIST_INIT(head, tail, type, count) \ - do { \ - int i; \ - type *item; \ - head = tail = NULL; \ - for (i=0; inext = head; \ - head = item; \ - if (!tail) \ - tail = head; \ - } \ - } while (0) - -#define PSMX_FREE_LIST_GET(head, tail, type, item) \ - do { \ - if (head) { \ - item = head; \ - head = head->next; \ - if (!head) \ - tail = head; \ - item->next = NULL; \ - } \ - else { \ - item = calloc(sizeof(type), 1); \ - if (!item) {\ - fprintf(stderr, "%s: out of memory.\n", __func__); \ - exit(-1); \ - } \ - } \ - } while (0) - -#define PSMX_FREE_LIST_PUT(head, tail, type, item) \ - do { \ - memset(item, 0, sizeof(type)); \ - if (tail) \ - tail->next = item; \ - else \ - head = tail = item; \ - } while (0) - -#define PSMX_FREE_LIST_FINALIZE(head, tail, type) \ - do { \ - type *next; \ - while (head) { \ - next = head->next; \ - free(head); \ - head = next; \ - } \ - tail = NULL; \ - } while (0) - #define PSMX_TIME_OUT 120 #define PSMX_OP_FLAGS (FI_INJECT | FI_MULTI_RECV | FI_EVENT | \ @@ -253,6 +197,7 @@ struct psmx_multi_recv { struct psmx_fid_fabric { struct fid_fabric fabric; + struct psmx_fid_domain *active_domain; }; struct psmx_fid_domain { @@ -306,13 +251,7 @@ struct psmx_cq_event { } cqe; int error; uint64_t source; - struct psmx_cq_event *next; -}; - -struct psmx_cq_event_queue { - struct psmx_cq_event *head; - struct psmx_cq_event *tail; - size_t count; + struct slist_entry list_entry; }; struct psmx_fid_wait { @@ -344,11 +283,13 @@ struct psmx_fid_cq { struct psmx_fid_domain *domain; int format; int entry_size; - struct psmx_cq_event_queue event_queue; - struct psmx_cq_event_queue free_list; + size_t event_count; + struct slist event_queue; + struct slist free_list; struct psmx_cq_event *pending_error; struct psmx_fid_wait *wait; int wait_cond; + int wait_is_local; }; enum psmx_triggered_op { @@ -489,6 +430,7 @@ 
struct psmx_fid_cntr { uint64_t counter_last_read; uint64_t error_counter_last_read; struct psmx_fid_wait *wait; + int wait_is_local; struct psmx_trigger *trigger; pthread_mutex_t trigger_lock; }; @@ -656,6 +598,12 @@ static inline void psmx_cntr_inc(struct psmx_fid_cntr *cntr) psmx_wait_signal((struct fid_wait *)cntr->wait); } +static inline void psmx_progress(struct psmx_fid_domain *domain) +{ + psmx_cq_poll_mq(NULL, domain, NULL, 0, NULL); + psmx_am_progress(domain); +} + ssize_t _psmx_send(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, void *context, uint64_t flags); diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_atomic.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_atomic.c index abe04a59ee..2eb3896321 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_atomic.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_atomic.c @@ -1096,11 +1096,28 @@ static ssize_t psmx_atomic_readwritemsg(struct fid_ep *ep, size_t result_count, uint64_t flags) { - if (!msg || msg->iov_count != 1) + void *buf; + size_t count; + + if (!msg) return -EINVAL; - return _psmx_atomic_readwrite(ep, msg->msg_iov[0].addr, - msg->msg_iov[0].count, + if (msg->op == FI_ATOMIC_READ) { + if (result_count != 1) + return -EINVAL; + + buf = NULL; + count = resultv[0].count; + } + else { + if (msg->iov_count != 1) + return -EINVAL; + + buf = msg->msg_iov[0].addr; + count = msg->msg_iov[0].count; + } + + return _psmx_atomic_readwrite(ep, buf, count, msg->desc ? msg->desc[0] : NULL, resultv[0].addr, result_desc ? 
result_desc[0] : NULL, diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_cntr.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_cntr.c index b6638ae179..7b96e1c242 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_cntr.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_cntr.c @@ -193,12 +193,19 @@ void psmx_cntr_add_trigger(struct psmx_fid_cntr *cntr, struct psmx_trigger *trig psmx_cntr_check_trigger(cntr); } +#define PSMX_CNTR_POLL_THRESHOLD 100 static uint64_t psmx_cntr_read(struct fid_cntr *cntr) { struct psmx_fid_cntr *cntr_priv; + static int poll_cnt = 0; cntr_priv = container_of(cntr, struct psmx_fid_cntr, cntr); + if (poll_cnt++ == PSMX_CNTR_POLL_THRESHOLD) { + psmx_progress(cntr_priv->domain); + poll_cnt = 0; + } + cntr_priv->counter_last_read = cntr_priv->counter; return cntr_priv->counter_last_read; @@ -264,8 +271,7 @@ static int psmx_cntr_wait(struct fid_cntr *cntr, uint64_t threshold, int timeout break; } else { - psmx_cq_poll_mq(NULL, cntr_priv->domain, NULL, 0, NULL); - psmx_am_progress(cntr_priv->domain); + psmx_progress(cntr_priv->domain); } if (cntr_priv->counter >= threshold) @@ -293,6 +299,9 @@ static int psmx_cntr_close(fid_t fid) cntr = container_of(fid, struct psmx_fid_cntr, cntr.fid); + if (cntr->wait && cntr->wait_is_local) + fi_close((fid_t)cntr->wait); + pthread_mutex_destroy(&cntr->trigger_lock); free(cntr); @@ -351,6 +360,7 @@ int psmx_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, struct psmx_fid_cntr *cntr_priv; struct psmx_fid_wait *wait = NULL; struct fi_wait_attr wait_attr; + int wait_is_local = 0; int events; uint64_t flags; int err; @@ -392,6 +402,7 @@ int psmx_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, &wait_attr, (struct fid_wait **)&wait); if (err) return err; + wait_is_local = 1; break; default: @@ -407,6 +418,7 @@ int psmx_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, cntr_priv->domain = domain_priv; cntr_priv->events 
= events; cntr_priv->wait = wait; + cntr_priv->wait_is_local = wait_is_local; cntr_priv->flags = flags; cntr_priv->cntr.fid.fclass = FI_CLASS_CNTR; cntr_priv->cntr.fid.context = context; diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_cq.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_cq.c index ce4f8d40bf..67a603512d 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_cq.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_cq.c @@ -32,40 +32,25 @@ #include "psmx.h" -#define PSMX_CQ_EMPTY(cq) (!cq->event_queue.head) - void psmx_cq_enqueue_event(struct psmx_fid_cq *cq, struct psmx_cq_event *event) { - struct psmx_cq_event_queue *ceq = &cq->event_queue; - - if (ceq->tail) { - ceq->tail->next = event; - ceq->tail = event; - } - else { - ceq->head = ceq->tail = event; - } - ceq->count++; + slist_insert_tail(&event->list_entry, &cq->event_queue); + cq->event_count++; if (cq->wait) psmx_wait_signal((struct fid_wait *)cq->wait); } static struct psmx_cq_event *psmx_cq_dequeue_event(struct psmx_fid_cq *cq) { - struct psmx_cq_event_queue *ceq = &cq->event_queue; - struct psmx_cq_event *event; + struct slist_entry *entry; - if (!ceq->head) + if (slist_empty(&cq->event_queue)) return NULL; - event = ceq->head; - ceq->head = event->next; - ceq->count--; - if (!ceq->head) - ceq->tail = NULL; + entry = slist_remove_head(&cq->event_queue); + cq->event_count--; - event->next = NULL; - return event; + return container_of(entry, struct psmx_cq_event, list_entry); } struct psmx_cq_event *psmx_cq_create_event(struct psmx_fid_cq *cq, @@ -76,7 +61,17 @@ struct psmx_cq_event *psmx_cq_create_event(struct psmx_fid_cq *cq, { struct psmx_cq_event *event; - PSMX_FREE_LIST_GET(cq->free_list.head, cq->free_list.tail, struct psmx_cq_event, event); + if (!slist_empty(&cq->free_list)) { + event = container_of(slist_remove_head(&cq->free_list), + struct psmx_cq_event, list_entry); + } + else { + event = calloc(1, sizeof(*event)); + if (!event) { + 
fprintf(stderr, "%s: out of memory.\n", __func__); + exit(-1); + } + } if ((event->error = !!err)) { event->cqe.err.op_context = op_context; @@ -117,7 +112,7 @@ struct psmx_cq_event *psmx_cq_create_event(struct psmx_fid_cq *cq, break; default: - fprintf(stderr, "%s: unsupported CC format %d\n", __func__, cq->format); + fprintf(stderr, "%s: unsupported CQ format %d\n", __func__, cq->format); return NULL; } @@ -168,8 +163,18 @@ static struct psmx_cq_event *psmx_cq_create_event_from_status( event = event_in; } else { - PSMX_FREE_LIST_GET(cq->free_list.head, cq->free_list.tail, - struct psmx_cq_event, event); + if (!slist_empty(&cq->free_list)) { + event = container_of(slist_remove_head(&cq->free_list), + struct psmx_cq_event, list_entry); + } + else { + event = calloc(1, sizeof(*event)); + if (!event) { + fprintf(stderr, "%s: out of memory.\n", __func__); + exit(-1); + } + } + event->error = !!psm_status->error_code; } @@ -458,7 +463,7 @@ static ssize_t psmx_cq_readfrom(struct fid_cq *cq, void *buf, size_t count, cq_priv = container_of(cq, struct psmx_fid_cq, cq); - if (PSMX_CQ_EMPTY(cq_priv) || !buf) { + if (slist_empty(&cq_priv->event_queue) || !buf) { ret = psmx_cq_poll_mq(cq_priv, cq_priv->domain, (struct psmx_cq_event *)buf, count, src_addr); if (ret > 0) @@ -482,10 +487,8 @@ static ssize_t psmx_cq_readfrom(struct fid_cq *cq, void *buf, size_t count, if (psmx_cq_get_event_src_addr(cq_priv, event, src_addr)) *src_addr = FI_ADDR_NOTAVAIL; - PSMX_FREE_LIST_PUT(cq_priv->free_list.head, - cq_priv->free_list.tail, - struct psmx_cq_event, - event); + memset(event, 0, sizeof(*event)); + slist_insert_tail(&event->list_entry, &cq_priv->free_list); read_count++; buf += cq_priv->entry_size; @@ -595,7 +598,7 @@ static ssize_t psmx_cq_sreadfrom(struct fid_cq *cq, void *buf, size_t count, threshold = 1; /* NOTE: "cond" is only a hint, not a mandatory condition. 
*/ - event_count = cq_priv->event_queue.count; + event_count = cq_priv->event_count; if (event_count < threshold) { if (cq_priv->wait) { psmx_wait_wait((struct fid_wait *)cq_priv->wait, timeout); @@ -607,7 +610,7 @@ static ssize_t psmx_cq_sreadfrom(struct fid_cq *cq, void *buf, size_t count, break; /* CQ may be updated asynchronously by the AM handlers */ - if (cq_priv->event_queue.count > event_count) + if (cq_priv->event_count > event_count) break; if (timeout < 0) @@ -641,19 +644,20 @@ static const char *psmx_cq_strerror(struct fid_cq *cq, int prov_errno, const voi static int psmx_cq_close(fid_t fid) { struct psmx_fid_cq *cq; + struct slist_entry *entry; + struct psmx_cq_event *item; cq = container_of(fid, struct psmx_fid_cq, cq.fid); - PSMX_FREE_LIST_FINALIZE(cq->free_list.head, cq->free_list.tail, struct psmx_cq_event); - - if (cq->wait) { - if (cq->wait->type == FI_WAIT_FD) { - close(cq->wait->fd[0]); - close(cq->wait->fd[1]); - } - free(cq->wait); + while (!slist_empty(&cq->free_list)) { + entry = slist_remove_head(&cq->free_list); + item = container_of(entry, struct psmx_cq_event, list_entry); + free(item); } + if (cq->wait && cq->wait_is_local) + fi_close((fid_t)cq->wait); + free(cq); return 0; @@ -703,9 +707,12 @@ int psmx_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, struct psmx_fid_domain *domain_priv; struct psmx_fid_cq *cq_priv; struct psmx_fid_wait *wait = NULL; + struct psmx_cq_event *event; struct fi_wait_attr wait_attr; + int wait_is_local = 0; int entry_size; int err; + int i; domain_priv = container_of(domain, struct psmx_fid_domain, domain); switch (attr->format) { @@ -758,6 +765,7 @@ int psmx_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, &wait_attr, (struct fid_wait **)&wait); if (err) return err; + wait_is_local = 1; break; default: @@ -792,14 +800,25 @@ int psmx_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, cq_priv->wait = wait; if (wait) cq_priv->wait_cond = attr->wait_cond; + cq_priv->wait_is_local 
= wait_is_local; cq_priv->cq.fid.fclass = FI_CLASS_CQ; cq_priv->cq.fid.context = context; cq_priv->cq.fid.ops = &psmx_fi_ops; cq_priv->cq.ops = &psmx_cq_ops; - PSMX_FREE_LIST_INIT(cq_priv->free_list.head, cq_priv->free_list.tail, - struct psmx_cq_event, 64); + slist_init(&cq_priv->event_queue); + slist_init(&cq_priv->free_list); + +#define PSMX_FREE_LIST_SIZE 64 + for (i=0; i<PSMX_FREE_LIST_SIZE; i++) { + event = calloc(1, sizeof(*event)); + if (!event) { + fprintf(stderr, "%s: out of memory.\n", __func__); + exit(-1); + } + slist_insert_tail(&event->list_entry, &cq_priv->free_list); + } *cq = &cq_priv->cq; return 0; diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_domain.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_domain.c index 98def5b6fa..59f5d01587 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_domain.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_domain.c @@ -64,6 +64,7 @@ static int psmx_domain_close(fid_t fid) if (err != PSM_OK) psm_ep_close(domain->psm_ep, PSM_EP_CLOSE_FORCE, 0); + domain->fabric->active_domain = NULL; free(domain); return 0; @@ -90,6 +91,7 @@ static struct fi_ops_domain psmx_domain_ops = { int psmx_domain_open(struct fid_fabric *fabric, struct fi_info *info, struct fid_domain **domain, void *context) { + struct psmx_fid_fabric *fabric_priv; struct psmx_fid_domain *domain_priv; struct psm_ep_open_opts opts; psm_uuid_t uuid; @@ -97,6 +99,12 @@ int psmx_domain_open(struct fid_fabric *fabric, struct fi_info *info, psmx_debug("%s\n", __func__); + fabric_priv = container_of(fabric, struct psmx_fid_fabric, fabric); + if (fabric_priv->active_domain) { + psmx_debug("%s: a domain has been opened for the fabric\n"); + return -EBUSY; + } + if (!info->domain_attr->name || strncmp(info->domain_attr->name, "psm", 3)) return -EINVAL; @@ -112,7 +120,7 @@ int psmx_domain_open(struct fid_fabric *fabric, struct fi_info *info, domain_priv->domain.ops = &psmx_domain_ops; domain_priv->domain.mr = &psmx_mr_ops; domain_priv->mode = info->mode; - domain_priv->fabric = container_of(fabric, struct psmx_fid_fabric, fabric); + domain_priv->fabric = fabric_priv; 
psm_ep_open_opts_get_defaults(&opts); @@ -154,6 +162,7 @@ int psmx_domain_open(struct fid_fabric *fabric, struct fi_info *info, goto err_out_close_ep; } + fabric_priv->active_domain = domain_priv; *domain = &domain_priv->domain; return 0; @@ -171,6 +180,10 @@ err_out: int psmx_domain_check_features(struct psmx_fid_domain *domain, int ep_cap) { + int rma_target; + + rma_target = fi_rma_target_allowed(ep_cap); + if ((ep_cap & PSMX_CAPS) != ep_cap) return -EINVAL; @@ -180,10 +193,10 @@ int psmx_domain_check_features(struct psmx_fid_domain *domain, int ep_cap) if ((ep_cap & FI_MSG) && domain->msg_ep) return -EBUSY; - if ((ep_cap & FI_RMA) && domain->rma_ep) + if ((ep_cap & FI_RMA) && rma_target && domain->rma_ep) return -EBUSY; - if ((ep_cap & FI_ATOMICS) && domain->atomics_ep) + if ((ep_cap & FI_ATOMICS) && rma_target && domain->atomics_ep) return -EBUSY; return 0; @@ -192,6 +205,7 @@ int psmx_domain_check_features(struct psmx_fid_domain *domain, int ep_cap) int psmx_domain_enable_ep(struct psmx_fid_domain *domain, struct psmx_fid_ep *ep) { uint64_t ep_cap = 0; + int rma_target; if (ep) ep_cap = ep->caps; @@ -214,10 +228,12 @@ int psmx_domain_enable_ep(struct psmx_fid_domain *domain, struct psmx_fid_ep *ep domain->am_initialized = 1; } - if (ep_cap & FI_RMA) + rma_target = fi_rma_target_allowed(ep_cap); + + if ((ep_cap & FI_RMA) && rma_target) domain->rma_ep = ep; - if (ep_cap & FI_ATOMICS) + if ((ep_cap & FI_ATOMICS) && rma_target) domain->atomics_ep = ep; if (ep_cap & FI_TAGGED) diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_ep.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_ep.c index 0fe43ab31a..a225cbb9de 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_ep.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_ep.c @@ -237,11 +237,6 @@ static int psmx_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) return 0; } -static inline int psmx_ep_progress(struct psmx_fid_ep *ep) -{ - return 
psmx_cq_poll_mq(NULL, ep->domain, NULL, 0, NULL); -} - static int psmx_ep_control(fid_t fid, int command, void *arg) { struct fi_alias *alias; @@ -293,6 +288,8 @@ static struct fi_ops_ep psmx_ep_ops = { .enable = psmx_ep_enable, .tx_ctx = fi_no_tx_ctx, .rx_ctx = fi_no_rx_ctx, + .rx_size_left = fi_no_rx_size_left, + .tx_size_left = fi_no_tx_size_left, }; int psmx_ep_open(struct fid_domain *domain, struct fi_info *info, diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_init.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_init.c index 9baaab158d..d8b405ebdc 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_init.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_init.c @@ -53,7 +53,7 @@ static int psmx_reserve_tag_bits(int *caps, uint64_t *max_tag_value) psmx_debug("%s: unable to reserve tag bit for FI_MSG support.\n" "ADVICE: please reduce the asked max_tag_value, " "or remove FI_MSG from the asked capabilities, " - "or set SFI_PSM_AM_MSG=1 to use an alternative (but less " + "or set OFI_PSM_AM_MSG=1 to use an alternative (but less " "optimized) message queue implementation.\n", __func__); return -1; @@ -62,7 +62,7 @@ static int psmx_reserve_tag_bits(int *caps, uint64_t *max_tag_value) psmx_debug("%s: unable to reserve tag bit for FI_MSG support. 
" "FI_MSG is removed from the capabilities.\n" "ADVICE: please reduce the asked max_tag_value, " - "or set SFI_PSM_AM_MSG=1 to use an alternative (but less " + "or set OFI_PSM_AM_MSG=1 to use an alternative (but less " "optimized) message queue implementation.\n", __func__); ret_caps &= ~FI_MSG; @@ -77,7 +77,7 @@ static int psmx_reserve_tag_bits(int *caps, uint64_t *max_tag_value) psmx_debug("%s: unable to reserve tag bit for tagged RMA acceleration.\n" "ADVICE: please reduce the asked max_tag_value, " "or remove FI_RMA from the asked capabilities, " - "or set SFI_PSM_TAGGED_RMA=0 to disable RMA acceleration.\n", + "or set OFI_PSM_TAGGED_RMA=0 to disable RMA acceleration.\n", __func__); return -1; } @@ -85,7 +85,7 @@ static int psmx_reserve_tag_bits(int *caps, uint64_t *max_tag_value) psmx_debug("%s: unable to reserve tag bit for tagged RMA acceleration. " "FI_RMA is removed from the capabilities.\n" "ADVICE: please reduce the asked max_tag_value, " - "or set SFI_PSM_TAGGED_RMA=0 to disable RMA acceleration.\n", + "or set OFI_PSM_TAGGED_RMA=0 to disable RMA acceleration.\n", __func__); ret_caps &= ~FI_RMA; } @@ -369,12 +369,12 @@ PSM_INI psmx_debug("%s\n", __func__); - psmx_env.name_server = psmx_get_int_env("SFI_PSM_NAME_SERVER", 0); - psmx_env.am_msg = psmx_get_int_env("SFI_PSM_AM_MSG", 0); - psmx_env.tagged_rma = psmx_get_int_env("SFI_PSM_TAGGED_RMA", 0); - psmx_env.debug = psmx_get_int_env("SFI_PSM_DEBUG", 0); - psmx_env.warning = psmx_get_int_env("SFI_PSM_WARNING", 1); - psmx_env.uuid = getenv("SFI_PSM_UUID"); + psmx_env.name_server = psmx_get_int_env("OFI_PSM_NAME_SERVER", 0); + psmx_env.am_msg = psmx_get_int_env("OFI_PSM_AM_MSG", 0); + psmx_env.tagged_rma = psmx_get_int_env("OFI_PSM_TAGGED_RMA", 0); + psmx_env.debug = psmx_get_int_env("OFI_PSM_DEBUG", 0); + psmx_env.warning = psmx_get_int_env("OFI_PSM_WARNING", 1); + psmx_env.uuid = getenv("OFI_PSM_UUID"); psm_error_register_handler(NULL, PSM_ERRHANDLER_NO_HANDLER); @@ -388,12 +388,12 @@ PSM_INI return NULL; 
} - check_version = psmx_get_int_env("SFI_PSM_VERSION_CHECK", 1); + check_version = psmx_get_int_env("OFI_PSM_VERSION_CHECK", 1); if (check_version && major != PSM_VERNO_MAJOR) { fprintf(stderr, "%s: PSM version mismatch: header %d.%d, library %d.%d.\n", __func__, PSM_VERNO_MAJOR, PSM_VERNO_MINOR, major, minor); - fprintf(stderr, "\tSet envar SFI_PSM_VERSION_CHECK=0 to bypass version check.\n"); + fprintf(stderr, "\tSet envar OFI_PSM_VERSION_CHECK=0 to bypass version check.\n"); return NULL; } diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg.c index 2de6baa823..adaef4943e 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg.c @@ -359,7 +359,5 @@ struct fi_ops_msg psmx_msg_ops = { .inject = psmx_inject, .senddata = fi_no_msg_senddata, .injectdata = fi_no_msg_injectdata, - .rx_size_left = fi_no_msg_rx_size_left, - .tx_size_left = fi_no_msg_tx_size_left, }; diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg2.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg2.c index d05bbf64bb..ca4b2292c5 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg2.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg2.c @@ -626,7 +626,5 @@ struct fi_ops_msg psmx_msg2_ops = { .inject = psmx_inject2, .senddata = fi_no_msg_senddata, .injectdata = fi_no_msg_injectdata, - .rx_size_left = fi_no_msg_rx_size_left, - .tx_size_left = fi_no_msg_tx_size_left, }; diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_poll.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_poll.c index 9eb7c33bbe..1861b8c494 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_poll.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_poll.c @@ -82,7 +82,7 @@ static int psmx_poll_poll(struct fid_poll *pollset, void **context, int count) poll_priv = 
container_of(pollset, struct psmx_fid_poll, poll.fid); - psmx_cq_poll_mq(NULL, poll_priv->domain, NULL, 0, NULL); + psmx_progress(poll_priv->domain); head = &poll_priv->poll_list_head; for (p = head->next; p != head && ret_count < count; p = p->next) { @@ -90,7 +90,7 @@ static int psmx_poll_poll(struct fid_poll *pollset, void **context, int count) switch (list_item->fid->fclass) { case FI_CLASS_CQ: cq = container_of(list_item->fid, struct psmx_fid_cq, cq); - if (cq->event_queue.count) { + if (cq->event_count) { *context++ = cq->cq.fid.context; ret_count++; } diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_util.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_util.c index db50285add..202147d821 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_util.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_util.c @@ -82,10 +82,10 @@ static void psmx_name_server_cleanup(void *args) /************************************************************* * A simple name resolution mechanism for client-server style * applications. The server side has to run first. The client - * side then passes the server name as the first parameter + * side then passes the server name as the "node" parameter * of fi_getinfo call and the resulting provider info should - * have the transport address of the server in the dest_addr - * field. Both side has to use the same UUID. + * have the transport address of the server in the "dest_addr" + * field. Both sides have to use the same UUID. 
*************************************************************/ void *psmx_name_server(void *args) { @@ -181,7 +181,7 @@ void *psmx_resolve_name(const char *servername, int port) n = getaddrinfo(servername, service, &hints, &res); if (n < 0) { - fprintf(stderr, "%s:(%s:%d):%s\n", __func__, servername, port, gai_strerror(n)); + psmx_debug("%s:(%s:%d):%s\n", __func__, servername, port, gai_strerror(n)); free(service); return NULL; } @@ -200,7 +200,7 @@ void *psmx_resolve_name(const char *servername, int port) free(service); if (sockfd < 0) { - fprintf(stderr, "%s: couldn't connect to %s:%d\n", __func__, servername, port); + psmx_debug("%s: couldn't connect to %s:%d\n", __func__, servername, port); return NULL; } diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_wait.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_wait.c index 66813cb85e..e0bddacd8e 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_wait.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_wait.c @@ -32,6 +32,78 @@ #include "psmx.h" +/* It is necessary to have a separate thread making progress in order + * for the wait functions to succeed. This thread is only created when + * wait functions are called. In order to minimize performance + * impact, it only goes active during the time when wait calls are + * blocked. 
+ */ +static pthread_t psmx_wait_thread; +static pthread_mutex_t psmx_wait_mutex; +static pthread_cond_t psmx_wait_cond; +static volatile int psmx_wait_thread_ready = 0; +static volatile int psmx_wait_thread_enabled = 0; +static volatile int psmx_wait_thread_busy = 0; + +static void *psmx_wait_progress(void *args) +{ + struct psmx_fid_domain *domain = args; + + psmx_wait_thread_ready = 1; + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); + + while (1) { + pthread_mutex_lock(&psmx_wait_mutex); + pthread_cond_wait(&psmx_wait_cond, &psmx_wait_mutex); + pthread_mutex_unlock(&psmx_wait_mutex); + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); + + psmx_wait_thread_busy = 1; + while (psmx_wait_thread_enabled) + psmx_progress(domain); + + psmx_wait_thread_busy = 0; + + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + } + + return NULL; +} + +static void psmx_wait_start_progress(struct psmx_fid_domain *domain) +{ + pthread_attr_t attr; + int err; + + if (!domain) + return; + + if (!psmx_wait_thread) { + pthread_mutex_init(&psmx_wait_mutex, NULL); + pthread_cond_init(&psmx_wait_cond, NULL); + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_DETACHED); + err = pthread_create(&psmx_wait_thread, &attr, psmx_wait_progress, (void *)domain); + if (err) + fprintf(stderr, "%s: cannot create wait progress thread\n", __func__); + pthread_attr_destroy(&attr); + while (!psmx_wait_thread_ready) + ; + } + + psmx_wait_thread_enabled = 1; + pthread_cond_signal(&psmx_wait_cond); +} + +static void psmx_wait_stop_progress(void) +{ + psmx_wait_thread_enabled = 0; + + while (psmx_wait_thread_busy) + ; +} + int psmx_wait_get_obj(struct psmx_fid_wait *wait, void *arg) { void *obj_ptr; @@ -76,6 +148,9 @@ int psmx_wait_wait(struct fid_wait *wait, int timeout) int err = 0; wait_priv = container_of(wait, struct psmx_fid_wait, wait.fid); + + 
psmx_wait_start_progress(wait_priv->fabric->active_domain); + switch (wait_priv->type) { case FI_WAIT_UNSPEC: /* TODO: optimized custom wait */ @@ -98,6 +173,8 @@ int psmx_wait_wait(struct fid_wait *wait, int timeout) break; } + psmx_wait_stop_progress(); + return err; } diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock.h b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock.h index b31dcecd64..06ebcf6639 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock.h +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock.h @@ -58,22 +58,24 @@ #define SOCK_EP_MAX_MSG_SZ (1<<23) #define SOCK_EP_MAX_INJECT_SZ ((1<<8) - 1) -#define SOCK_EP_MAX_BUFF_RECV (1<<23) -#define SOCK_EP_MAX_ORDER_RAW_SZ (0) -#define SOCK_EP_MAX_ORDER_WAR_SZ (0) -#define SOCK_EP_MAX_ORDER_WAW_SZ (0) +#define SOCK_EP_MAX_BUFF_RECV (1<<20) +#define SOCK_EP_MAX_ORDER_RAW_SZ SOCK_EP_MAX_MSG_SZ +#define SOCK_EP_MAX_ORDER_WAR_SZ SOCK_EP_MAX_MSG_SZ +#define SOCK_EP_MAX_ORDER_WAW_SZ SOCK_EP_MAX_MSG_SZ #define SOCK_EP_MEM_TAG_FMT (0) #define SOCK_EP_MAX_EP_CNT (128) #define SOCK_EP_MAX_TX_CNT (16) #define SOCK_EP_MAX_RX_CNT (16) #define SOCK_EP_MAX_IOV_LIMIT (8) -#define SOCK_EP_MAX_TX_CTX_SZ (1<<12) +#define SOCK_EP_TX_SZ (256) +#define SOCK_EP_TX_ENTRY_SZ (256) #define SOCK_EP_MIN_MULTI_RECV (64) -#define SOCK_EP_MAX_ATOMIC_SZ (512) +#define SOCK_EP_MAX_ATOMIC_SZ (256) #define SOCK_EP_MAX_CTX_BITS (16) #define SOCK_PE_POLL_TIMEOUT (100000) #define SOCK_PE_MAX_ENTRIES (128) +#define SOCK_PE_MIN_ENTRIES (1) #define SOCK_EQ_DEF_SZ (1<<8) #define SOCK_CQ_DEF_SZ (1<<8) @@ -82,13 +84,18 @@ #define SOCK_CQ_DATA_SIZE (sizeof(uint64_t)) #define SOCK_TAG_SIZE (sizeof(uint64_t)) +#define SOCK_PEP_LISTENER_TIMEOUT (10000) +#define SOCK_CM_COMM_TIMEOUT (5000) +#define SOCK_EP_MAX_RETRY (5) +#define SOCK_EP_MAX_CM_DATA_SZ (256) -#define SOCK_EP_RDM_CAP (FI_MSG | FI_RMA | FI_TAGGED | FI_ATOMICS | FI_DYNAMIC_MR | \ - FI_NAMED_RX_CTX | FI_BUFFERED_RECV | FI_DIRECTED_RECV | 
\ - FI_INJECT | FI_MULTI_RECV | FI_SOURCE | FI_READ | FI_WRITE | \ - FI_RECV | FI_SEND | FI_REMOTE_READ | FI_REMOTE_WRITE | \ - FI_REMOTE_CQ_DATA | FI_COMPLETION | FI_REMOTE_SIGNAL | \ - FI_REMOTE_COMPLETE | FI_PEEK | FI_CANCEL) +#define SOCK_EP_RDM_CAP (FI_MSG | FI_RMA | FI_TAGGED | FI_ATOMICS | \ + FI_DYNAMIC_MR | FI_NAMED_RX_CTX | FI_BUFFERED_RECV | \ + FI_DIRECTED_RECV | FI_INJECT | FI_MULTI_RECV | \ + FI_SOURCE | FI_READ | FI_WRITE | FI_RECV | FI_SEND | \ + FI_REMOTE_READ | FI_REMOTE_WRITE | FI_REMOTE_CQ_DATA | \ + FI_COMPLETION | FI_REMOTE_SIGNAL | FI_REMOTE_COMPLETE | \ + FI_MORE | FI_CANCEL | FI_FENCE) #define SOCK_EP_MSG_CAP SOCK_EP_RDM_CAP @@ -96,14 +103,10 @@ FI_NAMED_RX_CTX | FI_BUFFERED_RECV | FI_DIRECTED_RECV | \ FI_INJECT | FI_MULTI_RECV | FI_SOURCE | FI_RECV | FI_SEND | \ FI_REMOTE_CQ_DATA | FI_COMPLETION | FI_REMOTE_SIGNAL | \ - FI_REMOTE_COMPLETE | FI_PEEK | FI_CANCEL) - -#define SOCK_DEF_OPS (FI_SEND | FI_RECV | \ - FI_BUFFERED_RECV | FI_READ | FI_WRITE | \ - FI_REMOTE_READ | FI_REMOTE_WRITE) - -#define SOCK_DGRAM_DEF_OPS (FI_SEND | FI_RECV | FI_BUFFERED_RECV) + FI_REMOTE_COMPLETE | FI_MORE | FI_CANCEL | \ + FI_FENCE) +#define SOCK_DEF_OPS (FI_SEND | FI_RECV | FI_BUFFERED_RECV) #define SOCK_EP_MSG_ORDER (FI_ORDER_RAR | FI_ORDER_RAW | FI_ORDER_RAS| \ FI_ORDER_WAR | FI_ORDER_WAW | FI_ORDER_WAS | \ @@ -117,6 +120,8 @@ #define SOCK_MAJOR_VERSION 1 #define SOCK_MINOR_VERSION 0 +#define SOCK_INJECT_OK(_flgs) (((_flgs) & FI_INJECT) && ((!(_flgs)) & FI_FENCE)) + struct sock_fabric{ struct fid_fabric fab_fid; atomic_t ref; @@ -137,6 +142,7 @@ struct sock_conn_map { int size; struct sock_domain *domain; fastlock_t lock; + struct sockaddr_storage curr_addr; }; struct sock_domain { @@ -145,6 +151,7 @@ struct sock_domain { struct sock_fabric *fab; fastlock_t lock; atomic_t ref; + short ep_count; struct sock_eq *eq; struct sock_eq *mr_eq; @@ -155,7 +162,7 @@ struct sock_domain { struct sock_conn_map r_cmap; pthread_t listen_thread; int listening; - int 
service; + char service[NI_MAXSERV]; int signal_fds[2]; struct sockaddr_storage src_addr; }; @@ -195,7 +202,8 @@ struct sock_mr { struct sock_av_addr { struct sockaddr_storage addr; uint8_t valid; - uint8_t reserved[7]; + uint16_t rem_ep_id; + uint8_t reserved[5]; }; struct sock_av_table_hdr { @@ -366,18 +374,17 @@ struct sock_comp { }; struct sock_ep { - union { - struct fid_ep ep; - struct fid_sep sep; - struct fid_pep pep; - } fid; + struct fid_ep ep; size_t fclass; uint64_t op_flags; uint8_t connected; + uint8_t tx_shared; + uint8_t rx_shared; + uint16_t ep_id; + uint16_t rem_ep_id; uint16_t buffered_len; uint16_t min_multi_recv; - char reserved[4]; atomic_t ref; struct sock_comp comp; @@ -407,24 +414,25 @@ struct sock_ep { struct sockaddr_in *dest_addr; fi_addr_t conn_addr; uint16_t key; + int socket; + + pthread_t listener_thread; + int do_listen; }; struct sock_pep { - struct fid_pep pep; + struct fid_pep pep; struct sock_fabric *sock_fab; - struct sock_domain *dom; + + int do_listen; + pthread_t listener_thread; + int signal_fds[2]; + int socket; + int listener_sock_fd; + + struct sockaddr_in src_addr; struct fi_info info; - - int sock_fd; - char service[NI_MAXSERV]; - - struct sock_eq *eq; - - struct sock_cq *send_cq; - struct sock_cq *recv_cq; - - uint64_t op_flags; - uint64_t pep_cap; + struct sock_eq *eq; }; struct sock_rx_entry { @@ -432,6 +440,7 @@ struct sock_rx_entry { uint8_t is_buffered; uint8_t is_busy; uint8_t is_claimed; + uint8_t is_complete; uint8_t reserved[5]; uint64_t used; @@ -523,10 +532,10 @@ struct sock_tx_ctx { struct sock_msg_hdr{ uint8_t version; uint8_t op_type; - uint16_t rx_id; - uint16_t pe_entry_id; + uint8_t rx_id; uint8_t dest_iov_len; - uint8_t reserved[1]; + uint16_t ep_id; + uint16_t pe_entry_id; uint64_t flags; uint64_t msg_len; @@ -660,7 +669,7 @@ struct sock_pe_entry{ struct sock_pe{ struct sock_domain *domain; - + int num_free_entries; struct sock_pe_entry pe_table[SOCK_PE_MAX_ENTRIES]; fastlock_t lock; @@ -700,10 
+709,16 @@ struct sock_cq { sock_cq_report_fn report_completion; }; -struct sock_conn_req { - int type; +struct sock_conn_hdr { + uint8_t type; + uint8_t reserved[7]; fid_t c_fid; fid_t s_fid; +}; + +struct sock_conn_req { + struct sock_conn_hdr hdr; + uint16_t ep_id; struct fi_info info; struct sockaddr_in src_addr; struct sockaddr_in dest_addr; @@ -712,14 +727,20 @@ struct sock_conn_req { struct fi_ep_attr ep_attr; struct fi_domain_attr domain_attr; struct fi_fabric_attr fabric_attr; + struct sockaddr_in from_addr; + char user_data[0]; +}; + +struct sock_conn_response { + struct sock_conn_hdr hdr; + char user_data[0]; }; enum { - SOCK_CONNREQ, - SOCK_ACCEPT, - SOCK_REJECT, - SOCK_CONNECTED, - SOCK_SHUTDOWN + SOCK_CONN_REQ, + SOCK_CONN_ACCEPT, + SOCK_CONN_REJECT, + SOCK_CONN_SHUTDOWN, }; int sock_verify_info(struct fi_info *hints); @@ -757,19 +778,20 @@ struct sock_conn *sock_ep_lookup_conn(struct sock_ep *ep); int sock_rdm_ep(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context); int sock_rdm_sep(struct fid_domain *domain, struct fi_info *info, - struct fid_sep **sep, void *context); + struct fid_ep **sep, void *context); int sock_dgram_ep(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context); int sock_dgram_sep(struct fid_domain *domain, struct fi_info *info, - struct fid_sep **sep, void *context); + struct fid_ep **sep, void *context); int sock_msg_ep(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context); int sock_msg_sep(struct fid_domain *domain, struct fi_info *info, - struct fid_sep **sep, void *context); + struct fid_ep **sep, void *context); int sock_msg_passive_ep(struct fid_fabric *fabric, struct fi_info *info, struct fid_pep **pep, void *context); +int sock_ep_enable(struct fid_ep *ep); int sock_stx_ctx(struct fid_domain *domain, @@ -791,8 +813,7 @@ ssize_t sock_eq_report_event(struct sock_eq *sock_eq, uint32_t event, const void *buf, size_t len, uint64_t flags); 
ssize_t sock_eq_report_error(struct sock_eq *sock_eq, fid_t fid, void *context, int err, int prov_errno, void *err_data); -int sock_eq_openwait(struct sock_eq *eq, char *service); -struct fi_info * sock_ep_msg_process_info(struct sock_conn_req *req); +int sock_eq_openwait(struct sock_eq *eq, const char *service); int sock_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, struct fid_cntr **cntr, void *context); @@ -816,10 +837,10 @@ struct sock_mr *sock_mr_verify_desc(struct sock_domain *domain, void *desc, struct sock_mr * sock_mr_get_entry(struct sock_domain *domain, uint16_t key); -struct sock_rx_ctx *sock_rx_ctx_alloc(struct fi_rx_attr *attr, void *context); +struct sock_rx_ctx *sock_rx_ctx_alloc(const struct fi_rx_attr *attr, void *context); void sock_rx_ctx_free(struct sock_rx_ctx *rx_ctx); -struct sock_tx_ctx *sock_tx_ctx_alloc(struct fi_tx_attr *attr, void *context); +struct sock_tx_ctx *sock_tx_ctx_alloc(const struct fi_tx_attr *attr, void *context); void sock_tx_ctx_free(struct sock_tx_ctx *tx_ctx); void sock_tx_ctx_start(struct sock_tx_ctx *tx_ctx); void sock_tx_ctx_write(struct sock_tx_ctx *tx_ctx, const void *buf, size_t len); @@ -842,8 +863,8 @@ fi_addr_t _sock_av_lookup(struct sock_av *av, struct sockaddr *addr); fi_addr_t sock_av_get_fiaddr(struct sock_av *av, struct sock_conn *conn); fi_addr_t sock_av_lookup_key(struct sock_av *av, int key); struct sock_conn *sock_av_lookup_addr(struct sock_av *av, fi_addr_t addr); -int sock_av_compare_addr(struct sock_av *av, - fi_addr_t addr1, fi_addr_t addr2); +int sock_av_compare_addr(struct sock_av *av, fi_addr_t addr1, fi_addr_t addr2); +uint16_t sock_av_lookup_ep_id(struct sock_av *av, fi_addr_t addr); struct sock_conn *sock_conn_map_lookup_key(struct sock_conn_map *conn_map, @@ -866,6 +887,8 @@ void sock_pe_add_tx_ctx(struct sock_pe *pe, struct sock_tx_ctx *ctx); void sock_pe_add_rx_ctx(struct sock_pe *pe, struct sock_rx_ctx *ctx); int sock_pe_progress_rx_ctx(struct sock_pe *pe, struct 
sock_rx_ctx *rx_ctx); int sock_pe_progress_tx_ctx(struct sock_pe *pe, struct sock_tx_ctx *tx_ctx); +void sock_pe_remove_tx_ctx(struct sock_tx_ctx *tx_ctx); +void sock_pe_remove_rx_ctx(struct sock_rx_ctx *rx_ctx); void sock_pe_finalize(struct sock_pe *pe); diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_atomic.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_atomic.c index c6d0a1e3c1..0be6ffae4b 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_atomic.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_atomic.c @@ -74,7 +74,7 @@ static ssize_t sock_ep_tx_atomic(struct fid_ep *ep, switch (ep->fid.fclass) { case FI_CLASS_EP: - sock_ep = container_of(ep, struct sock_ep, fid.ep); + sock_ep = container_of(ep, struct sock_ep, ep); tx_ctx = sock_ep->tx_ctx; break; @@ -97,11 +97,13 @@ static ssize_t sock_ep_tx_atomic(struct fid_ep *ep, } else { conn = sock_av_lookup_addr(tx_ctx->av, msg->addr); } - assert(conn); + + if (!conn) + return -FI_EAGAIN; src_len = 0; datatype_sz = fi_datatype_size(msg->datatype); - if (flags & FI_INJECT) { + if (SOCK_INJECT_OK(flags)) { for (i=0; i< msg->iov_count; i++) { src_len += (msg->msg_iov[i].count * datatype_sz); } @@ -130,7 +132,7 @@ static ssize_t sock_ep_tx_atomic(struct fid_ep *ep, tx_op.atomic.res_iov_len = result_count; tx_op.atomic.cmp_iov_len = compare_count; - if (flags & FI_INJECT) + if (SOCK_INJECT_OK(flags)) tx_op.src_iov_len = src_len; else tx_op.src_iov_len = msg->iov_count; @@ -147,7 +149,8 @@ static ssize_t sock_ep_tx_atomic(struct fid_ep *ep, sock_tx_ctx_write(tx_ctx, &msg->data, sizeof(uint64_t)); } - if (flags & FI_INJECT) { + src_len = 0; + if (SOCK_INJECT_OK(flags)) { for (i=0; i< msg->iov_count; i++) { sock_tx_ctx_write(tx_ctx, msg->msg_iov[i].addr, msg->msg_iov[i].count * datatype_sz); @@ -469,6 +472,7 @@ static int sock_ep_atomic_valid(struct fid_ep *ep, enum fi_datatype datatype, switch(datatype){ case FI_FLOAT: case FI_DOUBLE: + case 
FI_LONG_DOUBLE: if (op == FI_BOR || op == FI_BAND || op == FI_BXOR || op == FI_MSWAP) return -FI_ENOENT; @@ -476,7 +480,6 @@ static int sock_ep_atomic_valid(struct fid_ep *ep, enum fi_datatype datatype, case FI_FLOAT_COMPLEX: case FI_DOUBLE_COMPLEX: - case FI_LONG_DOUBLE: case FI_LONG_DOUBLE_COMPLEX: return -FI_ENOENT; default: diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_av.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_av.c index a758acdffe..f9529270e7 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_av.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_av.c @@ -133,6 +133,25 @@ struct sock_conn *sock_av_lookup_addr(struct sock_av *av, return sock_conn_map_lookup_key(av->cmap, av->key[idx]); } +uint16_t sock_av_lookup_ep_id(struct sock_av *av, fi_addr_t addr) +{ + int index = ((uint64_t)addr & av->mask); + struct sock_av_addr *av_addr; + + if (index >= av->table_hdr->stored || index < 0) { + return AF_INET; + } + + if (!av->cmap) { + SOCK_LOG_ERROR("EP with no AV bound\n"); + return 0; + } + + av_addr = idm_lookup(&av->addr_idm, index); + return av_addr->rem_ep_id; +} + + static inline void sock_av_report_success(struct sock_av *av, int *index, uint64_t flags) { @@ -158,6 +177,7 @@ static int sock_check_table_in(struct sock_av *_av, struct sockaddr_in *addr, char sa_ip[INET_ADDRSTRLEN]; struct sock_av_addr *av_addr; size_t new_count, table_sz; + uint16_t rem_ep_id; if ((_av->attr.flags & FI_EVENT) && !_av->eq) return -FI_ENOEQ; @@ -166,8 +186,13 @@ static int sock_check_table_in(struct sock_av *_av, struct sockaddr_in *addr, for (i = 0; i < count; i++) { for (j = 0; j < _av->table_hdr->stored; j++) { av_addr = &_av->table[j]; - if (memcmp(&av_addr->addr, &addr[i], - sizeof(struct sockaddr_in)) == 0) { + + rem_ep_id = ((struct sockaddr_in*)&addr[i])->sin_family; + ((struct sockaddr_in*)&addr[i])->sin_family = AF_INET; + + if ((memcmp(&av_addr->addr, &addr[i], + sizeof(struct 
sockaddr_in)) == 0) && + av_addr->rem_ep_id == rem_ep_id) { SOCK_LOG_INFO("Found addr in shared av\n"); if (idm_set(&_av->addr_idm, _av->key[j], av_addr) < 0) { if (fi_addr) @@ -215,6 +240,9 @@ static int sock_check_table_in(struct sock_av *_av, struct sockaddr_in *addr, } } + rem_ep_id = ((struct sockaddr_in*)&addr[i])->sin_family; + ((struct sockaddr_in*)&addr[i])->sin_family = AF_INET; + av_addr = &_av->table[_av->table_hdr->stored]; memcpy(sa_ip, inet_ntoa((&addr[i])->sin_addr), INET_ADDRSTRLEN); SOCK_LOG_INFO("AV-INSERT:src_addr: family: %d, IP is %s, port: %d\n", @@ -222,6 +250,7 @@ static int sock_check_table_in(struct sock_av *_av, struct sockaddr_in *addr, ntohs(((struct sockaddr_in*)&addr[i])->sin_port)); memcpy(&av_addr->addr, &addr[i], sizeof(struct sockaddr_in)); + av_addr->rem_ep_id = rem_ep_id; if (idm_set(&_av->addr_idm, _av->table_hdr->stored, av_addr) < 0) { if (fi_addr) fi_addr[i] = FI_ADDR_NOTAVAIL; diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_cntr.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_cntr.c index 364fe7f710..2bdadbebe9 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_cntr.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_cntr.c @@ -42,6 +42,7 @@ #include #include "sock.h" +#include "sock_util.h" const struct fi_cntr_attr sock_cntr_attr = { .events = FI_CNTR_EVENTS_COMP, @@ -56,6 +57,10 @@ int sock_cntr_progress(struct sock_cntr *cntr) struct sock_rx_ctx *rx_ctx; struct dlist_entry *entry; + if (cntr->domain->progress_mode == FI_PROGRESS_AUTO && + !sock_progress_thread_wait) + return 0; + for (entry = cntr->tx_list.next; entry != &cntr->tx_list; entry = entry->next) { tx_ctx = container_of(entry, struct sock_tx_ctx, cntr_entry); @@ -74,8 +79,7 @@ static uint64_t sock_cntr_read(struct fid_cntr *cntr) { struct sock_cntr *_cntr; _cntr = container_of(cntr, struct sock_cntr, cntr_fid); - if (_cntr->domain->progress_mode == FI_PROGRESS_MANUAL) - 
sock_cntr_progress(_cntr); + sock_cntr_progress(_cntr); return atomic_get(&_cntr->value); } diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_comm.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_comm.c index 9c1ef5c21e..4da1c0ebe4 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_comm.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_comm.c @@ -202,16 +202,21 @@ ssize_t sock_comm_peek(struct sock_conn *conn, void *buf, size_t len) int sock_comm_buffer_init(struct sock_conn *conn) { + int optval; uint64_t flags; socklen_t size = SOCK_COMM_BUF_SZ; socklen_t optlen = sizeof(socklen_t); + optval = 1; + setsockopt(conn->sock_fd, IPPROTO_TCP, TCP_NODELAY, + &optval, sizeof optval); + flags = fcntl(conn->sock_fd, F_GETFL, 0); fcntl(conn->sock_fd, F_SETFL, flags | O_NONBLOCK); rbinit(&conn->inbuf, SOCK_COMM_BUF_SZ); rbinit(&conn->outbuf, SOCK_COMM_BUF_SZ); - + setsockopt(conn->sock_fd, SOL_SOCKET, SO_RCVBUF, &size, optlen); setsockopt(conn->sock_fd, SOL_SOCKET, SO_SNDBUF, &size, optlen); @@ -221,10 +226,10 @@ int sock_comm_buffer_init(struct sock_conn *conn) optlen = sizeof(socklen_t); getsockopt(conn->sock_fd, SOL_SOCKET, SO_SNDBUF, &size, &optlen); SOCK_LOG_INFO("SO_SNDBUF: %d\n", size); + return 0; } - void sock_comm_buffer_finalize(struct sock_conn *conn) { rbfree(&conn->inbuf); diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_conn.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_conn.c index ded6f51cfd..1a192d5de5 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_conn.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_conn.c @@ -102,22 +102,27 @@ struct sock_conn *sock_conn_map_lookup_key(struct sock_conn_map *conn_map, #define SOCK_ADDR_IN_PORT(sa)SOCK_ADDR_IN_PTR(sa)->sin_port #define SOCK_ADDR_IN_ADDR(sa)SOCK_ADDR_IN_PTR(sa)->sin_addr +static int sock_compare_addr(struct sockaddr_in *addr1, + struct sockaddr_in *addr2) +{ + 
if ((SOCK_ADDR_IN_ADDR(addr1).s_addr == + SOCK_ADDR_IN_ADDR(addr2).s_addr) && + (SOCK_ADDR_IN_PORT(addr1) == SOCK_ADDR_IN_PORT(addr2))) + return 1; + return 0; +} + uint16_t sock_conn_map_lookup(struct sock_conn_map *map, struct sockaddr_in *addr) { int i; struct sockaddr_in *entry; - fastlock_acquire(&map->lock); for (i=0; i < map->used; i++) { entry = (struct sockaddr_in *)&(map->table[i].addr); - if ((SOCK_ADDR_IN_ADDR(entry).s_addr == - SOCK_ADDR_IN_ADDR(addr).s_addr) && - (SOCK_ADDR_IN_PORT(entry) == SOCK_ADDR_IN_PORT(addr))) { - fastlock_release(&map->lock); + if (sock_compare_addr(entry, addr)) { return i+1; } } - fastlock_release(&map->lock); return 0; } @@ -126,11 +131,8 @@ static int sock_conn_map_insert(struct sock_conn_map *map, int conn_fd) { int index; - fastlock_acquire(&map->lock); - if (map->size == map->used) { if (sock_conn_map_increase(map, map->size * 2)) { - fastlock_release(&map->lock); return 0; } } @@ -140,7 +142,6 @@ static int sock_conn_map_insert(struct sock_conn_map *map, map->table[index].sock_fd = conn_fd; sock_comm_buffer_init(&map->table[index]); map->used++; - fastlock_release(&map->lock); return index + 1; } @@ -172,6 +173,10 @@ uint16_t sock_conn_map_connect(struct sock_domain *dom, flags = fcntl(conn_fd, F_GETFL, 0); fcntl(conn_fd, F_SETFL, flags | O_NONBLOCK); + fastlock_acquire(&map->lock); + memcpy(&map->curr_addr, addr, sizeof(struct sockaddr_in)); + fastlock_release(&map->lock); + if (connect(conn_fd, addr, sizeof *addr) < 0) { if (errno == EINPROGRESS) { /* timeout after 5 secs */ @@ -224,12 +229,24 @@ uint16_t sock_conn_map_connect(struct sock_domain *dom, reply = ntohs(reply); SOCK_LOG_INFO("Connect response: %d\n", ntohs(reply)); + + if (reply == 0) { + fastlock_acquire(&map->lock); ret = sock_conn_map_insert(map, addr, conn_fd); + fastlock_release(&map->lock); } else { - ret = sock_conn_map_lookup(map, addr); + ret = 0; close(conn_fd); + SOCK_LOG_INFO("waiting for an accept\n"); + while (!ret) { + 
fastlock_acquire(&map->lock); + ret = sock_conn_map_lookup(map, addr); + fastlock_release(&map->lock); + } + SOCK_LOG_INFO("got accept\n"); } + return ret; } @@ -238,8 +255,13 @@ uint16_t sock_conn_map_match_or_connect(struct sock_domain *dom, struct sockaddr_in *addr) { uint16_t index; - return (index = sock_conn_map_lookup(map, addr)) ? - index : sock_conn_map_connect(dom, map, addr); + fastlock_acquire(&map->lock); + index = sock_conn_map_lookup(map, addr); + fastlock_release(&map->lock); + + if (!index) + index = sock_conn_map_connect(dom, map, addr); + return index; } static void *_sock_conn_listen(void *arg) @@ -253,10 +275,9 @@ static void *_sock_conn_listen(void *arg) struct sockaddr_in remote; socklen_t addr_size; struct pollfd poll_fds[2]; - char service[NI_MAXSERV]; struct sockaddr_in addr; char sa_ip[INET_ADDRSTRLEN]; - unsigned short port; + unsigned short port, response; uint16_t index; memset(&hints, 0, sizeof(hints)); @@ -264,14 +285,14 @@ static void *_sock_conn_listen(void *arg) hints.ai_socktype = SOCK_STREAM; hints.ai_flags = AI_PASSIVE; - sprintf(service, "%d", domain->service); - if(getaddrinfo(NULL, service, &hints, &s_res)) { - SOCK_LOG_ERROR("no available AF_INET address\n"); - perror("no available AF_INET address"); + ret = getaddrinfo(NULL, domain->service, &hints, &s_res); + if (ret) { + SOCK_LOG_ERROR("no available AF_INET address, service %s, %s\n", + domain->service, gai_strerror(ret)); return NULL; } - SOCK_LOG_INFO("Binding listener thread to port: %d\n", domain->service); + SOCK_LOG_INFO("Binding listener thread to port: %s\n", domain->service); for (p=s_res; p; p=p->ai_next) { listen_fd = socket(p->ai_family, p->ai_socktype, p->ai_protocol); if (listen_fd >= 0) { @@ -291,16 +312,17 @@ static void *_sock_conn_listen(void *arg) freeaddrinfo(s_res); if (listen_fd < 0) { - SOCK_LOG_ERROR("failed to listen to port: %d\n", domain->service); + SOCK_LOG_ERROR("failed to listen to port: %s\n", domain->service); goto err; } - if 
(domain->service == 0) { + if (atoi(domain->service) == 0) { addr_size = sizeof(struct sockaddr_in); if (getsockname(listen_fd, (struct sockaddr*)&addr, &addr_size)) goto err; - domain->service = ntohs(addr.sin_port); - SOCK_LOG_INFO("Bound to port: %d\n", domain->service); + snprintf(domain->service, sizeof domain->service, "%d", + ntohs(addr.sin_port)); + SOCK_LOG_INFO("Bound to port: %s\n", domain->service); } if (listen(listen_fd, 0)) { @@ -309,7 +331,7 @@ static void *_sock_conn_listen(void *arg) } ((struct sockaddr_in*)&(domain->src_addr))->sin_port = - htons(domain->service); + htons(atoi(domain->service)); domain->listening = 1; poll_fds[0].fd = listen_fd; @@ -343,15 +365,34 @@ static void *_sock_conn_listen(void *arg) remote.sin_port = port; SOCK_LOG_INFO("Remote port: %d\n", ntohs(port)); - index = sock_conn_map_lookup(map, &remote); - port = (index) ? 1 : 0; - ret = send(conn_fd, &port, sizeof(port), 0); - if (ret != sizeof(port)) - SOCK_LOG_ERROR("Cannot exchange port\n"); - if (index == 0) + fastlock_acquire(&map->lock); + index = sock_conn_map_lookup(map, &remote); + response = (index) ? 
1 : 0; + if (response == 0) { + if (sock_compare_addr((struct sockaddr_in*)&map->curr_addr, + &remote)) { + ret = memcmp(&domain->src_addr, &remote, + sizeof(struct sockaddr_in)); + + if (ret > 0 || + (ret == 0 && atoi(domain->service) > port)) { + response = 1; + SOCK_LOG_INFO("Rejecting accept\n"); + } + } + } + fastlock_release(&map->lock); + + ret = send(conn_fd, &response, sizeof(response), 0); + if (ret != sizeof(response)) + SOCK_LOG_ERROR("Cannot exchange port\n"); + + if (!response) { + fastlock_acquire(&map->lock); sock_conn_map_insert(map, &remote, conn_fd); - else + fastlock_release(&map->lock); + } else close(conn_fd); } diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_cq.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_cq.c index 45ec697f78..05abc5fe79 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_cq.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_cq.c @@ -55,6 +55,10 @@ int sock_cq_progress(struct sock_cq *cq) struct sock_rx_ctx *rx_ctx; struct dlist_entry *entry; + if (cq->domain->progress_mode == FI_PROGRESS_AUTO && + !sock_progress_thread_wait) + return 0; + for (entry = cq->tx_list.next; entry != &cq->tx_list; entry = entry->next) { tx_ctx = container_of(entry, struct sock_tx_ctx, cq_entry); @@ -111,13 +115,15 @@ static ssize_t _sock_cq_write(struct sock_cq *cq, fi_addr_t addr, goto out; } - rbfdwrite(&cq->cq_rbfd, buf, len); - rbfdcommit(&cq->cq_rbfd); - ret = len; rbwrite(&cq->addr_rb, &addr, sizeof(fi_addr_t)); rbcommit(&cq->addr_rb); + + rbfdwrite(&cq->cq_rbfd, buf, len); + rbfdcommit(&cq->cq_rbfd); + ret = len; + if (cq->signal) sock_wait_signal(cq->waitset); out: @@ -218,16 +224,31 @@ static void sock_cq_set_report_fn(struct sock_cq *sock_cq) } } +static inline ssize_t sock_cq_rbuf_read(struct sock_cq *cq, void *buf, + size_t count, fi_addr_t *src_addr, + size_t cq_entry_len) +{ + ssize_t i; + fi_addr_t addr; + + rbfdread(&cq->cq_rbfd, buf, cq_entry_len * count); 
+ for(i = 0; i < count; i++) { + rbread(&cq->addr_rb, &addr, sizeof(fi_addr_t)); + if (src_addr) + src_addr[i] = addr; + } + return count; +} + ssize_t sock_cq_sreadfrom(struct fid_cq *cq, void *buf, size_t count, fi_addr_t *src_addr, const void *cond, int timeout) { - int ret; - fi_addr_t addr; + int ret = 0; int64_t threshold; struct timeval now; struct sock_cq *sock_cq; double start_ms, end_ms; - ssize_t i, bytes_read, num_read, cq_entry_len; + ssize_t cq_entry_len, avail; sock_cq = container_of(cq, struct sock_cq, cq_fid); cq_entry_len = sock_cq->cq_entry_size; @@ -246,8 +267,9 @@ ssize_t sock_cq_sreadfrom(struct fid_cq *cq, void *buf, size_t count, timeout -= (end_ms - start_ms); timeout = timeout < 0 ? 0 : timeout; } - } - + } else + sock_cq_progress(sock_cq); + if (sock_cq->attr.wait_cond == FI_CQ_COND_THRESHOLD) { threshold = MIN((int64_t)cond, count); }else{ @@ -255,23 +277,21 @@ ssize_t sock_cq_sreadfrom(struct fid_cq *cq, void *buf, size_t count, } fastlock_acquire(&sock_cq->lock); - bytes_read = rbfdsread(&sock_cq->cq_rbfd, buf, - cq_entry_len*threshold, timeout); - - if (bytes_read == 0) { - ret = -FI_ETIMEDOUT; - goto out; - } - - num_read = bytes_read/cq_entry_len; - for(i=0; i < num_read; i++) { - rbread(&sock_cq->addr_rb, &addr, sizeof(fi_addr_t)); - if (src_addr) - src_addr[i] = addr; - } - ret = num_read; -out: + if ((avail = rbfdused(&sock_cq->cq_rbfd))) + ret = sock_cq_rbuf_read(sock_cq, buf, + MIN(threshold, avail / cq_entry_len), + src_addr, cq_entry_len); fastlock_release(&sock_cq->lock); + + if (ret == 0) { + ret = rbfdwait(&sock_cq->cq_rbfd, timeout); + fastlock_acquire(&sock_cq->lock); + if (ret != -FI_ETIMEDOUT && (avail = rbfdused(&sock_cq->cq_rbfd))) + ret = sock_cq_rbuf_read(sock_cq, buf, + MIN(threshold, avail / cq_entry_len), + src_addr, cq_entry_len); + fastlock_release(&sock_cq->lock); + } return ret; } diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ctx.c 
b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ctx.c index f92e090746..f2d19fad79 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ctx.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ctx.c @@ -41,7 +41,7 @@ #include "sock_util.h" -struct sock_rx_ctx *sock_rx_ctx_alloc(struct fi_rx_attr *attr, void *context) +struct sock_rx_ctx *sock_rx_ctx_alloc(const struct fi_rx_attr *attr, void *context) { struct sock_rx_ctx *rx_ctx; rx_ctx = calloc(1, sizeof(*rx_ctx)); @@ -71,7 +71,7 @@ void sock_rx_ctx_free(struct sock_rx_ctx *rx_ctx) free(rx_ctx); } -static struct sock_tx_ctx *sock_tx_context_alloc(struct fi_tx_attr *attr, +static struct sock_tx_ctx *sock_tx_context_alloc(const struct fi_tx_attr *attr, void *context, size_t fclass) { struct sock_tx_ctx *tx_ctx; @@ -80,7 +80,9 @@ static struct sock_tx_ctx *sock_tx_context_alloc(struct fi_tx_attr *attr, if (!tx_ctx) return NULL; - if (rbfdinit(&tx_ctx->rbfd, attr->size)) + if (rbfdinit(&tx_ctx->rbfd, + (attr->size) ? 
attr->size : + SOCK_EP_TX_SZ * SOCK_EP_TX_ENTRY_SZ)) goto err; dlist_init(&tx_ctx->cq_entry); @@ -99,7 +101,7 @@ static struct sock_tx_ctx *sock_tx_context_alloc(struct fi_tx_attr *attr, tx_ctx->fid.ctx.fid.context = context; break; case FI_CLASS_STX_CTX: - tx_ctx->fid.stx.fid.fclass = FI_CLASS_TX_CTX; + tx_ctx->fid.stx.fid.fclass = FI_CLASS_STX_CTX; tx_ctx->fid.stx.fid.context = context; break; default: @@ -114,12 +116,12 @@ err: } -struct sock_tx_ctx *sock_tx_ctx_alloc(struct fi_tx_attr *attr, void *context) +struct sock_tx_ctx *sock_tx_ctx_alloc(const struct fi_tx_attr *attr, void *context) { return sock_tx_context_alloc(attr, context, FI_CLASS_TX_CTX); } -struct sock_tx_ctx *sock_stx_ctx_alloc(struct fi_tx_attr *attr, void *context) +struct sock_tx_ctx *sock_stx_ctx_alloc(const struct fi_tx_attr *attr, void *context) { return sock_tx_context_alloc(attr, context, FI_CLASS_STX_CTX); } @@ -151,6 +153,6 @@ void sock_tx_ctx_commit(struct sock_tx_ctx *tx_ctx) void sock_tx_ctx_abort(struct sock_tx_ctx *tx_ctx) { rbfdabort(&tx_ctx->rbfd); - fastlock_release(&tx_ctx->rlock); + fastlock_release(&tx_ctx->wlock); } diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_dom.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_dom.c index f66d72921d..0ca03f6b8b 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_dom.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_dom.c @@ -47,7 +47,7 @@ const struct fi_domain_attr sock_domain_attr = { .threading = FI_THREAD_SAFE, .control_progress = FI_PROGRESS_AUTO, .data_progress = FI_PROGRESS_AUTO, - .mr_key_size = 0, + .mr_key_size = sizeof(uint16_t), .cq_data_size = sizeof(uint64_t), .ep_cnt = SOCK_EP_MAX_EP_CNT, .tx_ctx_cnt = SOCK_EP_MAX_TX_CNT, @@ -72,6 +72,7 @@ int sock_verify_domain_attr(struct fi_domain_attr *attr) case FI_THREAD_FID: case FI_THREAD_DOMAIN: case FI_THREAD_COMPLETION: + case FI_THREAD_ENDPOINT: break; default: SOCK_LOG_INFO("Invalid threading model!\n"); 
@@ -180,13 +181,15 @@ static int sock_mr_bind(struct fid *fid, struct fid *bfid, uint64_t flags) case FI_CLASS_CQ: cq = container_of(bfid, struct sock_cq, cq_fid.fid); assert(mr->domain == cq->domain); - mr->cq = cq; + if (flags & FI_REMOTE_WRITE) + mr->cq = cq; break; case FI_CLASS_CNTR: cntr = container_of(bfid, struct sock_cntr, cntr_fid.fid); assert(mr->domain == cntr->domain); - mr->cntr = cntr; + if (flags & FI_REMOTE_WRITE) + mr->cntr = cntr; break; default: @@ -361,7 +364,7 @@ int sock_endpoint(struct fid_domain *domain, struct fi_info *info, } int sock_scalable_ep(struct fid_domain *domain, struct fi_info *info, - struct fid_sep **sep, void *context) + struct fid_ep **sep, void *context) { switch (info->ep_type) { case FI_EP_RDM: @@ -406,7 +409,6 @@ int sock_domain(struct fid_fabric *fabric, struct fi_info *info, struct fid_domain **dom, void *context) { int ret, flags; - char service[NI_MAXSERV]; struct sock_domain *sock_domain; if(info && info->domain_attr){ @@ -424,11 +426,11 @@ int sock_domain(struct fid_fabric *fabric, struct fi_info *info, if(info && info->src_addr) { if (getnameinfo(info->src_addr, info->src_addrlen, NULL, 0, - service, sizeof(service), NI_NUMERICSERV)) { + sock_domain->service, sizeof(sock_domain->service), + NI_NUMERICSERV)) { SOCK_LOG_ERROR("could not resolve src_addr\n"); goto err; } - sock_domain->service = atoi(service); sock_domain->info = *info; memcpy(&sock_domain->src_addr, info->src_addr, sizeof(struct sockaddr_in)); @@ -455,6 +457,7 @@ int sock_domain(struct fid_fabric *fabric, struct fi_info *info, goto err; } + sock_domain->ep_count = AF_INET; sock_domain->r_cmap.domain = sock_domain; fastlock_init(&sock_domain->r_cmap.lock); if(socketpair(AF_UNIX, SOCK_STREAM, 0, sock_domain->signal_fds) < 0) diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep.c index b3d1dfadbb..c37f4e023b 100644 --- 
a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep.c @@ -55,20 +55,25 @@ extern const struct fi_fabric_attr sock_fabric_attr; extern const char const sock_fab_name[]; extern const char const sock_dom_name[]; +extern const char const sock_prov_name[]; -static void sock_dequeue_tx_ctx(struct sock_tx_ctx *tx_ctx) -{ - fastlock_acquire(&tx_ctx->domain->pe->lock); - dlist_remove(&tx_ctx->pe_entry); - fastlock_release(&tx_ctx->domain->pe->lock); -} +const struct fi_tx_attr sock_stx_attr = { + .caps = SOCK_EP_RDM_CAP, + .op_flags = SOCK_DEF_OPS, + .msg_order = SOCK_EP_MSG_ORDER, + .inject_size = SOCK_EP_MAX_INJECT_SZ, + .size = SOCK_EP_TX_SZ, + .iov_limit = SOCK_EP_MAX_IOV_LIMIT, +}; -static void sock_dequeue_rx_ctx(struct sock_rx_ctx *rx_ctx) -{ - fastlock_acquire(&rx_ctx->domain->pe->lock); - dlist_remove(&rx_ctx->pe_entry); - fastlock_release(&rx_ctx->domain->pe->lock); -} +const struct fi_rx_attr sock_srx_attr = { + .caps = SOCK_EP_RDM_CAP, + .op_flags = SOCK_DEF_OPS, + .msg_order = SOCK_EP_MSG_ORDER, + .total_buffered_recv = SOCK_EP_MAX_BUFF_RECV, + .size = SOCK_EP_MAX_MSG_SZ, + .iov_limit = SOCK_EP_MAX_IOV_LIMIT, +}; static int sock_ctx_close(struct fid *fid) { @@ -78,7 +83,7 @@ static int sock_ctx_close(struct fid *fid) switch (fid->fclass) { case FI_CLASS_TX_CTX: tx_ctx = container_of(fid, struct sock_tx_ctx, fid.ctx.fid); - sock_dequeue_tx_ctx(tx_ctx); + sock_pe_remove_tx_ctx(tx_ctx); atomic_dec(&tx_ctx->ep->num_rx_ctx); atomic_dec(&tx_ctx->domain->ref); sock_tx_ctx_free(tx_ctx); @@ -86,7 +91,7 @@ static int sock_ctx_close(struct fid *fid) case FI_CLASS_RX_CTX: rx_ctx = container_of(fid, struct sock_rx_ctx, ctx.fid); - sock_dequeue_rx_ctx(rx_ctx); + sock_pe_remove_rx_ctx(rx_ctx); atomic_dec(&rx_ctx->ep->num_rx_ctx); atomic_dec(&rx_ctx->domain->ref); sock_rx_ctx_free(rx_ctx); @@ -95,14 +100,14 @@ static int sock_ctx_close(struct fid *fid) case FI_CLASS_STX_CTX: tx_ctx = 
container_of(fid, struct sock_tx_ctx, fid.stx.fid); atomic_dec(&tx_ctx->domain->ref); - sock_dequeue_tx_ctx(tx_ctx); + sock_pe_remove_tx_ctx(tx_ctx); sock_tx_ctx_free(tx_ctx); break; case FI_CLASS_SRX_CTX: rx_ctx = container_of(fid, struct sock_rx_ctx, ctx.fid); atomic_dec(&rx_ctx->domain->ref); - sock_dequeue_rx_ctx(rx_ctx); + sock_pe_remove_rx_ctx(rx_ctx); sock_rx_ctx_free(rx_ctx); break; @@ -141,10 +146,6 @@ static int sock_ctx_bind_cq(struct fid *fid, struct fid *bfid, uint64_t flags) tx_ctx->comp.write_cq_event = 1; } - if (!tx_ctx->progress) { - tx_ctx->progress = 1; - sock_pe_add_tx_ctx(tx_ctx->domain->pe, tx_ctx); - } dlist_insert_tail(&tx_ctx->cq_entry, &sock_cq->tx_list); break; @@ -168,10 +169,6 @@ static int sock_ctx_bind_cq(struct fid *fid, struct fid *bfid, uint64_t flags) rx_ctx->comp.rem_write_cq_event = 1; } - if (!rx_ctx->progress) { - rx_ctx->progress = 1; - sock_pe_add_rx_ctx(rx_ctx->domain->pe, rx_ctx); - } dlist_insert_tail(&rx_ctx->cq_entry, &sock_cq->rx_list); break; @@ -195,10 +192,6 @@ static int sock_ctx_bind_cq(struct fid *fid, struct fid *bfid, uint64_t flags) tx_ctx->comp.write_cq_event = 1; } - if (!tx_ctx->progress) { - tx_ctx->progress = 1; - sock_pe_add_tx_ctx(tx_ctx->domain->pe, tx_ctx); - } dlist_insert_tail(&tx_ctx->cq_entry, &sock_cq->tx_list); break; @@ -228,10 +221,6 @@ static int sock_ctx_bind_cntr(struct fid *fid, struct fid *bfid, uint64_t flags) if (flags & FI_WRITE) tx_ctx->comp.write_cntr = cntr; - if (!tx_ctx->progress) { - tx_ctx->progress = 1; - sock_pe_add_tx_ctx(tx_ctx->domain->pe, tx_ctx); - } dlist_insert_tail(&tx_ctx->cntr_entry, &cntr->tx_list); break; @@ -247,10 +236,6 @@ static int sock_ctx_bind_cntr(struct fid *fid, struct fid *bfid, uint64_t flags) if (flags & FI_REMOTE_WRITE) rx_ctx->comp.rem_write_cntr = cntr; - if (!rx_ctx->progress) { - rx_ctx->progress = 1; - sock_pe_add_rx_ctx(rx_ctx->domain->pe, rx_ctx); - } dlist_insert_tail(&rx_ctx->cntr_entry, &cntr->rx_list); break; @@ -265,10 +250,6 @@ static 
int sock_ctx_bind_cntr(struct fid *fid, struct fid *bfid, uint64_t flags) if (flags & FI_WRITE) tx_ctx->comp.write_cntr = cntr; - if (!tx_ctx->progress) { - tx_ctx->progress = 1; - sock_pe_add_tx_ctx(tx_ctx->domain->pe, tx_ctx); - } dlist_insert_tail(&tx_ctx->cntr_entry, &cntr->tx_list); break; @@ -289,6 +270,9 @@ static int sock_ctx_bind(struct fid *fid, struct fid *bfid, uint64_t flags) case FI_CLASS_CNTR: return sock_ctx_bind_cntr(fid, bfid, flags); + case FI_CLASS_MR: + return 0; + default: SOCK_LOG_ERROR("Invalid bind()\n"); return -FI_EINVAL; @@ -367,11 +351,19 @@ static int sock_ctx_enable(struct fid_ep *ep) case FI_CLASS_RX_CTX: rx_ctx = container_of(ep, struct sock_rx_ctx, ctx.fid); rx_ctx->enabled = 1; + if (!rx_ctx->progress) { + sock_pe_add_rx_ctx(rx_ctx->domain->pe, rx_ctx); + rx_ctx->progress = 1; + } return 0; case FI_CLASS_TX_CTX: tx_ctx = container_of(ep, struct sock_tx_ctx, fid.ctx.fid); tx_ctx->enabled = 1; + if (!tx_ctx->progress) { + sock_pe_add_tx_ctx(tx_ctx->domain->pe, tx_ctx); + tx_ctx->progress = 1; + } return 0; default: @@ -422,31 +414,12 @@ static int sock_ctx_setopt(fid_t fid, int level, int optname, return 0; } -static ssize_t sock_ep_cancel(fid_t fid, void *context) +static ssize_t sock_rx_ctx_cancel(struct sock_rx_ctx *rx_ctx, void *context) { - int ret; - struct sock_rx_ctx *rx_ctx; - struct sock_rx_entry *rx_entry; - struct sock_ep *sock_ep; struct dlist_entry *entry; + ssize_t ret = -FI_ENOENT; + struct sock_rx_entry *rx_entry; - switch (fid->fclass) { - case FI_CLASS_EP: - sock_ep = container_of(fid, struct sock_ep, fid.ep.fid); - rx_ctx = sock_ep->rx_ctx; - break; - - case FI_CLASS_RX_CTX: - case FI_CLASS_SRX_CTX: - rx_ctx = container_of(fid, struct sock_rx_ctx, ctx.fid); - break; - - default: - SOCK_LOG_ERROR("Invalid ep type\n"); - return -FI_EINVAL; - } - - ret = -FI_ENOENT; fastlock_acquire(&rx_ctx->lock); for (entry = rx_ctx->rx_entry_list.next; entry != &rx_ctx->rx_entry_list; entry = entry->next) { @@ -454,7 +427,7 @@ 
static ssize_t sock_ep_cancel(fid_t fid, void *context) rx_entry = container_of(entry, struct sock_rx_entry, entry); if (rx_entry->is_busy || rx_entry->used) continue; - + if ((uint64_t)context == rx_entry->context) { dlist_remove(&rx_entry->entry); sock_rx_release_entry(rx_entry); @@ -466,6 +439,35 @@ static ssize_t sock_ep_cancel(fid_t fid, void *context) return ret; } +static ssize_t sock_ep_cancel(fid_t fid, void *context) +{ + struct sock_rx_ctx *rx_ctx = NULL; + struct sock_ep *sock_ep; + + switch (fid->fclass) { + case FI_CLASS_EP: + sock_ep = container_of(fid, struct sock_ep, ep.fid); + rx_ctx = sock_ep->rx_ctx; + break; + + case FI_CLASS_RX_CTX: + case FI_CLASS_SRX_CTX: + rx_ctx = container_of(fid, struct sock_rx_ctx, ctx.fid); + sock_ep = rx_ctx->ep; + break; + + default: + SOCK_LOG_ERROR("Invalid ep type\n"); + return -FI_EINVAL; + } + + if (!(sock_ep->info.caps & FI_CANCEL)) { + return -FI_EINVAL; + } + + return sock_rx_ctx_cancel(rx_ctx, context); +} + struct fi_ops_ep sock_ctx_ep_ops = { .size = sizeof(struct fi_ops_ep), .enable = sock_ctx_enable, @@ -474,6 +476,8 @@ struct fi_ops_ep sock_ctx_ep_ops = { .setopt = sock_ctx_setopt, .tx_ctx = fi_no_tx_ctx, .rx_ctx = fi_no_rx_ctx, + .rx_size_left = fi_no_rx_size_left, + .tx_size_left = fi_no_tx_size_left, }; static int sock_ep_close(struct fid *fid) @@ -482,16 +486,13 @@ static int sock_ep_close(struct fid *fid) switch(fid->fclass) { case FI_CLASS_EP: - sock_ep = container_of(fid, struct sock_ep, fid.ep.fid); + sock_ep = container_of(fid, struct sock_ep, ep.fid); break; case FI_CLASS_SEP: - sock_ep = container_of(fid, struct sock_ep, fid.sep.fid); + sock_ep = container_of(fid, struct sock_ep, ep.fid); break; - case FI_CLASS_PEP: - sock_ep = container_of(fid, struct sock_ep, fid.pep.fid); - break; default: return -FI_EINVAL; } @@ -500,15 +501,13 @@ static int sock_ep_close(struct fid *fid) atomic_get(&sock_ep->num_tx_ctx)) return -FI_EBUSY; - if (sock_ep->fclass != FI_CLASS_SEP && - 
sock_ep->ep_attr.tx_ctx_cnt != FI_SHARED_CONTEXT) { - sock_dequeue_tx_ctx(sock_ep->tx_array[0]); + if (sock_ep->fclass != FI_CLASS_SEP && !sock_ep->tx_shared) { + sock_pe_remove_tx_ctx(sock_ep->tx_array[0]); sock_tx_ctx_free(sock_ep->tx_array[0]); } - if (sock_ep->fclass != FI_CLASS_SEP && - sock_ep->ep_attr.rx_ctx_cnt != FI_SHARED_CONTEXT) { - sock_dequeue_rx_ctx(sock_ep->rx_array[0]); + if (sock_ep->fclass != FI_CLASS_SEP && !sock_ep->rx_shared) { + sock_pe_remove_rx_ctx(sock_ep->rx_array[0]); sock_rx_ctx_free(sock_ep->rx_array[0]); } @@ -538,16 +537,13 @@ static int sock_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) switch(fid->fclass) { case FI_CLASS_EP: - ep = container_of(fid, struct sock_ep, fid.ep.fid); + ep = container_of(fid, struct sock_ep, ep.fid); break; case FI_CLASS_SEP: - ep = container_of(fid, struct sock_ep, fid.sep.fid); + ep = container_of(fid, struct sock_ep, ep.fid); break; - case FI_CLASS_PEP: - ep = container_of(fid, struct sock_ep, fid.pep.fid); - break; default: return -FI_EINVAL; } @@ -556,12 +552,10 @@ static int sock_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) case FI_CLASS_EQ: eq = container_of(bfid, struct sock_eq, eq.fid); ep->eq = eq; - if ((eq->attr.wait_obj == FI_WAIT_FD) && (eq->wait_fd < 0)) - sock_eq_openwait(eq, (char *)&ep->domain->service); break; case FI_CLASS_MR: - return -FI_EINVAL; + return 0; case FI_CLASS_CQ: cq = container_of(bfid, struct sock_cq, cq_fid.fid); @@ -621,9 +615,32 @@ static int sock_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) for (i = 0; i < ep->ep_attr.rx_ctx_cnt; i++) { rx_ctx = ep->rx_array[i]; - if (!rx_ctx) + if (!rx_ctx) continue; + if (rx_ctx->ctx.fid.fclass == FI_CLASS_SRX_CTX) { + if (flags & FI_RECV) { + ep->comp.recv_cq = cq; + if (flags & FI_COMPLETION) + ep->comp.recv_cq_event = 1; + } + + if (flags & FI_REMOTE_READ) { + ep->comp.rem_read_cq = cq; + if (flags & FI_COMPLETION) + ep->comp.rem_read_cq_event = 1; + } + + if (flags & FI_REMOTE_WRITE) { + 
ep->comp.rem_write_cq = cq; + if (flags & FI_COMPLETION) + ep->comp.rem_write_cq_event = 1; + } + + dlist_insert_tail(&rx_ctx->cq_entry, &cq->rx_list); + continue; + } + if ((ret = sock_ctx_bind_cq(&rx_ctx->ctx.fid, bfid, flags))) return ret; @@ -673,6 +690,21 @@ static int sock_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) if (!rx_ctx) continue; + + if (rx_ctx->ctx.fid.fclass == FI_CLASS_SRX_CTX) { + + if (flags & FI_RECV) + rx_ctx->comp.recv_cntr = cntr; + + if (flags & FI_REMOTE_READ) + rx_ctx->comp.rem_read_cntr = cntr; + + if (flags & FI_REMOTE_WRITE) + rx_ctx->comp.rem_write_cntr = cntr; + + dlist_insert_tail(&rx_ctx->cntr_entry, &cntr->rx_list); + continue; + } if ((ret = sock_ctx_bind_cntr(&rx_ctx->ctx.fid, bfid, flags))) @@ -737,16 +769,13 @@ static int sock_ep_control(struct fid *fid, int command, void *arg) switch(fid->fclass) { case FI_CLASS_EP: - ep = container_of(fid, struct sock_ep, fid.ep.fid); + ep = container_of(fid, struct sock_ep, ep.fid); break; case FI_CLASS_SEP: - ep = container_of(fid, struct sock_ep, fid.sep.fid); + ep = container_of(fid, struct sock_ep, ep.fid); break; - case FI_CLASS_PEP: - ep = container_of(fid, struct sock_ep, fid.pep.fid); - break; default: return -FI_EINVAL; } @@ -759,7 +788,7 @@ static int sock_ep_control(struct fid *fid, int command, void *arg) return -FI_ENOMEM; *new_ep = *ep; new_ep->op_flags = alias->flags; - *alias->fid = &new_ep->fid.ep.fid; + *alias->fid = &new_ep->ep.fid; break; case FI_GETOPSFLAG: @@ -785,29 +814,49 @@ struct fi_ops sock_ep_fi_ops = { .ops_open = fi_no_ops_open, }; -static int sock_ep_enable(struct fid_ep *ep) +int sock_ep_enable(struct fid_ep *ep) { int i; struct sock_ep *sock_ep; - sock_ep = container_of(ep, struct sock_ep, fid.ep); + sock_ep = container_of(ep, struct sock_ep, ep); if (sock_ep->tx_ctx && - sock_ep->tx_ctx->fid.ctx.fid.fclass == FI_CLASS_TX_CTX) + sock_ep->tx_ctx->fid.ctx.fid.fclass == FI_CLASS_TX_CTX) { sock_ep->tx_ctx->enabled = 1; + if 
(!sock_ep->tx_ctx->progress) { + sock_pe_add_tx_ctx(sock_ep->domain->pe, sock_ep->tx_ctx); + sock_ep->tx_ctx->progress = 1; + } + } if (sock_ep->rx_ctx && - sock_ep->rx_ctx->ctx.fid.fclass == FI_CLASS_RX_CTX) + sock_ep->rx_ctx->ctx.fid.fclass == FI_CLASS_RX_CTX) { sock_ep->rx_ctx->enabled = 1; + if (!sock_ep->rx_ctx->progress) { + sock_pe_add_rx_ctx(sock_ep->domain->pe, sock_ep->rx_ctx); + sock_ep->rx_ctx->progress = 1; + } + } for (i = 0; i < sock_ep->ep_attr.tx_ctx_cnt; i++) { - if (sock_ep->tx_array[i]) + if (sock_ep->tx_array[i]) { sock_ep->tx_array[i]->enabled = 1; + if (!sock_ep->tx_array[i]->progress) { + sock_pe_add_tx_ctx(sock_ep->domain->pe, sock_ep->tx_array[i]); + sock_ep->tx_array[i]->progress = 1; + } + } } for (i = 0; i < sock_ep->ep_attr.rx_ctx_cnt; i++) { - if (sock_ep->rx_array[i]) + if (sock_ep->rx_array[i]) { sock_ep->rx_array[i]->enabled = 1; + if (!sock_ep->rx_array[i]->progress) { + sock_pe_add_rx_ctx(sock_ep->domain->pe, sock_ep->rx_array[i]); + sock_ep->rx_array[i]->progress = 1; + } + } } return 0; } @@ -816,7 +865,7 @@ static int sock_ep_getopt(fid_t fid, int level, int optname, void *optval, size_t *optlen) { struct sock_ep *sock_ep; - sock_ep = container_of(fid, struct sock_ep, fid.ep.fid); + sock_ep = container_of(fid, struct sock_ep, ep.fid); if (level != FI_OPT_ENDPOINT) return -ENOPROTOOPT; @@ -838,7 +887,7 @@ static int sock_ep_setopt(fid_t fid, int level, int optname, { int i; struct sock_ep *sock_ep; - sock_ep = container_of(fid, struct sock_ep, fid.ep.fid); + sock_ep = container_of(fid, struct sock_ep, ep.fid); if (level != FI_OPT_ENDPOINT) return -ENOPROTOOPT; @@ -861,13 +910,13 @@ static int sock_ep_setopt(fid_t fid, int level, int optname, return 0; } -static int sock_ep_tx_ctx(struct fid_sep *ep, int index, struct fi_tx_attr *attr, +static int sock_ep_tx_ctx(struct fid_ep *ep, int index, struct fi_tx_attr *attr, struct fid_ep **tx_ep, void *context) { struct sock_ep *sock_ep; struct sock_tx_ctx *tx_ctx; - sock_ep = 
container_of(ep, struct sock_ep, fid.sep); + sock_ep = container_of(ep, struct sock_ep, ep); if (index >= sock_ep->ep_attr.tx_ctx_cnt) return -FI_EINVAL; @@ -894,13 +943,13 @@ static int sock_ep_tx_ctx(struct fid_sep *ep, int index, struct fi_tx_attr *attr return 0; } -static int sock_ep_rx_ctx(struct fid_sep *ep, int index, struct fi_rx_attr *attr, +static int sock_ep_rx_ctx(struct fid_ep *ep, int index, struct fi_rx_attr *attr, struct fid_ep **rx_ep, void *context) { struct sock_ep *sock_ep; struct sock_rx_ctx *rx_ctx; - sock_ep = container_of(ep, struct sock_ep, fid.sep); + sock_ep = container_of(ep, struct sock_ep, ep); if (index >= sock_ep->ep_attr.rx_ctx_cnt) return -FI_EINVAL; @@ -937,23 +986,45 @@ struct fi_ops_ep sock_ep_ops ={ .setopt = sock_ep_setopt, .tx_ctx = sock_ep_tx_ctx, .rx_ctx = sock_ep_rx_ctx, + .rx_size_left = fi_no_rx_size_left, + .tx_size_left = fi_no_tx_size_left, }; +static int sock_verify_tx_attr(const struct fi_tx_attr *attr) +{ + if (!attr) + return 0; + + if (attr->inject_size > SOCK_EP_MAX_INJECT_SZ) + return -FI_ENODATA; + + if (attr->size > SOCK_EP_TX_SZ) + return -FI_ENODATA; + + if (attr->iov_limit > SOCK_EP_MAX_IOV_LIMIT) + return -FI_ENODATA; + + return 0; +} + int sock_stx_ctx(struct fid_domain *domain, struct fi_tx_attr *attr, struct fid_stx **stx, void *context) { struct sock_domain *dom; struct sock_tx_ctx *tx_ctx; + if (attr && sock_verify_tx_attr(attr)) + return -FI_EINVAL; + dom = container_of(domain, struct sock_domain, dom_fid); - tx_ctx = sock_tx_ctx_alloc(attr, context); + tx_ctx = sock_tx_ctx_alloc(attr ? 
attr : &sock_stx_attr, context); if (!tx_ctx) return -FI_ENOMEM; tx_ctx->domain = dom; - tx_ctx->fid.ctx.fid.fclass = FI_CLASS_STX_CTX; - + tx_ctx->fid.stx.fid.fclass = FI_CLASS_STX_CTX; + tx_ctx->fid.stx.fid.ops = &sock_ctx_ops; tx_ctx->fid.stx.ops = &sock_ep_ops; atomic_inc(&dom->ref); @@ -962,14 +1033,34 @@ int sock_stx_ctx(struct fid_domain *domain, return 0; } +static int sock_verify_rx_attr(const struct fi_rx_attr *attr) +{ + if (!attr) + return 0; + + if (attr->total_buffered_recv > SOCK_EP_MAX_BUFF_RECV) + return -FI_ENODATA; + + if (attr->size > SOCK_EP_TX_SZ) + return -FI_ENODATA; + + if (attr->iov_limit > SOCK_EP_MAX_IOV_LIMIT) + return -FI_ENODATA; + + return 0; +} + int sock_srx_ctx(struct fid_domain *domain, struct fi_rx_attr *attr, struct fid_ep **srx, void *context) { struct sock_domain *dom; struct sock_rx_ctx *rx_ctx; + if (attr && sock_verify_rx_attr(attr)) + return -FI_EINVAL; + dom = container_of(domain, struct sock_domain, dom_fid); - rx_ctx = sock_rx_ctx_alloc(attr, context); + rx_ctx = sock_rx_ctx_alloc(attr ? attr : &sock_srx_attr, context); if (!rx_ctx) return -FI_ENOMEM; @@ -983,6 +1074,8 @@ int sock_srx_ctx(struct fid_domain *domain, /* default config */ rx_ctx->min_multi_recv = SOCK_EP_MIN_MULTI_RECV; + rx_ctx->attr.total_buffered_recv = rx_ctx->attr.total_buffered_recv ? 
+ rx_ctx->attr.total_buffered_recv : SOCK_EP_MAX_BUFF_RECV; *srx = &rx_ctx->ctx; atomic_inc(&dom->ref); @@ -1014,13 +1107,22 @@ struct fi_info *sock_fi_info(enum fi_ep_type ep_type, if (hints->caps) _info->caps = hints->caps; + + if (hints->ep_attr) + *(_info->ep_attr) = *(hints->ep_attr); + + if (hints->tx_attr) + *(_info->tx_attr) = *(hints->tx_attr); + + if (hints->rx_attr) + *(_info->rx_attr) = *(hints->rx_attr); *(_info->domain_attr) = sock_domain_attr; *(_info->fabric_attr) = sock_fabric_attr; _info->domain_attr->name = strdup(sock_dom_name); _info->fabric_attr->name = strdup(sock_fab_name); - _info->fabric_attr->prov_name = strdup(sock_fab_name); + _info->fabric_attr->prov_name = strdup(sock_prov_name); return _info; } @@ -1053,36 +1155,27 @@ int sock_alloc_endpoint(struct fid_domain *domain, struct fi_info *info, switch (fclass) { case FI_CLASS_EP: - sock_ep->fid.ep.fid.fclass = FI_CLASS_EP; - sock_ep->fid.ep.fid.context = context; - sock_ep->fid.ep.fid.ops = &sock_ep_fi_ops; + sock_ep->ep.fid.fclass = FI_CLASS_EP; + sock_ep->ep.fid.context = context; + sock_ep->ep.fid.ops = &sock_ep_fi_ops; - sock_ep->fid.ep.ops = &sock_ep_ops; - sock_ep->fid.ep.cm = &sock_ep_cm_ops; - sock_ep->fid.ep.msg = &sock_ep_msg_ops; - sock_ep->fid.ep.rma = &sock_ep_rma; - sock_ep->fid.ep.tagged = &sock_ep_tagged; - sock_ep->fid.ep.atomic = &sock_ep_atomic; + sock_ep->ep.ops = &sock_ep_ops; + sock_ep->ep.cm = &sock_ep_cm_ops; + sock_ep->ep.msg = &sock_ep_msg_ops; + sock_ep->ep.rma = &sock_ep_rma; + sock_ep->ep.tagged = &sock_ep_tagged; + sock_ep->ep.atomic = &sock_ep_atomic; break; case FI_CLASS_SEP: - sock_ep->fid.sep.fid.fclass = FI_CLASS_SEP; - sock_ep->fid.sep.fid.context = context; - sock_ep->fid.sep.fid.ops = &sock_ep_fi_ops; + sock_ep->ep.fid.fclass = FI_CLASS_SEP; + sock_ep->ep.fid.context = context; + sock_ep->ep.fid.ops = &sock_ep_fi_ops; - sock_ep->fid.sep.ops = &sock_ep_ops; - sock_ep->fid.sep.cm = &sock_ep_cm_ops; + sock_ep->ep.ops = &sock_ep_ops; + sock_ep->ep.cm = 
&sock_ep_cm_ops; break; - case FI_CLASS_PEP: - sock_ep->fid.pep.fid.fclass = FI_CLASS_SEP; - sock_ep->fid.pep.fid.context = context; - sock_ep->fid.pep.fid.ops = &sock_ep_fi_ops; - - sock_ep->fid.pep.ops = &sock_ep_ops; - sock_ep->fid.pep.cm = &sock_ep_cm_ops; - break; - default: goto err; } @@ -1090,6 +1183,10 @@ int sock_alloc_endpoint(struct fid_domain *domain, struct fi_info *info, sock_ep->fclass = fclass; *ep = sock_ep; + fastlock_acquire(&sock_dom->lock); + sock_ep->ep_id = sock_dom->ep_count++; + fastlock_release(&sock_dom->lock); + if (info) { sock_ep->ep_type = info->ep_type; sock_ep->info.caps = info->caps; @@ -1100,7 +1197,9 @@ int sock_alloc_endpoint(struct fid_domain *domain, struct fi_info *info, memcpy(sock_ep->src_addr, info->src_addr, sizeof(struct sockaddr_in)); ((struct sockaddr_in*)sock_ep->src_addr)->sin_port = - htons(sock_dom->service); + htons(atoi(sock_dom->service)); + ((struct sockaddr_in*)sock_ep->src_addr)->sin_family = + sock_ep->ep_id; } if (info->dest_addr) { @@ -1121,7 +1220,8 @@ int sock_alloc_endpoint(struct fid_domain *domain, struct fi_info *info, sock_ep->tx_attr = *info->tx_attr; sock_ep->op_flags = info->tx_attr->op_flags; sock_ep->tx_attr.size = sock_ep->tx_attr.size ? 
- sock_ep->tx_attr.size : SOCK_EP_MAX_TX_CTX_SZ; + sock_ep->tx_attr.size : + (SOCK_EP_TX_SZ * SOCK_EP_TX_ENTRY_SZ); } if (info->rx_attr) { @@ -1132,12 +1232,18 @@ int sock_alloc_endpoint(struct fid_domain *domain, struct fi_info *info, sock_ep->rx_attr.total_buffered_recv : SOCK_EP_MAX_BUFF_RECV; } + sock_ep->info.connreq = info->connreq; } atomic_init(&sock_ep->ref, 0); atomic_init(&sock_ep->num_tx_ctx, 0); atomic_init(&sock_ep->num_rx_ctx, 0); + if (sock_ep->ep_attr.tx_ctx_cnt == FI_SHARED_CONTEXT) + sock_ep->tx_shared = 1; + if (sock_ep->ep_attr.rx_ctx_cnt == FI_SHARED_CONTEXT) + sock_ep->rx_shared = 1; + if (sock_ep->fclass != FI_CLASS_SEP) { sock_ep->ep_attr.tx_ctx_cnt = 1; sock_ep->ep_attr.rx_ctx_cnt = 1; diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep_dgram.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep_dgram.c index 53a7559491..cec87b3fa6 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep_dgram.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep_dgram.c @@ -71,16 +71,16 @@ const struct fi_ep_attr sock_dgram_ep_attr = { const struct fi_tx_attr sock_dgram_tx_attr = { .caps = SOCK_EP_DGRAM_CAP, - .op_flags = SOCK_DGRAM_DEF_OPS, + .op_flags = SOCK_DEF_OPS, .msg_order = SOCK_EP_MSG_ORDER, .inject_size = SOCK_EP_MAX_INJECT_SZ, - .size = SOCK_EP_MAX_TX_CTX_SZ, + .size = SOCK_EP_TX_SZ, .iov_limit = SOCK_EP_MAX_IOV_LIMIT, }; const struct fi_rx_attr sock_dgram_rx_attr = { .caps = SOCK_EP_DGRAM_CAP, - .op_flags = SOCK_DGRAM_DEF_OPS, + .op_flags = SOCK_DEF_OPS, .msg_order = SOCK_EP_MSG_ORDER, .total_buffered_recv = SOCK_EP_MAX_BUFF_RECV, .size = SOCK_EP_MAX_MSG_SZ, @@ -200,18 +200,12 @@ static struct fi_info *sock_dgram_fi_info(struct fi_info *hints, if (!_info) return NULL; - if (!hints->caps) - _info->caps = SOCK_EP_DGRAM_CAP; - - if (!hints->tx_attr) - *(_info->tx_attr) = sock_dgram_tx_attr; - - if (!hints->rx_attr) - *(_info->rx_attr) = sock_dgram_rx_attr; - - if (!hints->ep_attr) - 
*(_info->ep_attr) = sock_dgram_ep_attr; + _info->caps = SOCK_EP_DGRAM_CAP; + *(_info->tx_attr) = sock_dgram_tx_attr; + *(_info->rx_attr) = sock_dgram_rx_attr; + *(_info->ep_attr) = sock_dgram_ep_attr; + _info->caps |= (_info->rx_attr->caps | _info->tx_attr->caps); return _info; } @@ -256,9 +250,6 @@ int sock_dgram_getinfo(uint32_t version, const char *node, const char *service, return ret; } - src_addr = calloc(1, sizeof(struct sockaddr_in)); - dest_addr = calloc(1, sizeof(struct sockaddr_in)); - memset(&sock_hints, 0, sizeof(struct addrinfo)); sock_hints.ai_family = AF_INET; sock_hints.ai_socktype = SOCK_STREAM; @@ -293,6 +284,11 @@ int sock_dgram_getinfo(uint32_t version, const char *node, const char *service, goto err; } + src_addr = calloc(1, sizeof(struct sockaddr_in)); + if (!src_addr) { + ret = -FI_ENOMEM; + goto err; + } memcpy(src_addr, result->ai_addr, result->ai_addrlen); freeaddrinfo(result); } else if (node || service) { @@ -317,6 +313,11 @@ int sock_dgram_getinfo(uint32_t version, const char *node, const char *service, goto err; } + dest_addr = calloc(1, sizeof(struct sockaddr_in)); + if (!dest_addr) { + ret = -FI_ENOMEM; + goto err; + } memcpy(dest_addr, result->ai_addr, result->ai_addrlen); udp_sock = socket(AF_INET, SOCK_DGRAM, 0); @@ -328,7 +329,12 @@ int sock_dgram_getinfo(uint32_t version, const char *node, const char *service, goto err; } - len = sizeof(struct sockaddr_in); + len = sizeof(struct sockaddr_in); + src_addr = calloc(1, sizeof(struct sockaddr_in)); + if (!src_addr) { + ret = -FI_ENOMEM; + goto err; + } ret = getsockname(udp_sock, (struct sockaddr*)src_addr, &len); if (ret != 0) { SOCK_LOG_ERROR("getsockname failed\n"); @@ -341,11 +347,25 @@ int sock_dgram_getinfo(uint32_t version, const char *node, const char *service, } if (hints->src_addr) { + if (!src_addr) { + src_addr = calloc(1, sizeof(struct sockaddr_in)); + if (!src_addr) { + ret = -FI_ENOMEM; + goto err; + } + } assert(hints->src_addrlen == sizeof(struct sockaddr_in)); 
memcpy(src_addr, hints->src_addr, hints->src_addrlen); } if (hints->dest_addr) { + if (!dest_addr) { + dest_addr = calloc(1, sizeof(struct sockaddr_in)); + if (!dest_addr) { + ret = -FI_ENOMEM; + goto err; + } + } assert(hints->dest_addrlen == sizeof(struct sockaddr_in)); memcpy(dest_addr, hints->dest_addr, hints->dest_addrlen); } @@ -368,14 +388,18 @@ int sock_dgram_getinfo(uint32_t version, const char *node, const char *service, goto err; } + if (src_addr) + free(src_addr); + if (dest_addr) + free(dest_addr); *info = _info; - free(src_addr); - free(dest_addr); return 0; err: - free(src_addr); - free(dest_addr); + if (src_addr) + free(src_addr); + if (dest_addr) + free(dest_addr); SOCK_LOG_ERROR("fi_getinfo failed\n"); return ret; } @@ -433,12 +457,12 @@ int sock_dgram_ep(struct fid_domain *domain, struct fi_info *info, if (ret) return ret; - *ep = &endpoint->fid.ep; + *ep = &endpoint->ep; return 0; } int sock_dgram_sep(struct fid_domain *domain, struct fi_info *info, - struct fid_sep **sep, void *context) + struct fid_ep **sep, void *context) { int ret; struct sock_ep *endpoint; @@ -447,6 +471,6 @@ int sock_dgram_sep(struct fid_domain *domain, struct fi_info *info, if (ret) return ret; - *sep = &endpoint->fid.sep; + *sep = &endpoint->ep; return 0; } diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep_msg.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep_msg.c index ac417c3ac5..49ec58cd81 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep_msg.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep_msg.c @@ -75,7 +75,7 @@ const struct fi_tx_attr sock_msg_tx_attr = { .op_flags = SOCK_DEF_OPS, .msg_order = SOCK_EP_MSG_ORDER, .inject_size = SOCK_EP_MAX_INJECT_SZ, - .size = SOCK_EP_MAX_TX_CTX_SZ, + .size = SOCK_EP_TX_SZ, .iov_limit = SOCK_EP_MAX_IOV_LIMIT, }; @@ -201,18 +201,12 @@ static struct fi_info *sock_msg_fi_info(struct fi_info *hints, if (!_info) return NULL; - if (!hints->caps) - 
_info->caps = SOCK_EP_MSG_CAP; - - if (!hints->tx_attr) - *(_info->tx_attr) = sock_msg_tx_attr; - - if (!hints->rx_attr) - *(_info->rx_attr) = sock_msg_rx_attr; - - if (!hints->ep_attr) - *(_info->ep_attr) = sock_msg_ep_attr; + _info->caps = SOCK_EP_MSG_CAP; + *(_info->tx_attr) = sock_msg_tx_attr; + *(_info->rx_attr) = sock_msg_rx_attr; + *(_info->ep_attr) = sock_msg_ep_attr; + _info->caps |= (_info->rx_attr->caps | _info->tx_attr->caps); return _info; } @@ -257,9 +251,6 @@ int sock_msg_getinfo(uint32_t version, const char *node, const char *service, return ret; } - src_addr = calloc(1, sizeof(struct sockaddr_in)); - dest_addr = calloc(1, sizeof(struct sockaddr_in)); - memset(&sock_hints, 0, sizeof(struct addrinfo)); sock_hints.ai_family = AF_INET; sock_hints.ai_socktype = SOCK_STREAM; @@ -294,6 +285,11 @@ int sock_msg_getinfo(uint32_t version, const char *node, const char *service, goto err; } + src_addr = calloc(1, sizeof(struct sockaddr_in)); + if (!src_addr) { + ret = -FI_ENOMEM; + goto err; + } memcpy(src_addr, result->ai_addr, result->ai_addrlen); freeaddrinfo(result); } else if (node || service) { @@ -318,6 +314,11 @@ int sock_msg_getinfo(uint32_t version, const char *node, const char *service, goto err; } + dest_addr = calloc(1, sizeof(struct sockaddr_in)); + if (!dest_addr) { + ret = -FI_ENOMEM; + goto err; + } memcpy(dest_addr, result->ai_addr, result->ai_addrlen); udp_sock = socket(AF_INET, SOCK_DGRAM, 0); @@ -329,7 +330,12 @@ int sock_msg_getinfo(uint32_t version, const char *node, const char *service, goto err; } - len = sizeof(struct sockaddr_in); + len = sizeof(struct sockaddr_in); + src_addr = calloc(1, sizeof(struct sockaddr_in)); + if (!src_addr) { + ret = -FI_ENOMEM; + goto err; + } ret = getsockname(udp_sock, (struct sockaddr*)src_addr, &len); if (ret != 0) { SOCK_LOG_ERROR("getsockname failed\n"); @@ -347,17 +353,38 @@ int sock_msg_getinfo(uint32_t version, const char *node, const char *service, } if (hints->dest_addr) { + if (!dest_addr) { + 
dest_addr = calloc(1, sizeof(struct sockaddr_in)); + if (!dest_addr) { + ret = -FI_ENOMEM; + goto err; + } + } assert(hints->dest_addrlen == sizeof(struct sockaddr_in)); memcpy(dest_addr, hints->dest_addr, hints->dest_addrlen); } if (dest_addr) { + if (!dest_addr) { + dest_addr = calloc(1, sizeof(struct sockaddr_in)); + if (!dest_addr) { + ret = -FI_ENOMEM; + goto err; + } + } memcpy(sa_ip, inet_ntoa(dest_addr->sin_addr), INET_ADDRSTRLEN); SOCK_LOG_INFO("dest_addr: family: %d, IP is %s\n", ((struct sockaddr_in*)dest_addr)->sin_family, sa_ip); } if (src_addr) { + if (!src_addr) { + src_addr = calloc(1, sizeof(struct sockaddr_in)); + if (!src_addr) { + ret = -FI_ENOMEM; + goto err; + } + } memcpy(sa_ip, inet_ntoa(src_addr->sin_addr), INET_ADDRSTRLEN); SOCK_LOG_INFO("src_addr: family: %d, IP is %s\n", ((struct sockaddr_in*)src_addr)->sin_family, sa_ip); @@ -370,13 +397,17 @@ int sock_msg_getinfo(uint32_t version, const char *node, const char *service, } *info = _info; - free(src_addr); - free(dest_addr); + if (src_addr) + free(src_addr); + if (dest_addr) + free(dest_addr); return 0; err: - free(src_addr); - free(dest_addr); + if (src_addr) + free(src_addr); + if (dest_addr) + free(dest_addr); SOCK_LOG_ERROR("fi_getinfo failed\n"); return ret; } @@ -389,7 +420,7 @@ static int sock_ep_cm_getname(fid_t fid, void *addr, size_t *addrlen) return -FI_ETOOSMALL; } - sock_ep = container_of(fid, struct sock_ep, fid.ep.fid); + sock_ep = container_of(fid, struct sock_ep, ep.fid); *addrlen = MIN(*addrlen, sizeof(struct sockaddr_in)); memcpy(addr, sock_ep->src_addr, *addrlen); return 0; @@ -404,88 +435,312 @@ static int sock_ep_cm_getpeer(struct fid_ep *ep, void *addr, size_t *addrlen) return -FI_ETOOSMALL; } - sock_ep = container_of(ep, struct sock_ep, fid.ep); + sock_ep = container_of(ep, struct sock_ep, ep); *addrlen = MIN(*addrlen, sizeof(struct sockaddr_in)); memcpy(addr, sock_ep->dest_addr, *addrlen); return 0; } +static int sock_ep_cm_create_socket() +{ + int sock, optval; + 
sock = socket(AF_INET, SOCK_DGRAM, 0); + if (sock < 0) + return 0; + + optval = 1; + setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, + &optval, sizeof optval); + return sock; +} + + +static int sock_ep_cm_send_msg(int sock_fd, + const struct sockaddr_in *addr, void *msg, size_t len) +{ + int ret, retry = 0; + unsigned char response; + struct sockaddr_in from_addr; + socklen_t addr_len; + char sa_ip[INET_ADDRSTRLEN] = {0}; + + memcpy(sa_ip, inet_ntoa(addr->sin_addr), INET_ADDRSTRLEN); + SOCK_LOG_INFO("Sending message to %s:%d\n", + sa_ip, ntohs(addr->sin_port)); + + while (retry < SOCK_EP_MAX_RETRY) { + ret = sendto(sock_fd, (char *)msg, len, 0, addr, sizeof *addr); + SOCK_LOG_INFO("Total Sent: %d\n", ret); + if (ret < 0) + return -1; + + ret = fi_poll_fd(sock_fd, SOCK_CM_COMM_TIMEOUT); + retry++; + if (ret <= 0) { + continue; + } + + addr_len = sizeof(struct sockaddr_in); + ret = recvfrom(sock_fd, &response, sizeof(response), 0, + &from_addr, &addr_len); + SOCK_LOG_INFO("Received ACK: %d\n", ret); + if (ret == sizeof(response)) + return 0; + } + return -1; +} + +static int sock_ep_cm_send_ack(int sock_fd, struct sockaddr_in *addr) +{ + int ack_sent = 0, retry = 0, ret; + unsigned char response; + + while(!ack_sent && retry < SOCK_EP_MAX_RETRY) { + ret = sendto(sock_fd, &response, sizeof(response), 0, + addr, sizeof *addr); + retry++; + + SOCK_LOG_INFO("ack: %d\n", ret); + + if (ret == sizeof(response)) { + ack_sent = 1; + break; + } + + if (ret == EWOULDBLOCK || ret == EAGAIN) + usleep(SOCK_CM_COMM_TIMEOUT * 1000); + } + return ack_sent; +} + +static void *sock_msg_ep_listener_thread (void *data) +{ + struct sock_ep *ep = (struct sock_ep *)data; + struct sock_conn_response *conn_response = NULL; + + struct fi_eq_cm_entry cm_entry; + struct fi_eq_err_entry cm_err_entry; + + struct sockaddr_in from_addr; + socklen_t addr_len; + int ret, user_data_sz; + struct fid_ep *fid_ep; + struct sock_ep *sock_ep; + + SOCK_LOG_INFO("Starting listener thread for EP: %p\n", ep); + 
ep->do_listen = 1; + + while((volatile int)ep->do_listen) { + ret = fi_poll_fd(ep->socket, -1); + if (ret <= 0) + continue; + + if (conn_response == NULL) { + conn_response = (struct sock_conn_response*) + calloc(1, sizeof(*conn_response) + + SOCK_EP_MAX_CM_DATA_SZ); + if (!conn_response) { + SOCK_LOG_ERROR("cannot allocate\n"); + return NULL; + } + } + + addr_len = sizeof(struct sockaddr_in); + ret = recvfrom(ep->socket, (char*)conn_response, + sizeof(*conn_response) + SOCK_EP_MAX_CM_DATA_SZ, + 0, &from_addr, &addr_len); + if (ret <= 0) + continue; + + SOCK_LOG_INFO("Total received: %d\n", ret); + + if (ret < sizeof(*conn_response) || + !sock_ep_cm_send_ack(ep->socket, &from_addr)) + continue; + + user_data_sz = 0; + switch (conn_response->hdr.type) { + + case SOCK_CONN_ACCEPT: + SOCK_LOG_INFO("Received SOCK_CONN_ACCEPT\n"); + memset(&cm_entry, 0, sizeof(cm_entry)); + cm_entry.fid = conn_response->hdr.c_fid; + + if (ret > sizeof(struct sock_conn_response)) { + user_data_sz = ret - + sizeof(struct sock_conn_response); + memcpy(&cm_entry.data, + (char *)conn_response + + sizeof(struct sock_conn_response), + user_data_sz); + } + + fid_ep = container_of(conn_response->hdr.c_fid, + struct fid_ep, fid); + sock_ep = container_of(fid_ep, struct sock_ep, ep); + sock_ep->connected = 1; + sock_ep_enable(&ep->ep); + if (sock_eq_report_event(ep->eq, FI_CONNECTED, &cm_entry, + sizeof(cm_entry) + user_data_sz, 0)) + SOCK_LOG_ERROR("Error in writing to EQ\n"); + break; + + case SOCK_CONN_REJECT: + SOCK_LOG_INFO("Received SOCK_CONN_REJECT\n"); + memset(&cm_err_entry, 0, sizeof(cm_err_entry)); + cm_err_entry.fid = conn_response->hdr.c_fid; + cm_err_entry.context = NULL; + cm_err_entry.data = 0; + cm_err_entry.err = -FI_ECONNREFUSED; + cm_err_entry.prov_errno = 0; + cm_err_entry.err_data = NULL; + + if (ret > sizeof(struct sock_conn_response)) { + user_data_sz = ret - + sizeof(struct sock_conn_response); + memcpy(&cm_entry.data, + (char *)conn_response + + sizeof(struct 
sock_conn_response), + user_data_sz); + } + + if (sock_eq_report_event(ep->eq, FI_ECONNREFUSED, + &cm_err_entry, + sizeof (cm_err_entry) + + user_data_sz, 0)) + SOCK_LOG_ERROR("Error in writing to EQ\n"); + goto out; + + default: + SOCK_LOG_ERROR("Invalid event\n"); + break; + } + conn_response = NULL; + } + +out: + if (conn_response) + free(conn_response); + close(ep->socket); + ep->socket = 0; + return NULL; +} + static int sock_ep_cm_connect(struct fid_ep *ep, const void *addr, const void *param, size_t paramlen) { - struct sock_conn_req req; + struct sock_conn_req *req; struct sock_ep *_ep; struct sock_eq *_eq; - _ep = container_of(ep, struct sock_ep, fid.ep); + _ep = container_of(ep, struct sock_ep, ep); _eq = _ep->eq; - if (!_eq) { - SOCK_LOG_ERROR("no EQ bound with this ep\n"); + if (!_eq || paramlen > SOCK_EP_MAX_CM_DATA_SZ) + return -FI_EINVAL; + + req = (struct sock_conn_req*)calloc(1, + sizeof(*req) + paramlen); + if (!req) + return -FI_ENOMEM; + + _ep->rem_ep_id = ((struct sockaddr *)addr)->sa_family; + ((struct sockaddr *)addr)->sa_family = AF_INET; + + req->hdr.type = SOCK_CONN_REQ; + req->ep_id = _ep->ep_id; + req->hdr.c_fid = &ep->fid; + req->hdr.s_fid = 0; + memcpy(&req->info, &_ep->info, sizeof(struct fi_info)); + memcpy(&req->src_addr, _ep->info.src_addr, sizeof(struct sockaddr_in)); + memcpy(&req->dest_addr, _ep->info.dest_addr, sizeof(struct sockaddr_in)); + memcpy(&req->tx_attr, _ep->info.tx_attr, sizeof(struct fi_tx_attr)); + memcpy(&req->rx_attr, _ep->info.rx_attr, sizeof(struct fi_rx_attr)); + memcpy(&req->ep_attr, _ep->info.ep_attr, sizeof(struct fi_ep_attr)); + memcpy(&req->domain_attr, _ep->info.domain_attr, sizeof(struct fi_domain_attr)); + memcpy(&req->fabric_attr, _ep->info.fabric_attr, sizeof(struct fi_fabric_attr)); + if (param && paramlen) + memcpy(&req->user_data, param, paramlen); + + if (!_ep->socket) { + _ep->socket = sock_ep_cm_create_socket(); + if (!_ep->socket) { + free (req); + return -FI_EIO; + } + } + + if 
(sock_ep_cm_send_msg(_ep->socket, addr, req, sizeof (*req) + paramlen)) + return -FI_EIO; + + if (pthread_create(&_ep->listener_thread, NULL, + sock_msg_ep_listener_thread, (void *)_ep)) { + SOCK_LOG_ERROR("Couldn't create listener thread\n"); + free (req); return -FI_EINVAL; } - if(((struct sockaddr *)addr)->sa_family != AF_INET) { - SOCK_LOG_ERROR("invalid address type to connect: only IPv4 supported\n"); - return -FI_EINVAL; - } - - req.type = SOCK_CONNREQ; - req.c_fid = &ep->fid; - req.s_fid = 0; - memcpy(&req.info, &_ep->info, sizeof(struct fi_info)); - memcpy(&req.src_addr, _ep->info.src_addr, sizeof(struct sockaddr_in)); - memcpy(&req.dest_addr, _ep->info.dest_addr, sizeof(struct sockaddr_in)); - memcpy(&req.tx_attr, _ep->info.tx_attr, sizeof(struct fi_tx_attr)); - memcpy(&req.rx_attr, _ep->info.rx_attr, sizeof(struct fi_rx_attr)); - memcpy(&req.ep_attr, _ep->info.ep_attr, sizeof(struct fi_ep_attr)); - memcpy(&req.domain_attr, _ep->info.domain_attr, sizeof(struct fi_domain_attr)); - memcpy(&req.fabric_attr, _ep->info.fabric_attr, sizeof(struct fi_fabric_attr)); - - if (sock_util_sendto(_eq->wait_fd, &req, sizeof(struct sock_conn_req), - (struct sockaddr_in *)addr, sizeof(struct sockaddr_in), 0)) - return -errno; - + free (req); return 0; } static int sock_ep_cm_accept(struct fid_ep *ep, const void *param, size_t paramlen) { struct sock_conn_req *req; - struct sock_domain *_dom; + struct fi_eq_cm_entry cm_entry; + struct sock_conn_response *response; struct sockaddr_in *addr; - socklen_t addrlen; struct sock_ep *_ep; struct sock_eq *_eq; + int ret; - _ep = container_of(ep, struct sock_ep, fid.ep); + _ep = container_of(ep, struct sock_ep, ep); _eq = _ep->eq; - if (!_eq) { - SOCK_LOG_ERROR("no EQ bound with this ep\n"); + if (!_eq || paramlen > SOCK_EP_MAX_CM_DATA_SZ) return -FI_EINVAL; - } - _dom = _ep->domain; - addr = _dom->info.dest_addr; - addrlen = _dom->info.dest_addrlen; - req = (struct sock_conn_req *)_dom->info.connreq; + response = (struct 
sock_conn_response*)calloc(1, + sizeof(*response) + paramlen); + if (!response) + return -FI_ENOMEM; + + req = (struct sock_conn_req *)_ep->info.connreq; if (!req) { SOCK_LOG_ERROR("invalid connreq for cm_accept\n"); return -FI_EINVAL; } + + memcpy(&response->hdr, &req->hdr, sizeof(struct sock_conn_hdr)); + if (param && paramlen) + memcpy(&response->user_data, param, paramlen); - if (((struct sockaddr *)addr)->sa_family != AF_INET) { - SOCK_LOG_ERROR("invalid address type to connect: only IPv4 supported\n"); - return -FI_EINVAL; + addr = &req->from_addr; + _ep->rem_ep_id = req->ep_id; + response->hdr.type = SOCK_CONN_ACCEPT; + response->hdr.s_fid = &ep->fid; + + _ep->socket = sock_ep_cm_create_socket(); + if (!_ep->socket) { + ret = -FI_EIO; + goto out; } - - req->type = SOCK_ACCEPT; - req->s_fid = &ep->fid; - - if (sock_util_sendto(_eq->wait_fd, req, sizeof(req->type) + - sizeof(req->c_fid) + sizeof(req->s_fid), addr, addrlen, 0)) - return -errno; - + + if (sock_ep_cm_send_msg(_ep->socket, addr, response, + sizeof (*response) + paramlen)) { + close(_ep->socket); + ret = -FI_EIO; + goto out; + } + + sock_ep_enable(ep); + memset(&cm_entry, 0, sizeof(cm_entry)); + cm_entry.fid = &ep->fid; + _ep->connected = 1; + ret = sock_eq_report_event(_eq, FI_CONNECTED, &cm_entry, + sizeof(cm_entry), 0); +out: free(req); - return 0; + free(response); + _ep->info.connreq = NULL; + return ret; } struct fi_ops_cm sock_ep_cm_ops = { @@ -552,7 +807,7 @@ int sock_msg_ep(struct fid_domain *domain, struct fi_info *info, if (ret) return ret; - *ep = &endpoint->fid.ep; + *ep = &endpoint->ep; return 0; } @@ -572,19 +827,19 @@ static int sock_pep_fi_bind(fid_t fid, struct fid *bfid, uint64_t flags) return -FI_EINVAL; } pep->eq = eq; - if ((eq->attr.wait_obj == FI_WAIT_FD) && (eq->wait_fd < 0)) - sock_eq_openwait(eq, (char *)&pep->service); - return 0; } static int sock_pep_fi_close(fid_t fid) { + int c; struct sock_pep *pep; pep = container_of(fid, struct sock_pep, pep.fid); + pep->do_listen 
= 0; + write(pep->signal_fds[0], &c, 1); + pthread_join(pep->listener_thread, NULL); free(pep); - return 0; } @@ -596,9 +851,181 @@ static struct fi_ops sock_pep_fi_ops = { .ops_open = fi_no_ops_open, }; +static struct fi_info * sock_ep_msg_process_info(struct sock_conn_req *req) +{ + req->info.src_addr = &req->src_addr; + req->info.dest_addr = &req->dest_addr; + req->info.tx_attr = &req->tx_attr; + req->info.rx_attr = &req->rx_attr; + req->info.ep_attr = &req->ep_attr; + req->info.domain_attr = &req->domain_attr; + req->info.fabric_attr = &req->fabric_attr; + req->info.domain_attr->name = NULL; + req->info.fabric_attr->name = NULL; + req->info.fabric_attr->prov_name = NULL; + if (sock_verify_info(&req->info)) { + SOCK_LOG_INFO("incoming conn_req not supported\n"); + errno = EINVAL; + return NULL; + } + + return sock_fi_info(FI_EP_MSG, &req->info, + req->info.dest_addr, req->info.src_addr); +} + +static void *sock_pep_listener_thread (void *data) +{ + struct sock_pep *pep = (struct sock_pep *)data; + struct sock_conn_req *conn_req = NULL; + struct fi_eq_cm_entry cm_entry; + struct sockaddr_in from_addr; + struct pollfd poll_fds[2]; + + socklen_t addr_len; + int ret, user_data_sz, tmp; + + SOCK_LOG_INFO("Starting listener thread for PEP: %p\n", pep); + + poll_fds[0].fd = pep->socket; + poll_fds[1].fd = pep->signal_fds[1]; + poll_fds[0].events = poll_fds[1].events = POLLIN; + while((volatile int)pep->do_listen) { + if (poll(poll_fds, 2, -1) > 0) { + if (poll_fds[1].revents & POLLIN) { + read(pep->signal_fds[1], &tmp, 1); + continue; + } + } else + return NULL; + + if (conn_req == NULL) { + conn_req = (struct sock_conn_req*)calloc(1, + sizeof(*conn_req) + + SOCK_EP_MAX_CM_DATA_SZ); + if (!conn_req) { + SOCK_LOG_ERROR("cannot allocate\n"); + return NULL; + } + } + + addr_len = sizeof(struct sockaddr_in); + ret = recvfrom(pep->socket, (char*)conn_req, + sizeof(*conn_req) + SOCK_EP_MAX_CM_DATA_SZ, 0, + &from_addr, &addr_len); + if (ret <= 0) + continue; + 
memcpy(&conn_req->from_addr, &from_addr, sizeof(struct sockaddr_in)); + + SOCK_LOG_INFO("Msg received: %d\n", ret); + memset(&cm_entry, 0, sizeof(cm_entry)); + user_data_sz = 0; + + if (conn_req->hdr.type == SOCK_CONN_REQ) { + SOCK_LOG_INFO("Received SOCK_CONN_REQ\n"); + if (ret < sizeof(*conn_req) || + !sock_ep_cm_send_ack(pep->socket, &from_addr)) { + SOCK_LOG_ERROR("Invalid connection request\n"); + break; + } + + cm_entry.info = sock_ep_msg_process_info(conn_req); + cm_entry.info->connreq = (fi_connreq_t)conn_req; + if (ret > sizeof(struct sock_conn_req)) { + user_data_sz = ret - sizeof(struct sock_conn_req); + memcpy(&cm_entry.data, + (char *)conn_req + sizeof(struct sock_conn_req), + user_data_sz); + } + + if (sock_eq_report_event(pep->eq, FI_CONNREQ, &cm_entry, + sizeof(cm_entry) + user_data_sz, 0)) + SOCK_LOG_ERROR("Error in writing to EQ\n"); + } else { + SOCK_LOG_ERROR("Invalid event\n"); + } + conn_req = NULL; + } + + if (conn_req) + free(conn_req); + close(pep->socket); + pep->socket = 0; + return NULL; +} + +static int sock_pep_create_listener_thread(struct sock_pep *pep) +{ + int optval, ret; + socklen_t addr_size; + struct sockaddr_in addr; + struct addrinfo *s_res = NULL, *p; + struct addrinfo hints; + char sa_ip[INET_ADDRSTRLEN] = {0}; + char sa_port[NI_MAXSERV] = {0}; + + pep->do_listen = 1; + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_INET; + hints.ai_socktype = SOCK_DGRAM; + hints.ai_flags = AI_PASSIVE; + hints.ai_protocol = IPPROTO_UDP; + + memcpy(sa_ip, inet_ntoa(pep->src_addr.sin_addr), INET_ADDRSTRLEN); + sprintf(sa_port, "%d", ntohs(pep->src_addr.sin_port)); + + ret = getaddrinfo(sa_ip, sa_port, &hints, &s_res); + if (ret) { + SOCK_LOG_ERROR("no available AF_INET address service:%s, %s\n", + sa_port, gai_strerror(ret)); + return -FI_EINVAL; + } + + for (p=s_res; p; p=p->ai_next) { + pep->socket = socket(p->ai_family, p->ai_socktype, + p->ai_protocol); + if (pep->socket >= 0) { + optval = 1; + setsockopt(pep->socket, 
SOL_SOCKET, SO_REUSEADDR, &optval, + sizeof optval); + if (!bind(pep->socket, s_res->ai_addr, s_res->ai_addrlen)) + break; + close(pep->socket); + pep->socket = -1; + } + } + + freeaddrinfo(s_res); + if (pep->socket < 0) + return -FI_EIO; + + optval = 1; + setsockopt(pep->socket, SOL_SOCKET, SO_REUSEADDR, &optval, + sizeof optval); + + if (pep->src_addr.sin_port == 0) { + addr_size = sizeof(addr); + if (getsockname(pep->socket, (struct sockaddr*)&addr, &addr_size)) + return -FI_EINVAL; + pep->src_addr.sin_port = addr.sin_port; + } + + SOCK_LOG_INFO("Listener thread bound to %s:%d\n", + sa_ip, ntohs(pep->src_addr.sin_port)); + + if (pthread_create(&pep->listener_thread, NULL, + sock_pep_listener_thread, (void *)pep)) { + SOCK_LOG_ERROR("Couldn't create listener thread\n"); + return -FI_EINVAL; + } + return 0; +} + static int sock_pep_listen(struct fid_pep *pep) { - return 0; + struct sock_pep *_pep; + _pep = container_of(pep, struct sock_pep, pep); + return sock_pep_create_listener_thread(_pep); } static int sock_pep_reject(struct fid_pep *pep, fi_connreq_t connreq, @@ -606,38 +1033,52 @@ static int sock_pep_reject(struct fid_pep *pep, fi_connreq_t connreq, { struct sock_conn_req *req; struct sockaddr_in *addr; - socklen_t addrlen; struct sock_pep *_pep; struct sock_eq *_eq; + struct sock_conn_response *response; + int ret = 0; _pep = container_of(pep, struct sock_pep, pep); _eq = _pep->eq; - if (!_eq) { - SOCK_LOG_ERROR("no EQ bound with this pep\n"); + if (!_eq || paramlen > SOCK_EP_MAX_CM_DATA_SZ) return -FI_EINVAL; - } req = (struct sock_conn_req *)connreq; - if (!req) { - SOCK_LOG_ERROR("invalid connreq for cm_accept\n"); - return -FI_EINVAL; + if (!req) + return 0; + + response = (struct sock_conn_response*) + calloc(1, sizeof(*response) + paramlen); + if (!response) + return -FI_ENOMEM; + + memcpy(&response->hdr, &req->hdr, sizeof(struct sock_conn_hdr)); + if (param && paramlen) + memcpy(&response->user_data, param, paramlen); + + addr = &req->from_addr; + 
response->hdr.type = SOCK_CONN_REJECT; + response->hdr.s_fid = NULL; + + if (!_pep->socket) { + _pep->socket = sock_ep_cm_create_socket(); + if (!_pep->socket) { + ret = -FI_EIO; + goto out; + } } - addr = &req->src_addr; - addrlen = sizeof(struct sockaddr_in); - if (((struct sockaddr *)addr)->sa_family != AF_INET) { - SOCK_LOG_ERROR("invalid address type to connect: only IPv4 supported\n"); - return -FI_EINVAL; + + if (sock_ep_cm_send_msg(_pep->socket, addr, req, + sizeof(struct sock_conn_response))) { + ret = -FI_EIO; + goto out; } + ret = 0; - req->type = SOCK_REJECT; - req->s_fid = NULL; - - if (sock_util_sendto(_eq->wait_fd, req, sizeof(req->type) + - sizeof(req->c_fid), addr, addrlen, 0)) - return -errno; - +out: free(req); - return 0; + free(response); + return ret; } static struct fi_ops_cm sock_pep_cm_ops = { @@ -652,7 +1093,7 @@ static struct fi_ops_cm sock_pep_cm_ops = { }; int sock_msg_sep(struct fid_domain *domain, struct fi_info *info, - struct fid_sep **sep, void *context) + struct fid_ep **sep, void *context) { int ret; struct sock_ep *endpoint; @@ -661,29 +1102,18 @@ int sock_msg_sep(struct fid_domain *domain, struct fi_info *info, if (ret) return ret; - *sep = &endpoint->fid.sep; - return 0; -} - -int sock_msg_pep(struct fid_fabric *fabric, struct fi_info *info, - struct fid_pep **pep, void *context) -{ - int ret; - struct sock_ep *endpoint; - - ret = sock_msg_endpoint(NULL, info, &endpoint, context, FI_CLASS_PEP); - if (ret) - return ret; - - *pep = &endpoint->fid.pep; + *sep = &endpoint->ep; return 0; } int sock_msg_passive_ep(struct fid_fabric *fabric, struct fi_info *info, - struct fid_pep **pep, void *context) + struct fid_pep **pep, void *context) { + int ret, flags; struct sock_pep *_pep; - int ret; + char hostname[HOST_NAME_MAX]; + struct addrinfo sock_hints; + struct addrinfo *result = NULL; if (info) { ret = sock_verify_info(info); @@ -698,36 +1128,36 @@ int sock_msg_passive_ep(struct fid_fabric *fabric, struct fi_info *info, return 
-FI_ENOMEM; if(info) { - struct sockaddr *dest_addr = (struct sockaddr *)info->dest_addr; - struct sockaddr *src_addr = (struct sockaddr *)info->src_addr; - if (!dest_addr || !src_addr) { - SOCK_LOG_ERROR("invalid dest_addr or src_addr\n"); - goto err; - } - - if (!dest_addr->sa_family) { - if(getnameinfo(src_addr, sizeof(*src_addr), NULL, 0, - _pep->service, - sizeof(_pep->service), - NI_NUMERICSERV)) { - SOCK_LOG_ERROR("could not resolve src_addr\n"); - goto err; - } + if (info->src_addr) { + memcpy(&_pep->src_addr, info->src_addr, + sizeof(struct sockaddr_in)); } else { - if(getnameinfo(dest_addr, sizeof(*dest_addr), NULL, 0, - _pep->service, - sizeof(_pep->service), - NI_NUMERICSERV)) { - SOCK_LOG_ERROR("could not resolve dest_addr\n"); + gethostname(hostname, HOST_NAME_MAX); + + memset(&sock_hints, 0, sizeof(struct addrinfo)); + sock_hints.ai_family = AF_INET; + sock_hints.ai_socktype = SOCK_STREAM; + ret = getaddrinfo(hostname, NULL, &sock_hints, &result); + + if (ret != 0) { + ret = FI_EINVAL; + SOCK_LOG_INFO("getaddrinfo failed!\n"); goto err; } + memcpy(&_pep->src_addr, result->ai_addr, result->ai_addrlen); } _pep->info = *info; } else { SOCK_LOG_ERROR("invalid fi_info\n"); goto err; } - + + if(socketpair(AF_UNIX, SOCK_STREAM, 0, _pep->signal_fds) < 0) + goto err; + + flags = fcntl(_pep->signal_fds[1], F_GETFL, 0); + fcntl(_pep->signal_fds[1], F_SETFL, flags | O_NONBLOCK); + _pep->pep.fid.fclass = FI_CLASS_PEP; _pep->pep.fid.context = context; _pep->pep.fid.ops = &sock_pep_fi_ops; @@ -736,30 +1166,9 @@ int sock_msg_passive_ep(struct fid_fabric *fabric, struct fi_info *info, _pep->sock_fab = container_of(fabric, struct sock_fabric, fab_fid); *pep = &_pep->pep; - return 0; - err: free(_pep); - return -errno; + return ret; } -struct fi_info * sock_ep_msg_process_info(struct sock_conn_req *req) -{ - req->info.src_addr = &req->src_addr; - req->info.dest_addr = &req->dest_addr; - req->info.tx_attr = &req->tx_attr; - req->info.rx_attr = &req->rx_attr; - 
req->info.ep_attr = &req->ep_attr; - req->info.domain_attr = &req->domain_attr; - req->info.fabric_attr = &req->fabric_attr; - if (sock_verify_info(&req->info)) { - SOCK_LOG_INFO("incoming conn_req not supported\n"); - errno = EINVAL; - return NULL; - } - - /* reverse src_addr and dest_addr */ - return sock_fi_info(FI_EP_MSG, &req->info, - req->info.dest_addr, req->info.src_addr); -} diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep_rdm.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep_rdm.c index c69872947d..0a58ba666b 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep_rdm.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep_rdm.c @@ -75,7 +75,7 @@ const struct fi_tx_attr sock_rdm_tx_attr = { .op_flags = SOCK_DEF_OPS, .msg_order = SOCK_EP_MSG_ORDER, .inject_size = SOCK_EP_MAX_INJECT_SZ, - .size = SOCK_EP_MAX_TX_CTX_SZ, + .size = SOCK_EP_TX_SZ, .iov_limit = SOCK_EP_MAX_IOV_LIMIT, }; @@ -202,13 +202,12 @@ static struct fi_info *sock_rdm_fi_info(struct fi_info *hints, if (!_info) return NULL; - if (!hints->caps) - _info->caps = SOCK_EP_RDM_CAP; - + _info->caps = SOCK_EP_RDM_CAP; *(_info->tx_attr) = sock_rdm_tx_attr; *(_info->rx_attr) = sock_rdm_rx_attr; *(_info->ep_attr) = sock_rdm_ep_attr; + _info->caps |= (_info->rx_attr->caps | _info->tx_attr->caps); return _info; } @@ -249,9 +248,6 @@ int sock_rdm_getinfo(uint32_t version, const char *node, const char *service, return ret; } - src_addr = calloc(1, sizeof(struct sockaddr_in)); - dest_addr = calloc(1, sizeof(struct sockaddr_in)); - memset(&sock_hints, 0, sizeof(struct addrinfo)); sock_hints.ai_family = AF_INET; sock_hints.ai_socktype = SOCK_STREAM; @@ -286,6 +282,11 @@ int sock_rdm_getinfo(uint32_t version, const char *node, const char *service, goto err; } + src_addr = calloc(1, sizeof(struct sockaddr_in)); + if (!src_addr) { + ret = -FI_ENOMEM; + goto err; + } memcpy(src_addr, result->ai_addr, result->ai_addrlen); freeaddrinfo(result); 
} else if (node || service) { @@ -310,6 +311,11 @@ int sock_rdm_getinfo(uint32_t version, const char *node, const char *service, goto err; } + dest_addr = calloc(1, sizeof(struct sockaddr_in)); + if (!dest_addr) { + ret = -FI_ENOMEM; + goto err; + } memcpy(dest_addr, result->ai_addr, result->ai_addrlen); udp_sock = socket(AF_INET, SOCK_DGRAM, 0); @@ -321,7 +327,12 @@ int sock_rdm_getinfo(uint32_t version, const char *node, const char *service, goto err; } - len = sizeof(struct sockaddr_in); + len = sizeof(struct sockaddr_in); + src_addr = calloc(1, sizeof(struct sockaddr_in)); + if (!src_addr) { + ret = -FI_ENOMEM; + goto err; + } ret = getsockname(udp_sock, (struct sockaddr*)src_addr, &len); if (ret != 0) { SOCK_LOG_ERROR("getsockname failed\n"); @@ -334,11 +345,25 @@ int sock_rdm_getinfo(uint32_t version, const char *node, const char *service, } if (hints->src_addr) { + if (!src_addr) { + src_addr = calloc(1, sizeof(struct sockaddr_in)); + if (!src_addr) { + ret = -FI_ENOMEM; + goto err; + } + } assert(hints->src_addrlen == sizeof(struct sockaddr_in)); memcpy(src_addr, hints->src_addr, hints->src_addrlen); } if (hints->dest_addr) { + if (!dest_addr) { + dest_addr = calloc(1, sizeof(struct sockaddr_in)); + if (!dest_addr) { + ret = -FI_ENOMEM; + goto err; + } + } assert(hints->dest_addrlen == sizeof(struct sockaddr_in)); memcpy(dest_addr, hints->dest_addr, hints->dest_addrlen); } @@ -362,13 +387,17 @@ int sock_rdm_getinfo(uint32_t version, const char *node, const char *service, } *info = _info; - free(src_addr); - free(dest_addr); + if (src_addr) + free(src_addr); + if (dest_addr) + free(dest_addr); return 0; err: - free(src_addr); - free(dest_addr); + if (src_addr) + free(src_addr); + if (dest_addr) + free(dest_addr); SOCK_LOG_ERROR("fi_getinfo failed\n"); return ret; } @@ -426,12 +455,12 @@ int sock_rdm_ep(struct fid_domain *domain, struct fi_info *info, if (ret) return ret; - *ep = &endpoint->fid.ep; + *ep = &endpoint->ep; return 0; } int sock_rdm_sep(struct 
fid_domain *domain, struct fi_info *info, - struct fid_sep **sep, void *context) + struct fid_ep **sep, void *context) { int ret; struct sock_ep *endpoint; @@ -440,7 +469,7 @@ int sock_rdm_sep(struct fid_domain *domain, struct fi_info *info, if (ret) return ret; - *sep = &endpoint->fid.sep; + *sep = &endpoint->ep; return 0; } diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_eq.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_eq.c index 90c15467af..8ca2230463 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_eq.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_eq.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include "sock.h" @@ -56,18 +57,17 @@ ssize_t sock_eq_sread(struct fid_eq *eq, uint32_t *event, void *buf, size_t len, sock_eq = container_of(eq, struct sock_eq, eq); - fastlock_acquire(&sock_eq->lock); if(!dlistfd_empty(&sock_eq->err_list)) { - ret = -FI_EAVAIL; - goto out; + return -FI_EAVAIL; } - + if(dlistfd_empty(&sock_eq->list)) { ret = dlistfd_wait_avail(&sock_eq->list, timeout); if(ret <= 0) - goto out; + return ret; } + fastlock_acquire(&sock_eq->lock); list = sock_eq->list.list.next; entry = container_of(list, struct sock_eq_entry, entry); @@ -208,115 +208,6 @@ static struct fi_ops_eq sock_eq_ops = { .strerror = sock_eq_strerror, }; -ssize_t sock_eq_fd_sread(struct fid_eq *eq, uint32_t *event, void *buf, - size_t len, int timeout, uint64_t flags) -{ - struct sock_eq *sock_eq; - struct fid_ep *fid_ep; - struct sock_ep *sock_ep; - int ret; - struct sock_conn_req *req; - socklen_t addrlen; - struct sockaddr_in addr; - struct fi_eq_cm_entry *entry; - struct fi_eq_err_entry err; - - req = (struct sock_conn_req *)calloc(1, sizeof(struct sock_conn_req)); - if (!req) { - SOCK_LOG_ERROR("calloc for conn_req failed\n"); - errno = ENOMEM; - return 0; - } - sock_eq = container_of(eq, struct sock_eq, eq); - - addrlen = sizeof(struct sockaddr_in); - ret = 
sock_util_recvfrom(sock_eq->wait_fd, req, sizeof *req, &addr, &addrlen, - timeout); - - entry = (struct fi_eq_cm_entry *)buf; - switch (req->type) { - case SOCK_ACCEPT: - SOCK_LOG_INFO("received SOCK_ACCEPT\n"); - if (ret != sizeof req->type + sizeof req->c_fid + sizeof req->s_fid) { - SOCK_LOG_ERROR("recvfrom value invalid: %d\n", ret); - return 0; - } - *event = FI_CONNECTED; - entry->info = NULL; - entry->fid = req->c_fid; - fid_ep = container_of(req->c_fid, struct fid_ep, fid); - sock_ep = container_of(fid_ep, struct sock_ep, fid.ep); - sock_ep->connected = 1; - req->type = SOCK_CONNECTED; - if (sock_util_sendto(sock_eq->wait_fd, req, sizeof(req->type) + - sizeof(req->c_fid) + sizeof(req->s_fid), &addr, addrlen, 0)) - return 0; - free(req); - break; - case SOCK_CONNREQ: - SOCK_LOG_INFO("received SOCK_CONNREQ\n"); - if (ret != sizeof *req) { - SOCK_LOG_ERROR("recvfrom value invalid: %d\n", ret); - return 0; - } - *event = FI_CONNREQ; - entry->info = sock_ep_msg_process_info(req); - entry->info->connreq = (fi_connreq_t)req; - if (!entry->info) { - SOCK_LOG_ERROR("failed create new info\n"); - return -errno; - } - break; - case SOCK_REJECT: - SOCK_LOG_INFO("received SOCK_REJECT\n"); - if (ret != sizeof req->type + sizeof req->c_fid) { - SOCK_LOG_ERROR("recvfrom value invalid: %d\n", ret); - return 0; - } - err.fid = req->c_fid; - err.context = NULL; - err.data = 0; - err.err = -FI_ECONNREFUSED; - err.prov_errno = 0; - err.err_data = NULL; - sock_eq_report_event(sock_eq, 0, &err, sizeof err, 0); - free(req); - break; - case SOCK_CONNECTED: - SOCK_LOG_INFO("received SOCK_CONNECTED\n"); - *event = FI_CONNECTED; - entry->info = NULL; - entry->fid = req->s_fid; - fid_ep = container_of(req->s_fid, struct fid_ep, fid); - sock_ep = container_of(fid_ep, struct sock_ep, fid.ep); - sock_ep->connected = 1; - free(req); - break; - case SOCK_SHUTDOWN: - SOCK_LOG_INFO("received SOCK_SHUTDOWN\n"); - *event = FI_SHUTDOWN; - entry->info = NULL; - entry->fid = req->s_fid; - 
free(req); - break; - default: - SOCK_LOG_ERROR("unexpected req to EQ\n"); - free(req); - return 0; - } - - return sizeof *entry ; -} - -static struct fi_ops_eq sock_eq_fd_ops = { - .size = sizeof(struct fi_ops_eq), - .read = sock_eq_read, - .readerr = sock_eq_readerr, - .write = sock_eq_write, - .sread = sock_eq_fd_sread, - .strerror = sock_eq_strerror, -}; - int sock_eq_fi_close(struct fid *fid) { struct sock_eq *sock_eq; @@ -404,12 +295,12 @@ static struct fi_eq_attr _sock_eq_def_attr ={ .wait_set = NULL, }; -int sock_eq_openwait(struct sock_eq *eq, char *service) +int sock_eq_openwait(struct sock_eq *eq, const char *service) { SOCK_LOG_INFO("enter\n"); struct addrinfo *s_res = NULL, *p; struct addrinfo hints; - int optval; + int optval, ret; if (eq->wait_fd > 0 && !strncmp((char *)&eq->service, service, NI_MAXSERV)) { @@ -426,9 +317,10 @@ int sock_eq_openwait(struct sock_eq *eq, char *service) hints.ai_flags = AI_PASSIVE; hints.ai_protocol = IPPROTO_UDP; - if(getaddrinfo(NULL, service, &hints, &s_res)) { - SOCK_LOG_ERROR("no available AF_INET address\n"); - perror("no available AF_INET address"); + ret = getaddrinfo(NULL, service, &hints, &s_res); + if (ret) { + SOCK_LOG_ERROR("no available AF_INET address service:%s, %s\n", + service, gai_strerror(ret)); return -FI_EINVAL; } @@ -506,7 +398,6 @@ int sock_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, break; case FI_WAIT_FD: sock_eq->signal = 0; - sock_eq->eq.ops = &sock_eq_fd_ops; break; case FI_WAIT_MUTEX_COND: diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_fabric.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_fabric.c index 467685e810..8a06ddd96d 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_fabric.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_fabric.c @@ -44,6 +44,7 @@ const char const sock_fab_name[] = "IP"; const char const sock_dom_name[] = "sockets"; +const char const sock_prov_name[] = "sockets"; const 
struct fi_fabric_attr sock_fabric_attr = { .fabric = NULL, @@ -237,7 +238,7 @@ static void fi_sockets_fini(void) } struct fi_provider sock_prov = { - .name = "IP", + .name = sock_prov_name, .version = FI_VERSION(SOCK_MAJOR_VERSION, SOCK_MINOR_VERSION), .fi_version = FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), .getinfo = sock_getinfo, @@ -255,5 +256,10 @@ SOCKETS_INI sock_log_level = SOCK_ERROR; } + tmp = getenv("OFI_SOCK_PROGRESS_YIELD_TIME"); + if (tmp) { + sock_progress_thread_wait = atoi(tmp); + } + return (&sock_prov); } diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_msg.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_msg.c index a5c5447e95..8ec908f84d 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_msg.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_msg.c @@ -67,7 +67,7 @@ static ssize_t sock_ep_recvmsg(struct fid_ep *ep, const struct fi_msg *msg, switch (ep->fid.fclass) { case FI_CLASS_EP: - sock_ep = container_of(ep, struct sock_ep, fid.ep); + sock_ep = container_of(ep, struct sock_ep, ep); rx_ctx = sock_ep->rx_ctx; break; @@ -157,7 +157,7 @@ static ssize_t sock_ep_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, switch (ep->fid.fclass) { case FI_CLASS_EP: - sock_ep = container_of(ep, struct sock_ep, fid.ep); + sock_ep = container_of(ep, struct sock_ep, ep); tx_ctx = sock_ep->tx_ctx; break; @@ -177,7 +177,8 @@ static ssize_t sock_ep_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, } else { conn = sock_av_lookup_addr(tx_ctx->av, msg->addr); } - assert(conn); + if (!conn) + return -FI_EAGAIN; SOCK_LOG_INFO("New sendmsg on TX: %p using conn: %p\n", tx_ctx, conn); @@ -187,7 +188,7 @@ static ssize_t sock_ep_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, tx_op.op = SOCK_OP_SEND; total_len = 0; - if (flags & FI_INJECT) { + if (SOCK_INJECT_OK(flags)) { for (i=0; i< msg->iov_count; i++) { total_len += msg->msg_iov[i].iov_len; } @@ -221,7 +222,7 @@ static ssize_t 
sock_ep_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, sock_tx_ctx_write(tx_ctx, &msg->data, sizeof(uint64_t)); } - if (flags & FI_INJECT) { + if (SOCK_INJECT_OK(flags)) { for (i=0; i< msg->iov_count; i++) { sock_tx_ctx_write(tx_ctx, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len); @@ -317,8 +318,6 @@ struct fi_ops_msg sock_ep_msg_ops = { .sendmsg = sock_ep_sendmsg, .inject = sock_ep_inject, .senddata = sock_ep_senddata, - .rx_size_left = fi_no_msg_rx_size_left, - .tx_size_left = fi_no_msg_tx_size_left, }; static ssize_t sock_ep_trecvmsg(struct fid_ep *ep, @@ -331,7 +330,7 @@ static ssize_t sock_ep_trecvmsg(struct fid_ep *ep, switch (ep->fid.fclass) { case FI_CLASS_EP: - sock_ep = container_of(ep, struct sock_ep, fid.ep); + sock_ep = container_of(ep, struct sock_ep, ep); rx_ctx = sock_ep->rx_ctx; break; @@ -423,7 +422,7 @@ static ssize_t sock_ep_tsendmsg(struct fid_ep *ep, switch (ep->fid.fclass) { case FI_CLASS_EP: - sock_ep = container_of(ep, struct sock_ep, fid.ep); + sock_ep = container_of(ep, struct sock_ep, ep); tx_ctx = sock_ep->tx_ctx; break; @@ -439,10 +438,11 @@ static ssize_t sock_ep_tsendmsg(struct fid_ep *ep, assert(tx_ctx->enabled && msg->iov_count <= SOCK_EP_MAX_IOV_LIMIT); conn = sock_av_lookup_addr(tx_ctx->av, msg->addr); - assert(conn); + if (!conn) + return -FI_EAGAIN; total_len = 0; - if (flags & FI_INJECT) { + if (SOCK_INJECT_OK(flags)) { for (i=0; i< msg->iov_count; i++) { total_len += msg->msg_iov[i].iov_len; } @@ -479,7 +479,7 @@ static ssize_t sock_ep_tsendmsg(struct fid_ep *ep, } sock_tx_ctx_write(tx_ctx, &msg->tag, sizeof(uint64_t)); - if (flags & FI_INJECT) { + if (SOCK_INJECT_OK(flags)) { for (i=0; i< msg->iov_count; i++) { sock_tx_ctx_write(tx_ctx, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len); @@ -579,7 +579,7 @@ static ssize_t sock_ep_tsearch(struct fid_ep *ep, uint64_t *tag, uint64_t ignore switch (ep->fid.fclass) { case FI_CLASS_EP: - sock_ep = container_of(ep, struct sock_ep, fid.ep); + sock_ep = container_of(ep, 
struct sock_ep, ep); rx_ctx = sock_ep->rx_ctx; break; diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_poll.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_poll.c index 41774e34b9..ad7cbad9ba 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_poll.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_poll.c @@ -96,8 +96,7 @@ static int sock_poll_poll(struct fid_poll *pollset, void **context, int count) switch (list_item->fid->fclass) { case FI_CLASS_CQ: cq = container_of(list_item->fid, struct sock_cq, cq_fid); - if (cq->domain->progress_mode == FI_PROGRESS_MANUAL) - sock_cq_progress(cq); + sock_cq_progress(cq); fastlock_acquire(&cq->lock); if (rbfdused(&cq->cq_rbfd)) { *context++ = cq->cq_fid.fid.context; @@ -108,8 +107,7 @@ static int sock_poll_poll(struct fid_poll *pollset, void **context, int count) case FI_CLASS_CNTR: cntr = container_of(list_item->fid, struct sock_cntr, cntr_fid); - if (cntr->domain->progress_mode == FI_PROGRESS_MANUAL) - sock_cntr_progress(cntr); + sock_cntr_progress(cntr); fastlock_acquire(&cntr->mut); if (atomic_get(&cntr->value) >= atomic_get(&cntr->threshold)) { *context++ = cntr->cntr_fid.fid.context; diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_progress.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_progress.c index a9f287cfd8..983280dc72 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_progress.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_progress.c @@ -111,6 +111,7 @@ static void sock_pe_release_entry(struct sock_pe *pe, else pe_entry->conn->rx_pe_entry = NULL; + pe->num_free_entries++; pe_entry->conn = NULL; memset(&pe_entry->pe.rx, 0, sizeof(pe_entry->pe.rx)); memset(&pe_entry->pe.tx, 0, sizeof(pe_entry->pe.tx)); @@ -124,7 +125,7 @@ static void sock_pe_release_entry(struct sock_pe *pe, pe_entry->buf = 0; dlist_remove(&pe_entry->entry); - dlist_insert_tail(&pe_entry->entry, 
&pe->free_list); + dlist_insert_head(&pe_entry->entry, &pe->free_list); SOCK_LOG_INFO("progress entry %p released\n", pe_entry); } @@ -133,11 +134,16 @@ static struct sock_pe_entry *sock_pe_acquire_entry(struct sock_pe *pe) struct dlist_entry *entry; struct sock_pe_entry *pe_entry; + if (dlist_empty(&pe->free_list)) + return NULL; + + pe->num_free_entries--; entry = pe->free_list.next; pe_entry = container_of(entry, struct sock_pe_entry, entry); dlist_remove(&pe_entry->entry); dlist_insert_tail(&pe_entry->entry, &pe->busy_list); - SOCK_LOG_INFO("progress entry %p acquired \n", pe_entry); + SOCK_LOG_INFO("progress entry %p acquired : %lu\n", pe_entry, + PE_INDEX(pe, pe_entry)); return pe_entry; } @@ -145,16 +151,17 @@ static void sock_pe_report_tx_completion(struct sock_pe_entry *pe_entry) { int ret1 = 0, ret2 = 0; - if (pe_entry->comp->send_cq && - (!pe_entry->comp->send_cq_event || - (pe_entry->comp->send_cq_event && - (pe_entry->msg_hdr.flags & FI_COMPLETION)))) - ret1 = pe_entry->comp->send_cq->report_completion( - pe_entry->comp->send_cq, pe_entry->addr, pe_entry); + if (!(pe_entry->flags & FI_INJECT)) { + if (pe_entry->comp->send_cq && + (!pe_entry->comp->send_cq_event || + (pe_entry->comp->send_cq_event && + (pe_entry->msg_hdr.flags & FI_COMPLETION)))) + ret1 = pe_entry->comp->send_cq->report_completion( + pe_entry->comp->send_cq, pe_entry->addr, pe_entry); + } if (pe_entry->comp->send_cntr) ret2 = sock_cntr_inc(pe_entry->comp->send_cntr); - if (ret1 < 0 || ret2 < 0) { SOCK_LOG_ERROR("Failed to report completion %p\n", @@ -175,8 +182,8 @@ static void sock_pe_report_rx_completion(struct sock_pe_entry *pe_entry) if (pe_entry->comp->recv_cq && (!pe_entry->comp->recv_cq_event || - (pe_entry->comp->recv_cq_event && - (pe_entry->msg_hdr.flags & FI_COMPLETION)))) + (pe_entry->comp->recv_cq_event && + (pe_entry->msg_hdr.flags & FI_COMPLETION)))) ret1 = pe_entry->comp->recv_cq->report_completion( pe_entry->comp->recv_cq, pe_entry->addr, pe_entry); @@ -197,7 +204,7 @@ 
static void sock_pe_report_rx_completion(struct sock_pe_entry *pe_entry) } } -void sock_pe_report_mr_completion(struct sock_domain *domain, +static void sock_pe_report_mr_completion(struct sock_domain *domain, struct sock_pe_entry *pe_entry) { int i; @@ -219,7 +226,7 @@ void sock_pe_report_mr_completion(struct sock_domain *domain, } } -void sock_pe_report_remote_write(struct sock_rx_ctx *rx_ctx, +static void sock_pe_report_remote_write(struct sock_rx_ctx *rx_ctx, struct sock_pe_entry *pe_entry) { pe_entry->buf = pe_entry->pe.rx.rx_iov[0].iov.addr; @@ -246,7 +253,26 @@ void sock_pe_report_remote_write(struct sock_rx_ctx *rx_ctx, sock_cntr_inc(pe_entry->comp->rem_write_cntr); } -void sock_pe_report_remote_read(struct sock_rx_ctx *rx_ctx, +static void sock_pe_report_write_completion(struct sock_pe_entry *pe_entry) +{ + if (!(pe_entry->flags & FI_INJECT)) { + sock_pe_report_tx_completion(pe_entry); + + if (pe_entry->comp->write_cq && + (pe_entry->comp->send_cq != pe_entry->comp->write_cq) && + (!pe_entry->comp->write_cq_event || + (pe_entry->comp->write_cq_event && + (pe_entry->msg_hdr.flags & FI_COMPLETION)))) + pe_entry->comp->write_cq->report_completion( + pe_entry->comp->write_cq, pe_entry->addr, pe_entry); + } + + if (pe_entry->comp->write_cntr && + pe_entry->comp->write_cntr != pe_entry->comp->send_cntr) + sock_cntr_inc(pe_entry->comp->write_cntr); +} + +static void sock_pe_report_remote_read(struct sock_rx_ctx *rx_ctx, struct sock_pe_entry *pe_entry) { pe_entry->buf = pe_entry->pe.rx.rx_iov[0].iov.addr; @@ -273,6 +299,25 @@ void sock_pe_report_remote_read(struct sock_rx_ctx *rx_ctx, sock_cntr_inc(pe_entry->comp->rem_read_cntr); } +static void sock_pe_report_read_completion(struct sock_pe_entry *pe_entry) +{ + if (!(pe_entry->flags & FI_INJECT)) { + sock_pe_report_tx_completion(pe_entry); + + if (pe_entry->comp->read_cq && + (pe_entry->comp->read_cq != pe_entry->comp->send_cq) && + (!pe_entry->comp->read_cq_event || + (pe_entry->comp->read_cq_event && + 
(pe_entry->msg_hdr.flags & FI_COMPLETION)))) + pe_entry->comp->read_cq->report_completion( + pe_entry->comp->read_cq, pe_entry->addr, pe_entry); + } + + if (pe_entry->comp->read_cntr && + pe_entry->comp->read_cntr != pe_entry->comp->send_cntr) + sock_cntr_inc(pe_entry->comp->read_cntr); +} + static void sock_pe_report_error(struct sock_pe_entry *pe_entry, int rem) { if (pe_entry->comp->recv_cntr) @@ -288,7 +333,9 @@ static void sock_pe_progress_pending_ack(struct sock_pe *pe, int len, data_len, i; struct sock_conn *conn = pe_entry->conn; - assert(conn); + if (!conn) + return; + if (conn->tx_pe_entry != NULL && conn->tx_pe_entry != pe_entry) { SOCK_LOG_INFO("Cannot progress %p as conn %p is being used by %p\n", pe_entry, conn, conn->tx_pe_entry); @@ -341,6 +388,7 @@ static void sock_pe_progress_pending_ack(struct sock_pe *pe, } static void sock_pe_send_response(struct sock_pe *pe, + struct sock_rx_ctx *rx_ctx, struct sock_pe_entry *pe_entry, size_t data_len, uint8_t op_type) { @@ -354,7 +402,9 @@ static void sock_pe_send_response(struct sock_pe *pe, response->msg_hdr.version = SOCK_WIRE_PROTO_VERSION; response->msg_hdr.op_type = op_type; response->msg_hdr.msg_len = htonll(response->msg_hdr.msg_len); - response->msg_hdr.rx_id = htons(pe_entry->msg_hdr.rx_id); + response->msg_hdr.rx_id = pe_entry->msg_hdr.rx_id; + response->msg_hdr.ep_id = htons(sock_av_lookup_ep_id(rx_ctx->av, + pe_entry->addr)); pe->pe_atomic = NULL; pe_entry->done_len = 0; @@ -397,8 +447,7 @@ static int sock_pe_handle_ack(struct sock_pe *pe, struct sock_pe_entry *pe_entry waiting_entry, response->pe_entry_id); assert(waiting_entry->type == SOCK_PE_TX); - if (!(waiting_entry->flags & FI_INJECT)) - sock_pe_report_tx_completion(waiting_entry); + sock_pe_report_tx_completion(waiting_entry); waiting_entry->is_complete = 1; pe_entry->is_complete = 1; return 0; @@ -432,8 +481,30 @@ static int sock_pe_handle_read_complete(struct sock_pe *pe, return 0; len += waiting_entry->pe.tx.data.tx_iov[i].dst.iov.len; 
} + + sock_pe_report_read_completion(waiting_entry); + waiting_entry->is_complete = 1; + pe_entry->is_complete = 1; + return 0; +} + +static int sock_pe_handle_write_complete(struct sock_pe *pe, + struct sock_pe_entry *pe_entry) +{ + struct sock_pe_entry *waiting_entry; + struct sock_msg_response *response; + + if (sock_pe_read_response(pe_entry)) + return 0; + + response = &pe_entry->response; + assert(response->pe_entry_id <= SOCK_PE_MAX_ENTRIES); + waiting_entry = &pe->pe_table[response->pe_entry_id]; + SOCK_LOG_INFO("Received ack for PE entry %p (index: %d)\n", + waiting_entry, response->pe_entry_id); - sock_pe_report_tx_completion(waiting_entry); + assert(waiting_entry->type == SOCK_PE_TX); + sock_pe_report_write_completion(waiting_entry); waiting_entry->is_complete = 1; pe_entry->is_complete = 1; return 0; @@ -471,14 +542,16 @@ static int sock_pe_handle_atomic_complete(struct sock_pe *pe, len += (waiting_entry->pe.tx.data.tx_iov[i].res.ioc.count * datatype_sz); } - if (!(waiting_entry->flags & FI_INJECT)) - sock_pe_report_tx_completion(waiting_entry); + if (waiting_entry->pe.rx.rx_op.atomic.res_iov_len) + sock_pe_report_read_completion(waiting_entry); + else + sock_pe_report_write_completion(waiting_entry); + waiting_entry->is_complete = 1; pe_entry->is_complete = 1; return 0; } - static int sock_pe_process_rx_read(struct sock_pe *pe, struct sock_rx_ctx *rx_ctx, struct sock_pe_entry *pe_entry) { @@ -507,7 +580,8 @@ static int sock_pe_process_rx_read(struct sock_pe *pe, struct sock_rx_ctx *rx_ct (void*)pe_entry->pe.rx.rx_iov[i].iov.addr, pe_entry->pe.rx.rx_iov[i].iov.len, pe_entry->pe.rx.rx_iov[i].iov.key); - sock_pe_send_response(pe, pe_entry, 0, SOCK_OP_READ_ERROR); + sock_pe_send_response(pe, rx_ctx, pe_entry, 0, + SOCK_OP_READ_ERROR); return -FI_EINVAL; } @@ -524,8 +598,8 @@ static int sock_pe_process_rx_read(struct sock_pe *pe, struct sock_rx_ctx *rx_ct } sock_pe_report_remote_read(rx_ctx, pe_entry); - sock_pe_send_response(pe, pe_entry, data_len, - 
SOCK_OP_READ_COMPLETE); + sock_pe_send_response(pe, rx_ctx, pe_entry, data_len, + SOCK_OP_READ_COMPLETE); return 0; } @@ -563,7 +637,7 @@ static int sock_pe_process_rx_write(struct sock_pe *pe, struct sock_rx_ctx *rx_c (void*)pe_entry->pe.rx.rx_iov[i].iov.addr, pe_entry->pe.rx.rx_iov[i].iov.len, pe_entry->pe.rx.rx_iov[i].iov.key); - sock_pe_send_response(pe, pe_entry, 0, + sock_pe_send_response(pe, rx_ctx, pe_entry, 0, SOCK_OP_WRITE_ERROR); break; } @@ -598,7 +672,8 @@ static int sock_pe_process_rx_write(struct sock_pe *pe, struct sock_rx_ctx *rx_c out: sock_pe_report_remote_write(rx_ctx, pe_entry); sock_pe_report_mr_completion(rx_ctx->domain, pe_entry); - sock_pe_send_response(pe, pe_entry, 0, SOCK_OP_WRITE_COMPLETE); + sock_pe_send_response(pe, rx_ctx, pe_entry, 0, + SOCK_OP_WRITE_COMPLETE); return ret; } @@ -898,6 +973,14 @@ static int sock_pe_update_atomic(void *cmp, void *dst, void *src, break; } + case FI_LONG_DOUBLE: + { + long double *_cmp, *_dst, *_src; + _cmp = cmp, _src = src, _dst = dst; + SOCK_ATOMIC_UPDATE_FLOAT(_cmp, _src, _dst); + break; + } + default: SOCK_LOG_ERROR("Atomic datatype not supported\n"); break; @@ -968,7 +1051,8 @@ static int sock_pe_process_rx_atomic(struct sock_pe *pe, struct sock_rx_ctx *rx_ (void*)pe_entry->pe.rx.rx_iov[i].ioc.addr, pe_entry->pe.rx.rx_iov[i].ioc.count * datatype_sz, pe_entry->pe.rx.rx_iov[i].ioc.key); - sock_pe_send_response(pe, pe_entry, 0, SOCK_OP_ATOMIC_ERROR); + sock_pe_send_response(pe, rx_ctx, pe_entry, 0, + SOCK_OP_ATOMIC_ERROR); goto err; } if (mr->flags & FI_MR_OFFSET) @@ -1003,7 +1087,7 @@ static int sock_pe_process_rx_atomic(struct sock_pe *pe, struct sock_rx_ctx *rx_ sock_pe_report_remote_write(rx_ctx, pe_entry); sock_pe_report_mr_completion(rx_ctx->domain, pe_entry); - sock_pe_send_response(pe, pe_entry, + sock_pe_send_response(pe, rx_ctx, pe_entry, pe_entry->pe.rx.rx_op.atomic.res_iov_len ? 
entry_len : 0, SOCK_OP_ATOMIC_COMPLETE); return ret; @@ -1019,24 +1103,26 @@ int sock_pe_progress_buffered_rx(struct sock_rx_ctx *rx_ctx) struct dlist_entry *entry; struct sock_pe_entry pe_entry; struct sock_rx_entry *rx_buffered, *rx_posted; - int i, rem, offset, len, used_len, dst_offset; + int i, rem = 0, offset, len, used_len, dst_offset; if (dlist_empty(&rx_ctx->rx_entry_list) || dlist_empty(&rx_ctx->rx_buffered_list)) - goto out; + return 0; for (entry = rx_ctx->rx_buffered_list.next; entry != &rx_ctx->rx_buffered_list;) { rx_buffered = container_of(entry, struct sock_rx_entry, entry); entry = entry->next; + + if (!rx_buffered->is_complete) + continue; rx_posted = sock_rx_get_entry(rx_ctx, rx_buffered->addr, rx_buffered->tag); if (!rx_posted) continue; - rx_ctx->buffered_len -= rem; SOCK_LOG_INFO("Consuming buffered entry: %p, ctx: %p\n", rx_buffered, rx_ctx); SOCK_LOG_INFO("Consuming posted entry: %p, ctx: %p\n", @@ -1044,6 +1130,7 @@ int sock_pe_progress_buffered_rx(struct sock_rx_ctx *rx_ctx) offset = 0; rem = rx_buffered->iov[0].iov.len; + rx_ctx->buffered_len -= rem; used_len = rx_posted->used; for (i = 0; i < rx_posted->rx_op.dest_iov_len && rem > 0; i++) { if (used_len >= rx_posted->rx_op.dest_iov_len) { @@ -1085,7 +1172,7 @@ int sock_pe_progress_buffered_rx(struct sock_rx_ctx *rx_ctx) if (rem) { SOCK_LOG_INFO("Not enough space in posted recv buffer\n"); sock_pe_report_error(&pe_entry, rem); - goto out; + return 0; } else { sock_pe_report_rx_completion(&pe_entry); } @@ -1096,8 +1183,6 @@ int sock_pe_progress_buffered_rx(struct sock_rx_ctx *rx_ctx) if (pe_entry.flags & FI_MULTI_RECV) sock_rx_release_entry(rx_posted); } - -out: return 0; } @@ -1128,21 +1213,23 @@ static int sock_pe_process_rx_send(struct sock_pe *pe, struct sock_rx_ctx *rx_ct if (pe_entry->done_len == len && !pe_entry->pe.rx.rx_entry) { data_len = pe_entry->msg_hdr.msg_len - len; - fastlock_acquire(&rx_ctx->lock); /* progress buffered recvs, if any */ + fastlock_acquire(&rx_ctx->lock); 
sock_pe_progress_buffered_rx(rx_ctx); - rx_entry = sock_rx_get_entry(rx_ctx, pe_entry->addr, pe_entry->tag); - SOCK_LOG_INFO("Consuming posted entry: %p\n", rx_entry); + SOCK_LOG_INFO("Consuming posted entry: %p\n", rx_entry); + if (!rx_entry) { SOCK_LOG_INFO("%p: No matching recv, buffering recv (len=%llu)\n", pe_entry, (long long unsigned int)data_len); rx_entry = sock_rx_new_buffered_entry(rx_ctx, data_len); - if (!rx_entry) + if (!rx_entry) { + fastlock_release(&rx_ctx->lock); return -FI_ENOMEM; + } rx_entry->addr = pe_entry->addr; rx_entry->tag = pe_entry->tag; @@ -1151,10 +1238,9 @@ static int sock_pe_process_rx_send(struct sock_pe *pe, struct sock_rx_ctx *rx_ct rx_entry->comp = pe_entry->comp; pe_entry->context = rx_entry->context; } + fastlock_release(&rx_ctx->lock); pe_entry->context = rx_entry->context; pe_entry->pe.rx.rx_entry = rx_entry; - rx_entry->is_busy = 1; - fastlock_release(&rx_ctx->lock); } rx_entry = pe_entry->pe.rx.rx_entry; @@ -1203,6 +1289,7 @@ static int sock_pe_process_rx_send(struct sock_pe *pe, struct sock_rx_ctx *rx_ct fastlock_release(&rx_ctx->lock); pe_entry->is_complete = 1; + rx_entry->is_complete = 1; rx_entry->is_busy = 0; /* report error, if any */ @@ -1214,12 +1301,13 @@ static int sock_pe_process_rx_send(struct sock_pe *pe, struct sock_rx_ctx *rx_ct if (!rx_entry->is_buffered) sock_pe_report_rx_completion(pe_entry); } - - if (pe_entry->msg_hdr.flags & FI_REMOTE_COMPLETE) { - sock_pe_send_response(pe, pe_entry, 0, SOCK_OP_SEND_COMPLETE); - } - + out: + if (pe_entry->msg_hdr.flags & FI_REMOTE_COMPLETE) { + sock_pe_send_response(pe, rx_ctx, pe_entry, 0, + SOCK_OP_SEND_COMPLETE); + } + if (!rx_entry->is_buffered && (!(rx_entry->flags & FI_MULTI_RECV) || (pe_entry->flags & FI_MULTI_RECV))) @@ -1261,13 +1349,13 @@ static int sock_pe_process_recv(struct sock_pe *pe, struct sock_rx_ctx *rx_ctx, break; case SOCK_OP_SEND_COMPLETE: - case SOCK_OP_WRITE_COMPLETE: - case SOCK_OP_WRITE_ERROR: - case SOCK_OP_READ_ERROR: - case 
SOCK_OP_ATOMIC_ERROR: ret = sock_pe_handle_ack(pe, pe_entry); break; + case SOCK_OP_WRITE_COMPLETE: + ret = sock_pe_handle_write_complete(pe, pe_entry); + break; + case SOCK_OP_READ_COMPLETE: ret = sock_pe_handle_read_complete(pe, pe_entry); break; @@ -1276,6 +1364,12 @@ static int sock_pe_process_recv(struct sock_pe *pe, struct sock_rx_ctx *rx_ctx, ret = sock_pe_handle_atomic_complete(pe, pe_entry); break; + case SOCK_OP_WRITE_ERROR: + case SOCK_OP_READ_ERROR: + case SOCK_OP_ATOMIC_ERROR: + ret = sock_pe_handle_ack(pe, pe_entry); + break; + default: ret = -FI_ENOSYS; SOCK_LOG_ERROR("Operation not supported\n"); @@ -1306,9 +1400,10 @@ static int sock_pe_peek_hdr(struct sock_pe *pe, return -1; msg_hdr->msg_len = ntohll(msg_hdr->msg_len); - msg_hdr->rx_id = ntohs(msg_hdr->rx_id); + msg_hdr->rx_id = msg_hdr->rx_id; msg_hdr->flags = ntohll(msg_hdr->flags); msg_hdr->pe_entry_id = ntohs(msg_hdr->pe_entry_id); + msg_hdr->ep_id = ntohs(msg_hdr->ep_id); SOCK_LOG_INFO("PE RX (Hdr peek): MsgLen: %lu, TX-ID: %d, Type: %d\n", msg_hdr->msg_len, msg_hdr->rx_id, msg_hdr->op_type); @@ -1318,6 +1413,9 @@ static int sock_pe_peek_hdr(struct sock_pe *pe, static int sock_pe_read_hdr(struct sock_pe *pe, struct sock_rx_ctx *rx_ctx, struct sock_pe_entry *pe_entry) { + int match; + struct sock_ep *ep; + struct dlist_entry *entry; struct sock_msg_hdr *msg_hdr; struct sock_conn *conn = pe_entry->conn; @@ -1334,14 +1432,35 @@ static int sock_pe_read_hdr(struct sock_pe *pe, struct sock_rx_ctx *rx_ctx, if (msg_hdr->rx_id != rx_ctx->rx_id) return -1; + + if (rx_ctx->ctx.fid.fclass == FI_CLASS_SRX_CTX) { + match = 0; + for (entry = rx_ctx->ep_list.next; + entry != &rx_ctx->ep_list; entry = entry->next) { + ep = container_of(entry, struct sock_ep, rx_ctx_entry); + if (ep->ep_id == msg_hdr->ep_id) { + match = 1; + break; + } + } + if (!match) + return -1; + } else { + if (msg_hdr->ep_id != rx_ctx->ep->ep_id) { + SOCK_LOG_INFO("Mismatch: %d:%d\n", + msg_hdr->ep_id,rx_ctx->ep->ep_id); + return -1; + } 
+ } sock_pe_recv_field(pe_entry, (void*)msg_hdr, sizeof(struct sock_msg_hdr), 0); msg_hdr->msg_len = ntohll(msg_hdr->msg_len); - msg_hdr->rx_id = ntohs(msg_hdr->rx_id); + msg_hdr->rx_id = msg_hdr->rx_id; msg_hdr->flags = ntohll(msg_hdr->flags); msg_hdr->pe_entry_id = ntohs(msg_hdr->pe_entry_id); + msg_hdr->ep_id = ntohs(msg_hdr->ep_id); pe_entry->pe.rx.header_read = 1; SOCK_LOG_INFO("PE RX (Hdr read): MsgLen: %lu, TX-ID: %d, Type: %d\n", @@ -1397,7 +1516,7 @@ static int sock_pe_progress_tx_atomic(struct sock_pe *pe, } /* data */ - if (pe_entry->flags & FI_INJECT) { + if (SOCK_INJECT_OK(pe_entry->flags)) { if (sock_pe_send_field(pe_entry, &pe_entry->pe.tx.data.inject[0], pe_entry->pe.tx.tx_op.src_iov_len, len)) @@ -1454,7 +1573,7 @@ static int sock_pe_progress_tx_write(struct sock_pe *pe, len += dest_iov_len; /* data */ - if (pe_entry->flags & FI_INJECT) { + if (SOCK_INJECT_OK(pe_entry->flags)) { if (sock_pe_send_field(pe_entry, &pe_entry->pe.tx.data.inject[0], pe_entry->pe.tx.tx_op.src_iov_len, len)) return 0; @@ -1541,7 +1660,7 @@ static int sock_pe_progress_tx_send(struct sock_pe *pe, len += SOCK_CQ_DATA_SIZE; } - if (pe_entry->flags & FI_INJECT) { + if (SOCK_INJECT_OK(pe_entry->flags)) { if (sock_pe_send_field(pe_entry, pe_entry->pe.tx.data.inject, pe_entry->pe.tx.tx_op.src_iov_len, len)) return 0; @@ -1567,8 +1686,7 @@ static int sock_pe_progress_tx_send(struct sock_pe *pe, SOCK_LOG_INFO("Send complete\n"); if (!(pe_entry->flags & FI_REMOTE_COMPLETE)) { - if (!(pe_entry->flags & FI_INJECT)) - sock_pe_report_tx_completion(pe_entry); + sock_pe_report_tx_completion(pe_entry); pe_entry->is_complete = 1; } } @@ -1583,10 +1701,9 @@ static int sock_pe_progress_tx_entry(struct sock_pe *pe, int ret; struct sock_conn *conn = pe_entry->conn; - if (pe_entry->pe.tx.send_done) + if (!pe_entry->conn || pe_entry->pe.tx.send_done) return 0; - assert(pe_entry->conn); if (conn->tx_pe_entry != NULL && conn->tx_pe_entry != pe_entry) { SOCK_LOG_INFO("Cannot progress %p as conn %p is 
being used by %p\n", pe_entry, conn, conn->tx_pe_entry); @@ -1598,6 +1715,12 @@ static int sock_pe_progress_tx_entry(struct sock_pe *pe, conn->tx_pe_entry = pe_entry; } + if ((pe_entry->flags & FI_FENCE) && + (tx_ctx->pe_entry_list.next != &pe_entry->ctx_entry)) { + SOCK_LOG_INFO("Waiting for FI_FENCE\n"); + return 0; + } + if (!pe_entry->pe.tx.header_sent) { if (sock_pe_send_field(pe_entry, &pe_entry->msg_hdr, sizeof(struct sock_msg_hdr), 0)) @@ -1633,15 +1756,51 @@ static int sock_pe_progress_tx_entry(struct sock_pe *pe, return ret; } +static int sock_pe_progress_rx_pe_entry(struct sock_pe *pe, + struct sock_pe_entry *pe_entry, + struct sock_rx_ctx *rx_ctx) +{ + int ret; + + if (pe_entry->pe.rx.pending_send) { + sock_pe_progress_pending_ack(pe, pe_entry); + if (pe_entry->is_complete) { + sock_pe_release_entry(pe, pe_entry); + SOCK_LOG_INFO("[%p] RX done\n", pe_entry); + } + return 0; + } + + if (!pe_entry->pe.rx.header_read) { + if (sock_pe_read_hdr(pe, rx_ctx, pe_entry) == -1) { + sock_pe_release_entry(pe, pe_entry); + return 0; + } + } + + if (pe_entry->pe.rx.header_read) { + ret = sock_pe_process_recv(pe, rx_ctx, pe_entry); + if (ret < 0) + return ret; + } + + if (pe_entry->is_complete) { + sock_pe_release_entry(pe, pe_entry); + SOCK_LOG_INFO("[%p] RX done\n", pe_entry); + } + return 0; +} + static int sock_pe_new_rx_entry(struct sock_pe *pe, struct sock_rx_ctx *rx_ctx, struct sock_ep *ep, struct sock_conn *conn, int key) { + int ret; struct sock_pe_entry *pe_entry; pe_entry = sock_pe_acquire_entry(pe); if (!pe_entry) { - SOCK_LOG_ERROR("Error in getting PE entry\n"); - return -FI_EINVAL; + SOCK_LOG_INFO("Cannot get PE entry\n"); + return 0; } memset(&pe_entry->pe.rx, 0, sizeof(struct sock_rx_pe_entry)); @@ -1652,12 +1811,12 @@ static int sock_pe_new_rx_entry(struct sock_pe *pe, struct sock_rx_ctx *rx_ctx, pe_entry->is_complete = 0; pe_entry->done_len = 0; - if (ep->ep_type == FI_EP_MSG) + if (ep->ep_type == FI_EP_MSG || !ep->av) pe_entry->addr = 
FI_ADDR_NOTAVAIL; else pe_entry->addr = sock_av_lookup_key(ep->av, key); - if (ep->ep_attr.rx_ctx_cnt == FI_SHARED_CONTEXT) + if (rx_ctx->ctx.fid.fclass == FI_CLASS_SRX_CTX) pe_entry->comp = &ep->comp; else pe_entry->comp = &rx_ctx->comp; @@ -1671,7 +1830,9 @@ static int sock_pe_new_rx_entry(struct sock_pe *pe, struct sock_rx_ctx *rx_ctx, /* link to tracking list in rx_ctx */ dlist_init(&pe_entry->ctx_entry); dlist_insert_tail(&pe_entry->ctx_entry, &rx_ctx->pe_entry_list); - return 0; + + ret = sock_pe_progress_rx_pe_entry(pe, pe_entry, rx_ctx); + return ret; } static int sock_pe_new_tx_entry(struct sock_pe *pe, struct sock_tx_ctx *tx_ctx) @@ -1680,12 +1841,11 @@ static int sock_pe_new_tx_entry(struct sock_pe *pe, struct sock_tx_ctx *tx_ctx) struct sock_msg_hdr *msg_hdr; struct sock_pe_entry *pe_entry; struct sock_ep *ep; - uint16_t rx_id; pe_entry = sock_pe_acquire_entry(pe); if (!pe_entry) { - SOCK_LOG_ERROR("Failed to get free PE entry \n"); - return -FI_EINVAL; + SOCK_LOG_INFO("Cannot get free PE entry \n"); + return 0; } memset(&pe_entry->pe.tx, 0, sizeof(struct sock_tx_pe_entry)); @@ -1698,6 +1858,7 @@ static int sock_pe_new_tx_entry(struct sock_pe *pe, struct sock_tx_ctx *tx_ctx) pe_entry->ep = tx_ctx->ep; pe_entry->pe.tx.tx_ctx = tx_ctx; + dlist_init(&pe_entry->ctx_entry); dlist_insert_tail(&pe_entry->ctx_entry, &tx_ctx->pe_entry_list); /* fill in PE tx entry */ @@ -1717,9 +1878,9 @@ static int sock_pe_new_tx_entry(struct sock_pe *pe, struct sock_tx_ctx *tx_ctx) rbfdread(&tx_ctx->rbfd, &pe_entry->buf, sizeof(uint64_t)); rbfdread(&tx_ctx->rbfd, &ep, sizeof(uint64_t)); - if (ep && ep->ep_attr.tx_ctx_cnt == FI_SHARED_CONTEXT) + if (ep && tx_ctx->fid.stx.fid.fclass == FI_CLASS_STX_CTX) pe_entry->comp = &ep->comp; - else + else pe_entry->comp = &tx_ctx->comp; if (pe_entry->flags & FI_REMOTE_CQ_DATA) { @@ -1738,7 +1899,7 @@ static int sock_pe_new_tx_entry(struct sock_pe *pe, struct sock_tx_ctx *tx_ctx) case SOCK_OP_SEND: case SOCK_OP_TSEND: - if (pe_entry->flags 
& FI_INJECT) { + if (SOCK_INJECT_OK(pe_entry->flags)) { rbfdread(&tx_ctx->rbfd, &pe_entry->pe.tx.data.inject[0], pe_entry->pe.tx.tx_op.src_iov_len); msg_hdr->msg_len += pe_entry->pe.tx.tx_op.src_iov_len; @@ -1754,7 +1915,7 @@ static int sock_pe_new_tx_entry(struct sock_pe *pe, struct sock_tx_ctx *tx_ctx) case SOCK_OP_WRITE: - if (pe_entry->flags & FI_INJECT) { + if (SOCK_INJECT_OK(pe_entry->flags)) { rbfdread(&tx_ctx->rbfd, &pe_entry->pe.tx.data.inject[0], pe_entry->pe.tx.tx_op.src_iov_len); msg_hdr->msg_len += pe_entry->pe.tx.tx_op.src_iov_len; @@ -1796,7 +1957,7 @@ static int sock_pe_new_tx_entry(struct sock_pe *pe, struct sock_tx_ctx *tx_ctx) case SOCK_OP_ATOMIC: msg_hdr->msg_len += sizeof(struct sock_op); datatype_sz = fi_datatype_size(pe_entry->pe.tx.tx_op.atomic.datatype); - if (pe_entry->flags & FI_INJECT) { + if (SOCK_INJECT_OK(pe_entry->flags)) { rbfdread(&tx_ctx->rbfd, &pe_entry->pe.tx.data.inject[0], pe_entry->pe.tx.tx_op.src_iov_len); msg_hdr->msg_len += pe_entry->pe.tx.tx_op.src_iov_len; @@ -1845,18 +2006,21 @@ static int sock_pe_new_tx_entry(struct sock_pe *pe, struct sock_tx_ctx *tx_ctx) msg_hdr->version = SOCK_WIRE_PROTO_VERSION; if (tx_ctx->av) { - rx_id = (uint16_t)SOCK_GET_RX_ID(pe_entry->addr, - tx_ctx->av->rx_ctx_bits); - msg_hdr->rx_id = htons(rx_id); + msg_hdr->rx_id = (uint16_t)SOCK_GET_RX_ID(pe_entry->addr, + tx_ctx->av->rx_ctx_bits); + msg_hdr->ep_id = sock_av_lookup_ep_id(tx_ctx->av, pe_entry->addr); } else { msg_hdr->rx_id = 0; + msg_hdr->ep_id = ep->rem_ep_id; } - msg_hdr->dest_iov_len = pe_entry->pe.tx.tx_op.src_iov_len; + + msg_hdr->dest_iov_len = pe_entry->pe.tx.tx_op.dest_iov_len; msg_hdr->flags = htonll(pe_entry->flags); pe_entry->total_len = msg_hdr->msg_len; msg_hdr->msg_len = htonll(msg_hdr->msg_len); msg_hdr->pe_entry_id = htons(msg_hdr->pe_entry_id); - return 0; + msg_hdr->ep_id = htons(msg_hdr->ep_id); + return sock_pe_progress_tx_entry(pe, tx_ctx, pe_entry); } void sock_pe_add_tx_ctx(struct sock_pe *pe, struct sock_tx_ctx 
*ctx) @@ -1875,28 +2039,44 @@ void sock_pe_add_rx_ctx(struct sock_pe *pe, struct sock_rx_ctx *ctx) SOCK_LOG_INFO("RX ctx added to PE\n"); } +void sock_pe_remove_tx_ctx(struct sock_tx_ctx *tx_ctx) +{ + fastlock_acquire(&tx_ctx->domain->pe->lock); + dlist_remove(&tx_ctx->pe_entry); + fastlock_release(&tx_ctx->domain->pe->lock); +} + +void sock_pe_remove_rx_ctx(struct sock_rx_ctx *rx_ctx) +{ + fastlock_acquire(&rx_ctx->domain->pe->lock); + dlist_remove(&rx_ctx->pe_entry); + fastlock_release(&rx_ctx->domain->pe->lock); +} + int sock_pe_progress_rx_ep(struct sock_pe *pe, struct sock_ep *ep, struct sock_rx_ctx *rx_ctx) { struct sock_conn *conn; struct sock_conn_map *map; int i, ret = 0, data_avail; - struct pollfd poll_fd; map = &ep->domain->r_cmap; assert(map != NULL); - poll_fd.events = POLLIN; for (i=0; iused; i++) { conn = &map->table[i]; + if (!conn) + continue; + + if (rbused(&conn->outbuf)) + sock_comm_flush(conn); data_avail = 0; if (rbused(&conn->inbuf) > 0) { data_avail = 1; } else { - poll_fd.fd = conn->sock_fd; - ret = poll(&poll_fd, 1, 0); - if (ret < 0) { + ret = fi_poll_fd(conn->sock_fd, 0); + if (ret < 0 && errno != EINTR) { SOCK_LOG_INFO("Error polling fd: %d\n", conn->sock_fd); return ret; @@ -1904,7 +2084,8 @@ int sock_pe_progress_rx_ep(struct sock_pe *pe, struct sock_ep *ep, data_avail = (ret == 1); } - if (data_avail && conn->rx_pe_entry == NULL) { + if (data_avail && conn->rx_pe_entry == NULL && + !dlist_empty(&pe->free_list)) { /* new RX PE entry */ ret = sock_pe_new_rx_entry(pe, rx_ctx, ep, conn, i); if (ret < 0) @@ -1931,15 +2112,18 @@ int sock_pe_progress_rx_ctx(struct sock_pe *pe, struct sock_rx_ctx *rx_ctx) /* check for incoming data */ if (rx_ctx->ctx.fid.fclass == FI_CLASS_SRX_CTX) { for (entry = rx_ctx->ep_list.next; - entry != &rx_ctx->ep_list; entry = entry->next) { + entry != &rx_ctx->ep_list; ) { ep = container_of(entry, struct sock_ep, rx_ctx_entry); - if ((ret = sock_pe_progress_rx_ep(pe, ep, rx_ctx)) < 0) + entry = entry->next; + ret 
= sock_pe_progress_rx_ep(pe, ep, rx_ctx); + if (ret < 0) goto out; } } else { ep = rx_ctx->ep; - if ((ret = sock_pe_progress_rx_ep(pe, ep, rx_ctx)) < 0) + ret = sock_pe_progress_rx_ep(pe, ep, rx_ctx); + if (ret < 0) goto out; } @@ -1949,34 +2133,9 @@ int sock_pe_progress_rx_ctx(struct sock_pe *pe, struct sock_rx_ctx *rx_ctx) pe_entry = container_of(entry, struct sock_pe_entry, ctx_entry); entry = entry->next; - - if (pe_entry->pe.rx.pending_send) { - sock_pe_progress_pending_ack(pe, pe_entry); - if (pe_entry->is_complete) { - sock_pe_release_entry(pe, pe_entry); - SOCK_LOG_INFO("[%p] RX done\n", pe_entry); - } - continue; - } - - - if (!pe_entry->pe.rx.header_read) { - if (sock_pe_read_hdr(pe, rx_ctx, pe_entry) == -1) { - sock_pe_release_entry(pe, pe_entry); - continue; - } - } - - if (pe_entry->pe.rx.header_read) { - ret = sock_pe_process_recv(pe, rx_ctx, pe_entry); - if (ret < 0) - goto out; - } - - if (pe_entry->is_complete) { - sock_pe_release_entry(pe, pe_entry); - SOCK_LOG_INFO("[%p] RX done\n", pe_entry); - } + ret = sock_pe_progress_rx_pe_entry(pe, pe_entry, rx_ctx); + if (ret < 0) + goto out; } out: @@ -1996,8 +2155,8 @@ int sock_pe_progress_tx_ctx(struct sock_pe *pe, struct sock_tx_ctx *tx_ctx) /* check tx_ctx rbuf */ fastlock_acquire(&tx_ctx->rlock); - while (!rbfdempty(&tx_ctx->rbfd) && - !dlist_empty(&pe->free_list)) { + if (!rbfdempty(&tx_ctx->rbfd) && + pe->num_free_entries > SOCK_PE_MIN_ENTRIES) { /* new TX PE entry */ ret = sock_pe_new_tx_entry(pe, tx_ctx); if (ret < 0) { @@ -2019,7 +2178,7 @@ int sock_pe_progress_tx_ctx(struct sock_pe *pe, struct sock_tx_ctx *tx_ctx) SOCK_LOG_ERROR("Error in progressing %p\n", pe_entry); goto out; } - + if (pe_entry->is_complete) { sock_pe_release_entry(pe, pe_entry); SOCK_LOG_INFO("[%p] TX done\n", pe_entry); @@ -2044,6 +2203,11 @@ static void *sock_pe_progress_thread(void *data) SOCK_LOG_INFO("Progress thread started\n"); while (pe->do_progress) { + if (sock_progress_thread_wait) { + pthread_yield(); + 
usleep(sock_progress_thread_wait * 1000); + } + /* progress tx */ if (!dlistfd_empty(&pe->tx_list)) { for (entry = pe->tx_list.list.next; @@ -2091,9 +2255,10 @@ static void sock_pe_init_table( dlist_init(&pe->busy_list); for (i=0; ipe_table[i].entry, &pe->free_list); + dlist_insert_head(&pe->pe_table[i].entry, &pe->free_list); } + pe->num_free_entries = SOCK_PE_MAX_ENTRIES; SOCK_LOG_INFO("PE table init: OK\n"); } @@ -2104,7 +2269,6 @@ struct sock_pe *sock_pe_init(struct sock_domain *domain) return NULL; sock_pe_init_table(pe); - dlistfd_head_init(&pe->tx_list); dlistfd_head_init(&pe->rx_list); fastlock_init(&pe->lock); diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_rma.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_rma.c index dd37754b7e..0a4987f00d 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_rma.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_rma.c @@ -71,7 +71,7 @@ static ssize_t sock_ep_rma_readmsg(struct fid_ep *ep, switch (ep->fid.fclass) { case FI_CLASS_EP: - sock_ep = container_of(ep, struct sock_ep, fid.ep); + sock_ep = container_of(ep, struct sock_ep, ep); tx_ctx = sock_ep->tx_ctx; break; @@ -90,7 +90,8 @@ static ssize_t sock_ep_rma_readmsg(struct fid_ep *ep, msg->rma_iov_count <= SOCK_EP_MAX_IOV_LIMIT); conn = sock_av_lookup_addr(tx_ctx->av, msg->addr); - assert(conn); + if (!conn) + return -FI_EAGAIN; total_len = sizeof(struct sock_op_send) + (msg->iov_count * sizeof(union sock_iov)) + @@ -214,7 +215,7 @@ static ssize_t sock_ep_rma_writemsg(struct fid_ep *ep, switch (ep->fid.fclass) { case FI_CLASS_EP: - sock_ep = container_of(ep, struct sock_ep, fid.ep); + sock_ep = container_of(ep, struct sock_ep, ep); tx_ctx = sock_ep->tx_ctx; break; @@ -236,7 +237,9 @@ static ssize_t sock_ep_rma_writemsg(struct fid_ep *ep, } else { conn = sock_av_lookup_addr(tx_ctx->av, msg->addr); } - assert(conn); + + if (!conn) + return -FI_EAGAIN; flags |= tx_ctx->attr.op_flags; memset(&tx_op, 0, 
sizeof(struct sock_op)); @@ -244,7 +247,7 @@ static ssize_t sock_ep_rma_writemsg(struct fid_ep *ep, tx_op.dest_iov_len = msg->rma_iov_count; total_len = 0; - if (flags & FI_INJECT) { + if (SOCK_INJECT_OK(flags)) { for (i=0; i< msg->iov_count; i++) { total_len += msg->msg_iov[i].iov_len; } @@ -277,11 +280,11 @@ static ssize_t sock_ep_rma_writemsg(struct fid_ep *ep, } src_len = 0; - if (flags & FI_INJECT) { + if (SOCK_INJECT_OK(flags)) { for (i=0; i< msg->iov_count; i++) { sock_tx_ctx_write(tx_ctx, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len); - src_len += tx_iov.iov.len; + src_len += msg->msg_iov[i].iov_len; } } else { for (i = 0; i< msg->iov_count; i++) { @@ -349,16 +352,22 @@ static ssize_t sock_ep_rma_writev(struct fid_ep *ep, size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { + int i; + size_t len; struct fi_msg_rma msg; struct fi_rma_iov rma_iov; msg.msg_iov = iov; msg.desc = desc; msg.iov_count = count; + msg.rma_iov_count = 1; + + for (i = 0, len = 0; i < count; i++) + len += iov[i].iov_len; rma_iov.addr = addr; rma_iov.key = key; - rma_iov.len = 1; + rma_iov.len = len; msg.rma_iov = &rma_iov; msg.context = context; @@ -380,10 +389,11 @@ static ssize_t sock_ep_rma_writedata(struct fid_ep *ep, const void *buf, msg_iov.iov_len = len; msg.desc = &desc; msg.iov_count = 1; + msg.rma_iov_count = 1; rma_iov.addr = addr; rma_iov.key = key; - rma_iov.len = 1; + rma_iov.len = len; msg.rma_iov = &rma_iov; msg.msg_iov = &msg_iov; @@ -407,10 +417,11 @@ static ssize_t sock_ep_rma_inject(struct fid_ep *ep, const void *buf, msg_iov.iov_len = len; msg.msg_iov = &msg_iov; msg.iov_count = 1; + msg.rma_iov_count = 1; rma_iov.addr = addr; rma_iov.key = key; - rma_iov.len = 1; + rma_iov.len = len; msg.rma_iov = &rma_iov; msg.msg_iov = &msg_iov; @@ -431,10 +442,11 @@ static ssize_t sock_ep_rma_injectdata(struct fid_ep *ep, const void *buf, msg_iov.iov_len = len; msg.msg_iov = &msg_iov; msg.iov_count = 1; + msg.rma_iov_count = 1; rma_iov.addr = 
addr; rma_iov.key = key; - rma_iov.len = 1; + rma_iov.len = len; msg.rma_iov = &rma_iov; msg.msg_iov = &msg_iov; diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_rx_entry.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_rx_entry.c index ba98d9e4c0..536fa1d33c 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_rx_entry.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_rx_entry.c @@ -93,6 +93,7 @@ struct sock_rx_entry *sock_rx_new_buffered_entry(struct sock_rx_ctx *rx_ctx, rx_ctx->buffered_len += len; dlist_insert_tail(&rx_entry->entry, &rx_ctx->rx_buffered_list); + rx_entry->is_busy = 1; return rx_entry; } @@ -126,6 +127,7 @@ struct sock_rx_entry *sock_rx_get_entry(struct sock_rx_ctx *rx_ctx, if (entry == &rx_ctx->rx_entry_list) rx_entry = NULL; - + else + rx_entry->is_busy = 1; return rx_entry; } diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_util.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_util.c index 744a22d24d..a6de05640a 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_util.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_util.c @@ -55,100 +55,4 @@ #include "sock_util.h" int sock_log_level = SOCK_ERROR; -char host[128] = {0}; -#define SOCK_SENDTO_TIMEOUT 5 - -int sock_util_sendto(int fd, void *buf, size_t len, struct sockaddr_in *addr, - socklen_t addrlen, int timeout) -{ - struct timeval tv; - fd_set writefds; - socklen_t optlen; - int optval; - - if (sendto(fd, buf, len, 0, addr, addrlen) < 0) { - SOCK_LOG_ERROR("sendto failed with error %d - %s\n", errno, - strerror(errno)); - return -errno; - } - - if (timeout) { - tv.tv_sec = 0; - tv.tv_usec = timeout; - } else { - tv.tv_sec = SOCK_SENDTO_TIMEOUT; - tv.tv_usec = 0; - } - FD_ZERO(&writefds); - FD_SET(fd, &writefds); - if (select(fd+1, NULL, &writefds, NULL, &tv) > 0) { - optlen = sizeof(int); - getsockopt(fd, SOL_SOCKET, SO_ERROR, &optval, &optlen); - - if 
(optval) { - SOCK_LOG_ERROR("failed to sendto %d - %s\n", optval, - strerror(optval)); - close(fd); - return -errno; - } - } else { - SOCK_LOG_ERROR("Timeout or error to sendto %d - %s\n", optval, - strerror(optval)); - close(fd); - errno = ETIMEDOUT; - return -FI_ETIMEDOUT; - } - - return 0; -} - -int sock_util_recvfrom(int fd, void *buf, size_t len, struct sockaddr_in *addr, - socklen_t *addrlen, int timeout) -{ - struct timeval tv; - struct timeval *tv_ptr; - fd_set readfds; - socklen_t optlen; - int optval; - int ret; - - if (timeout < 0) { - /* negative timeout means an infinite timeout */ - tv_ptr = NULL; - } else { - tv.tv_sec = 0; - tv.tv_usec = timeout; - tv_ptr = &tv; - } - - FD_ZERO(&readfds); - FD_SET(fd, &readfds); - if (select(fd+1, &readfds, NULL, NULL, tv_ptr) > 0) { - optlen = sizeof(int); - getsockopt(fd, SOL_SOCKET, SO_ERROR, &optval, &optlen); - - if (optval) { - SOCK_LOG_ERROR("failed to connect %d - %s\n", optval, - strerror(optval)); - close(fd); - return 0; - } - - } else { - SOCK_LOG_ERROR("Timeout or error to connect %d - %s\n", optval, - strerror(optval)); - close(fd); - errno = ETIMEDOUT; - return 0; - } - - /* read */ - ret = recvfrom(fd, buf, len, 0, addr, addrlen); - if (ret < 0) { - SOCK_LOG_ERROR("error recvfrom for sread: %d - %s\n", errno, - strerror(errno)); - return 0; - } - - return ret; -} +useconds_t sock_progress_thread_wait = 0; diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_util.h b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_util.h index 720202d2d6..7c6aee815f 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_util.h +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_util.h @@ -44,6 +44,7 @@ #define SOCK_INFO (3) extern int sock_log_level; +extern useconds_t sock_progress_thread_wait; #define SOCK_LOG_INFO(...) 
do { \ if (sock_log_level >= SOCK_INFO) { \ @@ -68,7 +69,3 @@ extern int sock_log_level; #endif -int sock_util_sendto(int fd, void *buf, size_t len, struct sockaddr_in *addr, - socklen_t addrlen, int timeout); -int sock_util_recvfrom(int fd, void *buf, size_t len, struct sockaddr_in *addr, - socklen_t *addrlen, int timeout); diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_wait.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_wait.c index 823da52af6..fa3d2bce44 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_wait.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_wait.c @@ -127,15 +127,13 @@ static int sock_wait_wait(struct fid_wait *wait_fid, int timeout) case FI_CLASS_CQ: cq = container_of(list_item->fid, struct sock_cq, cq_fid); - if (cq->domain->progress_mode == FI_PROGRESS_MANUAL) - sock_cq_progress(cq); + sock_cq_progress(cq); break; case FI_CLASS_CNTR: cntr = container_of(list_item->fid, struct sock_cntr, cntr_fid); - if (cntr->domain->progress_mode == FI_PROGRESS_MANUAL) - sock_cntr_progress(cntr); + sock_cntr_progress(cntr); break; } } @@ -220,8 +218,9 @@ int sock_wait_close(fid_t fid) wait = container_of(fid, struct sock_wait, wait_fid.fid); head = &wait->fid_list; - for (p = head->next; p != head; p = p->next) { + for (p = head->next; p != head;) { list_item = container_of(p, struct sock_fid_list, entry); + p = p->next; free(list_item); } @@ -284,7 +283,7 @@ int sock_wait_open(struct fid_fabric *fabric, struct fi_wait_attr *attr, free(wait); return err; } - + wait->wait_fid.fid.fclass = FI_CLASS_WAIT; wait->wait_fid.fid.context = 0; wait->wait_fid.fid.ops = &sock_wait_fi_ops; @@ -292,6 +291,7 @@ int sock_wait_open(struct fid_fabric *fabric, struct fi_wait_attr *attr, wait->fab = fab; wait->type = wait_obj_type; atomic_inc(&fab->ref); + dlist_init(&wait->fid_list); *waitset = &wait->wait_fid; return 0; diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/fi_usnic.h 
b/opal/mca/common/libfabric/libfabric/prov/usnic/src/fi_ext_usnic.h similarity index 90% rename from opal/mca/common/libfabric/libfabric/prov/usnic/src/fi_usnic.h rename to opal/mca/common/libfabric/libfabric/prov/usnic/src/fi_ext_usnic.h index c71cb49d94..59ac5b751a 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/fi_usnic.h +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/fi_ext_usnic.h @@ -30,18 +30,20 @@ * SOFTWARE. */ -#ifndef _FI_USNIC_H_ -#define _FI_USNIC_H_ +#ifndef _FI_EXT_USNIC_H_ +#define _FI_EXT_USNIC_H_ #include #include #define FI_PROTO_RUDP 100 +#define FI_USNIC_INFO_VERSION 1 + /* * usNIC specific info */ -struct fi_usnic_info { +struct fi_usnic_info_v1 { uint32_t ui_link_speed; uint32_t ui_netmask_be; char ui_ifname[IFNAMSIZ]; @@ -51,6 +53,13 @@ struct fi_usnic_info { uint32_t ui_cq_per_vf; }; +struct fi_usnic_info { + uint32_t ui_version; + union { + struct fi_usnic_info_v1 v1; + } ui; +}; + /* * usNIC-specific AV ops */ @@ -69,4 +78,4 @@ struct fi_usnic_ops_av { int (*get_distance)(struct fid_av *av, void *addr, int *metric); }; -#endif /* _FI_USNIC_H_ */ +#endif /* _FI_EXT_USNIC_H_ */ diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_av.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_av.c index 9977d3b0e1..d4817e072f 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_av.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_av.c @@ -63,8 +63,7 @@ #include "usdf_av.h" #include "usdf_timer.h" -/* would like to move to include/rdma */ -#include "fi_usnic.h" +#include "fi_ext_usnic.h" static void usdf_av_insert_async_complete(struct usdf_av_insert *insert) diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_cq.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_cq.c index 8fbf8a221f..a6ef765988 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_cq.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_cq.c @@ -82,7 
+82,23 @@ usdf_cq_readerr(struct fid_cq *fcq, struct fi_cq_err_entry *entry, entry->op_context = cq->cq_comp.uc_context; entry->flags = 0; entry->err = FI_EIO; - entry->prov_errno = cq->cq_comp.uc_status; + switch (cq->cq_comp.uc_status) { + case USD_COMPSTAT_SUCCESS: + entry->prov_errno = FI_SUCCESS; + break; + case USD_COMPSTAT_ERROR_CRC: + entry->prov_errno = FI_ECRC; + break; + case USD_COMPSTAT_ERROR_TRUNC: + entry->prov_errno = FI_ETRUNC; + break; + case USD_COMPSTAT_ERROR_TIMEOUT: + entry->prov_errno = FI_ETIMEDOUT; + break; + case USD_COMPSTAT_ERROR_INTERNAL: + entry->prov_errno = FI_EOTHER; + break; + } cq->cq_comp.uc_status = 0; diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_dgram.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_dgram.c index f0445421e5..471fa84c8f 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_dgram.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_dgram.c @@ -142,9 +142,11 @@ usdf_ep_dgram_enable(struct fid_ep *fep) fail: if (ep->e.dg.ep_hdr_ptr != NULL) { free(ep->e.dg.ep_hdr_ptr); + ep->e.dg.ep_hdr_ptr = NULL; } if (ep->e.dg.ep_qp != NULL) { usd_destroy_qp(ep->e.dg.ep_qp); + ep->e.dg.ep_qp = NULL; } return ret; } @@ -286,6 +288,20 @@ static struct fi_ops_ep usdf_base_dgram_ops = { .setopt = fi_no_setopt, .tx_ctx = fi_no_tx_ctx, .rx_ctx = fi_no_rx_ctx, + .rx_size_left = usdf_dgram_rx_size_left, + .tx_size_left = usdf_dgram_tx_size_left, +}; + +static struct fi_ops_ep usdf_base_dgram_prefix_ops = { + .size = sizeof(struct fi_ops_ep), + .enable = usdf_ep_dgram_enable, + .cancel = fi_no_cancel, + .getopt = fi_no_getopt, + .setopt = fi_no_setopt, + .tx_ctx = fi_no_tx_ctx, + .rx_ctx = fi_no_rx_ctx, + .rx_size_left = usdf_dgram_prefix_rx_size_left, + .tx_size_left = usdf_dgram_prefix_tx_size_left, }; static struct fi_ops_msg usdf_dgram_ops = { @@ -299,8 +315,6 @@ static struct fi_ops_msg usdf_dgram_ops = { .inject = usdf_dgram_inject, .senddata = 
usdf_dgram_senddata, .injectdata = fi_no_msg_injectdata, - .rx_size_left = usdf_dgram_rx_size_left, - .tx_size_left = usdf_dgram_tx_size_left, }; static struct fi_ops_msg usdf_dgram_prefix_ops = { @@ -314,8 +328,6 @@ static struct fi_ops_msg usdf_dgram_prefix_ops = { .inject = usdf_dgram_inject, .senddata = usdf_dgram_senddata, .injectdata = fi_no_msg_injectdata, - .rx_size_left = usdf_dgram_prefix_rx_size_left, - .tx_size_left = usdf_dgram_prefix_tx_size_left, }; static struct fi_ops_cm usdf_cm_dgram_ops = { @@ -369,7 +381,6 @@ usdf_ep_dgram_open(struct fid_domain *domain, struct fi_info *info, ep->ep_fid.fid.fclass = FI_CLASS_EP; ep->ep_fid.fid.context = context; ep->ep_fid.fid.ops = &usdf_ep_dgram_ops; - ep->ep_fid.ops = &usdf_base_dgram_ops; ep->ep_fid.cm = &usdf_cm_dgram_ops; ep->ep_domain = udp; ep->ep_caps = info->caps; @@ -393,9 +404,11 @@ usdf_ep_dgram_open(struct fid_domain *domain, struct fi_info *info, goto fail; } + ep->ep_fid.ops = &usdf_base_dgram_prefix_ops; info->ep_attr->msg_prefix_size = USDF_HDR_BUF_ENTRY; ep->ep_fid.msg = &usdf_dgram_prefix_ops; } else { + ep->ep_fid.ops = &usdf_base_dgram_ops; ep->ep_fid.msg = &usdf_dgram_ops; } atomic_init(&ep->ep_refcnt, 0); diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_msg.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_msg.c index 0501f677bd..6858e87a05 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_msg.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_msg.c @@ -581,6 +581,8 @@ static struct fi_ops_ep usdf_base_msg_ops = { .setopt = usdf_ep_msg_setopt, .tx_ctx = fi_no_tx_ctx, .rx_ctx = fi_no_rx_ctx, + .rx_size_left = fi_no_rx_size_left, + .tx_size_left = fi_no_tx_size_left, }; static struct fi_ops_cm usdf_cm_msg_ops = { diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_rdm.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_rdm.c index 7fdcc0ea66..2c73c96244 100644 --- 
a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_rdm.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_rdm.c @@ -638,6 +638,8 @@ static struct fi_ops_ep usdf_base_rdm_ops = { .setopt = usdf_ep_rdm_setopt, .tx_ctx = fi_no_tx_ctx, .rx_ctx = fi_no_rx_ctx, + .rx_size_left = fi_no_rx_size_left, + .tx_size_left = fi_no_tx_size_left, }; static struct fi_ops_cm usdf_cm_rdm_ops = { diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_fabric.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_fabric.c index 9d13b43070..be3a0bbb22 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_fabric.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_fabric.c @@ -66,7 +66,7 @@ #include "libnl_utils.h" #include "usdf.h" -#include "fi_usnic.h" +#include "fi_ext_usnic.h" #include "usdf_progress.h" #include "usdf_timer.h" #include "usdf_dgram.h" @@ -780,12 +780,12 @@ usdf_usnic_getinfo(struct fid_fabric *fabric, struct fi_usnic_info *uip) fp = fab_ftou(fabric); dap = fp->fab_dev_attrs; - uip->ui_link_speed = dap->uda_bandwidth; - uip->ui_netmask_be = dap->uda_netmask_be; - strcpy(uip->ui_ifname, dap->uda_ifname); - uip->ui_num_vf = dap->uda_num_vf; - uip->ui_qp_per_vf = dap->uda_qp_per_vf; - uip->ui_cq_per_vf = dap->uda_cq_per_vf; + uip->ui.v1.ui_link_speed = dap->uda_bandwidth; + uip->ui.v1.ui_netmask_be = dap->uda_netmask_be; + strcpy(uip->ui.v1.ui_ifname, dap->uda_ifname); + uip->ui.v1.ui_num_vf = dap->uda_num_vf; + uip->ui.v1.ui_qp_per_vf = dap->uda_qp_per_vf; + uip->ui.v1.ui_cq_per_vf = dap->uda_cq_per_vf; return 0; } diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_msg.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_msg.c index ef95a3cc1e..2fb873a4e1 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_msg.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_msg.c @@ -142,11 +142,16 @@ _usdf_msg_post_recv(struct usdf_rx *rx, void *buf, size_t 
len) rq->urq_post_index = (rq->urq_post_index + 1) & rq->urq_post_index_mask; - desc = vnic_rq_next_desc(vrq); + desc = rq->urq_next_desc; rq_enet_desc_enc(desc, (dma_addr_t) buf, RQ_ENET_TYPE_ONLY_SOP, len); wmb(); - vnic_rq_post(vrq, buf, 0, (dma_addr_t) buf, len, 0); + iowrite32(rq->urq_post_index, &vrq->ctrl->posted_index); + + rq->urq_next_desc = (struct rq_enet_desc *) + ((uintptr_t)rq->urq_desc_ring + + ((rq->urq_post_index)<<4)); + rq->urq_recv_credits -= 1; return 0; } diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_pep.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_pep.c index 0f04d40993..e884d57703 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_pep.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_pep.c @@ -58,7 +58,7 @@ #include "fi.h" #include "fi_enosys.h" -#include "fi_usnic.h" +#include "fi_ext_usnic.h" #include "usnic_direct.h" #include "usd.h" #include "usdf.h" diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_rdm.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_rdm.c index 1b5199ac0a..3fd6cb9de3 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_rdm.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_rdm.c @@ -391,11 +391,16 @@ _usdf_rdm_post_recv(struct usdf_rx *rx, void *buf, size_t len) rq->urq_post_index = (rq->urq_post_index + 1) & rq->urq_post_index_mask; - desc = vnic_rq_next_desc(vrq); + desc = rq->urq_next_desc; rq_enet_desc_enc(desc, (dma_addr_t) buf, RQ_ENET_TYPE_ONLY_SOP, len); wmb(); - vnic_rq_post(vrq, buf, 0, (dma_addr_t) buf, len, 0); + iowrite32(rq->urq_post_index, &vrq->ctrl->posted_index); + + rq->urq_next_desc = (struct rq_enet_desc *) + ((uintptr_t)rq->urq_desc_ring + + ((rq->urq_post_index)<<4)); + rq->urq_recv_credits -= 1; return 0; } diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/kcompat.h b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/kcompat.h 
index 2bfc00d519..4f4136e8cc 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/kcompat.h +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/kcompat.h @@ -201,12 +201,20 @@ static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys return true; } #endif /*CONFIG_RFS_ACCEL*/ - -#if ((RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 5))) -#define skb_get_rxhash(skb) (skb)->rxhash -#endif /*RHEL_RELEASE_VERSION == 6.5*/ #endif /*LINUX >= 3.3.0*/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0)) +#define skb_get_hash_raw(skb) (skb)->rxhash +#endif + +#if !defined(__VMKLNX__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 24)) +#define enic_wq_lock(wq_lock) spin_lock_irqsave(wq_lock, flags) +#define enic_wq_unlock(wq_lock) spin_unlock_irqrestore(wq_lock, flags) +#else +#define enic_wq_lock(wq_lock) spin_lock(wq_lock) +#define enic_wq_unlock(wq_lock) spin_unlock(wq_lock) +#endif /* ! vmklnx && kernel < 2.6.24 */ + #ifdef CONFIG_RFS_ACCEL #if ((RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 5) \ && RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7, 0))) @@ -235,10 +243,26 @@ static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys #define napi_hash_add(napi) do {} while(0) #define skb_mark_napi_id(skb, napi) do {} while(0) #endif /*CONFIG_NET_RX_BUSY_POLL*/ + #if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 00)) #define __vlan_hwaccel_put_tag(a, b, c) __vlan_hwaccel_put_tag(a, c); #endif /* KERNEL < 3.9.0 */ +#if ((LINUX_VERSION_CODE <= KERNEL_VERSION(3, 4, 0)) && \ + (!RHEL_RELEASE_CODE || RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(6, 0))) +#define net_warn_ratelimited(fmt, ...) 
\ + do { \ + if (net_ratelimit()) \ + pr_warn(fmt, ##__VA_ARGS__); \ + } while (0) +#endif /* kernel <= 3.4 && rhel < 6.0 */ + +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 26)) +#define enic_pci_dma_mapping_error(pdev, dma) pci_dma_mapping_error(dma) +#else +#define enic_pci_dma_mapping_error(pdev, dma) pci_dma_mapping_error(pdev, dma) +#endif /* Kernel version <= 2.6.26 */ + /* Kernel version-specific definitions */ #if (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 14)) static inline signed long schedule_timeout_uninterruptible(signed long timeout) @@ -454,6 +478,8 @@ struct napi_struct { #undef pr_err #define pr_err(fmt, ...) \ printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) +#undef pr_warn +#define pr_warn pr_warning #undef pr_warning #define pr_warning(fmt, ...) \ printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd.h b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd.h index b630e83822..318d5064c1 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd.h +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd.h @@ -201,6 +201,9 @@ struct usd_rq { char *urq_rxbuf; char **urq_post_addr; + uint32_t urq_recv_credits; /* number of available descriptors */ + struct rq_enet_desc *urq_desc_ring; + struct rq_enet_desc *urq_next_desc; uint32_t urq_post_index; /* next rxbuf to post */ uint32_t urq_post_index_mask; uint32_t urq_last_comp; diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_poll.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_poll.c index 4bded53d51..a43aafa49c 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_poll.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_poll.c @@ -99,7 +99,7 @@ usd_desc_to_rq_comp( rcvbuf_len = 0; do { rq_enet_desc_dec( (struct rq_enet_desc *) - ((uintptr_t)rq->urq_vnic_rq.ring.descs + 
(i<<4)), + ((uintptr_t)rq->urq_desc_ring + (i<<4)), &bus_addr, &type, &len); rcvbuf_len += len; i = (i - 1) & rq->urq_post_index_mask; @@ -127,7 +127,7 @@ usd_desc_to_rq_comp( * reported as released until next RX */ credits = (q_index - rq->urq_last_comp) & rq->urq_post_index_mask; - rq->urq_vnic_rq.ring.desc_avail += credits; + rq->urq_recv_credits += credits; rq->urq_last_comp = q_index; return 0; diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_post.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_post.c index 024b5601bb..3a3f66ff61 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_post.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_post.c @@ -59,12 +59,10 @@ usd_get_recv_credits( struct usd_qp *uqp) { struct usd_qp_impl *qp; - struct vnic_rq *vrq; qp = to_qpi(uqp); - vrq = &qp->uq_rq.urq_vnic_rq; - return vrq->ring.desc_avail; + return qp->uq_rq.urq_recv_credits; } int @@ -77,45 +75,48 @@ usd_post_recv( struct vnic_rq *vrq; struct rq_enet_desc *desc; struct iovec *iovp; + uint32_t index; + uint32_t count; unsigned i; qp = to_qpi(uqp); rq = &qp->uq_rq; vrq = &rq->urq_vnic_rq; + desc = rq->urq_next_desc; + index = rq->urq_post_index; + + iovp = recv_list->urd_iov; + count = 0; while (recv_list != NULL) { - - iovp = recv_list->urd_iov; - - /* XXX - this should be rewritten along the lines of post_send */ - - rq->urq_context[rq->urq_post_index] = recv_list->urd_context; - rq->urq_post_index = (rq->urq_post_index + 1) - & rq->urq_post_index_mask; - - desc = vnic_rq_next_desc(vrq); + rq->urq_context[index] = recv_list->urd_context; rq_enet_desc_enc(desc, (dma_addr_t) iovp[0].iov_base, RQ_ENET_TYPE_ONLY_SOP, iovp[0].iov_len); - wmb(); - vnic_rq_post(vrq, iovp[0].iov_base, 0, - (dma_addr_t) iovp[0].iov_base, iovp[0].iov_len, 0); + count++; + + index = (index+1) & rq->urq_post_index_mask; + desc = (struct rq_enet_desc *) ((uintptr_t)rq->urq_desc_ring + + 
(index<<4)); for (i = 1; i < recv_list->urd_iov_cnt; ++i) { - - rq->urq_context[rq->urq_post_index] = recv_list->urd_context; - rq->urq_post_index = (rq->urq_post_index + 1) - & rq->urq_post_index_mask; - - desc = vnic_rq_next_desc(vrq); + rq->urq_context[index] = recv_list->urd_context; rq_enet_desc_enc(desc, (dma_addr_t) iovp[i].iov_base, RQ_ENET_TYPE_NOT_SOP, iovp[i].iov_len); - wmb(); - vnic_rq_post(vrq, iovp[i].iov_base, 0, - (dma_addr_t) iovp[i].iov_base, iovp[i].iov_len, - 0); + count++; + + index = (index+1) & rq->urq_post_index_mask; + desc = (struct rq_enet_desc *) ((uintptr_t)rq->urq_desc_ring + + (index<<4)); } recv_list = recv_list->urd_next; } + wmb(); + iowrite32(index, &vrq->ctrl->posted_index); + + rq->urq_next_desc = desc; + rq->urq_post_index = index; + rq->urq_recv_credits -= count; + return 0; } diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_post.h b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_post.h index 77124783cf..961155df01 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_post.h +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_post.h @@ -162,8 +162,8 @@ _usd_post_send_iov( } wq_enet_desc_enc(desc, (uintptr_t)(iov[i].iov_base), - iov[i].iov_len, mss, header_length, offload_mode, - 1, cq_entry, fcoe_encap, vlan_tag_insert, vlan_tag, loopback); + iov[i].iov_len, mss, header_length, offload_mode, + 1, cq_entry, fcoe_encap, vlan_tag_insert, vlan_tag, loopback); wmb(); diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_queues.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_queues.c index ce22a25c70..0115a127a0 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_queues.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_queues.c @@ -472,27 +472,76 @@ usd_create_wq( return ret; } +static int +usd_vnic_rq_init( + struct usd_rq 
*rq, + struct usd_vf *vf, + uint64_t desc_ring) +{ + struct vnic_rq *vrq; + int ret; + + vrq = &rq->urq_vnic_rq; + + /* get address of control register */ + vrq->ctrl = vnic_dev_get_res(vf->vf_vdev, RES_TYPE_RQ, rq->urq_index); + if (vrq->ctrl == NULL) + return -EINVAL; + + ret = vnic_rq_disable(vrq); + if (ret != 0) + return ret; + + writeq(desc_ring, &vrq->ctrl->ring_base); + iowrite32(rq->urq_num_entries, &vrq->ctrl->ring_size); + iowrite32(0, &vrq->ctrl->fetch_index); + iowrite32(0, &vrq->ctrl->posted_index); + iowrite32(rq->urq_cq->ucq_index, &vrq->ctrl->cq_index); + iowrite32(0, &vrq->ctrl->error_interrupt_enable); + iowrite32(0, &vrq->ctrl->error_interrupt_offset); + iowrite32(0, &vrq->ctrl->dropped_packet_count); + iowrite32(0, &vrq->ctrl->error_status); + + rq->urq_state |= USD_QS_VNIC_INITIALIZED; + rq->urq_next_desc = rq->urq_desc_ring; + rq->urq_recv_credits = rq->urq_num_entries - 1; + + return 0; +} + /* * Allocate the resources for a previously created RQ */ static int -usd_create_vnic_rq( - struct usd_rq *rq, - struct usd_vf *vf) +usd_create_rq(struct usd_qp_impl *qp) { + struct usd_rq *rq; + uint32_t ring_size; int ret; - /* Allocate resources for RQ */ - ret = vnic_rq_alloc(vf->vf_vdev, &rq->urq_vnic_rq, - rq->urq_index, rq->urq_num_entries, sizeof(struct rq_enet_desc)); - if (ret != 0) { - return ret; - } + rq = &qp->uq_rq; - vnic_rq_init(&rq->urq_vnic_rq, rq->urq_cq->ucq_index, 0, 0); - rq->urq_state |= USD_QS_VNIC_INITIALIZED; + /* Allocate resources for RQ */ + ring_size = sizeof(struct rq_enet_desc) * rq->urq_num_entries; + ret = usd_alloc_mr(qp->uq_dev, ring_size, (void **)&rq->urq_desc_ring); + if (ret != 0) + return ret; + + ret = usd_vnic_rq_init(rq, qp->uq_vf, (uint64_t)rq->urq_desc_ring); + if (ret != 0) + goto out; + + rq->urq_post_index_mask = (rq->urq_num_entries-1); + rq->urq_post_index = 0; + rq->urq_last_comp = (rq->urq_num_entries-1); return 0; +out: + if (rq->urq_desc_ring != NULL) { + usd_free_mr(rq->urq_desc_ring); + 
rq->urq_desc_ring = NULL; + } + return ret; } static int @@ -952,12 +1001,6 @@ usd_create_qp_normal( } num_wq_entries = wq->uwq_num_entries; - - copybuf_size = USD_SEND_MAX_COPY * num_wq_entries; - ret = usd_alloc_mr(dev, copybuf_size, (void **)&wq->uwq_copybuf); - if (ret != 0) - goto fail; - num_rq_entries = rq->urq_num_entries; rq->urq_context = calloc(sizeof(void *), num_rq_entries); @@ -967,10 +1010,6 @@ usd_create_qp_normal( goto fail; } - rq->urq_post_index_mask = (rq->urq_num_entries-1); - rq->urq_post_index = 0; - rq->urq_last_comp = (rq->urq_num_entries-1); - /* * Issue verbs command to create the QP. This does not actually * instanstiate the filter in the VIC yet, need to bring the @@ -986,6 +1025,16 @@ usd_create_qp_normal( rq->urq_state |= USD_QS_VERBS_CREATED; wq->uwq_state |= USD_QS_VERBS_CREATED; + /* + * Create/regmr for wq copybuf after verbs QP is created + * because QP number information may be needed to register + * mr under shared PD + */ + copybuf_size = USD_SEND_MAX_COPY * num_wq_entries; + ret = usd_alloc_mr(dev, copybuf_size, (void **)&wq->uwq_copybuf); + if (ret != 0) + goto fail; + ret = usd_map_vf(dev, &vf_info, &vf); if (ret != 0) { goto fail; @@ -1013,7 +1062,7 @@ usd_create_qp_normal( if (ret != 0) { goto fail; } - ret = usd_create_vnic_rq(&qp->uq_rq, qp->uq_vf); + ret = usd_create_rq(qp); if (ret != 0) { goto fail; } diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/vnic_dev.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/vnic_dev.c index 64b55d1a8b..e02cd22bdb 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/vnic_dev.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/vnic_dev.c @@ -110,6 +110,7 @@ struct vnic_dev { struct vnic_intr_coal_timer_info intr_coal_timer_info; struct devcmd2_controller *devcmd2; int (*devcmd_rtn)(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd, int wait); + struct vnic_gen_stats gen_stats; }; #define 
VNIC_MAX_RES_HDR_SIZE \ diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/vnic_rq.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/vnic_rq.c index 116330cd61..45a9c7ad09 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/vnic_rq.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/vnic_rq.c @@ -46,7 +46,7 @@ #include #include #include -#if __KERNEL__ +#ifdef __KERNEL__ #include #include #endif diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/vnic_stats.h b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/vnic_stats.h index 10e8a0dcbd..010998d3c2 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/vnic_stats.h +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/vnic_stats.h @@ -86,6 +86,11 @@ struct vnic_rx_stats { u64 rsvd[16]; }; +/* Generic statistics */ +struct vnic_gen_stats { + u64 dma_map_error; +}; + struct vnic_stats { struct vnic_tx_stats tx; struct vnic_rx_stats rx; diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/vnic_wq.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/vnic_wq.c index f8f14408c5..736ff25b97 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/vnic_wq.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/vnic_wq.c @@ -102,11 +102,14 @@ static int vnic_wq_alloc_bufs(struct vnic_wq *wq) wq->ring.desc_size * buf->index; if (buf->index + 1 == count) { buf->next = wq->bufs[0]; + buf->next->prev = buf; break; } else if (j + 1 == VNIC_WQ_BUF_BLK_ENTRIES(count)) { buf->next = wq->bufs[i + 1]; + buf->next->prev = buf; } else { buf->next = buf + 1; + buf->next->prev = buf; buf++; } } diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/vnic_wq.h b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/vnic_wq.h index b1902d82ec..4de8de87b9 100644 --- 
a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/vnic_wq.h +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/vnic_wq.h @@ -88,6 +88,7 @@ struct vnic_wq_buf { uint8_t cq_entry; /* Gets completion event from hw */ uint8_t desc_skip_cnt; /* Num descs to occupy */ uint8_t compressed_send; /* Both hdr and payload in one desc */ + struct vnic_wq_buf *prev; }; /* Break the vnic_wq_buf allocations into blocks of 32/64 entries */ diff --git a/opal/mca/common/libfabric/libfabric/prov/verbs/src/fi_verbs.c b/opal/mca/common/libfabric/libfabric/prov/verbs/src/fi_verbs.c index 195db04dc9..13b09271bb 100644 --- a/opal/mca/common/libfabric/libfabric/prov/verbs/src/fi_verbs.c +++ b/opal/mca/common/libfabric/libfabric/prov/verbs/src/fi_verbs.c @@ -689,6 +689,9 @@ static int fi_ibv_msg_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) switch (bfid->fclass) { case FI_CLASS_CQ: + if (!(flags & (FI_RECV|FI_SEND))) { + return -EINVAL; + } if (flags & FI_RECV) { if (ep->rcq) return -EINVAL; @@ -908,8 +911,6 @@ static struct fi_ops_msg fi_ibv_msg_ep_msg_ops = { .inject = fi_no_msg_inject, .senddata = fi_ibv_msg_ep_senddata, .injectdata = fi_no_msg_injectdata, - .rx_size_left = fi_no_msg_rx_size_left, - .tx_size_left = fi_no_msg_tx_size_left, }; static ssize_t @@ -1678,7 +1679,7 @@ static int fi_ibv_msg_ep_shutdown(struct fid_ep *ep, uint64_t flags) static struct fi_ops_cm fi_ibv_msg_ep_cm_ops = { .size = sizeof(struct fi_ops_cm), - .getname = NULL, /* TODO */ + .getname = fi_no_getname, .getpeer = fi_no_getpeer, .connect = fi_ibv_msg_ep_connect, .listen = fi_no_listen, @@ -1720,6 +1721,8 @@ static int fi_ibv_msg_ep_enable(struct fid_ep *ep) _ep = container_of(ep, struct fi_ibv_msg_ep, ep_fid); if (!_ep->eq) return -FI_ENOEQ; + if (!_ep->scq || !_ep->rcq) + return -FI_ENOCQ; return fi_ibv_msg_ep_create_qp(_ep); } @@ -1730,6 +1733,8 @@ static struct fi_ops_ep fi_ibv_msg_ep_base_ops = { .cancel = fi_no_cancel, .getopt = fi_ibv_msg_ep_getopt, 
.setopt = fi_ibv_msg_ep_setopt, + .rx_size_left = fi_no_rx_size_left, + .tx_size_left = fi_no_tx_size_left, }; static int fi_ibv_msg_ep_close(fid_t fid) @@ -1880,20 +1885,20 @@ fi_ibv_eq_cm_process_event(struct fi_ibv_eq *eq, struct rdma_cm_event *cma_event case RDMA_CM_EVENT_UNREACHABLE: eq->err.fid = fid; eq->err.err = cma_event->status; - return -EIO; + return -FI_EAVAIL; case RDMA_CM_EVENT_REJECTED: eq->err.fid = fid; eq->err.err = ECONNREFUSED; eq->err.prov_errno = cma_event->status; - return -EIO; + return -FI_EAVAIL; case RDMA_CM_EVENT_DEVICE_REMOVAL: eq->err.fid = fid; eq->err.err = ENODEV; - return -EIO; + return -FI_EAVAIL; case RDMA_CM_EVENT_ADDR_CHANGE: eq->err.fid = fid; eq->err.err = EADDRNOTAVAIL; - return -EIO; + return -FI_EAVAIL; default: return 0; } @@ -1917,7 +1922,7 @@ fi_ibv_eq_read(struct fid_eq *eq, uint32_t *event, _eq = container_of(eq, struct fi_ibv_eq, eq_fid.fid); entry = (struct fi_eq_cm_entry *) buf; if (_eq->err.err) - return -FI_EIO; + return -FI_EAVAIL; ret = rdma_get_cm_event(_eq->channel, &cma_event); if (ret) @@ -2025,6 +2030,7 @@ fi_ibv_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, _eq->fab = container_of(fabric, struct fi_ibv_fabric, fabric_fid); switch (attr->wait_obj) { + case FI_WAIT_UNSPEC: case FI_WAIT_FD: _eq->channel = rdma_create_event_channel(); if (!_eq->channel) { @@ -2338,6 +2344,7 @@ fi_ibv_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, _cq->domain = container_of(domain, struct fi_ibv_domain, domain_fid); switch (attr->wait_obj) { + case FI_WAIT_UNSPEC: case FI_WAIT_FD: _cq->channel = ibv_create_comp_channel(_cq->domain->verbs); if (!_cq->channel) { @@ -2572,7 +2579,7 @@ static int fi_ibv_pep_listen(struct fid_pep *pep) static struct fi_ops_cm fi_ibv_pep_cm_ops = { .size = sizeof(struct fi_ops_cm), - .getname = NULL, /* TODO */ + .getname = fi_no_getname, .getpeer = fi_no_getpeer, .connect = fi_no_connect, .listen = fi_ibv_pep_listen, diff --git 
a/opal/mca/common/libfabric/libfabric/src/common.c b/opal/mca/common/libfabric/libfabric/src/common.c index 02d2d31131..1196bc6d92 100644 --- a/opal/mca/common/libfabric/libfabric/src/common.c +++ b/opal/mca/common/libfabric/libfabric/src/common.c @@ -155,3 +155,63 @@ size_t fi_datatype_size(enum fi_datatype datatype) } return fi_datatype_size_table[datatype]; } + +int fi_send_allowed(uint64_t caps) +{ + if (caps & FI_MSG || + caps & FI_TAGGED) { + if (caps & FI_SEND) + return 1; + if (caps & FI_RECV) + return 0; + return 1; + } + + return 0; +} + +int fi_recv_allowed(uint64_t caps) +{ + if (caps & FI_MSG || + caps & FI_TAGGED) { + if (caps & FI_RECV) + return 1; + if (caps & FI_SEND) + return 0; + return 1; + } + + return 0; +} + +int fi_rma_initiate_allowed(uint64_t caps) +{ + if (caps & FI_RMA || + caps & FI_ATOMICS) { + if (caps & FI_WRITE || + caps & FI_READ) + return 1; + if (caps & FI_REMOTE_WRITE || + caps & FI_REMOTE_READ) + return 0; + return 1; + } + + return 0; +} + +int fi_rma_target_allowed(uint64_t caps) +{ + if (caps & FI_RMA || + caps & FI_ATOMICS) { + if (caps & FI_REMOTE_WRITE || + caps & FI_REMOTE_READ) + return 1; + if (caps & FI_WRITE || + caps & FI_READ) + return 0; + return 1; + } + + return 0; +} diff --git a/opal/mca/common/libfabric/libfabric/src/enosys.c b/opal/mca/common/libfabric/libfabric/src/enosys.c index 177dfcb67c..5822d8a028 100644 --- a/opal/mca/common/libfabric/libfabric/src/enosys.c +++ b/opal/mca/common/libfabric/libfabric/src/enosys.c @@ -288,18 +288,26 @@ int fi_no_setopt(fid_t fid, int level, int optname, { return -FI_ENOSYS; } -int fi_no_tx_ctx(struct fid_sep *sep, int index, +int fi_no_tx_ctx(struct fid_ep *sep, int index, struct fi_tx_attr *attr, struct fid_ep **tx_ep, void *context) { return -FI_ENOSYS; } -int fi_no_rx_ctx(struct fid_sep *sep, int index, +int fi_no_rx_ctx(struct fid_ep *sep, int index, struct fi_rx_attr *attr, struct fid_ep **rx_ep, void *context) { return -FI_ENOSYS; } +ssize_t 
fi_no_rx_size_left(struct fid_ep *ep) +{ + return -FI_ENOSYS; +} +ssize_t fi_no_tx_size_left(struct fid_ep *ep) +{ + return -FI_ENOSYS; +} /* * struct fi_ops_msg @@ -349,14 +357,6 @@ ssize_t fi_no_msg_injectdata(struct fid_ep *ep, const void *buf, size_t len, { return -FI_ENOSYS; } -ssize_t fi_no_msg_rx_size_left(struct fid_ep *ep) -{ - return -FI_ENOSYS; -} -ssize_t fi_no_msg_tx_size_left(struct fid_ep *ep) -{ - return -FI_ENOSYS; -} /* * struct fi_ops_wait diff --git a/opal/mca/common/libfabric/libfabric/src/fabric.c b/opal/mca/common/libfabric/libfabric/src/fabric.c index f6ebe13497..35649cdebd 100644 --- a/opal/mca/common/libfabric/libfabric/src/fabric.c +++ b/opal/mca/common/libfabric/libfabric/src/fabric.c @@ -44,17 +44,16 @@ #include #include "fi.h" #include "prov.h" +#include "fi_log.h" #ifdef HAVE_LIBDL #include #endif -#define FI_WARN(fmt, ...) \ - do { fprintf(stderr, "%s: " fmt, PACKAGE, ##__VA_ARGS__); } while (0) - struct fi_prov { struct fi_prov *next; struct fi_provider *provider; + void *dlhandle; }; static struct fi_prov *fi_getprov(const char *prov_name); @@ -64,31 +63,62 @@ static volatile int init = 0; static pthread_mutex_t ini_lock = PTHREAD_MUTEX_INITIALIZER; -static int fi_register_provider(struct fi_provider *provider) +static void cleanup_provider(struct fi_provider *provider, void *dlhandle) +{ + if (provider && provider->cleanup) + provider->cleanup(); + +#ifdef HAVE_LIBDL + if (dlhandle) + dlclose(dlhandle); +#endif +} + +static int fi_register_provider(struct fi_provider *provider, void *dlhandle) { struct fi_prov *prov; int ret; - if (!provider) - return -FI_EINVAL; + if (!provider) { + ret = -FI_EINVAL; + goto cleanup; + } + + FI_LOG(2, NULL, "registering provider: %s (%d.%d)\n", provider->name, + FI_MAJOR(provider->version), FI_MINOR(provider->version)); if (FI_MAJOR(provider->fi_version) != FI_MAJOR_VERSION || FI_MINOR(provider->fi_version) > FI_MINOR_VERSION) { + FI_LOG(2, NULL, "provider has unsupported FI version (provider 
%d.%d != libfabric %d.%d); ignoring\n", + FI_MAJOR(provider->fi_version), + FI_MINOR(provider->fi_version), + FI_MAJOR_VERSION, FI_MINOR_VERSION); + ret = -FI_ENOSYS; goto cleanup; } prov = fi_getprov(provider->name); if (prov) { - /* If we have two versions of the same provider, - * keep the most recent + /* If this provider is older than an already-loaded + * provider of the same name, then discard this one. */ if (FI_VERSION_GE(prov->provider->version, provider->version)) { + FI_LOG(2, NULL, "a newer %s provider was already loaded; ignoring this one\n", + provider->name); ret = -FI_EALREADY; goto cleanup; } - prov->provider->cleanup(); + /* This provider is newer than an already-loaded + * provider of the same name, so discard the + * already-loaded one. + */ + FI_LOG(2, NULL, "an older %s provider was already loaded; keeping this one and ignoring the older one\n", + provider->name); + cleanup_provider(prov->provider, prov->dlhandle); + + prov->dlhandle = dlhandle; prov->provider = provider; return 0; } @@ -99,6 +129,7 @@ static int fi_register_provider(struct fi_provider *provider) goto cleanup; } + prov->dlhandle = dlhandle; prov->provider = provider; if (prov_tail) prov_tail->next = prov; @@ -108,7 +139,8 @@ static int fi_register_provider(struct fi_provider *provider) return 0; cleanup: - provider->cleanup(); + cleanup_provider(provider, dlhandle); + return ret; } @@ -136,6 +168,8 @@ static void fi_ini(void) if (init) goto unlock; + fi_log_init(); + #ifdef HAVE_LIBDL struct dirent **liblist; int n; @@ -156,34 +190,35 @@ static void fi_ini(void) while (n--) { if (asprintf(&lib, "%s/%s", provdir, liblist[n]->d_name) < 0) { - FI_WARN("asprintf failed to allocate memory\n"); + FI_WARN(NULL, "asprintf failed to allocate memory\n"); free(liblist[n]); goto done; } + FI_DEBUG(NULL, "opening provider lib %s\n", lib); dlhandle = dlopen(lib, RTLD_NOW); if (dlhandle == NULL) - FI_WARN("dlopen(%s): %s\n", lib, dlerror()); + FI_WARN(NULL, "dlopen(%s): %s\n", lib, 
dlerror()); free(liblist[n]); free(lib); inif = dlsym(dlhandle, "fi_prov_ini"); if (inif == NULL) - FI_WARN("dlsym: %s\n", dlerror()); + FI_WARN(NULL, "dlsym: %s\n", dlerror()); else - fi_register_provider((inif)()); + fi_register_provider((inif)(), dlhandle); } free(liblist); done: #endif - fi_register_provider(PSM_INIT); - fi_register_provider(USNIC_INIT); + fi_register_provider(PSM_INIT, NULL); + fi_register_provider(USNIC_INIT, NULL); - fi_register_provider(VERBS_INIT); - fi_register_provider(SOCKETS_INIT); + fi_register_provider(VERBS_INIT, NULL); + fi_register_provider(SOCKETS_INIT, NULL); init = 1; unlock: @@ -193,7 +228,7 @@ unlock: static void __attribute__((destructor)) fi_fini(void) { for (struct fi_prov *prov = prov_head; prov; prov = prov->next) - prov->provider->cleanup(); + cleanup_provider(prov->provider, prov->dlhandle); } static struct fi_prov *fi_getprov(const char *prov_name) @@ -258,6 +293,8 @@ int fi_getinfo_(uint32_t version, const char *node, const char *service, ret = prov->provider->getinfo(version, node, service, flags, hints, &cur); if (ret) { + FI_LOG(1, NULL, "fi_getinfo: provider %s returned -%d (%s)\n", + prov->provider->name, -ret, fi_strerror(-ret)); if (ret == -FI_ENODATA) { continue; } else { diff --git a/opal/mca/common/libfabric/libfabric/src/fi_tostr.c b/opal/mca/common/libfabric/libfabric/src/fi_tostr.c index 4a001a3f95..b9994bb050 100644 --- a/opal/mca/common/libfabric/libfabric/src/fi_tostr.c +++ b/opal/mca/common/libfabric/libfabric/src/fi_tostr.c @@ -487,6 +487,11 @@ static void fi_tostr_atomic_op(char *buf, enum fi_op op) } } +static void fi_tostr_version(char *buf) +{ + strcatf(buf, VERSION); +} + __attribute__((visibility ("default"))) char *fi_tostr_(const void *data, enum fi_type datatype) { @@ -560,6 +565,9 @@ char *fi_tostr_(const void *data, enum fi_type datatype) case FI_TYPE_ATOMIC_OP: fi_tostr_atomic_op(buf, enumval); break; + case FI_TYPE_VERSION: + fi_tostr_version(buf); + break; default: strcatf(buf, 
"Unknown type"); break; diff --git a/opal/mca/common/libfabric/libfabric/src/log.c b/opal/mca/common/libfabric/libfabric/src/log.c new file mode 100644 index 0000000000..a55b84699e --- /dev/null +++ b/opal/mca/common/libfabric/libfabric/src/log.c @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2015, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "fi.h" +#include "fi_log.h" + +/* General implementation note: these functions currently use multiple fprintfs + * in a row, which can render in an ugly fashion for multithreaded code and for + * some mpirun implementations. 
If this bugs anyone enough then we can convert + * them to snprintf to build up the printout in a single buffer. + */ + +int fi_log_level = INT_MIN; + +void fi_log_init(void) +{ + int ret; + + if (getenv("FI_LOG_LEVEL") != NULL) { + errno = 0; + ret = strtol(getenv("FI_LOG_LEVEL"), NULL, 10); + if (errno != 0) + fprintf(stderr, + "%s: invalid value specified for FI_LOG_LEVEL (%s)\n", + PACKAGE, strerror(errno)); + else + fi_log_level = (int)ret; + } +} + +void fi_warn_impl(const char *prov, const char *fmt, ...) +{ + va_list vargs; + + if (prov != NULL) + fprintf(stderr, "%s:%s: ", PACKAGE, prov); + else + fprintf(stderr, "%s: ", PACKAGE); + va_start(vargs, fmt); + vfprintf(stderr, fmt, vargs); + va_end(vargs); +} + +void fi_log_impl(int level, const char *prov, const char *fmt, ...) +{ + va_list vargs; + + if (prov != NULL) + fprintf(stderr, "%s:%s:<%d> ", PACKAGE, prov, level); + else + fprintf(stderr, "%s:<%d> ", PACKAGE, level); + va_start(vargs, fmt); + vfprintf(stderr, fmt, vargs); + va_end(vargs); +} + +void fi_debug_impl(const char *prov, const char *fmt, ...) +{ + va_list vargs; + + if (prov != NULL) + fprintf(stderr, "%s:%s: ", PACKAGE, prov); + else + fprintf(stderr, "%s: ", PACKAGE); + va_start(vargs, fmt); + vfprintf(stderr, fmt, vargs); + va_end(vargs); +}