diff --git a/opal/mca/common/libfabric/Makefile.am b/opal/mca/common/libfabric/Makefile.am index 57bc821c14..217226bb69 100644 --- a/opal/mca/common/libfabric/Makefile.am +++ b/opal/mca/common/libfabric/Makefile.am @@ -74,10 +74,12 @@ libfabric_usnic_headers = \ libfabric/prov/usnic/src/usdf.h \ libfabric/prov/usnic/src/usdf_av.h \ libfabric/prov/usnic/src/usdf_cm.h \ + libfabric/prov/usnic/src/usdf_cq.h \ libfabric/prov/usnic/src/usdf_dgram.h \ libfabric/prov/usnic/src/usdf_endpoint.h \ libfabric/prov/usnic/src/usdf_msg.h \ libfabric/prov/usnic/src/usdf_progress.h \ + libfabric/prov/usnic/src/usdf_rdm.h \ libfabric/prov/usnic/src/usdf_timer.h \ libfabric/prov/usnic/src/usnic_direct/cq_desc.h \ libfabric/prov/usnic/src/usnic_direct/cq_enet_desc.h \ @@ -126,6 +128,7 @@ libfabric_usnic_sources = \ libfabric/prov/usnic/src/usdf_endpoint.c \ libfabric/prov/usnic/src/usdf_ep_dgram.c \ libfabric/prov/usnic/src/usdf_ep_msg.c \ + libfabric/prov/usnic/src/usdf_ep_rdm.c \ libfabric/prov/usnic/src/usdf_eq.c \ libfabric/prov/usnic/src/usdf_fabric.c \ libfabric/prov/usnic/src/usdf_mem.c \ @@ -133,6 +136,7 @@ libfabric_usnic_sources = \ libfabric/prov/usnic/src/usdf_pep.c \ libfabric/prov/usnic/src/usdf_progress.c \ libfabric/prov/usnic/src/usdf_timer.c \ + libfabric/prov/usnic/src/usdf_rdm.c \ libfabric/prov/usnic/src/usnic_direct/libnl_utils_common.c \ libfabric/prov/usnic/src/usnic_direct/usd_caps.c \ libfabric/prov/usnic/src/usnic_direct/usd_dest.c \ diff --git a/opal/mca/common/libfabric/libfabric/AUTHORS b/opal/mca/common/libfabric/libfabric/AUTHORS index a7074465cb..9954e9f6d3 100644 --- a/opal/mca/common/libfabric/libfabric/AUTHORS +++ b/opal/mca/common/libfabric/libfabric/AUTHORS @@ -3,3 +3,4 @@ Reese Faucette Jeff Squyres Jianxin Xiong Sayantan Sur +Xuyang Wang diff --git a/opal/mca/common/libfabric/libfabric/Makefile.am b/opal/mca/common/libfabric/libfabric/Makefile.am index d5881130a0..6af762e902 100644 --- a/opal/mca/common/libfabric/libfabric/Makefile.am +++ b/opal/mca/common/libfabric/libfabric/Makefile.am @@ -24,7 +24,7 @@ common_srcs = \ src/enosys.c # ensure dl-built providers link back to libfabric -linkback = -lfabric -Lsrc/.libs/ +linkback = $(top_builddir)/src/libfabric.la src_libfabric_la_SOURCES = \ include/fi.h \ @@ -32,6 +32,7 @@ src_libfabric_la_SOURCES = \ include/fi_indexer.h \ include/fi_list.h \ include/fi_rbuf.h \ + include/prov.h \ src/fabric.c \ src/fi_tostr.c \ $(common_srcs) @@ -40,26 +41,35 @@ if HAVE_SOCKETS _sockets_files = \ prov/sockets/src/sock.h \ prov/sockets/src/sock_av.c \ - prov/sockets/src/sock_dgram.c \ prov/sockets/src/sock_dom.c \ prov/sockets/src/sock_eq.c \ prov/sockets/src/sock_cq.c \ prov/sockets/src/sock_cntr.c \ prov/sockets/src/sock_poll.c \ - prov/sockets/src/sock_rdm.c \ + prov/sockets/src/sock_wait.c \ + prov/sockets/src/sock_ep_rdm.c \ + prov/sockets/src/sock_ep_dgram.c \ + prov/sockets/src/sock_ep_msg.c \ prov/sockets/src/sock_fabric.c \ prov/sockets/src/sock_ep.c \ prov/sockets/src/sock_ctx.c \ + prov/sockets/src/sock_rx_entry.c \ + prov/sockets/src/sock_progress.c \ + prov/sockets/src/sock_comm.c \ + prov/sockets/src/sock_conn.c \ + prov/sockets/src/sock_msg.c \ + prov/sockets/src/sock_rma.c \ + prov/sockets/src/sock_atomic.c \ prov/sockets/src/sock_util.c \ prov/sockets/src/sock_util.h \ - prov/sockets/src/indexer.c \ - prov/sockets/src/list.c \ - prov/sockets/src/list.h + prov/sockets/src/indexer.c if HAVE_SOCKETS_DL pkglib_LTLIBRARIES += libsockets-fi.la libsockets_fi_la_SOURCES = $(_sockets_files) $(common_srcs) 
-libsockets_fi_la_LDFLAGS = -module -avoid-version -shared -export-dynamic $(linkback) +libsockets_fi_la_LIBADD = $(linkback) +libsockets_fi_la_LDFLAGS = -module -avoid-version -shared -export-dynamic +libsockets_fi_la_DEPENDENCIES = $(linkback) else !HAVE_SOCKETS_DL src_libfabric_la_SOURCES += $(_sockets_files) endif !HAVE_SOCKETS_DL @@ -72,7 +82,9 @@ _verbs_files = prov/verbs/src/fi_verbs.c if HAVE_VERBS_DL pkglib_LTLIBRARIES += libverbs-fi.la libverbs_fi_la_SOURCES = $(_verbs_files) $(common_srcs) -libverbs_fi_la_LDFLAGS = -module -avoid-version -shared -export-dynamic -libverbs -lrdmacm $(linkback) +libverbs_fi_la_LIBADD = -libverbs -lrdmacm $(linkback) +libverbs_fi_la_LDFLAGS = -module -avoid-version -shared -export-dynamic +libverbs_fi_la_DEPENDENCIES = $(linkback) else !HAVE_VERBS_DL src_libfabric_la_SOURCES += $(_verbs_files) endif !HAVE_VERBS_DL @@ -149,6 +161,7 @@ _usnic_files = \ prov/usnic/src/usdf_cm.c \ prov/usnic/src/usdf_cm.h \ prov/usnic/src/usdf_cq.c \ + prov/usnic/src/usdf_cq.h \ prov/usnic/src/usdf_dgram.c \ prov/usnic/src/usdf_dgram.h \ prov/usnic/src/usdf_domain.c \ @@ -156,6 +169,7 @@ _usnic_files = \ prov/usnic/src/usdf_endpoint.h \ prov/usnic/src/usdf_ep_dgram.c \ prov/usnic/src/usdf_ep_msg.c \ + prov/usnic/src/usdf_ep_rdm.c \ prov/usnic/src/usdf_eq.c \ prov/usnic/src/usdf_fabric.c \ prov/usnic/src/usdf_mem.c \ @@ -164,6 +178,9 @@ _usnic_files = \ prov/usnic/src/usdf_pep.c \ prov/usnic/src/usdf_progress.c \ prov/usnic/src/usdf_progress.h \ + prov/usnic/src/usdf_rdm.c \ + prov/usnic/src/usdf_rdm.h \ + prov/usnic/src/usdf_rudp.h \ prov/usnic/src/usdf_timer.c \ prov/usnic/src/usdf_timer.h @@ -175,8 +192,9 @@ if HAVE_USNIC_DL pkglib_LTLIBRARIES += libusnic-fi.la libusnic_fi_la_CPPFLAGS = $(AM_CPPFLAGS) $(_usnic_cppflags) libusnic_fi_la_SOURCES = $(_usnic_files) $(common_srcs) +libusnic_fi_la_LIBADD = $(linkback) libusnic_fi_la_LDFLAGS = -module -avoid-version -shared -export-dynamic -libusnic_fi_la_LIBS = $(linkback) +libusnic_fi_la_DEPENDENCIES = $(linkback) else !HAVE_USNIC_DL AM_CPPFLAGS += $(_usnic_cppflags) src_libfabric_la_SOURCES += $(_usnic_files) @@ -188,6 +206,7 @@ if HAVE_PSM _psm_files = \ prov/psm/src/psm_am.h \ prov/psm/src/psmx.h \ + prov/psm/src/psm_am.h \ prov/psm/src/psmx_init.c \ prov/psm/src/psmx_domain.c \ prov/psm/src/psmx_cq.c \ @@ -209,7 +228,9 @@ _psm_files = \ if HAVE_PSM_DL pkglib_LTLIBRARIES += libpsmx-fi.la libpsmx_fi_la_SOURCES = $(_psm_files) $(common_srcs) -libpsmx_fi_la_LDFLAGS = -module -avoid-version -shared -export-dynamic $(linkback) +libpsmx_fi_la_LIBADD = $(linkback) +libpsmx_fi_la_LDFLAGS = -module -avoid-version -shared -export-dynamic +libpsmx_fi_la_DEPENDENCIES = $(linkback) else !HAVE_PSM_DL src_libfabric_la_SOURCES += $(_psm_files) endif !HAVE_PSM_DL diff --git a/opal/mca/common/libfabric/libfabric/README b/opal/mca/common/libfabric/libfabric/README index 68716b91ed..9781d2ecd6 100644 --- a/opal/mca/common/libfabric/libfabric/README +++ b/opal/mca/common/libfabric/libfabric/README @@ -1,7 +1,7 @@ This README is for userspace RDMA fabric library. Version Libfabric v0.0.2 -Released on 2014-12-09 +Released on 2014-12-19 Building ======== diff --git a/opal/mca/common/libfabric/libfabric/config.h.in b/opal/mca/common/libfabric/libfabric/config.h.in index c5d23451bb..9ab56faf5e 100644 --- a/opal/mca/common/libfabric/libfabric/config.h.in +++ b/opal/mca/common/libfabric/libfabric/config.h.in @@ -33,6 +33,18 @@ /* Define to 1 if you have the header file. 
*/ #undef HAVE_MEMORY_H +/* psm provider is built */ +#undef HAVE_PSM + +/* psm provider is built as DSO */ +#undef HAVE_PSM_DL + +/* sockets provider is built */ +#undef HAVE_SOCKETS + +/* sockets provider is built as DSO */ +#undef HAVE_SOCKETS_DL + /* Define to 1 if you have the header file. */ #undef HAVE_STDINT_H @@ -57,6 +69,18 @@ /* Define to 1 if you have the header file. */ #undef HAVE_UNISTD_H +/* usnic provider is built */ +#undef HAVE_USNIC + +/* usnic provider is built as DSO */ +#undef HAVE_USNIC_DL + +/* verbs provider is built */ +#undef HAVE_VERBS + +/* verbs provider is built as DSO */ +#undef HAVE_VERBS_DL + /* Define to 1 to enable valgrind annotations */ #undef INCLUDE_VALGRIND diff --git a/opal/mca/common/libfabric/libfabric/config/fi_provider.m4 b/opal/mca/common/libfabric/libfabric/config/fi_provider.m4 index fe0a16bec3..ca91acd8b9 100644 --- a/opal/mca/common/libfabric/libfabric/config/fi_provider.m4 +++ b/opal/mca/common/libfabric/libfabric/config/fi_provider.m4 @@ -88,7 +88,11 @@ AC_DEFUN([FI_PROVIDER_SETUP],[ ], [AC_MSG_NOTICE([$1 provider disabled])]) - # Set conditionals for HAVE_ and HAVE__DL + AC_DEFINE_UNQUOTED([HAVE_]m4_translit([$1], [a-z], [A-Z]), $$1_happy, [$1 provider is built]) + AC_DEFINE_UNQUOTED([HAVE_]m4_translit([$1], [a-z], [A-Z])[_DL], $$1_dl, [$1 provider is built as DSO]) + + # Set AM conditionals for HAVE_ and HAVE__DL + # as well as AC defines AM_CONDITIONAL([HAVE_]m4_translit([$1], [a-z], [A-Z]), [test $$1_happy -eq 1]) AM_CONDITIONAL([HAVE_]m4_translit([$1], [a-z], [A-Z])[_DL], diff --git a/opal/mca/common/libfabric/libfabric/include/fi.h b/opal/mca/common/libfabric/libfabric/include/fi.h index 2b358e986c..ae8d0f6566 100644 --- a/opal/mca/common/libfabric/libfabric/include/fi.h +++ b/opal/mca/common/libfabric/libfabric/include/fi.h @@ -73,6 +73,8 @@ static inline uint64_t htonll(uint64_t x) { return x; } static inline uint64_t ntohll(uint64_t x) { return x; } #endif +#define sizeof_field(type, field) sizeof(((type *)0)->field) + #define MIN(a, b) ((a) < (b) ? a : b) #define MAX(a, b) ((a) > (b) ? 
a : b) @@ -176,10 +178,7 @@ static inline int atomic_get(atomic_t *atomic) #endif // HAVE_ATOMICS - /* non exported symbols */ -int fi_init(void); - int fi_read_file(const char *dir, const char *file, char *buf, size_t size); int fi_poll_fd(int fd, int timeout); int fi_wait_cond(pthread_cond_t *cond, pthread_mutex_t *mut, int timeout); @@ -191,8 +190,6 @@ size_t fi_datatype_size(enum fi_datatype datatype); uint64_t fi_tag_bits(uint64_t mem_tag_format); uint64_t fi_tag_format(uint64_t tag_bits); -int fi_version_register(uint32_t version, struct fi_provider *provider); - #define RDMA_CONF_DIR SYSCONFDIR "/" RDMADIR #define FI_CONF_DIR RDMA_CONF_DIR "/fabric" diff --git a/opal/mca/common/libfabric/libfabric/include/fi_rbuf.h b/opal/mca/common/libfabric/libfabric/include/fi_rbuf.h index 7e64a916c9..c447bf2768 100644 --- a/opal/mca/common/libfabric/libfabric/include/fi_rbuf.h +++ b/opal/mca/common/libfabric/libfabric/include/fi_rbuf.h @@ -104,7 +104,7 @@ static inline void rbwrite(struct ringbuf *rb, const void *buf, size_t len) memcpy((char*)rb->buf + (rb->wpos & rb->size_mask), buf, len); } else { memcpy((char*)rb->buf + (rb->wpos & rb->size_mask), buf, endlen); - memcpy(rb->buf, buf, len - endlen); + memcpy(rb->buf, (char*)buf + endlen, len - endlen); } rb->wpos += len; } @@ -128,7 +128,7 @@ static inline void rbpeek(struct ringbuf *rb, void *buf, size_t len) memcpy(buf, (char*)rb->buf + (rb->rcnt & rb->size_mask), len); } else { memcpy(buf, (char*)rb->buf + (rb->rcnt & rb->size_mask), endlen); - memcpy(buf, rb->buf, len - endlen); + memcpy((char*)buf + endlen, rb->buf, len - endlen); } } diff --git a/opal/mca/common/libfabric/libfabric/include/prov.h b/opal/mca/common/libfabric/libfabric/include/prov.h new file mode 100644 index 0000000000..b3eedba76b --- /dev/null +++ b/opal/mca/common/libfabric/libfabric/include/prov.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2013-2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _PROV_H_ +#define _PROV_H_ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + +#include + +/* Provider initialization function signature that built-in providers + * must specify. 
*/ +#define INI_SIG(name) struct fi_provider* name(void) + +/* for each provider defines for three scenarios: + * dl: externally visible ctor with known name (see fi_prov.h) + * built-in: ctor function def, don't export symbols + * not built: no-op call for ctor +*/ + +#if (HAVE_VERBS) && (HAVE_VERBS_DL) +# define VERBS_INI FI_EXT_INI +# define VERBS_INIT NULL +#elif (HAVE_VERBS) +# define VERBS_INI INI_SIG(fi_verbs_ini) +# define VERBS_INIT fi_verbs_ini() +VERBS_INI ; +#else +# define VERBS_INIT NULL +#endif + +#if (HAVE_PSM) && (HAVE_PSM_DL) +# define PSM_INI FI_EXT_INI +# define PSM_INIT NULL +#elif (HAVE_PSM) +# define PSM_INI INI_SIG(fi_psm_ini) +# define PSM_INIT fi_psm_ini() +PSM_INI ; +#else +# define PSM_INIT NULL +#endif + +#if (HAVE_SOCKETS) && (HAVE_SOCKETS_DL) +# define SOCKETS_INI FI_EXT_INI +# define SOCKETS_INIT NULL +#elif (HAVE_SOCKETS) +# define SOCKETS_INI INI_SIG(fi_sockets_ini) +# define SOCKETS_INIT fi_sockets_ini() +SOCKETS_INI ; +#else +# define SOCKETS_INIT NULL +#endif + +#if (HAVE_USNIC) && (HAVE_USNIC_DL) +# define USNIC_INI FI_EXT_INI +# define USNIC_INIT NULL +#elif (HAVE_USNIC) +# define USNIC_INI INI_SIG(fi_usnic_ini) +# define USNIC_INIT fi_usnic_ini() +USNIC_INI ; +#else +# define USNIC_INIT NULL +#endif + +#endif /* _PROV_H_ */ diff --git a/opal/mca/common/libfabric/libfabric/include/rdma/fabric.h b/opal/mca/common/libfabric/libfabric/include/rdma/fabric.h index 2a26b596e0..895fbeec59 100644 --- a/opal/mca/common/libfabric/libfabric/include/rdma/fabric.h +++ b/opal/mca/common/libfabric/libfabric/include/rdma/fabric.h @@ -201,7 +201,7 @@ enum { FI_PROTO_IB_UD, FI_PROTO_PSMX, FI_PROTO_UDP, - FI_PROTO_SOCK_RDS, + FI_PROTO_SOCK_TCP }; /* Mode bits */ @@ -232,6 +232,7 @@ struct fi_rx_attr { struct fi_ep_attr { uint32_t protocol; + uint32_t protocol_version; size_t max_msg_size; size_t inject_size; size_t total_buffered_recv; diff --git a/opal/mca/common/libfabric/libfabric/include/rdma/fi_domain.h b/opal/mca/common/libfabric/libfabric/include/rdma/fi_domain.h index e8899fd13a..9a274b121b 100644 --- a/opal/mca/common/libfabric/libfabric/include/rdma/fi_domain.h +++ b/opal/mca/common/libfabric/libfabric/include/rdma/fi_domain.h @@ -192,6 +192,20 @@ fi_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, return domain->ops->cntr_open(domain, attr, cntr, context); } +static inline int +fi_wait_open(struct fid_domain *domain, struct fi_wait_attr *attr, + struct fid_wait **waitset) +{ + return domain->ops->wait_open(domain, attr, waitset); +} + +static inline int +fi_poll_open(struct fid_domain *domain, struct fi_poll_attr *attr, + struct fid_poll **pollset) +{ + return domain->ops->poll_open(domain, attr, pollset); +} + static inline int fi_mr_reg(struct fid_domain *domain, const void *buf, size_t len, uint64_t access, uint64_t offset, uint64_t requested_key, diff --git a/opal/mca/common/libfabric/libfabric/include/rdma/fi_eq.h b/opal/mca/common/libfabric/libfabric/include/rdma/fi_eq.h index 66e62628fe..436459af8d 100644 --- a/opal/mca/common/libfabric/libfabric/include/rdma/fi_eq.h +++ b/opal/mca/common/libfabric/libfabric/include/rdma/fi_eq.h @@ -53,7 +53,7 @@ enum fi_wait_obj { FI_WAIT_UNSPEC, FI_WAIT_SET, FI_WAIT_FD, - FI_WAIT_MUT_COND, /* pthread mutex & cond */ + FI_WAIT_MUTEX_COND, /* pthread mutex & cond */ }; struct fi_wait_attr { @@ -70,14 +70,12 @@ struct fid_wait { struct fid fid; struct fi_ops_wait *ops; }; - -struct fi_wait_obj_set { - size_t count; - enum fi_wait_obj wait_obj; - void *obj; + +struct fi_mutex_cond { + pthread_mutex_t 
*mutex; + pthread_cond_t *cond; }; - /* * Poll Set * Allows polling multiple event queues and counters for progress @@ -90,6 +88,10 @@ struct fi_poll_attr { struct fi_ops_poll { size_t size; int (*poll)(struct fid_poll *pollset, void **context, int count); + int (*poll_add)(struct fid_poll *pollset, struct fid *event_fid, + uint64_t flags); + int (*poll_del)(struct fid_poll *pollset, struct fid *event_fid, + uint64_t flags); }; struct fid_poll { @@ -301,6 +303,17 @@ fi_poll(struct fid_poll *pollset, void **context, int count) return pollset->ops->poll(pollset, context, count); } +static inline int +fi_poll_add(struct fid_poll *pollset, struct fid *event_fid, uint64_t flags) +{ + return pollset->ops->poll_add(pollset, event_fid, flags); +} + +static inline int +fi_poll_del(struct fid_poll *pollset, struct fid *event_fid, uint64_t flags) +{ + return pollset->ops->poll_del(pollset, event_fid, flags); +} static inline int fi_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, diff --git a/opal/mca/common/libfabric/libfabric/include/rdma/fi_prov.h b/opal/mca/common/libfabric/libfabric/include/rdma/fi_prov.h index aeb779401b..9af1fce2a4 100644 --- a/opal/mca/common/libfabric/libfabric/include/rdma/fi_prov.h +++ b/opal/mca/common/libfabric/libfabric/include/rdma/fi_prov.h @@ -43,31 +43,33 @@ extern "C" { #endif /* - * Extension that low-level drivers should add to their .so filename - * (probably via libtool "-release" option). For example a low-level - * driver named "libfoo" should build a plug-in named "libfoo-fi.so". + * Extension that dl-loaded providers should add to their .so filename + * (probably via libtool "-release" option). For example a provider + * driver named "foo" should build a plug-in named "libfoo-fi.so", and + * place it in $prefix/$libdir/libfabric/ */ #define FI_LIB_EXTENSION "fi" #define FI_LIB_SUFFIX FI_LIB_EXTENSION ".so" -#define FI_LIB_CLASS_NAME "libfabric" +/* + * Dynamically loaded providers must export the following entry point. + * This is invoked by the libfabric framework when the provider library + * is loaded. 
+ */ +#define FI_EXT_INI \ + __attribute__((visibility ("default"))) struct fi_provider* fi_prov_ini(void) struct fi_provider { - const char *name; uint32_t version; + uint32_t fi_version; + const char *name; int (*getinfo)(uint32_t version, const char *node, const char *service, uint64_t flags, struct fi_info *hints, struct fi_info **info); int (*fabric)(struct fi_fabric_attr *attr, struct fid_fabric **fabric, void *context); + void (*cleanup)(void); }; -int fi_register_provider(uint32_t fi_version, struct fi_provider *provider); -static inline int fi_register(struct fi_provider *provider) -{ - return fi_register_provider(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), - provider); -} - #ifdef __cplusplus } #endif diff --git a/opal/mca/common/libfabric/libfabric/libfabric.map b/opal/mca/common/libfabric/libfabric/libfabric.map index 890aa5772b..db689ddc73 100644 --- a/opal/mca/common/libfabric/libfabric/libfabric.map +++ b/opal/mca/common/libfabric/libfabric/libfabric.map @@ -6,7 +6,6 @@ FABRIC_1.0 { fi_fabric; fi_version; fi_strerror; - fi_register_provider; fi_tostr; local: *; }; diff --git a/opal/mca/common/libfabric/libfabric/libfabric.spec b/opal/mca/common/libfabric/libfabric/libfabric.spec index 062f89cb93..389a9e334a 100644 --- a/opal/mca/common/libfabric/libfabric/libfabric.spec +++ b/opal/mca/common/libfabric/libfabric/libfabric.spec @@ -1,50 +1,41 @@ -%define ver 0.0.2 - Name: libfabric Version: 0.0.2 Release: 1%{?dist} -Summary: Userspace RDMA Fabric Interfaces - +Summary: User-space RDMA Fabric Interfaces Group: System Environment/Libraries License: GPLv2 or BSD Url: http://www.github.com/ofiwg/libfabric -Source: http://www.openfabrics.org/downloads/fabrics/%{name}-%{version}.tar.gz -BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) +Source: http://www.openfabrics.org/downloads/fabrics/%{name}-%{version}.tar.bz2 +Prefix: ${_prefix} %description -libfabric provides a userspace API to access high-performance fabric +libfabric provides a user-space API to access high-performance fabric services, such as RDMA. %package devel Summary: Development files for the libfabric library Group: System Environment/Libraries +Requires: libfabric = %{version} %description devel Development files for the libfabric library. -%package utils -Summary: Examples for the libfabric library -Group: System Environment/Libraries -Requires: %{name} = %{version}-%{release} - -%description utils -Example test programs for the libfabric library. 
- %prep -%setup -q -n %{name}-%{ver} +%setup -q -n %{name}-%{version} %build -%configure +# defaults: with-dlopen and without-valgrind can be over-rode: +%configure %{?_without_dlopen} %{?_with_valgrind} make %{?_smp_mflags} %install -rm -rf $RPM_BUILD_ROOT -%makeinstall +rm -rf %{buildroot} +%makeinstall installdirs # remove unpackaged files from the buildroot -rm -f $RPM_BUILD_ROOT%{_libdir}/*.la +rm -f %{buildroot}%{_libdir}/*.la %clean -rm -rf $RPM_BUILD_ROOT +rm -rf %{buildroot} %post -p /sbin/ldconfig %postun -p /sbin/ldconfig @@ -52,6 +43,7 @@ rm -rf $RPM_BUILD_ROOT %files %defattr(-,root,root,-) %{_libdir}/lib*.so.* +%dir %{_libdir}/libfabric/ %doc AUTHORS COPYING README %files devel @@ -62,10 +54,6 @@ rm -rf $RPM_BUILD_ROOT %{_mandir}/man3/* %{_mandir}/man7/* -%files utils -%defattr(-,root,root,-) -%{_bindir}/* -%{_mandir}/man1/* - %changelog - +* Mon Jan 19 2015 Maintainer Name 1.0.0 +- TODO: Release manager fill this out for initial release diff --git a/opal/mca/common/libfabric/libfabric/libfabric.spec.in b/opal/mca/common/libfabric/libfabric/libfabric.spec.in index 5341e89be6..ef77e8ca5b 100644 --- a/opal/mca/common/libfabric/libfabric/libfabric.spec.in +++ b/opal/mca/common/libfabric/libfabric/libfabric.spec.in @@ -1,50 +1,41 @@ -%define ver @VERSION@ - Name: libfabric -Version: 0.0.2 +Version: @VERSION@ Release: 1%{?dist} -Summary: Userspace RDMA Fabric Interfaces - +Summary: User-space RDMA Fabric Interfaces Group: System Environment/Libraries License: GPLv2 or BSD Url: http://www.github.com/ofiwg/libfabric -Source: http://www.openfabrics.org/downloads/fabrics/%{name}-%{version}.tar.gz -BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) +Source: http://www.openfabrics.org/downloads/fabrics/%{name}-%{version}.tar.bz2 +Prefix: ${_prefix} %description -libfabric provides a userspace API to access high-performance fabric +libfabric provides a user-space API to access high-performance fabric services, such as RDMA. %package devel Summary: Development files for the libfabric library Group: System Environment/Libraries +Requires: libfabric = %{version} %description devel Development files for the libfabric library. -%package utils -Summary: Examples for the libfabric library -Group: System Environment/Libraries -Requires: %{name} = %{version}-%{release} - -%description utils -Example test programs for the libfabric library. 
- %prep -%setup -q -n %{name}-%{ver} +%setup -q -n %{name}-%{version} %build -%configure +# defaults: with-dlopen and without-valgrind can be over-rode: +%configure %{?_without_dlopen} %{?_with_valgrind} make %{?_smp_mflags} %install -rm -rf $RPM_BUILD_ROOT -%makeinstall +rm -rf %{buildroot} +%makeinstall installdirs # remove unpackaged files from the buildroot -rm -f $RPM_BUILD_ROOT%{_libdir}/*.la +rm -f %{buildroot}%{_libdir}/*.la %clean -rm -rf $RPM_BUILD_ROOT +rm -rf %{buildroot} %post -p /sbin/ldconfig %postun -p /sbin/ldconfig @@ -52,6 +43,7 @@ rm -rf $RPM_BUILD_ROOT %files %defattr(-,root,root,-) %{_libdir}/lib*.so.* +%dir %{_libdir}/libfabric/ %doc AUTHORS COPYING README %files devel @@ -62,10 +54,6 @@ rm -rf $RPM_BUILD_ROOT %{_mandir}/man3/* %{_mandir}/man7/* -%files utils -%defattr(-,root,root,-) -%{_bindir}/* -%{_mandir}/man1/* - %changelog - +* Mon Jan 19 2015 Maintainer Name 1.0.0 +- TODO: Release manager fill this out for initial release diff --git a/opal/mca/common/libfabric/libfabric/man/fabric.7 b/opal/mca/common/libfabric/libfabric/man/fabric.7 index f4abe79fb6..228f9521b3 100644 --- a/opal/mca/common/libfabric/libfabric/man/fabric.7 +++ b/opal/mca/common/libfabric/libfabric/man/fabric.7 @@ -1,4 +1,4 @@ -.TH fabric 7 "2014\-12\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fabric 7 "2014\-12\-15" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .SH NAME .PP Fabric Interface Library @@ -106,12 +106,13 @@ Endpoints are configured with specific communication capabilities and data transfer interfaces. .PP \f[I]fi_eq - Event Queue\f[] : Event queues, are used to collect and -report the completion of asynchronous operations. -For example, the completion of a data transfer operation submitted over -a fabric endpoint may write an event to an event queue associated with -the endpoint. -There are multiple types of event queues, and the format of the events -that they report are controlled by applications. +report the completion of asynchronous operations and events. +Event queues report events that are not directly associated with data +transfer operations. +.PP +\f[I]fi_cq - Completion Queue\f[] : Completion queues are +high-performance event queues used to report the completion of data +transfer operations. .PP \f[I]fi_cntr - Event Counters\f[] : Event counters are used to report the number of completed asynchronous operations. @@ -214,15 +215,13 @@ addresses must support FI_SOCKADDR_IN and FI_SOCKADDR_IN6 input formats. Address vectors must support FI_ADDR, FI_ADDR_INDEX, and FI_AV output formats. .IP \[bu] 2 -Access domains must support opening event queues and counters. +Access domains must support opening completion queues and counters. .IP \[bu] 2 -Event queues must support the FI_EQ_FORMAT_CONTEXT format. -.IP \[bu] 2 -Event queues associated with data transfer completions must support the -FI_EQ_FORMAT_DATA format. +Completion queues must support the FI_CQ_FORMAT_CONTEXT and +FI_CQ_FORMAT_MSG formats. .IP \[bu] 2 Event queues associated with tagged message transfers must support the -FI_EQ_FORMAT_TAGGED format. +FI_CQ_FORMAT_TAGGED format. .IP \[bu] 2 A provider is expected to be forward compatible, and must be able to be compiled against expanded \f[C]fi_xxx_ops\f[] structures that define new @@ -231,6 +230,7 @@ Any unknown functions must be set to NULL. 
.SH SEE ALSO .PP \f[C]fi_getinfo\f[](3), \f[C]fi_endpoint\f[](3), \f[C]fi_domain\f[](3), -\f[C]fi_av\f[](3), \f[C]fi_eq\f[](3), \f[C]fi_mr\f[](3) +\f[C]fi_av\f[](3), \f[C]fi_eq\f[](3), \f[C]fi_cq\f[](3), +\f[C]fi_cntr\f[](3), \f[C]fi_mr\f[](3) .SH AUTHORS OpenFabrics. diff --git a/opal/mca/common/libfabric/libfabric/man/fi_cntr.3 b/opal/mca/common/libfabric/libfabric/man/fi_cntr.3 index c86d54d1d4..2a9af9f189 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_cntr.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_cntr.3 @@ -1,4 +1,4 @@ -.TH fi_cntr 3 "2014\-11\-21" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_cntr 3 "2014\-12\-15" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .SH NAME .PP fi_cntr - Completion and event counter operations @@ -105,7 +105,7 @@ Users may use fi_control to retrieve the underlying wait object associated with a counter, in order to use it in other system calls. The following values may be used to specify the type of wait object associated with a counter: FI_WAIT_NONE, FI_WAIT_UNSPEC, FI_WAIT_SET, -FI_WAIT_FD, and FI_WAIT_MUT_COND. +FI_WAIT_FD, and FI_WAIT_MUTEX_COND. .IP \[bu] 2 \f[I]FI_WAIT_NONE\f[] : Used to indicate that the user will not block (wait) for events on the counter. @@ -130,7 +130,7 @@ routines. However, a provider may signal an FD wait object by marking it as readable, writable, or with an error. .IP \[bu] 2 -\f[I]FI_WAIT_MUT_COND\f[] : Specifies that the counter should use a +\f[I]FI_WAIT_MUTEX_COND\f[] : Specifies that the counter should use a pthread mutex and cond variable as a wait object. .PP \f[I]wait_set\f[] : If wait_obj is FI_WAIT_SET, this field references a @@ -167,13 +167,7 @@ operational flags associated with the counter. the low-level wait object associated with the counter. The format of the wait-object is specified during counter creation, through the counter attributes. -The fi_cntr_control arg parameter should be an address where a pointer -to the returned wait object will be written. -.PP -\f[I]FI_CNTR_WAIT_MUT_COND\f[] : The counter wait is implemented using a -pthread_mutex_t and pthread_cond_t. -FI_GETWAIT will return two pointers, a reference to pthread_mutex_t * -and pthread_cond_t *, respectively. +See fi_eq.3 for addition details using control with FI_GETWAIT. .SS fi_cntr_read .PP The fi_cntr_read call returns the current value of the counter. diff --git a/opal/mca/common/libfabric/libfabric/man/fi_cq.3 b/opal/mca/common/libfabric/libfabric/man/fi_cq.3 index 14a4a81b43..d2ac0d06b6 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_cq.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_cq.3 @@ -1,4 +1,4 @@ -.TH fi_cq 3 "2014\-12\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_cq 3 "2014\-12\-15" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .SH NAME .PP fi_cq - Completion queue operations @@ -219,7 +219,7 @@ Users may use fi_control to retrieve the underlying wait object associated with an CQ, in order to use it in other system calls. The following values may be used to specify the type of wait object associated with an CQ: FI_WAIT_NONE, FI_WAIT_UNSPEC, FI_WAIT_SET, -FI_WAIT_FD, and FI_WAIT_MUT_COND. +FI_WAIT_FD, and FI_WAIT_MUTEX_COND. .IP \[bu] 2 \f[I]FI_WAIT_NONE\f[] : Used to indicate that the user will not block (wait) for completions on the CQ. @@ -247,7 +247,7 @@ routines. However, a provider may signal an FD wait object by marking it as readable, writable, or with an error. 
.IP \[bu] 2 -\f[I]FI_WAIT_MUT_COND\f[] : Specifies that the CQ should use a pthread +\f[I]FI_WAIT_MUTEX_COND\f[] : Specifies that the CQ should use a pthread mutex and cond variable as a wait object. .PP \f[I]signaling_vector\f[] : Indicates which processor core interrupts @@ -302,8 +302,7 @@ The following control commands are usable with an CQ. the low-level wait object associated with the CQ. The format of the wait-object is specified during CQ creation, through the CQ attributes. -The fi_control arg parameter should be an address where a pointer to the -returned wait object will be written. +See fi_eq.3 for addition details using control with FI_GETWAIT. .SS fi_cq_read / fi_cq_readfrom .PP The fi_cq_read and fi_cq_readfrom operations perform a non-blocking read @@ -394,6 +393,19 @@ Len must be a multiple of the size of the event to insert. .PP User events inserted into a CQ with be associated with the source address FI_ADDR_NOTAVAIL. +.SH COMPLETION FLAGS +.PP +Completion flags provide additional details regarding the completed +operation. +The following completion flags are defined. +.PP +*FI_REMOTE_CQ_DATA : This indicates that remote CQ data is available as +part of the completion. +.PP +\f[I]FI_MULTI_RECV\f[] : This flag applies to receive buffers that were +posted with the FI_MULTI_RECV flag set. +This completion flag indicates that the receive buffer referenced by the +completion has been consumed and was released by the provider. .SH RETURN VALUES .PP fi_cq_open : Returns 0 on success. diff --git a/opal/mca/common/libfabric/libfabric/man/fi_domain.3 b/opal/mca/common/libfabric/libfabric/man/fi_domain.3 index 57ec6f61d5..2e39f966ba 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_domain.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_domain.3 @@ -1,4 +1,4 @@ -.TH fi_domain 3 "2014\-11\-21" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_domain 3 "2014\-12\-19" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .SH NAME .PP fi_domain - Open a fabric access domain @@ -160,6 +160,9 @@ For instance, endpoints that share the same event queue or poll set belong to the same progress domain. Applications that can allocate endpoint resources to specific threads can reduce provider locking by using FI_THREAD_PROGRESS. +.PP +\f[I]FI_THREAD_DOMAIN\f[] : A domain serialization model requires +applications to serialize access to all objects belonging to a domain. .SS Progress Models (control_progress / data_progress) .PP Progress is the ability of the underlying implementation to complete @@ -174,7 +177,7 @@ application threads. .PP Control progress indicates the method that the provider uses to make progress on asynchronous control operations. -Control operations are function which do not directly involve the +Control operations are functions which do not directly involve the transfer of application data between endpoints. They include address vector, memory registration, and connection management routines. @@ -248,9 +251,9 @@ the provider. .PP The number of outbound command queues optimally supported by the provider. -For a low-level provider, this represents the number command queues to -the hardware and/or the number of parallel transmit engines effectively -supported by the hardware and caches. +For a low-level provider, this represents the number of command queues +to the hardware and/or the number of parallel transmit engines +effectively supported by the hardware and caches. 
Applications which allocate more transmit contexts than this value will end up sharing underlying resources. By default, there is a single transmit context associated with each diff --git a/opal/mca/common/libfabric/libfabric/man/fi_endpoint.3 b/opal/mca/common/libfabric/libfabric/man/fi_endpoint.3 index 546d29f058..842930cd50 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_endpoint.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_endpoint.3 @@ -1,4 +1,4 @@ -.TH fi_endpoint 3 "2014\-12\-04" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_endpoint 3 "2014\-12\-18" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .SH NAME .PP fi_endpoint - Fabric endpoint operations @@ -9,8 +9,8 @@ Allocate or close an endpoint. .RE .TP .B fi_ep_bind -Associate an endpoint with an event queue, completion queue, address -vector, or memory region +Associate an endpoint with an event queue, completion queue, counter, +address vector, or memory region .RS .RE .TP @@ -227,14 +227,6 @@ completion of a subsequent operation. Use of this flag may improve performance by allowing the provider to avoid writing a completion entry for every operation. .PP -The use of FI_COMPLETION is often paired with the call fi_sync. -FI_COMPLETION allows the user to suppress completions from being -generated. -In order for the application to ensure that all previous operations have -completed, the application may call fi_sync. -The successful completion of fi_sync indicates that all prior operations -have completed successfully. -.PP An endpoint may also, or instead, be bound to a fabric counter. When binding an endpoint to a counter, the following flags may be specified. @@ -346,6 +338,11 @@ The following option levels and option names and parameters are defined. \f[I]FI_OPT_MIN_MULTI_RECV - size_t\f[] : Defines the minimum receive buffer space available when the receive buffer is automatically freed (see FI_MULTI_RECV). +Modifying this value is only guaranteed to set the minimum buffer space +needed on receives posted after the value has been changed. +It is recommended that applications that want to override the default +MIN_MULTI_RECV value set this option before enabling the corresponding +endpoint. .SH ENDPOINT ATTRIBUTES .PP The fi_ep_attr structure defines the set of attributes associated with @@ -354,7 +351,8 @@ an endpoint. .nf \f[C] struct\ fi_ep_attr\ { -\ \ \ \ uint64_t\ \ protocol; +\ \ \ \ uint32_t\ \ protocol; +\ \ \ \ uint32_t\ \ protocol_version; \ \ \ \ size_t\ \ \ \ max_msg_size; \ \ \ \ size_t\ \ \ \ inject_size; \ \ \ \ size_t\ \ \ \ total_buffered_recv; @@ -376,8 +374,8 @@ A matching protocol must be used by communicating endpoints to ensure interoperability. The following protocol values are defined. Provider specific protocols are also allowed. -Provider specific protocols will be indicated by having the upper 3 -bytes of the protocol value set to the vendor OUI. +Provider specific protocols will be indicated by having the upper bit of +the protocol value set to one. .PP \f[I]FI_PROTO_UNSPEC\f[] : The protocol is not specified. This is usually provided as input, with other attributes of the socket @@ -397,6 +395,15 @@ datagram queue pairs. protocol known as PSM, performance scaled messaging. PSMX is an extended version of the PSM protocol to support the libfabric interfaces. +.SS protocol_version - Protocol Version +.PP +Identifies which version of the protocol is employeed by the provider. 
+The protocol version allows providers to extend an existing protocol, by +adding support for additional features or functionality for example, in +a backward compatible manner. +Providers that support different versions of the same protocol should +interoperate, but only when using the capabilities defined for the +lesser version. .SS max_msg_size - Max Message Size .PP Defines the maximum size for an application data transfer as a single @@ -584,7 +591,7 @@ submission. Number of transmit contexts to associate with the endpoint. If not specified (0), 1 context will be assigned if the endpoint supports outbound transfers. -Transmit contexts are independent command queues that may be separately +Transmit contexts are independent transmit queues that may be separately configured. Each transmit context may be bound to a separate CQ, and no ordering is defined between contexts. @@ -637,7 +644,7 @@ and require the application to explicitly create transmit and receive contexts as described below. .SS fi_tx_context .PP -Transmit contexts are independent command queues. +Transmit contexts are independent transmit queues. Ordering and synchronization between contexts are not defined. Conceptually a transmit context behaves similar to a send-only endpoint. A transmit context may be configured with relaxed capabilities, and has @@ -706,7 +713,7 @@ operation. (scatter-gather elements) that a single posted operation may reference. .SS fi_rx_context .PP -Receive contexts are independent command queues for receiving incoming +Receive contexts are independent receive queues for receiving incoming data. Ordering and synchronization between contexts are not guaranteed. Conceptually a receive context behaves similar to a receive-only @@ -797,7 +804,7 @@ processing, with the potential cost of serializing access across multiple endpoints. Support for sharable contexts is domain specific. .PP -Conceptually, sharable contexts are command queues that may be accessed +Conceptually, sharable contexts are transmit queues that may be accessed by many endpoints. The use of a shared transmit context is mostly opaque to an application. Applications must allocate and bind shared transmit contexts to @@ -935,6 +942,13 @@ such data transfers. Operations that complete in error that are not associated with valid operational context will use the endpoint context in any error reporting structures. +.PP +Users can attach both counters and completion queues to an endpoint. +When both counter and completion queue are attached, a successful +completion increments the counter and does not generate a completion +entry in the completion queue. +Operations that complete with an error increment the error counter and +generate a completion event. .SH RETURN VALUES .PP Returns 0 on success. diff --git a/opal/mca/common/libfabric/libfabric/man/fi_eq.3 b/opal/mca/common/libfabric/libfabric/man/fi_eq.3 index 0733afa988..ddb347acb3 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_eq.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_eq.3 @@ -1,4 +1,4 @@ -.TH fi_eq 3 "2014\-12\-03" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_eq 3 "2014\-12\-15" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .SH NAME .PP fi_eq - Event queue operations @@ -144,7 +144,7 @@ routines. However, a provider may signal an FD wait object by marking it as readable, writable, or with an error. 
.IP \[bu] 2 -\f[I]FI_WAIT_MUT_COND\f[] : Specifies that the EQ should use a pthread +\f[I]FI_WAIT_MUTEX_COND\f[] : Specifies that the EQ should use a pthread mutex and cond variable as a wait object. .PP \f[I]signaling_vector\f[] : Indicates which processor core interrupts @@ -176,6 +176,17 @@ The format of the wait-object is specified during EQ creation, through the EQ attributes. The fi_control arg parameter should be an address where a pointer to the returned wait object will be written. +This should be an \[aq]int *\[aq] for FI_WAIT_FD, or \[aq]struct +fi_mutex_cond\[aq] for FI_WAIT_MUTEX_COND. +.IP +.nf +\f[C] +struct\ fi_mutex_cond\ { +\ \ \ \ pthread_mutex_t\ \ \ \ \ *mutex; +\ \ \ \ pthread_cond_t\ \ \ \ \ \ *cond; +}; +\f[] +.fi .SS fi_eq_read .PP The fi_eq_read operations performs a non-blocking read of event data diff --git a/opal/mca/common/libfabric/libfabric/man/fi_fabric.3 b/opal/mca/common/libfabric/libfabric/man/fi_fabric.3 index 49317b30d1..649a50fb46 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_fabric.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_fabric.3 @@ -1,4 +1,4 @@ -.TH fi_fabric 3 "2014\-12\-04" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_fabric 3 "2014\-12\-12" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .SH NAME .PP fi_fabric - Fabric domain operations @@ -89,6 +89,11 @@ uint64_t flags \f[I]FI_TYPE_PROTO\f[] : struct fi_ep_attr::protocol field .PP \f[I]FI_TYPE_MSG_ORDER\f[] : struct fi_ep_attr::msg_order field +.PP +fi_tostr() will return a pointer to an internal libfabric buffer that +should not be modified, and will be overwritten the next time fi_tostr() +is invoked. +fi_tostr() is not thread safe. .SH NOTES .PP The following resources are associated with fabric domains: access diff --git a/opal/mca/common/libfabric/libfabric/man/fi_getinfo.3 b/opal/mca/common/libfabric/libfabric/man/fi_getinfo.3 index 9f1042b5d4..cfcf94ce59 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_getinfo.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_getinfo.3 @@ -1,4 +1,4 @@ -.TH fi_getinfo 3 "2014\-12\-08" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_getinfo 3 "2014\-12\-16" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .SH NAME .PP fi_getinfo / fi_freeinfo - Obtain / free fabric interface information @@ -158,7 +158,7 @@ When provided as hints, requested values of struct fi_tx_ctx_attr should be set. On output, the actual transmit context attributes that can be provided will be returned. -Output values will be greater than or or equal to the requested input +Output values will be greater than or equal to the requested input values. .PP \f[I]rx_attr - receive context attributes\f[] : Optionally supplied @@ -232,7 +232,7 @@ endpoint as send-only or receive-only. \f[I]FI_RMA\f[] : Specifies that the endpoint should support RMA read and write operations. Endpoints supporting this capability support operations defined by -struct fi_rma_ops. +struct fi_ops_rma. In the absence of any relevant flags, FI_RMA implies the ability to initiate and be the target of remote memory reads and writes. Applications can use the FI_READ, FI_WRITE, FI_REMOTE_READ, and @@ -241,10 +241,10 @@ by an endpoint. .PP \f[I]FI_TAGGED\f[] : Specifies that the endpoint should handle tagged message transfers. -tagged message transfers associate a user-specified key or tag with each +Tagged message transfers associate a user-specified key or tag with each message that is used for matching purposes at the remote side. 
Endpoints supporting this capability support operations defined by -struct fi_tagged_ops. +struct fi_ops_tagged. In the absence of any relevant flags, FI_TAGGED implies the ability to send and receive tagged messages. Applications can use the FI_SEND and FI_RECV flags to optimize an @@ -253,7 +253,7 @@ endpoint as send-only or receive-only. \f[I]FI_ATOMICS\f[] : Specifies that the endpoint supports some set of atomic operations. Endpoints supporting this capability support operations defined by -struct fi_atomic_ops. +struct fi_ops_atomic. In the absence of any relevant flags, FI_ATOMICS implies the ability to initiate and be the target of remote atomic reads and writes. Applications can use the FI_READ, FI_WRITE, FI_REMOTE_READ, and @@ -263,7 +263,7 @@ supported by an endpoint. \f[I]FI_MULTICAST\f[] : Indicates that the endpoint should support multicast data transfers. Endpoints supporting this capability support multicast operations -defined by struct fi_msg_ops, when a multicast address is specified as +defined by struct fi_ops_msg, when a multicast address is specified as the destination address. In the absence of any relevant flags, FI_MULTICAST implies the ability to send and receive messages. @@ -496,7 +496,7 @@ See \f[C]fi_av\f[](3). specific address format should be selected. Provider specific addresses may be protocol specific or a vendor proprietary format. -Applications that select FI_FORMAT_UNSPEC should be prepared to be treat +Applications that select FI_FORMAT_UNSPEC should be prepared to treat returned addressing data as opaque. FI_FORMAT_UNSPEC targets apps which make use of an out of band address exchange. @@ -512,7 +512,7 @@ interfaces examining the sa_family field. \f[I]FI_SOCKADDR_IN6\f[] : Address is of type sockaddr_in6 (IPv6). .PP \f[I]FI_SOCKADDR_IB\f[] : Address is of type sockaddr_ib (defined in -Linux kernel source +Linux kernel source) .PP \f[I]FI_ADDR_PSMX\f[] : Address is an Intel proprietary format that is used with their PSMX (extended performance scaled messaging) protocol. diff --git a/opal/mca/common/libfabric/libfabric/man/fi_mr.3 b/opal/mca/common/libfabric/libfabric/man/fi_mr.3 index 5ede63f5c3..5c479edb4a 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_mr.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_mr.3 @@ -1,4 +1,4 @@ -.TH fi_mr 3 "2014-11-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_mr 3 "2014\-12\-19" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .SH NAME .PP fi_mr - Memory region operations @@ -90,7 +90,15 @@ In order to support as broad range of applications as possible, without unduly affecting their performance, applications that wish to manage their own local memory registrations may do so by using the memory registration calls. -Applications may use the FI_LOCAL_MR domain capability bit as a guide. +Applications may use the FI_LOCAL_MR domain mode bit as a guide. +.PP +When the FI_LOCAL_MR mode bit is set, applications must register all +data buffers that will be accessed by the local hardware and provide a +valid mem_desc parameter into applicable data transfer operations. +When FI_LOCAL_MR is zero, applications are not required to register data +buffers before using them for local operations (e.g. +send and receive data buffers), and the mem_desc parameter into data +transfer operations is ignored. 
.PP Providers may support applications registering any range of addresses in their virtual address space, whether or not those addresses are back by @@ -177,6 +185,9 @@ Support for user requested keys is provider specific and is determined by the FI_PROV_MR_ATTR mode bit. Access domains must be opened with the FI_PROV_MR_ATTR mode cleared in order to enable support for application selectable MR keys. +The requested_key parameter is ignored for memory registration calls +unless the access flags include either FI_REMOTE_READ or +FI_REMOTE_WRITE. .PP Remote RMA and atomic operations indicate the location within a registered memory region by specifying an address. @@ -221,18 +232,10 @@ struct\ fi_mr_attr\ { .fi .SS fi_close .PP -Fi_close may be used to release all resources associated with a -registering a memory region. +Fi_close is used to release all resources associated with a registering +a memory region. Once unregistered, further access to the registered memory is not guaranteed. -For performance reasons, unregistration processing may be done -asynchronously or lazily. -To force all queued unregistration requests to complete, applications -may call fi_sync on the domain. -Upon completion of a domain fi_sync call, all memory regions -unregistered before fi_sync was invoked will have completed, and no -further access to the registered region, either locally or remotely, via -fabric resources will be possible. .SS fi_mr_desc / fi_mr_key .PP The local memory descriptor and remote protection key associated with a diff --git a/opal/mca/common/libfabric/libfabric/man/fi_poll.3 b/opal/mca/common/libfabric/libfabric/man/fi_poll.3 index 946610f108..a45ae8574f 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_poll.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_poll.3 @@ -1,4 +1,4 @@ -.TH fi_poll 3 "2014-11-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_poll 3 "2014\-12\-15" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .SH NAME .PP fi_poll - Polling and wait set operations @@ -37,8 +37,6 @@ int\ fi_wait_open(struct\ fid_domain\ *domain,\ struct\ fi_wait_attr\ *attr, int\ fi_close(struct\ fid\ *waitset); -int\ fi_control(struct\ fid\ *waitset,\ int\ command,\ void\ *arg); - int\ fi_wait(struct\ fid_wait\ *waitset,\ int\ timeout); \f[] .fi @@ -71,19 +69,11 @@ A poll set is defined with the following attributes. .nf \f[C] struct\ fi_poll_attr\ { -\ \ \ \ int\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ mask;\ \ \ \ \ \ /*\ valid\ attr\ fields\ */ \ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ \ \ flags;\ \ \ \ \ /*\ operation\ flags\ */ }; \f[] .fi .PP -\f[I]mask\f[] : The mask field is used for forward and backward API -compatibility. -It is used by the application to indicate which fields in the attribute -structure have been set. -For this version of the API, mask should be set to FI_POLL_ATTR_MASK_V1, -indicating that all specified fields have been initialized. -.PP \f[I]flags\f[] : Flags that set the default operation of the poll set. The use of this field is reserved and must be set to 0 by the caller. .SS fi_close @@ -120,29 +110,19 @@ fi_wait_attr. .nf \f[C] struct\ fi_wait_attr\ { -\ \ \ \ int\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ mask;\ \ \ \ \ \ /*\ valid\ attr\ fields\ */ \ \ \ \ enum\ fi_wait_obj\ \ \ \ \ wait_obj;\ \ /*\ requested\ wait\ object\ */ \ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ \ \ flags;\ \ \ \ \ /*\ operation\ flags\ */ }; \f[] .fi .PP -\f[I]mask\f[] : The mask field is used for forward and backward API -compatibility. 
-It is used by the application to indicate which fields in the attribute -structure have been set. -For this version of the API, mask should be set to FI_WAIT_ATTR_MASK_V1, -indicating that all specified fields have been initialized. -.PP \f[I]wait_obj\f[] : Wait sets are associated with specific wait object(s). Wait objects allow applications to block until the wait object is signaled, indicating that an event is available to be read. -Users may use fi_control to retrieve the underlying wait object(s) -associated with a wait set, in order to use it in other system calls. The following values may be used to specify the type of wait object -associated with an wait set: FI_WAIT_UNSPEC, FI_WAIT_FD, and -FI_WAIT_MUT_COND. +associated with a wait set: FI_WAIT_UNSPEC, FI_WAIT_FD, and +FI_WAIT_MUTEX_COND. .IP \[bu] 2 \f[I]FI_WAIT_UNSPEC\f[] : Specifies that the user will only wait on the wait set using fabric interface calls, such as fi_wait. @@ -159,7 +139,7 @@ routines. However, a provider may signal an FD wait object by marking it as readable, writable, or with an error. .IP \[bu] 2 -\f[I]FI_WAIT_MUT_COND\f[] : Specifies that the wait set should use a +\f[I]FI_WAIT_MUTEX_COND\f[] : Specifies that the wait set should use a pthread mutex and cond variable as a wait object. .PP \f[I]flags\f[] : Flags that set the default operation of the wait set. @@ -169,39 +149,6 @@ The use of this field is reserved and must be set to 0 by the caller. The fi_close call releases all resources associated with a wait set. The wait set must not be bound to any other opened resources prior to being closed. -.SS fi_control -.PP -The fi_control call is used to access provider or implementation -specific details of the wait set. -Access to the wait set should be serialized across all calls when -fi_control is invoked, as it may redirect the implementation of wait set -operations. -The following control commands are usable with a wait set. -.PP -\f[I]FI_GETWAIT (void **)\f[] : This command allows the user to retrieve -the low-level wait object(s) associated with the wait set. -The format of the wait-object is specified during wait set creation, -through the wait set attributes. -The fi_control arg parameter should be an address to a struct -fi_wait_obj_set. -.IP -.nf -\f[C] -struct\ fi_wait_obj_set\ { -\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ len;\ \ \ \ \ \ /*\ size\ of\ obj\ array\ entries\ */ -\ \ \ \ enum\ fi_wait_obj\ \ wait_obj;\ /*\ type\ of\ wait\ obj\ */ -\ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ \ \ *obj;\ \ \ \ \ \ /*\ array\ of\ wait\ objects\ */ -}; -\f[] -.fi -.PP -On input, len should indicate the size in bytes referenced by the obj -field. -On output, the needed size will be returned. -The underlying wait objects will be returned in the obj array. -If insufficient space is provided, the results will be truncated. -The wait_obj field may be used to identify the format of the wait -objects. 
.SS fi_wait .PP Waits on a wait set until one or more of its underlying wait objects is diff --git a/opal/mca/common/libfabric/libfabric/man/fi_trigger.3 b/opal/mca/common/libfabric/libfabric/man/fi_trigger.3 index 8f1b79fe56..678a0462b8 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_trigger.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_trigger.3 @@ -1,4 +1,4 @@ -.TH fi_trigger 3 "2014-11-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_trigger 3 "2014\-12\-10" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .SH NAME .PP fi_trigger - Triggered operations @@ -13,8 +13,7 @@ fi_trigger - Triggered operations .PP Triggered operations allow an application to queue a data transfer request that is deferred until a specified condition is met. -It is often used to send a message, but only after receiving all input -data. +A typical use is to send a message only after receiving all input data. .PP A triggered operation may be requested by specifying the FI_TRIGGER flag as part of the operation. @@ -23,7 +22,7 @@ FI_TRIGGER flag. Such an endpoint is referred to as a triggerable endpoint. All data transfer operations on a triggerable endpoint are deferred. .PP -Any data transfer operation is potentially be triggerable, subject to +Any data transfer operation is potentially triggerable, subject to provider constraints. Triggerable endpoints are initialized such that only those interfaces supported by the provider which are triggerable are available. @@ -63,7 +62,7 @@ event type. .PP The following trigger events are defined. .PP -\f[I]FI_TRIGGER_THRESHOL\f[] : This indicates that the data transfer +\f[I]FI_TRIGGER_THRESHOLD\f[] : This indicates that the data transfer operation will be deferred until an event counter crosses an application specified threshold value. 
The threshold is specified using struct fi_trigger_threshold: diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_atomic.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_atomic.c index b0ab052a31..1e615c138c 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_atomic.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_atomic.c @@ -613,7 +613,7 @@ int psmx_am_atomic_handler(psm_am_token_t token, psm_epaddr_t epaddr, case PSMX_AM_REP_ATOMIC_COMPWRITE: req = (struct psmx_am_request *)(uintptr_t)args[1].u64; op_error = (int)args[0].u32w1; - assert(req->atomic.len == len); + assert(op_error || req->atomic.len == len); if (!op_error) memcpy(req->atomic.result, src, len); @@ -795,7 +795,6 @@ ssize_t _psmx_atomic_write(struct fid_ep *ep, size_t idx; ep_priv = container_of(ep, struct psmx_fid_ep, ep); - assert(ep_priv->domain); if (flags & FI_TRIGGER) { struct psmx_trigger *trigger; @@ -981,7 +980,6 @@ ssize_t _psmx_atomic_readwrite(struct fid_ep *ep, size_t idx; ep_priv = container_of(ep, struct psmx_fid_ep, ep); - assert(ep_priv->domain); if (flags & FI_TRIGGER) { struct psmx_trigger *trigger; @@ -1167,7 +1165,6 @@ ssize_t _psmx_atomic_compwrite(struct fid_ep *ep, size_t idx; ep_priv = container_of(ep, struct psmx_fid_ep, ep); - assert(ep_priv->domain); if (flags & FI_TRIGGER) { struct psmx_trigger *trigger; diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_av.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_av.c index ca1cf0a6fd..350e8ed835 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_av.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_av.c @@ -118,9 +118,10 @@ static int psmx_av_insert(struct fid_av *av, const void *addr, size_t count, { struct psmx_fid_av *av_priv; psm_error_t *errors; + int error_count = 0; int *mask; int err; - int i; + int i, j; fi_addr_t *result = NULL; struct psmx_epaddr_context *epaddr_context; @@ -174,25 +175,38 @@ static int psmx_av_insert(struct fid_av *av, const void *addr, size_t count, (psm_epaddr_t *) fi_addr, 30*1e9); for (i=0; idomain, ((psm_epid_t *) addr)[i], ((psm_epaddr_t *) fi_addr)[i]); } + else { + fi_addr[i] = FI_ADDR_NOTAVAIL; + error_count++; + } } free(mask); free(errors); if (av_priv->type == FI_AV_TABLE) { + /* NOTE: unresolved addresses are left in the AV table */ if (result) { - for (i=0; ilast + i; + for (i=0; ilast + i; + if ((fi_addr_t)av_priv->psm_epaddrs[j] == FI_ADDR_NOTAVAIL) + result[i] = FI_ADDR_NOTAVAIL; + else + result[i] = j; + } } av_priv->last += count; } - return psmx_errno(err); + return count - error_count; } static int psmx_av_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t count, diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_cntr.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_cntr.c index 123f6fe7cc..fe55f42b4b 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_cntr.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_cntr.c @@ -384,7 +384,7 @@ int psmx_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, break; case FI_WAIT_FD: - case FI_WAIT_MUT_COND: + case FI_WAIT_MUTEX_COND: wait_attr.wait_obj = attr->wait_obj; wait_attr.flags = 0; err = psmx_wait_open(domain, &wait_attr, (struct fid_wait **)&wait); @@ -394,7 +394,7 @@ int psmx_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, default: psmx_debug("%s: attr->wait_obj=%d, supported=%d...%d\n", __func__, - attr->wait_obj, FI_WAIT_NONE, FI_WAIT_MUT_COND); + attr->wait_obj, 
FI_WAIT_NONE, FI_WAIT_MUTEX_COND); return -FI_EINVAL; } diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_cq.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_cq.c index 75d60260ea..7d3e94657f 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_cq.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_cq.c @@ -84,7 +84,7 @@ struct psmx_cq_event *psmx_cq_create_event(struct psmx_fid_cq *cq, event->cqe.err.data = data; event->cqe.err.tag = tag; event->cqe.err.olen = olen; - event->cqe.err.prov_errno = 0; + event->cqe.err.prov_errno = PSM_INTERNAL_ERR; goto out; } @@ -363,7 +363,7 @@ int psmx_cq_poll_mq(struct psmx_fid_cq *cq, struct psmx_fid_domain *domain, if (mr->domain->rma_ep->remote_write_cntr) psmx_cntr_inc(mr->domain->rma_ep->remote_write_cntr); if (!cq || mr->cq == cq) - return 1; + return psm_status.error_code ? -FI_EAVAIL : 1; continue; } @@ -375,7 +375,7 @@ int psmx_cq_poll_mq(struct psmx_fid_cq *cq, struct psmx_fid_domain *domain, if (mr->domain->rma_ep->remote_read_cntr) psmx_cntr_inc(mr->domain->rma_ep->remote_read_cntr); if (!cq) - return 1; + return psm_status.error_code ? -FI_EAVAIL : 1; continue; } } @@ -434,7 +434,7 @@ int psmx_cq_poll_mq(struct psmx_fid_cq *cq, struct psmx_fid_domain *domain, } if (!cq || tmp_cq == cq) - return 1; + return psm_status.error_code ? -FI_EAVAIL : 1; } else if (err == PSM_MQ_NO_COMPLETIONS) { return 0; @@ -454,7 +454,6 @@ static ssize_t psmx_cq_readfrom(struct fid_cq *cq, void *buf, size_t count, ssize_t read_count; cq_priv = container_of(cq, struct psmx_fid_cq, cq); - assert(cq_priv->domain); if (PSMX_CQ_EMPTY(cq_priv) || !buf) { ret = psmx_cq_poll_mq(cq_priv, cq_priv->domain, @@ -601,7 +600,7 @@ static ssize_t psmx_cq_sreadfrom(struct fid_cq *cq, void *buf, size_t count, else { clock_gettime(CLOCK_REALTIME, &ts0); while (1) { - if (psmx_cq_poll_mq(cq_priv, cq_priv->domain, NULL, 0, NULL) > 0) + if (psmx_cq_poll_mq(cq_priv, cq_priv->domain, NULL, 0, NULL)) break; /* CQ may be updated asynchronously by the AM handlers */ @@ -748,7 +747,7 @@ int psmx_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, break; case FI_WAIT_FD: - case FI_WAIT_MUT_COND: + case FI_WAIT_MUTEX_COND: wait_attr.wait_obj = attr->wait_obj; wait_attr.flags = 0; err = psmx_wait_open(domain, &wait_attr, (struct fid_wait **)&wait); @@ -758,7 +757,7 @@ int psmx_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, default: psmx_debug("%s: attr->wait_obj=%d, supported=%d...%d\n", __func__, attr->wait_obj, - FI_WAIT_NONE, FI_WAIT_MUT_COND); + FI_WAIT_NONE, FI_WAIT_MUTEX_COND); return -FI_EINVAL; } diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_domain.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_domain.c index 2c729d2440..a5a3a5ca52 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_domain.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_domain.c @@ -54,6 +54,11 @@ static int psmx_domain_close(fid_t fid) psm_mq_finalize(domain->psm_mq); #endif + /* workaround for: + * Assertion failure at psm_ep.c:1059: ep->mctxt_master == ep + */ + sleep(1); + err = psm_ep_close(domain->psm_ep, PSM_EP_CLOSE_GRACEFUL, (int64_t) PSMX_TIME_OUT * 1000000000LL); if (err != PSM_OK) diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_init.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_init.c index d3fdbe993e..9c1e09b747 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_init.c +++ 
b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_init.c @@ -32,8 +32,10 @@ #include "psmx.h" #include "fi.h" +#include "prov.h" struct psmx_env psmx_env; +volatile int init_count = 0; static int psmx_reserve_tag_bits(int *caps, uint64_t *max_tag_value) { @@ -107,6 +109,8 @@ static int psmx_getinfo(uint32_t version, const char *node, const char *service, uint64_t max_tag_value = 0; int err = -ENODATA; + psmx_debug("%s\n", __func__); + *info = NULL; if (psm_ep_num_devunits(&cnt) || !cnt) { @@ -114,25 +118,19 @@ static int psmx_getinfo(uint32_t version, const char *node, const char *service, return -FI_ENODATA; } - if (node && !(flags & FI_SOURCE)) { - if (service) - dest_addr = psmx_resolve_name(node, atoi(service)); - else - dest_addr = psmx_resolve_name(node, 0); - } + if (node && !(flags & FI_SOURCE)) + dest_addr = psmx_resolve_name(node, 0); if (hints) { switch (hints->ep_type) { case FI_EP_UNSPEC: case FI_EP_RDM: break; - case FI_EP_MSG: - ep_type = FI_EP_MSG; break; default: - psmx_debug("%s: hints->ep_type=%d, supported=%d,%d,%d.\n", + psmx_debug("%s: hints->ep_type=%d, supported=%d,%d.\n", __func__, hints->ep_type, FI_EP_UNSPEC, - FI_EP_RDM, FI_EP_MSG); + FI_EP_RDM); goto err_out; } @@ -279,6 +277,8 @@ static int psmx_fabric(struct fi_fabric_attr *attr, { struct psmx_fid_fabric *fabric_priv; + psmx_debug("%s\n", __func__); + if (strncmp(attr->name, "psm", 3)) return -FI_ENODATA; @@ -294,11 +294,21 @@ static int psmx_fabric(struct fi_fabric_attr *attr, return 0; } +static void psmx_fini(void) +{ + psmx_debug("%s\n", __func__); + + if (! --init_count) + psm_finalize(); +} + static struct fi_provider psmx_prov = { .name = "PSM", .version = FI_VERSION(0, 9), + .fi_version = FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), .getinfo = psmx_getinfo, .fabric = psmx_fabric, + .cleanup = psmx_fini }; static int psmx_get_int_env(char *name, int default_value) @@ -320,12 +330,14 @@ static int psmx_get_int_env(char *name, int default_value) return default_value; } -static void __attribute__((constructor)) psmx_ini(void) +PSM_INI { int major, minor; int check_version; int err; + psmx_debug("%s\n", __func__); + psmx_env.name_server = psmx_get_int_env("SFI_PSM_NAME_SERVER", 0); psmx_env.am_msg = psmx_get_int_env("SFI_PSM_AM_MSG", 0); psmx_env.tagged_rma = psmx_get_int_env("SFI_PSM_TAGGED_RMA", 0); @@ -342,7 +354,7 @@ static void __attribute__((constructor)) psmx_ini(void) if (err != PSM_OK) { fprintf(stderr, "%s: psm_init failed: %s\n", __func__, psm_error_get_string(err)); - return; + return NULL; } check_version = psmx_get_int_env("SFI_PSM_VERSION_CHECK", 1); @@ -351,13 +363,10 @@ static void __attribute__((constructor)) psmx_ini(void) fprintf(stderr, "%s: PSM version mismatch: header %d.%d, library %d.%d.\n", __func__, PSM_VERNO_MAJOR, PSM_VERNO_MINOR, major, minor); fprintf(stderr, "\tSet envar SFI_PSM_VERSION_CHECK=0 to bypass version check.\n"); - return; + return NULL; } - (void) fi_register(&psmx_prov); + init_count++; + return (&psmx_prov); } -static void __attribute__((destructor)) psmx_fini(void) -{ - psm_finalize(); -} diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg.c index dda0d8f361..612895697a 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg.c @@ -206,7 +206,6 @@ ssize_t _psmx_send(struct fid_ep *ep, const void *buf, size_t len, size_t idx; ep_priv = container_of(ep, struct psmx_fid_ep, ep); - 
assert(ep_priv->domain); if (flags & FI_TRIGGER) { struct psmx_trigger *trigger; diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg2.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg2.c index eae43a6116..ca4b2292c5 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg2.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg2.c @@ -491,7 +491,6 @@ static ssize_t _psmx_send2(struct fid_ep *ep, const void *buf, size_t len, size_t idx; ep_priv = container_of(ep, struct psmx_fid_ep, ep); - assert(ep_priv->domain); if (!buf) return -EINVAL; diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_poll.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_poll.c index ec603194c3..9eb7c33bbe 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_poll.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_poll.c @@ -143,6 +143,8 @@ static struct fi_ops psmx_fi_ops = { static struct fi_ops_poll psmx_poll_ops = { .size = sizeof(struct fi_ops_poll), .poll = psmx_poll_poll, + .poll_add = psmx_poll_add, + .poll_del = psmx_poll_del, }; int psmx_poll_open(struct fid_domain *domain, struct fi_poll_attr *attr, diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_rma.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_rma.c index 054c8ae857..535a573f5d 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_rma.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_rma.c @@ -450,7 +450,6 @@ ssize_t _psmx_read(struct fid_ep *ep, void *buf, size_t len, size_t idx; ep_priv = container_of(ep, struct psmx_fid_ep, ep); - assert(ep_priv->domain); if (flags & FI_TRIGGER) { struct psmx_trigger *trigger; @@ -619,7 +618,6 @@ ssize_t _psmx_write(struct fid_ep *ep, const void *buf, size_t len, size_t idx; ep_priv = container_of(ep, struct psmx_fid_ep, ep); - assert(ep_priv->domain); if (flags & FI_TRIGGER) { struct psmx_trigger *trigger; diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_wait.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_wait.c index 1776694f97..7bed36435e 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_wait.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_wait.c @@ -34,15 +34,10 @@ int psmx_wait_get_obj(struct psmx_fid_wait *wait, void *arg) { - struct fi_wait_obj_set *wait_obj_set = arg; void *obj_ptr; int obj_size = 0; int obj_type = FI_WAIT_NONE; - int ret_count = 0; - struct { - pthread_mutex_t *mutex; - pthread_cond_t *cond; - } mutex_cond; + struct fi_mutex_cond mutex_cond; if (!arg) return -EINVAL; @@ -55,7 +50,7 @@ int psmx_wait_get_obj(struct psmx_fid_wait *wait, void *arg) obj_ptr = &wait->fd[0]; break; - case FI_WAIT_MUT_COND: + case FI_WAIT_MUTEX_COND: mutex_cond.mutex = &wait->mutex; mutex_cond.cond = &wait->cond; obj_size = sizeof(mutex_cond); @@ -69,14 +64,9 @@ int psmx_wait_get_obj(struct psmx_fid_wait *wait, void *arg) } if (obj_size) { - ret_count = 1; - if (wait_obj_set->count) - memcpy(wait_obj_set->obj, obj_ptr, obj_size); + memcpy(arg, obj_ptr, obj_size); } - wait_obj_set->count = ret_count; - wait_obj_set->wait_obj = obj_type; - return 0; } @@ -99,7 +89,7 @@ int psmx_wait_wait(struct fid_wait *wait, int timeout) err = -FI_ETIMEDOUT; break; - case FI_WAIT_MUT_COND: + case FI_WAIT_MUTEX_COND: err = fi_wait_cond(&wait_priv->cond, &wait_priv->mutex, timeout); break; @@ -127,7 +117,7 @@ void psmx_wait_signal(struct fid_wait *wait) write(wait_priv->fd[1], &c, 1); break; - case FI_WAIT_MUT_COND: + 
case FI_WAIT_MUTEX_COND: pthread_cond_signal(&wait_priv->cond); break; } @@ -182,7 +172,7 @@ static int psmx_wait_init(struct psmx_fid_wait *wait, int type) } break; - case FI_WAIT_MUT_COND: + case FI_WAIT_MUTEX_COND: pthread_mutex_init(&wait->mutex, NULL); pthread_cond_init(&wait->cond, NULL); break; @@ -210,14 +200,14 @@ int psmx_wait_open(struct fid_domain *domain, struct fi_wait_attr *attr, break; case FI_WAIT_FD: - case FI_WAIT_MUT_COND: + case FI_WAIT_MUTEX_COND: type = attr->wait_obj; break; default: psmx_debug("%s: attr->wait_obj=%d, supported=%d,%d,%d\n", __func__, attr->wait_obj, FI_WAIT_UNSPEC, - FI_WAIT_FD, FI_WAIT_MUT_COND); + FI_WAIT_FD, FI_WAIT_MUTEX_COND); return -FI_EINVAL; } } diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/list.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/list.c deleted file mode 100644 index b357627f8a..0000000000 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/list.c +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include -#include -#include - -#include "list.h" - -#define LIST_DEF_NUM_ENTRIES (128) - -#define ENQUEUE_LIST(_head, _tail, _elem) do{ \ - (_elem)->next = NULL; \ - if(NULL == (_head)){ \ - (_head) = (_tail) = (_elem); \ - }else{ \ - (_tail)->next = (_elem); \ - } \ - }while(0) - -#define DEQUEUE_LIST(_head, _tail, _elem) do{ \ - if(NULL == _head){ \ - _elem = NULL; \ - }else{ \ - _elem = _head; \ - _head = _head->next; \ - if(_head == NULL) \ - _tail = NULL; \ - } \ - }while(0) - -static int _list_enqueue(list_element_t *element) -{ - if(!element) - return -1; - ENQUEUE_LIST(element->list->head, - element->list->tail, element); - return 0; -} - -static list_element_t *_list_dequeue(list_t *list) -{ - list_element_t *element; - DEQUEUE_LIST(list->head, list->tail, element); - return element; -} - -static int _list_enqueue_free_list(list_element_t *element) -{ - if(!element) - return -1; - ENQUEUE_LIST(element->list->free_head, - element->list->free_tail, element); - return 0; -} - -static list_element_t *_list_dequeue_free_list(list_t *list) -{ - list_element_t *element; - DEQUEUE_LIST(list->free_head, list->free_tail, element); - return element; -} - -list_t *new_list(size_t length) -{ - int i; - list_t *list = (list_t *)malloc(sizeof(list_t) + - length * sizeof(list_element_t)); - - memset(list, 0, sizeof(list_t) + - length * sizeof(list_element_t)); - - list->curr_len = 0; - list->max_len = length; - list->head = list->tail = NULL; - list->free_head = list->free_tail = NULL; - - if(0 != fastlock_init(&(list->lock))) - goto err; - - list_element_t *elements = (list_element_t *) - ((char*)list + sizeof(list_t)); - - for(i=0; ilist = list; - if(0 != _list_enqueue_free_list(element)) - goto err1; - } - return list; - -err1: - fastlock_destroy(&(list->lock)); - -err: - free(list); - return NULL; -} - -void free_list(list_t *list) -{ - fastlock_destroy(&(list->lock)); - free((void *)list); -} - -int enqueue_item(list_t *list, void *data) -{ - int ret; - fastlock_acquire(&(list->lock)); - list_element_t *elem = _list_dequeue_free_list(list); - if(!elem){ - int i; - list_element_t *elements; - - if(list->curr_len == list->max_len){ - list = realloc(list, - sizeof(list_t) + list->max_len * sizeof(list_element_t) + - sizeof(list_element_t) * LIST_DEF_NUM_ENTRIES); - if(!list){ - fastlock_release(&(list->lock)); - return -1; - } - - elements = (list_element_t *) ((char*)list + sizeof(list_t) + - sizeof(list_element_t) * list->max_len); - memset(elements, 0, sizeof(list_element_t) * - LIST_DEF_NUM_ENTRIES); - - for(i=0; ilock)); - return -1; - } - } - list->max_len += LIST_DEF_NUM_ENTRIES; - elem = _list_dequeue_free_list(list); - if(!elem){ - fastlock_release(&(list->lock)); - return -1; - } - } - } - - elem->next = NULL; - elem->data = data; - elem->len = 0; - ret = _list_enqueue(elem); - if(!ret) - list->curr_len++; - fastlock_release(&(list->lock)); - return ret; -} - -void *dequeue_item(list_t *list) -{ - fastlock_acquire(&(list->lock)); - if(list->curr_len > 0){ - void *data; - list_element_t *element = _list_dequeue(list); - - list->curr_len--; - data = element->data; - _list_enqueue_free_list(element); - fastlock_release(&(list->lock)); - return data; - } - fastlock_release(&(list->lock)); - return NULL; -} - -void *peek_item(list_t *list) -{ - fastlock_acquire(&(list->lock)); - if(list->curr_len > 0){ - list_element_t *element = _list_dequeue(list); - fastlock_release(&(list->lock)); - return element->data; - } - fastlock_release(&(list->lock)); - return NULL; -} - -int 
delete_item(list_t *list, void *item) -{ - fastlock_acquire(&(list->lock)); - list_element_t *curr; - list_element_t *prev = NULL; - - for(curr = list->head; curr != NULL; curr = curr->next){ - if(curr->data == item) { - if(prev == NULL) { - list->head = curr->next; - } else { - prev->next = curr->next; - } - - if(list->tail == curr) - list->tail = NULL; - - _list_enqueue_free_list(curr); - list->curr_len--; - fastlock_release(&(list->lock)); - return 0; - } - prev = curr; - } - fastlock_release(&(list->lock)); - return -1; -} - -int find_item(list_t *list, void *item) -{ - fastlock_acquire(&(list->lock)); - list_element_t *curr = list->head; - - while(curr){ - if(curr->data == item){ - fastlock_release(&(list->lock)); - return 0; - } - curr=curr->next; - } - fastlock_release(&(list->lock)); - return -1; -} - -ssize_t list_length(list_t *list) -{ - ssize_t len; - fastlock_acquire(&(list->lock)); - len = list->curr_len; - fastlock_release(&(list->lock)); - return len; -} diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/list.h b/opal/mca/common/libfabric/libfabric/prov/sockets/src/list.h deleted file mode 100644 index 86187c7c38..0000000000 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/list.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef _LIST_H_ -#define _LIST_H_ - -#include "fi.h" - -typedef struct _list_t list_t; -typedef struct _list_element_t -{ - void *data; - size_t len; - list_t *list; - struct _list_element_t *next; -}list_element_t; - -struct _list_t -{ - list_element_t *head, *tail; - list_element_t *free_head, *free_tail; - size_t curr_len; - size_t max_len; - fastlock_t lock; -}; - -list_t *new_list(size_t length); -void free_list(list_t *list); - -int enqueue_item(list_t *list, void *item); -void *peek_item(list_t *list); -void *dequeue_item(list_t *list); -int find_item(list_t *list, void *item); -int delete_item(list_t *list, void *item); -ssize_t list_length(list_t *list); - -#endif /* _LIST_H_ */ diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock.h b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock.h index def9e52857..994780f1c4 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock.h +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock.h @@ -45,47 +45,74 @@ #include #include #include +#include #include #include #include -#include "list.h" #include #include #ifndef _SOCK_H_ #define _SOCK_H_ -#define SOCK_EP_MAX_MSG_SZ (1<<22) -#define SOCK_EP_MAX_INJECT_SZ (1<<12) -#define SOCK_EP_MAX_BUFF_RECV (1<<22) +#define SOCK_EP_MAX_MSG_SZ (1<<23) +#define SOCK_EP_MAX_INJECT_SZ ((1<<8) - 1) +#define SOCK_EP_MAX_BUFF_RECV (1<<23) #define SOCK_EP_MAX_ORDER_RAW_SZ (0) #define SOCK_EP_MAX_ORDER_WAR_SZ (0) #define SOCK_EP_MAX_ORDER_WAW_SZ (0) #define SOCK_EP_MEM_TAG_FMT (0) -#define SOCK_EP_MSG_ORDER (0) #define SOCK_EP_MAX_EP_CNT (128) #define SOCK_EP_MAX_TX_CNT (16) #define SOCK_EP_MAX_RX_CNT (16) #define SOCK_EP_MAX_IOV_LIMIT (8) #define SOCK_EP_MAX_TX_CTX_SZ (1<<12) +#define SOCK_EP_MIN_MULTI_RECV (64) +#define SOCK_EP_MAX_ATOMIC_SZ (512) +#define SOCK_EP_MAX_CTX_BITS (16) #define SOCK_PE_POLL_TIMEOUT (100000) #define SOCK_PE_MAX_ENTRIES (128) -#define SOCK_EQ_DEF_SZ (1<<12) -#define SOCK_CQ_DEF_SZ (1<<12) +#define SOCK_EQ_DEF_SZ (1<<8) +#define SOCK_CQ_DEF_SZ (1<<8) -#define SOCK_EP_RDM_CAP (FI_MSG | FI_INJECT | FI_SOURCE | FI_SEND | FI_RECV) -#define SOCK_EP_DGRAM_CAP (FI_MSG | FI_INJECT | FI_SOURCE | FI_SEND | FI_RECV) -#define SOCK_OPS_CAP (FI_INJECT | FI_SEND | FI_RECV ) +#define SOCK_CQ_DATA_SIZE (sizeof(uint64_t)) +#define SOCK_TAG_SIZE (sizeof(uint64_t)) + + +#define SOCK_EP_RDM_CAP (FI_MSG | FI_RMA | FI_TAGGED | FI_ATOMICS | FI_DYNAMIC_MR | \ + FI_NAMED_RX_CTX | FI_BUFFERED_RECV | FI_DIRECTED_RECV | \ + FI_INJECT | FI_MULTI_RECV | FI_SOURCE | FI_READ | FI_WRITE | \ + FI_RECV | FI_SEND | FI_REMOTE_READ | FI_REMOTE_WRITE | \ + FI_REMOTE_CQ_DATA | FI_COMPLETION | FI_REMOTE_SIGNAL | \ + FI_REMOTE_COMPLETE | FI_PEEK | FI_CANCEL) + +#define SOCK_EP_MSG_CAP SOCK_EP_RDM_CAP + +#define SOCK_EP_DGRAM_CAP (FI_MSG | FI_TAGGED | FI_DYNAMIC_MR | \ + FI_NAMED_RX_CTX | FI_BUFFERED_RECV | FI_DIRECTED_RECV | \ + FI_INJECT | FI_MULTI_RECV | FI_SOURCE | FI_RECV | FI_SEND | \ + FI_REMOTE_CQ_DATA | FI_COMPLETION | FI_REMOTE_SIGNAL | \ + FI_REMOTE_COMPLETE | FI_PEEK | FI_CANCEL) + +#define SOCK_DEF_OPS (FI_SEND | FI_RECV | \ + FI_BUFFERED_RECV | FI_READ | FI_WRITE | \ + FI_REMOTE_READ | FI_REMOTE_WRITE) + +#define SOCK_EP_MSG_ORDER (FI_ORDER_RAR | FI_ORDER_RAW | FI_ORDER_RAS| \ + FI_ORDER_WAR | FI_ORDER_WAW | FI_ORDER_WAS | \ + FI_ORDER_SAR | FI_ORDER_SAW | FI_ORDER_SAS) + +#define SOCK_MODE (0) + +#define SOCK_COMM_BUF_SZ (SOCK_EP_MAX_MSG_SZ) +#define SOCK_COMM_THRESHOLD (128 * 1024) #define SOCK_MAJOR_VERSION 1 #define SOCK_MINOR_VERSION 0 -extern const 
char const sock_fab_name[]; -extern const char const sock_dom_name[]; - struct sock_fabric{ struct fid_fabric fab_fid; atomic_t ref; @@ -93,6 +120,18 @@ struct sock_fabric{ struct sock_conn { int sock_fd; + struct sockaddr addr; + struct sock_pe_entry *rx_pe_entry; + struct sock_pe_entry *tx_pe_entry; + struct ringbuf inbuf; + struct ringbuf outbuf; +}; + +struct sock_conn_map { + struct sock_conn *table; + int used; + int size; + struct sock_domain *domain; }; struct sock_domain { @@ -101,136 +140,140 @@ struct sock_domain { struct sock_fabric *fab; fastlock_t lock; atomic_t ref; - + struct sock_eq *eq; struct sock_eq *mr_eq; - struct sock_pe *pe; + enum fi_progress progress_mode; struct index_map mr_idm; + struct sock_pe *pe; + struct sock_conn_map u_cmap; + struct sock_conn_map r_cmap; + pthread_t listen_thread; + int listening; + char service[NI_MAXSERV]; }; struct sock_cntr { - struct fid_cntr cntr_fid; - struct sock_domain *dom; - uint64_t value; - uint64_t threshold; - atomic_t ref; + struct fid_cntr cntr_fid; + struct sock_domain *domain; + atomic_t value; + atomic_t threshold; + atomic_t ref; atomic_t err_cnt; - pthread_cond_t cond; - pthread_mutex_t mut; + pthread_cond_t cond; + pthread_mutex_t mut; + struct fi_cntr_attr attr; + + struct dlist_entry rx_list; + struct dlist_entry tx_list; + + struct fid_wait *waitset; + int signal; }; struct sock_mr { - struct fid_mr mr_fid; - struct sock_domain *dom; - uint64_t access; - uint64_t offset; - uint64_t key; - size_t iov_count; - struct iovec mr_iov[1]; + struct fid_mr mr_fid; + struct sock_domain *domain; + uint64_t access; + uint64_t offset; + uint64_t key; + uint64_t flags; + size_t iov_count; + struct iovec mr_iov[1]; + + struct sock_cntr *cntr; + struct sock_cq *cq; +}; + +struct sock_av_addr { + uint16_t key; + struct sockaddr_storage addr; }; struct sock_av { - struct fid_av av_fid; - struct sock_domain *dom; - atomic_t ref; - struct fi_av_attr attr; - size_t count; - struct sockaddr_in *table; + struct fid_av av_fid; + struct sock_domain *domain; + atomic_t ref; + struct fi_av_attr attr; + uint64_t mask; + int rx_ctx_bits; + size_t stored; + struct index_map addr_idm; + socklen_t addrlen; + struct sock_conn_map *cmap; +}; + +struct sock_fid_list { + struct dlist_entry entry; + struct fid *fid; }; struct sock_poll { - struct fid_poll poll_fid; - struct sock_domain *dom; + struct fid_poll poll_fid; + struct sock_domain *domain; + struct dlist_entry fid_list; }; struct sock_wait { struct fid_wait wait_fid; - struct sock_domain *dom; + struct sock_domain *domain; + struct dlist_entry fid_list; + enum fi_wait_obj type; + union { + int fd[2]; + struct { + pthread_mutex_t mutex; + pthread_cond_t cond; + }; + }; }; enum { - SOCK_REQ_TYPE_SEND, - SOCK_REQ_TYPE_RECV, - SOCK_REQ_TYPE_USER, -}; + /* wire protocol */ + SOCK_OP_SEND = 0, + SOCK_OP_TSEND = 1, + SOCK_OP_SEND_COMPLETE = 2, -enum{ - SOCK_COMM_TYPE_SEND, - SOCK_COMM_TYPE_SENDV, - SOCK_COMM_TYPE_SENDTO, - SOCK_COMM_TYPE_SENDMSG, - SOCK_COMM_TYPE_SENDDATA, - SOCK_COMM_TYPE_SENDDATATO, -}; + SOCK_OP_WRITE = 3, + SOCK_OP_WRITE_COMPLETE = 4, + SOCK_OP_WRITE_ERROR = 5, -struct sock_req_item{ - int req_type; - int comm_type; - struct sock_ep *ep; + SOCK_OP_READ = 6, + SOCK_OP_READ_COMPLETE = 7, + SOCK_OP_READ_ERROR = 8, - void *context; - uint64_t flags; - uint64_t tag; - uint64_t data; + SOCK_OP_ATOMIC_WRITE = 9, + SOCK_OP_ATOMIC_READ_WRITE = 10, + SOCK_OP_ATOMIC_COMP_WRITE = 11, - size_t done_len; - size_t total_len; - struct sockaddr src_addr; - struct sockaddr addr; + 
SOCK_OP_ATOMIC_COMPLETE = 12, + SOCK_OP_ATOMIC_ERROR = 13, - union{ - struct fi_msg msg; - void *buf; - }item; - -}; - -struct sock_comm_item{ - int type; - int is_done; - void *context; - size_t done_len; - size_t total_len; - uint64_t flags; - - struct sockaddr addr; - - union{ - struct fi_msg msg; - void *buf; - }item; -}; - -enum { - SOCK_OP_SEND, + /* internal */ SOCK_OP_RECV, - SOCK_OP_WRITE, - SOCK_OP_READ, - SOCK_OP_TSEND, SOCK_OP_TRECV, - SOCK_OP_ATOMIC, - SOCK_OP_SEND_INJECT, - SOCK_OP_TSEND_INJECT, }; /* * Transmit context - ring buffer data: - * tx_op + flags + context + dest_addr + [data] + [tag] + tx_iov + * tx_op + flags + context + dest_addr + conn + [data] + [tag] + tx_iov * 8B 8B 8B 8B 8B 8B 24B+ * data - only present if flags indicate * tag - only present for TSEND op */ struct sock_op { - uint8_t op; - uint8_t src_iov_len; - uint8_t dest_iov_len; + uint8_t op; + uint8_t src_iov_len; + uint8_t dest_iov_len; union { struct { uint8_t op; uint8_t datatype; + uint8_t res_iov_len; + uint8_t cmp_iov_len; } atomic; - uint8_t reserved[5]; + uint8_t reserved[5]; }; }; @@ -240,6 +283,8 @@ struct sock_op_send { uint64_t context; uint64_t dest_addr; struct sock_conn *conn; + uint64_t buf; + struct sock_ep *ep; }; struct sock_op_tsend { @@ -249,17 +294,13 @@ struct sock_op_tsend { uint64_t dest_addr; struct sock_conn *conn; uint64_t tag; + uint64_t buf; + struct sock_ep *ep; }; union sock_iov { - struct fi_rma_iov iov; - struct fi_rma_ioc ioc; -}; - -struct sock_rxtx { - struct ringbuffd rbfd; - fastlock_t wlock; - fastlock_t rlock; + struct fi_rma_iov iov; + struct fi_rma_ioc ioc; }; struct sock_eq_entry{ @@ -278,27 +319,19 @@ struct sock_eq{ struct dlistfd_head list; struct dlistfd_head err_list; fastlock_t lock; + + struct fid_wait *waitset; + int signal; }; -struct sock_ep { - struct fid_ep ep; - - uint8_t enabled; - uint8_t connected; - +struct sock_comp { uint8_t send_cq_event; uint8_t recv_cq_event; uint8_t read_cq_event; uint8_t write_cq_event; uint8_t rem_read_cq_event; uint8_t rem_write_cq_event; - - int sock_fd; - atomic_t ref; - - struct sock_eq *eq; - struct sock_av *av; - struct sock_domain *domain; + char reserved[2]; struct sock_cq *send_cq; struct sock_cq *recv_cq; @@ -314,6 +347,29 @@ struct sock_ep { struct sock_cntr *rem_read_cntr; struct sock_cntr *rem_write_cntr; + struct sock_eq *eq; +}; + +struct sock_ep { + union{ + struct fid_ep ep; + struct fid_sep sep; + struct fid_pep pep; + }; + size_t fclass; + uint64_t op_flags; + + uint16_t buffered_len; + uint16_t min_multi_recv; + char reserved[4]; + + atomic_t ref; + struct sock_comp comp; + + struct sock_eq *eq; + struct sock_av *av; + struct sock_domain *domain; + struct sock_rx_ctx *rx_ctx; struct sock_tx_ctx *tx_ctx; @@ -333,35 +389,17 @@ struct sock_ep { enum fi_ep_type ep_type; struct sockaddr_in *src_addr; struct sockaddr_in *dest_addr; - - /* TODO: remove */ - struct sock_ep *next; - struct sock_ep *prev; - struct sock_ep *alias; - struct sock_ep *base; - - list_t *send_list; - list_t *recv_list; - int port_num; -}; - -struct sock_pep { - struct fid_pep pep; - struct sock_domain *dom; - - int sock_fd; - - struct sock_eq *eq; - - struct sock_cq *send_cq; - struct sock_cq *recv_cq; - - uint64_t op_flags; - uint64_t pep_cap; }; struct sock_rx_entry { struct sock_op rx_op; + uint8_t is_buffered; + uint8_t is_busy; + uint8_t is_claimed; + uint8_t reserved[5]; + + uint64_t used; + uint64_t total_len; uint64_t flags; uint64_t context; @@ -369,6 +407,7 @@ struct sock_rx_entry { uint64_t data; uint64_t tag; uint64_t 
ignore; + struct sock_comp *comp; union sock_iov iov[SOCK_EP_MAX_IOV_LIMIT]; struct dlist_entry entry; @@ -384,26 +423,25 @@ struct sock_rx_ctx { uint8_t recv_cq_event; uint8_t rem_read_cq_event; uint8_t rem_write_cq_event; - uint8_t reserved[1]; + uint16_t buffered_len; + uint16_t min_multi_recv; + uint8_t reserved[7]; uint64_t addr; - - struct sock_cq *recv_cq; - struct sock_cq *rem_read_cq; - struct sock_cq *rem_write_cq; + struct sock_comp comp; struct sock_ep *ep; + struct sock_av *av; + struct sock_eq *eq; struct sock_domain *domain; - struct sock_cntr *recv_cntr; - struct sock_cntr *rem_read_cntr; - struct sock_cntr *rem_write_cntr; - - struct dlist_entry cq_entry; struct dlist_entry pe_entry; + struct dlist_entry cq_entry; + struct dlist_entry cntr_entry; struct dlist_entry pe_entry_list; struct dlist_entry rx_entry_list; + struct dlist_entry rx_buffered_list; struct dlist_entry ep_list; fastlock_t lock; @@ -411,7 +449,11 @@ struct sock_rx_ctx { }; struct sock_tx_ctx { - struct fid_ep ctx; + union { + struct fid_ep ctx; + struct fid_stx stx; + }; + size_t fclass; struct ringbuffd rbfd; fastlock_t wlock; @@ -421,26 +463,17 @@ struct sock_tx_ctx { uint8_t enabled; uint8_t progress; - uint8_t send_cq_event; - uint8_t read_cq_event; - uint8_t write_cq_event; - uint8_t reserved[1]; - uint64_t addr; - - struct sock_cq *send_cq; - struct sock_cq *read_cq; - struct sock_cq *write_cq; + struct sock_comp comp; struct sock_ep *ep; + struct sock_av *av; + struct sock_eq *eq; struct sock_domain *domain; - struct sock_cntr *send_cntr; - struct sock_cntr *read_cntr; - struct sock_cntr *write_cntr; - - struct dlist_entry cq_entry; struct dlist_entry pe_entry; + struct dlist_entry cq_entry; + struct dlist_entry cntr_entry; struct dlist_entry pe_entry_list; struct dlist_entry ep_list; @@ -455,29 +488,84 @@ struct sock_msg_hdr{ uint8_t version; uint8_t op_type; uint16_t rx_id; - uint8_t reserved[4]; + uint16_t pe_entry_id; + uint8_t dest_iov_len; + uint8_t reserved[1]; - uint64_t src_addr; uint64_t flags; uint64_t msg_len; }; struct sock_msg_send{ struct sock_msg_hdr msg_hdr; - /* data */ /* user data */ + /* data */ +}; + +struct sock_msg_tsend{ + struct sock_msg_hdr msg_hdr; + uint64_t tag; + /* user data */ + /* data */ +}; + +struct sock_rma_write_req { + struct sock_msg_hdr msg_hdr; + /* user data */ + /* dst iov(s)*/ + /* data */ +}; + +struct sock_atomic_req { + struct sock_msg_hdr msg_hdr; + struct sock_op op; + + /* user data */ + /* dst ioc(s)*/ + /* cmp iov(s) */ + /* data */ +}; + +struct sock_msg_response { + struct sock_msg_hdr msg_hdr; + uint16_t pe_entry_id; + uint8_t reserved[6]; +}; + +struct sock_rma_read_req { + struct sock_msg_hdr msg_hdr; + /* src iov(s)*/ +}; + +struct sock_rma_read_response { + struct sock_msg_hdr msg_hdr; + uint16_t pe_entry_id; + uint8_t reserved[6]; + /* data */ +}; + +struct sock_atomic_response { + struct sock_msg_hdr msg_hdr; + uint16_t pe_entry_id; + uint8_t reserved[6]; + /* data */ }; struct sock_tx_iov { union sock_iov src; union sock_iov dst; + union sock_iov res; + union sock_iov cmp; }; struct sock_tx_pe_entry{ - struct sock_op tx_op; + struct sock_op tx_op; + struct sock_comp *comp; uint8_t header_sent; - uint8_t reserved[7]; + uint8_t send_done; + uint8_t reserved[6]; + struct sock_tx_ctx *tx_ctx; union { struct sock_tx_iov tx_iov[SOCK_EP_MAX_IOV_LIMIT]; char inject_data[SOCK_EP_MAX_INJECT_SZ]; @@ -486,8 +574,16 @@ struct sock_tx_pe_entry{ struct sock_rx_pe_entry{ struct sock_op rx_op; - void *raw_data; + + struct sock_comp *comp; + uint8_t 
header_read; + uint8_t pending_send; + uint8_t reserved[6]; + struct sock_rx_entry *rx_entry; + struct sock_msg_response response; union sock_iov rx_iov[SOCK_EP_MAX_IOV_LIMIT]; + char atomic_cmp[SOCK_EP_MAX_ATOMIC_SZ]; + char atomic_src[SOCK_EP_MAX_ATOMIC_SZ]; }; /* PE entry type */ @@ -509,18 +605,43 @@ struct sock_pe_entry{ uint64_t addr; uint64_t data; uint64_t tag; + uint64_t buf; uint8_t type; - uint8_t reserved[7]; + uint8_t is_complete; + uint8_t reserved[6]; uint64_t done_len; + uint64_t total_len; + uint64_t data_len; struct sock_ep *ep; - struct sock_cq *cq; + struct sock_conn *conn; + struct sock_comp *comp; + struct dlist_entry entry; + struct dlist_entry ctx_entry; +}; + +struct sock_pe{ + struct sock_domain *domain; + + struct sock_pe_entry pe_table[SOCK_PE_MAX_ENTRIES]; + fastlock_t lock; + + struct dlist_entry free_list; + struct dlist_entry busy_list; + + struct dlistfd_head tx_list; + struct dlistfd_head rx_list; + + pthread_t progress_thread; + volatile int do_progress; + struct sock_pe_entry *pe_atomic; }; typedef int (*sock_cq_report_fn) (struct sock_cq *cq, fi_addr_t addr, struct sock_pe_entry *pe_entry); + struct sock_cq { struct fid_cq cq_fid; struct sock_domain *domain; @@ -533,6 +654,9 @@ struct sock_cq { struct ringbuf cqerr_rb; fastlock_t lock; + struct fid_wait *waitset; + int signal; + struct dlist_entry ep_list; struct dlist_entry rx_list; struct dlist_entry tx_list; @@ -540,92 +664,149 @@ struct sock_cq { sock_cq_report_fn report_completion; }; + int sock_verify_info(struct fi_info *hints); int sock_verify_fabric_attr(struct fi_fabric_attr *attr); int sock_verify_domain_attr(struct fi_domain_attr *attr); -int sock_rdm_verify_ep_attr(struct fi_ep_attr *ep_attr, - struct fi_tx_attr *tx_attr, - struct fi_rx_attr *rx_attr); + +int sock_rdm_verify_ep_attr(struct fi_ep_attr *ep_attr, struct fi_tx_attr *tx_attr, + struct fi_rx_attr *rx_attr); +int sock_dgram_verify_ep_attr(struct fi_ep_attr *ep_attr, struct fi_tx_attr *tx_attr, + struct fi_rx_attr *rx_attr); +int sock_msg_verify_ep_attr(struct fi_ep_attr *ep_attr, struct fi_tx_attr *tx_attr, + struct fi_rx_attr *rx_attr); +struct fi_info *sock_fi_info(enum fi_ep_type ep_type, + struct fi_info *hints, void *src_addr, void *dest_addr); int sock_rdm_getinfo(uint32_t version, const char *node, const char *service, - uint64_t flags, struct fi_info *hints, struct fi_info **info); - + uint64_t flags, struct fi_info *hints, struct fi_info **info); int sock_dgram_getinfo(uint32_t version, const char *node, const char *service, - uint64_t flags, struct fi_info *hints, struct fi_info **info); + uint64_t flags, struct fi_info *hints, struct fi_info **info); +int sock_msg_getinfo(uint32_t version, const char *node, const char *service, + uint64_t flags, struct fi_info *hints, struct fi_info **info); +void free_fi_info(struct fi_info *info); int sock_domain(struct fid_fabric *fabric, struct fi_info *info, struct fid_domain **dom, void *context); -int sock_av_open(struct fid_domain *domain, struct fi_av_attr *attr, - struct fid_av **av, void *context); -fi_addr_t _sock_av_lookup(struct sock_av *av, struct sockaddr *addr); -int sock_av_lookup_addr(struct sock_av *av, fi_addr_t addr, - struct sock_conn **entry); +int sock_alloc_endpoint(struct fid_domain *domain, struct fi_info *info, + struct sock_ep **ep, void *context, size_t fclass); +int sock_rdm_ep(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context); +int sock_rdm_sep(struct fid_domain *domain, struct fi_info *info, + struct fid_sep **sep, 
void *context); + +int sock_dgram_ep(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context); +int sock_dgram_sep(struct fid_domain *domain, struct fi_info *info, + struct fid_sep **sep, void *context); + +int sock_msg_ep(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context); +int sock_msg_sep(struct fid_domain *domain, struct fi_info *info, + struct fid_sep **sep, void *context); +int sock_msg_passive_ep(struct fid_fabric *fabric, struct fi_info *info, + struct fid_pep **pep, void *context); + + +int sock_stx_ctx(struct fid_domain *domain, + struct fi_tx_attr *attr, struct fid_stx **stx, void *context); +int sock_srx_ctx(struct fid_domain *domain, + struct fi_rx_attr *attr, struct fid_ep **srx, void *context); int sock_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, struct fid_cq **cq, void *context); -int _sock_cq_report_completion(struct sock_cq *sock_cq, struct sock_req_item *item); -int _sock_cq_report_error(struct sock_cq *sock_cq, struct fi_cq_err_entry *error); +int sock_cq_report_error(struct sock_cq *cq, struct sock_pe_entry *entry, + size_t olen, int err, int prov_errno, void *err_data); +int sock_cq_progress(struct sock_cq *cq); + + +int sock_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, + struct fid_cntr **cntr, void *context); +int sock_cntr_inc(struct sock_cntr *cntr); +int sock_cntr_err_inc(struct sock_cntr *cntr); +int sock_cntr_progress(struct sock_cntr *cntr); int sock_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, - struct fid_eq **eq, void *context); + struct fid_eq **eq, void *context); ssize_t sock_eq_report_event(struct sock_eq *sock_eq, uint32_t event, const void *buf, size_t len, uint64_t flags); ssize_t sock_eq_report_error(struct sock_eq *sock_eq, fid_t fid, void *context, int err, int prov_errno, void *err_data); -int sock_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, - struct fid_cntr **cntr, void *context); +struct sock_mr *sock_mr_verify_key(struct sock_domain *domain, uint16_t key, + void *buf, size_t len, uint64_t access); +struct sock_mr *sock_mr_verify_desc(struct sock_domain *domain, void *desc, + void *buf, size_t len, uint64_t access); +struct sock_mr * sock_mr_get_entry(struct sock_domain *domain, uint16_t key); -int sock_rdm_ep(struct fid_domain *domain, struct fi_info *info, - struct fid_ep **ep, void *context); -int sock_dgram_ep(struct fid_domain *domain, struct fi_info *info, - struct fid_ep **ep, void *context); -int sock_passive_ep(struct fid_fabric *fabric, struct fi_info *info, - struct fid_pep **pep, void *context); - - -int sock_ep_connect(struct fid_ep *ep, const void *addr, - const void *param, size_t paramlen); - - -struct sock_rx_ctx *sock_rx_ctx_alloc(struct fi_rx_attr *attr, - void *context); -void sock_rx_ctx_add_ep(struct sock_rx_ctx *rx_ctx, struct sock_ep *ep); +struct sock_rx_ctx *sock_rx_ctx_alloc(struct fi_rx_attr *attr, void *context); void sock_rx_ctx_free(struct sock_rx_ctx *rx_ctx); - -struct sock_tx_ctx *sock_tx_ctx_alloc(struct fi_tx_attr *attr, - void *context); -void sock_tx_ctx_add_ep(struct sock_tx_ctx *tx_ctx, struct sock_ep *ep); +struct sock_tx_ctx *sock_tx_ctx_alloc(struct fi_tx_attr *attr, void *context); void sock_tx_ctx_free(struct sock_tx_ctx *tx_ctx); void sock_tx_ctx_start(struct sock_tx_ctx *tx_ctx); void sock_tx_ctx_write(struct sock_tx_ctx *tx_ctx, const void *buf, size_t len); void sock_tx_ctx_commit(struct sock_tx_ctx *tx_ctx); void sock_tx_ctx_abort(struct sock_tx_ctx *tx_ctx); 
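The sock_tx_ctx_start/sock_tx_ctx_write/sock_tx_ctx_commit/sock_tx_ctx_abort declarations just above describe a staging protocol for the transmit ring buffer: a transmit path reserves the ring, streams the fixed-size header words (tx_op + flags + context + dest_addr + conn, as documented earlier in sock.h) followed by the iov entries, then either commits the message as a unit or aborts so the progress engine never sees a partial entry. The condensed sketch below is hypothetical (stage_send is not a function in this patch); it only mirrors the shape of sock_ep_tx_atomic() further down in this diff, with field setup and most error handling trimmed.

    /* Hypothetical helper showing the tx staging pattern used by the
     * sockets provider; see sock_ep_tx_atomic() below for the real code. */
    static ssize_t stage_send(struct sock_tx_ctx *tx_ctx, struct sock_op *tx_op,
                              uint64_t flags, uint64_t context, uint64_t dest_addr,
                              struct sock_conn *conn, uint64_t total_len)
    {
        sock_tx_ctx_start(tx_ctx);          /* begin staging; the real implementation
                                               holds the tx write lock until
                                               commit/abort */

        if (rbfdavail(&tx_ctx->rbfd) < total_len) {
            sock_tx_ctx_abort(tx_ctx);      /* roll back, nothing becomes visible */
            return -FI_EAGAIN;
        }

        /* Header words go in the fixed order documented in sock.h:
         * tx_op + flags + context + dest_addr + conn + ... (8 bytes each). */
        sock_tx_ctx_write(tx_ctx, tx_op, sizeof(*tx_op));
        sock_tx_ctx_write(tx_ctx, &flags, sizeof(uint64_t));
        sock_tx_ctx_write(tx_ctx, &context, sizeof(uint64_t));
        sock_tx_ctx_write(tx_ctx, &dest_addr, sizeof(uint64_t));
        sock_tx_ctx_write(tx_ctx, &conn, sizeof(uint64_t));
        /* ...optional data/tag words and the iov entries would follow here... */

        sock_tx_ctx_commit(tx_ctx);         /* make the entry visible to the PE */
        return 0;
    }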
-int sock_tx_ctx_read(struct sock_tx_ctx *tx_ctx, void *buf, size_t len); int sock_poll_open(struct fid_domain *domain, struct fi_poll_attr *attr, - struct fid_poll **pollset); + struct fid_poll **pollset); int sock_wait_open(struct fid_domain *domain, struct fi_wait_attr *attr, - struct fid_wait **waitset); + struct fid_wait **waitset); +int sock_wait_open(struct fid_domain *domain, struct fi_wait_attr *attr, + struct fid_wait **waitset); +void sock_wait_signal(struct fid_wait *wait_fid); +int sock_wait_get_obj(struct fid_wait *fid, void *arg); +int sock_wait_close(fid_t fid); + + +int sock_av_open(struct fid_domain *domain, struct fi_av_attr *attr, + struct fid_av **av, void *context); +fi_addr_t _sock_av_lookup(struct sock_av *av, struct sockaddr *addr); +fi_addr_t sock_av_get_fiaddr(struct sock_av *av, struct sock_conn *conn); +fi_addr_t sock_av_lookup_key(struct sock_av *av, int key); +struct sock_conn *sock_av_lookup_addr(struct sock_av *av, fi_addr_t addr); + + +struct sock_conn *sock_conn_map_lookup_key(struct sock_conn_map *conn_map, + uint16_t key); +uint16_t sock_conn_map_match_or_connect(struct sock_conn_map *map, + struct sockaddr_in *addr, int match_only); +int sock_conn_listen(struct sock_domain *domain); +int sock_conn_map_clear_pe_entry(struct sock_conn *conn_entry, uint16_t key); +void sock_conn_map_destroy(struct sock_conn_map *cmap); + struct sock_pe *sock_pe_init(struct sock_domain *domain); -int sock_pe_add_tx_ctx(struct sock_pe *pe, struct sock_tx_ctx *ctx); -int sock_pe_add_rx_ctx(struct sock_pe *pe, struct sock_rx_ctx *ctx); +void sock_pe_add_tx_ctx(struct sock_pe *pe, struct sock_tx_ctx *ctx); +void sock_pe_add_rx_ctx(struct sock_pe *pe, struct sock_rx_ctx *ctx); int sock_pe_progress_rx_ctx(struct sock_pe *pe, struct sock_rx_ctx *rx_ctx); int sock_pe_progress_tx_ctx(struct sock_pe *pe, struct sock_tx_ctx *tx_ctx); void sock_pe_finalize(struct sock_pe *pe); -void free_fi_info(struct fi_info *info); +struct sock_rx_entry *sock_rx_new_entry(struct sock_rx_ctx *rx_ctx); +struct sock_rx_entry *sock_rx_new_buffered_entry(struct sock_rx_ctx *rx_ctx, + size_t len); +struct sock_rx_entry *sock_rx_get_entry(struct sock_rx_ctx *rx_ctx, + uint64_t addr, uint64_t tag); +size_t sock_rx_avail_len(struct sock_rx_entry *rx_entry); +void sock_rx_release_entry(struct sock_rx_entry *rx_entry); + + +int sock_comm_buffer_init(struct sock_conn *conn); +void sock_comm_buffer_finalize(struct sock_conn *conn); +ssize_t sock_comm_send(struct sock_conn *conn, const void *buf, size_t len); +ssize_t sock_comm_recv(struct sock_conn *conn, void *buf, size_t len); +ssize_t sock_comm_flush(struct sock_conn *conn); #endif diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_atomic.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_atomic.c new file mode 100644 index 0000000000..1f47f47884 --- /dev/null +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_atomic.c @@ -0,0 +1,504 @@ +/* + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sock.h" +#include "sock_util.h" + + +static ssize_t sock_ep_tx_atomic(struct fid_ep *ep, + const struct fi_msg_atomic *msg, + const struct fi_ioc *comparev, void **compare_desc, + size_t compare_count, struct fi_ioc *resultv, + void **result_desc, size_t result_count, + uint64_t flags, int type) +{ + int i, ret; + size_t datatype_sz; + struct sock_op tx_op; + union sock_iov tx_iov; + struct sock_conn *conn; + struct sock_tx_ctx *tx_ctx; + uint64_t total_len, src_len, dst_len; + struct sock_ep *sock_ep; + + switch (ep->fid.fclass) { + case FI_CLASS_EP: + sock_ep = container_of(ep, struct sock_ep, ep); + tx_ctx = sock_ep->tx_ctx; + break; + + case FI_CLASS_TX_CTX: + tx_ctx = container_of(ep, struct sock_tx_ctx, ctx); + sock_ep = tx_ctx->ep; + break; + + default: + SOCK_LOG_ERROR("Invalid EP type\n"); + return -FI_EINVAL; + } + + assert(tx_ctx->enabled && + msg->iov_count <= SOCK_EP_MAX_IOV_LIMIT && + msg->rma_iov_count <= SOCK_EP_MAX_IOV_LIMIT); + + conn = sock_av_lookup_addr(tx_ctx->av, msg->addr); + assert(conn); + + src_len = 0; + datatype_sz = fi_datatype_size(msg->datatype); + if (flags & FI_INJECT) { + for (i=0; i< msg->iov_count; i++) { + src_len += (msg->msg_iov[i].count * datatype_sz); + } + assert(src_len <= SOCK_EP_MAX_INJECT_SZ); + total_len = src_len; + } else { + total_len = msg->iov_count * sizeof(union sock_iov); + } + + total_len += (sizeof(struct sock_op_send) + + (msg->rma_iov_count * sizeof(union sock_iov)) + + (result_count * sizeof (union sock_iov))); + + sock_tx_ctx_start(tx_ctx); + if (rbfdavail(&tx_ctx->rbfd) < total_len) { + ret = -FI_EAGAIN; + goto err; + } + + flags |= tx_ctx->attr.op_flags; + memset(&tx_op, 0, sizeof(struct sock_op)); + tx_op.op = type; + tx_op.dest_iov_len = msg->rma_iov_count; + tx_op.atomic.op = msg->op; + tx_op.atomic.datatype = msg->datatype; + tx_op.atomic.res_iov_len = result_count; + tx_op.atomic.cmp_iov_len = compare_count; + + if (flags & FI_INJECT) + tx_op.src_iov_len = src_len; + else + tx_op.src_iov_len = msg->iov_count; + + sock_tx_ctx_write(tx_ctx, &tx_op, 
sizeof(struct sock_op)); + sock_tx_ctx_write(tx_ctx, &flags, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &msg->context, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &msg->addr, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &conn, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &msg->msg_iov[0].addr, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &sock_ep, sizeof(uint64_t)); + + if (flags & FI_REMOTE_CQ_DATA) { + sock_tx_ctx_write(tx_ctx, &msg->data, sizeof(uint64_t)); + } + + if (flags & FI_INJECT) { + for (i=0; i< msg->iov_count; i++) { + sock_tx_ctx_write(tx_ctx, msg->msg_iov[i].addr, + msg->msg_iov[i].count * datatype_sz); + src_len += (msg->msg_iov[i].count * datatype_sz); + } + } else { + for (i = 0; i< msg->iov_count; i++) { + tx_iov.ioc.addr = (uint64_t)msg->msg_iov[i].addr; + tx_iov.ioc.count = msg->msg_iov[i].count; + tx_iov.ioc.key = (uint64_t)msg->desc[i]; + sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(union sock_iov)); + src_len += (tx_iov.ioc.count * datatype_sz); + } + } + assert(src_len <= SOCK_EP_MAX_ATOMIC_SZ); + + dst_len = 0; + for (i = 0; i< msg->rma_iov_count; i++) { + tx_iov.ioc.addr = msg->rma_iov[i].addr; + tx_iov.ioc.key = msg->rma_iov[i].key; + tx_iov.ioc.count = msg->rma_iov[i].count; + sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(union sock_iov)); + dst_len += (tx_iov.ioc.count * datatype_sz); + } + + if (dst_len != src_len) { + SOCK_LOG_ERROR("Buffer length mismatch\n"); + ret = -FI_EINVAL; + goto err; + } + + dst_len = 0; + for (i = 0; i< result_count; i++) { + tx_iov.ioc.addr = (uint64_t)resultv[i].addr; + tx_iov.ioc.count = resultv[i].count; + sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(union sock_iov)); + dst_len += (tx_iov.ioc.count * datatype_sz); + } + + if (result_count && (dst_len != src_len)) { + SOCK_LOG_ERROR("Buffer length mismatch\n"); + ret = -FI_EINVAL; + goto err; + } + + for (i = 0; i< compare_count; i++) { + tx_iov.ioc.addr = (uint64_t)comparev[i].addr; + tx_iov.ioc.count = comparev[i].count; + sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(union sock_iov)); + dst_len += (tx_iov.ioc.count * datatype_sz); + } + + if (compare_count && (dst_len != src_len)) { + SOCK_LOG_ERROR("Buffer length mismatch\n"); + ret = -FI_EINVAL; + goto err; + } + + sock_tx_ctx_commit(tx_ctx); + return 0; + +err: + sock_tx_ctx_abort(tx_ctx); + return ret; +} + + +static ssize_t sock_ep_atomic_writemsg(struct fid_ep *ep, + const struct fi_msg_atomic *msg, uint64_t flags) +{ + return sock_ep_tx_atomic(ep, msg, NULL, NULL, 0, + NULL, NULL, 0, flags, SOCK_OP_ATOMIC_WRITE); +} + +static ssize_t sock_ep_atomic_write(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + fi_addr_t dest_addr, uint64_t addr, + uint64_t key, enum fi_datatype datatype, + enum fi_op op, void *context) +{ + struct fi_msg_atomic msg; + struct fi_ioc msg_iov; + struct fi_rma_ioc rma_iov; + + msg_iov.addr = (void *)buf; + msg_iov.count = count; + msg.msg_iov = &msg_iov; + msg.desc = &desc; + msg.iov_count = 1; + msg.addr = dest_addr; + + rma_iov.addr = addr; + rma_iov.key = key; + rma_iov.count = count; + msg.rma_iov = &rma_iov; + msg.rma_iov_count = 1; + + msg.datatype = datatype; + msg.op = op; + msg.context = context; + msg.data = 0; + + return sock_ep_atomic_writemsg(ep, &msg, 0); +} + +static ssize_t sock_ep_atomic_writev(struct fid_ep *ep, + const struct fi_ioc *iov, void **desc, size_t count, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + struct fi_msg_atomic msg; + struct fi_rma_ioc rma_iov; + + msg.msg_iov = iov; + 
msg.desc = desc; + msg.iov_count = count; + msg.addr = dest_addr; + + rma_iov.addr = addr; + rma_iov.key = key; + rma_iov.count = count; + msg.rma_iov = &rma_iov; + msg.rma_iov_count = 1; + + msg.datatype = datatype; + msg.op = op; + msg.context = context; + msg.data = 0; + + return sock_ep_atomic_writemsg(ep, &msg, 0); +} + +static ssize_t sock_ep_atomic_inject(struct fid_ep *ep, const void *buf, size_t count, + fi_addr_t dest_addr, uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op) +{ + struct fi_msg_atomic msg; + struct fi_ioc msg_iov; + struct fi_rma_ioc rma_iov; + + msg_iov.addr = (void *)buf; + msg_iov.count = count; + msg.msg_iov = &msg_iov; + msg.iov_count = 1; + msg.addr = dest_addr; + + rma_iov.addr = addr; + rma_iov.key = key; + rma_iov.count = count; + msg.rma_iov = &rma_iov; + msg.rma_iov_count = 1; + + msg.datatype = datatype; + msg.op = op; + msg.data = 0; + + return sock_ep_atomic_writemsg(ep, &msg, FI_INJECT); +} + +static ssize_t sock_ep_atomic_readwritemsg(struct fid_ep *ep, + const struct fi_msg_atomic *msg, + struct fi_ioc *resultv, void **result_desc, + size_t result_count, uint64_t flags) +{ + return sock_ep_tx_atomic(ep, msg, NULL, NULL, 0, + resultv, result_desc, result_count, flags, + SOCK_OP_ATOMIC_READ_WRITE); +} + +static ssize_t sock_ep_atomic_readwrite(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + void *result, void *result_desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + struct fi_msg_atomic msg; + struct fi_ioc msg_iov; + struct fi_rma_ioc rma_iov; + struct fi_ioc resultv; + + msg_iov.addr = (void *)buf; + msg_iov.count = count; + msg.msg_iov = &msg_iov; + + msg.desc = &desc; + msg.iov_count = 1; + msg.addr = dest_addr; + + rma_iov.addr = addr; + rma_iov.count = 1; + rma_iov.key = key; + msg.rma_iov = &rma_iov; + msg.rma_iov_count = 1; + msg.datatype = datatype; + msg.op = op; + msg.context = context; + + resultv.addr = result; + resultv.count = 1; + + return sock_ep_atomic_readwritemsg(ep, &msg, + &resultv, &result_desc, 1, 0); +} + +static ssize_t sock_ep_atomic_readwritev(struct fid_ep *ep, + const struct fi_ioc *iov, void **desc, size_t count, + struct fi_ioc *resultv, void **result_desc, size_t result_count, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + struct fi_msg_atomic msg; + struct fi_rma_ioc rma_iov; + + msg.msg_iov = iov; + msg.desc = desc; + msg.iov_count = count; + msg.addr = dest_addr; + + rma_iov.addr = addr; + rma_iov.count = 1; + rma_iov.key = key; + msg.rma_iov = &rma_iov; + msg.rma_iov_count = 1; + msg.datatype = datatype; + msg.op = op; + msg.context = context; + + return sock_ep_atomic_readwritemsg(ep, &msg, + resultv, result_desc, result_count, 0); +} + +static ssize_t sock_ep_atomic_compwritemsg(struct fid_ep *ep, + const struct fi_msg_atomic *msg, + const struct fi_ioc *comparev, void **compare_desc, size_t compare_count, + struct fi_ioc *resultv, void **result_desc, size_t result_count, + uint64_t flags) +{ + return sock_ep_tx_atomic(ep, msg, comparev, compare_desc, compare_count, + resultv, result_desc, result_count, flags, + SOCK_OP_ATOMIC_COMP_WRITE); +} + +static ssize_t sock_ep_atomic_compwrite(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + const void *compare, void *compare_desc, + void *result, void *result_desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void 
*context) +{ + struct fi_msg_atomic msg; + struct fi_ioc msg_iov; + struct fi_rma_ioc rma_iov; + struct fi_ioc resultv; + struct fi_ioc comparev; + + msg_iov.addr = (void *)buf; + msg_iov.count = count; + msg.msg_iov = &msg_iov; + + msg.desc = &desc; + msg.iov_count = 1; + msg.addr = dest_addr; + + rma_iov.addr = addr; + rma_iov.count = 1; + rma_iov.key = key; + msg.rma_iov = &rma_iov; + msg.rma_iov_count = 1; + msg.datatype = datatype; + msg.op = op; + msg.context = context; + + resultv.addr = result; + resultv.count = 1; + comparev.addr = (void*)compare; + comparev.count = 1; + + return sock_ep_atomic_compwritemsg(ep, &msg, &comparev, &compare_desc, 1, + &resultv, &result_desc, 1, 0); +} + +static ssize_t sock_ep_atomic_compwritev(struct fid_ep *ep, + const struct fi_ioc *iov, void **desc, size_t count, + const struct fi_ioc *comparev, void **compare_desc, size_t compare_count, + struct fi_ioc *resultv, void **result_desc, size_t result_count, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + struct fi_msg_atomic msg; + struct fi_rma_ioc rma_iov; + + msg.msg_iov = iov; + msg.desc = desc; + msg.iov_count = count; + msg.addr = dest_addr; + + rma_iov.addr = addr; + rma_iov.count = 1; + rma_iov.key = key; + msg.rma_iov = &rma_iov; + msg.rma_iov_count = 1; + msg.datatype = datatype; + msg.op = op; + msg.context = context; + + return sock_ep_atomic_compwritemsg(ep, &msg, comparev, compare_desc, 1, + resultv, result_desc, 1, 0); +} + +static int sock_ep_atomic_valid(struct fid_ep *ep, enum fi_datatype datatype, + enum fi_op op, size_t *count) +{ + size_t datatype_sz; + + switch(datatype){ + case FI_FLOAT: + case FI_DOUBLE: + if (op == FI_BOR || op == FI_BAND || + op == FI_BXOR || op == FI_MSWAP) + return -FI_ENOENT; + break; + + case FI_FLOAT_COMPLEX: + case FI_DOUBLE_COMPLEX: + case FI_LONG_DOUBLE: + case FI_LONG_DOUBLE_COMPLEX: + return -FI_ENOENT; + default: + break; + } + + datatype_sz = fi_datatype_size(datatype); + *count = (SOCK_EP_MAX_ATOMIC_SZ/datatype_sz); + return 0; +} + +struct fi_ops_atomic sock_ep_atomic = { + .size = sizeof(struct fi_ops_atomic), + .write = sock_ep_atomic_write, + .writev = sock_ep_atomic_writev, + .writemsg = sock_ep_atomic_writemsg, + .inject = sock_ep_atomic_inject, + .readwrite = sock_ep_atomic_readwrite, + .readwritev = sock_ep_atomic_readwritev, + .readwritemsg = sock_ep_atomic_readwritemsg, + .compwrite = sock_ep_atomic_compwrite, + .compwritev = sock_ep_atomic_compwritev, + .compwritemsg = sock_ep_atomic_compwritemsg, + .writevalid = sock_ep_atomic_valid, + .readwritevalid = sock_ep_atomic_valid, + .compwritevalid = sock_ep_atomic_valid, +}; diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_av.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_av.c index ffce4659f2..497202ff5c 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_av.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_av.c @@ -41,26 +41,111 @@ #include #include #include +#include #include "sock.h" +#include "sock_util.h" -static int sock_at_insert(struct fid_av *av, const void *addr, size_t count, - fi_addr_t *fi_addr, uint64_t flags, void *context) +fi_addr_t sock_av_lookup_key(struct sock_av *av, int key) { int i; + struct sock_av_addr *av_addr; + + for (i = 0; i < IDX_MAX_INDEX; i++) { + av_addr = idm_lookup(&av->addr_idm, i); + if (!av_addr) + continue; + + if (!av_addr->key) { + av_addr->key = sock_conn_map_match_or_connect( + av->cmap, + (struct 
sockaddr_in*)&av_addr->addr, 1); + if (!av_addr->key) { + continue; + } + } + + if (av_addr->key == key + 1) { + return i; + } + } + + SOCK_LOG_INFO("Reverse-lookup failed: %d\n", key); + return FI_ADDR_NOTAVAIL; +} + +struct sock_conn *sock_av_lookup_addr(struct sock_av *av, + fi_addr_t addr) +{ + int index = ((uint64_t)addr & av->mask); + struct sock_av_addr *av_addr; + + if (index >= av->stored || index < 0) { + SOCK_LOG_ERROR("requested rank is larger than av table\n"); + errno = EINVAL; + return NULL; + } + + if (!av->cmap) { + SOCK_LOG_ERROR("EP with no AV bound\n"); + errno = EINVAL; + return NULL; + } + + av_addr = idm_lookup(&av->addr_idm, index); + if (!av_addr->key) { + av_addr->key = sock_conn_map_match_or_connect(av->cmap, + (struct sockaddr_in*)&av_addr->addr, 0); + if (!av_addr->key) { + SOCK_LOG_ERROR("failed to match or connect to addr %lu\n", addr); + errno = EINVAL; + return NULL; + } + } + return sock_conn_map_lookup_key(av->cmap, av_addr->key); +} + +static int sock_check_table_in(struct sock_av *_av, struct sockaddr_in *addr, + fi_addr_t *fi_addr, int count) +{ + int i, ret; + struct sock_av_addr *av_addr; + av_addr = calloc(count, sizeof(struct sock_av_addr)); + if (!av_addr) + return -ENOMEM; + + for (i=0, ret = 0; i<count; i++) { + memcpy(&av_addr[i].addr, &addr[i], sizeof(struct sockaddr_in)); + if (idm_set(&_av->addr_idm, _av->stored, &av_addr[i]) < 0) { + if (fi_addr) + fi_addr[i] = FI_ADDR_NOTAVAIL; + continue; + } + + if (fi_addr) + fi_addr[i] = (fi_addr_t)_av->stored; + + _av->stored++; + ret++; + } + return ret; +} + +static int sock_av_insert(struct fid_av *av, const void *addr, size_t count, + fi_addr_t *fi_addr, uint64_t flags, void *context) +{ struct sock_av *_av; _av = container_of(av, struct sock_av, av_fid); - _av->table = calloc(count, sizeof(struct sockaddr_in)); - if (!_av->table) - return -ENOMEM; - for (i=0; i<count; i++) { - memcpy(&_av->table[i], &((struct sockaddr_in *)addr)[i], sizeof(struct sockaddr_in)); + switch(((struct sockaddr *)addr)->sa_family) { + case AF_INET: + return sock_check_table_in(_av, (struct sockaddr_in *)addr, + fi_addr, count); + default: + SOCK_LOG_ERROR("invalid address type inserted: only IPv4 supported\n"); + return -EINVAL; } - _av->count = count; - - return 0; } static int sock_at_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t count, @@ -72,15 +157,20 @@ static int sock_at_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t count, static int sock_at_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr, size_t *addrlen) { - int idx; - idx = (int)(int64_t)fi_addr; + int index; struct sock_av *_av; + struct sock_av_addr *av_addr; _av = container_of(av, struct sock_av, av_fid); - if (idx >= _av->count || idx < 0) + index = ((uint64_t)fi_addr & _av->mask); + if (index >= _av->stored || index < 0) { + SOCK_LOG_ERROR("requested address not inserted\n"); return -EINVAL; - memcpy(addr, &_av->table[idx], MIN(*addrlen, sizeof(struct sockaddr_in))); - *addrlen = sizeof(struct sockaddr_in); + } + + av_addr = idm_lookup(&_av->addr_idm, index); + addr = &av_addr->addr; + *addrlen = _av->addrlen; return 0; } @@ -90,26 +180,76 @@ static const char * sock_at_straddr(struct fid_av *av, const void *addr, return NULL; } -static int sock_am_insert(struct fid_av *av, const void *addr, size_t count, - fi_addr_t *fi_addr, uint64_t flags, void *context) +int sock_av_insertsvc(struct fid_av *av, const char *node, + const char *service, fi_addr_t *fi_addr, + uint64_t flags, void *context) { - const struct sockaddr_in *sin; - struct sockaddr_in *fin; - int i; + int ret; + struct addrinfo sock_hints; + struct addrinfo *result = NULL; + + if (!service) { +
SOCK_LOG_ERROR("Port not provided\n"); + return -FI_EINVAL; + } - if (flags) - return -FI_EBADFLAGS; - if (sizeof(void *) != sizeof(*sin)) - return -FI_ENOSYS; + memset(&sock_hints, 0, sizeof(struct addrinfo)); + sock_hints.ai_family = AF_INET; + sock_hints.ai_socktype = SOCK_STREAM; + + ret = getaddrinfo(node, service, &sock_hints, &result); + if (ret) + return -ret; - sin = addr; - fin = (struct sockaddr_in *) fi_addr; - for (i = 0; i < count; i++) - memcpy(&fin[i], &sin[i], sizeof(*sin)); - - return 0; + ret = sock_av_insert(av, result->ai_addr, 1, fi_addr, flags, context); + freeaddrinfo(result); + return ret; } +int sock_av_insertsym(struct fid_av *av, const char *node, size_t nodecnt, + const char *service, size_t svccnt, fi_addr_t *fi_addr, + uint64_t flags, void *context) +{ + int ret = 0; + int var_port, var_host; + char base_host[FI_NAME_MAX] = {0}; + char tmp_host[FI_NAME_MAX] = {0}; + char tmp_port[FI_NAME_MAX] = {0}; + int hostlen, offset = 0, fmt, i, j; + + if (!node || !service) { + SOCK_LOG_ERROR("Node/service not provided\n"); + return -FI_EINVAL; + } + + hostlen = strlen(node); + while(isdigit(*(node + hostlen - (offset+1)))) + offset++; + + if (*(node + hostlen - offset) == '.') + fmt = 0; + else + fmt = offset; + + strncpy(base_host, node, hostlen - (offset)); + var_port = atoi(service); + var_host = atoi(node + hostlen - offset); + + for (i = 0; i < nodecnt; i++) { + for (j = 0; j < svccnt; j++) { + sprintf(tmp_host, "%s%0*d", base_host, fmt, var_host + i); + sprintf(tmp_port, "%d", var_port + j); + + if (sock_av_insertsvc(av, tmp_host, tmp_port, + &fi_addr[i * nodecnt + j], + flags, context) == 1) + ret++; + } + } + return ret; +} + + static int sock_am_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t count, uint64_t flags) { @@ -119,8 +259,7 @@ static int sock_am_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t count, static int sock_am_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr, size_t *addrlen) { - memcpy(addr, &fi_addr, MIN(*addrlen, sizeof(struct sockaddr_in))); - *addrlen = sizeof(struct sockaddr_in); + sock_at_lookup(av, fi_addr, addr, addrlen); return 0; } @@ -147,12 +286,20 @@ static int sock_av_bind(struct fid *fid, struct fid *bfid, uint64_t flags) static int sock_av_close(struct fid *fid) { struct sock_av *av; + void *addr; + int i; av = container_of(fid, struct sock_av, av_fid.fid); if (atomic_get(&av->ref)) return -FI_EBUSY; - atomic_dec(&av->dom->ref); + for (i=0; istored; i++) { + addr = idm_clear(&av->addr_idm , i); + if (addr) + free(addr); + } + + atomic_dec(&av->domain->ref); free(av); return 0; } @@ -167,7 +314,9 @@ static struct fi_ops sock_av_fi_ops = { static struct fi_ops_av sock_am_ops = { .size = sizeof(struct fi_ops_av), - .insert = sock_am_insert, + .insert = sock_av_insert, + .insertsvc = sock_av_insertsvc, + .insertsym = sock_av_insertsym, .remove = sock_am_remove, .lookup = sock_am_lookup, .straddr = sock_am_straddr @@ -175,7 +324,9 @@ static struct fi_ops_av sock_am_ops = { static struct fi_ops_av sock_at_ops = { .size = sizeof(struct fi_ops_av), - .insert = sock_at_insert, + .insert = sock_av_insert, + .insertsvc = sock_av_insertsvc, + .insertsym = sock_av_insertsym, .remove = sock_at_remove, .lookup = sock_at_lookup, .straddr = sock_at_straddr @@ -214,11 +365,15 @@ int sock_av_open(struct fid_domain *domain, struct fi_av_attr *attr, { struct sock_domain *dom; struct sock_av *_av; -// int ret; - if (attr->name || attr->flags) + if (attr->flags) return -FI_ENOSYS; + if (attr->rx_ctx_bits > SOCK_EP_MAX_CTX_BITS) { + 
SOCK_LOG_ERROR("Invalid rx_ctx_bits\n"); + return -EINVAL; + } + dom = container_of(domain, struct sock_domain, dom_fid); _av = calloc(1, sizeof(*_av)); @@ -238,43 +393,26 @@ int sock_av_open(struct fid_domain *domain, struct fi_av_attr *attr, _av->av_fid.ops = &sock_at_ops; break; default: - return -FI_ENOSYS; + goto err; } -#if 0 - if (ret) - return ret; -#endif atomic_init(&_av->ref, 0); atomic_inc(&dom->ref); - _av->dom = dom; + _av->domain = dom; + switch (dom->info.addr_format) { + case FI_SOCKADDR_IN: + _av->addrlen = sizeof(struct sockaddr_in); + break; + default: + SOCK_LOG_ERROR("Invalid address format: only IPv4 supported\n"); + goto err; + } + _av->rx_ctx_bits = attr->rx_ctx_bits; + _av->mask = ((uint64_t)1<<(64 - attr->rx_ctx_bits + 1))-1; _av->attr = *attr; *av = &_av->av_fid; return 0; -} - -/* TODO */ -fi_addr_t _sock_av_lookup(struct sock_av *av, struct sockaddr *addr) -{ - if (av->attr.type == FI_AV_MAP) { - return (fi_addr_t)addr; - } else { - int i; - struct sockaddr_in *addrin; - addrin = (struct sockaddr_in*)addr; - for (i = 0 ; i < av->count ; i++) { - if (av->table[i].sin_addr.s_addr == addrin->sin_addr.s_addr && - av->table[i].sin_port == addrin->sin_port) - return (fi_addr_t)i; - } - fprintf(stderr, "[sock] failed to lookup src_addr in av table\n"); - } - return FI_ADDR_NOTAVAIL; -} - -/* place holder */ -int sock_av_lookup_addr(struct sock_av *av, fi_addr_t addr, - struct sock_conn **entry) -{ - return -FI_ENOSYS; +err: + free(_av); + return -EINVAL; } diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_cntr.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_cntr.c index 1759c93752..9a4a028519 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_cntr.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_cntr.c @@ -37,26 +37,55 @@ #include #include #include - +#include #include #include #include "sock.h" +const struct fi_cntr_attr sock_cntr_attr = { + .events = FI_CNTR_EVENTS_COMP, + .wait_obj = FI_WAIT_MUTEX_COND, + .wait_set = NULL, + .flags = 0, +}; + +int sock_cntr_progress(struct sock_cntr *cntr) +{ + struct sock_tx_ctx *tx_ctx; + struct sock_rx_ctx *rx_ctx; + struct dlist_entry *entry; + + for (entry = cntr->tx_list.next; entry != &cntr->tx_list; + entry = entry->next) { + tx_ctx = container_of(entry, struct sock_tx_ctx, cntr_entry); + sock_pe_progress_tx_ctx(cntr->domain->pe, tx_ctx); + } + + for (entry = cntr->rx_list.next; entry != &cntr->rx_list; + entry = entry->next) { + rx_ctx = container_of(entry, struct sock_rx_ctx, cntr_entry); + sock_pe_progress_rx_ctx(cntr->domain->pe, rx_ctx); + } + return 0; +} + static uint64_t sock_cntr_read(struct fid_cntr *cntr) { struct sock_cntr *_cntr; _cntr = container_of(cntr, struct sock_cntr, cntr_fid); - return _cntr->value; + if (_cntr->domain->progress_mode == FI_PROGRESS_MANUAL) + sock_cntr_progress(_cntr); + return atomic_get(&_cntr->value); } int sock_cntr_inc(struct sock_cntr *cntr) { - pthread_mutex_lock(&cntr->mut); - cntr->value += 1; - if (cntr->value >= cntr->threshold) + fastlock_acquire(&cntr->mut); + atomic_inc(&cntr->value); + if (atomic_get(&cntr->value) >= atomic_get(&cntr->threshold)) pthread_cond_signal(&cntr->cond); - pthread_mutex_unlock(&cntr->mut); + fastlock_release(&cntr->mut); return 0; } @@ -72,11 +101,11 @@ static int sock_cntr_add(struct fid_cntr *cntr, uint64_t value) struct sock_cntr *_cntr; _cntr = container_of(cntr, struct sock_cntr, cntr_fid); - pthread_mutex_lock(&_cntr->mut); - _cntr->value += value; - if (_cntr->value >= 
_cntr->threshold) + fastlock_acquire(&_cntr->mut); + atomic_set(&_cntr->value, atomic_get(&_cntr->value) + value); + if (atomic_get(&_cntr->value) >= atomic_get(&_cntr->threshold)) pthread_cond_signal(&_cntr->cond); - pthread_mutex_unlock(&_cntr->mut); + fastlock_release(&_cntr->mut); return 0; } @@ -85,26 +114,88 @@ static int sock_cntr_set(struct fid_cntr *cntr, uint64_t value) struct sock_cntr *_cntr; _cntr = container_of(cntr, struct sock_cntr, cntr_fid); - pthread_mutex_lock(&_cntr->mut); - _cntr->value = value; - if (_cntr->value >= _cntr->threshold) + fastlock_acquire(&_cntr->mut); + atomic_set(&_cntr->value, value); + if (atomic_get(&_cntr->value) >= atomic_get(&_cntr->threshold)) pthread_cond_signal(&_cntr->cond); - pthread_mutex_unlock(&_cntr->mut); + fastlock_release(&_cntr->mut); return 0; } static int sock_cntr_wait(struct fid_cntr *cntr, uint64_t threshold, int timeout) { - struct sock_cntr *_cntr; int ret = 0; + struct timeval now; + double start_ms, end_ms; + struct sock_cntr *_cntr; _cntr = container_of(cntr, struct sock_cntr, cntr_fid); - pthread_mutex_lock(&_cntr->mut); - _cntr->threshold = threshold; - while (_cntr->value < _cntr->threshold && !ret) + fastlock_acquire(&_cntr->mut); + atomic_set(&_cntr->threshold, threshold); + while (atomic_get(&_cntr->value) < atomic_get(&_cntr->threshold) && !ret) { + if (_cntr->domain->progress_mode == FI_PROGRESS_MANUAL) { + if (timeout > 0) { + gettimeofday(&now, NULL); + start_ms = (double)now.tv_sec * 1000.0 + + (double)now.tv_usec / 1000.0; + } + sock_cntr_progress(_cntr); + if (timeout > 0) { + gettimeofday(&now, NULL); + end_ms = (double)now.tv_sec * 1000.0 + + (double)now.tv_usec / 1000.0; + timeout -= (end_ms - start_ms); + timeout = timeout < 0 ? 0 : timeout; + } + } ret = fi_wait_cond(&_cntr->cond, &_cntr->mut, timeout); - _cntr->threshold = ~0; - pthread_mutex_unlock(&_cntr->mut); + } + atomic_set(&_cntr->threshold, ~0); + fastlock_release(&_cntr->mut); + return -ret; +} + +int sock_cntr_control(struct fid *fid, int command, void *arg) +{ + int ret = 0; + struct sock_cntr *cntr; + + cntr = container_of(fid, struct sock_cntr, cntr_fid); + + switch (command) { + case FI_GETWAIT: + switch (cntr->attr.wait_obj) { + case FI_WAIT_NONE: + case FI_WAIT_UNSPEC: + case FI_WAIT_MUTEX_COND: + memcpy(arg, &cntr->mut, sizeof(cntr->mut)); + memcpy((char*)arg + sizeof(cntr->mut), &cntr->cond, + sizeof(cntr->cond)); + break; + + case FI_WAIT_SET: + case FI_WAIT_FD: + sock_wait_get_obj(cntr->waitset, arg); + break; + + default: + ret = -FI_EINVAL; + break; + } + break; + + case FI_GETOPSFLAG: + memcpy(arg, &cntr->attr.flags, sizeof(uint64_t)); + break; + + case FI_SETOPSFLAG: + memcpy(&cntr->attr.flags, arg, sizeof(uint64_t)); + break; + + default: + ret = -FI_EINVAL; + break; + } return ret; } @@ -115,10 +206,13 @@ static int sock_cntr_close(struct fid *fid) cntr = container_of(fid, struct sock_cntr, cntr_fid.fid); if (atomic_get(&cntr->ref)) return -FI_EBUSY; + + if (cntr->signal && cntr->attr.wait_obj == FI_WAIT_FD) + sock_wait_close(&cntr->waitset->fid); - pthread_mutex_destroy(&cntr->mut); + fastlock_destroy(&cntr->mut); pthread_cond_destroy(&cntr->cond); - atomic_dec(&cntr->dom->ref); + atomic_dec(&cntr->domain->ref); free(cntr); return 0; } @@ -127,6 +221,8 @@ uint64_t sock_cntr_readerr(struct fid_cntr *cntr) { struct sock_cntr *_cntr; _cntr = container_of(cntr, struct sock_cntr, cntr_fid); + if (_cntr->domain->progress_mode == FI_PROGRESS_MANUAL) + sock_cntr_progress(_cntr); return atomic_get(&_cntr->err_cnt); } @@ -141,18 +237,45 
@@ static struct fi_ops_cntr sock_cntr_ops = { static struct fi_ops sock_cntr_fi_ops = { .size = sizeof(struct fi_ops), + .control = sock_cntr_control, .close = sock_cntr_close, }; +static int sock_cntr_verify_attr(struct fi_cntr_attr *attr) +{ + switch (attr->events) { + case FI_CNTR_EVENTS_COMP: + break; + default: + return -FI_ENOSYS; + } + + switch (attr->wait_obj) { + case FI_WAIT_NONE: + case FI_WAIT_UNSPEC: + case FI_WAIT_MUTEX_COND: + case FI_WAIT_SET: + case FI_WAIT_FD: + break; + default: + return -FI_ENOSYS; + } + if (attr->flags) + return -FI_EINVAL; + return 0; +} + int sock_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, struct fid_cntr **cntr, void *context) { + int ret; struct sock_domain *dom; struct sock_cntr *_cntr; - int ret; - - if ((attr->events != FI_CNTR_EVENTS_COMP) || - (attr->wait_obj != FI_WAIT_MUT_COND) || attr->flags) + struct fi_wait_attr wait_attr; + struct sock_fid_list *list_entry; + struct sock_wait *wait; + + if (attr && sock_cntr_verify_attr(attr)) return -FI_ENOSYS; _cntr = calloc(1, sizeof(*_cntr)); @@ -163,27 +286,64 @@ int sock_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, if (ret) goto err1; - ret = pthread_mutex_init(&_cntr->mut, NULL); - if (ret) - goto err2; + if(attr == NULL) + memcpy(&_cntr->attr, &sock_cntr_add, sizeof(sock_cntr_attr)); + else + memcpy(&_cntr->attr, attr, sizeof(sock_cntr_attr)); + + switch (_cntr->attr.wait_obj) { + case FI_WAIT_NONE: + case FI_WAIT_UNSPEC: + case FI_WAIT_MUTEX_COND: + _cntr->signal = 0; + break; + + case FI_WAIT_FD: + wait_attr.flags = 0; + wait_attr.wait_obj = FI_WAIT_FD; + ret = sock_wait_open(domain, &wait_attr, &_cntr->waitset); + if (ret) + goto err1; + _cntr->signal = 1; + break; + + case FI_WAIT_SET: + _cntr->waitset = attr->wait_set; + _cntr->signal = 1; + wait = container_of(attr->wait_set, struct sock_wait, wait_fid); + list_entry = calloc(1, sizeof(*list_entry)); + dlist_init(&list_entry->entry); + list_entry->fid = &_cntr->cntr_fid.fid; + dlist_insert_after(&list_entry->entry, &wait->fid_list); + + break; + + default: + break; + } + + fastlock_init(&_cntr->mut); atomic_init(&_cntr->ref, 0); atomic_init(&_cntr->err_cnt, 0); + atomic_init(&_cntr->value, 0); + atomic_init(&_cntr->threshold, ~0); + + dlist_init(&_cntr->tx_list); + dlist_init(&_cntr->rx_list); + _cntr->cntr_fid.fid.fclass = FI_CLASS_CNTR; _cntr->cntr_fid.fid.context = context; _cntr->cntr_fid.fid.ops = &sock_cntr_fi_ops; _cntr->cntr_fid.ops = &sock_cntr_ops; - _cntr->threshold = ~0; dom = container_of(domain, struct sock_domain, dom_fid); atomic_inc(&dom->ref); - _cntr->dom = dom; + _cntr->domain = dom; *cntr = &_cntr->cntr_fid; return 0; -err2: - pthread_cond_destroy(&_cntr->cond); err1: free(_cntr); return -ret; diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_comm.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_comm.c new file mode 100644 index 0000000000..08b3cd5246 --- /dev/null +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_comm.c @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sock.h" +#include "sock_util.h" + +static ssize_t sock_comm_send_socket(struct sock_conn *conn, const void *buf, size_t len) +{ + ssize_t ret; + size_t rem = len; + size_t offset = 0, done_len = 0; + + while(rem > 0) { + len = MIN(rem, SOCK_COMM_BUF_SZ); + ret = send(conn->sock_fd, buf + offset, len, 0); + if (ret <= 0) + break; + + done_len += ret; + rem -= ret; + offset += ret; + } + SOCK_LOG_INFO("WROTE %lu on wire\n", done_len); + return done_len; +} + +ssize_t sock_comm_flush(struct sock_conn *conn) +{ + ssize_t ret1, ret2 = 0; + size_t endlen, len, xfer_len; + + len = rbused(&conn->outbuf); + endlen = conn->outbuf.size - (conn->outbuf.rcnt & conn->outbuf.size_mask); + + xfer_len = MIN(len, endlen); + ret1 = sock_comm_send_socket(conn, conn->outbuf.buf + + (conn->outbuf.rcnt & conn->outbuf.size_mask), + xfer_len); + if (ret1 > 0) + conn->outbuf.rcnt += ret1; + + if (ret1 == xfer_len && xfer_len < len) { + ret2 = sock_comm_send_socket(conn, conn->outbuf.buf + + (conn->outbuf.rcnt & conn->outbuf.size_mask), + len - xfer_len); + if (ret2 > 0) + conn->outbuf.rcnt += ret2; + else + ret2 = 0; + } + + return (ret1 > 0) ? 
ret1 + ret2 : 0; +} + +ssize_t sock_comm_send(struct sock_conn *conn, const void *buf, size_t len) +{ + ssize_t ret, used; + + if (len >= SOCK_COMM_THRESHOLD) { + used = rbused(&conn->outbuf); + if (used == sock_comm_flush(conn)) { + return sock_comm_send_socket(conn, buf, len); + } else + return 0; + } + + if (rbavail(&conn->outbuf) < len) { + ret = sock_comm_flush(conn); + if (ret <= 0) + return 0; + } + + ret = MIN(rbavail(&conn->outbuf), len); + rbwrite(&conn->outbuf, buf, ret); + rbcommit(&conn->outbuf); + SOCK_LOG_INFO("Buffered %lu\n", ret); + return ret; +} + +ssize_t sock_comm_recv_socket(struct sock_conn *conn, void *buf, size_t len) +{ + ssize_t ret; + + ret = recv(conn->sock_fd, buf, len, 0); + if (ret <= 0) + return 0; + + SOCK_LOG_INFO("READ from wire: %lu\n", ret); + return ret; +} + +ssize_t sock_comm_recv_buffer(struct sock_conn *conn) +{ + int ret; + size_t endlen; + endlen = conn->inbuf.size - + (conn->inbuf.wpos & conn->inbuf.size_mask); + + if ((ret = sock_comm_recv_socket(conn, (char*) conn->inbuf.buf + + (conn->inbuf.wpos & conn->inbuf.size_mask), + endlen)) <= 0) + return 0; + + conn->inbuf.wpos += ret; + rbcommit(&conn->inbuf); + if (ret != endlen) + return ret; + + if ((ret = sock_comm_recv_socket(conn, conn->inbuf.buf, + rbavail(&conn->inbuf))) <= 0) + return 0; + + conn->inbuf.wpos += ret; + rbcommit(&conn->inbuf); + return 0; +} + +ssize_t sock_comm_recv(struct sock_conn *conn, void *buf, size_t len) +{ + int ret = 0; + ssize_t used, read_len; + + used = rbused(&conn->inbuf); + if (used == 0) { + ret = sock_comm_recv_socket(conn, buf, len); + sock_comm_recv_buffer(conn); + return ret; + } + + read_len = MIN(len, used); + rbread(&conn->inbuf, buf, read_len); + if (len > used) { + ret = sock_comm_recv_socket(conn, (char*)buf + used, len - used); + if (ret <= 0) + ret = 0; + sock_comm_recv_buffer(conn); + } + SOCK_LOG_INFO("Read %lu from buffer\n", ret + read_len); + return ret + read_len; +} + +int sock_comm_buffer_init(struct sock_conn *conn) +{ + uint64_t flags; + socklen_t size = SOCK_COMM_BUF_SZ; + socklen_t optlen = sizeof(socklen_t); + + flags = fcntl(conn->sock_fd, F_GETFL, 0); + fcntl(conn->sock_fd, F_SETFL, flags | O_NONBLOCK); + + rbinit(&conn->inbuf, SOCK_COMM_BUF_SZ); + rbinit(&conn->outbuf, SOCK_COMM_BUF_SZ); + + setsockopt(conn->sock_fd, SOL_SOCKET, SO_RCVBUF, &size, optlen); + setsockopt(conn->sock_fd, SOL_SOCKET, SO_SNDBUF, &size, optlen); + + getsockopt(conn->sock_fd, SOL_SOCKET, SO_RCVBUF, &size, &optlen); + SOCK_LOG_INFO("SO_RCVBUF: %d\n", size); + + optlen = sizeof(socklen_t); + getsockopt(conn->sock_fd, SOL_SOCKET, SO_SNDBUF, &size, &optlen); + SOCK_LOG_INFO("SO_SNDBUF: %d\n", size); + return 0; +} + + +void sock_comm_buffer_finalize(struct sock_conn *conn) +{ + rbfree(&conn->inbuf); + rbfree(&conn->outbuf); +} diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_conn.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_conn.c new file mode 100644 index 0000000000..c0ad1bfc54 --- /dev/null +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_conn.c @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sock.h" +#include "sock_util.h" + +static int _init_map(struct sock_conn_map *map, int init_size) +{ + map->table = (struct sock_conn*)calloc(init_size, + sizeof(struct sock_conn)); + if (!map->table) + return -FI_ENOMEM; + map->used = 0; + map->size = init_size; + return 0; +} + +static int _increase_map(struct sock_conn_map *map, int new_size) +{ + if (map->used + new_size > map->size) { + void *_table = realloc(map->table, map->size * sizeof(struct + sock_conn)); + if (!_table) + return -FI_ENOMEM; + + map->size = MAX(map->size, new_size) * 2; + map->table = (struct sock_conn*) _table; + } + + return 0; +} + +void sock_conn_map_destroy(struct sock_conn_map *cmap) +{ + free(cmap->table); + cmap->table = NULL; + cmap->used = cmap->size = 0; +} + +struct sock_conn *sock_conn_map_lookup_key(struct sock_conn_map *conn_map, + uint16_t key) +{ + if (key > conn_map->used) { + SOCK_LOG_ERROR("requested key is larger than conn_map size\n"); + errno = EINVAL; + return NULL; + } + + return &conn_map->table[key-1]; +} + +uint16_t sock_conn_map_match_or_connect(struct sock_conn_map *map, struct + sockaddr_in *addr, int match_only) +{ + int i, conn_fd, arg, optval; + socklen_t optlen; + char entry_ip[INET_ADDRSTRLEN]; + char sa_ip[INET_ADDRSTRLEN]; + struct sockaddr_in *entry; + struct timeval tv; + fd_set fds; + struct sock_conn *conn; + + memcpy(sa_ip, inet_ntoa(addr->sin_addr), INET_ADDRSTRLEN); + /* match */ + for (i=0; i < map->used; i++) { + entry = (struct sockaddr_in *)&map->table[i].addr; + memcpy(entry_ip, inet_ntoa(entry->sin_addr), INET_ADDRSTRLEN); + if(!strcmp(entry_ip, sa_ip)) { + return i+1; + } + } + + if (match_only) + return 0; + + /* no matching entry, connect */ + conn_fd = socket(AF_INET, SOCK_STREAM, 0); + if (conn_fd < 0) { + SOCK_LOG_ERROR("failed to create conn_fd, errno: %d\n", errno); + return 0; + } + fcntl(conn_fd, F_SETFL, O_NONBLOCK); + + if (connect(conn_fd, addr, sizeof *addr) < 0) { + if (errno == EINPROGRESS) { + /* timeout after 5 secs */ + tv.tv_sec = 5; + tv.tv_usec = 0; + FD_ZERO(&fds); + FD_SET(conn_fd, &fds); + if (select(conn_fd+1, NULL, 
&fds, NULL, &tv) > 0) { + optlen = sizeof(int); + getsockopt(conn_fd, SOL_SOCKET, SO_ERROR, &optval, &optlen); + + if (optval) { + SOCK_LOG_ERROR("failed to connect %d - %s\n", optval, + strerror(optval)); + close(conn_fd); + return 0; + } + } else { + SOCK_LOG_ERROR("Timeout or error to connect %d - %s\n", optval, + strerror(optval)); + close(conn_fd); + return 0; + } + } else { + SOCK_LOG_ERROR("Error connecting %d - %s\n", errno, + strerror(errno)); + close(conn_fd); + return 0; + } + } + + arg = fcntl(conn_fd, F_GETFL, NULL); + arg &= (~O_NONBLOCK); + fcntl(conn_fd, F_SETFL, arg); + + memcpy(&map->table[map->used].addr, addr, sizeof *addr); + map->table[map->used].sock_fd = conn_fd; + + conn = &map->table[map->used]; + sock_comm_buffer_init(conn); + + map->used++; + return map->used; + +} + +static void * _sock_conn_listen(void *arg) +{ + struct sock_domain *domain = (struct sock_domain*) arg; + struct sock_conn_map *map = &domain->r_cmap; + struct addrinfo *s_res = NULL, *p; + struct addrinfo hints; + int optval; + int listen_fd = 0, conn_fd; + struct sockaddr_in remote; + socklen_t addr_size; + struct sock_conn *conn; + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_INET; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_PASSIVE; + + if(getaddrinfo(NULL, domain->service, &hints, &s_res)) { + SOCK_LOG_ERROR("no available AF_INET address\n"); + perror("no available AF_INET address"); + return NULL; + } + + for (p=s_res; p; p=p->ai_next) { + listen_fd = socket(p->ai_family, p->ai_socktype, p->ai_protocol); + if (listen_fd >= 0) { + optval = 1; + setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof + optval); + if (!bind(listen_fd, s_res->ai_addr, s_res->ai_addrlen)) + break; + close(listen_fd); + listen_fd = -1; + } + } + + freeaddrinfo(s_res); + if (listen_fd < 0) { + SOCK_LOG_ERROR("failed to listen to port: %s\n", domain->service); + goto err; + } + + if (listen(listen_fd, 128)) { + SOCK_LOG_ERROR("failed to listen socket: %d\n", errno); + goto err; + } + + while(domain->listening) { + addr_size = sizeof(struct sockaddr_in); + conn_fd = accept(listen_fd, (struct sockaddr *)&remote, &addr_size); + SOCK_LOG_INFO("CONN: accepted conn-req: %d\n", conn_fd); + if (conn_fd < 0) { + SOCK_LOG_ERROR("failed to accept: %d\n", errno); + goto err; + } + + /* TODO: lock for multi-threads */ + if ((map->size - map->used) == 0) { + _increase_map(map, map->size*2); + } + memcpy(&map->table[map->used].addr, &remote, addr_size); + map->table[map->used].sock_fd = conn_fd; + + conn = &map->table[map->used]; + sock_comm_buffer_init(conn); + + map->used++; + } + + close(listen_fd); + return NULL; + +err: + close(listen_fd); + perror("listening thread failed"); + return NULL; +} + +int sock_conn_listen(struct sock_domain *domain) +{ + _init_map(&domain->r_cmap, 128); /* TODO: init cmap size */ + domain->listening = 1; + pthread_create(&domain->listen_thread, 0, _sock_conn_listen, domain); + return 0; +} diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_cq.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_cq.c index df1bee1b11..1b6cea5a02 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_cq.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_cq.c @@ -49,6 +49,26 @@ #include "sock_util.h" +int sock_cq_progress(struct sock_cq *cq) +{ + struct sock_tx_ctx *tx_ctx; + struct sock_rx_ctx *rx_ctx; + struct dlist_entry *entry; + + for (entry = cq->tx_list.next; entry != &cq->tx_list; + entry = entry->next) { + tx_ctx = 
container_of(entry, struct sock_tx_ctx, cq_entry); + sock_pe_progress_tx_ctx(cq->domain->pe, tx_ctx); + } + + for (entry = cq->rx_list.next; entry != &cq->rx_list; + entry = entry->next) { + rx_ctx = container_of(entry, struct sock_rx_ctx, cq_entry); + sock_pe_progress_rx_ctx(cq->domain->pe, rx_ctx); + } + return 0; +} + static ssize_t sock_cq_entry_size(struct sock_cq *sock_cq) { ssize_t size; @@ -73,7 +93,7 @@ static ssize_t sock_cq_entry_size(struct sock_cq *sock_cq) case FI_CQ_FORMAT_UNSPEC: default: size = -1; - SOCK_LOG_ERROR("CQ: Invalid CQ format\n"); + SOCK_LOG_ERROR("Invalid CQ format\n"); break; } return size; @@ -85,9 +105,9 @@ static ssize_t _sock_cq_write(struct sock_cq *cq, fi_addr_t addr, ssize_t ret; fastlock_acquire(&cq->lock); - - if(rbfdavail(&cq->cq_rbfd) < len) { + if (rbfdavail(&cq->cq_rbfd) < len) { ret = -FI_ENOSPC; + SOCK_LOG_ERROR("Not enough space in CQ\n"); goto out; } @@ -98,6 +118,8 @@ static ssize_t _sock_cq_write(struct sock_cq *cq, fi_addr_t addr, rbwrite(&cq->addr_rb, &addr, sizeof(fi_addr_t)); rbcommit(&cq->addr_rb); + if (cq->signal) + sock_wait_signal(cq->waitset); out: fastlock_release(&cq->lock); return ret; @@ -109,8 +131,9 @@ static ssize_t _sock_cq_writeerr(struct sock_cq *cq, ssize_t ret; fastlock_acquire(&cq->lock); - if(rbavail(&cq->cqerr_rb) < len) { + if (rbavail(&cq->cqerr_rb) < len) { ret = -FI_ENOSPC; + SOCK_LOG_ERROR("Not enough space in CQ\n"); goto out; } @@ -118,6 +141,8 @@ static ssize_t _sock_cq_writeerr(struct sock_cq *cq, rbcommit(&cq->cqerr_rb); ret = len; + if (cq->signal) + sock_wait_signal(cq->waitset); out: fastlock_release(&cq->lock); return ret; @@ -138,7 +163,7 @@ static int sock_cq_report_msg(struct sock_cq *cq, fi_addr_t addr, struct fi_cq_msg_entry cq_entry; cq_entry.op_context = (void*)pe_entry->context; cq_entry.flags = pe_entry->flags; - cq_entry.len = pe_entry->done_len; + cq_entry.len = pe_entry->data_len; return _sock_cq_write(cq, addr, &cq_entry, sizeof(cq_entry)); } @@ -148,8 +173,8 @@ static int sock_cq_report_data(struct sock_cq *cq, fi_addr_t addr, struct fi_cq_data_entry cq_entry; cq_entry.op_context = (void*)pe_entry->context; cq_entry.flags = pe_entry->flags; - cq_entry.len = pe_entry->done_len; - cq_entry.buf = (void*)pe_entry->rx.rx_iov[0].iov.addr; + cq_entry.len = pe_entry->data_len; + cq_entry.buf = (void*)pe_entry->buf; cq_entry.data = pe_entry->data; return _sock_cq_write(cq, addr, &cq_entry, sizeof(cq_entry)); } @@ -160,8 +185,8 @@ static int sock_cq_report_tagged(struct sock_cq *cq, fi_addr_t addr, struct fi_cq_tagged_entry cq_entry; cq_entry.op_context = (void*)pe_entry->context; cq_entry.flags = pe_entry->flags; - cq_entry.len = pe_entry->done_len; - cq_entry.buf = (void*)pe_entry->rx.rx_iov[0].iov.addr; + cq_entry.len = pe_entry->data_len; + cq_entry.buf = (void*)pe_entry->buf; cq_entry.data = pe_entry->data; cq_entry.tag = pe_entry->tag; return _sock_cq_write(cq, addr, &cq_entry, sizeof(cq_entry)); @@ -188,7 +213,7 @@ static void sock_cq_set_report_fn(struct sock_cq *sock_cq) case FI_CQ_FORMAT_UNSPEC: default: - SOCK_LOG_ERROR("CQ: Invalid CQ format\n"); + SOCK_LOG_ERROR("Invalid CQ format\n"); break; } } @@ -199,12 +224,30 @@ ssize_t sock_cq_sreadfrom(struct fid_cq *cq, void *buf, size_t count, int ret; fi_addr_t addr; int64_t threshold; - ssize_t i, bytes_read, num_read, cq_entry_len; + struct timeval now; struct sock_cq *sock_cq; + double start_ms, end_ms; + ssize_t i, bytes_read, num_read, cq_entry_len; sock_cq = container_of(cq, struct sock_cq, cq_fid); cq_entry_len = 
sock_cq->cq_entry_size; + if (sock_cq->domain->progress_mode == FI_PROGRESS_MANUAL) { + if (timeout > 0) { + gettimeofday(&now, NULL); + start_ms = (double)now.tv_sec * 1000.0 + + (double)now.tv_usec / 1000.0; + } + sock_cq_progress(sock_cq); + if (timeout > 0) { + gettimeofday(&now, NULL); + end_ms = (double)now.tv_sec * 1000.0 + + (double)now.tv_usec / 1000.0; + timeout -= (end_ms - start_ms); + timeout = timeout < 0 ? 0 : timeout; + } + } + if (sock_cq->attr.wait_cond == FI_CQ_COND_THRESHOLD) { threshold = MIN((int64_t)cond, count); }else{ @@ -215,7 +258,7 @@ ssize_t sock_cq_sreadfrom(struct fid_cq *cq, void *buf, size_t count, bytes_read = rbfdsread(&sock_cq->cq_rbfd, buf, cq_entry_len*threshold, timeout); - if(bytes_read == 0) { + if (bytes_read == 0) { ret = -FI_ETIMEDOUT; goto out; } @@ -223,11 +266,10 @@ ssize_t sock_cq_sreadfrom(struct fid_cq *cq, void *buf, size_t count, num_read = bytes_read/cq_entry_len; for(i=0; i < num_read; i++) { rbread(&sock_cq->addr_rb, &addr, sizeof(fi_addr_t)); - if(src_addr) + if (src_addr) src_addr[i] = addr; } ret = num_read; - out: fastlock_release(&sock_cq->lock); return ret; @@ -261,9 +303,12 @@ ssize_t sock_cq_readerr(struct fid_cq *cq, struct fi_cq_err_entry *buf, sock_cq = container_of(cq, struct sock_cq, cq_fid); num_read = 0; - fastlock_acquire(&sock_cq->lock); - while(rbused(&sock_cq->cqerr_rb) >= sizeof(struct fi_cq_err_entry)) { + if (sock_cq->domain->progress_mode == FI_PROGRESS_MANUAL) + sock_cq_progress(sock_cq); + + fastlock_acquire(&sock_cq->lock); + while (rbused(&sock_cq->cqerr_rb) >= sizeof(struct fi_cq_err_entry)) { rbread(&sock_cq->cqerr_rb, (char*)buf +sizeof(struct fi_cq_err_entry) * num_read, sizeof(struct fi_cq_err_entry)); @@ -279,7 +324,7 @@ ssize_t sock_cq_write(struct fid_cq *cq, const void *buf, size_t len) struct sock_cq *sock_cq; sock_cq = container_of(cq, struct sock_cq, cq_fid); - if(!(sock_cq->attr.flags & FI_WRITE)) + if (!(sock_cq->attr.flags & FI_WRITE)) return -FI_EINVAL; return _sock_cq_write(sock_cq, FI_ADDR_NOTAVAIL, buf, len); @@ -291,7 +336,7 @@ ssize_t sock_cq_writeerr(struct fid_cq *cq, struct fi_cq_err_entry *buf, struct sock_cq *sock_cq; sock_cq = container_of(cq, struct sock_cq, cq_fid); - if(!(sock_cq->attr.flags & FI_WRITE)) + if (!(sock_cq->attr.flags & FI_WRITE)) return -FI_EINVAL; return _sock_cq_writeerr(sock_cq, buf, len); @@ -313,6 +358,9 @@ int sock_cq_close(struct fid *fid) if (atomic_get(&cq->ref)) return -FI_EBUSY; + if (cq->signal && cq->attr.wait_obj == FI_WAIT_MUTEX_COND) + sock_wait_close(&cq->waitset->fid); + rbfree(&cq->addr_rb); rbfree(&cq->cqerr_rb); rbfdfree(&cq->cq_rbfd); @@ -335,14 +383,49 @@ struct fi_ops_cq sock_cq_ops = { .strerror = sock_cq_strerror, }; +static int sock_cq_control(struct fid *fid, int command, void *arg) +{ + struct sock_cq *cq; + int ret = 0; + + cq = container_of(fid, struct sock_cq, cq_fid); + switch (command) { + case FI_GETWAIT: + switch (cq->attr.wait_obj) { + case FI_WAIT_NONE: + case FI_WAIT_FD: + case FI_WAIT_UNSPEC: + memcpy(arg, &cq->cq_rbfd.fd[RB_READ_FD], sizeof(int)); + break; + + case FI_WAIT_SET: + case FI_WAIT_MUTEX_COND: + sock_wait_get_obj(cq->waitset, arg); + break; + + default: + ret = -FI_EINVAL; + break; + } + break; + + default: + ret = -FI_EINVAL; + break; + } + + return ret; +} + struct fi_ops sock_cq_fi_ops = { .size = sizeof(struct fi_ops), + .control = sock_cq_control, .close = sock_cq_close, }; static int sock_cq_verify_attr(struct fi_cq_attr *attr) { - if(!attr) + if (!attr) return 0; switch (attr->format) { @@ -358,6 +441,8 @@ 
static int sock_cq_verify_attr(struct fi_cq_attr *attr) switch (attr->wait_obj) { case FI_WAIT_NONE: case FI_WAIT_FD: + case FI_WAIT_SET: + case FI_WAIT_MUTEX_COND: break; case FI_WAIT_UNSPEC: attr->wait_obj = FI_WAIT_FD; @@ -384,6 +469,9 @@ int sock_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, { struct sock_domain *sock_dom; struct sock_cq *sock_cq; + struct fi_wait_attr wait_attr; + struct sock_fid_list *list_entry; + struct sock_wait *wait; int ret; sock_dom = container_of(domain, struct sock_domain, dom_fid); @@ -402,11 +490,13 @@ int sock_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, sock_cq->cq_fid.ops = &sock_cq_ops; atomic_inc(&sock_dom->ref); - if(attr == NULL) - memcpy(&sock_cq->attr, &_sock_cq_def_attr, - sizeof(struct fi_cq_attr)); - else - memcpy(&sock_cq->attr, attr, sizeof(struct fi_cq_attr)); + if (attr == NULL) + sock_cq->attr = _sock_cq_def_attr; + else { + sock_cq->attr = *attr; + if (attr->size == 0) + sock_cq->attr.size = _sock_cq_def_attr.size; + } sock_cq->domain = sock_dom; sock_cq->cq_entry_size = sock_cq_entry_size(sock_cq); @@ -416,22 +506,53 @@ int sock_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, dlist_init(&sock_cq->rx_list); dlist_init(&sock_cq->ep_list); - if((ret = rbfdinit(&sock_cq->cq_rbfd, sock_cq->attr.size))) + if ((ret = rbfdinit(&sock_cq->cq_rbfd, sock_cq->attr.size * + sock_cq->cq_entry_size))) goto err1; - if((ret = rbinit(&sock_cq->addr_rb, - (sock_cq->attr.size/sock_cq->cq_entry_size) * sizeof(fi_addr_t)))) + if ((ret = rbinit(&sock_cq->addr_rb, + sock_cq->attr.size * sizeof(fi_addr_t)))) goto err2; - if((ret = rbinit(&sock_cq->cqerr_rb, sock_cq->attr.size))) + if ((ret = rbinit(&sock_cq->cqerr_rb, sock_cq->attr.size * + sizeof(struct fi_cq_err_entry)))) goto err3; fastlock_init(&sock_cq->lock); + switch (sock_cq->attr.wait_obj) { + case FI_WAIT_NONE: + case FI_WAIT_UNSPEC: + case FI_WAIT_FD: + break; + + case FI_WAIT_MUTEX_COND: + wait_attr.flags = 0; + wait_attr.wait_obj = FI_WAIT_MUTEX_COND; + ret = sock_wait_open(&sock_dom->dom_fid, &wait_attr, + &sock_cq->waitset); + if (ret) + goto err3; + sock_cq->signal = 1; + break; + + case FI_WAIT_SET: + sock_cq->waitset = attr->wait_set; + sock_cq->signal = 1; + wait = container_of(attr->wait_set, struct sock_wait, wait_fid); + list_entry = calloc(1, sizeof(*list_entry)); + dlist_init(&list_entry->entry); + list_entry->fid = &sock_cq->cq_fid.fid; + dlist_insert_after(&list_entry->entry, &wait->fid_list); + break; + default: + break; + } + *cq = &sock_cq->cq_fid; atomic_inc(&sock_dom->ref); return 0; - + err3: rbfree(&sock_cq->addr_rb); err2: @@ -448,8 +569,7 @@ int sock_cq_report_error(struct sock_cq *cq, struct sock_pe_entry *entry, struct fi_cq_err_entry err_entry; fastlock_acquire(&cq->lock); - - if(rbavail(&cq->cqerr_rb) < sizeof(struct fi_cq_err_entry)) { + if (rbavail(&cq->cqerr_rb) < sizeof(struct fi_cq_err_entry)) { ret = -FI_ENOSPC; goto out; } @@ -457,14 +577,14 @@ int sock_cq_report_error(struct sock_cq *cq, struct sock_pe_entry *entry, err_entry.err = err; err_entry.olen = olen; err_entry.err_data = err_data; - err_entry.len = entry->done_len; + err_entry.len = entry->data_len; err_entry.prov_errno = prov_errno; err_entry.flags = entry->flags; err_entry.data = entry->data; err_entry.tag = entry->tag; err_entry.op_context = (void*)entry->context; - if(entry->type == SOCK_PE_RX) { + if (entry->type == SOCK_PE_RX) { err_entry.buf = (void*)entry->rx.rx_iov[0].iov.addr; }else { err_entry.buf = (void*)entry->tx.tx_iov[0].src.iov.addr; diff --git 
a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ctx.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ctx.c index 2d03f44b2f..27d214af7f 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ctx.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ctx.c @@ -49,10 +49,12 @@ struct sock_rx_ctx *sock_rx_ctx_alloc(struct fi_rx_attr *attr, void *context) return NULL; dlist_init(&rx_ctx->cq_entry); + dlist_init(&rx_ctx->cntr_entry); dlist_init(&rx_ctx->pe_entry); dlist_init(&rx_ctx->pe_entry_list); dlist_init(&rx_ctx->rx_entry_list); + dlist_init(&rx_ctx->rx_buffered_list); dlist_init(&rx_ctx->ep_list); fastlock_init(&rx_ctx->lock); @@ -63,21 +65,14 @@ struct sock_rx_ctx *sock_rx_ctx_alloc(struct fi_rx_attr *attr, void *context) return rx_ctx; } -void sock_rx_ctx_add_ep(struct sock_rx_ctx *rx_ctx, struct sock_ep *ep) -{ - fastlock_acquire(&rx_ctx->lock); - dlist_insert_tail(&ep->rx_ctx_entry, &rx_ctx->ep_list); - atomic_inc(&ep->num_rx_ctx); - fastlock_release(&rx_ctx->lock); -} - void sock_rx_ctx_free(struct sock_rx_ctx *rx_ctx) { fastlock_destroy(&rx_ctx->lock); free(rx_ctx); } -struct sock_tx_ctx *sock_tx_ctx_alloc(struct fi_tx_attr *attr, void *context) +static struct sock_tx_ctx *sock_tx_context_alloc(struct fi_tx_attr *attr, + void *context, size_t fclass) { struct sock_tx_ctx *tx_ctx; @@ -89,30 +84,44 @@ struct sock_tx_ctx *sock_tx_ctx_alloc(struct fi_tx_attr *attr, void *context) goto err; dlist_init(&tx_ctx->cq_entry); + dlist_init(&tx_ctx->cntr_entry); dlist_init(&tx_ctx->pe_entry); - + dlist_init(&tx_ctx->pe_entry_list); dlist_init(&tx_ctx->ep_list); - + fastlock_init(&tx_ctx->rlock); fastlock_init(&tx_ctx->wlock); - tx_ctx->ctx.fid.fclass = FI_CLASS_TX_CTX; - tx_ctx->ctx.fid.context = context; - tx_ctx->attr = *attr; - + switch (fclass) { + case FI_CLASS_TX_CTX: + tx_ctx->ctx.fid.fclass = FI_CLASS_TX_CTX; + tx_ctx->ctx.fid.context = context; + break; + case FI_CLASS_STX_CTX: + tx_ctx->stx.fid.fclass = FI_CLASS_TX_CTX; + tx_ctx->stx.fid.context = context; + break; + default: + goto err; + } + tx_ctx->attr = *attr; return tx_ctx; + err: free(tx_ctx); return NULL; } -void sock_tx_ctx_add_ep(struct sock_tx_ctx *tx_ctx, struct sock_ep *ep) + +struct sock_tx_ctx *sock_tx_ctx_alloc(struct fi_tx_attr *attr, void *context) { - fastlock_acquire(&tx_ctx->lock); - dlist_insert_tail(&ep->tx_ctx_entry, &tx_ctx->ep_list); - atomic_inc(&ep->num_tx_ctx); - fastlock_release(&tx_ctx->lock); + return sock_tx_context_alloc(attr, context, FI_CLASS_TX_CTX); +} + +struct sock_tx_ctx *sock_stx_ctx_alloc(struct fi_tx_attr *attr, void *context) +{ + return sock_tx_context_alloc(attr, context, FI_CLASS_STX_CTX); } void sock_tx_ctx_free(struct sock_tx_ctx *tx_ctx) @@ -136,7 +145,7 @@ void sock_tx_ctx_write(struct sock_tx_ctx *tx_ctx, const void *buf, size_t len) void sock_tx_ctx_commit(struct sock_tx_ctx *tx_ctx) { rbfdcommit(&tx_ctx->rbfd); - fastlock_release(&tx_ctx->rlock); + fastlock_release(&tx_ctx->wlock); } void sock_tx_ctx_abort(struct sock_tx_ctx *tx_ctx) @@ -145,19 +154,3 @@ void sock_tx_ctx_abort(struct sock_tx_ctx *tx_ctx) fastlock_release(&tx_ctx->rlock); } -int sock_tx_ctx_read(struct sock_tx_ctx *tx_ctx, void *buf, size_t len) -{ - int ret; - - fastlock_acquire(&tx_ctx->rlock); - if (rbfdused(&tx_ctx->rbfd) >= len) { - rbfdread(&tx_ctx->rbfd, buf, len); - ret = 0; - } else { - ret = -FI_EAGAIN; - } - fastlock_release(&tx_ctx->rlock); - - return ret; -} - diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_dgram.c 
b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_dgram.c deleted file mode 100644 index 7c337956ae..0000000000 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_dgram.c +++ /dev/null @@ -1,802 +0,0 @@ -/* - * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#if HAVE_CONFIG_H -# include -#endif /* HAVE_CONFIG_H */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sock_util.h" -#include "sock.h" - - -/* FIXME: figure out the sockd caps */ -#if 0 -#define SOCKD_EP_CAP (FI_TAGGED | FI_MSG | FI_ATOMICS | FI_INJECT | \ - FI_RMA | FI_BUFFERED_RECV | FI_MULTI_RECV | \ - FI_READ | FI_WRITE | FI_SEND | FI_RECV | \ - FI_REMOTE_READ | FI_REMOTE_WRITE | \ - FI_REMOTE_COMPLETE | FI_REMOTE_SIGNAL | \ - FI_CANCEL | FI_TRIGGER) -#endif -#define SOCKD_OP_FLAGS (FI_INJECT | FI_EVENT | \ - FI_TRIGGER | FI_REMOTE_SIGNAL | FI_CANCEL) -#define SOCKD_DOMAIN_CAP (FI_WRITE_COHERENT | FI_CONTEXT | \ - FI_USER_MR_KEY | FI_DYNAMIC_MR) -#define SOCKD_MTU (512) - -static int so_rcvbuf; - -int sockd_check_hints(struct fi_info *hints) -{ - switch (hints->ep_type) { - case FI_EP_DGRAM: - break; - default: - SOCK_LOG_ERROR("[sockd] %s: hints->type = %d, only FI_EP_DGRAM = %d is supported\n", - __func__, hints->ep_type, FI_EP_DGRAM); - return -FI_ENODATA; - } - - switch (hints->addr_format) { - case FI_SOCKADDR: - case FI_SOCKADDR_IN: - case FI_SOCKADDR_IN6: - break; - default: - SOCK_LOG_ERROR("[sockd] %s: hints->addr_format = %d, supported = FI_SOCKADDR or FI_SOCKADDR_IN or FI_SOCKADDR_IN6\n", - __func__, hints->addr_format); - return -FI_ENODATA; - } - - if (hints->ep_attr) { - switch (hints->ep_attr->protocol) { - case FI_PROTO_UNSPEC: - break; - default: - /* - SOCK_LOG_ERROR("[sockd] %s: hints->ep_attr->protocol=%lu, supported=%d\n", - __func__, hints->ep_attr->protocol, FI_PROTO_UNSPEC); - */ - return -FI_ENODATA; - } - if (hints->ep_attr->max_msg_size > SOCKD_MTU) { - /* - SOCK_LOG_ERROR("[sockd] %s: hints->ep_attr->max_msg_size=%d, supported=%d\n", - __func__, hints->ep_attr->max_msg_size, SOCKD_MTU); - */ - return -FI_ENODATA; - } - if 
(hints->ep_attr->inject_size > SOCKD_MTU) { - /* - SOCK_LOG_ERROR("[sockd] %s: hints->ep_attr->inject_size=%d, supported=%d\n", - __func__, hints->ep_attr->inject_size, SOCKD_MTU); - */ - return -FI_ENODATA; - } - if (hints->ep_attr->total_buffered_recv > so_rcvbuf) { - /* - SOCK_LOG_ERROR("[sockd] %s: hints->ep_attr->total_buffered_recv=%d, supported=%d\n", - __func__, hints->ep_attr->total_buffered_recv, so_rcvbuf); - */ - return -FI_ENODATA; - } - /* FIXME: check - * max_order_raw_size, - * max_order_war_size, - * max_order_waw_size, - * mem_tag_format, - * msg_order */ - } - - if ((hints->caps & SOCK_EP_DGRAM_CAP) != hints->caps) { - /* - SOCK_LOG_ERROR("[sockd] %s: hints->ep_cap=0x%llx, supported=0x%llx\n", - __func__, hints->caps, SOCK_EP_DGRAM_CAP); - */ - return -FI_ENODATA; - } - - if (hints->tx_attr && ((hints->tx_attr->op_flags & SOCKD_OP_FLAGS) != hints->tx_attr->op_flags)) { - /* - SOCK_LOG_ERROR("[sockd] %s: hints->tx_attr->op_flags=0x%llx, supported=0x%llx\n", - __func__, hints->tx_attr->op_flags, SOCKD_OP_FLAGS); - */ - return -FI_ENODATA; - } - -#if 0 /* TODO */ - if ((hints->domain_cap & SOCKD_DOMAIN_CAP) != hints->domain_cap) { - SOCK_LOG_ERROR("[sockd] %s: hints->domain_cap=0x%llx, supported=0x%llx\n", - __func__, hints->domain_cap, SOCKD_DOMAIN_CAP); - return -FI_ENODATA; - /* FIXME: check - * threading, control_progress, mr_key_size, eq_data_size */ - } -#endif - - if (hints->fabric_attr) { - /* FIXME: check name */ - } - - struct sockaddr_in *si_src; - if (!hints->src_addr || !hints->src_addrlen) { - SOCK_LOG_ERROR("[sockd] src_addr and src_addrlen are required from hints\n"); - return -FI_ENODATA; - } else { - si_src = (struct sockaddr_in *)(hints->src_addr); - if (ntohs(si_src->sin_port)<1024) { - SOCK_LOG_ERROR("[sockd] port number should be above 1023\n"); - return -FI_ENODATA; - } - SOCK_LOG_ERROR("[sockd] port is set to %d\n", ntohs(si_src->sin_port)); - } - - return 0; -} - -/* TODO */ -struct fi_info *__fi_allocinfo() -{ - return calloc(1, sizeof(struct fi_info)); -} - -static struct fi_info* sockd_dupinfo(struct fi_info *hints) -{ - struct fi_info *fi; - if (!(fi = __fi_allocinfo())) { - goto err1; - } - - fi->next = NULL; - fi->ep_type = FI_EP_DGRAM; - - if (hints) { - fi->caps = hints->caps; - fi->addr_format = hints->addr_format; - } else { - fi->caps = SOCK_EP_DGRAM_CAP; - fi->addr_format = FI_SOCKADDR; - } - - fi->ep_attr = calloc(1, sizeof (struct fi_ep_attr)); - if (!fi->ep_attr) { - goto err2; - } - fi->ep_attr->protocol = FI_PROTO_UNSPEC; - if (hints && hints->ep_attr) { - fi->ep_attr->max_msg_size = hints->ep_attr->max_msg_size; - fi->ep_attr->inject_size = hints->ep_attr->inject_size; - fi->ep_attr->total_buffered_recv = hints->ep_attr->total_buffered_recv; - } else { - fi->ep_attr->max_msg_size = SOCKD_MTU; - fi->ep_attr->inject_size = SOCKD_MTU; - fi->ep_attr->total_buffered_recv = so_rcvbuf; - } - /* fi->ep_attr->mem_tag_format = fi_tag_format(max_tag_value); */ - /* fi->ep_attr->msg_order = FI_ORDER_SAS; */ - - fi->domain_attr = calloc(1, sizeof (struct fi_domain_attr)); - if (!fi->domain_attr) { - goto err3; - } - fi->domain_attr->name = strdup("socket"); - fi->domain_attr->threading = FI_THREAD_PROGRESS; - fi->domain_attr->control_progress = FI_PROGRESS_MANUAL; - fi->domain_attr->data_progress = FI_PROGRESS_MANUAL; /* FIXME: FI_PROGRESS_AUTO? 
*/ -/* TODO fi->domain_cap = SOCKD_DOMAIN_CAP; */ - - fi->fabric_attr = calloc(1, sizeof (struct fi_fabric_attr)); - if (!fi->fabric_attr) { - goto err4; - } - fi->fabric_attr->name = strdup("IP"); /* FIXME: fabric name for socket */ - fi->fabric_attr->prov_name = strdup("socket"); /* FIXME: fabric prov_name for socket */ - /* fi->fabric_attr->prov_version = PROVIDER_VERSION; */ - -#if 0 - if ((hints->ep_cap & FI_PASSIVE)) /* FIXME: FI_SOURCE? */ - sockd_info->ep_cap = FI_PASSIVE; -#endif - - if (hints && hints->src_addr) { - fi->src_addr = malloc(hints->src_addrlen); - if (!fi->src_addr) { - goto err5; - } - memcpy(fi->src_addr, hints->src_addr, hints->src_addrlen); - fi->src_addrlen = hints->src_addrlen; - } else { - SOCK_LOG_ERROR("[sockd] hints must have src_addr\n"); -#if 0 - fi->src_addr = NULL; - fi->src_addrlen = 0; -#endif - goto err6; - } - if (hints && hints->dest_addr) { - fi->dest_addr = malloc(hints->dest_addrlen); - if (!fi->dest_addr) { - goto err6; - } - memcpy(fi->dest_addr, hints->dest_addr, hints->dest_addrlen); - fi->dest_addrlen = hints->dest_addrlen; - } else { - fi->dest_addr = NULL; - fi->dest_addrlen = 0; - } - - fi->tx_attr = calloc(1, sizeof (struct fi_tx_attr)); - if (!fi->tx_attr) { - goto err7; - } - if (hints->tx_attr) - fi->tx_attr->op_flags = hints->tx_attr->op_flags; - else - fi->tx_attr->op_flags = SOCKD_OP_FLAGS; - - return fi; -err7: - free(fi->dest_addr); -err6: - free(fi->src_addr); -err5: - free(fi->fabric_attr); -err4: - free(fi->domain_attr); -err3: - free(fi->ep_attr); -err2: - free(fi); -err1: - return NULL; -} - -int sock_dgram_getinfo(uint32_t version, const char *node, const char *service, - uint64_t flags, struct fi_info *hints, struct fi_info **info) -{ - int ret = 0; - struct fi_info *sockd_info; - int sockfd = -1; - int optval; - socklen_t optlen; - *info = NULL; - -#if 0 - if (!(flags & FI_SOURCE)) { - /* FIXME: FI_SOURCE is required for DGRAM */ - fprintf(stderr, "[sockd] FI_SOURCE is required for EP_DGRAM\n"); - errno = EINVAL; - return -errno; - } -#endif - - /* solve user specified name or address */ - if (node || service) { - struct addrinfo *res; - struct addrinfo sock_hints = { - .ai_family = AF_INET, - .ai_socktype = SOCK_DGRAM, - .ai_protocol = IPPROTO_UDP - }; - ret = getaddrinfo(node, service, &sock_hints, &res); - if (ret) { - SOCK_LOG_ERROR("%s: couldn't getaddrinfo for (%s:%s):%s\n", __func__, node, service, gai_strerror(ret)); - return -FI_ENODATA; - } - freeaddrinfo(res); - } - - sockfd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); - if (sockfd < 0) { - SOCK_LOG_ERROR("%s: couldn't open DGRAM socket\n", __func__); - return -FI_ENODATA; - } - - optlen = sizeof(int); - getsockopt(sockfd, SOL_SOCKET, SO_RCVBUF, (int *)&optval, &optlen); - so_rcvbuf = optval; - - if (hints) { - ret = sockd_check_hints(hints); - if (ret) - return ret; - } - - - /* dup prov info */ - if (!(sockd_info = sockd_dupinfo(hints))) { - ret = -ENOMEM; - return ret; - } - - *info = sockd_info; - - close(sockfd); - return ret; -} - -/* sockd_fi_ops */ - -static int sockd_ep_close(fid_t fid) -{ - struct sock_ep *ep; - - ep = container_of(fid, struct sock_ep, ep.fid); - if (ep->sock_fd) - if (close(ep->sock_fd)) { - SOCK_LOG_ERROR("[sockd] cannot close sock_fd\n"); - return -FI_ENODATA; - } - - free(ep); - return 0; -} - -static int sockd_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) -{ - struct sock_ep *ep; - struct sock_cntr *cntr; - struct sock_eq *eq; - struct sock_cq *cq; - struct sock_av *av; - - ep = container_of(fid, struct sock_ep, 
ep.fid); - - switch (bfid->fclass) { - case FI_CLASS_CNTR: - SOCK_LOG_ERROR("[sockd] bind counter to ep\n"); - cntr = container_of(bfid, struct sock_cntr, cntr_fid.fid); - if (!(flags & - (FI_WRITE | FI_READ | FI_SEND | FI_RECV))) { - SOCK_LOG_ERROR("[sockd] Counter only support FI_WRITE | FI_READ | FI_SEND | FI_RECV\n"); - errno = FI_EINVAL; - return -errno; - } - if (flags & FI_WRITE) { - if (ep->write_cntr) - return -EINVAL; - ep->write_cntr = cntr; - } - if (flags & FI_READ) { - if (ep->read_cntr) - return -EINVAL; - ep->read_cntr = cntr; - } - if (flags & FI_SEND) { - if (ep->send_cntr) - return -EINVAL; - ep->send_cntr = cntr; - } - if (flags & FI_RECV) { - if (ep->recv_cntr) - return -EINVAL; - ep->recv_cntr = cntr; - } - break; - case FI_CLASS_CQ: - SOCK_LOG_ERROR("[sockd] bind CQ to ep\n"); - cq = container_of(bfid, struct sock_cq, cq_fid.fid); - if (!(flags & - (FI_SEND | FI_RECV))) { - SOCK_LOG_ERROR("[sockd] CQ only support FI_SEND | FI_RECV\n"); - errno = FI_EINVAL; - return -errno; - } - if (flags & FI_SEND) { - if (ep->send_cq) - return -EINVAL; - ep->send_cq = cq; - } - if (flags & FI_RECV) { - if (ep->recv_cq) - return -EINVAL; - ep->recv_cq = cq; - } -/* - if(enqueue_item(cq->ep_list, ep)) { - return -ENOMEM; - } -*/ - break; - case FI_CLASS_EQ: - SOCK_LOG_ERROR("[sockd] bind EQ to ep\n"); - /* FIXME: bind EQ to sockd EP */ - eq = container_of(bfid, struct sock_eq, eq.fid); - if (ep->eq) { - return -EINVAL; - } - ep->eq = eq; - break; - case FI_CLASS_AV: - SOCK_LOG_ERROR("[sockd] bind AV to ep\n"); - av = container_of(bfid, - struct sock_av, av_fid.fid); - if (ep->domain != av->dom) - return -EINVAL; - ep->av = av; - break; - default: - return -FI_ENOSYS; - } - - return 0; -} - -static int sockd_ep_control(fid_t fid, int command, void *arg) -{ - errno = FI_ENOSYS; - return -errno; -} - -static int sockd_ep_ops_open(struct fid *fid, const char *name, - uint64_t flags, void **ops, void *context) -{ - errno = FI_ENOSYS; - return -errno; -} - -/* sockd_ops_ep */ - -static int sockd_ep_enable(struct fid_ep *ep) -{ - struct sock_ep *sock_ep; - sock_ep = container_of(ep, struct sock_ep, ep); - if(!sock_ep) - return -FI_EINVAL; - - sock_ep->enabled = 1; - return 0; -} - -static ssize_t sockd_ep_cancel(fid_t fid, void *context) -{ - errno = FI_ENOSYS; - return -errno; -} - -static int sockd_ep_getopt(fid_t fid, int level, int optname, - void *optval, size_t *optlen) -{ - errno = FI_ENOSYS; - return -errno; -} - -static int sockd_ep_setopt(fid_t fid, int level, int optname, - const void *optval, size_t optlen) -{ - errno = FI_ENOSYS; - return -errno; -} - -static int sockd_ep_tx_ctx(struct fid_sep *sep, int index, - struct fi_tx_attr *attr, struct fid_ep **tx_ep, - void *context) -{ - errno = FI_ENOSYS; - return -errno; -} - - -static int sockd_ep_rx_ctx(struct fid_sep *sep, int index, - struct fi_rx_attr *attr, struct fid_ep **rx_ep, - void *context) -{ - errno = FI_ENOSYS; - return -errno; -} - -/* sockd_ops_cm */ - -static int sockd_cm_getname(fid_t fid, void *addr, size_t *addrlen) -{ - errno = FI_ENOSYS; - return -errno; -} - -static int sockd_cm_join(struct fid_ep *ep, void *addr, fi_addr_t *fi_addr, - uint64_t flags, void *context) -{ - errno = FI_ENOSYS; - return -errno; -} - -static int sockd_cm_leave(struct fid_ep *ep, void *addr, fi_addr_t fi_addr, - uint64_t flags) -{ - errno = FI_ENOSYS; - return -errno; -} - -/* sockd_ops_msg */ - -static ssize_t sockd_msg_recv(struct fid_ep *ep, void *buf, size_t len, void *desc, - fi_addr_t src_addr, void *context) -{ - struct 
sock_ep *sock_ep; - struct sock_req_item *recv_req; - - sock_ep = container_of(ep, struct sock_ep, ep); - if(!sock_ep) - return -FI_EINVAL; - - recv_req = calloc(1, sizeof(struct sock_req_item)); - if(!recv_req) - return -FI_ENOMEM; - - recv_req->item.buf = (void*)buf; - recv_req->req_type = SOCK_REQ_TYPE_RECV; - recv_req->comm_type = SOCK_COMM_TYPE_SENDTO; - recv_req->context = context; - recv_req->total_len = len; - recv_req->done_len = 0; - - if (sock_ep->av->attr.type == FI_AV_MAP) { - memcpy(&recv_req->addr, (void*)src_addr, sizeof(struct sockaddr_in)); - } else { - size_t idx; - idx = (size_t)src_addr; - if (idx > sock_ep->av->count-1 || idx < 0) { - return -EINVAL; - } - memcpy(&recv_req->addr, &sock_ep->av->table[idx], sizeof(struct sockaddr_in)); - } - - if(0 != enqueue_item(sock_ep->recv_list, recv_req)){ - free(recv_req); - return -FI_ENOMEM; - } - return 0; -} - -static ssize_t sockd_msg_recvv(struct fid_ep *ep, const struct iovec *iov, void **desc, - size_t count, fi_addr_t src_addr, void *context) -{ - errno = FI_ENOSYS; - return -errno; -} - -static ssize_t sockd_msg_recvmsg(struct fid_ep *ep, const struct fi_msg *msg, - uint64_t flags) -{ - errno = FI_ENOSYS; - return -errno; -} - -static ssize_t sockd_msg_send(struct fid_ep *ep, const void *buf, size_t len, void *desc, - fi_addr_t dest_addr, void *context) -{ - struct sock_ep *sock_ep; - struct sock_req_item *send_req; - - sock_ep = container_of(ep, struct sock_ep, ep); - if(!sock_ep) - return -FI_EINVAL; - - send_req = calloc(1, sizeof(struct sock_req_item)); - if(!send_req) - return -FI_ENOMEM; - - send_req->item.buf = (void*)buf; - send_req->req_type = SOCK_REQ_TYPE_SEND; - send_req->comm_type = SOCK_COMM_TYPE_SENDTO; - send_req->context = context; - send_req->total_len = len; - send_req->done_len = 0; - - if (sock_ep->av->attr.type == FI_AV_MAP) { - memcpy(&send_req->addr, (void*)dest_addr, sizeof(struct sockaddr_in)); - } else { - size_t idx; - idx = (size_t)dest_addr; - if (idx > sock_ep->av->count-1 || idx < 0) { - return -EINVAL; - } - memcpy(&send_req->addr, &sock_ep->av->table[idx], sizeof(struct sockaddr_in)); - } - - if(0 != enqueue_item(sock_ep->send_list, send_req)){ - free(send_req); - return -FI_ENOMEM; - } - return 0; -} - -static ssize_t sockd_msg_sendv(struct fid_ep *ep, const struct iovec *iov, void **desc, - size_t count, fi_addr_t dest_addr, void *context) -{ - errno = FI_ENOSYS; - return -errno; -} - -static ssize_t sockd_msg_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, - uint64_t flags) -{ - errno = FI_ENOSYS; - return -errno; -} - -static ssize_t sockd_msg_inject(struct fid_ep *ep, const void *buf, size_t len, - fi_addr_t dest_addr) -{ - errno = FI_ENOSYS; - return -errno; -} - -static ssize_t sockd_msg_senddata(struct fid_ep *ep, const void *buf, size_t len, void *desc, - uint64_t data, fi_addr_t dest_addr, void *context) -{ - errno = FI_ENOSYS; - return -errno; -} - -static struct fi_ops sockd_ep_fi_ops = { - .size = sizeof(struct fi_ops), - .close = sockd_ep_close, - .bind = sockd_ep_bind, - .control = sockd_ep_control, - .ops_open = sockd_ep_ops_open -}; - -static struct fi_ops_ep sockd_ops_ep = { - .size = sizeof(struct fi_ops_ep), - .cancel = sockd_ep_cancel, - .getopt = sockd_ep_getopt, - .setopt = sockd_ep_setopt, - .enable = sockd_ep_enable, - .tx_ctx = sockd_ep_tx_ctx, - .rx_ctx = sockd_ep_rx_ctx, -}; - -static struct fi_ops_cm sockd_ops_cm = { - .size = sizeof(struct fi_ops_cm), - .getname = sockd_cm_getname, - .getpeer = fi_no_getpeer, - .connect = fi_no_connect, - .listen = 
fi_no_listen, - .accept = fi_no_accept, - .reject = fi_no_reject, - .shutdown = fi_no_shutdown, - .join = sockd_cm_join, - .leave = sockd_cm_leave -}; - -static struct fi_ops_msg sockd_ops_msg = { - .size = sizeof(struct fi_ops_msg), - .recv = sockd_msg_recv, - .recvv = sockd_msg_recvv, - .recvmsg = sockd_msg_recvmsg, - .send = sockd_msg_send, - .sendv = sockd_msg_sendv, - .sendmsg = sockd_msg_sendmsg, - .inject = sockd_msg_inject, - .senddata = sockd_msg_senddata, - .injectdata = fi_no_msg_injectdata, -}; - -static inline int _sock_ep_dgram_progress(struct sock_ep *ep, struct sock_cq *cq) -{ - struct sock_req_item *item; - if((item = dequeue_item(ep->send_list))) { - SOCK_LOG_ERROR("[ep_dgram_progress] found a send req\n"); - } - if((item = dequeue_item(ep->recv_list))) { - SOCK_LOG_ERROR("[ep_dgram_progress] found a recv req\n"); - } - return -FI_ENOSYS; -} - -int sock_dgram_ep(struct fid_domain *domain, struct fi_info *info, - struct fid_ep **ep, void *context) -{ - SOCK_LOG_ERROR("[sockd] enter sock_dgram_ep\n"); - struct sock_ep *_ep; - struct sock_domain *_dom; - struct sockaddr_in si_me; - - _dom = container_of(domain, struct sock_domain, dom_fid); - if(!_dom) - return -FI_EINVAL; - - _ep = (struct sock_ep*)calloc(1, sizeof(*_ep)); - if(!_ep) - return -FI_ENOMEM; - - _ep->ep.fid.fclass = FI_CLASS_EP; - _ep->ep.fid.context = context; - _ep->ep.fid.ops = &sockd_ep_fi_ops; - _ep->ep.ops = &sockd_ops_ep; - _ep->ep.cm = &sockd_ops_cm; - _ep->ep.msg = &sockd_ops_msg; - _ep->ep.rma = NULL; - _ep->ep.tagged = NULL; - _ep->ep.atomic = NULL; - _ep->domain = _dom; - - _ep->sock_fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); - if (_ep->sock_fd < 0) { - SOCK_LOG_ERROR("%s: couldn't open DGRAM socket\n", __func__); - errno = FI_ENODATA; - goto err1; - } - - si_me.sin_family = AF_INET; - si_me.sin_port = ((struct sockaddr_in *)(info->src_addr))->sin_port; - si_me.sin_addr.s_addr = htonl(INADDR_ANY); - if (bind(_ep->sock_fd, &si_me, sizeof(si_me)) == -1) { - SOCK_LOG_ERROR("[sockd] %s: failed to bind sock_fd to port %d\n", __func__, ntohs(si_me.sin_port)); - goto err2; - } - - _ep->port_num = ntohs(si_me.sin_port); - - if(!(_ep->send_list = new_list(SOCK_CQ_DEF_SZ))) - goto err2; - - if(!(_ep->recv_list = new_list(SOCK_CQ_DEF_SZ))) - goto err3; - -/* - _ep->progress_fn = _sock_ep_dgram_progress; -*/ - - *ep = &_ep->ep; - - return 0; - -err3: - free_list(_ep->send_list); - -err2: - close(_ep->sock_fd); - -err1: - free(_ep); - - return -errno; -} diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_dom.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_dom.c index e27c8d6963..2c71e684bd 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_dom.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_dom.c @@ -40,6 +40,8 @@ #include "sock.h" #include "sock_util.h" +extern const char const sock_dom_name[]; + const struct fi_domain_attr sock_domain_attr = { .name = NULL, .threading = FI_THREAD_SAFE, @@ -48,8 +50,8 @@ const struct fi_domain_attr sock_domain_attr = { .mr_key_size = 0, .cq_data_size = sizeof(uint64_t), .ep_cnt = SOCK_EP_MAX_EP_CNT, - .tx_ctx_cnt = 0, - .rx_ctx_cnt = 0, + .tx_ctx_cnt = SOCK_EP_MAX_TX_CNT, + .rx_ctx_cnt = SOCK_EP_MAX_RX_CNT, .max_ep_tx_ctx = SOCK_EP_MAX_TX_CNT, .max_ep_rx_ctx = SOCK_EP_MAX_RX_CNT, }; @@ -77,9 +79,9 @@ int sock_verify_domain_attr(struct fi_domain_attr *attr) switch (attr->control_progress){ case FI_PROGRESS_UNSPEC: case FI_PROGRESS_AUTO: + case FI_PROGRESS_MANUAL: break; - case FI_PROGRESS_MANUAL: 
default: SOCK_LOG_INFO("Control progress mode not supported!\n"); return -FI_ENODATA; @@ -88,9 +90,9 @@ int sock_verify_domain_attr(struct fi_domain_attr *attr) switch (attr->data_progress){ case FI_PROGRESS_UNSPEC: case FI_PROGRESS_AUTO: + case FI_PROGRESS_MANUAL: break; - case FI_PROGRESS_MANUAL: default: SOCK_LOG_INFO("Data progress mode not supported!\n"); return -FI_ENODATA; @@ -114,11 +116,25 @@ int sock_verify_domain_attr(struct fi_domain_attr *attr) static int sock_dom_close(struct fid *fid) { struct sock_domain *dom; + void *res; dom = container_of(fid, struct sock_domain, dom_fid.fid); - if (atomic_get(&dom->ref)) + if (atomic_get(&dom->ref)) { return -FI_EBUSY; + } + dom->listening = 0; + if (pthread_join(dom->listen_thread, &res)) { + SOCK_LOG_ERROR("could not join listener thread, errno = %d\n", errno); + return -FI_EBUSY; + } + + if (dom->u_cmap.size) + sock_conn_map_destroy(&dom->u_cmap); + if (dom->r_cmap.size) + sock_conn_map_destroy(&dom->r_cmap); + + sock_pe_finalize(dom->pe); fastlock_destroy(&dom->lock); free(dom); return 0; @@ -141,7 +157,7 @@ static int sock_mr_close(struct fid *fid) struct sock_mr *mr; mr = container_of(fid, struct sock_mr, mr_fid.fid); - dom = mr->dom; + dom = mr->domain; fastlock_acquire(&dom->lock); idm_clear(&dom->mr_idm , (int) mr->mr_fid.key); fastlock_release(&dom->lock); @@ -150,37 +166,71 @@ static int sock_mr_close(struct fid *fid) return 0; } +static int sock_mr_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + struct sock_cntr *cntr; + struct sock_cq *cq; + struct sock_mr *mr; + + mr = container_of(fid, struct sock_mr, mr_fid.fid); + switch (bfid->fclass) { + case FI_CLASS_CQ: + cq = container_of(bfid, struct sock_cq, cq_fid.fid); + assert(mr->domain == cq->domain); + mr->cq = cq; + break; + + case FI_CLASS_CNTR: + cntr = container_of(bfid, struct sock_cntr, cntr_fid.fid); + assert(mr->domain == cntr->domain); + mr->cntr = cntr; + break; + + default: + return -FI_EINVAL; + } + return 0; +} + static struct fi_ops sock_mr_fi_ops = { .size = sizeof(struct fi_ops), .close = sock_mr_close, - .bind = fi_no_bind, + .bind = sock_mr_bind, .control = fi_no_control, .ops_open = fi_no_ops_open, }; -int sock_mr_verify_key(struct sock_domain *domain, uint16_t key, - void *buf, size_t len, uint64_t access) +struct sock_mr * sock_mr_get_entry(struct sock_domain *domain, uint16_t key) +{ + return (struct sock_mr *)idm_lookup(&domain->mr_idm, key); +} + +struct sock_mr *sock_mr_verify_key(struct sock_domain *domain, uint16_t key, + void *buf, size_t len, uint64_t access) { int i; struct sock_mr *mr; mr = idm_lookup(&domain->mr_idm, key); if (!mr) - return -FI_EINVAL; + return NULL; + + if (mr->flags & FI_MR_OFFSET) + buf = (char*)buf + mr->offset; for (i = 0; i < mr->iov_count; i++) { if ((uintptr_t)buf >= (uintptr_t)mr->mr_iov[i].iov_base && ((uintptr_t)buf + len <= (uintptr_t) mr->mr_iov[i].iov_base + mr->mr_iov[i].iov_len)) { if ((access & mr->access) == access) - return 0; + return mr; } } SOCK_LOG_ERROR("MR check failed\n"); - return -FI_EINVAL; + return NULL; } -int sock_mr_verify_desc(struct sock_domain *domain, void *desc, +struct sock_mr *sock_mr_verify_desc(struct sock_domain *domain, void *desc, void *buf, size_t len, uint64_t access) { uint64_t key = (uint64_t)desc; @@ -209,9 +259,9 @@ static int sock_regattr(struct fid_domain *domain, const struct fi_mr_attr *attr _mr->mr_fid.fid.context = attr->context; _mr->mr_fid.fid.ops = &sock_mr_fi_ops; - atomic_inc(&dom->ref); - _mr->dom = dom; + _mr->domain = dom; _mr->access = 
attr->access; + _mr->flags = flags; _mr->offset = (flags & FI_MR_OFFSET) ? attr->offset : (uintptr_t) attr->mr_iov[0].iov_base; @@ -228,6 +278,7 @@ static int sock_regattr(struct fid_domain *domain, const struct fi_mr_attr *attr memcpy(&_mr->mr_iov, attr->mr_iov, sizeof(_mr->mr_iov) * attr->iov_count); *mr = &_mr->mr_fid; + atomic_inc(&dom->ref); if (dom->mr_eq) { eq_entry.fid = &domain->fid; @@ -240,7 +291,6 @@ static int sock_regattr(struct fid_domain *domain, const struct fi_mr_attr *attr err: fastlock_release(&dom->lock); - atomic_dec(&dom->ref); free(_mr); return -errno; } @@ -299,6 +349,23 @@ int sock_endpoint(struct fid_domain *domain, struct fi_info *info, return sock_rdm_ep(domain, info, ep, context); case FI_EP_DGRAM: return sock_dgram_ep(domain, info, ep, context); + case FI_EP_MSG: + return sock_msg_ep(domain, info, ep, context); + default: + return -FI_ENOPROTOOPT; + } +} + +int sock_scalable_ep(struct fid_domain *domain, struct fi_info *info, + struct fid_sep **sep, void *context) +{ + switch (info->ep_type) { + case FI_EP_RDM: + return sock_rdm_sep(domain, info, sep, context); + case FI_EP_DGRAM: + return sock_dgram_sep(domain, info, sep, context); + case FI_EP_MSG: + return sock_msg_sep(domain, info, sep, context); default: return -FI_ENOPROTOOPT; } @@ -317,9 +384,12 @@ static struct fi_ops_domain sock_dom_ops = { .av_open = sock_av_open, .cq_open = sock_cq_open, .endpoint = sock_endpoint, + .scalable_ep = sock_scalable_ep, .cntr_open = sock_cntr_open, .wait_open = sock_wait_open, .poll_open = sock_poll_open, + .stx_ctx = sock_stx_ctx, + .srx_ctx = sock_srx_ctx, }; static struct fi_ops_mr sock_dom_mr_ops = { @@ -329,54 +399,6 @@ static struct fi_ops_mr sock_dom_mr_ops = { .regattr = sock_regattr, }; -int _sock_verify_domain_attr(struct fi_domain_attr *attr) -{ - if(attr->name){ - if (strcmp(attr->name, sock_dom_name)) - return -FI_ENODATA; - } - - switch(attr->threading){ - case FI_THREAD_UNSPEC: - case FI_THREAD_SAFE: - case FI_THREAD_PROGRESS: - break; - default: - SOCK_LOG_INFO("Invalid threading model!\n"); - return -FI_ENODATA; - } - - switch (attr->control_progress){ - case FI_PROGRESS_UNSPEC: - case FI_PROGRESS_AUTO: - break; - - case FI_PROGRESS_MANUAL: - default: - SOCK_LOG_INFO("Control progress mode not supported!\n"); - return -FI_ENODATA; - } - - switch (attr->data_progress){ - case FI_PROGRESS_UNSPEC: - case FI_PROGRESS_AUTO: - break; - - case FI_PROGRESS_MANUAL: - default: - SOCK_LOG_INFO("Data progress mode not supported!\n"); - return -FI_ENODATA; - } - - if(attr->max_ep_tx_ctx > SOCK_EP_MAX_TX_CNT) - return -FI_ENODATA; - - if(attr->max_ep_rx_ctx > SOCK_EP_MAX_RX_CNT) - return -FI_ENODATA; - - return 0; -} - int sock_domain(struct fid_fabric *fabric, struct fi_info *info, struct fid_domain **dom, void *context) { @@ -384,7 +406,7 @@ int sock_domain(struct fid_fabric *fabric, struct fi_info *info, struct sock_domain *sock_domain; if(info && info->domain_attr){ - ret = _sock_verify_domain_attr(info->domain_attr); + ret = sock_verify_domain_attr(info->domain_attr); if(ret) return ret; } @@ -396,12 +418,47 @@ int sock_domain(struct fid_fabric *fabric, struct fi_info *info, fastlock_init(&sock_domain->lock); atomic_init(&sock_domain->ref, 0); + if(info && info->src_addr) { + if (getnameinfo(info->src_addr, info->src_addrlen, NULL, 0, + sock_domain->service, + sizeof(sock_domain->service), + NI_NUMERICSERV)) { + SOCK_LOG_ERROR("could not resolve src_addr\n"); + goto err; + } + sock_domain->info = *info; + } else { + SOCK_LOG_ERROR("invalid fi_info\n"); + goto 
err; + } + sock_domain->dom_fid.fid.fclass = FI_CLASS_DOMAIN; sock_domain->dom_fid.fid.context = context; sock_domain->dom_fid.fid.ops = &sock_dom_fi_ops; sock_domain->dom_fid.ops = &sock_dom_ops; sock_domain->dom_fid.mr = &sock_dom_mr_ops; + if (!info || !info->domain_attr || + info->domain_attr->data_progress == FI_PROGRESS_UNSPEC) + sock_domain->progress_mode = FI_PROGRESS_AUTO; + else + sock_domain->progress_mode = info->domain_attr->data_progress; + + sock_domain->pe = sock_pe_init(sock_domain); + if(!sock_domain->pe){ + SOCK_LOG_ERROR("Failed to init PE\n"); + goto err; + } + + sock_domain->r_cmap.domain = sock_domain; + sock_domain->u_cmap.domain = sock_domain; + + sock_conn_listen(sock_domain); + *dom = &sock_domain->dom_fid; return 0; + +err: + free(sock_domain); + return -FI_EINVAL; } diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep.c index ba4193beca..28b96d701e 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep.c @@ -38,212 +38,1100 @@ #include #include "sock.h" +#include "sock_util.h" -int _sock_verify_ep_attr(struct fi_ep_attr *attr) +extern struct fi_ops_rma sock_ep_rma; +extern struct fi_ops_msg sock_ep_msg_ops; +extern struct fi_ops_tagged sock_ep_tagged; +extern struct fi_ops_atomic sock_ep_atomic; + +extern struct fi_ops_cm sock_ep_cm_ops; +extern struct fi_ops_ep sock_ep_ops; +extern struct fi_ops sock_ep_fi_ops; +extern struct fi_ops_ep sock_ctx_ep_ops; +extern struct fi_ops sock_ctx_ops; + +extern const struct fi_domain_attr sock_domain_attr; +extern const struct fi_fabric_attr sock_fabric_attr; + +extern const char const sock_fab_name[]; +extern const char const sock_dom_name[]; + +static int sock_ctx_close(struct fid *fid) { - switch (attr->protocol) { - case FI_PROTO_UNSPEC: - case FI_PROTO_SOCK_RDS: + struct sock_ep *ep; + struct dlist_entry *entry; + struct sock_tx_ctx *tx_ctx; + struct sock_rx_ctx *rx_ctx; + + switch (fid->fclass) { + case FI_CLASS_TX_CTX: + tx_ctx = container_of(fid, struct sock_tx_ctx, ctx.fid); + + for (entry = tx_ctx->ep_list.next; entry != &tx_ctx->ep_list; + entry = entry->next) { + ep = container_of(entry, struct sock_ep, tx_ctx_entry); + atomic_dec(&ep->num_tx_ctx); + } + sock_tx_ctx_free(tx_ctx); break; + + case FI_CLASS_RX_CTX: + rx_ctx = container_of(fid, struct sock_rx_ctx, ctx.fid); + + for (entry = rx_ctx->ep_list.next; entry != &rx_ctx->ep_list; + entry = entry->next) { + ep = container_of(entry, struct sock_ep, rx_ctx_entry); + atomic_dec(&ep->num_rx_ctx); + } + sock_rx_ctx_free(rx_ctx); + break; + + case FI_CLASS_STX_CTX: + tx_ctx = container_of(fid, struct sock_tx_ctx, stx.fid); + atomic_dec(&tx_ctx->domain->ref); + sock_tx_ctx_free(tx_ctx); + break; + + case FI_CLASS_SRX_CTX: + rx_ctx = container_of(fid, struct sock_rx_ctx, ctx.fid); + atomic_dec(&rx_ctx->domain->ref); + sock_rx_ctx_free(rx_ctx); + break; + default: - return -FI_ENODATA; + SOCK_LOG_ERROR("Invalid fid\n"); + return -FI_EINVAL; + } + return 0; +} + +static int sock_ctx_bind_cq(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + struct sock_cq *sock_cq; + struct sock_tx_ctx *tx_ctx; + struct sock_rx_ctx *rx_ctx; + + sock_cq = container_of(bfid, struct sock_cq, cq_fid.fid); + switch (fid->fclass) { + case FI_CLASS_TX_CTX: + tx_ctx = container_of(fid, struct sock_tx_ctx, ctx); + if (flags & FI_SEND) { + tx_ctx->comp.send_cq = sock_cq; + if (flags & FI_COMPLETION) + 
tx_ctx->comp.send_cq_event = 1; + } + + if (flags & FI_READ) { + tx_ctx->comp.read_cq = sock_cq; + if (flags & FI_COMPLETION) + tx_ctx->comp.read_cq_event = 1; + } + + if (flags & FI_WRITE) { + tx_ctx->comp.write_cq = sock_cq; + if (flags & FI_COMPLETION) + tx_ctx->comp.write_cq_event = 1; + } + + if (!tx_ctx->progress) { + tx_ctx->progress = 1; + sock_pe_add_tx_ctx(tx_ctx->domain->pe, tx_ctx); + } + dlist_insert_tail(&tx_ctx->cq_entry, &sock_cq->tx_list); + break; + + case FI_CLASS_RX_CTX: + rx_ctx = container_of(fid, struct sock_rx_ctx, ctx.fid); + if (flags & FI_RECV) { + rx_ctx->comp.recv_cq = sock_cq; + if (flags & FI_COMPLETION) + rx_ctx->comp.recv_cq_event = 1; + } + + if (flags & FI_REMOTE_READ) { + rx_ctx->comp.rem_read_cq = sock_cq; + if (flags & FI_COMPLETION) + rx_ctx->comp.rem_read_cq_event = 1; + } + + if (flags & FI_REMOTE_WRITE) { + rx_ctx->comp.rem_write_cq = sock_cq; + if (flags & FI_COMPLETION) + rx_ctx->comp.rem_write_cq_event = 1; + } + + if (!rx_ctx->progress) { + rx_ctx->progress = 1; + sock_pe_add_rx_ctx(rx_ctx->domain->pe, rx_ctx); + } + dlist_insert_tail(&rx_ctx->cq_entry, &sock_cq->rx_list); + break; + + case FI_CLASS_STX_CTX: + tx_ctx = container_of(fid, struct sock_tx_ctx, stx.fid); + if (flags & FI_SEND) { + tx_ctx->comp.send_cq = sock_cq; + if (flags & FI_COMPLETION) + tx_ctx->comp.send_cq_event = 1; + } + + if (flags & FI_READ) { + tx_ctx->comp.read_cq = sock_cq; + if (flags & FI_COMPLETION) + tx_ctx->comp.read_cq_event = 1; + } + + if (flags & FI_WRITE) { + tx_ctx->comp.write_cq = sock_cq; + if (flags & FI_COMPLETION) + tx_ctx->comp.write_cq_event = 1; + } + + if (!tx_ctx->progress) { + tx_ctx->progress = 1; + sock_pe_add_tx_ctx(tx_ctx->domain->pe, tx_ctx); + } + dlist_insert_tail(&tx_ctx->cq_entry, &sock_cq->tx_list); + break; + + default: + SOCK_LOG_ERROR("Invalid fid\n"); + return -FI_EINVAL; + } + return 0; +} + +static int sock_ctx_bind_cntr(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + struct sock_cntr *cntr; + struct sock_tx_ctx *tx_ctx; + struct sock_rx_ctx *rx_ctx; + + cntr = container_of(bfid, struct sock_cntr, cntr_fid.fid); + switch (fid->fclass) { + case FI_CLASS_TX_CTX: + tx_ctx = container_of(fid, struct sock_tx_ctx, ctx.fid); + if (flags & FI_SEND) + tx_ctx->comp.send_cntr = cntr; + + if (flags & FI_READ) + tx_ctx->comp.read_cntr = cntr; + + if (flags & FI_WRITE) + tx_ctx->comp.write_cntr = cntr; + + if (!tx_ctx->progress) { + tx_ctx->progress = 1; + sock_pe_add_tx_ctx(tx_ctx->domain->pe, tx_ctx); + } + dlist_insert_tail(&tx_ctx->cntr_entry, &cntr->tx_list); + + break; + + case FI_CLASS_RX_CTX: + rx_ctx = container_of(fid, struct sock_rx_ctx, ctx.fid); + if (flags & FI_RECV) + rx_ctx->comp.recv_cntr = cntr; + + if (flags & FI_REMOTE_READ) + rx_ctx->comp.rem_read_cntr = cntr; + + if (flags & FI_REMOTE_WRITE) + rx_ctx->comp.rem_write_cntr = cntr; + + if (!rx_ctx->progress) { + rx_ctx->progress = 1; + sock_pe_add_rx_ctx(rx_ctx->domain->pe, rx_ctx); + } + dlist_insert_tail(&rx_ctx->cntr_entry, &cntr->rx_list); + break; + + case FI_CLASS_STX_CTX: + tx_ctx = container_of(fid, struct sock_tx_ctx, ctx.fid); + if (flags & FI_SEND) + tx_ctx->comp.send_cntr = cntr; + + if (flags & FI_READ) + tx_ctx->comp.read_cntr = cntr; + + if (flags & FI_WRITE) + tx_ctx->comp.write_cntr = cntr; + + if (!tx_ctx->progress) { + tx_ctx->progress = 1; + sock_pe_add_tx_ctx(tx_ctx->domain->pe, tx_ctx); + } + dlist_insert_tail(&tx_ctx->cntr_entry, &cntr->tx_list); + + break; + + default: + SOCK_LOG_ERROR("Invalid fid\n"); + return -FI_EINVAL; + } + return 0; +} + 
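The bind helpers above, and the sock_ctx_bind() hunk that follows, all use the same idiom: recover the provider-specific object from the generic fid with container_of() and then dispatch on fid->fclass (FI_CLASS_TX_CTX, FI_CLASS_RX_CTX, FI_CLASS_STX_CTX, ...). Below is a minimal standalone sketch of that idiom under simplified, stand-in types; the demo_* names are illustrative only and are not the real libfabric structures or part of this patch.

    /* Sketch of the container_of + fclass dispatch idiom; demo_* types are
     * simplified stand-ins for the provider's sock_tx_ctx / sock_rx_ctx. */
    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
            ((type *) ((char *) (ptr) - offsetof(type, member)))

    enum { DEMO_CLASS_TX_CTX = 1, DEMO_CLASS_RX_CTX = 2 };

    struct demo_fid { int fclass; };

    struct demo_tx_ctx {
            struct demo_fid fid;
            int enabled;
    };

    struct demo_rx_ctx {
            struct demo_fid fid;
            int enabled;
    };

    /* Enable whichever context the caller handed in, based on its class. */
    static int demo_ctx_enable(struct demo_fid *fid)
    {
            struct demo_tx_ctx *tx_ctx;
            struct demo_rx_ctx *rx_ctx;

            switch (fid->fclass) {
            case DEMO_CLASS_TX_CTX:
                    tx_ctx = container_of(fid, struct demo_tx_ctx, fid);
                    tx_ctx->enabled = 1;
                    return 0;
            case DEMO_CLASS_RX_CTX:
                    rx_ctx = container_of(fid, struct demo_rx_ctx, fid);
                    rx_ctx->enabled = 1;
                    return 0;
            default:
                    return -1;  /* the provider returns -FI_EINVAL here */
            }
    }

    int main(void)
    {
            struct demo_tx_ctx tx = { .fid = { .fclass = DEMO_CLASS_TX_CTX } };
            demo_ctx_enable(&tx.fid);
            printf("tx enabled: %d\n", tx.enabled);
            return 0;
    }

The same recover-then-switch shape carries through the rest of the patch (sock_ctx_close, sock_ctx_control, sock_ep_cancel), which is why the bound object only needs to expose its fid member.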
+static int sock_ctx_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + switch (bfid->fclass) { + case FI_CLASS_CQ: + return sock_ctx_bind_cq(fid, bfid, flags); + + case FI_CLASS_CNTR: + return sock_ctx_bind_cntr(fid, bfid, flags); + + default: + SOCK_LOG_ERROR("Invalid bind()\n"); + return -FI_EINVAL; } - if(attr->max_msg_size > SOCK_EP_MAX_MSG_SZ) - return -FI_ENODATA; +} - if(attr->inject_size > SOCK_EP_MAX_INJECT_SZ) - return -FI_ENODATA; +static int sock_ctx_control(struct fid *fid, int command, void *arg) +{ + struct sock_tx_ctx *tx_ctx; + struct sock_rx_ctx *rx_ctx; - if(attr->total_buffered_recv > SOCK_EP_MAX_BUFF_RECV) - return -FI_ENODATA; + switch (fid->fclass) { + case FI_CLASS_TX_CTX: + tx_ctx = container_of(fid, struct sock_tx_ctx, ctx.fid); + switch (command) { + case FI_GETOPSFLAG: + *(uint64_t*)arg = tx_ctx->attr.op_flags; + break; + case FI_SETOPSFLAG: + tx_ctx->attr.op_flags = (uint64_t)arg; + break; + default: + return -FI_EINVAL; + } + break; + case FI_CLASS_RX_CTX: + rx_ctx = container_of(fid, struct sock_rx_ctx, ctx.fid); + switch (command) { + case FI_GETOPSFLAG: + *(uint64_t*)arg = rx_ctx->attr.op_flags; + break; + case FI_SETOPSFLAG: + rx_ctx->attr.op_flags = (uint64_t)arg; + break; + default: + return -FI_EINVAL; + } + break; + + case FI_CLASS_STX_CTX: + tx_ctx = container_of(fid, struct sock_tx_ctx, stx.fid); + switch (command) { + case FI_GETOPSFLAG: + *(uint64_t*)arg = tx_ctx->attr.op_flags; + break; + case FI_SETOPSFLAG: + tx_ctx->attr.op_flags = (uint64_t)arg; + break; + default: + return -FI_EINVAL; + } + break; + + default: + return -FI_EINVAL; + } + + return 0; +} + +struct fi_ops sock_ctx_ops = { + .size = sizeof(struct fi_ops), + .close = sock_ctx_close, + .bind = sock_ctx_bind, + .control = sock_ctx_control, +}; + +static int sock_ctx_enable(struct fid_ep *ep) +{ + struct sock_tx_ctx *tx_ctx; + struct sock_rx_ctx *rx_ctx; + + switch (ep->fid.fclass) { + case FI_CLASS_RX_CTX: + rx_ctx = container_of(ep, struct sock_rx_ctx, ctx.fid); + rx_ctx->enabled = 1; + return 0; + + case FI_CLASS_TX_CTX: + tx_ctx = container_of(ep, struct sock_tx_ctx, ctx.fid); + tx_ctx->enabled = 1; + return 0; + + default: + SOCK_LOG_ERROR("Invalid CTX\n"); + break; + } + return -FI_EINVAL; +} + +static int sock_ctx_getopt(fid_t fid, int level, int optname, + void *optval, size_t *optlen) +{ + struct sock_rx_ctx *rx_ctx; + rx_ctx = container_of(fid, struct sock_rx_ctx, ctx.fid); + + if (level != FI_OPT_ENDPOINT) + return -ENOPROTOOPT; + + switch (optname) { + case FI_OPT_MIN_MULTI_RECV: + *(size_t *)optval = rx_ctx->min_multi_recv; + *optlen = sizeof(size_t); + break; + + default: + return -FI_ENOPROTOOPT; + } + return 0; +} + +static int sock_ctx_setopt(fid_t fid, int level, int optname, + const void *optval, size_t optlen) +{ + struct sock_rx_ctx *rx_ctx; + rx_ctx = container_of(fid, struct sock_rx_ctx, ctx.fid); + + if (level != FI_OPT_ENDPOINT) + return -ENOPROTOOPT; + + switch (optname) { + case FI_OPT_MIN_MULTI_RECV: + rx_ctx->min_multi_recv = *(size_t *)optval; + break; + + default: + return -ENOPROTOOPT; + } return 0; } static ssize_t sock_ep_cancel(fid_t fid, void *context) { - return -FI_ENOSYS; + int ret; + struct sock_rx_ctx *rx_ctx; + struct sock_rx_entry *rx_entry; + struct sock_ep *sock_ep; + struct dlist_entry *entry; + + switch (fid->fclass) { + case FI_CLASS_EP: + sock_ep = container_of(fid, struct sock_ep, ep.fid); + rx_ctx = sock_ep->rx_ctx; + break; + + case FI_CLASS_RX_CTX: + case FI_CLASS_SRX_CTX: + rx_ctx = container_of(fid, struct sock_rx_ctx, 
ctx.fid); + break; + + default: + SOCK_LOG_ERROR("Invalid ep type\n"); + return -FI_EINVAL; + } + + ret = -FI_ENOENT; + fastlock_acquire(&rx_ctx->lock); + for (entry = rx_ctx->rx_entry_list.next; + entry != &rx_ctx->rx_entry_list; entry = entry->next) { + + rx_entry = container_of(entry, struct sock_rx_entry, entry); + if (rx_entry->is_busy || rx_entry->used) + continue; + + if ((uint64_t)context == rx_entry->context) { + dlist_remove(&rx_entry->entry); + sock_rx_release_entry(rx_entry); + ret = 0; + break; + } + } + fastlock_release(&rx_ctx->lock); + return ret; } -static int sock_ep_getopt(fid_t fid, int level, int optname, - void *optval, size_t *optlen) +struct fi_ops_ep sock_ctx_ep_ops = { + .size = sizeof(struct fi_ops_ep), + .enable = sock_ctx_enable, + .cancel = sock_ep_cancel, + .getopt = sock_ctx_getopt, + .setopt = sock_ctx_setopt, + .tx_ctx = fi_no_tx_ctx, + .rx_ctx = fi_no_rx_ctx, +}; + +static int sock_ep_close(struct fid *fid) { - return -FI_ENOSYS; + struct sock_ep *sock_ep; + sock_ep = container_of(fid, struct sock_ep, ep.fid); + + if (atomic_get(&sock_ep->ref) || atomic_get(&sock_ep->num_rx_ctx) || + atomic_get(&sock_ep->num_tx_ctx)) + return -FI_EBUSY; + + if (sock_ep->tx_array[sock_ep->ep_attr.tx_ctx_cnt]) + sock_tx_ctx_free(sock_ep->tx_array[sock_ep->ep_attr.tx_ctx_cnt]); + if (sock_ep->rx_array[sock_ep->ep_attr.rx_ctx_cnt]) + sock_rx_ctx_free(sock_ep->rx_array[sock_ep->ep_attr.rx_ctx_cnt]); + + free(sock_ep->tx_array); + free(sock_ep->rx_array); + + if (sock_ep->src_addr) + free(sock_ep->src_addr); + if (sock_ep->dest_addr) + free(sock_ep->dest_addr); + + free(sock_ep); + return 0; } -static int sock_ep_setopt(fid_t fid, int level, int optname, - const void *optval, size_t optlen) +static int sock_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) { - return -FI_ENOSYS; + int ret, i; + struct sock_ep *ep; + struct sock_cq *cq; + struct sock_av *av; + struct sock_cntr *cntr; + struct sock_tx_ctx *tx_ctx; + struct sock_rx_ctx *rx_ctx; + + ep = container_of(fid, struct sock_ep, ep.fid); + + switch (bfid->fclass) { + case FI_CLASS_EQ: + return -FI_EINVAL; + + case FI_CLASS_MR: + return -FI_EINVAL; + + case FI_CLASS_CQ: + cq = container_of(bfid, struct sock_cq, cq_fid.fid); + assert(ep->domain == cq->domain); + + if (flags & FI_SEND) { + ep->comp.send_cq = cq; + if (flags & FI_COMPLETION) + ep->comp.send_cq_event = 1; + } + + if (flags & FI_READ) { + ep->comp.read_cq = cq; + if (flags & FI_COMPLETION) + ep->comp.read_cq_event = 1; + } + + if (flags & FI_WRITE) { + ep->comp.write_cq = cq; + if (flags & FI_COMPLETION) + ep->comp.write_cq_event = 1; + } + + if (flags & FI_RECV) { + ep->comp.recv_cq = cq; + if (flags & FI_COMPLETION) + ep->comp.recv_cq_event = 1; + } + + if (flags & FI_REMOTE_READ) { + ep->comp.rem_read_cq = cq; + if (flags & FI_COMPLETION) + ep->comp.rem_read_cq_event = 1; + } + + if (flags & FI_REMOTE_WRITE) { + ep->comp.rem_write_cq = cq; + if (flags & FI_COMPLETION) + ep->comp.rem_write_cq_event = 1; + } + + if (flags & FI_SEND || flags & FI_WRITE || flags & FI_READ) { + for (i=0; i<=ep->ep_attr.tx_ctx_cnt; i++) { + tx_ctx = ep->tx_array[i]; + + if (!tx_ctx) + continue; + + if ((ret = sock_ctx_bind_cq(&tx_ctx->ctx.fid, + bfid, flags))) + return ret; + } + } + + if (flags & FI_RECV || flags & FI_REMOTE_READ || + flags & FI_REMOTE_WRITE) { + for (i=0; i<=ep->ep_attr.rx_ctx_cnt; i++) { + rx_ctx = ep->rx_array[i]; + + if (!rx_ctx) + continue; + + if ((ret = sock_ctx_bind_cq(&rx_ctx->ctx.fid, + bfid, flags))) + return ret; + } + } + break; + + case 
FI_CLASS_CNTR: + cntr = container_of(bfid, struct sock_cntr, cntr_fid.fid); + assert(ep->domain == cntr->domain); + + if (flags & FI_SEND) + ep->comp.send_cntr = cntr; + + if (flags & FI_RECV) + ep->comp.recv_cntr = cntr; + + if (flags & FI_READ) + ep->comp.read_cntr = cntr; + + if (flags & FI_WRITE) + ep->comp.write_cntr = cntr; + + if (flags & FI_REMOTE_READ) + ep->comp.rem_read_cntr = cntr; + + if (flags & FI_REMOTE_WRITE) + ep->comp.rem_write_cntr = cntr; + + if (flags & FI_SEND || flags & FI_WRITE || flags & FI_READ) { + for (i=0; i<=ep->ep_attr.tx_ctx_cnt; i++) { + tx_ctx = ep->tx_array[i]; + + if (!tx_ctx) + continue; + + if ((ret = sock_ctx_bind_cntr(&tx_ctx->ctx.fid, + bfid, flags))) + return ret; + } + } + + if (flags & FI_RECV || flags & FI_REMOTE_READ || + flags & FI_REMOTE_WRITE) { + for (i=0; i<=ep->ep_attr.rx_ctx_cnt; i++) { + rx_ctx = ep->rx_array[i]; + + if (!rx_ctx) + continue; + + if ((ret = sock_ctx_bind_cntr(&rx_ctx->ctx.fid, + bfid, flags))) + return ret; + } + } + break; + + case FI_CLASS_AV: + av = container_of(bfid, struct sock_av, av_fid.fid); + assert(ep->domain == av->domain); + + ep->av = av; + av->cmap = &av->domain->r_cmap; + + if (ep->tx_ctx && + ep->tx_ctx->ctx.fid.fclass == FI_CLASS_TX_CTX) { + ep->tx_ctx->av = av; + } + + if (ep->rx_ctx && + ep->rx_ctx->ctx.fid.fclass == FI_CLASS_RX_CTX) + ep->rx_ctx->av = av; + + for (i=0; iep_attr.tx_ctx_cnt; i++) { + if (ep->tx_array[i]) + ep->tx_array[i]->av = av; + } + + for (i=0; iep_attr.rx_ctx_cnt; i++) { + if (ep->rx_array[i]) + ep->rx_array[i]->av = av; + } + + break; + + case FI_CLASS_STX_CTX: + tx_ctx = container_of(bfid, struct sock_tx_ctx, stx.fid); + dlist_insert_tail(&ep->tx_ctx_entry, &tx_ctx->ep_list); + ep->tx_ctx = tx_ctx; + ep->tx_array[ep->ep_attr.tx_ctx_cnt] = tx_ctx; + break; + + case FI_CLASS_SRX_CTX: + rx_ctx = container_of(bfid, struct sock_rx_ctx, ctx); + dlist_insert_tail(&ep->rx_ctx_entry, &rx_ctx->ep_list); + ep->rx_ctx = rx_ctx; + ep->rx_array[ep->ep_attr.rx_ctx_cnt] = rx_ctx; + break; + + default: + return -ENOSYS; + } + + return 0; } +static int sock_ep_control(struct fid *fid, int command, void *arg) +{ + struct fi_alias *alias; + struct sock_ep *ep, *new_ep; + ep = container_of(fid, struct sock_ep, ep.fid); + + switch (command) { + case FI_ALIAS: + alias = (struct fi_alias*)arg; + new_ep = calloc(1, sizeof(*new_ep)); + if (!new_ep) + return -FI_ENOMEM; + *new_ep = *ep; + new_ep->op_flags = alias->flags; + *alias->fid = &new_ep->ep.fid; + break; + + case FI_GETOPSFLAG: + *(uint64_t*)arg = ep->op_flags; + break; + + case FI_SETOPSFLAG: + ep->op_flags = (uint64_t)arg; + break; + + default: + return -FI_EINVAL; + } + return 0; +} + + +struct fi_ops sock_ep_fi_ops = { + .size = sizeof(struct fi_ops), + .close = sock_ep_close, + .bind = sock_ep_bind, + .control = sock_ep_control, + .ops_open = fi_no_ops_open, +}; + static int sock_ep_enable(struct fid_ep *ep) { - return -FI_ENOSYS; -} - -struct fi_ops_ep sock_ep_ops = { - .size = sizeof(struct fi_ops_ep), - .cancel = sock_ep_cancel, - .getopt = sock_ep_getopt, - .setopt = sock_ep_setopt, - .enable = sock_ep_enable, -}; - -int sock_ep_connect(struct fid_ep *ep, const void *addr, - const void *param, size_t paramlen) -{ - int ret; + int i; struct sock_ep *sock_ep; sock_ep = container_of(ep, struct sock_ep, ep); - if(!sock_ep) - return -FI_EINVAL; - - if(sock_ep->connected) - return 0; - ret = connect(sock_ep->sock_fd, (struct sockaddr *)addr, - sizeof(struct sockaddr)); - if(ret) - return -errno; - sock_ep->connected = 1; + if (sock_ep->tx_ctx 
&& + sock_ep->tx_ctx->ctx.fid.fclass == FI_CLASS_TX_CTX) + sock_ep->tx_ctx->enabled = 1; - /* TODO: event */ + if (sock_ep->rx_ctx && + sock_ep->rx_ctx->ctx.fid.fclass == FI_CLASS_RX_CTX) + sock_ep->rx_ctx->enabled = 1; + + for (i=0; iep_attr.tx_ctx_cnt; i++) { + if (sock_ep->tx_array[i]) + sock_ep->tx_array[i]->enabled = 1; + } + + for (i=0; iep_attr.rx_ctx_cnt; i++) { + if (sock_ep->rx_array[i]) + sock_ep->rx_array[i]->enabled = 1; + } return 0; } -int sock_ep_listen(struct fid_pep *pep) +static int sock_ep_getopt(fid_t fid, int level, int optname, + void *optval, size_t *optlen) +{ + struct sock_ep *sock_ep; + sock_ep = container_of(fid, struct sock_ep, ep.fid); + + if (level != FI_OPT_ENDPOINT) + return -ENOPROTOOPT; + + switch (optname) { + case FI_OPT_MIN_MULTI_RECV: + *(size_t *)optval = sock_ep->min_multi_recv; + *optlen = sizeof(size_t); + break; + + default: + return -FI_ENOPROTOOPT; + } + return 0; +} + +static int sock_ep_setopt(fid_t fid, int level, int optname, + const void *optval, size_t optlen) +{ + int i; + struct sock_ep *sock_ep; + sock_ep = container_of(fid, struct sock_ep, ep.fid); + + if (level != FI_OPT_ENDPOINT) + return -ENOPROTOOPT; + + switch (optname) { + case FI_OPT_MIN_MULTI_RECV: + + sock_ep->min_multi_recv = *(size_t *)optval; + for (i = 0; i < sock_ep->ep_attr.rx_ctx_cnt + 1; i ++) { + if (sock_ep->rx_array[i] != NULL) { + sock_ep->rx_array[i]->min_multi_recv = + sock_ep->min_multi_recv; + } + } + break; + + default: + return -ENOPROTOOPT; + } + return 0; +} + +static int sock_ep_tx_ctx(struct fid_sep *ep, int index, struct fi_tx_attr *attr, + struct fid_ep **tx_ep, void *context) +{ + struct sock_ep *sock_ep; + struct sock_tx_ctx *tx_ctx; + + sock_ep = container_of(ep, struct sock_ep, sep); + if (index >= sock_ep->ep_attr.tx_ctx_cnt) + return -FI_EINVAL; + + tx_ctx = sock_tx_ctx_alloc(&sock_ep->tx_attr, context); + if (!tx_ctx) + return -FI_ENOMEM; + + tx_ctx->tx_id = index; + tx_ctx->ep = sock_ep; + tx_ctx->domain = sock_ep->domain; + dlist_insert_tail(&sock_ep->tx_ctx_entry, &tx_ctx->ep_list); + + tx_ctx->ctx.ops = &sock_ctx_ep_ops; + tx_ctx->ctx.msg = &sock_ep_msg_ops; + tx_ctx->ctx.tagged = &sock_ep_tagged; + tx_ctx->ctx.rma = &sock_ep_rma; + tx_ctx->ctx.atomic = &sock_ep_atomic; + + *tx_ep = &tx_ctx->ctx; + sock_ep->tx_array[index] = tx_ctx; + atomic_inc(&sock_ep->num_tx_ctx); + return 0; +} + +static int sock_ep_rx_ctx(struct fid_sep *ep, int index, struct fi_rx_attr *attr, + struct fid_ep **rx_ep, void *context) +{ + struct sock_ep *sock_ep; + struct sock_rx_ctx *rx_ctx; + + sock_ep = container_of(ep, struct sock_ep, sep); + if (index >= sock_ep->ep_attr.rx_ctx_cnt) + return -FI_EINVAL; + + rx_ctx = sock_rx_ctx_alloc(attr, context); + if (!rx_ctx) + return -FI_ENOMEM; + + rx_ctx->rx_id = index; + rx_ctx->ep = sock_ep; + rx_ctx->domain = sock_ep->domain; + dlist_insert_tail(&sock_ep->rx_ctx_entry, &rx_ctx->ep_list); + + rx_ctx->ctx.ops = &sock_ctx_ep_ops; + rx_ctx->ctx.msg = &sock_ep_msg_ops; + rx_ctx->ctx.tagged = &sock_ep_tagged; + + rx_ctx->min_multi_recv = sock_ep->min_multi_recv; + *rx_ep = &rx_ctx->ctx; + sock_ep->rx_array[index] = rx_ctx; + atomic_inc(&sock_ep->num_rx_ctx); + return 0; +} + +struct fi_ops_ep sock_ep_ops ={ + .size = sizeof(struct fi_ops_ep), + .enable = sock_ep_enable, + .cancel = sock_ep_cancel, + .getopt = sock_ep_getopt, + .setopt = sock_ep_setopt, + .tx_ctx = sock_ep_tx_ctx, + .rx_ctx = sock_ep_rx_ctx, +}; + +static int sock_ep_cm_getname(fid_t fid, void *addr, size_t *addrlen) +{ + struct sock_ep *sock_ep; + if (*addrlen 
== 0) { + *addrlen = sizeof(struct sockaddr_in); + return -FI_ETOOSMALL; + } + + sock_ep = container_of(fid, struct sock_ep, ep.fid); + *addrlen = MIN(*addrlen, sizeof(struct sockaddr_in)); + memcpy(addr, sock_ep->src_addr, *addrlen); + return 0; +} + +struct fi_ops_cm sock_ep_cm_ops = { + .size = sizeof(struct fi_ops_cm), + .getname = sock_ep_cm_getname, + .getpeer = fi_no_getpeer, + .connect = fi_no_connect, + .listen = fi_no_listen, + .accept = fi_no_accept, + .reject = fi_no_reject, + .shutdown = fi_no_shutdown, + .join = fi_no_join, + .leave = fi_no_leave, +}; + +int sock_stx_ctx(struct fid_domain *domain, + struct fi_tx_attr *attr, struct fid_stx **stx, void *context) +{ + struct sock_domain *dom; + struct sock_tx_ctx *tx_ctx; + + dom = container_of(domain, struct sock_domain, dom_fid); + + tx_ctx = sock_tx_ctx_alloc(attr, context); + if (!tx_ctx) + return -FI_ENOMEM; + + tx_ctx->domain = dom; + tx_ctx->stx.ops = sock_ep_ops; + atomic_inc(&dom->ref); + + *stx = &tx_ctx->stx; + return 0; +} + +int sock_srx_ctx(struct fid_domain *domain, + struct fi_rx_attr *attr, struct fid_ep **srx, void *context) +{ + struct sock_domain *dom; + struct sock_rx_ctx *rx_ctx; + + dom = container_of(domain, struct sock_domain, dom_fid); + rx_ctx = sock_rx_ctx_alloc(attr, context); + if (!rx_ctx) + return -FI_ENOMEM; + + rx_ctx->domain = dom; + rx_ctx->ctx.fid.fclass = FI_CLASS_SRX_CTX; + + rx_ctx->ctx.ops = &sock_ctx_ep_ops; + rx_ctx->ctx.msg = &sock_ep_msg_ops; + rx_ctx->ctx.tagged = &sock_ep_tagged; + + /* default config */ + rx_ctx->min_multi_recv = SOCK_EP_MIN_MULTI_RECV; + + *srx = &rx_ctx->ctx; + atomic_inc(&dom->ref); + return 0; +} + +struct fi_info *sock_fi_info(enum fi_ep_type ep_type, + struct fi_info *hints, void *src_addr, void *dest_addr) +{ + struct fi_info *_info = fi_allocinfo_internal(); + if (!_info) + return NULL; + + _info->src_addr = calloc(1, sizeof(struct sockaddr_in)); + _info->dest_addr = calloc(1, sizeof(struct sockaddr_in)); + + _info->ep_type = ep_type; + _info->mode = SOCK_MODE; + _info->addr_format = FI_SOCKADDR_IN; + _info->dest_addrlen =_info->src_addrlen = sizeof(struct sockaddr_in); + + if (src_addr) { + memcpy(_info->src_addr, src_addr, sizeof(struct sockaddr_in)); + } + + if (dest_addr) { + memcpy(_info->dest_addr, dest_addr, sizeof(struct sockaddr_in)); + } + + if (hints->caps) + _info->caps = hints->caps; + + if (hints->ep_attr) + *(_info->ep_attr) = *hints->ep_attr; + + if (hints->tx_attr) + *(_info->tx_attr) = *hints->tx_attr; + + if (hints->rx_attr) + *(_info->rx_attr) = *hints->rx_attr; + + *(_info->domain_attr) = hints->domain_attr ? *hints->domain_attr : + sock_domain_attr; + *(_info->fabric_attr) = hints->fabric_attr ? 
*hints->fabric_attr : + sock_fabric_attr; + + _info->domain_attr->name = strdup(sock_dom_name); + _info->fabric_attr->name = strdup(sock_fab_name); + _info->fabric_attr->prov_name = strdup(sock_fab_name); + + return _info; +} + +int sock_alloc_endpoint(struct fid_domain *domain, struct fi_info *info, + struct sock_ep **ep, void *context, size_t fclass) { int ret; - struct sock_pep *sock_pep; - - sock_pep = container_of(pep, struct sock_pep, pep); - ret = listen(sock_pep->sock_fd, 0); - if(ret) - return -errno; - return 0; -} - -int sock_ep_accept(struct fid_ep *ep, fi_connreq_t connreq, - const void *param, size_t paramlen) -{ - return -FI_ENOSYS; -} - -int sock_ep_reject(struct fid_pep *pep, fi_connreq_t connreq, - const void *param, size_t paramlen) -{ - return -FI_ENOSYS; -} - -int sock_ep_shutdown(struct fid_ep *ep, uint64_t flags) -{ - return -FI_ENOSYS; -} - -int sock_ep_join(struct fid_ep *ep, void *addr, fi_addr_t *fi_addr, - uint64_t flags, void *context) -{ - return -FI_ENOSYS; -} - -int sock_ep_leave(struct fid_ep *ep, void *addr, fi_addr_t fi_addr, - uint64_t flags) -{ - return -FI_ENOSYS; -} - -struct fi_ops_cm sock_cm_ops = { - .size = sizeof(struct fi_ops_cm), - .getname = NULL, - .getpeer = NULL, - .connect = NULL, - .listen = NULL, - .accept = NULL, - .reject = NULL, - .shutdown = NULL, - .join = NULL, - .leave= NULL, -}; - -ssize_t sock_ep_recv(struct fid_ep *ep, void *buf, size_t len, void *desc, - fi_addr_t src_addr, void *context) -{ -/* struct sock_ep *sock_ep; - recv_buf_t *list_entry; - sock_ep = container_of(ep, struct _struct sock_ep, ep); + struct sock_tx_ctx *tx_ctx; + struct sock_rx_ctx *rx_ctx; + struct sock_domain *sock_dom; - if(NULL == (list_entry = get_from_free_recv_list(sock_ep))) + if (info) { + ret = sock_verify_info(info); + if (ret) { + SOCK_LOG_INFO("Cannot support requested options!\n"); + return -FI_EINVAL; + } + } + + if (domain) + sock_dom = container_of(domain, struct sock_domain, dom_fid); + else + sock_dom = NULL; + + sock_ep = (struct sock_ep*)calloc(1, sizeof(*sock_ep)); + if (!sock_ep) return -FI_ENOMEM; + + atomic_init(&sock_ep->ref, 0); + + switch (fclass) { + case FI_CLASS_EP: + sock_ep->ep.fid.fclass = FI_CLASS_EP; + sock_ep->ep.fid.context = context; + sock_ep->ep.fid.ops = &sock_ep_fi_ops; + + sock_ep->ep.ops = &sock_ep_ops; + sock_ep->ep.cm = &sock_ep_cm_ops; + sock_ep->ep.msg = &sock_ep_msg_ops; + sock_ep->ep.rma = &sock_ep_rma; + sock_ep->ep.tagged = &sock_ep_tagged; + sock_ep->ep.atomic = &sock_ep_atomic; + break; + + case FI_CLASS_SEP: + sock_ep->sep.fid.fclass = FI_CLASS_SEP; + sock_ep->sep.fid.context = context; + sock_ep->sep.fid.ops = &sock_ep_fi_ops; + + sock_ep->sep.ops = &sock_ep_ops; + sock_ep->sep.cm = &sock_ep_cm_ops; + break; + + case FI_CLASS_PEP: + sock_ep->pep.fid.fclass = FI_CLASS_SEP; + sock_ep->pep.fid.context = context; + sock_ep->pep.fid.ops = &sock_ep_fi_ops; + + sock_ep->pep.ops = &sock_ep_ops; + sock_ep->pep.cm = &sock_ep_cm_ops; + break; + + default: + goto err; + } + + sock_ep->fclass = fclass; + *ep = sock_ep; + + if (info) { + sock_ep->info.caps = info->caps; + sock_ep->info.addr_format = FI_SOCKADDR_IN; + + if (info->src_addr) { + sock_ep->src_addr = calloc(1, sizeof(struct sockaddr_in)); + memcpy(sock_ep->src_addr, info->src_addr, + sizeof(struct sockaddr_in)); + } + + if (info->dest_addr) { + sock_ep->dest_addr = calloc(1, sizeof(struct sockaddr_in)); + memcpy(sock_ep->dest_addr, info->dest_addr, + sizeof(struct sockaddr_in)); + } + + if (info->ep_attr) + sock_ep->ep_attr = *info->ep_attr; + + if 
(info->tx_attr) { + sock_ep->tx_attr = *info->tx_attr; + sock_ep->op_flags = info->tx_attr->op_flags; + sock_ep->tx_attr.size = sock_ep->tx_attr.size ? + sock_ep->tx_attr.size : SOCK_EP_MAX_TX_CTX_SZ; + } + + if (info->rx_attr) { + sock_ep->rx_attr = *info->rx_attr; + sock_ep->op_flags |= info->rx_attr->op_flags; + } + } - list_entry->buf = buf; - list_entry->buf_len = len; + atomic_init(&sock_ep->ref, 0); + atomic_init(&sock_ep->num_tx_ctx, 0); + atomic_init(&sock_ep->num_rx_ctx, 0); - enqueue_post_recv_list(sock_ep, list_entry); -*/ + if (sock_ep->fclass != FI_CLASS_SEP) { + sock_ep->ep_attr.tx_ctx_cnt = 0; + sock_ep->ep_attr.rx_ctx_cnt = 0; + } + + if (sock_ep->ep_attr.tx_ctx_cnt != FI_SHARED_CONTEXT) { + sock_ep->tx_array = calloc(sock_ep->ep_attr.tx_ctx_cnt + 1, + sizeof(struct sock_tx_ctx *)); + + /* default tx ctx */ + tx_ctx = sock_tx_ctx_alloc(&sock_ep->tx_attr, context); + tx_ctx->ep = sock_ep; + tx_ctx->domain = sock_dom; + tx_ctx->tx_id = sock_ep->ep_attr.tx_ctx_cnt; + dlist_insert_tail(&sock_ep->tx_ctx_entry, &tx_ctx->ep_list); + sock_ep->tx_array[sock_ep->ep_attr.tx_ctx_cnt] = tx_ctx; + sock_ep->tx_ctx = tx_ctx; + } + + if (sock_ep->ep_attr.rx_ctx_cnt != FI_SHARED_CONTEXT) { + sock_ep->rx_array = calloc(sock_ep->ep_attr.rx_ctx_cnt + 1, + sizeof(struct sock_rx_ctx *)); + + /* default rx_ctx */ + rx_ctx = sock_rx_ctx_alloc(&sock_ep->rx_attr, context); + rx_ctx->ep = sock_ep; + rx_ctx->domain = sock_dom; + rx_ctx->rx_id = sock_ep->ep_attr.rx_ctx_cnt; + dlist_insert_tail(&sock_ep->rx_ctx_entry, &rx_ctx->ep_list); + sock_ep->rx_array[sock_ep->ep_attr.rx_ctx_cnt] = rx_ctx; + sock_ep->rx_ctx = rx_ctx; + } + + /* default config */ + sock_ep->min_multi_recv = SOCK_EP_MIN_MULTI_RECV; + + sock_ep->domain = sock_dom; + atomic_inc(&sock_dom->ref); return 0; + +err: + free(sock_ep); + return -FI_EAVAIL; } - -ssize_t sock_ep_recvv(struct fid_ep *ep, const struct iovec *iov, void **desc, - size_t count, fi_addr_t src_addr, void *context) -{ - return 0; -} - -ssize_t sock_ep_recvmsg(struct fid_ep *ep, const struct fi_msg *msg, - uint64_t flags) -{ - return 0; -} - -ssize_t sock_ep_send(struct fid_ep *ep, const void *buf, size_t len, void *desc, - fi_addr_t dest_addr, void *context) -{ - return 0; -} - -ssize_t sock_ep_sendv(struct fid_ep *ep, const struct iovec *iov, void **desc, - size_t count, fi_addr_t dest_addr, void *context) -{ - return 0; -} - -ssize_t sock_ep_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, - uint64_t flags) -{ - return 0; -} - -ssize_t sock_ep_inject(struct fid_ep *ep, const void *buf, size_t len, - fi_addr_t dest_addr) -{ - return 0; -} - -ssize_t sock_ep_senddata(struct fid_ep *ep, const void *buf, size_t len, - void *desc, uint64_t data, fi_addr_t dest_addr, void *context) -{ - return 0; -} - -int sock_passive_ep(struct fid_fabric *fabric, struct fi_info *info, - struct fid_pep **pep, void *context) -{ - return -FI_ENOSYS; -} - -struct fi_ops_msg sock_msg_ops = { - .size = sizeof(struct fi_ops_msg), - .recv = NULL, - .recvv = NULL, - .recvmsg = NULL, - .send = NULL, - .sendv = NULL, - .sendmsg = NULL, - .inject = NULL, - .senddata = NULL, -}; - diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep_dgram.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep_dgram.c new file mode 100644 index 0000000000..197207143d --- /dev/null +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep_dgram.c @@ -0,0 +1,443 @@ +/* + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sock_util.h" +#include "sock.h" + +const struct fi_ep_attr sock_dgram_ep_attr = { + .protocol = FI_PROTO_SOCK_TCP, + .max_msg_size = SOCK_EP_MAX_MSG_SZ, + .inject_size = SOCK_EP_MAX_INJECT_SZ, + .total_buffered_recv = SOCK_EP_MAX_BUFF_RECV, + .max_order_raw_size = SOCK_EP_MAX_ORDER_RAW_SZ, + .max_order_war_size = SOCK_EP_MAX_ORDER_WAR_SZ, + .max_order_waw_size = SOCK_EP_MAX_ORDER_WAW_SZ, + .mem_tag_format = SOCK_EP_MEM_TAG_FMT, + .msg_order = SOCK_EP_MSG_ORDER, + .tx_ctx_cnt = 0, + .rx_ctx_cnt = 0, +}; + +const struct fi_tx_attr sock_dgram_tx_attr = { + .caps = SOCK_EP_DGRAM_CAP, + .op_flags = SOCK_DEF_OPS, + .msg_order = SOCK_EP_MSG_ORDER, + .inject_size = SOCK_EP_MAX_INJECT_SZ, + .size = SOCK_EP_MAX_TX_CTX_SZ, + .iov_limit = SOCK_EP_MAX_IOV_LIMIT, +}; + +const struct fi_rx_attr sock_dgram_rx_attr = { + .caps = SOCK_EP_DGRAM_CAP, + .op_flags = SOCK_DEF_OPS, + .msg_order = SOCK_EP_MSG_ORDER, + .total_buffered_recv = SOCK_EP_MAX_BUFF_RECV, + .size = SOCK_EP_MAX_MSG_SZ, + .iov_limit = SOCK_EP_MAX_IOV_LIMIT, +}; + +static int sock_dgram_verify_rx_attr(const struct fi_rx_attr *attr) +{ + if (!attr) + return 0; + + if ((attr->caps | SOCK_EP_DGRAM_CAP) != SOCK_EP_DGRAM_CAP) + return -FI_ENODATA; + + if ((attr->op_flags | SOCK_EP_DGRAM_CAP) != SOCK_EP_DGRAM_CAP) + return -FI_ENODATA; + + if ((attr->msg_order | SOCK_EP_MSG_ORDER) != SOCK_EP_MSG_ORDER) + return -FI_ENODATA; + + if (attr->total_buffered_recv > sock_dgram_rx_attr.total_buffered_recv) + return -FI_ENODATA; + + if (attr->size > sock_dgram_rx_attr.size) + return -FI_ENODATA; + + if (attr->iov_limit > sock_dgram_rx_attr.iov_limit) + return -FI_ENODATA; + + return 0; +} + +static int sock_dgram_verify_tx_attr(const struct fi_tx_attr *attr) +{ + if (!attr) + return 0; + + if ((attr->caps | SOCK_EP_DGRAM_CAP) != SOCK_EP_DGRAM_CAP) + return -FI_ENODATA; + + if ((attr->op_flags | SOCK_EP_DGRAM_CAP) != SOCK_EP_DGRAM_CAP) + return -FI_ENODATA; + + if ((attr->msg_order | 
SOCK_EP_MSG_ORDER) != SOCK_EP_MSG_ORDER) + return -FI_ENODATA; + + if (attr->inject_size > sock_dgram_tx_attr.inject_size) + return -FI_ENODATA; + + if (attr->size > sock_dgram_tx_attr.size) + return -FI_ENODATA; + + if (attr->iov_limit > sock_dgram_tx_attr.iov_limit) + return -FI_ENODATA; + + return 0; +} + +int sock_dgram_verify_ep_attr(struct fi_ep_attr *ep_attr, + struct fi_tx_attr *tx_attr, + struct fi_rx_attr *rx_attr) +{ + if (ep_attr) { + switch (ep_attr->protocol) { + case FI_PROTO_UNSPEC: + case FI_PROTO_SOCK_TCP: + break; + default: + return -FI_ENODATA; + } + + if (ep_attr->max_msg_size > sock_dgram_ep_attr.max_msg_size) + return -FI_ENODATA; + + if (ep_attr->inject_size > sock_dgram_ep_attr.inject_size) + return -FI_ENODATA; + + if (ep_attr->total_buffered_recv > + sock_dgram_ep_attr.total_buffered_recv) + return -FI_ENODATA; + + if (ep_attr->max_order_raw_size > + sock_dgram_ep_attr.max_order_raw_size) + return -FI_ENODATA; + + if (ep_attr->max_order_war_size > + sock_dgram_ep_attr.max_order_war_size) + return -FI_ENODATA; + + if (ep_attr->max_order_waw_size > + sock_dgram_ep_attr.max_order_waw_size) + return -FI_ENODATA; + + if ((ep_attr->msg_order | SOCK_EP_MSG_ORDER) != SOCK_EP_MSG_ORDER) + return -FI_ENODATA; + + if ((ep_attr->tx_ctx_cnt > SOCK_EP_MAX_TX_CNT) && + ep_attr->tx_ctx_cnt != FI_SHARED_CONTEXT) + return -FI_ENODATA; + + if ((ep_attr->rx_ctx_cnt > SOCK_EP_MAX_RX_CNT) && + ep_attr->rx_ctx_cnt != FI_SHARED_CONTEXT) + return -FI_ENODATA; + } + + if (sock_dgram_verify_tx_attr(tx_attr) || sock_dgram_verify_rx_attr(rx_attr)) + return -FI_ENODATA; + + return 0; +} + +static struct fi_info *sock_dgram_fi_info(struct fi_info *hints, + void *src_addr, void *dest_addr) +{ + struct fi_info *_info = sock_fi_info(FI_EP_DGRAM, hints, + src_addr, dest_addr); + if (!_info) + return NULL; + + if (!hints->caps) + _info->caps = SOCK_EP_DGRAM_CAP; + + if (!hints->tx_attr) + *(_info->tx_attr) = sock_dgram_tx_attr; + + if (!hints->rx_attr) + *(_info->rx_attr) = sock_dgram_rx_attr; + + if (!hints->ep_attr) + *(_info->ep_attr) = sock_dgram_ep_attr; + + return _info; +} + +int sock_dgram_getinfo(uint32_t version, const char *node, const char *service, + uint64_t flags, struct fi_info *hints, struct fi_info **info) +{ + int ret; + int udp_sock; + socklen_t len; + struct fi_info *_info; + struct addrinfo sock_hints; + struct addrinfo *result = NULL; + struct sockaddr_in *src_addr = NULL, *dest_addr = NULL; + char sa_ip[INET_ADDRSTRLEN]; + char hostname[HOST_NAME_MAX]; + + if (!info) + return -FI_EBADFLAGS; + + *info = NULL; + + if (!node && !service && !hints) + return -FI_EBADFLAGS; + + if (version != FI_VERSION(SOCK_MAJOR_VERSION, + SOCK_MINOR_VERSION)) + return -FI_ENODATA; + + if (hints) { + if ((SOCK_EP_DGRAM_CAP | hints->caps) != SOCK_EP_DGRAM_CAP) { + SOCK_LOG_INFO( + "Cannot support requested options!\n"); + return -FI_ENODATA; + } + + ret = sock_dgram_verify_rx_attr(hints->rx_attr); + if (ret) + return ret; + + ret = sock_dgram_verify_tx_attr(hints->tx_attr); + if (ret) + return ret; + } + + src_addr = calloc(1, sizeof(struct sockaddr_in)); + dest_addr = calloc(1, sizeof(struct sockaddr_in)); + + memset(&sock_hints, 0, sizeof(struct addrinfo)); + sock_hints.ai_family = AF_INET; + sock_hints.ai_socktype = SOCK_STREAM; + + if (flags & FI_NUMERICHOST) + sock_hints.ai_flags |= AI_NUMERICHOST; + + if ((flags & FI_SOURCE) || !node) { + + if (!node) { + gethostname(hostname, HOST_NAME_MAX); + } + + ret = getaddrinfo(node ? 
node : hostname, service, + &sock_hints, &result); + if (ret != 0) { + ret = FI_ENODATA; + SOCK_LOG_INFO("getaddrinfo failed!\n"); + goto err; + } + + while (result) { + if (result->ai_family == AF_INET && + result->ai_addrlen == sizeof(struct sockaddr_in)) + break; + result = result->ai_next; + } + + if (!result) { + SOCK_LOG_ERROR("getaddrinfo failed\n"); + ret = -FI_EINVAL; + goto err; + } + + memcpy(src_addr, result->ai_addr, result->ai_addrlen); + freeaddrinfo(result); + } else if (node || service) { + + ret = getaddrinfo(node, service, &sock_hints, &result); + if (ret != 0) { + ret = FI_ENODATA; + SOCK_LOG_INFO("getaddrinfo failed!\n"); + goto err; + } + + while (result) { + if (result->ai_family == AF_INET && + result->ai_addrlen == sizeof(struct sockaddr_in)) + break; + result = result->ai_next; + } + + if (!result) { + SOCK_LOG_ERROR("getaddrinfo failed\n"); + ret = -FI_EINVAL; + goto err; + } + + memcpy(dest_addr, result->ai_addr, result->ai_addrlen); + + udp_sock = socket(AF_INET, SOCK_DGRAM, 0); + ret = connect(udp_sock, result->ai_addr, + result->ai_addrlen); + if ( ret != 0) { + SOCK_LOG_ERROR("Failed to create udp socket\n"); + ret = FI_ENODATA; + goto err; + } + + len = sizeof(struct sockaddr_in); + ret = getsockname(udp_sock, (struct sockaddr*)src_addr, &len); + if (ret != 0) { + SOCK_LOG_ERROR("getsockname failed\n"); + close(udp_sock); + ret = FI_ENODATA; + goto err; + } + + close(udp_sock); + freeaddrinfo(result); + } + + if (dest_addr) { + memcpy(sa_ip, inet_ntoa(dest_addr->sin_addr), INET_ADDRSTRLEN); + SOCK_LOG_INFO("dest_addr: family: %d, IP is %s\n", + ((struct sockaddr_in*)dest_addr)->sin_family, sa_ip); + } + + if (src_addr) { + memcpy(sa_ip, inet_ntoa(src_addr->sin_addr), INET_ADDRSTRLEN); + SOCK_LOG_INFO("src_addr: family: %d, IP is %s\n", + ((struct sockaddr_in*)src_addr)->sin_family, sa_ip); + } + + _info = sock_dgram_fi_info(hints, src_addr, dest_addr); + if (!_info) { + ret = FI_ENOMEM; + goto err; + } + + *info = _info; + free(src_addr); + free(dest_addr); + return 0; + +err: + free(src_addr); + free(dest_addr); + SOCK_LOG_ERROR("fi_getinfo failed\n"); + return ret; +} + +int sock_dgram_endpoint(struct fid_domain *domain, struct fi_info *info, + struct sock_ep **ep, void *context, size_t fclass) +{ + int ret; + + if (info) { + if (info->ep_attr) { + ret = sock_dgram_verify_ep_attr(info->ep_attr, + info->tx_attr, + info->rx_attr); + if (ret) + return ret; + } + + if (info->tx_attr) { + ret = sock_dgram_verify_tx_attr(info->tx_attr); + if (ret) + return ret; + } + + if (info->rx_attr) { + ret = sock_dgram_verify_rx_attr(info->rx_attr); + if (ret) + return ret; + } + } + + ret = sock_alloc_endpoint(domain, info, ep, context, fclass); + if (ret) + return ret; + + if (!info || !info->ep_attr) + (*ep)->ep_attr = sock_dgram_ep_attr; + + if (!info || !info->tx_attr) + (*ep)->tx_attr = sock_dgram_tx_attr; + + if (!info || !info->rx_attr) + (*ep)->rx_attr = sock_dgram_rx_attr; + + return 0; +} + +int sock_dgram_ep(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context) +{ + int ret; + struct sock_ep *endpoint; + + ret = sock_dgram_endpoint(domain, info, &endpoint, context, FI_CLASS_EP); + if (ret) + return ret; + + *ep = &endpoint->ep; + return 0; +} + +int sock_dgram_sep(struct fid_domain *domain, struct fi_info *info, + struct fid_sep **sep, void *context) +{ + int ret; + struct sock_ep *endpoint; + + ret = sock_dgram_endpoint(domain, info, &endpoint, context, FI_CLASS_SEP); + if (ret) + return ret; + + *sep = &endpoint->sep; + return 
0; +} diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep_msg.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep_msg.c new file mode 100644 index 0000000000..28d6e86eb9 --- /dev/null +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep_msg.c @@ -0,0 +1,527 @@ +/* + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sock.h" +#include "sock_util.h" + +const struct fi_ep_attr sock_msg_ep_attr = { + .protocol = FI_PROTO_SOCK_TCP, + .max_msg_size = SOCK_EP_MAX_MSG_SZ, + .inject_size = SOCK_EP_MAX_INJECT_SZ, + .total_buffered_recv = SOCK_EP_MAX_BUFF_RECV, + .max_order_raw_size = SOCK_EP_MAX_ORDER_RAW_SZ, + .max_order_war_size = SOCK_EP_MAX_ORDER_WAR_SZ, + .max_order_waw_size = SOCK_EP_MAX_ORDER_WAW_SZ, + .mem_tag_format = SOCK_EP_MEM_TAG_FMT, + .msg_order = SOCK_EP_MSG_ORDER, + .tx_ctx_cnt = 0, + .rx_ctx_cnt = 0, +}; + +const struct fi_tx_attr sock_msg_tx_attr = { + .caps = SOCK_EP_MSG_CAP, + .op_flags = SOCK_DEF_OPS, + .msg_order = SOCK_EP_MSG_ORDER, + .inject_size = SOCK_EP_MAX_INJECT_SZ, + .size = SOCK_EP_MAX_TX_CTX_SZ, + .iov_limit = SOCK_EP_MAX_IOV_LIMIT, +}; + +const struct fi_rx_attr sock_msg_rx_attr = { + .caps = SOCK_EP_MSG_CAP, + .op_flags = SOCK_DEF_OPS, + .msg_order = SOCK_EP_MSG_ORDER, + .total_buffered_recv = SOCK_EP_MAX_BUFF_RECV, + .size = SOCK_EP_MAX_MSG_SZ, + .iov_limit = SOCK_EP_MAX_IOV_LIMIT, +}; + +static int sock_msg_verify_rx_attr(const struct fi_rx_attr *attr) +{ + if (!attr) + return 0; + + if ((attr->caps | SOCK_EP_MSG_CAP) != SOCK_EP_MSG_CAP) + return -FI_ENODATA; + + if ((attr->op_flags | SOCK_EP_MSG_CAP) != SOCK_EP_MSG_CAP) + return -FI_ENODATA; + + if ((attr->msg_order | SOCK_EP_MSG_ORDER) != SOCK_EP_MSG_ORDER) + return -FI_ENODATA; + + if (attr->total_buffered_recv > sock_msg_rx_attr.total_buffered_recv) + return -FI_ENODATA; + + if (attr->size > sock_msg_rx_attr.size) + return -FI_ENODATA; + + if (attr->iov_limit > sock_msg_rx_attr.iov_limit) + return 
-FI_ENODATA; + + return 0; +} + +static int sock_msg_verify_tx_attr(const struct fi_tx_attr *attr) +{ + if (!attr) + return 0; + + if ((attr->caps | SOCK_EP_MSG_CAP) != SOCK_EP_MSG_CAP) + return -FI_ENODATA; + + if ((attr->op_flags | SOCK_EP_MSG_CAP) != SOCK_EP_MSG_CAP) + return -FI_ENODATA; + + if ((attr->msg_order | SOCK_EP_MSG_ORDER) != SOCK_EP_MSG_ORDER) + return -FI_ENODATA; + + if (attr->inject_size > sock_msg_tx_attr.inject_size) + return -FI_ENODATA; + + if (attr->size > sock_msg_tx_attr.size) + return -FI_ENODATA; + + if (attr->iov_limit > sock_msg_tx_attr.iov_limit) + return -FI_ENODATA; + + return 0; +} + +int sock_msg_verify_ep_attr(struct fi_ep_attr *ep_attr, + struct fi_tx_attr *tx_attr, + struct fi_rx_attr *rx_attr) +{ + if (ep_attr) { + switch (ep_attr->protocol) { + case FI_PROTO_UNSPEC: + case FI_PROTO_SOCK_TCP: + break; + default: + return -FI_ENODATA; + } + + if (ep_attr->max_msg_size > sock_msg_ep_attr.max_msg_size) + return -FI_ENODATA; + + if (ep_attr->inject_size > sock_msg_ep_attr.inject_size) + return -FI_ENODATA; + + if (ep_attr->total_buffered_recv > + sock_msg_ep_attr.total_buffered_recv) + return -FI_ENODATA; + + if (ep_attr->max_order_raw_size > + sock_msg_ep_attr.max_order_raw_size) + return -FI_ENODATA; + + if (ep_attr->max_order_war_size > + sock_msg_ep_attr.max_order_war_size) + return -FI_ENODATA; + + if (ep_attr->max_order_waw_size > + sock_msg_ep_attr.max_order_waw_size) + return -FI_ENODATA; + + if ((ep_attr->msg_order | SOCK_EP_MSG_ORDER) != SOCK_EP_MSG_ORDER) + return -FI_ENODATA; + + if ((ep_attr->tx_ctx_cnt > SOCK_EP_MAX_TX_CNT) && + ep_attr->tx_ctx_cnt != FI_SHARED_CONTEXT) + return -FI_ENODATA; + + if ((ep_attr->rx_ctx_cnt > SOCK_EP_MAX_RX_CNT) && + ep_attr->rx_ctx_cnt != FI_SHARED_CONTEXT) + return -FI_ENODATA; + } + + if (sock_msg_verify_tx_attr(tx_attr) || sock_msg_verify_rx_attr(rx_attr)) + return -FI_ENODATA; + + return 0; +} + +static struct fi_info *sock_msg_fi_info(struct fi_info *hints, + void *src_addr, void *dest_addr) +{ + struct fi_info *_info = sock_fi_info(FI_EP_MSG, hints, + src_addr, dest_addr); + if (!_info) + return NULL; + + if (!hints->caps) + _info->caps = SOCK_EP_MSG_CAP; + + if (!hints->tx_attr) + *(_info->tx_attr) = sock_msg_tx_attr; + + if (!hints->rx_attr) + *(_info->rx_attr) = sock_msg_rx_attr; + + if (!hints->ep_attr) + *(_info->ep_attr) = sock_msg_ep_attr; + + return _info; +} + +int sock_msg_getinfo(uint32_t version, const char *node, const char *service, + uint64_t flags, struct fi_info *hints, struct fi_info **info) +{ + int ret; + int udp_sock; + socklen_t len; + struct fi_info *_info; + struct addrinfo sock_hints; + struct addrinfo *result = NULL; + struct sockaddr_in *src_addr = NULL, *dest_addr = NULL; + char sa_ip[INET_ADDRSTRLEN]; + char hostname[HOST_NAME_MAX]; + + if (!info) + return -FI_EBADFLAGS; + + *info = NULL; + + if (!node && !service && !hints) + return -FI_EBADFLAGS; + + if (version != FI_VERSION(SOCK_MAJOR_VERSION, + SOCK_MINOR_VERSION)) + return -FI_ENODATA; + + if (hints) { + if ((SOCK_EP_MSG_CAP | hints->caps) != SOCK_EP_MSG_CAP) { + SOCK_LOG_INFO( + "Cannot support requested options!\n"); + return -FI_ENODATA; + } + + ret = sock_msg_verify_rx_attr(hints->rx_attr); + if (ret) + return ret; + + ret = sock_msg_verify_tx_attr(hints->tx_attr); + if (ret) + return ret; + } + + src_addr = calloc(1, sizeof(struct sockaddr_in)); + dest_addr = calloc(1, sizeof(struct sockaddr_in)); + + memset(&sock_hints, 0, sizeof(struct addrinfo)); + sock_hints.ai_family = AF_INET; + sock_hints.ai_socktype = 
SOCK_STREAM; + + if (flags & FI_NUMERICHOST) + sock_hints.ai_flags |= AI_NUMERICHOST; + + if ((flags & FI_SOURCE) || !node) { + + if (!node) { + gethostname(hostname, HOST_NAME_MAX); + } + + ret = getaddrinfo(node ? node : hostname, service, + &sock_hints, &result); + if (ret != 0) { + ret = FI_ENODATA; + SOCK_LOG_INFO("getaddrinfo failed!\n"); + goto err; + } + + while (result) { + if (result->ai_family == AF_INET && + result->ai_addrlen == sizeof(struct sockaddr_in)) + break; + result = result->ai_next; + } + + if (!result) { + SOCK_LOG_ERROR("getaddrinfo failed\n"); + ret = -FI_EINVAL; + goto err; + } + + memcpy(src_addr, result->ai_addr, result->ai_addrlen); + freeaddrinfo(result); + } else if (node || service) { + + ret = getaddrinfo(node, service, &sock_hints, &result); + if (ret != 0) { + ret = FI_ENODATA; + SOCK_LOG_INFO("getaddrinfo failed!\n"); + goto err; + } + + while (result) { + if (result->ai_family == AF_INET && + result->ai_addrlen == sizeof(struct sockaddr_in)) + break; + result = result->ai_next; + } + + if (!result) { + SOCK_LOG_ERROR("getaddrinfo failed\n"); + ret = -FI_EINVAL; + goto err; + } + + memcpy(dest_addr, result->ai_addr, result->ai_addrlen); + + udp_sock = socket(AF_INET, SOCK_DGRAM, 0); + ret = connect(udp_sock, result->ai_addr, + result->ai_addrlen); + if ( ret != 0) { + SOCK_LOG_ERROR("Failed to create udp socket\n"); + ret = FI_ENODATA; + goto err; + } + + len = sizeof(struct sockaddr_in); + ret = getsockname(udp_sock, (struct sockaddr*)src_addr, &len); + if (ret != 0) { + SOCK_LOG_ERROR("getsockname failed\n"); + close(udp_sock); + ret = FI_ENODATA; + goto err; + } + + close(udp_sock); + freeaddrinfo(result); + } + + if (dest_addr) { + memcpy(sa_ip, inet_ntoa(dest_addr->sin_addr), INET_ADDRSTRLEN); + SOCK_LOG_INFO("dest_addr: family: %d, IP is %s\n", + ((struct sockaddr_in*)dest_addr)->sin_family, sa_ip); + } + + if (src_addr) { + memcpy(sa_ip, inet_ntoa(src_addr->sin_addr), INET_ADDRSTRLEN); + SOCK_LOG_INFO("src_addr: family: %d, IP is %s\n", + ((struct sockaddr_in*)src_addr)->sin_family, sa_ip); + } + + _info = sock_msg_fi_info(hints, src_addr, dest_addr); + if (!_info) { + ret = FI_ENOMEM; + goto err; + } + + *info = _info; + free(src_addr); + free(dest_addr); + return 0; + +err: + free(src_addr); + free(dest_addr); + SOCK_LOG_ERROR("fi_getinfo failed\n"); + return ret; +} + +static int sock_msg_ep_cm_getname(fid_t fid, void *addr, size_t *addrlen) +{ + struct sock_ep *sock_ep; + if (*addrlen == 0) { + *addrlen = sizeof(struct sockaddr_in); + return -FI_ETOOSMALL; + } + + sock_ep = container_of(fid, struct sock_ep, ep.fid); + *addrlen = MIN(*addrlen, sizeof(struct sockaddr_in)); + memcpy(addr, sock_ep->src_addr, *addrlen); + return 0; +} + +static int sock_msg_ep_cm_getpeer(struct fid_ep *ep, void *addr, size_t *addrlen) +{ + struct sock_ep *sock_ep; + + if (*addrlen == 0) { + *addrlen = sizeof(struct sockaddr_in); + return -FI_ETOOSMALL; + } + + sock_ep = container_of(ep, struct sock_ep, ep); + *addrlen = MIN(*addrlen, sizeof(struct sockaddr_in)); + memcpy(addr, sock_ep->dest_addr, *addrlen); + return 0; +} + +static int sock_msg_ep_cm_connect(struct fid_ep *ep, const void *addr, + const void *param, size_t paramlen) +{ + return -FI_ENOSYS; +} + +static int sock_msg_ep_cm_listen(struct fid_pep *pep) +{ + return -FI_ENOSYS; +} + +static int sock_msg_ep_cm_accept(struct fid_ep *ep, const void *param, size_t paramlen) +{ + return -FI_ENOSYS; +} + +static int sock_msg_ep_cm_reject(struct fid_pep *pep, fi_connreq_t connreq, + const void *param, size_t 
paramlen) +{ + return -FI_ENOSYS; +} + +static int sock_msg_ep_cm_shutdown(struct fid_ep *ep, uint64_t flags) +{ + return -FI_ENOSYS; +} + +struct fi_ops_cm sock_msg_ep_cm_ops = { + .size = sizeof(struct fi_ops_cm), + .getname = sock_msg_ep_cm_getname, + .getpeer = sock_msg_ep_cm_getpeer, + .connect = sock_msg_ep_cm_connect, + .listen = sock_msg_ep_cm_listen, + .accept = sock_msg_ep_cm_accept, + .reject = sock_msg_ep_cm_reject, + .shutdown = sock_msg_ep_cm_shutdown, + .join = fi_no_join, + .leave = fi_no_leave, +}; + +int sock_msg_endpoint(struct fid_domain *domain, struct fi_info *info, + struct sock_ep **ep, void *context, size_t fclass) +{ + int ret; + + if (info) { + if (info->ep_attr) { + ret = sock_msg_verify_ep_attr(info->ep_attr, + info->tx_attr, + info->rx_attr); + if (ret) + return ret; + } + + if (info->tx_attr) { + ret = sock_msg_verify_tx_attr(info->tx_attr); + if (ret) + return ret; + } + + if (info->rx_attr) { + ret = sock_msg_verify_rx_attr(info->rx_attr); + if (ret) + return ret; + } + } + + ret = sock_alloc_endpoint(domain, info, ep, context, fclass); + if (ret) + return ret; + + if (!info || !info->ep_attr) + (*ep)->ep_attr = sock_msg_ep_attr; + + if (!info || !info->tx_attr) + (*ep)->tx_attr = sock_msg_tx_attr; + + if (!info || !info->rx_attr) + (*ep)->rx_attr = sock_msg_rx_attr; + + return 0; +} + +int sock_msg_ep(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context) +{ + int ret; + struct sock_ep *endpoint; + + ret = sock_msg_endpoint(domain, info, &endpoint, context, FI_CLASS_EP); + if (ret) + return ret; + + *ep = &endpoint->ep; + return 0; +} + +int sock_msg_sep(struct fid_domain *domain, struct fi_info *info, + struct fid_sep **sep, void *context) +{ + int ret; + struct sock_ep *endpoint; + + ret = sock_msg_endpoint(domain, info, &endpoint, context, FI_CLASS_SEP); + if (ret) + return ret; + + *sep = &endpoint->sep; + return 0; +} + +int sock_msg_passive_ep(struct fid_fabric *fabric, struct fi_info *info, + struct fid_pep **pep, void *context) +{ + int ret; + struct sock_ep *endpoint; + + ret = sock_msg_endpoint(NULL, info, &endpoint, context, FI_CLASS_PEP); + if (ret) + return ret; + + *pep = &endpoint->pep; + return 0; +} diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep_rdm.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep_rdm.c new file mode 100644 index 0000000000..d718fffd3d --- /dev/null +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep_rdm.c @@ -0,0 +1,442 @@ +/* + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sock.h" +#include "sock_util.h" + +const struct fi_ep_attr sock_rdm_ep_attr = { + .protocol = FI_PROTO_SOCK_TCP, + .max_msg_size = SOCK_EP_MAX_MSG_SZ, + .inject_size = SOCK_EP_MAX_INJECT_SZ, + .total_buffered_recv = SOCK_EP_MAX_BUFF_RECV, + .max_order_raw_size = SOCK_EP_MAX_ORDER_RAW_SZ, + .max_order_war_size = SOCK_EP_MAX_ORDER_WAR_SZ, + .max_order_waw_size = SOCK_EP_MAX_ORDER_WAW_SZ, + .mem_tag_format = SOCK_EP_MEM_TAG_FMT, + .msg_order = SOCK_EP_MSG_ORDER, + .tx_ctx_cnt = 0, + .rx_ctx_cnt = 0, +}; + +const struct fi_tx_attr sock_rdm_tx_attr = { + .caps = SOCK_EP_RDM_CAP, + .op_flags = SOCK_DEF_OPS, + .msg_order = SOCK_EP_MSG_ORDER, + .inject_size = SOCK_EP_MAX_INJECT_SZ, + .size = SOCK_EP_MAX_TX_CTX_SZ, + .iov_limit = SOCK_EP_MAX_IOV_LIMIT, +}; + +const struct fi_rx_attr sock_rdm_rx_attr = { + .caps = SOCK_EP_RDM_CAP, + .op_flags = SOCK_DEF_OPS, + .msg_order = SOCK_EP_MSG_ORDER, + .total_buffered_recv = SOCK_EP_MAX_BUFF_RECV, + .size = SOCK_EP_MAX_MSG_SZ, + .iov_limit = SOCK_EP_MAX_IOV_LIMIT, +}; + +static int sock_rdm_verify_rx_attr(const struct fi_rx_attr *attr) +{ + if (!attr) + return 0; + + if ((attr->caps | SOCK_EP_RDM_CAP) != SOCK_EP_RDM_CAP) + return -FI_ENODATA; + + if ((attr->op_flags | SOCK_EP_RDM_CAP) != SOCK_EP_RDM_CAP) + return -FI_ENODATA; + + if ((attr->msg_order | SOCK_EP_MSG_ORDER) != SOCK_EP_MSG_ORDER) + return -FI_ENODATA; + + if (attr->total_buffered_recv > sock_rdm_rx_attr.total_buffered_recv) + return -FI_ENODATA; + + if (attr->size > sock_rdm_rx_attr.size) + return -FI_ENODATA; + + if (attr->iov_limit > sock_rdm_rx_attr.iov_limit) + return -FI_ENODATA; + + return 0; +} + +static int sock_rdm_verify_tx_attr(const struct fi_tx_attr *attr) +{ + if (!attr) + return 0; + + if ((attr->caps | SOCK_EP_RDM_CAP) != SOCK_EP_RDM_CAP) + return -FI_ENODATA; + + if ((attr->op_flags | SOCK_EP_RDM_CAP) != SOCK_EP_RDM_CAP) + return -FI_ENODATA; + + if ((attr->msg_order | SOCK_EP_MSG_ORDER) != SOCK_EP_MSG_ORDER) + return -FI_ENODATA; + + if (attr->inject_size > sock_rdm_tx_attr.inject_size) + return -FI_ENODATA; + + if (attr->size > sock_rdm_tx_attr.size) + return -FI_ENODATA; + + if (attr->iov_limit > sock_rdm_tx_attr.iov_limit) + return -FI_ENODATA; + + return 0; +} + +int sock_rdm_verify_ep_attr(struct fi_ep_attr *ep_attr, + struct fi_tx_attr *tx_attr, + struct fi_rx_attr *rx_attr) +{ + if (ep_attr) { + switch (ep_attr->protocol) { + case FI_PROTO_UNSPEC: + case FI_PROTO_SOCK_TCP: + break; + default: + return -FI_ENODATA; + } + + if (ep_attr->max_msg_size > sock_rdm_ep_attr.max_msg_size) + return -FI_ENODATA; + + if (ep_attr->inject_size > sock_rdm_ep_attr.inject_size) + return -FI_ENODATA; + + if (ep_attr->total_buffered_recv > + sock_rdm_ep_attr.total_buffered_recv) + return -FI_ENODATA; + + if (ep_attr->max_order_raw_size > + 
sock_rdm_ep_attr.max_order_raw_size) + return -FI_ENODATA; + + if (ep_attr->max_order_war_size > + sock_rdm_ep_attr.max_order_war_size) + return -FI_ENODATA; + + if (ep_attr->max_order_waw_size > + sock_rdm_ep_attr.max_order_waw_size) + return -FI_ENODATA; + + if ((ep_attr->msg_order | SOCK_EP_MSG_ORDER) != SOCK_EP_MSG_ORDER) + return -FI_ENODATA; + + if ((ep_attr->tx_ctx_cnt > SOCK_EP_MAX_TX_CNT) && + ep_attr->tx_ctx_cnt != FI_SHARED_CONTEXT) + return -FI_ENODATA; + + if ((ep_attr->rx_ctx_cnt > SOCK_EP_MAX_RX_CNT) && + ep_attr->rx_ctx_cnt != FI_SHARED_CONTEXT) + return -FI_ENODATA; + } + + if (sock_rdm_verify_tx_attr(tx_attr) || sock_rdm_verify_rx_attr(rx_attr)) + return -FI_ENODATA; + + return 0; +} + + +static struct fi_info *sock_rdm_fi_info(struct fi_info *hints, + void *src_addr, void *dest_addr) +{ + struct fi_info *_info = sock_fi_info(FI_EP_RDM, hints, + src_addr, dest_addr); + if (!_info) + return NULL; + + if (!hints->caps) + _info->caps = SOCK_EP_RDM_CAP; + + if (!hints->tx_attr) + *(_info->tx_attr) = sock_rdm_tx_attr; + + if (!hints->rx_attr) + *(_info->rx_attr) = sock_rdm_rx_attr; + + if (!hints->ep_attr) + *(_info->ep_attr) = sock_rdm_ep_attr; + + return _info; +} + +int sock_rdm_getinfo(uint32_t version, const char *node, const char *service, + uint64_t flags, struct fi_info *hints, struct fi_info **info) +{ + int ret; + int udp_sock; + socklen_t len; + struct fi_info *_info; + struct addrinfo sock_hints; + struct addrinfo *result = NULL; + struct sockaddr_in *src_addr = NULL, *dest_addr = NULL; + char sa_ip[INET_ADDRSTRLEN]; + char hostname[HOST_NAME_MAX]; + + if (!info) + return -FI_EBADFLAGS; + + *info = NULL; + + if (version != FI_VERSION(SOCK_MAJOR_VERSION, + SOCK_MINOR_VERSION)) + return -FI_ENODATA; + + if (hints) { + if ((SOCK_EP_RDM_CAP | hints->caps) != SOCK_EP_RDM_CAP) { + SOCK_LOG_INFO("Cannot support requested options!\n"); + return -FI_ENODATA; + } + + ret = sock_rdm_verify_rx_attr(hints->rx_attr); + if (ret) + return ret; + + ret = sock_rdm_verify_tx_attr(hints->tx_attr); + if (ret) + return ret; + } + + src_addr = calloc(1, sizeof(struct sockaddr_in)); + dest_addr = calloc(1, sizeof(struct sockaddr_in)); + + memset(&sock_hints, 0, sizeof(struct addrinfo)); + sock_hints.ai_family = AF_INET; + sock_hints.ai_socktype = SOCK_STREAM; + + if (flags & FI_NUMERICHOST) + sock_hints.ai_flags |= AI_NUMERICHOST; + + if ((flags & FI_SOURCE) || !node) { + + if (!node) { + gethostname(hostname, HOST_NAME_MAX); + } + + ret = getaddrinfo(node ? 
node : hostname, service, + &sock_hints, &result); + if (ret != 0) { + ret = FI_ENODATA; + SOCK_LOG_INFO("getaddrinfo failed!\n"); + goto err; + } + + while (result) { + if (result->ai_family == AF_INET && + result->ai_addrlen == sizeof(struct sockaddr_in)) + break; + result = result->ai_next; + } + + if (!result) { + SOCK_LOG_ERROR("getaddrinfo failed\n"); + ret = -FI_EINVAL; + goto err; + } + + memcpy(src_addr, result->ai_addr, result->ai_addrlen); + freeaddrinfo(result); + } else if (node || service) { + + ret = getaddrinfo(node, service, &sock_hints, &result); + if (ret != 0) { + ret = FI_ENODATA; + SOCK_LOG_INFO("getaddrinfo failed!\n"); + goto err; + } + + while (result) { + if (result->ai_family == AF_INET && + result->ai_addrlen == sizeof(struct sockaddr_in)) + break; + result = result->ai_next; + } + + if (!result) { + SOCK_LOG_ERROR("getaddrinfo failed\n"); + ret = -FI_EINVAL; + goto err; + } + + memcpy(dest_addr, result->ai_addr, result->ai_addrlen); + + udp_sock = socket(AF_INET, SOCK_DGRAM, 0); + ret = connect(udp_sock, result->ai_addr, + result->ai_addrlen); + if ( ret != 0) { + SOCK_LOG_ERROR("Failed to create udp socket\n"); + ret = FI_ENODATA; + goto err; + } + + len = sizeof(struct sockaddr_in); + ret = getsockname(udp_sock, (struct sockaddr*)src_addr, &len); + if (ret != 0) { + SOCK_LOG_ERROR("getsockname failed\n"); + close(udp_sock); + ret = FI_ENODATA; + goto err; + } + + close(udp_sock); + freeaddrinfo(result); + } + + if (dest_addr) { + memcpy(sa_ip, inet_ntoa(dest_addr->sin_addr), INET_ADDRSTRLEN); + SOCK_LOG_INFO("dest_addr: family: %d, IP is %s\n", + ((struct sockaddr_in*)dest_addr)->sin_family, sa_ip); + } + + if (src_addr) { + memcpy(sa_ip, inet_ntoa(src_addr->sin_addr), INET_ADDRSTRLEN); + SOCK_LOG_INFO("src_addr: family: %d, IP is %s\n", + ((struct sockaddr_in*)src_addr)->sin_family, sa_ip); + } + + _info = sock_rdm_fi_info(hints, src_addr, dest_addr); + if (!_info) { + ret = FI_ENOMEM; + goto err; + } + + *info = _info; + free(src_addr); + free(dest_addr); + return 0; + +err: + free(src_addr); + free(dest_addr); + SOCK_LOG_ERROR("fi_getinfo failed\n"); + return ret; +} + +int sock_rdm_endpoint(struct fid_domain *domain, struct fi_info *info, + struct sock_ep **ep, void *context, size_t fclass) +{ + int ret; + + if (info) { + if (info->ep_attr) { + ret = sock_rdm_verify_ep_attr(info->ep_attr, + info->tx_attr, + info->rx_attr); + if (ret) + return ret; + } + + if (info->tx_attr) { + ret = sock_rdm_verify_tx_attr(info->tx_attr); + if (ret) + return ret; + } + + if (info->rx_attr) { + ret = sock_rdm_verify_rx_attr(info->rx_attr); + if (ret) + return ret; + } + } + + ret = sock_alloc_endpoint(domain, info, ep, context, fclass); + if (ret) + return ret; + + if (!info || !info->ep_attr) + (*ep)->ep_attr = sock_rdm_ep_attr; + + if (!info || !info->tx_attr) + (*ep)->tx_attr = sock_rdm_tx_attr; + + if (!info || !info->rx_attr) + (*ep)->rx_attr = sock_rdm_rx_attr; + + return 0; +} + +int sock_rdm_ep(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context) +{ + int ret; + struct sock_ep *endpoint; + + ret = sock_rdm_endpoint(domain, info, &endpoint, context, FI_CLASS_EP); + if (ret) + return ret; + + *ep = &endpoint->ep; + return 0; +} + +int sock_rdm_sep(struct fid_domain *domain, struct fi_info *info, + struct fid_sep **sep, void *context) +{ + int ret; + struct sock_ep *endpoint; + + ret = sock_rdm_endpoint(domain, info, &endpoint, context, FI_CLASS_SEP); + if (ret) + return ret; + + *sep = &endpoint->sep; + return 0; +} + diff --git 
a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_eq.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_eq.c index 1b2def8949..3f326694a5 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_eq.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_eq.c @@ -143,8 +143,11 @@ ssize_t sock_eq_report_event(struct sock_eq *sock_eq, uint32_t event, entry->len = len; entry->flags = flags; memcpy(entry->event, buf, len); - dlistfd_insert_tail(&entry->entry, &sock_eq->list); + + if (sock_eq->signal) + sock_wait_signal(sock_eq->waitset); + fastlock_release(&sock_eq->lock); return 0; } @@ -167,8 +170,11 @@ ssize_t sock_eq_report_error(struct sock_eq *sock_eq, fid_t fid, void *context, err_entry->prov_errno = prov_errno; err_entry->err_data = err_data; entry->len = sizeof(struct fi_eq_err_entry); - dlistfd_insert_tail(&entry->entry, &sock_eq->err_list); + + if (sock_eq->signal) + sock_wait_signal(sock_eq->waitset); + fastlock_release(&sock_eq->lock); return 0; } @@ -212,26 +218,43 @@ int sock_eq_fi_close(struct fid *fid) fastlock_destroy(&sock_eq->lock); atomic_dec(&sock_eq->sock_fab->ref); + if (sock_eq->signal && sock_eq->attr.wait_obj == FI_WAIT_MUTEX_COND) + sock_wait_close(&sock_eq->waitset->fid); + free(sock_eq); return 0; } -int sock_eq_fi_control(struct fid *fid, int command, void *arg) +int sock_eq_control(struct fid *fid, int command, void *arg) { - struct sock_eq *eq; int ret = 0; + struct sock_eq *eq; - eq = container_of(fid, struct sock_eq, eq.fid); - + eq = container_of(fid, struct sock_eq, eq.fid); switch (command) { case FI_GETWAIT: - *(void **) arg = &eq->list.fd[LIST_READ_FD]; + switch (eq->attr.wait_obj) { + case FI_WAIT_NONE: + case FI_WAIT_UNSPEC: + case FI_WAIT_FD: + memcpy(arg, &eq->list.fd[LIST_READ_FD], sizeof(int)); + break; + + case FI_WAIT_SET: + case FI_WAIT_MUTEX_COND: + sock_wait_get_obj(eq->waitset, arg); + break; + + default: + ret = -FI_EINVAL; + break; + } break; + default: - ret = -FI_ENOSYS; + ret = -FI_EINVAL; break; } - return ret; } @@ -239,7 +262,7 @@ static struct fi_ops sock_eq_fi_ops = { .size = sizeof(struct fi_ops), .close = sock_eq_fi_close, .bind = fi_no_bind, - .control = sock_eq_fi_control, + .control = sock_eq_control, .ops_open = fi_no_ops_open, }; @@ -251,6 +274,8 @@ static int _sock_eq_verify_attr(struct fi_eq_attr *attr) switch (attr->wait_obj) { case FI_WAIT_NONE: case FI_WAIT_FD: + case FI_WAIT_SET: + case FI_WAIT_MUTEX_COND: break; case FI_WAIT_UNSPEC: attr->wait_obj = FI_WAIT_FD; @@ -275,6 +300,7 @@ int sock_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, { int ret; struct sock_eq *sock_eq; + struct fi_wait_attr wait_attr; ret = _sock_eq_verify_attr(attr); if (ret) @@ -293,9 +319,9 @@ int sock_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, if(attr == NULL) memcpy(&sock_eq->attr, &_sock_eq_def_attr, - sizeof(struct fi_cq_attr)); - else - memcpy(&sock_eq->attr, attr, sizeof(struct fi_cq_attr)); + sizeof(struct fi_eq_attr)); + else + memcpy(&sock_eq->attr, attr, sizeof(struct fi_eq_attr)); ret = dlistfd_head_init(&sock_eq->list); if(ret) @@ -307,6 +333,34 @@ int sock_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, fastlock_init(&sock_eq->lock); atomic_inc(&sock_eq->sock_fab->ref); + + switch (sock_eq->attr.wait_obj) { + + case FI_WAIT_NONE: + case FI_WAIT_UNSPEC: + case FI_WAIT_FD: + sock_eq->signal = 0; + break; + + case FI_WAIT_MUTEX_COND: + wait_attr.flags = 0; + wait_attr.wait_obj = FI_WAIT_MUTEX_COND; + /* FIXME: waitset is a domain object, but not EQ. 
This needs to be + updated based on #394 */ + ret = sock_wait_open(NULL, &wait_attr, &sock_eq->waitset); + if (ret) + goto err2; + sock_eq->signal = 1; + break; + + case FI_WAIT_SET: + sock_eq->waitset = attr->wait_set; + sock_eq->signal = 1; + break; + + default: + break; + } return 0; err2: diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_fabric.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_fabric.c index 64068fdadf..d785dcafa1 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_fabric.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_fabric.c @@ -37,6 +37,8 @@ #include #include +#include "prov.h" + #include "sock.h" #include "sock_util.h" @@ -93,8 +95,9 @@ int sock_verify_info(struct fi_info *hints) return -FI_ENODATA; } - if (!sock_rdm_verify_ep_attr(hints->ep_attr, - hints->tx_attr, hints->rx_attr)) + if (!sock_rdm_verify_ep_attr(hints->ep_attr, hints->tx_attr, hints->rx_attr) || + !sock_dgram_verify_ep_attr(hints->ep_attr, hints->tx_attr, hints->rx_attr) || + !sock_msg_verify_ep_attr(hints->ep_attr, hints->tx_attr, hints->rx_attr)) return 0; ret = sock_verify_domain_attr(hints->domain_attr); @@ -111,7 +114,7 @@ int sock_verify_info(struct fi_info *hints) static struct fi_ops_fabric sock_fab_ops = { .size = sizeof(struct fi_ops_fabric), .domain = sock_domain, - .passive_ep = sock_passive_ep, + .passive_ep = sock_msg_passive_ep, .eq_open = sock_eq_open, }; @@ -128,28 +131,12 @@ static int sock_fabric_close(fid_t fid) return 0; } -int sock_fabric_bind(struct fid *fid, struct fid *bfid, uint64_t flags) -{ - return -FI_ENOSYS; -} - -int sock_fabric_control(struct fid *fid, int command, void *arg) -{ - return -FI_ENOSYS; -} - -int sock_fabric_ops_open(struct fid *fid, const char *name, - uint64_t flags, void **ops, void *context) -{ - return -FI_ENOSYS; -} - static struct fi_ops sock_fab_fi_ops = { .size = sizeof(struct fi_ops), .close = sock_fabric_close, - .bind = sock_fabric_bind, - .control = sock_fabric_control, - .ops_open = sock_fabric_ops_open, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, }; static int sock_fabric(struct fi_fabric_attr *attr, @@ -179,8 +166,6 @@ static int sock_getinfo(uint32_t version, const char *node, const char *service, int ret; struct fi_info *_info, *tmp; - return -FI_ENODATA; - ret = sock_verify_info(hints); if (ret) return ret; @@ -193,6 +178,10 @@ static int sock_getinfo(uint32_t version, const char *node, const char *service, case FI_EP_DGRAM: return sock_dgram_getinfo(version, node, service, flags, hints, info); + + case FI_EP_MSG: + return sock_msg_getinfo(version, node, service, flags, + hints, info); default: break; } @@ -213,6 +202,18 @@ static int sock_getinfo(uint32_t version, const char *node, const char *service, ret = sock_dgram_getinfo(version, node, service, flags, hints, &_info); + if (ret == 0) { + *info = tmp = _info; + while(tmp->next != NULL) + tmp=tmp->next; + } else if (ret == -FI_ENODATA) { + tmp = NULL; + } else + return ret; + + ret = sock_msg_getinfo(version, node, service, flags, + hints, &_info); + if (NULL != tmp) { tmp->next = _info; return ret; @@ -222,25 +223,28 @@ static int sock_getinfo(uint32_t version, const char *node, const char *service, return ret; } +static void fi_sockets_fini(void) +{ +} + struct fi_provider sock_prov = { .name = "IP", .version = FI_VERSION(SOCK_MAJOR_VERSION, SOCK_MINOR_VERSION), + .fi_version = FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), .getinfo = sock_getinfo, .fabric = sock_fabric, + 
.cleanup = fi_sockets_fini }; -static void __attribute__((constructor)) sock_ini(void) + +SOCKETS_INI { - char *tmp = getenv("SFI_SOCK_DEBUG_LEVEL"); + char *tmp = getenv("OFI_SOCK_LOG_LEVEL"); if (tmp) { sock_log_level = atoi(tmp); } else { sock_log_level = SOCK_ERROR; } - (void) fi_register(&sock_prov); -} - -static void __attribute__((destructor)) sock_fini(void) -{ + return (&sock_prov); } diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_msg.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_msg.c new file mode 100644 index 0000000000..1377247c3e --- /dev/null +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_msg.c @@ -0,0 +1,634 @@ +/* + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "sock_util.h" + +static ssize_t sock_ep_recvmsg(struct fid_ep *ep, const struct fi_msg *msg, + uint64_t flags) +{ + int i; + struct sock_rx_ctx *rx_ctx; + struct sock_rx_entry *rx_entry; + struct sock_ep *sock_ep; + + switch (ep->fid.fclass) { + case FI_CLASS_EP: + sock_ep = container_of(ep, struct sock_ep, ep); + rx_ctx = sock_ep->rx_ctx; + break; + + case FI_CLASS_RX_CTX: + case FI_CLASS_SRX_CTX: + rx_ctx = container_of(ep, struct sock_rx_ctx, ctx); + break; + + default: + SOCK_LOG_ERROR("Invalid ep type\n"); + return -FI_EINVAL; + } + + assert(rx_ctx->enabled && msg->iov_count <= SOCK_EP_MAX_IOV_LIMIT); + + rx_entry = sock_rx_new_entry(rx_ctx); + if (!rx_entry) + return -FI_ENOMEM; + + flags |= rx_ctx->attr.op_flags; + rx_entry->rx_op.op = SOCK_OP_RECV; + rx_entry->rx_op.dest_iov_len = msg->iov_count; + + rx_entry->flags = flags; + rx_entry->context = (uint64_t)msg->context; + rx_entry->addr = msg->addr; + rx_entry->data = msg->data; + rx_entry->ignore = 0xFFFFFFFF; + + for (i=0; i< msg->iov_count; i++) { + rx_entry->iov[i].iov.addr = (uint64_t)msg->msg_iov[i].iov_base; + rx_entry->iov[i].iov.len = (uint64_t)msg->msg_iov[i].iov_len; + rx_entry->total_len += rx_entry->iov[i].iov.len; + } + + fastlock_acquire(&rx_ctx->lock); + + SOCK_LOG_INFO("New rx_entry: %p (ctx: %p)\n", rx_entry, rx_ctx); + + dlist_insert_tail(&rx_entry->entry, &rx_ctx->rx_entry_list); + fastlock_release(&rx_ctx->lock); + return 0; +} + +static ssize_t sock_ep_recv(struct fid_ep *ep, void *buf, size_t len, void *desc, + fi_addr_t src_addr, void *context) +{ + struct fi_msg msg; + struct iovec msg_iov; + + msg_iov.iov_base = buf; + msg_iov.iov_len = len; + + msg.msg_iov = &msg_iov; + msg.desc = &desc; + msg.iov_count = 1; + msg.addr = src_addr; + msg.context = context; + + return sock_ep_recvmsg(ep, &msg, 0); +} + +static ssize_t sock_ep_recvv(struct fid_ep *ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, + void *context) +{ + struct fi_msg msg; + + msg.msg_iov = iov; + msg.desc = desc; + msg.iov_count = count; + msg.addr = src_addr; + msg.context = context; + return sock_ep_recvmsg(ep, &msg, 0); +} + +static ssize_t sock_ep_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, + uint64_t flags) +{ + int ret, i; + uint64_t total_len; + struct sock_op tx_op; + union sock_iov tx_iov; + struct sock_conn *conn; + struct sock_tx_ctx *tx_ctx; + struct sock_ep *sock_ep; + + switch (ep->fid.fclass) { + case FI_CLASS_EP: + sock_ep = container_of(ep, struct sock_ep, ep); + tx_ctx = sock_ep->tx_ctx; + break; + + case FI_CLASS_TX_CTX: + tx_ctx = container_of(ep, struct sock_tx_ctx, ctx); + sock_ep = tx_ctx->ep; + break; + + default: + SOCK_LOG_ERROR("Invalid EP type\n"); + return -FI_EINVAL; + } + + assert(tx_ctx->enabled && msg->iov_count <= SOCK_EP_MAX_IOV_LIMIT); + + conn = sock_av_lookup_addr(tx_ctx->av, msg->addr); + assert(conn); + + SOCK_LOG_INFO("New sendmsg on TX: %p using conn: %p\n", + tx_ctx, conn); + + flags |= tx_ctx->attr.op_flags; + memset(&tx_op, 0, sizeof(struct sock_op)); + tx_op.op = SOCK_OP_SEND; + + total_len = 0; + if (flags & FI_INJECT) { + for (i=0; i< msg->iov_count; i++) { + total_len += msg->msg_iov[i].iov_len; + } + assert(total_len <= SOCK_EP_MAX_INJECT_SZ); + tx_op.src_iov_len = total_len; + } 
else { + tx_op.src_iov_len = msg->iov_count; + total_len = msg->iov_count * sizeof(union sock_iov); + } + + total_len += sizeof(struct sock_op_send); + + if (flags & FI_REMOTE_CQ_DATA) + total_len += sizeof(uint64_t); + + sock_tx_ctx_start(tx_ctx); + if (rbfdavail(&tx_ctx->rbfd) < total_len) { + ret = -FI_EAGAIN; + goto err; + } + + sock_tx_ctx_write(tx_ctx, &tx_op, sizeof(struct sock_op)); + sock_tx_ctx_write(tx_ctx, &flags, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &msg->context, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &msg->addr, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &conn, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &msg->msg_iov[0].iov_base, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &sock_ep, sizeof(uint64_t)); + + if (flags & FI_REMOTE_CQ_DATA) { + sock_tx_ctx_write(tx_ctx, &msg->data, sizeof(uint64_t)); + } + + if (flags & FI_INJECT) { + for (i=0; i< msg->iov_count; i++) { + sock_tx_ctx_write(tx_ctx, msg->msg_iov[i].iov_base, + msg->msg_iov[i].iov_len); + } + } else { + for (i=0; i< msg->iov_count; i++) { + tx_iov.iov.addr = (uint64_t)msg->msg_iov[i].iov_base; + tx_iov.iov.len = msg->msg_iov[i].iov_len; + sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(union sock_iov)); + } + } + + sock_tx_ctx_commit(tx_ctx); + return 0; + +err: + sock_tx_ctx_abort(tx_ctx); + return ret; +} + +static ssize_t sock_ep_send(struct fid_ep *ep, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, void *context) +{ + struct fi_msg msg; + struct iovec msg_iov; + + msg_iov.iov_base = (void*)buf; + msg_iov.iov_len = len; + msg.msg_iov = &msg_iov; + msg.desc = &desc; + msg.iov_count = 1; + msg.addr = dest_addr; + msg.context = context; + + return sock_ep_sendmsg(ep, &msg, 0); +} + +static ssize_t sock_ep_sendv(struct fid_ep *ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t dest_addr, + void *context) +{ + struct fi_msg msg; + msg.msg_iov = iov; + msg.desc = desc; + msg.iov_count = count; + msg.addr = dest_addr; + msg.context = context; + return sock_ep_sendmsg(ep, &msg, 0); +} + +static ssize_t sock_ep_senddata(struct fid_ep *ep, const void *buf, size_t len, + void *desc, uint64_t data, fi_addr_t dest_addr, + void *context) +{ + struct fi_msg msg; + struct iovec msg_iov; + + msg_iov.iov_base = (void*)buf; + msg_iov.iov_len = len; + + msg.msg_iov = &msg_iov; + msg.desc = desc; + msg.iov_count = 1; + msg.addr = dest_addr; + msg.context = context; + msg.data = data; + + return sock_ep_sendmsg(ep, &msg, FI_REMOTE_CQ_DATA); +} + +static ssize_t sock_ep_inject(struct fid_ep *ep, const void *buf, size_t len, + fi_addr_t dest_addr) +{ + struct fi_msg msg; + struct iovec msg_iov; + + msg_iov.iov_base = (void*)buf; + msg_iov.iov_len = len; + msg.msg_iov = &msg_iov; + msg.iov_count = 1; + msg.addr = dest_addr; + + return sock_ep_sendmsg(ep, &msg, FI_INJECT); +} + +struct fi_ops_msg sock_ep_msg_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = sock_ep_recv, + .recvv = sock_ep_recvv, + .recvmsg = sock_ep_recvmsg, + .send = sock_ep_send, + .sendv = sock_ep_sendv, + .sendmsg = sock_ep_sendmsg, + .inject = sock_ep_inject, + .senddata = sock_ep_senddata, +}; + +static ssize_t sock_ep_trecvmsg(struct fid_ep *ep, + const struct fi_msg_tagged *msg, uint64_t flags) +{ + int i; + struct sock_rx_ctx *rx_ctx; + struct sock_rx_entry *rx_entry; + struct sock_ep *sock_ep; + + switch (ep->fid.fclass) { + case FI_CLASS_EP: + sock_ep = container_of(ep, struct sock_ep, ep); + rx_ctx = sock_ep->rx_ctx; + break; + + case FI_CLASS_RX_CTX: + case FI_CLASS_SRX_CTX: + rx_ctx = 
container_of(ep, struct sock_rx_ctx, ctx); + break; + + default: + SOCK_LOG_ERROR("Invalid ep type\n"); + return -FI_EINVAL; + } + + assert(rx_ctx->enabled && msg->iov_count <= SOCK_EP_MAX_IOV_LIMIT); + + rx_entry = sock_rx_new_entry(rx_ctx); + if (!rx_entry) + return -FI_ENOMEM; + + flags |= rx_ctx->attr.op_flags; + rx_entry->rx_op.op = SOCK_OP_TRECV; + rx_entry->rx_op.dest_iov_len = msg->iov_count; + + rx_entry->flags = flags; + rx_entry->context = (uint64_t)msg->context; + rx_entry->addr = msg->addr; + rx_entry->data = msg->data; + rx_entry->tag = msg->tag; + rx_entry->ignore = msg->ignore; + + for (i=0; i< msg->iov_count; i++) { + rx_entry->iov[i].iov.addr = (uint64_t)msg->msg_iov[i].iov_base; + rx_entry->iov[i].iov.len = (uint64_t)msg->msg_iov[i].iov_len; + rx_entry->total_len += rx_entry->iov[i].iov.len; + } + + fastlock_acquire(&rx_ctx->lock); + dlist_insert_tail(&rx_entry->entry, &rx_ctx->rx_entry_list); + fastlock_release(&rx_ctx->lock); + return 0; +} + +static ssize_t sock_ep_trecv(struct fid_ep *ep, void *buf, size_t len, void *desc, + fi_addr_t src_addr, uint64_t tag, uint64_t ignore, void *context) +{ + struct fi_msg_tagged msg; + struct iovec msg_iov; + + msg_iov.iov_base = buf; + msg_iov.iov_len = len; + + msg.msg_iov = &msg_iov; + msg.desc = &desc; + msg.iov_count = 1; + msg.addr = src_addr; + msg.context = context; + msg.tag = tag; + msg.ignore = ignore; + + return sock_ep_trecvmsg(ep, &msg, 0); +} + +static ssize_t sock_ep_trecvv(struct fid_ep *ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, + uint64_t tag, uint64_t ignore, void *context) +{ + struct fi_msg_tagged msg; + + msg.msg_iov = iov; + msg.desc = desc; + msg.iov_count = count; + msg.addr = src_addr; + msg.context = context; + msg.tag = tag; + msg.ignore = ignore; + return sock_ep_trecvmsg(ep, &msg, 0); +} + +static ssize_t sock_ep_tsendmsg(struct fid_ep *ep, + const struct fi_msg_tagged *msg, uint64_t flags) +{ + int ret, i; + uint64_t total_len; + struct sock_op tx_op; + union sock_iov tx_iov; + struct sock_conn *conn; + struct sock_tx_ctx *tx_ctx; + struct sock_ep *sock_ep; + + switch (ep->fid.fclass) { + case FI_CLASS_EP: + sock_ep = container_of(ep, struct sock_ep, ep); + tx_ctx = sock_ep->tx_ctx; + break; + + case FI_CLASS_TX_CTX: + tx_ctx = container_of(ep, struct sock_tx_ctx, ctx); + sock_ep = tx_ctx->ep; + break; + + default: + SOCK_LOG_ERROR("Invalid EP type\n"); + return -FI_EINVAL; + } + + assert(tx_ctx->enabled && msg->iov_count <= SOCK_EP_MAX_IOV_LIMIT); + conn = sock_av_lookup_addr(tx_ctx->av, msg->addr); + assert(conn); + + total_len = 0; + if (flags & FI_INJECT) { + for (i=0; i< msg->iov_count; i++) { + total_len += msg->msg_iov[i].iov_len; + } + assert(total_len <= SOCK_EP_MAX_INJECT_SZ); + } else { + total_len = msg->iov_count * sizeof(union sock_iov); + } + + total_len += sizeof(struct sock_op_tsend); + if (flags & FI_REMOTE_CQ_DATA) + total_len += sizeof(uint64_t); + + sock_tx_ctx_start(tx_ctx); + if (rbfdavail(&tx_ctx->rbfd) < total_len) { + ret = -FI_EAGAIN; + goto err; + } + + flags |= tx_ctx->attr.op_flags; + memset(&tx_op, 0, sizeof(struct sock_op)); + tx_op.op = SOCK_OP_TSEND; + tx_op.src_iov_len = msg->iov_count; + + sock_tx_ctx_write(tx_ctx, &tx_op, sizeof(struct sock_op)); + sock_tx_ctx_write(tx_ctx, &flags, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &msg->context, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &msg->addr, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &conn, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &msg->msg_iov[0].iov_base, 
sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &sock_ep, sizeof(uint64_t)); + + if (flags & FI_REMOTE_CQ_DATA) { + sock_tx_ctx_write(tx_ctx, &msg->data, sizeof(uint64_t)); + } + sock_tx_ctx_write(tx_ctx, &msg->tag, sizeof(uint64_t)); + + if (flags & FI_INJECT) { + for (i=0; i< msg->iov_count; i++) { + sock_tx_ctx_write(tx_ctx, msg->msg_iov[i].iov_base, + msg->msg_iov[i].iov_len); + } + } else { + for (i=0; i< msg->iov_count; i++) { + tx_iov.iov.addr = (uint64_t)msg->msg_iov[i].iov_base; + tx_iov.iov.len = msg->msg_iov[i].iov_len; + sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(union sock_iov)); + } + } + + sock_tx_ctx_commit(tx_ctx); + return 0; + +err: + sock_tx_ctx_abort(tx_ctx); + return ret; +} + +static ssize_t sock_ep_tsend(struct fid_ep *ep, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, uint64_t tag, void *context) +{ + struct fi_msg_tagged msg; + struct iovec msg_iov; + + msg_iov.iov_base = (void*)buf; + msg_iov.iov_len = len; + msg.msg_iov = &msg_iov; + msg.desc = &desc; + msg.iov_count = 1; + msg.addr = dest_addr; + msg.context = context; + msg.tag = tag; + + return sock_ep_tsendmsg(ep, &msg, 0); +} + +static ssize_t sock_ep_tsendv(struct fid_ep *ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t dest_addr, + uint64_t tag, void *context) +{ + struct fi_msg_tagged msg; + msg.msg_iov = iov; + msg.desc = desc; + msg.iov_count = count; + msg.addr = dest_addr; + msg.context = context; + msg.tag = tag; + return sock_ep_tsendmsg(ep, &msg, 0); +} + +static ssize_t sock_ep_tsenddata(struct fid_ep *ep, const void *buf, size_t len, + void *desc, uint64_t data, fi_addr_t dest_addr, uint64_t tag, + void *context) +{ + struct fi_msg_tagged msg; + struct iovec msg_iov; + + msg_iov.iov_base = (void*)buf; + msg_iov.iov_len = len; + msg.msg_iov = &msg_iov; + msg.desc = desc; + msg.iov_count = 1; + msg.addr = dest_addr; + msg.context = context; + msg.data = data; + msg.tag = tag; + + return sock_ep_tsendmsg(ep, &msg, FI_REMOTE_CQ_DATA); +} + +static ssize_t sock_ep_tinject(struct fid_ep *ep, const void *buf, size_t len, + fi_addr_t dest_addr, uint64_t tag) +{ + struct fi_msg_tagged msg; + struct iovec msg_iov; + + msg_iov.iov_base = (void*)buf; + msg_iov.iov_len = len; + msg.msg_iov = &msg_iov; + msg.iov_count = 1; + msg.addr = dest_addr; + msg.tag = tag; + return sock_ep_tsendmsg(ep, &msg, FI_INJECT); +} + +static ssize_t sock_ep_tsearch(struct fid_ep *ep, uint64_t *tag, uint64_t ignore, + uint64_t flags, fi_addr_t *src_addr, size_t *len, + void *context) +{ + ssize_t ret; + struct dlist_entry *entry; + struct sock_rx_ctx *rx_ctx; + struct sock_rx_entry *rx_entry; + struct sock_ep *sock_ep; + + switch (ep->fid.fclass) { + case FI_CLASS_EP: + sock_ep = container_of(ep, struct sock_ep, ep); + rx_ctx = sock_ep->rx_ctx; + break; + + case FI_CLASS_RX_CTX: + case FI_CLASS_SRX_CTX: + rx_ctx = container_of(ep, struct sock_rx_ctx, ctx); + break; + + default: + SOCK_LOG_ERROR("Invalid ep type\n"); + return -FI_EINVAL; + } + + fastlock_acquire(&rx_ctx->lock); + for (entry = rx_ctx->rx_buffered_list.next; + entry != &rx_ctx->rx_buffered_list; entry = entry->next) { + + rx_entry = container_of(entry, struct sock_rx_entry, entry); + if (rx_entry->is_busy || rx_entry->is_claimed) + continue; + + if (((rx_entry->tag & ~rx_entry->ignore) == + (*tag & ~rx_entry->ignore)) && + (rx_entry->addr == FI_ADDR_UNSPEC || + rx_entry->addr == *src_addr)) { + + if (flags & FI_CLAIM) + rx_entry->is_claimed = 1; + *tag = rx_entry->tag; + *src_addr = rx_entry->addr; + *len = rx_entry->used; + ret = 
1; + break; + } + } + + if (entry == &rx_ctx->rx_entry_list) + ret = -FI_ENOENT; + + fastlock_release(&rx_ctx->lock); + return ret; +} + + +struct fi_ops_tagged sock_ep_tagged = { + .size = sizeof(struct fi_ops_tagged), + .recv = sock_ep_trecv, + .recvv = sock_ep_trecvv, + .recvmsg = sock_ep_trecvmsg, + .send = sock_ep_tsend, + .sendv = sock_ep_tsendv, + .sendmsg = sock_ep_tsendmsg, + .inject = sock_ep_tinject, + .senddata = sock_ep_tsenddata, + .search = sock_ep_tsearch, +}; + diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_poll.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_poll.c index c6427fdaa5..41774e34b9 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_poll.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_poll.c @@ -38,26 +38,169 @@ #include #include "sock.h" +#include "sock_util.h" -//static struct fi_ops sock_wait_fi_ops = { -// .size = sizeof(struct fi_ops), -// .close = sock_wait_close, -//}; -// -//static struct fi_ops sock_poll_fi_ops = { -// .size = sizeof(struct fi_ops), -// .close = sock_poll_close, -//}; - -int sock_wait_open(struct fid_domain *domain, struct fi_wait_attr *attr, - struct fid_wait **waitset) +int sock_poll_add(struct fid_poll *pollset, struct fid *event_fid, + uint64_t flags) { - return -FI_ENOSYS; /* TODO */ + struct sock_poll *poll; + struct sock_fid_list *list_item; + + poll = container_of(pollset, struct sock_poll, poll_fid.fid); + list_item = calloc(1, sizeof(*list_item)); + if (!list_item) + return -FI_ENOMEM; + + list_item->fid = event_fid; + dlist_init(&list_item->entry); + dlist_insert_after(&list_item->entry, &poll->fid_list); + return 0; +} + +int sock_poll_del(struct fid_poll *pollset, struct fid *event_fid, + uint64_t flags) +{ + struct sock_poll *poll; + struct sock_fid_list *list_item; + struct dlist_entry *p, *head; + + poll = container_of(pollset, struct sock_poll, poll_fid.fid); + head = &poll->fid_list; + for (p = head->next; p != head; p = p->next) { + list_item = container_of(p, struct sock_fid_list, entry); + if (list_item->fid == event_fid) { + dlist_remove(p); + free(list_item); + break; + } + } + return 0; +} + +static int sock_poll_poll(struct fid_poll *pollset, void **context, int count) +{ + struct sock_poll *poll; + struct sock_cq *cq; + struct sock_eq *eq; + struct sock_cntr *cntr; + struct sock_fid_list *list_item; + struct dlist_entry *p, *head; + int ret_count = 0; + + poll = container_of(pollset, struct sock_poll, poll_fid.fid); + head = &poll->fid_list; + + for (p = head->next; p != head && ret_count < count; p = p->next) { + list_item = container_of(p, struct sock_fid_list, entry); + switch (list_item->fid->fclass) { + case FI_CLASS_CQ: + cq = container_of(list_item->fid, struct sock_cq, cq_fid); + if (cq->domain->progress_mode == FI_PROGRESS_MANUAL) + sock_cq_progress(cq); + fastlock_acquire(&cq->lock); + if (rbfdused(&cq->cq_rbfd)) { + *context++ = cq->cq_fid.fid.context; + ret_count++; + } + fastlock_release(&cq->lock); + break; + + case FI_CLASS_CNTR: + cntr = container_of(list_item->fid, struct sock_cntr, cntr_fid); + if (cntr->domain->progress_mode == FI_PROGRESS_MANUAL) + sock_cntr_progress(cntr); + fastlock_acquire(&cntr->mut); + if (atomic_get(&cntr->value) >= atomic_get(&cntr->threshold)) { + *context++ = cntr->cntr_fid.fid.context; + ret_count++; + } + fastlock_release(&cntr->mut); + break; + + case FI_CLASS_EQ: + eq = container_of(list_item->fid, struct sock_eq, eq); + fastlock_acquire(&eq->lock); + if (!dlistfd_empty(&eq->list)) { + 
*context++ = eq->eq.fid.context; + ret_count++; + } + fastlock_release(&eq->lock); + break; + + default: + break; + } + } + + return ret_count; +} + +static int sock_poll_close(fid_t fid) +{ + struct sock_poll *poll; + struct sock_fid_list *list_item; + struct dlist_entry *p, *head; + + poll = container_of(fid, struct sock_poll, poll_fid.fid); + + head = &poll->fid_list; + while (!dlist_empty(head)) { + p = head->next; + list_item = container_of(p, struct sock_fid_list, entry); + dlist_remove(p); + free(list_item); + } + + atomic_dec(&poll->domain->ref); + free(poll); + return 0; +} + +static struct fi_ops sock_poll_fi_ops = { + .size = sizeof(struct fi_ops), + .close = sock_poll_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_ops_poll sock_poll_ops = { + .size = sizeof(struct fi_ops_poll), + .poll = sock_poll_poll, + .poll_add = sock_poll_add, + .poll_del = sock_poll_del, +}; + +static int sock_poll_verify_attr(struct fi_poll_attr *attr) +{ + if (attr->flags) + return -FI_ENODATA; + return 0; } int sock_poll_open(struct fid_domain *domain, struct fi_poll_attr *attr, struct fid_poll **pollset) { - return -FI_ENOSYS; /* TODO */ + struct sock_domain *dom; + struct sock_poll *poll; + + if (attr && sock_poll_verify_attr(attr)) + return -FI_EINVAL; + + dom = container_of(domain, struct sock_domain, dom_fid); + poll = calloc(1, sizeof(*poll)); + if (!poll) + return -FI_ENOMEM; + + dlist_init(&poll->fid_list); + poll->poll_fid.fid.fclass = FI_CLASS_POLL; + poll->poll_fid.fid.context = 0; + poll->poll_fid.fid.ops = &sock_poll_fi_ops; + poll->poll_fid.ops = &sock_poll_ops; + poll->domain = dom; + atomic_inc(&dom->ref); + + *pollset = &poll->poll_fid; + return 0; } diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_progress.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_progress.c new file mode 100644 index 0000000000..7813faec73 --- /dev/null +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_progress.c @@ -0,0 +1,2462 @@ +/* + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sock.h" +#include "sock_util.h" + + +#define PE_INDEX(_pe, _e) (_e - &_pe->pe_table[0]) +#define SOCK_GET_RX_ID(_addr, _bits) (((uint64_t)_addr) >> (64 - _bits)) + + +static void sock_pe_release_entry(struct sock_pe *pe, + struct sock_pe_entry *pe_entry) +{ + dlist_remove(&pe_entry->ctx_entry); + + if (pe_entry->type == SOCK_PE_TX) + pe_entry->conn->tx_pe_entry = NULL; + else + pe_entry->conn->rx_pe_entry = NULL; + + pe_entry->conn = NULL; + memset(&pe_entry->rx, 0, sizeof(struct sock_rx_pe_entry)); + memset(&pe_entry->tx, 0, sizeof(struct sock_tx_pe_entry)); + + pe_entry->type =0; + pe_entry->is_complete = 0; + pe_entry->done_len = 0; + pe_entry->total_len = 0; + pe_entry->data_len = 0; + pe_entry->buf = 0; + + dlist_remove(&pe_entry->entry); + dlist_insert_tail(&pe_entry->entry, &pe->free_list); + SOCK_LOG_INFO("progress entry %p released\n", pe_entry); +} + +static struct sock_pe_entry *sock_pe_acquire_entry(struct sock_pe *pe) +{ + struct dlist_entry *entry; + struct sock_pe_entry *pe_entry; + + entry = pe->free_list.next; + pe_entry = container_of(entry, struct sock_pe_entry, entry); + dlist_remove(&pe_entry->entry); + dlist_insert_tail(&pe_entry->entry, &pe->busy_list); + SOCK_LOG_INFO("progress entry %p acquired \n", pe_entry); + return pe_entry; +} + +static void sock_pe_report_tx_completion(struct sock_pe_entry *pe_entry) +{ + int ret1 = 0, ret2 = 0; + + if (pe_entry->comp->send_cq && + (!pe_entry->comp->send_cq_event || + (pe_entry->comp->send_cq_event && + (pe_entry->msg_hdr.flags & FI_COMPLETION)))) + ret1 = pe_entry->comp->send_cq->report_completion( + pe_entry->comp->send_cq, pe_entry->addr, pe_entry); + + if (pe_entry->comp->send_cntr) + ret2 = sock_cntr_inc(pe_entry->comp->send_cntr); + + + if (ret1 < 0 || ret2 < 0) { + SOCK_LOG_ERROR("Failed to report completion %p\n", + pe_entry); + if (pe_entry->comp->eq) { + sock_eq_report_error( + pe_entry->comp->eq, + &pe_entry->comp->send_cntr->cntr_fid.fid, + pe_entry->comp->send_cntr->cntr_fid.fid.context, + -FI_ENOSPC, -FI_ENOSPC, NULL); + } + } +} + +static void sock_pe_report_rx_completion(struct sock_pe_entry *pe_entry) +{ + int ret1 = 0, ret2 = 0; + + if (pe_entry->comp->recv_cq && + (!pe_entry->comp->recv_cq_event || + (pe_entry->comp->recv_cq_event && + (pe_entry->msg_hdr.flags & FI_COMPLETION)))) + ret1 = pe_entry->comp->recv_cq->report_completion( + pe_entry->comp->recv_cq, pe_entry->addr, + pe_entry); + + if (pe_entry->comp->recv_cntr) + ret2 = sock_cntr_inc(pe_entry->comp->recv_cntr); + + + if (ret1 < 0 || ret2 < 0) { + SOCK_LOG_ERROR("Failed to report completion %p\n", pe_entry); + if (pe_entry->comp->eq) { + sock_eq_report_error( + pe_entry->comp->eq, + &pe_entry->comp->recv_cq->cq_fid.fid, + pe_entry->comp->recv_cq->cq_fid.fid.context, + -FI_ENOSPC, -FI_ENOSPC, NULL); + } + } +} + +void sock_pe_report_mr_completion(struct sock_domain *domain, + struct sock_pe_entry *pe_entry) +{ + int i; + struct sock_mr *mr; + + for (i = 0; i < pe_entry->msg_hdr.dest_iov_len; i++) { + mr = sock_mr_get_entry(domain, pe_entry->rx.rx_iov[i].iov.key); + if (!mr || (!mr->cq && !mr->cntr)) + continue; + + pe_entry->buf = pe_entry->rx.rx_iov[i].iov.addr; + pe_entry->data_len = pe_entry->rx.rx_iov[i].iov.len; + + if (mr->cq) + mr->cq->report_completion(mr->cq, + pe_entry->addr, pe_entry); + if 
(mr->cntr) + sock_cntr_inc(mr->cntr); + } +} + +void sock_pe_report_remote_write(struct sock_rx_ctx *rx_ctx, + struct sock_pe_entry *pe_entry) +{ + pe_entry->buf = pe_entry->rx.rx_iov[0].iov.addr; + pe_entry->data_len = pe_entry->rx.rx_iov[0].iov.len; + + if ((!pe_entry->comp->rem_write_cq && !pe_entry->comp->rem_write_cntr && + !(rx_ctx->attr.op_flags & FI_REMOTE_WRITE))) + return; + + if (pe_entry->comp->rem_write_cq) { + if (pe_entry->comp->rem_write_cq_event) { + if ( pe_entry->flags & FI_COMPLETION) + pe_entry->comp->rem_write_cq->report_completion( + pe_entry->comp->rem_write_cq, + pe_entry->addr, pe_entry); + } else { + pe_entry->comp->rem_write_cq->report_completion( + pe_entry->comp->rem_write_cq, + pe_entry->addr, pe_entry); + } + } + + if (pe_entry->comp->rem_write_cntr) + sock_cntr_inc(pe_entry->comp->rem_write_cntr); +} + +void sock_pe_report_remote_read(struct sock_rx_ctx *rx_ctx, + struct sock_pe_entry *pe_entry) +{ + pe_entry->buf = pe_entry->rx.rx_iov[0].iov.addr; + pe_entry->data_len = pe_entry->rx.rx_iov[0].iov.len; + + if ((!pe_entry->comp->rem_read_cq && !pe_entry->comp->rem_read_cntr && + !(rx_ctx->attr.op_flags & FI_REMOTE_READ))) + return; + + if (pe_entry->comp->rem_read_cq) { + if (pe_entry->comp->rem_read_cq_event) { + if ( pe_entry->flags & FI_COMPLETION) + pe_entry->comp->rem_read_cq->report_completion( + pe_entry->comp->rem_read_cq, + pe_entry->addr, pe_entry); + } else { + pe_entry->comp->rem_read_cq->report_completion( + pe_entry->comp->rem_read_cq, + pe_entry->addr, pe_entry); + } + } + + if (pe_entry->comp->rem_read_cntr) + sock_cntr_inc(pe_entry->comp->rem_read_cntr); +} + +static void sock_pe_report_error(struct sock_pe_entry *pe_entry, int rem) +{ + if (pe_entry->comp->recv_cntr) + sock_cntr_err_inc(pe_entry->comp->recv_cntr); + if (pe_entry->comp->recv_cq) + sock_cq_report_error(pe_entry->comp->recv_cq, pe_entry, rem, + -FI_ENOSPC, -FI_ENOSPC, NULL); +} + +static void sock_pe_progress_pending_ack(struct sock_pe *pe, + struct sock_pe_entry *pe_entry) +{ + int ret, offset, len, data_len, done_data, i; + struct sock_conn *conn = pe_entry->conn; + + assert(conn); + if (conn->tx_pe_entry != NULL && conn->tx_pe_entry != pe_entry) { + SOCK_LOG_INFO("Cannot progress %p as conn %p is being used by %p\n", + pe_entry, conn, conn->tx_pe_entry); + return; + } + + if (conn->tx_pe_entry == NULL) { + SOCK_LOG_INFO("Connection %p grabbed by %p\n", conn, pe_entry); + conn->tx_pe_entry = pe_entry; + } + + len = sizeof(struct sock_msg_response); + if (pe_entry->done_len < len) { + offset = pe_entry->done_len; + + ret = sock_comm_send(conn, + (char*)&pe_entry->rx.response + offset, + sizeof(struct sock_msg_response) - offset); + if (ret <= 0) + return; + pe_entry->done_len += ret; + if (pe_entry->done_len != len) + return; + } + + switch (pe_entry->rx.response.msg_hdr.op_type) { + case SOCK_OP_READ_COMPLETE: + + done_data = pe_entry->done_len - len; + + for (i = 0; i < pe_entry->msg_hdr.dest_iov_len; i++) { + if (done_data >= pe_entry->rx.rx_iov[i].iov.len) { + done_data -= pe_entry->rx.rx_iov[i].iov.len; + continue; + } + + offset = done_data; + data_len = pe_entry->rx.rx_iov[i].iov.len - done_data; + + ret = sock_comm_send(conn, + (char*)pe_entry->rx.rx_iov[i].iov.addr + + offset, data_len); + if (ret <= 0) + return; + done_data = 0; + pe_entry->done_len += ret; + if (ret != data_len) + return; + } + + break; + + case SOCK_OP_ATOMIC_COMPLETE: + + offset = pe_entry->done_len - len; + data_len = (pe_entry->total_len - len) - offset; + + if (data_len) { + ret = 
sock_comm_send(conn, + (char*)&pe_entry->rx.atomic_cmp[0] + offset, + data_len); + if (ret <= 0) + return; + pe_entry->done_len += ret; + if (ret != data_len) + return; + } + break; + + default: + break; + } + + if (pe_entry->total_len == pe_entry->done_len) { + pe_entry->is_complete = 1; + pe_entry->rx.pending_send = 0; + sock_comm_flush(pe_entry->conn); + pe_entry->conn->tx_pe_entry = NULL; + } +} + +static void sock_pe_send_response(struct sock_pe *pe, + struct sock_pe_entry *pe_entry, + size_t data_len, uint8_t op_type) +{ + struct sock_msg_response *response = &pe_entry->rx.response; + memset(response, 0, sizeof(struct sock_msg_response)); + + response->pe_entry_id = htons(pe_entry->msg_hdr.pe_entry_id); + response->msg_hdr.dest_iov_len = 0; + response->msg_hdr.flags = 0; + response->msg_hdr.msg_len = sizeof(*response) + data_len; + response->msg_hdr.version = SOCK_WIRE_PROTO_VERSION; + response->msg_hdr.op_type = op_type; + response->msg_hdr.msg_len = htonll(response->msg_hdr.msg_len); + response->msg_hdr.rx_id = htons(pe_entry->msg_hdr.rx_id); + + pe->pe_atomic = NULL; + pe_entry->done_len = 0; + pe_entry->rx.pending_send = 1; + pe_entry->conn->rx_pe_entry = NULL; + pe_entry->total_len = sizeof(*response) + data_len; + + sock_pe_progress_pending_ack(pe, pe_entry); +} + +static int sock_pe_handle_ack(struct sock_pe *pe, struct sock_pe_entry *pe_entry) +{ + struct sock_pe_entry *waiting_entry; + struct sock_msg_response response; + int ret, len, offset, data_len; + + len = sizeof(struct sock_msg_hdr); + offset = pe_entry->done_len - len; + data_len = sizeof(response) - len; + + ret = sock_comm_recv(pe_entry->conn, + (char*)&response.pe_entry_id + offset, + data_len - offset); + if (ret <= 0) + return ret; + + pe_entry->done_len += ret; + if (pe_entry->done_len != sizeof(response)) + return 0; + + response.pe_entry_id = ntohs(response.pe_entry_id); + assert(response.pe_entry_id <= SOCK_PE_MAX_ENTRIES); + waiting_entry = &pe->pe_table[response.pe_entry_id]; + SOCK_LOG_INFO("Received ack for PE entry %p (index: %d)\n", + waiting_entry, response.pe_entry_id); + + assert(waiting_entry->type == SOCK_PE_TX); + sock_pe_report_tx_completion(waiting_entry); + waiting_entry->is_complete = 1; + pe_entry->is_complete = 1; + return 0; +} + +static int sock_pe_handle_read_complete(struct sock_pe *pe, + struct sock_pe_entry *pe_entry) +{ + struct sock_pe_entry *waiting_entry; + struct sock_msg_response response; + int ret, len, offset, done_data, i, data_len; + + len = sizeof(struct sock_msg_hdr); + offset = pe_entry->done_len - len; + data_len = sizeof(response) - len; + len += data_len; + + if (pe_entry->done_len < len) { + ret = sock_comm_recv(pe_entry->conn, + (char*)&response.pe_entry_id + offset, + data_len - offset); + if (ret <= 0) + return ret; + + pe_entry->done_len += ret; + if (pe_entry->done_len != len) + return 0; + + response.pe_entry_id = ntohs(response.pe_entry_id); + assert(response.pe_entry_id <= SOCK_PE_MAX_ENTRIES); + waiting_entry = &pe->pe_table[response.pe_entry_id]; + SOCK_LOG_INFO("Received read complete for PE entry %p (index: %d)\n", + waiting_entry, response.pe_entry_id); + } + + waiting_entry = &pe->pe_table[response.pe_entry_id]; + assert(waiting_entry->type == SOCK_PE_TX); + + done_data = pe_entry->done_len - len; + for (i=0; i < waiting_entry->tx.tx_op.dest_iov_len; i++) { + + if (done_data >= waiting_entry->tx.tx_iov[i].dst.iov.len) { + done_data -= waiting_entry->tx.tx_iov[i].dst.iov.len; + continue; + } + + data_len = waiting_entry->tx.tx_iov[i].dst.iov.len - 
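/*
 * Illustrative sketch (not part of the provider sources or this patch):
 * the resume-from-offset pattern the ack/completion handlers above rely
 * on.  Each PE entry records how many bytes of its wire message have
 * already moved (done_len); every progress pass retries only the
 * remainder and bails out when the socket would block.  This sketch uses
 * plain non-blocking send(2) instead of the provider's sock_comm layer,
 * and the struct/function names are made up for illustration.
 */
#include <errno.h>
#include <stddef.h>
#include <sys/socket.h>
#include <sys/types.h>

struct xfer {
	const char *buf;	/* full wire message */
	size_t total_len;	/* bytes that must eventually be sent */
	size_t done_len;	/* bytes accepted by the socket so far */
};

/* Returns 1 when the whole message is out, 0 to retry on a later pass,
 * -1 on a hard socket error. */
static int xfer_progress_send(int fd, struct xfer *x)
{
	ssize_t ret;

	while (x->done_len < x->total_len) {
		ret = send(fd, x->buf + x->done_len,
			   x->total_len - x->done_len, MSG_DONTWAIT);
		if (ret > 0) {
			x->done_len += ret;	/* resume point next pass */
			continue;
		}
		if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
			return 0;		/* would block: try later */
		return -1;
	}
	return 1;
}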
done_data; + offset = done_data; + + ret = sock_comm_recv(pe_entry->conn, + (char*)waiting_entry->tx.tx_iov[i].dst.iov.addr + + offset, data_len); + if (ret <= 0) + return 0; + + done_data = 0; + pe_entry->done_len += ret; + if ( ret != data_len) + return 0; + } + + sock_pe_report_tx_completion(waiting_entry); + waiting_entry->is_complete = 1; + pe_entry->is_complete = 1; + return 0; +} + +static int sock_pe_handle_atomic_complete(struct sock_pe *pe, + struct sock_pe_entry *pe_entry) +{ + size_t datatype_sz; + struct sock_pe_entry *waiting_entry; + struct sock_msg_response response; + int ret, len, offset, done_data, i, data_len; + + len = sizeof(struct sock_msg_hdr); + offset = pe_entry->done_len - len; + data_len = sizeof(response) - len; + len += data_len; + + if (pe_entry->done_len < len) { + ret = sock_comm_recv(pe_entry->conn, + (char*)&response.pe_entry_id + offset, + data_len - offset); + if (ret <= 0) + return ret; + + pe_entry->done_len += ret; + if (pe_entry->done_len != len) + return 0; + + response.pe_entry_id = ntohs(response.pe_entry_id); + assert(response.pe_entry_id <= SOCK_PE_MAX_ENTRIES); + waiting_entry = &pe->pe_table[response.pe_entry_id]; + SOCK_LOG_INFO("Received atomic complete for PE entry %p (index: %d)\n", + waiting_entry, response.pe_entry_id); + } + + waiting_entry = &pe->pe_table[response.pe_entry_id]; + assert(waiting_entry->type == SOCK_PE_TX); + + done_data = pe_entry->done_len - len; + datatype_sz = fi_datatype_size(waiting_entry->tx.tx_op.atomic.datatype); + + for (i=0; i < waiting_entry->tx.tx_op.atomic.res_iov_len; i++) { + if (done_data >= waiting_entry->tx.tx_iov[i].res.ioc.count * + datatype_sz) { + done_data -= waiting_entry->tx.tx_iov[i].res.ioc.count * datatype_sz; + continue; + } + + data_len = (waiting_entry->tx.tx_iov[i].res.ioc.count * datatype_sz) - + done_data; + offset = done_data; + ret = sock_comm_recv(pe_entry->conn, + (char*)waiting_entry->tx.tx_iov[i].res.ioc.addr + + offset, data_len); + if (ret <= 0) + return 0; + + done_data = 0; + pe_entry->done_len += ret; + if ( ret != data_len) + return 0; + } + + sock_pe_report_tx_completion(waiting_entry); + waiting_entry->is_complete = 1; + pe_entry->is_complete = 1; + return 0; +} + + +static int sock_pe_process_rx_read(struct sock_pe *pe, struct sock_rx_ctx *rx_ctx, + struct sock_pe_entry *pe_entry) +{ + int i, ret; + struct sock_mr *mr; + uint64_t offset, len, entry_len, data_len; + + offset = 0; + len = sizeof(struct sock_msg_hdr); + + entry_len = sizeof(union sock_iov) * pe_entry->msg_hdr.dest_iov_len; + offset = pe_entry->done_len - len; + len += entry_len; + + if (pe_entry->done_len < len) { + ret = sock_comm_recv(pe_entry->conn, + (char *)&pe_entry->rx.rx_iov[0] + offset, + entry_len - offset); + if (ret <= 0) + return ret; + pe_entry->done_len += ret; + if (ret != entry_len - offset) { + SOCK_LOG_INFO("Incomplete Recv: %d\n", ret); + return 0; + } + } else { + return 0; + } + + if (pe_entry->done_len != len) + return 0; + + /* verify mr */ + data_len = 0; + for (i = 0; i < pe_entry->msg_hdr.dest_iov_len; i++) { + + mr = sock_mr_verify_key(rx_ctx->domain, + pe_entry->rx.rx_iov[i].iov.key, + (void*)pe_entry->rx.rx_iov[i].iov.addr, + pe_entry->rx.rx_iov[i].iov.len, + FI_REMOTE_READ); + if (!mr) { + SOCK_LOG_ERROR("Remote memory access error: %p, %lu, %lu\n", + (void*)pe_entry->rx.rx_iov[i].iov.addr, + pe_entry->rx.rx_iov[i].iov.len, + pe_entry->rx.rx_iov[i].iov.key); + sock_pe_send_response(pe, pe_entry, 0, SOCK_OP_READ_ERROR); + return -FI_EINVAL; + } + + if (mr->flags & FI_MR_OFFSET) 
+ pe_entry->rx.rx_iov[i].iov.addr += mr->offset; + data_len += pe_entry->rx.rx_iov[i].iov.len; + } + + pe_entry->buf = pe_entry->rx.rx_iov[0].iov.addr; + pe_entry->data_len = data_len; + + if (pe_entry->flags & FI_REMOTE_SIGNAL) { + sock_pe_report_rx_completion(pe_entry); + } + + sock_pe_report_remote_read(rx_ctx, pe_entry); + sock_pe_send_response(pe, pe_entry, data_len, + SOCK_OP_READ_COMPLETE); + return ret; +} + +static int sock_pe_process_rx_write(struct sock_pe *pe, struct sock_rx_ctx *rx_ctx, + struct sock_pe_entry *pe_entry) +{ + int i, ret = 0; + struct sock_mr *mr; + uint64_t offset, rem, len, entry_len, done_data, data_len; + + offset = 0; + len = sizeof(struct sock_msg_hdr); + if (pe_entry->msg_hdr.flags & FI_REMOTE_CQ_DATA) { + offset = pe_entry->done_len - len; + len += SOCK_CQ_DATA_SIZE; + if (pe_entry->done_len < len) { + ret = sock_comm_recv(pe_entry->conn, + (char*)&pe_entry->data + offset, + SOCK_CQ_DATA_SIZE - offset); + if (ret <= 0) + return ret; + pe_entry->done_len += ret; + if (pe_entry->done_len != len) + return 0; + } + } + + entry_len = sizeof(union sock_iov) * pe_entry->msg_hdr.dest_iov_len; + offset = pe_entry->done_len - len; + len += entry_len; + if (pe_entry->done_len < len) { + + ret = sock_comm_recv(pe_entry->conn, + (char *)&pe_entry->rx.rx_iov[0] + offset, + entry_len - offset); + if (ret <= 0) + return ret; + pe_entry->done_len += ret; + if (ret != entry_len - offset) { + SOCK_LOG_INFO("Incomplete Recv: %d\n", ret); + return 0; + } + } + + done_data = pe_entry->done_len - len; + rem = pe_entry->msg_hdr.msg_len - (len + done_data); + + for (i = 0; rem > 0 && i < pe_entry->msg_hdr.dest_iov_len; i++) { + + if (done_data >= pe_entry->rx.rx_iov[i].iov.len) { + done_data -= pe_entry->rx.rx_iov[i].iov.len; + continue; + } + + data_len = pe_entry->rx.rx_iov[i].iov.len - done_data; + offset = done_data; + + mr = sock_mr_verify_key(rx_ctx->domain, + pe_entry->rx.rx_iov[i].iov.key, + (void*)pe_entry->rx.rx_iov[i].iov.addr, + pe_entry->rx.rx_iov[i].iov.len, + FI_REMOTE_WRITE); + if (!mr) { + SOCK_LOG_ERROR("Remote memory access error: %p, %lu, %lu\n", + (void*)pe_entry->rx.rx_iov[i].iov.addr, + pe_entry->rx.rx_iov[i].iov.len, + pe_entry->rx.rx_iov[i].iov.key); + sock_pe_send_response(pe, pe_entry, 0, SOCK_OP_WRITE_ERROR); + break; + } + if (mr->flags & FI_MR_OFFSET) + pe_entry->rx.rx_iov[i].iov.addr += mr->offset; + + ret = sock_comm_recv(pe_entry->conn, + (char*)pe_entry->rx.rx_iov[i].iov.addr + offset, + data_len); + if (ret <= 0) + return ret; + + done_data = 0; + rem -= ret; + pe_entry->done_len += ret; + if (ret != data_len){ + SOCK_LOG_INFO("Incomplete Recv\n"); + return 0; + } + } + pe_entry->buf = pe_entry->rx.rx_iov[0].iov.addr; + + pe_entry->data_len = 0; + for (i = 0; i < pe_entry->msg_hdr.dest_iov_len; i++) { + pe_entry->data_len += pe_entry->rx.rx_iov[i].iov.len; + } + + /* report error, if any */ + if (rem) { + sock_pe_report_error(pe_entry, rem); + goto out; + } else { + if (pe_entry->flags & FI_REMOTE_SIGNAL) { + sock_pe_report_rx_completion(pe_entry); + } + } + +out: + sock_pe_report_remote_write(rx_ctx, pe_entry); + sock_pe_report_mr_completion(rx_ctx->domain, pe_entry); + sock_pe_send_response(pe, pe_entry, 0, SOCK_OP_WRITE_COMPLETE); + return ret; +} + +#define SOCK_ATOMIC_UPDATE_INT(_cmp, _src, _dst, _tmp) do { \ + _cmp = cmp, _dst = dst, _src = src; \ + switch (op) { \ + case FI_MIN: \ + *_cmp = *_dst; \ + if (*_src < *_dst) \ + *_dst = *_src; \ + break; \ + \ + case FI_MAX: \ + *_cmp = *_dst; \ + if (*_src > *_dst) \ + *_dst = *_src; \ + 
break; \ + \ + case FI_SUM: \ + *_cmp = *_dst; \ + *_dst = *_dst + *_src; \ + break; \ + \ + case FI_PROD: \ + *_cmp = *_dst; \ + *_dst = *_dst * *_src; \ + break; \ + \ + case FI_LOR: \ + *_cmp = *_dst; \ + *_dst = *_dst || *_src; \ + break; \ + \ + case FI_LAND: \ + *_cmp = *_dst; \ + *_dst = *_dst && *_src; \ + break; \ + \ + case FI_BOR: \ + *_cmp = *_dst; \ + *_dst = *_dst | *_src; \ + break; \ + \ + case FI_BAND: \ + *_cmp = *_dst; \ + *_dst = *_dst & *_src; \ + break; \ + \ + case FI_LXOR: \ + *_cmp = *_dst; \ + \ + *_dst = ((*_dst && !*_src) || (!*_dst && *_src)); \ + break; \ + \ + case FI_BXOR: \ + *_cmp = *_dst; \ + *_dst = *_dst ^ *_src; \ + break; \ + \ + case FI_ATOMIC_READ: \ + *_cmp = *_dst; \ + break; \ + \ + case FI_ATOMIC_WRITE: \ + *_cmp = *_dst; \ + *_dst = *_src; \ + break; \ + \ + case FI_CSWAP: \ + if (*_cmp == *_dst) \ + *_dst = *_src; \ + else \ + *_cmp = *_dst; \ + break; \ + \ + case FI_CSWAP_NE: \ + if (*_cmp != *_dst) \ + *_dst = *_src; \ + else \ + *_cmp = *_dst; \ + break; \ + \ + case FI_CSWAP_LE: \ + if (*_cmp <= *_dst) \ + *_dst = *_src; \ + else \ + *_cmp = *_dst; \ + break; \ + \ + case FI_CSWAP_LT: \ + if (*_cmp < *_dst) \ + *_dst = *_src; \ + else \ + *_cmp = *_dst; \ + break; \ + \ + case FI_CSWAP_GE: \ + if (*_cmp >= *_dst) \ + *_dst = *_src; \ + else \ + *_cmp = *_dst; \ + break; \ + \ + case FI_CSWAP_GT: \ + if (*_cmp > *_dst) \ + *_dst = *_src; \ + else \ + *_cmp = *_dst; \ + break; \ + \ + case FI_MSWAP: \ + _tmp = *_dst; \ + *_dst = (*_src & *_cmp) | (*_dst & ~(*_cmp)); \ + *_cmp = _tmp; \ + break; \ + \ + default: \ + SOCK_LOG_ERROR("Atomic operation type not supported\n"); \ + break; \ + } \ + }while(0) + +#define SOCK_ATOMIC_UPDATE_FLOAT(_cmp, _src, _dst) do { \ + _cmp = cmp, _dst = dst, _src = src; \ + switch (op) { \ + case FI_MIN: \ + *_cmp = *_dst; \ + if (*_src < *_dst) \ + *_dst = *_src; \ + break; \ + \ + case FI_MAX: \ + *_cmp = *_dst; \ + if (*_src > *_dst) \ + *_dst = *_src; \ + break; \ + \ + case FI_SUM: \ + *_cmp = *_dst; \ + *_dst = *_dst + *_src; \ + break; \ + \ + case FI_PROD: \ + *_cmp = *_dst; \ + *_dst = *_dst * *_src; \ + break; \ + \ + case FI_LOR: \ + *_cmp = *_dst; \ + *_dst = *_dst || *_src; \ + break; \ + \ + case FI_LAND: \ + *_cmp = *_dst; \ + *_dst = *_dst && *_src; \ + break; \ + \ + case FI_ATOMIC_READ: \ + *_cmp = *_dst; \ + break; \ + \ + case FI_ATOMIC_WRITE: \ + *_cmp = *_dst; \ + *_dst = *_src; \ + break; \ + \ + case FI_CSWAP: \ + if (*_cmp == *_dst) \ + *_dst = *_src; \ + else \ + *_cmp = *_dst; \ + break; \ + \ + case FI_CSWAP_NE: \ + if (*_cmp != *_dst) \ + *_dst = *_src; \ + else \ + *_cmp = *_dst; \ + break; \ + \ + case FI_CSWAP_LE: \ + if (*_cmp <= *_dst) \ + *_dst = *_src; \ + else \ + *_cmp = *_dst; \ + break; \ + \ + case FI_CSWAP_LT: \ + if (*_cmp < *_dst) \ + *_dst = *_src; \ + else \ + *_cmp = *_dst; \ + break; \ + \ + case FI_CSWAP_GE: \ + if (*_cmp >= *_dst) \ + *_dst = *_src; \ + else \ + *_cmp = *_dst; \ + break; \ + \ + case FI_CSWAP_GT: \ + if (*_cmp > *_dst) \ + *_dst = *_src; \ + else \ + *_cmp = *_dst; \ + break; \ + \ + default: \ + SOCK_LOG_ERROR("Atomic operation type not supported\n"); \ + break; \ + } \ + }while(0) + + +static int sock_pe_update_atomic(void *cmp, void *dst, void *src, + enum fi_datatype datatype, enum fi_op op) +{ + + + switch (datatype) { + case FI_INT8: + { + int8_t *_cmp, *_dst, *_src, _tmp; + _cmp = cmp, _src = src, _dst = dst; + SOCK_ATOMIC_UPDATE_INT(_cmp, _src, _dst, _tmp); + break; + } + + case FI_UINT8: + { + uint8_t *_cmp, *_dst, *_src, _tmp; + _cmp = 
cmp, _src = src, _dst = dst; + SOCK_ATOMIC_UPDATE_INT(_cmp, _src, _dst, _tmp); + break; + } + + case FI_INT16: + { + int16_t *_cmp, *_dst, *_src, _tmp; + _cmp = cmp, _src = src, _dst = dst; + SOCK_ATOMIC_UPDATE_INT(_cmp, _src, _dst, _tmp); + break; + } + + case FI_UINT16: + { + uint16_t *_cmp, *_dst, *_src, _tmp; + _cmp = cmp, _src = src, _dst = dst; + SOCK_ATOMIC_UPDATE_INT(_cmp, _src, _dst, _tmp); + break; + } + + case FI_INT32: + { + int32_t *_cmp, *_dst, *_src, _tmp; + _cmp = cmp, _src = src, _dst = dst; + SOCK_ATOMIC_UPDATE_INT(_cmp, _src, _dst, _tmp); + break; + } + + case FI_UINT32: + { + uint32_t *_cmp, *_dst, *_src, _tmp; + _cmp = cmp, _src = src, _dst = dst; + SOCK_ATOMIC_UPDATE_INT(_cmp, _src, _dst, _tmp); + break; + } + + case FI_INT64: + { + int64_t *_cmp, *_dst, *_src, _tmp; + _cmp = cmp, _src = src, _dst = dst; + SOCK_ATOMIC_UPDATE_INT(_cmp, _src, _dst, _tmp); + break; + } + + case FI_UINT64: + { + uint64_t *_cmp, *_dst, *_src, _tmp; + _cmp = cmp, _src = src, _dst = dst; + SOCK_ATOMIC_UPDATE_INT(_cmp, _src, _dst, _tmp); + break; + } + + case FI_FLOAT: + { + float *_cmp, *_dst, *_src; + _cmp = cmp, _src = src, _dst = dst; + SOCK_ATOMIC_UPDATE_FLOAT(_cmp, _src, _dst); + break; + } + + case FI_DOUBLE: + { + double *_cmp, *_dst, *_src; + _cmp = cmp, _src = src, _dst = dst; + SOCK_ATOMIC_UPDATE_FLOAT(_cmp, _src, _dst); + break; + } + + default: + SOCK_LOG_ERROR("Atomic datatype not supported\n"); + break; + } + return 0; +} + + +static int sock_pe_process_rx_atomic(struct sock_pe *pe, struct sock_rx_ctx *rx_ctx, + struct sock_pe_entry *pe_entry) +{ + int i, j, ret = 0; + size_t datatype_sz; + struct sock_mr *mr; + uint64_t offset, len, entry_len, data_len; + + + if (pe->pe_atomic){ + if (pe->pe_atomic != pe_entry) + return 0; + } else { + pe->pe_atomic = pe_entry; + } + + len = sizeof(struct sock_msg_hdr); + offset = pe_entry->done_len - len; + len = sizeof(struct sock_atomic_req); + + if (pe_entry->done_len < len) { + data_len = sizeof(struct sock_atomic_req) - pe_entry->done_len; + ret = sock_comm_recv(pe_entry->conn, + (char *)&pe_entry->rx.rx_op + offset, + data_len); + if (ret <= 0) + return ret; + pe_entry->done_len += ret; + if (pe_entry->done_len != len) + return 0; + } + + if (pe_entry->msg_hdr.flags & FI_REMOTE_CQ_DATA) { + offset = pe_entry->done_len - len; + len += SOCK_CQ_DATA_SIZE; + data_len = SOCK_CQ_DATA_SIZE - offset; + if (pe_entry->done_len < len) { + ret = sock_comm_recv(pe_entry->conn, + (char*)&pe_entry->data + offset, + data_len); + if (ret <= 0) + return ret; + pe_entry->done_len += ret; + if (pe_entry->done_len != len) + return 0; + } + } + + /* dst iocs */ + entry_len = sizeof(union sock_iov) * pe_entry->rx.rx_op.dest_iov_len; + offset = pe_entry->done_len - len; + len += entry_len; + if (pe_entry->done_len < len) { + data_len = entry_len - offset; + ret = sock_comm_recv(pe_entry->conn, + (char *)&pe_entry->rx.rx_iov[0] + offset, + data_len); + if (ret <= 0) + return ret; + pe_entry->done_len += ret; + if (ret != data_len) { + SOCK_LOG_INFO("Incomplete Recv: %d\n", ret); + return 0; + } + } + + entry_len = 0; + datatype_sz = fi_datatype_size(pe_entry->rx.rx_op.atomic.datatype); + for (i = 0; i < pe_entry->rx.rx_op.dest_iov_len; i++) { + entry_len += pe_entry->rx.rx_iov[i].ioc.count; + } + entry_len *= datatype_sz; + + /* cmp data */ + if (pe_entry->rx.rx_op.atomic.cmp_iov_len) { + offset = pe_entry->done_len - len; + len += entry_len; + + if (pe_entry->done_len < len) { + data_len = entry_len - offset; + + ret = sock_comm_recv(pe_entry->conn, + 
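/*
 * Illustrative sketch (not part of the provider sources or this patch):
 * what the SOCK_ATOMIC_UPDATE_INT macro above boils down to for a single
 * element.  The old destination value is captured into the compare/result
 * slot first, then the requested operation is applied to the destination;
 * that captured value appears to be what the SOCK_OP_ATOMIC_COMPLETE path
 * sends back for fetch-style atomics.  Only a few operations are shown,
 * the enum below is a stand-in for FI_SUM / FI_ATOMIC_WRITE / FI_CSWAP.
 */
#include <stdint.h>

enum atomic_op { OP_SUM, OP_WRITE, OP_CSWAP };

static void atomic_update_u64(uint64_t *cmp, uint64_t *dst,
			      const uint64_t *src, enum atomic_op op)
{
	switch (op) {
	case OP_SUM:
		*cmp = *dst;		/* old value becomes the result */
		*dst += *src;
		break;
	case OP_WRITE:
		*cmp = *dst;
		*dst = *src;
		break;
	case OP_CSWAP:
		if (*cmp == *dst)	/* caller-supplied compare value */
			*dst = *src;
		else
			*cmp = *dst;	/* report the value actually found */
		break;
	}
}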
&pe_entry->rx.atomic_cmp[0] + offset, + data_len); + + if (ret <= 0) + return ret; + pe_entry->done_len += ret; + if (ret != data_len) { + SOCK_LOG_INFO("Incomplete Recv: %d\n", ret); + return 0; + } + + /* compare */ + offset = 0; + for (i = 0; i < pe_entry->rx.rx_op.dest_iov_len; i++) { + + mr = sock_mr_verify_key(rx_ctx->domain, + pe_entry->rx.rx_iov[i].ioc.key, + (void*)pe_entry->rx.rx_iov[i].ioc.addr, + pe_entry->rx.rx_iov[i].ioc.count * datatype_sz, + FI_REMOTE_WRITE); + if (!mr) { + SOCK_LOG_ERROR("Remote memory access error: %p, %lu, %lu\n", + (void*)pe_entry->rx.rx_iov[i].ioc.addr, + pe_entry->rx.rx_iov[i].ioc.count * datatype_sz, + pe_entry->rx.rx_iov[i].ioc.key); + sock_pe_send_response(pe, pe_entry, 0, SOCK_OP_ATOMIC_ERROR); + goto err; + } + if (mr->flags & FI_MR_OFFSET) + pe_entry->rx.rx_iov[i].ioc.addr += mr->offset; + } + } + } + + /* src data */ + offset = pe_entry->done_len - len; + len += entry_len; + if (pe_entry->done_len < len) { + data_len = entry_len - offset; + ret = sock_comm_recv(pe_entry->conn, + &pe_entry->rx.atomic_src[0] + offset, data_len); + + if (ret <= 0) + return ret; + pe_entry->done_len += ret; + if (ret != data_len) { + SOCK_LOG_INFO("Incomplete Recv: %d\n", ret); + return 0; + } + } + + offset = 0; + for (i = 0; i < pe_entry->rx.rx_op.dest_iov_len; i++) { + for (j = 0; j < pe_entry->rx.rx_iov[i].ioc.count; j++) { + sock_pe_update_atomic((char*)&pe_entry->rx.atomic_cmp[0] + offset, + (char *)pe_entry->rx.rx_iov[i].ioc.addr + j * datatype_sz, + (char*)&pe_entry->rx.atomic_src[0] + offset, + pe_entry->rx.rx_op.atomic.datatype, + pe_entry->rx.rx_op.atomic.op); + offset += datatype_sz; + } + } + + pe_entry->buf = pe_entry->rx.rx_iov[0].iov.addr; + pe_entry->data_len = offset; + + if (pe_entry->flags & FI_REMOTE_SIGNAL) { + sock_pe_report_rx_completion(pe_entry); + } + + sock_pe_report_remote_write(rx_ctx, pe_entry); + sock_pe_report_mr_completion(rx_ctx->domain, pe_entry); + sock_pe_send_response(pe, pe_entry, + pe_entry->rx.rx_op.atomic.res_iov_len ? 
+ entry_len : 0, SOCK_OP_ATOMIC_COMPLETE); + return ret; + +err: + sock_pe_report_error(pe_entry, 0); + return -FI_EINVAL; +} + + +int sock_pe_progress_buffered_rx(struct sock_rx_ctx *rx_ctx) +{ + struct dlist_entry *entry; + struct sock_pe_entry pe_entry; + struct sock_rx_entry *rx_buffered, *rx_posted; + int i, rem, offset, len, used_len, dst_offset; + + if (dlist_empty(&rx_ctx->rx_entry_list) || + dlist_empty(&rx_ctx->rx_buffered_list)) + goto out; + + for (entry = rx_ctx->rx_buffered_list.next; + entry != &rx_ctx->rx_buffered_list;) { + + rx_buffered = container_of(entry, struct sock_rx_entry, entry); + entry = entry->next; + + rx_posted = sock_rx_get_entry(rx_ctx, rx_buffered->addr, + rx_buffered->tag); + if (!rx_posted) + continue; + + rx_ctx->buffered_len -= rem; + SOCK_LOG_INFO("Consuming buffered entry: %p, ctx: %p\n", + rx_buffered, rx_ctx); + SOCK_LOG_INFO("Consuming posted entry: %p, ctx: %p\n", + rx_posted, rx_ctx); + + offset = 0; + rem = rx_buffered->iov[0].iov.len; + used_len = rx_posted->used; + for (i = 0; i < rx_posted->rx_op.dest_iov_len && rem > 0; i++) { + if (used_len >= rx_posted->rx_op.dest_iov_len) { + used_len -= rx_posted->rx_op.dest_iov_len; + continue; + } + + dst_offset = used_len; + len = MIN(rx_posted->iov[i].iov.len, rem); + pe_entry.buf = (uint64_t) + (char*)rx_posted->iov[i].iov.addr + dst_offset; + memcpy((char*)rx_posted->iov[i].iov.addr + dst_offset, + (char*)rx_buffered->iov[0].iov.addr + offset, len); + offset += len; + rem -= len; + dst_offset = used_len = 0; + rx_posted->used += len; + pe_entry.data_len = rx_buffered->used; + } + + pe_entry.done_len = offset; + pe_entry.data = rx_buffered->data; + pe_entry.tag = rx_buffered->tag; + pe_entry.context = (uint64_t)rx_posted->context; + pe_entry.rx.rx_iov[0].iov.addr = rx_posted->iov[0].iov.addr; + pe_entry.type = SOCK_PE_RX; + pe_entry.comp = rx_buffered->comp; + + if (rx_posted->flags & FI_MULTI_RECV) { + if (sock_rx_avail_len(rx_posted) < rx_ctx->min_multi_recv) { + pe_entry.flags |= FI_MULTI_RECV; + dlist_remove(&rx_posted->entry); + } + } else { + dlist_remove(&rx_posted->entry); + } + + if (rem) { + SOCK_LOG_INFO("Not enough space in posted recv buffer\n"); + sock_pe_report_error(&pe_entry, rem); + goto out; + } else { + sock_pe_report_rx_completion(&pe_entry); + } + + dlist_remove(&rx_buffered->entry); + sock_rx_release_entry(rx_buffered); + + if (pe_entry.flags & FI_MULTI_RECV) + sock_rx_release_entry(rx_posted); + } + +out: + return 0; +} + +static int sock_pe_process_rx_send(struct sock_pe *pe, struct sock_rx_ctx *rx_ctx, + struct sock_pe_entry *pe_entry) +{ + ssize_t i, ret = 0; + struct sock_rx_entry *rx_entry; + uint64_t len, rem, offset, data_len, done_data, used; + + offset = 0; + len = sizeof(struct sock_msg_hdr); + + if (pe_entry->msg_hdr.op_type == SOCK_OP_TSEND) { + offset = pe_entry->done_len - len; + len += SOCK_TAG_SIZE; + if (pe_entry->done_len < len) { + ret = sock_comm_recv(pe_entry->conn, + (char*)&pe_entry->tag + offset, + SOCK_TAG_SIZE - offset); + if (ret <= 0) + return ret; + pe_entry->done_len += ret; + if (pe_entry->done_len != len) + return 0; + } + } + + if (pe_entry->msg_hdr.flags & FI_REMOTE_CQ_DATA) { + offset = pe_entry->done_len - len; + len += SOCK_CQ_DATA_SIZE; + if (pe_entry->done_len < len) { + ret = sock_comm_recv(pe_entry->conn, + (char*)&pe_entry->data + offset, + SOCK_CQ_DATA_SIZE - offset); + if (ret <= 0) + return ret; + pe_entry->done_len += ret; + if (pe_entry->done_len != len) + return 0; + } + } + + if (pe_entry->done_len == len && 
!pe_entry->rx.rx_entry) { + + data_len = pe_entry->msg_hdr.msg_len - len; + fastlock_acquire(&rx_ctx->lock); + + /* progress buffered recvs, if any */ + sock_pe_progress_buffered_rx(rx_ctx); + + rx_entry = sock_rx_get_entry(rx_ctx, pe_entry->addr, pe_entry->tag); + SOCK_LOG_INFO("Consuming posted entry: %p\n", rx_entry); + + if (!rx_entry) { + SOCK_LOG_INFO("%p: No matching recv, buffering recv (len=%llu)\n", + pe_entry, (long long unsigned int)data_len); + + rx_entry = sock_rx_new_buffered_entry(rx_ctx, data_len); + if (!rx_entry) + return -FI_ENOMEM; + + rx_entry->addr = pe_entry->addr; + rx_entry->tag = pe_entry->tag; + rx_entry->data = pe_entry->data; + rx_entry->ignore = 0; + rx_entry->comp = pe_entry->comp; + pe_entry->context = rx_entry->context; + } + pe_entry->context = rx_entry->context; + pe_entry->rx.rx_entry = rx_entry; + rx_entry->is_busy = 1; + fastlock_release(&rx_ctx->lock); + } + + rx_entry = pe_entry->rx.rx_entry; + done_data = pe_entry->done_len - len; + pe_entry->data_len = pe_entry->msg_hdr.msg_len - len; + rem = pe_entry->msg_hdr.msg_len - (len + done_data); + used = rx_entry->used; + + for (i = 0; rem > 0 && i < rx_entry->rx_op.dest_iov_len; i++) { + + /* skip used contents in rx_entry */ + if (used >= rx_entry->iov[i].iov.len) { + used -= rx_entry->iov[i].iov.len; + continue; + } + + offset = used; + data_len = MIN(rx_entry->iov[i].iov.len - used, rem); + ret = sock_comm_recv(pe_entry->conn, + (char *)rx_entry->iov[i].iov.addr + offset, + data_len); + if (ret <= 0) + return ret; + + if (!pe_entry->buf) + pe_entry->buf = (uint64_t) + ((char *)rx_entry->iov[i].iov.addr + offset); + rem -= ret; + used = 0; + pe_entry->done_len += ret; + rx_entry->used += ret; + if (ret != data_len) + return 0; + } + + fastlock_acquire(&rx_ctx->lock); + if (rx_entry->flags & FI_MULTI_RECV) { + if (sock_rx_avail_len(rx_entry) < rx_ctx->min_multi_recv) { + pe_entry->flags |= FI_MULTI_RECV; + dlist_remove(&rx_entry->entry); + } + } else { + if (!rx_entry->is_buffered) + dlist_remove(&rx_entry->entry); + } + fastlock_release(&rx_ctx->lock); + + pe_entry->is_complete = 1; + rx_entry->is_busy = 0; + + /* report error, if any */ + if (rem) { + SOCK_LOG_ERROR("Not enough space in posted recv buffer\n"); + sock_pe_report_error(pe_entry, rem); + goto out; + } else { + if (!rx_entry->is_buffered) + sock_pe_report_rx_completion(pe_entry); + } + + if (pe_entry->msg_hdr.flags & FI_REMOTE_COMPLETE) { + sock_pe_send_response(pe, pe_entry, 0, SOCK_OP_SEND_COMPLETE); + } + +out: + if (!rx_entry->is_buffered && + (!(rx_entry->flags & FI_MULTI_RECV) || + (pe_entry->flags & FI_MULTI_RECV))) + sock_rx_release_entry(rx_entry); + return ret; +} + +static int sock_pe_process_recv(struct sock_pe *pe, struct sock_rx_ctx *rx_ctx, + struct sock_pe_entry *pe_entry) +{ + int ret; + struct sock_msg_hdr *msg_hdr; + + msg_hdr = &pe_entry->msg_hdr; + if (msg_hdr->version != SOCK_WIRE_PROTO_VERSION) { + SOCK_LOG_ERROR("Invalid wire protocol\n"); + ret = -FI_EINVAL; + goto out; + } + + /* process rx entry */ + switch (pe_entry->msg_hdr.op_type) { + + case SOCK_OP_SEND: + case SOCK_OP_TSEND: + ret = sock_pe_process_rx_send(pe, rx_ctx, pe_entry); + break; + + case SOCK_OP_WRITE: + ret = sock_pe_process_rx_write(pe, rx_ctx, pe_entry); + break; + + case SOCK_OP_READ: + ret = sock_pe_process_rx_read(pe, rx_ctx, pe_entry); + break; + + case SOCK_OP_ATOMIC_WRITE: + case SOCK_OP_ATOMIC_READ_WRITE: + case SOCK_OP_ATOMIC_COMP_WRITE: + ret = sock_pe_process_rx_atomic(pe, rx_ctx, pe_entry); + break; + + case SOCK_OP_SEND_COMPLETE: + 
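/*
 * Illustrative sketch (not part of the provider sources or this patch):
 * the two-list matching scheme sock_pe_process_rx_send() uses.  A message
 * that finds no posted receive is parked on a buffered (unexpected) list
 * via sock_rx_new_buffered_entry(); a later posted receive first tries to
 * claim a buffered entry (sock_pe_progress_buffered_rx) before waiting.
 * The list and struct names below are simplified stand-ins.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct rx_entry {
	uint64_t tag;
	struct rx_entry *next;
};

struct rx_list { struct rx_entry *head; };

static struct rx_entry *rx_match(struct rx_list *list, uint64_t tag)
{
	struct rx_entry **pp;

	for (pp = &list->head; *pp; pp = &(*pp)->next) {
		if ((*pp)->tag == tag) {
			struct rx_entry *e = *pp;
			*pp = e->next;		/* unlink the match */
			return e;
		}
	}
	return NULL;
}

/* Arrival path: consume a posted receive or park the message. */
static void on_arrival(struct rx_list *posted, struct rx_list *buffered,
		       struct rx_entry *msg)
{
	struct rx_entry *post = rx_match(posted, msg->tag);

	if (!post) {			/* unexpected: buffer it */
		msg->next = buffered->head;
		buffered->head = msg;
	}
	/* else: copy the payload into 'post' and complete it (omitted) */
}

/* Posting path: drain a matching unexpected message first, if any. */
static bool on_post(struct rx_list *buffered, struct rx_entry *post)
{
	return rx_match(buffered, post->tag) != NULL;
}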
case SOCK_OP_WRITE_COMPLETE: + case SOCK_OP_WRITE_ERROR: + case SOCK_OP_READ_ERROR: + case SOCK_OP_ATOMIC_ERROR: + ret = sock_pe_handle_ack(pe, pe_entry); + break; + + case SOCK_OP_READ_COMPLETE: + ret = sock_pe_handle_read_complete(pe, pe_entry); + break; + + case SOCK_OP_ATOMIC_COMPLETE: + ret = sock_pe_handle_atomic_complete(pe, pe_entry); + break; + + default: + ret = -FI_ENOSYS; + SOCK_LOG_ERROR("Operation not supported\n"); + break; + } + +out: + return ret; +} + +static int sock_pe_read_hdr(struct sock_pe *pe, + struct sock_pe_entry *pe_entry) +{ + int ret; + struct sock_msg_hdr *msg_hdr; + struct sock_conn *conn = pe_entry->conn; + + if (conn->rx_pe_entry != NULL && conn->rx_pe_entry != pe_entry) + return 0; + + if (conn->rx_pe_entry == NULL) { + conn->rx_pe_entry = pe_entry; + } + + msg_hdr = &pe_entry->msg_hdr; + if (pe_entry->done_len < sizeof(struct sock_msg_hdr)) { + ret = sock_comm_recv(conn, + (char*)msg_hdr + pe_entry->done_len, + sizeof(struct sock_msg_hdr) - + pe_entry->done_len); + if (ret <= 0) + return ret; + + pe_entry->done_len += ret; + if (pe_entry->done_len == sizeof(struct sock_msg_hdr)) { + + msg_hdr->msg_len = ntohll(msg_hdr->msg_len); + msg_hdr->rx_id = ntohs(msg_hdr->rx_id); + msg_hdr->flags = ntohll(msg_hdr->flags); + msg_hdr->pe_entry_id = ntohs(msg_hdr->pe_entry_id); + pe_entry->rx.header_read = 1; + + SOCK_LOG_INFO("PE RX (Hdr read): MsgLen: %lu, TX-ID: %d, Type: %d\n", + msg_hdr->msg_len, msg_hdr->rx_id, msg_hdr->op_type); + } + } + return 0; +} + +static int sock_pe_progress_tx_atomic(struct sock_pe *pe, + struct sock_pe_entry *pe_entry, + struct sock_conn *conn) +{ + int ret, datatype_sz; + union sock_iov iov[SOCK_EP_MAX_IOV_LIMIT]; + ssize_t len, i, offset, done_data, data_len, iov_len; + struct sock_atomic_req req; + + if (pe_entry->tx.send_done) + return 0; + + len = sizeof(struct sock_msg_hdr); + offset = pe_entry->done_len; + len = sizeof(struct sock_atomic_req); + if (pe_entry->done_len < len) { + req.op.src_iov_len = 0; + req.op.dest_iov_len = pe_entry->tx.tx_op.dest_iov_len; + req.op.atomic.op = pe_entry->tx.tx_op.atomic.op; + req.op.atomic.datatype = pe_entry->tx.tx_op.atomic.datatype; + req.op.atomic.res_iov_len = pe_entry->tx.tx_op.atomic.res_iov_len; + req.op.atomic.cmp_iov_len = pe_entry->tx.tx_op.atomic.cmp_iov_len; + + data_len = sizeof(req) - pe_entry->done_len; + ret = sock_comm_send(conn, (char*)&req + offset, data_len); + if (ret <= 0) + return ret; + pe_entry->done_len += ret; + if (pe_entry->done_len != len) + return 0; + } + + if (pe_entry->flags & FI_REMOTE_CQ_DATA) { + offset = pe_entry->done_len - len; + len += SOCK_CQ_DATA_SIZE; + if (pe_entry->done_len < len) { + data_len = SOCK_CQ_DATA_SIZE - offset; + ret = sock_comm_send(conn, + (char*)pe_entry->data + offset, + data_len); + if (ret <= 0) + return ret; + pe_entry->done_len += ret; + if (pe_entry->done_len != data_len) + return 0; + } + } + + /* dest iocs */ + offset = pe_entry->done_len - len; + iov_len = sizeof(union sock_iov) * pe_entry->tx.tx_op.dest_iov_len; + len += iov_len; + + if (pe_entry->done_len < len && iov_len) { + for (i=0; i < pe_entry->tx.tx_op.dest_iov_len; i++) { + iov[i].ioc.addr = pe_entry->tx.tx_iov[i].dst.ioc.addr; + iov[i].ioc.count = pe_entry->tx.tx_iov[i].dst.ioc.count; + iov[i].ioc.key = pe_entry->tx.tx_iov[i].dst.ioc.key; + } + + ret = sock_comm_send(conn, + (char*)&iov[0] + offset, iov_len - offset); + if (ret <= 0) + return ret; + pe_entry->done_len += ret; + if (pe_entry->done_len != len) + return 0; + } + + /* cmp data */ + datatype_sz = 
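/*
 * Illustrative sketch (not part of the provider sources or this patch):
 * the length-prefixed framing that sock_pe_read_hdr() above decodes.
 * Every message begins with a fixed-size header whose fields travel in
 * network byte order; msg_len covers the header plus all optional pieces
 * (tag, CQ data, iov descriptors, payload), which is how the receiver
 * knows when done_len has consumed the whole message.  This sketch uses a
 * 32-bit length with htonl/ntohl; the provider's header is wider and uses
 * its own 64-bit swap helpers (htonll/ntohll).
 */
#include <arpa/inet.h>
#include <stdint.h>

struct wire_hdr {
	uint8_t  version;
	uint8_t  op_type;
	uint16_t rx_id;
	uint32_t msg_len;	/* total bytes in this message, incl. header */
};

static void wire_hdr_pack(struct wire_hdr *h, uint8_t op,
			  uint16_t rx_id, uint32_t payload_len)
{
	h->version = 1;
	h->op_type = op;
	h->rx_id   = htons(rx_id);
	h->msg_len = htonl((uint32_t)sizeof(*h) + payload_len);
}

static uint32_t wire_hdr_unpack(struct wire_hdr *h)
{
	h->rx_id   = ntohs(h->rx_id);
	h->msg_len = ntohl(h->msg_len);
	return h->msg_len;	/* remaining bytes = msg_len - sizeof(*h) */
}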
fi_datatype_size(pe_entry->tx.tx_op.atomic.datatype); + done_data = pe_entry->done_len - len; + for (i=0; i < pe_entry->tx.tx_op.atomic.cmp_iov_len; i++) { + len += pe_entry->tx.tx_iov[i].cmp.ioc.count * datatype_sz; + } + + if (pe_entry->done_len < len) { + + for (i=0; i < pe_entry->tx.tx_op.atomic.cmp_iov_len; i++) { + if (done_data >= pe_entry->tx.tx_iov[i].cmp.ioc.count * datatype_sz) { + done_data -= pe_entry->tx.tx_iov[i].cmp.ioc.count * datatype_sz; + continue; + } + + offset = done_data; + data_len = (pe_entry->tx.tx_iov[i].cmp.ioc.count * + datatype_sz) - done_data; + + ret = sock_comm_send(conn, + (char*)pe_entry->tx.tx_iov[i].cmp.ioc.addr + + offset, data_len); + if (ret <= 0) + return ret; + + done_data = 0; + pe_entry->done_len += ret; + if ( ret != data_len) + return 0; + } + } + + /* data */ + if (pe_entry->flags & FI_INJECT) { + done_data = pe_entry->done_len - len; + len += pe_entry->tx.tx_op.src_iov_len; + pe_entry->data_len = pe_entry->tx.tx_op.src_iov_len; + + data_len = pe_entry->tx.tx_op.src_iov_len - done_data; + if (pe_entry->done_len < len) { + ret = sock_comm_send(conn, + (char*)pe_entry->tx.inject_data + done_data, + data_len); + if (ret <= 0) + return ret; + + pe_entry->done_len += ret; + if (pe_entry->done_len <= len) + return 0; + } + } else { + done_data = pe_entry->done_len - len; + for (i=0; i < pe_entry->tx.tx_op.src_iov_len; i++) { + + if (done_data >= pe_entry->tx.tx_iov[i].src.ioc.count * datatype_sz) { + done_data -= pe_entry->tx.tx_iov[i].src.ioc.count * datatype_sz; + continue; + } + + offset = done_data; + data_len = (pe_entry->tx.tx_iov[i].src.ioc.count * + datatype_sz) - done_data; + + ret = sock_comm_send(conn, + (char*)pe_entry->tx.tx_iov[i].src.ioc.addr + + offset, data_len); + if (ret <= 0) + return ret; + + done_data = 0; + pe_entry->done_len += ret; + pe_entry->data_len += ret; + if ( ret != data_len) + return 0; + } + } + + if (pe_entry->done_len == pe_entry->total_len) { + pe_entry->tx.send_done = 1; + pe_entry->conn->tx_pe_entry = NULL; + SOCK_LOG_INFO("Send complete\n"); + } + sock_comm_flush(pe_entry->conn); + return 0; +} + +static int sock_pe_progress_tx_write(struct sock_pe *pe, + struct sock_pe_entry *pe_entry, + struct sock_conn *conn) +{ + int ret; + union sock_iov dest_iov[SOCK_EP_MAX_IOV_LIMIT]; + ssize_t len, i, offset, done_data, data_len, dest_iov_len; + + if (pe_entry->tx.send_done) + return 0; + + len = sizeof(struct sock_msg_hdr); + if (pe_entry->flags & FI_REMOTE_CQ_DATA) { + + offset = pe_entry->done_len - len; + len += SOCK_CQ_DATA_SIZE; + if (pe_entry->done_len < len) { + ret = sock_comm_send(conn, + (char*)pe_entry->data + offset, + SOCK_CQ_DATA_SIZE - offset); + if (ret <= 0) + return ret; + pe_entry->done_len += ret; + if (pe_entry->done_len != len) + return 0; + } + } + + /* dest iovs */ + offset = pe_entry->done_len - len; + dest_iov_len = sizeof(union sock_iov) * pe_entry->tx.tx_op.dest_iov_len; + len += dest_iov_len; + + if (pe_entry->done_len < len) { + for (i=0; i < pe_entry->tx.tx_op.dest_iov_len; i++) { + dest_iov[i].iov.addr = pe_entry->tx.tx_iov[i].dst.iov.addr; + dest_iov[i].iov.len = pe_entry->tx.tx_iov[i].dst.iov.len; + dest_iov[i].iov.key = pe_entry->tx.tx_iov[i].dst.iov.key; + } + + ret = sock_comm_send(conn, + (char*)&dest_iov[0] + offset, + dest_iov_len - offset); + if (ret <= 0) + return ret; + pe_entry->done_len += ret; + if (pe_entry->done_len != len) + return 0; + } + + /* data */ + if (pe_entry->flags & FI_INJECT) { + offset = pe_entry->done_len - len; + len += pe_entry->tx.tx_op.src_iov_len; + 
pe_entry->data_len = pe_entry->tx.tx_op.src_iov_len; + + if (pe_entry->done_len < len) { + ret = sock_comm_send(conn, + (char*)pe_entry->tx.inject_data + offset, + pe_entry->tx.tx_op.src_iov_len - offset); + if (ret <= 0) + return ret; + + pe_entry->done_len += ret; + if (pe_entry->done_len <= len) + return 0; + } + } else { + done_data = pe_entry->done_len - len; + + for (i=0; i < pe_entry->tx.tx_op.src_iov_len; i++) { + + if (done_data >= pe_entry->tx.tx_iov[i].src.iov.len) { + done_data -= pe_entry->tx.tx_iov[i].src.iov.len; + continue; + } + + offset = done_data; + data_len = pe_entry->tx.tx_iov[i].src.iov.len - done_data; + + ret = sock_comm_send(conn, + (char*)pe_entry->tx.tx_iov[i].src.iov.addr + + offset, data_len); + if (ret <= 0) + return ret; + + done_data = 0; + pe_entry->done_len += ret; + pe_entry->data_len += ret; + if ( ret != data_len) + return 0; + } + } + + if (pe_entry->done_len == pe_entry->total_len) { + pe_entry->tx.send_done = 1; + pe_entry->conn->tx_pe_entry = NULL; + SOCK_LOG_INFO("Send complete\n"); + } + sock_comm_flush(pe_entry->conn); + return 0; +} + +static int sock_pe_progress_tx_read(struct sock_pe *pe, + struct sock_pe_entry *pe_entry, + struct sock_conn *conn) +{ + int ret; + union sock_iov src_iov[SOCK_EP_MAX_IOV_LIMIT]; + ssize_t len, i, offset, src_iov_len; + + if (pe_entry->tx.send_done) + return 0; + + len = sizeof(struct sock_msg_hdr); + offset = pe_entry->done_len - len; + src_iov_len = sizeof(union sock_iov) * pe_entry->tx.tx_op.src_iov_len; + len += src_iov_len; + + pe_entry->data_len = 0; + for (i=0; i < pe_entry->tx.tx_op.src_iov_len; i++) + pe_entry->data_len += pe_entry->tx.tx_iov[i].src.iov.len; + + /* src iovs */ + if (pe_entry->done_len < len) { + for (i=0; i < pe_entry->tx.tx_op.src_iov_len; i++) { + src_iov[i].iov.addr = pe_entry->tx.tx_iov[i].src.iov.addr; + src_iov[i].iov.len = pe_entry->tx.tx_iov[i].src.iov.len; + src_iov[i].iov.key = pe_entry->tx.tx_iov[i].src.iov.key; + } + + ret = sock_comm_send(conn, + (char*)&src_iov[0] + offset, + src_iov_len - offset); + if (ret <= 0) + return ret; + pe_entry->done_len += ret; + if (pe_entry->done_len != len) + return 0; + } + + if (pe_entry->done_len == pe_entry->total_len) { + pe_entry->tx.send_done = 1; + pe_entry->conn->tx_pe_entry = NULL; + SOCK_LOG_INFO("Send complete\n"); + } + sock_comm_flush(pe_entry->conn); + return 0; +} + + +static int sock_pe_progress_tx_send(struct sock_pe *pe, + struct sock_pe_entry *pe_entry, + struct sock_conn *conn) +{ + int ret; + ssize_t len, i, offset, done_data, data_len; + + if (pe_entry->tx.send_done) + return 0; + + len = sizeof(struct sock_msg_hdr); + if (pe_entry->tx.tx_op.op == SOCK_OP_TSEND) { + + offset = pe_entry->done_len - len; + + len += SOCK_TAG_SIZE; + if (pe_entry->done_len < len) { + ret = sock_comm_send(conn, + (char*)&pe_entry->tag + offset, + SOCK_TAG_SIZE - offset); + if (ret <= 0) + return ret; + + pe_entry->done_len += ret; + if (pe_entry->done_len != len) + return 0; + } + } + + if (pe_entry->flags & FI_REMOTE_CQ_DATA) { + + offset = pe_entry->done_len - len; + len += SOCK_CQ_DATA_SIZE; + if (pe_entry->done_len < len) { + ret = sock_comm_send(conn, + (char*)pe_entry->data + offset, + SOCK_CQ_DATA_SIZE - offset); + if (ret <= 0) + return ret; + + pe_entry->done_len += ret; + if (pe_entry->done_len != len) + return 0; + } + } + + if (pe_entry->flags & FI_INJECT) { + offset = pe_entry->done_len - len; + len += pe_entry->tx.tx_op.src_iov_len; + pe_entry->data_len = pe_entry->tx.tx_op.src_iov_len; + + if (pe_entry->done_len < len) { + 
ret = sock_comm_send(conn, + (char*)pe_entry->tx.inject_data + offset, + pe_entry->tx.tx_op.src_iov_len - offset); + if (ret <= 0) + return ret; + + pe_entry->done_len += ret; + if (pe_entry->done_len <= len) + return 0; + } + } else { + done_data = pe_entry->done_len - len; + pe_entry->data_len = pe_entry->total_len - len; + + for (i=0; i < pe_entry->tx.tx_op.src_iov_len; i++) { + + if (done_data >= pe_entry->tx.tx_iov[i].src.iov.len) { + done_data -= pe_entry->tx.tx_iov[i].src.iov.len; + continue; + } + + offset = done_data; + data_len = pe_entry->tx.tx_iov[i].src.iov.len - done_data; + + ret = sock_comm_send(conn, + (char*)pe_entry->tx.tx_iov[i].src.iov.addr + + offset, data_len); + if (ret <= 0) + return ret; + + done_data = 0; + pe_entry->done_len += ret; + if ( ret != data_len) + return 0; + } + } + + sock_comm_flush(pe_entry->conn); + if (pe_entry->done_len == pe_entry->total_len) { + pe_entry->tx.send_done = 1; + pe_entry->conn->tx_pe_entry = NULL; + SOCK_LOG_INFO("Send complete\n"); + + if (!(pe_entry->flags & FI_REMOTE_COMPLETE)) { + sock_pe_report_tx_completion(pe_entry); + pe_entry->is_complete = 1; + } + } + + return 0; +} + +static int sock_pe_progress_tx_entry(struct sock_pe *pe, + struct sock_tx_ctx *tx_ctx, + struct sock_pe_entry *pe_entry) +{ + int ret; + struct sock_conn *conn = pe_entry->conn; + + if (pe_entry->tx.send_done) + return 0; + + assert(pe_entry->conn); + if (conn->tx_pe_entry != NULL && conn->tx_pe_entry != pe_entry) { + SOCK_LOG_INFO("Cannot progress %p as conn %p is being used by %p\n", + pe_entry, conn, conn->tx_pe_entry); + return 0; + } + + if (conn->tx_pe_entry == NULL) { + SOCK_LOG_INFO("Connection %p grabbed by %p\n", conn, pe_entry); + conn->tx_pe_entry = pe_entry; + } + + if (!pe_entry->tx.header_sent) { + ret = sock_comm_send(conn, + (char*)&pe_entry->msg_hdr + pe_entry->done_len, + sizeof(struct sock_msg_hdr) - pe_entry->done_len); + if (ret <= 0) + return ret; + + pe_entry->done_len += ret; + if (pe_entry->done_len == sizeof(struct sock_msg_hdr)) { + pe_entry->tx.header_sent = 1; + SOCK_LOG_INFO("[%p] Header sent\n", pe_entry); + }else { + return 0; + } + } + + switch (pe_entry->msg_hdr.op_type) { + + case SOCK_OP_SEND: + case SOCK_OP_TSEND: + ret = sock_pe_progress_tx_send(pe, pe_entry, conn); + break; + + case SOCK_OP_WRITE: + ret = sock_pe_progress_tx_write(pe, pe_entry, conn); + break; + + case SOCK_OP_READ: + ret = sock_pe_progress_tx_read(pe, pe_entry, conn); + break; + + case SOCK_OP_ATOMIC_WRITE: + case SOCK_OP_ATOMIC_READ_WRITE: + case SOCK_OP_ATOMIC_COMP_WRITE: + ret = sock_pe_progress_tx_atomic(pe, pe_entry, conn); + break; + + default: + ret = -FI_ENOSYS; + SOCK_LOG_ERROR("Operation not supported\n"); + break; + } + + return ret; +} + +static int sock_pe_new_rx_entry(struct sock_pe *pe, struct sock_rx_ctx *rx_ctx, + struct sock_ep *ep, struct sock_conn *conn, + int key) +{ + struct sock_pe_entry *pe_entry; + pe_entry = sock_pe_acquire_entry(pe); + if (!pe_entry) { + SOCK_LOG_ERROR("Error in getting PE entry\n"); + return -FI_EINVAL; + } + + memset(&pe_entry->rx, 0, sizeof(struct sock_rx_pe_entry)); + + pe_entry->conn = conn; + pe_entry->type = SOCK_PE_RX; + pe_entry->ep = ep; + pe_entry->is_complete = 0; + pe_entry->done_len = 0; + + if (ep->ep_type == FI_EP_MSG) + pe_entry->addr = FI_ADDR_NOTAVAIL; + else + pe_entry->addr = sock_av_lookup_key(ep->av, key); + + if (ep->ep_attr.rx_ctx_cnt == FI_SHARED_CONTEXT) + pe_entry->comp = &ep->comp; + else + pe_entry->comp = &rx_ctx->comp; + + SOCK_LOG_INFO("New RX on PE entry %p (%ld)\n", + 
pe_entry, PE_INDEX(pe, pe_entry)); + + SOCK_LOG_INFO("Inserting rx_entry to PE entry %p, conn: %p\n", + pe_entry, pe_entry->conn); + + /* link to tracking list in rx_ctx */ + dlist_init(&pe_entry->ctx_entry); + dlist_insert_tail(&pe_entry->ctx_entry, &rx_ctx->pe_entry_list); + return 0; +} + +static int sock_pe_new_tx_entry(struct sock_pe *pe, struct sock_tx_ctx *tx_ctx) +{ + int i, datatype_sz; + struct sock_msg_hdr *msg_hdr; + struct sock_pe_entry *pe_entry; + struct sock_ep *ep; + uint16_t rx_id; + + pe_entry = sock_pe_acquire_entry(pe); + if (!pe_entry) { + SOCK_LOG_ERROR("Failed to get free PE entry \n"); + return -FI_EINVAL; + } + + memset(&pe_entry->tx, 0, sizeof(struct sock_tx_pe_entry)); + memset(&pe_entry->msg_hdr, 0, sizeof(struct sock_msg_hdr)); + + pe_entry->type = SOCK_PE_TX; + pe_entry->is_complete = 0; + pe_entry->done_len = 0; + pe_entry->conn = NULL; + pe_entry->ep = tx_ctx->ep; + pe_entry->tx.tx_ctx = tx_ctx; + + dlist_insert_tail(&pe_entry->ctx_entry, &tx_ctx->pe_entry_list); + + /* fill in PE tx entry */ + memset(&pe_entry->msg_hdr, 0, sizeof(struct sock_msg_hdr)); + msg_hdr = &pe_entry->msg_hdr; + msg_hdr->msg_len = sizeof(struct sock_msg_hdr); + + msg_hdr->pe_entry_id = PE_INDEX(pe, pe_entry); + SOCK_LOG_INFO("New TX on PE entry %p (%d)\n", + pe_entry, msg_hdr->pe_entry_id); + + rbfdread(&tx_ctx->rbfd, &pe_entry->tx.tx_op, sizeof(struct sock_op)); + rbfdread(&tx_ctx->rbfd, &pe_entry->flags, sizeof(uint64_t)); + rbfdread(&tx_ctx->rbfd, &pe_entry->context, sizeof(uint64_t)); + rbfdread(&tx_ctx->rbfd, &pe_entry->addr, sizeof(uint64_t)); + rbfdread(&tx_ctx->rbfd, &pe_entry->conn, sizeof(uint64_t)); + rbfdread(&tx_ctx->rbfd, &pe_entry->buf, sizeof(uint64_t)); + rbfdread(&tx_ctx->rbfd, &ep, sizeof(uint64_t)); + + if (ep && ep->ep_attr.tx_ctx_cnt == FI_SHARED_CONTEXT) + pe_entry->comp = &ep->comp; + else + pe_entry->comp = &tx_ctx->comp; + + if (pe_entry->flags & FI_REMOTE_CQ_DATA) { + rbfdread(&tx_ctx->rbfd, &pe_entry->data, SOCK_CQ_DATA_SIZE); + msg_hdr->msg_len += SOCK_CQ_DATA_SIZE; + } + + if (pe_entry->tx.tx_op.op == SOCK_OP_TSEND) { + rbfdread(&tx_ctx->rbfd, &pe_entry->tag, SOCK_TAG_SIZE); + msg_hdr->msg_len += SOCK_TAG_SIZE; + } + + msg_hdr->op_type = pe_entry->tx.tx_op.op; + switch (pe_entry->tx.tx_op.op) { + + case SOCK_OP_SEND: + case SOCK_OP_TSEND: + + if (pe_entry->flags & FI_INJECT) { + rbfdread(&tx_ctx->rbfd, &pe_entry->tx.inject_data[0], + pe_entry->tx.tx_op.src_iov_len); + msg_hdr->msg_len += pe_entry->tx.tx_op.src_iov_len; + } else { + /* read src iov(s)*/ + for (i = 0; itx.tx_op.src_iov_len; i++) { + rbfdread(&tx_ctx->rbfd, &pe_entry->tx.tx_iov[i].src, + sizeof(union sock_iov)); + msg_hdr->msg_len += pe_entry->tx.tx_iov[i].src.iov.len; + } + } + break; + + case SOCK_OP_WRITE: + + if (pe_entry->flags & FI_INJECT) { + rbfdread(&tx_ctx->rbfd, &pe_entry->tx.inject_data[0], + pe_entry->tx.tx_op.src_iov_len); + msg_hdr->msg_len += pe_entry->tx.tx_op.src_iov_len; + } else { + /* read src iov(s)*/ + for (i = 0; itx.tx_op.src_iov_len; i++) { + rbfdread(&tx_ctx->rbfd, &pe_entry->tx.tx_iov[i].src, + sizeof(union sock_iov)); + msg_hdr->msg_len += pe_entry->tx.tx_iov[i].src.iov.len; + } + } + + /* read dst iov(s)*/ + for (i = 0; itx.tx_op.dest_iov_len; i++) { + rbfdread(&tx_ctx->rbfd, &pe_entry->tx.tx_iov[i].dst, + sizeof(union sock_iov)); + } + msg_hdr->msg_len += sizeof(union sock_iov) * + pe_entry->tx.tx_op.dest_iov_len; + break; + + case SOCK_OP_READ: + + /* read src iov(s)*/ + for (i = 0; itx.tx_op.src_iov_len; i++) { + rbfdread(&tx_ctx->rbfd, 
&pe_entry->tx.tx_iov[i].src, + sizeof(union sock_iov)); + } + msg_hdr->msg_len += sizeof(union sock_iov) * + pe_entry->tx.tx_op.src_iov_len; + + /* read dst iov(s)*/ + for (i = 0; itx.tx_op.dest_iov_len; i++) { + rbfdread(&tx_ctx->rbfd, &pe_entry->tx.tx_iov[i].dst, + sizeof(union sock_iov)); + } + break; + + case SOCK_OP_ATOMIC_WRITE: + case SOCK_OP_ATOMIC_READ_WRITE: + case SOCK_OP_ATOMIC_COMP_WRITE: + + msg_hdr->msg_len += sizeof(struct sock_op); + datatype_sz = fi_datatype_size(pe_entry->tx.tx_op.atomic.datatype); + if (pe_entry->flags & FI_INJECT) { + rbfdread(&tx_ctx->rbfd, &pe_entry->tx.inject_data[0], + pe_entry->tx.tx_op.src_iov_len); + msg_hdr->msg_len += pe_entry->tx.tx_op.src_iov_len; + } else { + /* read src ioc(s)*/ + for (i = 0; itx.tx_op.src_iov_len; i++) { + rbfdread(&tx_ctx->rbfd, &pe_entry->tx.tx_iov[i].src, + sizeof(union sock_iov)); + msg_hdr->msg_len += + (pe_entry->tx.tx_iov[i].src.ioc.count * datatype_sz); + } + } + + /* read dst ioc(s)*/ + for (i = 0; itx.tx_op.dest_iov_len; i++) { + rbfdread(&tx_ctx->rbfd, &pe_entry->tx.tx_iov[i].dst, + sizeof(union sock_iov)); + } + msg_hdr->msg_len += sizeof(union sock_iov) * + pe_entry->tx.tx_op.dest_iov_len; + + /* read result ioc(s)*/ + for (i = 0; itx.tx_op.atomic.res_iov_len; i++) { + rbfdread(&tx_ctx->rbfd, &pe_entry->tx.tx_iov[i].res, + sizeof(union sock_iov)); + } + + /* read comp ioc(s)*/ + for (i = 0; itx.tx_op.atomic.cmp_iov_len; i++) { + rbfdread(&tx_ctx->rbfd, &pe_entry->tx.tx_iov[i].cmp, + sizeof(union sock_iov)); + msg_hdr->msg_len += (pe_entry->tx.tx_iov[i].cmp.ioc.count * + datatype_sz); + } + break; + + default: + SOCK_LOG_ERROR("Invalid operation type\n"); + return -FI_EINVAL; + } + + SOCK_LOG_INFO("Inserting TX-entry to PE entry %p, conn: %p\n", + pe_entry, pe_entry->conn); + + /* prepare message header */ + msg_hdr->version = SOCK_WIRE_PROTO_VERSION; + + rx_id = (uint16_t)SOCK_GET_RX_ID(pe_entry->addr, tx_ctx->av->rx_ctx_bits); + msg_hdr->rx_id = htons(rx_id); + msg_hdr->dest_iov_len = pe_entry->tx.tx_op.src_iov_len; + msg_hdr->flags = htonll(pe_entry->flags); + pe_entry->total_len = msg_hdr->msg_len; + msg_hdr->msg_len = htonll(msg_hdr->msg_len); + msg_hdr->pe_entry_id = htons(msg_hdr->pe_entry_id); + return 0; +} + +void sock_pe_add_tx_ctx(struct sock_pe *pe, struct sock_tx_ctx *ctx) +{ + fastlock_acquire(&pe->lock); + dlistfd_insert_tail(&ctx->pe_entry, &pe->tx_list); + fastlock_release(&pe->lock); + SOCK_LOG_INFO("TX ctx added to PE\n"); +} + +void sock_pe_add_rx_ctx(struct sock_pe *pe, struct sock_rx_ctx *ctx) +{ + fastlock_acquire(&pe->lock); + dlistfd_insert_tail(&ctx->pe_entry, &pe->rx_list); + fastlock_release(&pe->lock); + SOCK_LOG_INFO("RX ctx added to PE\n"); +} + +int sock_pe_progress_rx_ctx(struct sock_pe *pe, struct sock_rx_ctx *rx_ctx) +{ + int i, ret = 0, data_avail; + struct sock_ep *ep; + struct pollfd poll_fd; + struct sock_conn *conn; + struct dlist_entry *entry; + struct sock_pe_entry *pe_entry; + struct sock_conn_map *map; + + poll_fd.events = POLLIN; + fastlock_acquire(&pe->lock); + + /* progress buffered recvs */ + fastlock_acquire(&rx_ctx->lock); + sock_pe_progress_buffered_rx(rx_ctx); + fastlock_release(&rx_ctx->lock); + + /* check for incoming data */ + for (entry = rx_ctx->ep_list.next; + entry != &rx_ctx->ep_list; entry = entry->next) { + + ep = container_of(entry, struct sock_ep, rx_ctx_entry); + map = &ep->domain->r_cmap; + assert(map != NULL); + + for (i=0; iused; i++) { + conn = &map->table[i]; + + data_avail = 0; + if (rbused(&conn->inbuf) > 0) { + data_avail = 1; + } else { 
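/*
 * Illustrative sketch (not part of the provider sources or this patch):
 * how the rx_id that sock_pe_new_tx_entry() stamps into the header is
 * recovered from a destination address.  With scalable endpoints the AV
 * encodes the RX context index in the top rx_ctx_bits bits of the
 * fi_addr, which is exactly what the SOCK_GET_RX_ID() macro extracts.
 * encode_addr() below is a hypothetical inverse for demonstration only.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t encode_addr(uint64_t av_index, uint16_t rx_idx, int rx_ctx_bits)
{
	return ((uint64_t)rx_idx << (64 - rx_ctx_bits)) | av_index;
}

static uint16_t get_rx_id(uint64_t addr, int rx_ctx_bits)
{
	return (uint16_t)(addr >> (64 - rx_ctx_bits));	/* SOCK_GET_RX_ID */
}

int main(void)
{
	uint64_t addr = encode_addr(42, 3, 8);	/* AV index 42, RX ctx 3 */

	assert(get_rx_id(addr, 8) == 3);
	return 0;
}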
+ poll_fd.fd = conn->sock_fd; + ret = poll(&poll_fd, 1, 0); + if (ret < 0) { + SOCK_LOG_INFO("Error polling fd: %d\n", + conn->sock_fd); + goto out; + } + data_avail = (ret == 1); + } + + if (data_avail && conn->rx_pe_entry == NULL) { + /* new RX PE entry */ + ret = sock_pe_new_rx_entry(pe, rx_ctx, ep, conn, i); + if (ret < 0) + goto out; + } + } + } + + /* progress tx_ctx in PE table */ + for (entry = rx_ctx->pe_entry_list.next; + entry != &rx_ctx->pe_entry_list;) { + + pe_entry = container_of(entry, struct sock_pe_entry, ctx_entry); + entry = entry->next; + + if (pe_entry->rx.pending_send) { + sock_pe_progress_pending_ack(pe, pe_entry); + if (pe_entry->is_complete) { + sock_pe_release_entry(pe, pe_entry); + SOCK_LOG_INFO("[%p] RX done\n", pe_entry); + } + continue; + } + + + if (!pe_entry->rx.header_read) { + ret = sock_pe_read_hdr(pe, pe_entry); + if (ret < 0) + goto out; + } + + if (pe_entry->rx.header_read) { + ret = sock_pe_process_recv(pe, rx_ctx, pe_entry); + if (ret < 0) + goto out; + } + + if (pe_entry->is_complete) { + sock_pe_release_entry(pe, pe_entry); + SOCK_LOG_INFO("[%p] RX done\n", pe_entry); + } + } + +out: + if (ret < 0) + SOCK_LOG_ERROR("failed to progress RX ctx\n"); + fastlock_release(&pe->lock); + return ret; +} + +int sock_pe_progress_tx_ctx(struct sock_pe *pe, struct sock_tx_ctx *tx_ctx) +{ + int ret = 0; + struct dlist_entry *entry; + struct sock_pe_entry *pe_entry; + + fastlock_acquire(&pe->lock); + + /* check tx_ctx rbuf */ + fastlock_acquire(&tx_ctx->rlock); + while (!rbfdempty(&tx_ctx->rbfd) && + !dlist_empty(&pe->free_list)) { + /* new TX PE entry */ + ret = sock_pe_new_tx_entry(pe, tx_ctx); + if (ret < 0) { + fastlock_release(&tx_ctx->rlock); + goto out; + } + } + fastlock_release(&tx_ctx->rlock); + + /* progress tx_ctx in PE table */ + for (entry = tx_ctx->pe_entry_list.next; + entry != &tx_ctx->pe_entry_list;) { + + pe_entry = container_of(entry, struct sock_pe_entry, ctx_entry); + entry = entry->next; + + ret = sock_pe_progress_tx_entry(pe, tx_ctx, pe_entry); + if (ret < 0) { + SOCK_LOG_ERROR("Error in progressing %p\n", pe_entry); + goto out; + } + + if (pe_entry->is_complete) { + sock_pe_release_entry(pe, pe_entry); + SOCK_LOG_INFO("[%p] TX done\n", pe_entry); + } + } + +out: + if (ret < 0) + SOCK_LOG_ERROR("failed to progress TX ctx\n"); + fastlock_release(&pe->lock); + return ret; +} + +static void *sock_pe_progress_thread(void *data) +{ + int ret; + struct dlist_entry *entry; + struct sock_tx_ctx *tx_ctx; + struct sock_rx_ctx *rx_ctx; + struct sock_pe *pe = (struct sock_pe *)data; + + SOCK_LOG_INFO("Progress thread started\n"); + while (pe->do_progress) { + + /* progress tx */ + if (!dlistfd_empty(&pe->tx_list)) { + for (entry = pe->tx_list.list.next; + entry != &pe->tx_list.list; entry = entry->next) { + tx_ctx = container_of(entry, struct sock_tx_ctx, + pe_entry); + ret = sock_pe_progress_tx_ctx(pe, tx_ctx); + if (ret < 0) { + SOCK_LOG_ERROR( + "failed to progress TX\n"); + return NULL; + } + } + } + + /* progress rx */ + if (!dlistfd_empty(&pe->rx_list)) { + for (entry = pe->rx_list.list.next; + entry != &pe->rx_list.list; entry = entry->next) { + rx_ctx = container_of(entry, struct sock_rx_ctx, + pe_entry); + ret = sock_pe_progress_rx_ctx(pe, rx_ctx); + if (ret < 0) { + SOCK_LOG_ERROR( + "failed to progress RX\n"); + return NULL; + } + } + } + } + + SOCK_LOG_INFO("Progress thread terminated\n"); + return NULL; +} + +static void sock_pe_init_table( + struct sock_pe *pe) +{ + int i; + + memset(&pe->pe_table, 0, + sizeof(struct sock_pe_entry) * 
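/*
 * Illustrative sketch (not part of the provider sources or this patch):
 * the auto-progress lifetime pattern around sock_pe_progress_thread()
 * above — under FI_PROGRESS_AUTO a thread spins on a flag and drives the
 * progress function, and teardown clears the flag and joins the thread.
 * The engine type and poke() callback are stand-ins, not provider API.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct engine {
	atomic_bool do_progress;
	pthread_t thread;
	void (*poke)(struct engine *eng);	/* one full progress pass */
};

static void *progress_loop(void *arg)
{
	struct engine *eng = arg;

	while (eng->do_progress)
		eng->poke(eng);		/* drive all TX/RX contexts once */
	return NULL;
}

static int engine_start(struct engine *eng)
{
	eng->do_progress = true;
	return pthread_create(&eng->thread, NULL, progress_loop, eng);
}

static void engine_stop(struct engine *eng)
{
	eng->do_progress = false;	/* loop exits on its next iteration */
	pthread_join(eng->thread, NULL);
}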
SOCK_PE_MAX_ENTRIES); + + dlist_init(&pe->free_list); + dlist_init(&pe->busy_list); + + for (i=0; ipe_table[i].entry, &pe->free_list); + } + + SOCK_LOG_INFO("PE table init: OK\n"); +} + +struct sock_pe *sock_pe_init(struct sock_domain *domain) +{ + struct sock_pe *pe = calloc(1, sizeof(struct sock_pe)); + if (!pe) + return NULL; + + sock_pe_init_table(pe); + + dlistfd_head_init(&pe->tx_list); + dlistfd_head_init(&pe->rx_list); + fastlock_init(&pe->lock); + pe->domain = domain; + + if (domain->progress_mode == FI_PROGRESS_AUTO) { + pe->do_progress = 1; + if (pthread_create(&pe->progress_thread, NULL, + sock_pe_progress_thread, (void *)pe)) { + SOCK_LOG_ERROR("Couldn't create progress thread\n"); + goto err; + } + } + SOCK_LOG_INFO("PE init: OK\n"); + return pe; + +err: + dlistfd_head_free(&pe->tx_list); + dlistfd_head_free(&pe->rx_list); + + free(pe); + return NULL; +} + +void sock_pe_finalize(struct sock_pe *pe) +{ + if (pe->domain->progress_mode == FI_PROGRESS_AUTO) { + pe->do_progress = 0; + pthread_join(pe->progress_thread, NULL); + } + + fastlock_destroy(&pe->lock); + + dlistfd_head_free(&pe->tx_list); + dlistfd_head_free(&pe->rx_list); + + free(pe); + SOCK_LOG_INFO("Progress engine finalize: OK\n"); +} + diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_rdm.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_rdm.c deleted file mode 100644 index c109861edd..0000000000 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_rdm.c +++ /dev/null @@ -1,1611 +0,0 @@ -/* - * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#if HAVE_CONFIG_H -# include -#endif /* HAVE_CONFIG_H */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sock.h" -#include "sock_util.h" - - -extern const struct fi_domain_attr sock_domain_attr; -extern const struct fi_fabric_attr sock_fabric_attr; - -const struct fi_ep_attr sock_rdm_ep_attr = { - .protocol = FI_PROTO_SOCK_RDS, - .max_msg_size = SOCK_EP_MAX_MSG_SZ, - .inject_size = SOCK_EP_MAX_INJECT_SZ, - .total_buffered_recv = SOCK_EP_MAX_BUFF_RECV, - .max_order_raw_size = SOCK_EP_MAX_ORDER_RAW_SZ, - .max_order_war_size = SOCK_EP_MAX_ORDER_WAR_SZ, - .max_order_waw_size = SOCK_EP_MAX_ORDER_WAW_SZ, - .mem_tag_format = SOCK_EP_MEM_TAG_FMT, - .msg_order = SOCK_EP_MSG_ORDER, - .tx_ctx_cnt = SOCK_EP_MAX_TX_CNT, - .rx_ctx_cnt = SOCK_EP_MAX_RX_CNT, -}; - -const struct fi_tx_attr sock_rdm_tx_attr = { - .caps = SOCK_EP_RDM_CAP, - .op_flags = SOCK_OPS_CAP, - .msg_order = 0, - .inject_size = SOCK_EP_MAX_INJECT_SZ, - .size = SOCK_EP_MAX_TX_CTX_SZ, - .iov_limit = SOCK_EP_MAX_IOV_LIMIT, -}; - -const struct fi_rx_attr sock_rdm_rx_attr = { - .caps = SOCK_EP_RDM_CAP, - .op_flags = SOCK_OPS_CAP, - .msg_order = 0, - .total_buffered_recv = 0, - .size = SOCK_EP_MAX_MSG_SZ, - .iov_limit = SOCK_EP_MAX_IOV_LIMIT, -}; - -static int sock_rdm_verify_rx_attr(const struct fi_rx_attr *attr) -{ - if (!attr) - return 0; - - if ((attr->caps | sock_rdm_rx_attr.caps) != sock_rdm_rx_attr.caps) - return -FI_ENODATA; - - if ((attr->op_flags | sock_rdm_rx_attr.op_flags) != - sock_rdm_rx_attr.op_flags) - return -FI_ENODATA; - - if (attr->msg_order != sock_rdm_rx_attr.msg_order) - return -FI_ENODATA; - - if (attr->total_buffered_recv > sock_rdm_rx_attr.total_buffered_recv) - return -FI_ENODATA; - - if (attr->size > sock_rdm_rx_attr.size) - return -FI_ENODATA; - - if (attr->iov_limit > sock_rdm_rx_attr.iov_limit) - return -FI_ENODATA; - - return 0; -} - -static int sock_rdm_verify_tx_attr(const struct fi_tx_attr *attr) -{ - if (!attr) - return 0; - - if ((attr->caps | sock_rdm_tx_attr.caps) != sock_rdm_tx_attr.caps) - return -FI_ENODATA; - - if ((attr->op_flags | sock_rdm_tx_attr.op_flags) != - sock_rdm_tx_attr.op_flags) - return -FI_ENODATA; - - if (attr->msg_order != sock_rdm_tx_attr.msg_order) - return -FI_ENODATA; - - if (attr->inject_size > sock_rdm_tx_attr.inject_size) - return -FI_ENODATA; - - if (attr->size > sock_rdm_tx_attr.size) - return -FI_ENODATA; - - if (attr->iov_limit > sock_rdm_tx_attr.iov_limit) - return -FI_ENODATA; - - return 0; -} - -int sock_rdm_verify_ep_attr(struct fi_ep_attr *ep_attr, - struct fi_tx_attr *tx_attr, - struct fi_rx_attr *rx_attr) -{ - if (ep_attr) { - switch (ep_attr->protocol) { - case FI_PROTO_UNSPEC: - case FI_PROTO_SOCK_RDS: - break; - default: - return -FI_ENODATA; - } - - if (ep_attr->max_msg_size > sock_rdm_ep_attr.max_msg_size) - return -FI_ENODATA; - - if (ep_attr->inject_size > sock_rdm_ep_attr.inject_size) - return -FI_ENODATA; - - if (ep_attr->total_buffered_recv > - sock_rdm_ep_attr.total_buffered_recv) - return -FI_ENODATA; - - if (ep_attr->max_order_raw_size > - sock_rdm_ep_attr.max_order_raw_size) - return -FI_ENODATA; - - if (ep_attr->max_order_war_size > - sock_rdm_ep_attr.max_order_war_size) - return -FI_ENODATA; - - if (ep_attr->max_order_waw_size > - sock_rdm_ep_attr.max_order_waw_size) - return -FI_ENODATA; - - if (ep_attr->msg_order != - sock_rdm_ep_attr.msg_order) - return -FI_ENODATA; - - if (ep_attr->tx_ctx_cnt > 
sock_rdm_ep_attr.tx_ctx_cnt) - return -FI_ENODATA; - - if (ep_attr->rx_ctx_cnt > sock_rdm_ep_attr.rx_ctx_cnt) - return -FI_ENODATA; - } - - if (sock_rdm_verify_tx_attr(tx_attr) || sock_rdm_verify_rx_attr(rx_attr)) - return -FI_ENODATA; - - return 0; -} - - -static struct fi_info *allocate_fi_info(enum fi_ep_type ep_type, - int addr_format, struct fi_info *hints, - void *src_addr, void *dest_addr) -{ - struct fi_info *_info = fi_allocinfo_internal(); - if (!_info) - return NULL; - - _info->src_addr = calloc(1, sizeof(struct sockaddr_in)); - _info->dest_addr = calloc(1, sizeof(struct sockaddr_in)); - - _info->next = NULL; - _info->ep_type = ep_type; - _info->addr_format = addr_format; - _info->dest_addrlen =_info->src_addrlen = sizeof(struct sockaddr_in); - - if (src_addr) { - memcpy(_info->src_addr, src_addr, sizeof(struct sockaddr_in)); - } - - if (dest_addr) { - memcpy(_info->dest_addr, dest_addr, sizeof(struct sockaddr_in)); - } - - if (hints->caps) { - _info->caps = hints->caps; - } else { - _info->caps = SOCK_EP_RDM_CAP; - } - - *(_info->tx_attr) = sock_rdm_tx_attr; - *(_info->rx_attr) = sock_rdm_rx_attr; - *(_info->ep_attr) = sock_rdm_ep_attr; - - *(_info->domain_attr) = sock_domain_attr; - _info->domain_attr->name = strdup(sock_dom_name); - - *(_info->fabric_attr) = sock_fabric_attr; - _info->fabric_attr->name = strdup(sock_fab_name); - _info->fabric_attr->prov_name = strdup(sock_fab_name); - - return _info; -} - -int sock_rdm_getinfo(uint32_t version, const char *node, const char *service, - uint64_t flags, struct fi_info *hints, struct fi_info **info) -{ - int ret; - struct fi_info *_info; - void *src_addr = NULL, *dest_addr = NULL; - - if (!info) - return -FI_EBADFLAGS; - - *info = NULL; - - if (!node && !service && !hints) - return -FI_EBADFLAGS; - - if (version != FI_VERSION(SOCK_MAJOR_VERSION, - SOCK_MINOR_VERSION)) - return -FI_ENODATA; - - if (hints) { - if ((SOCK_EP_RDM_CAP | hints->caps) != SOCK_EP_RDM_CAP) { - SOCK_LOG_INFO( - "Cannot support requested options!\n"); - return -FI_ENODATA; - } - - ret = sock_rdm_verify_rx_attr(hints->rx_attr); - if (ret) - return ret; - - ret = sock_rdm_verify_tx_attr(hints->tx_attr); - if (ret) - return ret; - } - - if (node || service) { - struct addrinfo sock_hints; - struct addrinfo *result = NULL; - - src_addr = calloc(1, sizeof(struct sockaddr_in)); - dest_addr = calloc(1, sizeof(struct sockaddr_in)); - - memset(&sock_hints, 0, sizeof(struct sockaddr_in)); - sock_hints.ai_family = AF_INET; - sock_hints.ai_protocol = 0; - sock_hints.ai_canonname = NULL; - sock_hints.ai_addr = NULL; - sock_hints.ai_next = NULL; - - if (flags & FI_SOURCE) - sock_hints.ai_flags = AI_PASSIVE; - - if (flags & FI_NUMERICHOST) - sock_hints.ai_flags |= AI_NUMERICHOST; - - - ret = getaddrinfo(node, service, &sock_hints, &result); - if (ret != 0) { - ret = FI_ENODATA; - SOCK_LOG_INFO("getaddrinfo failed!\n"); - goto err; - } - memcpy(src_addr, result->ai_addr, sizeof(struct sockaddr_in)); - - if (!(FI_SOURCE & flags)) { - socklen_t len; - int udp_sock = socket(AF_INET, SOCK_DGRAM, 0); - if (0 != connect(udp_sock, result->ai_addr, - result->ai_addrlen)) { - SOCK_LOG_ERROR( - "Failed to get dest_addr\n"); - ret = FI_ENODATA; - goto err; - } - if (0!= getsockname(udp_sock, (struct sockaddr*)dest_addr, - &len)) { - SOCK_LOG_ERROR( - "Failed to get dest_addr\n"); - close(udp_sock); - ret = FI_ENODATA; - goto err; - } - close(udp_sock); - } - freeaddrinfo(result); - } - - _info = allocate_fi_info(FI_EP_RDM, FI_SOCKADDR_IN, hints, src_addr, - dest_addr); - if (!_info) { 
- ret = FI_ENOMEM; - goto err; - } - - *info = _info; - free(src_addr); - free(dest_addr); - return 0; - -err: - free(src_addr); - free(dest_addr); - SOCK_LOG_ERROR("fi_getinfo failed\n"); - return ret; -} - -ssize_t sock_rdm_ctx_recvmsg(struct fid_ep *ep, const struct fi_msg *msg, - uint64_t flags) -{ - int i; - struct sock_rx_ctx *rx_ctx; - struct sock_rx_entry *rx_entry; - - rx_ctx = container_of(ep, struct sock_rx_ctx, ctx); - assert(rx_ctx->enabled && msg->iov_count <= SOCK_EP_MAX_IOV_LIMIT); - - /* FIXME: pool of rx_entry */ - rx_entry = calloc(1, sizeof(struct sock_rx_entry)); - if (!rx_entry) - return -FI_ENOMEM; - - dlist_init(&rx_entry->entry); - - rx_entry->rx_op.op = SOCK_OP_RECV; - rx_entry->rx_op.dest_iov_len = msg->iov_count; - - rx_entry->flags = flags; - rx_entry->context = (uint64_t)msg->context; - rx_entry->addr = msg->addr; - rx_entry->data = msg->data; - - for (i=0; i< msg->iov_count; i++) { - rx_entry->iov[i].iov.addr = (uint64_t)msg->msg_iov[i].iov_base; - rx_entry->iov[i].iov.len = (uint64_t)msg->msg_iov[i].iov_len; - } - - fastlock_acquire(&rx_ctx->lock); - dlist_insert_tail(&rx_entry->entry, &rx_ctx->rx_entry_list); - fastlock_release(&rx_ctx->lock); - return 0; -} - -ssize_t sock_rdm_ctx_recv(struct fid_ep *ep, void *buf, size_t len, void *desc, - fi_addr_t src_addr, void *context) -{ - struct fi_msg msg; - struct iovec msg_iov; - - msg_iov.iov_base = buf; - msg_iov.iov_len = len; - - msg.msg_iov = &msg_iov; - msg.desc = desc; - msg.iov_count = 1; - msg.addr = src_addr; - msg.context = context; - - return sock_rdm_ctx_recvmsg(ep, &msg, 0); -} - -ssize_t sock_rdm_ctx_recvv(struct fid_ep *ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - void *context) -{ - struct fi_msg msg; - - msg.msg_iov = iov; - msg.desc = desc; - msg.iov_count = count; - msg.addr = src_addr; - msg.context = context; - return sock_rdm_ctx_recvmsg(ep, &msg, 0); -} - -static ssize_t sock_rdm_sendmsg(struct sock_tx_ctx *tx_ctx, struct sock_av *av, - const struct fi_msg *msg, uint64_t flags) -{ - int ret, i; - struct sock_op tx_op; - union sock_iov tx_iov; - struct sock_conn *conn; - uint64_t tmp=0, total_len; - - assert(tx_ctx->enabled && msg->iov_count <= SOCK_EP_MAX_IOV_LIMIT); - - if ((ret = sock_av_lookup_addr(av, msg->addr, &conn))) - return ret; - - total_len = 0; - if (flags & FI_INJECT) { - for (i=0; i< msg->iov_count; i++) { - total_len += msg->msg_iov[i].iov_len; - } - assert(total_len <= SOCK_EP_MAX_INJECT_SZ); - } else { - total_len = msg->iov_count * sizeof(union sock_iov); - } - - total_len += sizeof(struct sock_op_send); - - if (flags & FI_REMOTE_CQ_DATA) - total_len += sizeof(uint64_t); - - sock_tx_ctx_start(tx_ctx); - if (rbfdavail(&tx_ctx->rbfd) < total_len) - goto err; - - memset(&tx_op, 0, sizeof(struct sock_op)); - tx_op.op = (flags & FI_INJECT) ? SOCK_OP_SEND_INJECT : SOCK_OP_SEND; - tx_op.src_iov_len = msg->iov_count; - - sock_tx_ctx_write(tx_ctx, &tx_op, sizeof(struct sock_op)); - sock_tx_ctx_write(tx_ctx, &flags, sizeof(uint64_t)); - sock_tx_ctx_write(tx_ctx, msg->context ? 
msg->context: &tmp, - sizeof(uint64_t)); - sock_tx_ctx_write(tx_ctx, &msg->addr, sizeof(uint64_t)); - sock_tx_ctx_write(tx_ctx, &conn, sizeof(uint64_t)); - if (flags & FI_REMOTE_CQ_DATA) { - sock_tx_ctx_write(tx_ctx, &msg->data, sizeof(uint64_t)); - } - - if (flags & FI_INJECT) { - for (i=0; i< msg->iov_count; i++) { - sock_tx_ctx_write(tx_ctx, msg->msg_iov[i].iov_base, - msg->msg_iov[i].iov_len); - } - } else { - for (i=0; i< msg->iov_count; i++) { - tx_iov.iov.addr = (uint64_t)msg->msg_iov[i].iov_base; - tx_iov.iov.len = msg->msg_iov[i].iov_len; - sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(union sock_iov)); - } - } - - sock_tx_ctx_commit(tx_ctx); - return 0; - -err: - sock_tx_ctx_abort(tx_ctx); - return -FI_EAGAIN; -} - -ssize_t sock_rdm_ctx_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, - uint64_t flags) -{ - struct sock_tx_ctx *tx_ctx; - tx_ctx = container_of(ep, struct sock_tx_ctx, ctx); - return sock_rdm_sendmsg(tx_ctx, tx_ctx->ep->av, msg, flags); -} - -ssize_t sock_rdm_ctx_send(struct fid_ep *ep, const void *buf, size_t len, - void *desc, fi_addr_t dest_addr, void *context) -{ - struct fi_msg msg; - struct iovec msg_iov; - - msg_iov.iov_base = (void*)buf; - msg_iov.iov_len = len; - msg.msg_iov = &msg_iov; - msg.desc = desc; - msg.iov_count = 1; - msg.addr = dest_addr; - msg.context = context; - - return sock_rdm_ctx_sendmsg(ep, &msg, 0); -} - -ssize_t sock_rdm_ctx_sendv(struct fid_ep *ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t dest_addr, - void *context) -{ - struct fi_msg msg; - msg.msg_iov = iov; - msg.desc = desc; - msg.iov_count = count; - msg.addr = dest_addr; - msg.context = context; - return sock_rdm_ctx_sendmsg(ep, &msg, 0); -} - - -ssize_t sock_rdm_ctx_senddata(struct fid_ep *ep, const void *buf, - size_t len, void *desc, uint64_t data, - fi_addr_t dest_addr, void *context) -{ - struct fi_msg msg; - struct iovec msg_iov; - - msg_iov.iov_base = (void*)buf; - msg_iov.iov_len = len; - - msg.msg_iov = &msg_iov; - msg.desc = desc; - msg.iov_count = 1; - msg.addr = dest_addr; - msg.context = context; - msg.data = data; - - return sock_rdm_ctx_sendmsg(ep, &msg, FI_REMOTE_CQ_DATA); -} - -static ssize_t sock_rdm_inject(struct sock_tx_ctx *tx_ctx, struct sock_av *av, - const void *buf, size_t len, fi_addr_t dest_addr) -{ - struct fi_msg msg; - struct iovec msg_iov; - - msg_iov.iov_base = (void*)buf; - msg_iov.iov_len = len; - msg.msg_iov = &msg_iov; - msg.iov_count = 1; - msg.addr = dest_addr; - - return sock_rdm_sendmsg(tx_ctx, av, &msg, FI_INJECT); -} - -ssize_t sock_rdm_ctx_inject(struct fid_ep *ep, const void *buf, size_t len, - fi_addr_t dest_addr) -{ - struct sock_tx_ctx *tx_ctx; - tx_ctx = container_of(ep, struct sock_tx_ctx, ctx); - return sock_rdm_inject(tx_ctx, tx_ctx->ep->av, buf, len, dest_addr); -} - -struct fi_ops_msg sock_rdm_ctx_msg_ops = { - .size = sizeof(struct fi_ops_msg), - .recv = sock_rdm_ctx_recv, - .recvv = sock_rdm_ctx_recvv, - .recvmsg = sock_rdm_ctx_recvmsg, - .send = sock_rdm_ctx_send, - .sendv = sock_rdm_ctx_sendv, - .sendmsg = sock_rdm_ctx_sendmsg, - .inject = sock_rdm_ctx_inject, - .senddata = sock_rdm_ctx_senddata, - .injectdata = fi_no_msg_injectdata, -}; - -ssize_t sock_rdm_ctx_trecvmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, - uint64_t flags) -{ - int i; - struct sock_rx_ctx *rx_ctx; - struct sock_rx_entry *rx_entry; - - rx_ctx = container_of(ep, struct sock_rx_ctx, ctx); - assert(rx_ctx->enabled && msg->iov_count <= SOCK_EP_MAX_IOV_LIMIT); - - /* FIXME: pool of rx_entry */ - rx_entry = calloc(1, 
sizeof(struct sock_rx_entry)); - if (!rx_entry) - return -FI_ENOMEM; - - dlist_init(&rx_entry->entry); - - rx_entry->rx_op.op = SOCK_OP_TRECV; - rx_entry->rx_op.dest_iov_len = msg->iov_count; - - rx_entry->flags = flags; - rx_entry->context = (uint64_t)msg->context; - rx_entry->addr = msg->addr; - rx_entry->data = msg->data; - rx_entry->tag = msg->tag; - rx_entry->ignore = msg->ignore; - - for (i=0; i< msg->iov_count; i++) { - rx_entry->iov[i].iov.addr = (uint64_t)msg->msg_iov[i].iov_base; - rx_entry->iov[i].iov.len = (uint64_t)msg->msg_iov[i].iov_len; - } - - fastlock_acquire(&rx_ctx->lock); - dlist_insert_tail(&rx_entry->entry, &rx_ctx->rx_entry_list); - fastlock_release(&rx_ctx->lock); - return 0; -} - -ssize_t sock_rdm_ctx_trecv(struct fid_ep *ep, void *buf, size_t len, - void *desc, fi_addr_t src_addr, uint64_t tag, - uint64_t ignore, void *context) -{ - struct fi_msg_tagged msg; - struct iovec msg_iov; - - msg_iov.iov_base = buf; - msg_iov.iov_len = len; - - msg.msg_iov = &msg_iov; - msg.desc = desc; - msg.iov_count = 1; - msg.addr = src_addr; - msg.context = context; - msg.tag = tag; - msg.ignore = ignore; - - return sock_rdm_ctx_trecvmsg(ep, &msg, 0); -} - -ssize_t sock_rdm_ctx_trecvv(struct fid_ep *ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - uint64_t tag, uint64_t ignore, void *context) -{ - struct fi_msg_tagged msg; - - msg.msg_iov = iov; - msg.desc = desc; - msg.iov_count = count; - msg.addr = src_addr; - msg.context = context; - msg.tag = tag; - msg.ignore = ignore; - return sock_rdm_ctx_trecvmsg(ep, &msg, 0); -} - -static ssize_t sock_rdm_tsendmsg(struct sock_tx_ctx *tx_ctx, struct sock_av *av, - const struct fi_msg_tagged *msg, uint64_t flags) -{ - int ret, i; - struct sock_op tx_op; - union sock_iov tx_iov; - struct sock_conn *conn; - uint64_t tmp=0, total_len; - - assert(tx_ctx->enabled && msg->iov_count <= SOCK_EP_MAX_IOV_LIMIT); - - if ((ret = sock_av_lookup_addr(av, msg->addr, &conn))) - return ret; - - total_len = 0; - if (flags & FI_INJECT) { - for (i=0; i< msg->iov_count; i++) { - total_len += msg->msg_iov[i].iov_len; - } - assert(total_len <= SOCK_EP_MAX_INJECT_SZ); - } else { - total_len = msg->iov_count * sizeof(union sock_iov); - } - - total_len += sizeof(struct sock_op_tsend); - if (flags & FI_REMOTE_CQ_DATA) - total_len += sizeof(uint64_t); - - sock_tx_ctx_start(tx_ctx); - if (rbfdavail(&tx_ctx->rbfd) < total_len) - goto err; - - memset(&tx_op, 0, sizeof(struct sock_op)); - tx_op.op = (flags & FI_INJECT) ? SOCK_OP_TSEND_INJECT : SOCK_OP_TSEND; - tx_op.src_iov_len = msg->iov_count; - - sock_tx_ctx_write(tx_ctx, &tx_op, sizeof(struct sock_op)); - sock_tx_ctx_write(tx_ctx, &flags, sizeof(uint64_t)); - sock_tx_ctx_write(tx_ctx, msg->context ? 
msg->context: &tmp, - sizeof(uint64_t)); - sock_tx_ctx_write(tx_ctx, &msg->addr, sizeof(uint64_t)); - sock_tx_ctx_write(tx_ctx, &conn, sizeof(uint64_t)); - if (flags & FI_REMOTE_CQ_DATA) { - sock_tx_ctx_write(tx_ctx, &msg->data, sizeof(uint64_t)); - } - sock_tx_ctx_write(tx_ctx, &msg->tag, sizeof(uint64_t)); - - if (flags & FI_INJECT) { - for (i=0; i< msg->iov_count; i++) { - sock_tx_ctx_write(tx_ctx, msg->msg_iov[i].iov_base, - msg->msg_iov[i].iov_len); - } - } else { - for (i=0; i< msg->iov_count; i++) { - tx_iov.iov.addr = (uint64_t)msg->msg_iov[i].iov_base; - tx_iov.iov.len = msg->msg_iov[i].iov_len; - sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(union sock_iov)); - } - } - - sock_tx_ctx_commit(tx_ctx); - return 0; - -err: - sock_tx_ctx_abort(tx_ctx); - return -FI_EAGAIN; -} - -ssize_t sock_rdm_ctx_tsendmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, - uint64_t flags) -{ - struct sock_tx_ctx *tx_ctx; - tx_ctx = container_of(ep, struct sock_tx_ctx, ctx); - return sock_rdm_tsendmsg(tx_ctx, tx_ctx->ep->av, msg, flags); -} - -ssize_t sock_rdm_ctx_tsend(struct fid_ep *ep, const void *buf, size_t len, - void *desc, fi_addr_t dest_addr, uint64_t tag, - void *context) -{ - struct fi_msg_tagged msg; - struct iovec msg_iov; - - msg_iov.iov_base = (void*)buf; - msg_iov.iov_len = len; - msg.msg_iov = &msg_iov; - msg.desc = desc; - msg.iov_count = 1; - msg.addr = dest_addr; - msg.context = context; - msg.tag = tag; - - return sock_rdm_ctx_tsendmsg(ep, &msg, 0); -} - -ssize_t sock_rdm_ctx_tsendv(struct fid_ep *ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t dest_addr, - uint64_t tag, void *context) -{ - struct fi_msg_tagged msg; - msg.msg_iov = iov; - msg.desc = desc; - msg.iov_count = count; - msg.addr = dest_addr; - msg.context = context; - msg.tag = tag; - return sock_rdm_ctx_tsendmsg(ep, &msg, 0); -} - -ssize_t sock_rdm_ctx_tsenddata(struct fid_ep *ep, const void *buf, size_t len, - void *desc, uint64_t data, fi_addr_t dest_addr, - uint64_t tag, void *context) -{ - struct fi_msg_tagged msg; - struct iovec msg_iov; - - msg_iov.iov_base = (void*)buf; - msg_iov.iov_len = len; - msg.msg_iov = &msg_iov; - msg.desc = desc; - msg.iov_count = 1; - msg.addr = dest_addr; - msg.context = context; - msg.data = data; - msg.tag = tag; - - return sock_rdm_ctx_tsendmsg(ep, &msg, FI_REMOTE_CQ_DATA); -} - -static ssize_t sock_rdm_tinject(struct sock_tx_ctx *tx_ctx, struct sock_av *av, - const void *buf, size_t len, - fi_addr_t dest_addr, uint64_t tag) -{ - struct fi_msg_tagged msg; - struct iovec msg_iov; - - msg_iov.iov_base = (void*)buf; - msg_iov.iov_len = len; - msg.msg_iov = &msg_iov; - msg.iov_count = 1; - msg.addr = dest_addr; - msg.tag = tag; - return sock_rdm_tsendmsg(tx_ctx, av, &msg, FI_INJECT); -} - -ssize_t sock_rdm_ctx_tinject(struct fid_ep *ep, const void *buf, size_t len, - fi_addr_t dest_addr, uint64_t tag) -{ - struct sock_tx_ctx *tx_ctx; - tx_ctx = container_of(ep, struct sock_tx_ctx, ctx); - return sock_rdm_tinject(tx_ctx, tx_ctx->ep->av, buf, len, dest_addr, tag); -} - -ssize_t sock_rdm_ctx_tsearch(struct fid_ep *ep, uint64_t *tag, uint64_t ignore, - uint64_t flags, fi_addr_t *src_addr, size_t *len, - void *context) -{ - return -FI_ENOSYS; -} - - -struct fi_ops_tagged sock_rdm_ctx_tagged = { - .size = sizeof(struct fi_ops_tagged), - .recv = sock_rdm_ctx_trecv, - .recvv = sock_rdm_ctx_trecvv, - .recvmsg = sock_rdm_ctx_trecvmsg, - .send = sock_rdm_ctx_tsend, - .sendv = sock_rdm_ctx_tsendv, - .sendmsg = sock_rdm_ctx_tsendmsg, - .inject = sock_rdm_ctx_tinject, - .senddata 
= sock_rdm_ctx_tsenddata, - .injectdata = fi_no_tagged_injectdata, - .search = sock_rdm_ctx_tsearch, -}; - -int sock_rdm_ctx_close(struct fid *fid) -{ - struct sock_ep *ep; - struct dlist_entry *entry; - struct sock_tx_ctx *tx_ctx; - struct sock_rx_ctx *rx_ctx; - - switch (fid->fclass) { - case FI_CLASS_TX_CTX: - tx_ctx = container_of(fid, struct sock_tx_ctx, ctx); - - for (entry = tx_ctx->ep_list.next; entry != &tx_ctx->ep_list; - entry = entry->next) { - ep = container_of(entry, struct sock_ep, tx_ctx_entry); - atomic_dec(&ep->num_tx_ctx); - } - sock_tx_ctx_free(tx_ctx); - break; - - case FI_CLASS_RX_CTX: - rx_ctx = container_of(fid, struct sock_rx_ctx, ctx); - - for (entry = rx_ctx->ep_list.next; entry != &rx_ctx->ep_list; - entry = entry->next) { - ep = container_of(entry, struct sock_ep, rx_ctx_entry); - atomic_dec(&ep->num_rx_ctx); - } - sock_rx_ctx_free(rx_ctx); - break; - - default: - SOCK_LOG_ERROR("Invalid fid\n"); - return -FI_EINVAL; - } - return 0; -} - -int sock_rdm_ctx_bind_cq(struct fid *fid, struct fid *bfid, uint64_t flags) -{ - struct sock_cq *sock_cq; - struct sock_tx_ctx *tx_ctx; - struct sock_rx_ctx *rx_ctx; - - sock_cq = container_of(bfid, struct sock_cq, cq_fid.fid); - switch (fid->fclass) { - case FI_CLASS_TX_CTX: - tx_ctx = container_of(fid, struct sock_tx_ctx, ctx); - if (flags & FI_SEND) { - tx_ctx->send_cq = sock_cq; - if (flags & FI_EVENT) - tx_ctx->send_cq_event = 1; - } - - if (flags & FI_READ) { - tx_ctx->read_cq = sock_cq; - if (flags & FI_EVENT) - tx_ctx->read_cq_event = 1; - } - - if (flags & FI_WRITE) { - tx_ctx->write_cq = sock_cq; - if (flags & FI_EVENT) - tx_ctx->write_cq_event = 1; - } - - if (!tx_ctx->progress) { - tx_ctx->progress = 1; - sock_pe_add_tx_ctx(tx_ctx->domain->pe, tx_ctx); - } - break; - - case FI_CLASS_RX_CTX: - rx_ctx = container_of(fid, struct sock_rx_ctx, ctx); - if (flags & FI_RECV) { - rx_ctx->recv_cq = sock_cq; - if (flags & FI_EVENT) - rx_ctx->recv_cq_event = 1; - } - - if (flags & FI_REMOTE_READ) { - rx_ctx->rem_read_cq = sock_cq; - if (flags & FI_EVENT) - rx_ctx->rem_read_cq_event = 1; - } - - if (flags & FI_REMOTE_WRITE) { - rx_ctx->rem_write_cq = sock_cq; - if (flags & FI_EVENT) - rx_ctx->rem_write_cq_event = 1; - } - - if (!rx_ctx->progress) { - rx_ctx->progress = 1; - sock_pe_add_rx_ctx(rx_ctx->domain->pe, rx_ctx); - } - break; - - default: - SOCK_LOG_ERROR("Invalid fid\n"); - return -FI_EINVAL; - } - return 0; -} - -int sock_rdm_ctx_bind_cntr(struct fid *fid, struct fid *bfid, uint64_t flags) -{ - struct sock_cntr *cntr; - struct sock_tx_ctx *tx_ctx; - struct sock_rx_ctx *rx_ctx; - - cntr = container_of(bfid, struct sock_cntr, cntr_fid.fid); - switch (fid->fclass) { - case FI_CLASS_TX_CTX: - tx_ctx = container_of(fid, struct sock_tx_ctx, ctx); - if (flags & FI_SEND) - tx_ctx->send_cntr = cntr; - - if (flags & FI_READ) - tx_ctx->read_cntr = cntr; - - if (flags & FI_WRITE) - tx_ctx->write_cntr = cntr; - - if (!tx_ctx->progress) { - tx_ctx->progress = 1; - sock_pe_add_tx_ctx(tx_ctx->domain->pe, tx_ctx); - } - break; - - case FI_CLASS_RX_CTX: - rx_ctx = container_of(fid, struct sock_rx_ctx, ctx); - if (flags & FI_RECV) - rx_ctx->recv_cntr = cntr; - - if (flags & FI_REMOTE_READ) - rx_ctx->rem_read_cntr = cntr; - - if (flags & FI_REMOTE_WRITE) - rx_ctx->rem_write_cntr = cntr; - - if (!rx_ctx->progress) { - rx_ctx->progress = 1; - sock_pe_add_rx_ctx(rx_ctx->domain->pe, rx_ctx); - } - break; - - default: - SOCK_LOG_ERROR("Invalid fid\n"); - return -FI_EINVAL; - } - return 0; -} - -int sock_rdm_ctx_bind(struct fid *fid, struct 
fid *bfid, uint64_t flags) -{ - switch (bfid->fclass) { - case FI_CLASS_CQ: - return sock_rdm_ctx_bind_cq(fid, bfid, flags); - - case FI_CLASS_CNTR: - return sock_rdm_ctx_bind_cntr(fid, bfid, flags); - - default: - SOCK_LOG_ERROR("Invalid bind()\n"); - return -FI_EINVAL; - } - -} - -struct fi_ops sock_rdm_ctx_ops = { - .size = sizeof(struct fi_ops), - .close = sock_rdm_ctx_close, - .bind = sock_rdm_ctx_bind, - .control = fi_no_control, -}; - -int sock_rdm_ctx_enable(struct fid_ep *ep) -{ - struct sock_tx_ctx *tx_ctx; - struct sock_rx_ctx *rx_ctx; - - switch (ep->fid.fclass) { - case FI_CLASS_RX_CTX: - rx_ctx = container_of(ep, struct sock_rx_ctx, ctx); - rx_ctx->enabled = 1; - return 0; - - case FI_CLASS_TX_CTX: - tx_ctx = container_of(ep, struct sock_tx_ctx, ctx); - tx_ctx->enabled = 1; - return 0; - - default: - SOCK_LOG_ERROR("Invalid CTX\n"); - break; - } - return -FI_EINVAL; -} - -int sock_rdm_ctx_getopt(fid_t fid, int level, int optname, - void *optval, size_t *optlen) -{ - switch (level) { - case FI_OPT_ENDPOINT: - return -FI_ENOPROTOOPT; - default: - return -FI_ENOPROTOOPT; - } - return 0; -} - -int sock_rdm_ctx_setopt(fid_t fid, int level, int optname, - const void *optval, size_t optlen) -{ - switch (level) { - case FI_OPT_ENDPOINT: - return -FI_ENOPROTOOPT; - default: - return -FI_ENOPROTOOPT; - } - return 0; -} - -struct fi_ops_ep sock_rdm_ctx_ep_ops = { - .size = sizeof(struct fi_ops_ep), - .enable = sock_rdm_ctx_enable, - .cancel = fi_no_cancel, - .getopt = sock_rdm_ctx_getopt, - .setopt = sock_rdm_ctx_setopt, - .tx_ctx = fi_no_tx_ctx, - .rx_ctx = fi_no_rx_ctx, -}; - -int sock_rdm_ep_fi_close(struct fid *fid) -{ - struct sock_ep *sock_ep; - sock_ep = container_of(fid, struct sock_ep, ep.fid); - - if (atomic_get(&sock_ep->ref) || atomic_get(&sock_ep->num_rx_ctx) || - atomic_get(&sock_ep->num_tx_ctx)) - return -FI_EBUSY; - - sock_tx_ctx_free(sock_ep->tx_array[sock_ep->ep_attr.tx_ctx_cnt]); - sock_rx_ctx_free(sock_ep->rx_array[sock_ep->ep_attr.rx_ctx_cnt]); - - free(sock_ep->tx_array); - free(sock_ep->rx_array); - - if (sock_ep->src_addr) - free(sock_ep->src_addr); - if (sock_ep->dest_addr) - free(sock_ep->dest_addr); - - free(sock_ep); - return 0; -} - -int sock_rdm_ep_fi_bind(struct fid *fid, struct fid *bfid, uint64_t flags) -{ - int ret, i; - struct sock_ep *ep; - struct sock_cq *cq; - struct sock_cntr *cntr; - struct sock_rx_ctx *rx_ctx; - struct sock_tx_ctx *tx_ctx; - - ep = container_of(fid, struct sock_ep, ep.fid); - - switch (bfid->fclass) { - case FI_CLASS_EQ: - return -FI_ENOSYS; - - case FI_CLASS_CQ: - cq = container_of(bfid, struct sock_cq, cq_fid.fid); - if (ep->domain != cq->domain) - return -EINVAL; - - if (flags & FI_SEND) { - ep->send_cq = cq; - if (flags & FI_EVENT) - ep->send_cq_event = 1; - } - - if (flags & FI_READ) { - ep->read_cq = cq; - if (flags & FI_EVENT) - ep->read_cq_event = 1; - } - - if (flags & FI_WRITE) { - ep->write_cq = cq; - if (flags & FI_EVENT) - ep->write_cq_event = 1; - } - - if (flags & FI_RECV) { - ep->recv_cq = cq; - if (flags & FI_EVENT) - ep->recv_cq_event = 1; - } - - if (flags & FI_REMOTE_READ) { - ep->rem_read_cq = cq; - if (flags & FI_EVENT) - ep->rem_read_cq_event = 1; - } - - if (flags & FI_REMOTE_WRITE) { - ep->rem_write_cq = cq; - if (flags & FI_EVENT) - ep->rem_write_cq_event = 1; - } - - for (i=0; i<=ep->ep_attr.tx_ctx_cnt; i++) { - tx_ctx = ep->tx_array[i]; - - if (!tx_ctx) - continue; - - if ((ret = sock_rdm_ctx_bind_cq(&tx_ctx->ctx.fid, - bfid, flags))) - return ret; - } - - for (i=0; i<=ep->ep_attr.rx_ctx_cnt; i++) { - 
rx_ctx = ep->rx_array[i]; - - if (!rx_ctx) - continue; - - if ((ret = sock_rdm_ctx_bind_cq(&rx_ctx->ctx.fid, - bfid, flags))) - return ret; - } - break; - - case FI_CLASS_CNTR: - cntr = container_of(bfid, struct sock_cntr, cntr_fid.fid); - if (ep->domain != cntr->dom) - return -EINVAL; - - if (flags & FI_SEND) - ep->send_cntr = cntr; - - if (flags & FI_RECV) - ep->recv_cntr = cntr; - - if (flags & FI_READ) - ep->read_cntr = cntr; - - if (flags & FI_WRITE) - ep->write_cntr = cntr; - - if (flags & FI_REMOTE_READ) - ep->rem_read_cntr = cntr; - - if (flags & FI_REMOTE_WRITE) - ep->rem_write_cntr = cntr; - - for (i=0; i<=ep->ep_attr.tx_ctx_cnt; i++) { - tx_ctx = ep->tx_array[i]; - - if (!tx_ctx) - continue; - - if ((ret = sock_rdm_ctx_bind_cntr(&tx_ctx->ctx.fid, - bfid, flags))) - return ret; - } - - for (i=0; i<=ep->ep_attr.rx_ctx_cnt; i++) { - rx_ctx = ep->rx_array[i]; - - if (!rx_ctx) - continue; - - if ((ret = sock_rdm_ctx_bind_cntr(&rx_ctx->ctx.fid, - bfid, flags))) - return ret; - } - break; - - case FI_CLASS_AV: - return -FI_ENOSYS; -/* - av = container_of(bfid, - struct sock_av, av_fid.fid); - if (ep->domain != av->dom) - return -EINVAL; - ep->av = av; - av->connect_fn = sock_rdm_connect_conn_map; - av->cmap = &av->dom->r_cmap; - av->port_num = ep->port_num; - break; -*/ - - case FI_CLASS_MR: - return -FI_ENOSYS; -/* - if (!bfid->ops || !bfid->ops->bind) - return -EINVAL; - err = bfid->ops->bind(bfid, fid, flags); - if (err) - return err; - break; -*/ - - default: - return -ENOSYS; - } - - return 0; -} - -struct fi_ops sock_rdm_ep_fi_ops = { - .size = sizeof(struct fi_ops), - .close = sock_rdm_ep_fi_close, - .bind = sock_rdm_ep_fi_bind, - .control = fi_no_control, - .ops_open = fi_no_ops_open, -}; - -int sock_rdm_ep_enable(struct fid_ep *ep) -{ - struct sock_ep *sock_ep; - sock_ep = container_of(ep, struct sock_ep, ep); - sock_ep->enabled = 1; - return 0; -} - -int sock_rdm_ep_getopt(fid_t fid, int level, int optname, - void *optval, size_t *optlen) -{ - switch (level) { - case FI_OPT_ENDPOINT: - return -FI_ENOPROTOOPT; - default: - return -FI_ENOPROTOOPT; - } - return 0; -} - -int sock_rdm_ep_setopt(fid_t fid, int level, int optname, - const void *optval, size_t optlen) -{ - switch (level) { - case FI_OPT_ENDPOINT: - return -FI_ENOPROTOOPT; - default: - return -FI_ENOPROTOOPT; - } - return 0; -} - -int sock_rdm_ep_tx_ctx(struct fid_sep *sep, int index, struct fi_tx_attr *attr, - struct fid_ep **tx_ep, void *context) -{ - struct sock_ep *sock_ep; - struct sock_tx_ctx *tx_ctx; - - sock_ep = container_of(sep, struct sock_ep, ep.fid); - if (index >= sock_ep->ep_attr.tx_ctx_cnt) - return -FI_EINVAL; - - tx_ctx = sock_tx_ctx_alloc(&sock_ep->tx_attr, context); - if (!tx_ctx) - return -FI_ENOMEM; - - tx_ctx->tx_id = index; - tx_ctx->ep = sock_ep; - tx_ctx->domain = sock_ep->domain; - sock_tx_ctx_add_ep(tx_ctx, sock_ep); - - tx_ctx->ctx.ops = &sock_rdm_ctx_ep_ops; - tx_ctx->ctx.msg = &sock_rdm_ctx_msg_ops; - - /* TODO */ - tx_ctx->ctx.rma = NULL; - tx_ctx->ctx.tagged = NULL; - tx_ctx->ctx.atomic = NULL; - - *tx_ep = &tx_ctx->ctx; - sock_ep->tx_array[index] = tx_ctx; - atomic_inc(&sock_ep->num_tx_ctx); - return 0; -} - -int sock_rdm_ep_rx_ctx(struct fid_sep *sep, int index, struct fi_rx_attr *attr, - struct fid_ep **rx_ep, void *context) -{ - struct sock_ep *sock_ep; - struct sock_rx_ctx *rx_ctx; - - sock_ep = container_of(sep, struct sock_ep, ep.fid); - if (index >= sock_ep->ep_attr.rx_ctx_cnt) - return -FI_EINVAL; - - rx_ctx = sock_rx_ctx_alloc(attr, context); - if (!rx_ctx) - return 
-FI_ENOMEM; - - rx_ctx->rx_id = index; - rx_ctx->ep = sock_ep; - rx_ctx->domain = sock_ep->domain; - sock_rx_ctx_add_ep(rx_ctx, sock_ep); - - rx_ctx->ctx.ops = &sock_rdm_ctx_ep_ops; - rx_ctx->ctx.msg = &sock_rdm_ctx_msg_ops; - - /* TODO */ - rx_ctx->ctx.rma = NULL; - rx_ctx->ctx.tagged = NULL; - rx_ctx->ctx.atomic = NULL; - - *rx_ep = &rx_ctx->ctx; - sock_ep->rx_array[index] = rx_ctx; - atomic_inc(&sock_ep->num_rx_ctx); - return 0; -} - -struct fi_ops_ep sock_rdm_ep_ops ={ - .size = sizeof(struct fi_ops_ep), - .enable = sock_rdm_ep_enable, - .cancel = fi_no_cancel, - .getopt = sock_rdm_ep_getopt, - .setopt = sock_rdm_ep_setopt, - .tx_ctx = sock_rdm_ep_tx_ctx, - .rx_ctx = sock_rdm_ep_rx_ctx, -}; - -int sock_rdm_ep_cm_getname(fid_t fid, void *addr, size_t *addrlen) -{ - struct sock_ep *sock_ep; - if (*addrlen == 0) { - *addrlen = sizeof(struct sockaddr_in); - return -FI_ETOOSMALL; - } - - sock_ep = container_of(fid, struct sock_ep, ep.fid); - *addrlen = MIN(*addrlen, sizeof(struct sockaddr_in)); - memcpy(addr, sock_ep->src_addr, *addrlen); - return 0; -} - -struct fi_ops_cm sock_rdm_ep_cm_ops = { - .size = sizeof(struct fi_ops_cm), - .getname = sock_rdm_ep_cm_getname, - .getpeer = fi_no_getpeer, - .connect = fi_no_connect, - .listen = fi_no_listen, - .accept = fi_no_accept, - .reject = fi_no_reject, - .shutdown = fi_no_shutdown, - .join = fi_no_join, - .leave = fi_no_leave, -}; - -ssize_t sock_rdm_ep_msg_recvmsg(struct fid_ep *ep, const struct fi_msg *msg, - uint64_t flags) -{ - struct sock_ep *sock_ep; - sock_ep = container_of(ep, struct sock_ep, ep); - return sock_rdm_ctx_recvmsg(&sock_ep->rx_ctx->ctx,msg, flags); -} - -ssize_t sock_rdm_ep_msg_recv(struct fid_ep *ep, void *buf, size_t len, - void *desc, fi_addr_t src_addr, void *context) -{ - struct sock_ep *sock_ep; - sock_ep = container_of(ep, struct sock_ep, ep); - return sock_rdm_ctx_recv(&sock_ep->rx_ctx->ctx, buf, len, desc, - src_addr, context); -} - -ssize_t sock_rdm_ep_msg_recvv(struct fid_ep *ep, const struct iovec *iov, void **desc, - size_t count, fi_addr_t src_addr, void *context) -{ - struct sock_ep *sock_ep; - sock_ep = container_of(ep, struct sock_ep, ep); - return sock_rdm_ctx_recvv(&sock_ep->rx_ctx->ctx, iov, desc, - count, src_addr, context); -} - -ssize_t sock_rdm_ep_msg_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, - uint64_t flags) -{ - struct sock_ep *sock_ep; - sock_ep = container_of(ep, struct sock_ep, ep); - return sock_rdm_ctx_sendmsg(&sock_ep->tx_ctx->ctx, msg, flags); -} - -ssize_t sock_rdm_ep_msg_send(struct fid_ep *ep, const void *buf, size_t len, - void *desc, fi_addr_t dest_addr, void *context) -{ - struct sock_ep *sock_ep; - sock_ep = container_of(ep, struct sock_ep, ep); - return sock_rdm_ctx_send(&sock_ep->tx_ctx->ctx, buf, len, desc, - dest_addr, context); -} - -ssize_t sock_rdm_ep_msg_sendv(struct fid_ep *ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t dest_addr, - void *context) -{ - struct sock_ep *sock_ep; - sock_ep = container_of(ep, struct sock_ep, ep); - return sock_rdm_ctx_sendv(&sock_ep->tx_ctx->ctx, iov, desc, - count, dest_addr, context); -} - - -ssize_t sock_rdm_ep_msg_inject(struct fid_ep *ep, const void *buf, size_t len, - fi_addr_t dest_addr) -{ - struct sock_ep *sock_ep; - sock_ep = container_of(ep, struct sock_ep, ep); - return sock_rdm_ctx_inject(&sock_ep->tx_ctx->ctx, buf, len, dest_addr); -} - -ssize_t sock_rdm_ep_msg_senddata(struct fid_ep *ep, const void *buf, size_t len, - void *desc, uint64_t data, fi_addr_t dest_addr, - void *context) -{ - struct 
sock_ep *sock_ep; - sock_ep = container_of(ep, struct sock_ep, ep); - return sock_rdm_ctx_senddata(&sock_ep->tx_ctx->ctx, buf, len, - desc, data, dest_addr, context); -} - -struct fi_ops_msg sock_rdm_ep_msg_ops = { - .size = sizeof(struct fi_ops_msg), - .recv = sock_rdm_ep_msg_recv, - .recvv = sock_rdm_ep_msg_recvv, - .recvmsg = sock_rdm_ep_msg_recvmsg, - .send = sock_rdm_ep_msg_send, - .sendv = sock_rdm_ep_msg_sendv, - .sendmsg = sock_rdm_ep_msg_sendmsg, - .inject = sock_rdm_ep_msg_inject, - .injectdata = fi_no_msg_injectdata, - .senddata = sock_rdm_ep_msg_senddata, -}; - -int sock_rdm_ep(struct fid_domain *domain, struct fi_info *info, - struct fid_ep **ep, void *context) -{ - int ret; - struct sock_ep *sock_ep; - struct sock_tx_ctx *tx_ctx; - struct sock_rx_ctx *rx_ctx; - struct sock_domain *sock_dom; - - if (info) { - ret = sock_verify_info(info); - if (ret) { - SOCK_LOG_INFO( - "Cannot support requested options!\n"); - return -FI_EINVAL; - } - } - - sock_dom = container_of(domain, struct sock_domain, dom_fid); - if (!sock_dom) - return -FI_EINVAL; - - sock_ep = (struct sock_ep*)calloc(1, sizeof(*sock_ep)); - if (!sock_ep) - return -FI_ENOMEM; - - atomic_init(&sock_ep->ref, 0); - sock_ep->ep.fid.fclass = FI_CLASS_EP; - sock_ep->ep.fid.context = context; - sock_ep->ep.fid.ops = &sock_rdm_ep_fi_ops; - - sock_ep->ep.ops = &sock_rdm_ep_ops; - sock_ep->ep.cm = &sock_rdm_ep_cm_ops; - sock_ep->ep.msg = &sock_rdm_ep_msg_ops; - - /* TODO */ - sock_ep->ep.rma = NULL; - sock_ep->ep.tagged = NULL; - sock_ep->ep.atomic = NULL; - - sock_ep->sock_fd = socket(AF_INET, SOCK_STREAM, 0); - if (sock_ep->sock_fd <0) { - goto err; - } - - *ep = &sock_ep->ep; - if (info) { - sock_ep->info.caps = info->caps; - sock_ep->info.addr_format = FI_SOCKADDR_IN; - - if (info->src_addr) { - sock_ep->src_addr = calloc(1, sizeof(struct sockaddr_in)); - memcpy(sock_ep->src_addr, info->src_addr, - sizeof(struct sockaddr_in)); - } - - if (info->dest_addr) { - sock_ep->dest_addr = calloc(1, sizeof(struct sockaddr_in)); - memcpy(sock_ep->dest_addr, info->dest_addr, - sizeof(struct sockaddr_in)); - } - - if (info->ep_attr) { - ret = sock_rdm_verify_ep_attr(info->ep_attr, - info->tx_attr, - info->rx_attr); - if (ret) - goto err; - sock_ep->ep_attr = *info->ep_attr; - } - - if (info->tx_attr) - sock_ep->tx_attr = *info->tx_attr; - else - sock_ep->tx_attr = sock_rdm_tx_attr; - - if (info->rx_attr) - sock_ep->rx_attr = *info->rx_attr; - else - sock_ep->rx_attr = sock_rdm_rx_attr; - } else { - sock_ep->ep_attr = sock_rdm_ep_attr; - sock_ep->tx_attr = sock_rdm_tx_attr; - sock_ep->rx_attr = sock_rdm_rx_attr; - } - - atomic_init(&sock_ep->ref, 0); - atomic_init(&sock_ep->num_tx_ctx, 0); - atomic_init(&sock_ep->num_rx_ctx, 0); - - sock_ep->tx_array = calloc(sock_ep->ep_attr.tx_ctx_cnt + 1, - sizeof(struct sock_tx_ctx *)); - sock_ep->rx_array = calloc(sock_ep->ep_attr.rx_ctx_cnt + 1, - sizeof(struct sock_rx_ctx *)); - - /* default tx ctx */ - tx_ctx = sock_tx_ctx_alloc(&sock_ep->tx_attr, context); - tx_ctx->ep = sock_ep; - tx_ctx->domain = sock_dom; - tx_ctx->tx_id = sock_ep->ep_attr.tx_ctx_cnt; - sock_tx_ctx_add_ep(tx_ctx, sock_ep); - sock_ep->tx_array[sock_ep->ep_attr.tx_ctx_cnt] = tx_ctx; - sock_ep->tx_ctx = tx_ctx; - - /* default rx_ctx */ - rx_ctx = sock_rx_ctx_alloc(&sock_ep->rx_attr, context); - rx_ctx->ep = sock_ep; - rx_ctx->domain = sock_dom; - rx_ctx->rx_id = sock_ep->ep_attr.rx_ctx_cnt; - sock_rx_ctx_add_ep(rx_ctx, sock_ep); - sock_ep->rx_array[sock_ep->ep_attr.rx_ctx_cnt] = rx_ctx; - sock_ep->rx_ctx = rx_ctx; - - 
sock_ep->domain = sock_dom; - atomic_inc(&sock_dom->ref); - return 0; - -err: - free(sock_ep); - return -FI_EAVAIL; -} - -int sock_rdm_pep(struct fid_fabric *fabric, struct fi_info *info, - struct fid_pep **pep, void *context) -{ - return -FI_EINVAL; -} - -/* place holder */ -int sock_pe_add_tx_ctx(struct sock_pe *pe, struct sock_tx_ctx *ctx) -{ - return -FI_ENOSYS; -} - -int sock_pe_add_rx_ctx(struct sock_pe *pe, struct sock_rx_ctx *ctx) -{ - return -FI_ENOSYS; -} - diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_rma.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_rma.c new file mode 100644 index 0000000000..86bbd2d7b2 --- /dev/null +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_rma.c @@ -0,0 +1,456 @@ +/* + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "sock_util.h" + +static ssize_t sock_ep_rma_readmsg(struct fid_ep *ep, + const struct fi_msg_rma *msg, + uint64_t flags) +{ + int ret, i; + struct sock_op tx_op; + union sock_iov tx_iov; + struct sock_conn *conn; + struct sock_tx_ctx *tx_ctx; + uint64_t total_len, src_len, dst_len; + struct sock_ep *sock_ep; + + switch (ep->fid.fclass) { + case FI_CLASS_EP: + sock_ep = container_of(ep, struct sock_ep, ep); + tx_ctx = sock_ep->tx_ctx; + break; + + case FI_CLASS_TX_CTX: + tx_ctx = container_of(ep, struct sock_tx_ctx, ctx); + sock_ep = tx_ctx->ep; + break; + + default: + SOCK_LOG_ERROR("Invalid EP type\n"); + return -FI_EINVAL; + } + + assert(tx_ctx->enabled && + msg->iov_count <= SOCK_EP_MAX_IOV_LIMIT && + msg->rma_iov_count <= SOCK_EP_MAX_IOV_LIMIT); + + conn = sock_av_lookup_addr(tx_ctx->av, msg->addr); + assert(conn); + + total_len = sizeof(struct sock_op_send) + + (msg->iov_count * sizeof(union sock_iov)) + + (msg->rma_iov_count * sizeof(union sock_iov)); + + sock_tx_ctx_start(tx_ctx); + if (rbfdavail(&tx_ctx->rbfd) < total_len) { + ret = -FI_EAGAIN; + goto err; + } + + flags |= tx_ctx->attr.op_flags; + memset(&tx_op, 0, sizeof(struct sock_op)); + tx_op.op = SOCK_OP_READ; + tx_op.src_iov_len = msg->rma_iov_count; + tx_op.dest_iov_len = msg->iov_count; + + sock_tx_ctx_write(tx_ctx, &tx_op, sizeof(struct sock_op)); + sock_tx_ctx_write(tx_ctx, &flags, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &msg->context, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &msg->addr, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &conn, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &msg->msg_iov[0].iov_base, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &sock_ep, sizeof(uint64_t)); + + if (flags & FI_REMOTE_CQ_DATA) { + sock_tx_ctx_write(tx_ctx, &msg->data, sizeof(uint64_t)); + } + + src_len = 0; + for (i = 0; i< msg->rma_iov_count; i++) { + tx_iov.iov.addr = msg->rma_iov[i].addr; + tx_iov.iov.key = msg->rma_iov[i].key; + tx_iov.iov.len = msg->rma_iov[i].len; + sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(union sock_iov)); + src_len += tx_iov.iov.len; + } + + dst_len = 0; + for (i = 0; i< msg->iov_count; i++) { + tx_iov.iov.addr = (uint64_t)msg->msg_iov[i].iov_base; + tx_iov.iov.len = msg->msg_iov[i].iov_len; + tx_iov.iov.key = (uint64_t)msg->desc[i]; + sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(union sock_iov)); + dst_len += tx_iov.iov.len; + } + + if (dst_len != src_len) { + SOCK_LOG_ERROR("Buffer length mismatch\n"); + ret = -FI_EINVAL; + goto err; + } + + sock_tx_ctx_commit(tx_ctx); + return 0; + +err: + sock_tx_ctx_abort(tx_ctx); + return ret; +} + +static ssize_t sock_ep_rma_read(struct fid_ep *ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, uint64_t addr, + uint64_t key, void *context) +{ + struct fi_msg_rma msg; + struct iovec msg_iov; + struct fi_rma_iov rma_iov; + + msg_iov.iov_base = (void*)buf; + msg_iov.iov_len = len; + msg.msg_iov = &msg_iov; + msg.desc = &desc; + msg.iov_count = 1; + + rma_iov.addr = addr; + rma_iov.key = key; + rma_iov.len = len; + msg.rma_iov_count = 1; + msg.rma_iov = &rma_iov; + + msg.addr = src_addr; + msg.context = context; + + return sock_ep_rma_readmsg(ep, &msg, 0); +} + +static ssize_t sock_ep_rma_readv(struct fid_ep *ep, const struct iovec *iov, + void **desc, size_t count, 
fi_addr_t src_addr, + uint64_t addr, uint64_t key, void *context) +{ + struct fi_msg_rma msg; + struct fi_rma_iov rma_iov; + + msg.msg_iov = iov; + msg.desc = desc; + msg.iov_count = count; + + rma_iov.addr = addr; + rma_iov.key = key; + rma_iov.len = 1; + + msg.rma_iov = &rma_iov; + msg.addr = src_addr; + msg.context = context; + + return sock_ep_rma_readmsg(ep, &msg, 0); +} + +static ssize_t sock_ep_rma_writemsg(struct fid_ep *ep, + const struct fi_msg_rma *msg, + uint64_t flags) +{ + int ret, i; + struct sock_op tx_op; + union sock_iov tx_iov; + struct sock_conn *conn; + struct sock_tx_ctx *tx_ctx; + uint64_t total_len, src_len, dst_len; + struct sock_ep *sock_ep; + + switch (ep->fid.fclass) { + case FI_CLASS_EP: + sock_ep = container_of(ep, struct sock_ep, ep); + tx_ctx = sock_ep->tx_ctx; + break; + + case FI_CLASS_TX_CTX: + tx_ctx = container_of(ep, struct sock_tx_ctx, ctx); + sock_ep = tx_ctx->ep; + break; + + default: + SOCK_LOG_ERROR("Invalid EP type\n"); + return -FI_EINVAL; + } + + assert(tx_ctx->enabled && + msg->iov_count <= SOCK_EP_MAX_IOV_LIMIT && + msg->rma_iov_count <= SOCK_EP_MAX_IOV_LIMIT); + + conn = sock_av_lookup_addr(tx_ctx->av, msg->addr); + assert(conn); + + flags |= tx_ctx->attr.op_flags; + memset(&tx_op, 0, sizeof(struct sock_op)); + tx_op.op = SOCK_OP_WRITE; + tx_op.dest_iov_len = msg->rma_iov_count; + + total_len = 0; + if (flags & FI_INJECT) { + for (i=0; i< msg->iov_count; i++) { + total_len += msg->msg_iov[i].iov_len; + } + assert(total_len <= SOCK_EP_MAX_INJECT_SZ); + tx_op.src_iov_len = total_len; + } else { + total_len += msg->iov_count * sizeof(union sock_iov); + tx_op.src_iov_len = msg->iov_count; + } + + total_len += (sizeof(struct sock_op_send) + + (msg->rma_iov_count * sizeof(union sock_iov))); + + sock_tx_ctx_start(tx_ctx); + if (rbfdavail(&tx_ctx->rbfd) < total_len) { + ret = -FI_EAGAIN; + goto err; + } + + sock_tx_ctx_write(tx_ctx, &tx_op, sizeof(struct sock_op)); + sock_tx_ctx_write(tx_ctx, &flags, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &msg->context, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &msg->addr, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &conn, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &msg->msg_iov[0].iov_base, sizeof(uint64_t)); + sock_tx_ctx_write(tx_ctx, &sock_ep, sizeof(uint64_t)); + + if (flags & FI_REMOTE_CQ_DATA) { + sock_tx_ctx_write(tx_ctx, &msg->data, sizeof(uint64_t)); + } + + src_len = 0; + if (flags & FI_INJECT) { + for (i=0; i< msg->iov_count; i++) { + sock_tx_ctx_write(tx_ctx, msg->msg_iov[i].iov_base, + msg->msg_iov[i].iov_len); + src_len += tx_iov.iov.len; + } + } else { + for (i = 0; i< msg->iov_count; i++) { + tx_iov.iov.addr = (uint64_t)msg->msg_iov[i].iov_base; + tx_iov.iov.len = msg->msg_iov[i].iov_len; + tx_iov.iov.key = (uint64_t)msg->desc[i]; + sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(union sock_iov)); + src_len += tx_iov.iov.len; + } + } + + dst_len = 0; + for (i = 0; i< msg->rma_iov_count; i++) { + tx_iov.iov.addr = msg->rma_iov[i].addr; + tx_iov.iov.key = msg->rma_iov[i].key; + tx_iov.iov.len = msg->rma_iov[i].len; + sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(union sock_iov)); + dst_len += tx_iov.iov.len; + } + + if (dst_len != src_len) { + SOCK_LOG_ERROR("Buffer length mismatch\n"); + ret = -FI_EINVAL; + goto err; + } + + sock_tx_ctx_commit(tx_ctx); + return 0; + +err: + sock_tx_ctx_abort(tx_ctx); + return ret; +} + +static ssize_t sock_ep_rma_write(struct fid_ep *ep, const void *buf, + size_t len, void *desc, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, void *context) +{ + struct 
fi_msg_rma msg; + struct iovec msg_iov; + struct fi_rma_iov rma_iov; + + msg_iov.iov_base = (void*)buf; + msg_iov.iov_len = len; + + msg.msg_iov = &msg_iov; + msg.desc = &desc; + msg.iov_count = 1; + + rma_iov.addr = addr; + rma_iov.key = key; + rma_iov.len = len; + + msg.rma_iov_count = 1; + msg.rma_iov = &rma_iov; + + msg.addr = dest_addr; + msg.context = context; + + return sock_ep_rma_writemsg(ep, &msg, 0); +} + +static ssize_t sock_ep_rma_writev(struct fid_ep *ep, + const struct iovec *iov, void **desc, + size_t count, fi_addr_t dest_addr, uint64_t addr, + uint64_t key, void *context) +{ + struct fi_msg_rma msg; + struct fi_rma_iov rma_iov; + + msg.msg_iov = iov; + msg.desc = desc; + msg.iov_count = count; + + rma_iov.addr = addr; + rma_iov.key = key; + rma_iov.len = 1; + + msg.rma_iov = &rma_iov; + msg.context = context; + msg.addr = dest_addr; + + return sock_ep_rma_writemsg(ep, &msg, 0); +} + +static ssize_t sock_ep_rma_writedata(struct fid_ep *ep, const void *buf, + size_t len, void *desc, uint64_t data, + fi_addr_t dest_addr, uint64_t addr, + uint64_t key, void *context) +{ + struct fi_msg_rma msg; + struct iovec msg_iov; + struct fi_rma_iov rma_iov; + + msg_iov.iov_base = (void*)buf; + msg_iov.iov_len = len; + msg.desc = &desc; + msg.iov_count = 1; + + rma_iov.addr = addr; + rma_iov.key = key; + rma_iov.len = 1; + + msg.rma_iov = &rma_iov; + msg.msg_iov = &msg_iov; + + msg.addr = dest_addr; + msg.context = context; + msg.data = data; + + return sock_ep_rma_writemsg(ep, &msg, FI_REMOTE_CQ_DATA); +} + +static ssize_t sock_ep_rma_inject(struct fid_ep *ep, const void *buf, + size_t len, fi_addr_t dest_addr, uint64_t addr, + uint64_t key) +{ + struct fi_msg_rma msg; + struct iovec msg_iov; + struct fi_rma_iov rma_iov; + + msg_iov.iov_base = (void*)buf; + msg_iov.iov_len = len; + msg.msg_iov = &msg_iov; + msg.iov_count = 1; + + rma_iov.addr = addr; + rma_iov.key = key; + rma_iov.len = 1; + + msg.rma_iov = &rma_iov; + msg.msg_iov = &msg_iov; + msg.addr = dest_addr; + + return sock_ep_rma_writemsg(ep, &msg, FI_INJECT); +} + +static ssize_t sock_ep_rma_injectdata(struct fid_ep *ep, const void *buf, + size_t len, uint64_t data, fi_addr_t dest_addr, + uint64_t addr, uint64_t key) +{ + struct fi_msg_rma msg; + struct iovec msg_iov; + struct fi_rma_iov rma_iov; + + msg_iov.iov_base = (void*)buf; + msg_iov.iov_len = len; + msg.msg_iov = &msg_iov; + msg.iov_count = 1; + + rma_iov.addr = addr; + rma_iov.key = key; + rma_iov.len = 1; + + msg.rma_iov = &rma_iov; + msg.msg_iov = &msg_iov; + msg.addr = dest_addr; + msg.data = data; + return sock_ep_rma_writemsg(ep, &msg, FI_INJECT|FI_REMOTE_CQ_DATA); +} + + +struct fi_ops_rma sock_ep_rma = { + .size = sizeof(struct fi_ops_rma), + .read = sock_ep_rma_read, + .readv = sock_ep_rma_readv, + .readmsg = sock_ep_rma_readmsg, + .write = sock_ep_rma_write, + .writev = sock_ep_rma_writev, + .writemsg = sock_ep_rma_writemsg, + .inject = sock_ep_rma_inject, + .injectdata = sock_ep_rma_injectdata, + .writedata = sock_ep_rma_writedata, +}; + diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_rx_entry.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_rx_entry.c new file mode 100644 index 0000000000..17730b5d83 --- /dev/null +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_rx_entry.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sock.h" +#include "sock_util.h" + + +struct sock_rx_entry *sock_rx_new_entry(struct sock_rx_ctx *rx_ctx) +{ + /* FIXME: pool of rx_entry */ + struct sock_rx_entry *rx_entry; + rx_entry = calloc(1, sizeof(struct sock_rx_entry)); + if (!rx_entry) + return NULL; + + SOCK_LOG_INFO("New rx_entry: %p, ctx: %p\n", rx_entry, rx_ctx); + dlist_init(&rx_entry->entry); + return rx_entry; +} + +void sock_rx_release_entry(struct sock_rx_entry *rx_entry) +{ + SOCK_LOG_INFO("Releasing rx_entry: %p\n", rx_entry); + free(rx_entry); +} + + +struct sock_rx_entry *sock_rx_new_buffered_entry(struct sock_rx_ctx *rx_ctx, + size_t len) +{ + struct sock_rx_entry *rx_entry; + + if (rx_ctx->buffered_len + len >= rx_ctx->attr.total_buffered_recv) { + SOCK_LOG_ERROR("Reached max buffered recv limit\n"); + return NULL; + } + + rx_entry = calloc(1, sizeof(struct sock_rx_entry) + len); + if (!rx_entry) + return NULL; + + SOCK_LOG_INFO("New buffered entry:%p len: %lu, ctx: %p\n", + rx_entry, len, rx_ctx); + + rx_entry->is_buffered = 1; + rx_entry->rx_op.dest_iov_len = 1; + rx_entry->iov[0].iov.len = len; + rx_entry->iov[0].iov.addr = (uint64_t)((char*)rx_entry + + sizeof(struct sock_rx_entry)); + rx_entry->total_len = len; + + rx_ctx->buffered_len += len; + dlist_insert_tail(&rx_entry->entry, &rx_ctx->rx_buffered_list); + return rx_entry; +} + +inline size_t sock_rx_avail_len(struct sock_rx_entry *rx_entry) +{ + return rx_entry->total_len - rx_entry->used; +} + +struct sock_rx_entry *sock_rx_get_entry(struct sock_rx_ctx *rx_ctx, + uint64_t addr, uint64_t tag) +{ + struct dlist_entry *entry; + struct sock_rx_entry *rx_entry; + + for (entry = rx_ctx->rx_entry_list.next; + entry != &rx_ctx->rx_entry_list; entry = entry->next) { + + rx_entry = container_of(entry, struct sock_rx_entry, entry); + if (rx_entry->is_busy) + continue; + + if (((rx_entry->tag & ~rx_entry->ignore) == + (tag & ~rx_entry->ignore)) && + (rx_entry->addr == FI_ADDR_UNSPEC || + addr == FI_ADDR_UNSPEC || rx_entry->addr == addr)) { + break; + } + } + + if (entry == &rx_ctx->rx_entry_list) + rx_entry = NULL; + + return rx_entry; +} diff --git 
a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_util.h b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_util.h index 2c54710b38..92621ce175 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_util.h +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_util.h @@ -40,30 +40,31 @@ #include #define SOCK_ERROR (1) -#define SOCK_WARN (2) -#define SOCK_INFO (3) +#define SOCK_WARN (2) +#define SOCK_INFO (3) extern int sock_log_level; -#define SOCK_LOG_INFO(...) do { \ - if (sock_log_level <= SOCK_INFO) { \ - fprintf(stderr, "[SOCK_INFO - %s]: ", __func__); \ +#define SOCK_LOG_INFO(...) do { \ + if (sock_log_level >= SOCK_INFO) { \ + fprintf(stderr, "[SOCK_INFO - %s:%d]: ", __func__, __LINE__); \ fprintf(stderr, __VA_ARGS__); \ } \ } while (0) -#define SOCK_LOG_WARN(...) do { \ - if (sock_log_level <= SOCK_WARN) { \ - fprintf(stderr, "[SOCK_WARN - %s]: ", __func__); \ +#define SOCK_LOG_WARN(...) do { \ + if (sock_log_level >= SOCK_WARN) { \ + fprintf(stderr, "[SOCK_WARN - %s:%d]: ", __func__, __LINE__); \ fprintf(stderr, __VA_ARGS__); \ } \ } while (0) #define SOCK_LOG_ERROR(...) do { \ - if (sock_log_level <= SOCK_ERROR) { \ - fprintf(stderr, "[SOCK_ERROR - %s]: ", __func__); \ + if (sock_log_level >= SOCK_ERROR) { \ + fprintf(stderr, "[SOCK_ERROR - %s:%d]: ", __func__, __LINE__); \ fprintf(stderr, __VA_ARGS__); \ } \ } while (0) #endif + diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_wait.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_wait.c new file mode 100644 index 0000000000..f28437e2af --- /dev/null +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_wait.c @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include + +#include "sock.h" +#include "sock_util.h" + +enum { + WAIT_READ_FD = 0, + WAIT_WRITE_FD, +}; + +int sock_wait_get_obj(struct fid_wait *fid, void *arg) +{ + struct fi_mutex_cond mut_cond; + struct sock_wait *wait; + + wait = container_of(fid, struct sock_wait, wait_fid.fid); + + switch (wait->type) { + case FI_WAIT_FD: + memcpy(arg,&wait->fd[WAIT_READ_FD], sizeof(int)); + break; + + case FI_WAIT_MUTEX_COND: + mut_cond.mutex = &wait->mutex; + mut_cond.cond = &wait->cond; + memcpy(arg, &mut_cond, sizeof(mut_cond)); + break; + + default: + SOCK_LOG_ERROR("Invalid wait obj type\n"); + return -FI_EINVAL; + } + + return 0; +} + +static int sock_wait_init(struct sock_wait *wait, enum fi_wait_obj type) +{ + long flags = 0; + wait->type = type; + + switch (type) { + case FI_WAIT_FD: + if (socketpair(AF_UNIX, SOCK_STREAM, 0, wait->fd)) + return -errno; + + fcntl(wait->fd[WAIT_READ_FD], F_GETFL, &flags); + if (fcntl(wait->fd[WAIT_READ_FD], F_SETFL, flags | O_NONBLOCK)) { + close(wait->fd[WAIT_READ_FD]); + close(wait->fd[WAIT_WRITE_FD]); + return -errno; + } + break; + + case FI_WAIT_MUTEX_COND: + pthread_mutex_init(&wait->mutex, NULL); + pthread_cond_init(&wait->cond, NULL); + break; + + default: + SOCK_LOG_ERROR("Invalid wait object type\n"); + return -FI_EINVAL; + } + return 0; +} + +static int sock_wait_wait(struct fid_wait *wait_fid, int timeout) +{ + int err = 0; + struct sock_cq *cq; + struct sock_cntr *cntr; + struct timeval now; + struct sock_wait *wait; + double start_ms = 0.0, end_ms = 0.0; + struct dlist_entry *p, *head; + struct sock_fid_list *list_item; + + wait = container_of(wait_fid, struct sock_wait, wait_fid); + if (wait->domain->progress_mode == FI_PROGRESS_MANUAL) { + if (timeout > 0) { + gettimeofday(&now, NULL); + start_ms = (double)now.tv_sec * 1000.0 + + (double)now.tv_usec / 1000.0; + } + + head = &wait->fid_list; + for (p = head->next; p != head; p = p->next) { + list_item = container_of(p, struct sock_fid_list, entry); + switch (list_item->fid->fclass) { + case FI_CLASS_CQ: + cq = container_of(list_item->fid, + struct sock_cq, cq_fid); + sock_cq_progress(cq); + break; + + case FI_CLASS_CNTR: + cntr = container_of(list_item->fid, + struct sock_cntr, cntr_fid); + sock_cntr_progress(cntr); + break; + } + } + if (timeout > 0) { + gettimeofday(&now, NULL); + end_ms = (double)now.tv_sec * 1000.0 + + (double)now.tv_usec / 1000.0; + timeout -= (end_ms - start_ms); + timeout = timeout < 0 ? 
0 : timeout; + } + } + + switch (wait->type) { + case FI_WAIT_FD: + err = fi_poll_fd(wait->fd[WAIT_READ_FD], timeout); + if (err > 0) + err = 0; + else if (err == 0) + err = -FI_ETIMEDOUT; + break; + + case FI_WAIT_MUTEX_COND: + err = fi_wait_cond(&wait->cond, + &wait->mutex, timeout); + break; + + default: + SOCK_LOG_ERROR("Invalid wait object type\n"); + return -FI_EINVAL; + } + return err; +} + +void sock_wait_signal(struct fid_wait *wait_fid) +{ + struct sock_wait *wait; + static char c = 'a'; + + wait = container_of(wait_fid, struct sock_wait, wait_fid); + + switch (wait->type) { + case FI_WAIT_FD: + write(wait->fd[WAIT_WRITE_FD], &c, 1); + break; + + case FI_WAIT_MUTEX_COND: + pthread_cond_signal(&wait->cond); + break; + default: + SOCK_LOG_ERROR("Invalid wait object type\n"); + return; + } +} + +static struct fi_ops_wait sock_wait_ops = { + .size = sizeof(struct fi_ops_wait), + .wait = sock_wait_wait, +}; + +static int sock_wait_control(struct fid *fid, int command, void *arg) +{ + struct sock_wait *wait; + int ret = 0; + + wait = container_of(fid, struct sock_wait, wait_fid.fid); + switch (command) { + case FI_GETWAIT: + ret = sock_wait_get_obj(&wait->wait_fid, arg); + break; + default: + ret = -FI_EINVAL; + break; + } + return ret; +} + +int sock_wait_close(fid_t fid) +{ + struct sock_fid_list *list_item; + struct dlist_entry *p, *head; + struct sock_wait *wait; + + wait = container_of(fid, struct sock_wait, wait_fid.fid); + head = &wait->fid_list; + + for (p = head->next; p != head; p = p->next) { + list_item = container_of(p, struct sock_fid_list, entry); + free(list_item); + } + + if (wait->type == FI_WAIT_FD) { + close(wait->fd[WAIT_READ_FD]); + close(wait->fd[WAIT_WRITE_FD]); + } + + atomic_dec(&wait->domain->ref); + free(wait); + return 0; +} + +static struct fi_ops sock_wait_fi_ops = { + .size = sizeof(struct fi_ops), + .close = sock_wait_close, + .bind = fi_no_bind, + .control = sock_wait_control, + .ops_open = fi_no_ops_open, +}; + +static int sock_verify_wait_attr(struct fi_wait_attr *attr) +{ + switch (attr->wait_obj) { + case FI_WAIT_UNSPEC: + case FI_WAIT_FD: + case FI_WAIT_MUTEX_COND: + break; + + default: + SOCK_LOG_ERROR("Invalid wait object type\n"); + return -FI_EINVAL; + } + if (attr->flags) + return -FI_EINVAL; + return 0; +} + +int sock_wait_open(struct fid_domain *domain, struct fi_wait_attr *attr, + struct fid_wait **waitset) +{ + int err; + struct sock_wait *wait; + struct sock_domain *dom; + enum fi_wait_obj wait_obj_type; + + + if(attr && sock_verify_wait_attr(attr)) + return -FI_EINVAL; + + dom = container_of(domain, struct sock_domain, dom_fid); + if (!attr || attr->wait_obj == FI_WAIT_UNSPEC) + wait_obj_type = FI_WAIT_FD; + + wait = calloc(1, sizeof(*wait)); + if (!wait) + return -FI_ENOMEM; + + err = sock_wait_init(wait, wait_obj_type); + if (err) { + free(wait); + return err; + } + + wait->wait_fid.fid.fclass = FI_CLASS_WAIT; + wait->wait_fid.fid.context = 0; + wait->wait_fid.fid.ops = &sock_wait_fi_ops; + wait->wait_fid.ops = &sock_wait_ops; + wait->domain = dom; + wait->type = wait_obj_type; + atomic_inc(&dom->ref); + + *waitset = &wait->wait_fid; + return 0; +} diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf.h b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf.h index 5c68011b96..d60bba3dfe 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf.h +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf.h @@ -40,25 +40,40 @@ #include #include "usdf_progress.h" +#include "usd.h" + +#define 
USDF_PROV_NAME "usnic" +#define USDF_MAJOR_VERS 1 +#define USDF_MINOR_VERS 0 +#define USDF_PROV_VERSION FI_VERSION(USDF_MAJOR_VERS, USDF_MINOR_VERS) -#define USDF_FI_NAME "usnic" #define USDF_HDR_BUF_ENTRY 64 #define USDF_EP_CAP_PIO (1ULL << 63) +#define USDF_MAX_PEERS (16 * 1024) + #define USDF_DGRAM_CAPS (FI_MSG | FI_SOURCE | FI_SEND | FI_RECV) #define USDF_DGRAM_SUPP_MODE (FI_LOCAL_MR | FI_MSG_PREFIX) #define USDF_DGRAM_REQ_MODE (FI_LOCAL_MR) -#define USDF_MSG_CAPS (FI_MSG | FI_SOURCE | FI_SEND | FI_RECV) - -#define USDF_MSG_SUPP_MODE (FI_LOCAL_MR) -#define USDF_MSG_REQ_MODE (FI_LOCAL_MR) - /* usdf event flags */ #define USDF_EVENT_FLAG_ERROR (1ULL << 62) #define USDF_EVENT_FLAG_FREE_BUF (1ULL << 63) +/* + * TAILQ stuff that should exist + */ +#define TAILQ_REMOVE_MARK(head, elm, link) \ + do { \ + TAILQ_REMOVE(head, elm, link); \ + (elm)->link.tqe_prev = NULL; \ + } while (0) + +#define TAILQ_ON_LIST(elm, link) ((elm)->link.tqe_prev != NULL) + +struct usdf_domain; + struct usdf_dev_entry { struct usd_device *ue_dev; struct usd_device_attrs ue_dattr; @@ -73,9 +88,11 @@ extern struct usdf_usnic_info *__usdf_devinfo; struct usdf_fabric { struct fid_fabric fab_fid; + struct fi_fabric_attr fab_attr; struct usd_device_attrs *fab_dev_attrs; int fab_arp_sockfd; atomic_t fab_refcnt; + LIST_HEAD(,usdf_domain) fab_domain_list; /* progression */ pthread_t fab_thread; @@ -98,10 +115,25 @@ struct usdf_fabric { struct usdf_domain { struct fid_domain dom_fid; struct usdf_fabric *dom_fabric; + struct fi_info *dom_info; atomic_t dom_refcnt; struct usdf_eq *dom_eq; struct usd_device *dom_dev; - struct usd_device_attrs dom_dev_attrs; + + pthread_spinlock_t dom_progress_lock; + TAILQ_HEAD(,usdf_tx) dom_tx_ready; + TAILQ_HEAD(,usdf_cq_hard) dom_hcq_list; + + struct usdf_rdm_connection **dom_rdc_hashtab; + SLIST_HEAD(,usdf_rdm_connection) dom_rdc_free; + atomic_t dom_rdc_free_cnt; + size_t dom_rdc_total; + + /* used only by connected endpoints */ + struct usdf_ep **dom_peer_tab; + uint32_t dom_next_peer; + + LIST_ENTRY(usdf_domain) dom_link; }; #define dom_ftou(FDOM) container_of(FDOM, struct usdf_domain, dom_fid) #define dom_utof(DOM) (&(DOM)->dom_fid) @@ -125,41 +157,174 @@ struct usdf_pep { #define pep_ftou(FPEP) container_of(FPEP, struct usdf_pep, pep_fid) #define pep_fidtou(FID) container_of(FID, struct usdf_pep, pep_fid.fid) #define pep_utof(PEP) (&(PEP)->pep_fid) +#define pep_utofid(PEP) (&(PEP)->pep_fid.fid) + +struct usdf_tx { + struct fid_stx tx_fid; + atomic_t tx_refcnt; + struct usdf_domain *tx_domain; + TAILQ_ENTRY(usdf_tx) tx_link; + + struct fi_tx_attr tx_attr; + struct usd_qp *tx_qp; + void (*tx_progress)(struct usdf_tx *tx); + + union { + struct { + struct usdf_cq_hard *tx_hcq; + + struct usdf_msg_qe *tx_wqe_buf; + TAILQ_HEAD(,usdf_msg_qe) tx_free_wqe; + TAILQ_HEAD(,usdf_ep) tx_ep_ready; + TAILQ_HEAD(,usdf_ep) tx_ep_have_acks; + } msg; + struct { + struct usdf_cq_hard *tx_hcq; + + atomic_t tx_next_msg_id; + struct usdf_rdm_qe *tx_wqe_buf; + TAILQ_HEAD(,usdf_rdm_qe) tx_free_wqe; + TAILQ_HEAD(,usdf_rdm_connection) tx_rdc_ready; + TAILQ_HEAD(,usdf_rdm_connection) tx_rdc_have_acks; + } rdm; + } t; +}; +#define tx_ftou(FEP) container_of(FEP, struct usdf_tx, tx_fid) +#define tx_fidtou(FID) container_of(FID, struct usdf_tx, tx_fid) +#define tx_utof(RX) (&(RX)->tx_fid) +#define tx_utofid(RX) (&(RX)->tx_fid.fid) + +struct usdf_rx { + struct fid_ep rx_fid; + atomic_t rx_refcnt; + struct usdf_domain *rx_domain; + + struct fi_rx_attr rx_attr; + struct usd_qp *rx_qp; + + union { + struct { + struct 
usdf_cq_hard *rx_hcq; + + uint8_t *rx_bufs; + struct usdf_msg_qe *rx_rqe_buf; + TAILQ_HEAD(,usdf_msg_qe) rx_free_rqe; + TAILQ_HEAD(,usdf_msg_qe) rx_posted_rqe; + } msg; + struct { + int rx_sock; + struct usdf_cq_hard *rx_hcq; + struct usdf_tx *rx_tx; + + uint8_t *rx_bufs; + struct usdf_rdm_qe *rx_rqe_buf; + TAILQ_HEAD(,usdf_rdm_qe) rx_free_rqe; + TAILQ_HEAD(,usdf_rdm_qe) rx_posted_rqe; + } rdm; + } r; +}; +#define rx_ftou(FEP) container_of(FEP, struct usdf_rx, rx_fid) +#define rx_fidtou(FID) container_of(FID, struct usdf_rx, rx_fid) +#define rx_utof(RX) (&(RX)->rx_fid) +#define rx_utofid(RX) (&(RX)->rx_fid.fid) struct usdf_ep { struct fid_ep ep_fid; + struct usdf_domain *ep_domain; atomic_t ep_refcnt; uint64_t ep_caps; uint64_t ep_mode; - int ep_sock; - int ep_conn_sock; - uint32_t ep_wqe; + + uint32_t ep_wqe; /* requested queue sizes */ uint32_t ep_rqe; - struct usdf_domain *ep_domain; - struct usdf_av *ep_av; - struct usdf_cq *ep_wcq; - struct usdf_cq *ep_rcq; - struct usdf_eq *ep_eq; - struct usd_qp *ep_qp; - struct usd_dest *ep_dest; + struct usd_qp_attrs ep_qp_attrs; - void *ep_hdr_buf; - struct usd_udp_hdr **ep_hdr_ptr; + + struct usdf_eq *ep_eq; + + struct usdf_tx *ep_tx; + struct usdf_rx *ep_rx; + + union { + struct { + struct usd_qp *ep_qp; + struct usdf_cq *ep_wcq; + struct usdf_cq *ep_rcq; + + int ep_sock; + struct usdf_av *ep_av; + + void *ep_hdr_buf; + struct usd_udp_hdr **ep_hdr_ptr; + } dg; + struct { + + struct usdf_connreq *ep_connreq; + struct usd_dest *ep_dest; + uint32_t ep_rem_peer_id; + uint32_t ep_lcl_peer_id; + + TAILQ_HEAD(,usdf_msg_qe) ep_posted_wqe; + TAILQ_HEAD(usdf_msg_qe_head ,usdf_msg_qe) ep_sent_wqe; + uint32_t ep_fairness_credits; + uint32_t ep_seq_credits; + uint16_t ep_next_tx_seq; + uint16_t ep_last_rx_ack; + int ep_send_nak; + + struct usdf_msg_qe *ep_cur_recv; + uint16_t ep_next_rx_seq; + TAILQ_ENTRY(usdf_ep) ep_ack_link; + + struct usdf_timer_entry *ep_ack_timer; + + TAILQ_ENTRY(usdf_ep) ep_link; + } msg; + struct { + int ep_sock; + struct usdf_av *ep_av; + + } rdm; + } e; }; #define ep_ftou(FEP) container_of(FEP, struct usdf_ep, ep_fid) #define ep_fidtou(FID) container_of(FID, struct usdf_ep, ep_fid.fid) #define ep_utof(EP) (&(EP)->ep_fid) +#define ep_utofid(EP) (&(EP)->ep_fid.fid) struct usdf_mr { struct fid_mr mr_fid; struct usd_mr *mr_mr; }; +struct usdf_cq_hard { + struct usdf_cq *cqh_cq; + struct usd_cq *cqh_ucq; + atomic_t cqh_refcnt; + void (*cqh_progress)(struct usdf_cq_hard *hcq); + void (*cqh_post)(struct usdf_cq_hard *hcq, void *context, size_t len); + TAILQ_ENTRY(usdf_cq_hard) cqh_link; + TAILQ_ENTRY(usdf_cq_hard) cqh_dom_link; +}; + struct usdf_cq { struct fid_cq cq_fid; atomic_t cq_refcnt; struct usdf_domain *cq_domain; - struct usd_cq *cq_cq; + struct fi_cq_attr cq_attr; + + union { + struct { + struct usd_cq *cq_cq; + } hard; + struct { + void *cq_comps; + void *cq_end; + void *cq_head; + void *cq_tail; + TAILQ_HEAD(,usdf_cq_hard) cq_list; + } soft; + } c; struct usd_completion cq_comp; }; #define cq_ftou(FCQ) container_of(FCQ, struct usdf_cq, cq_fid) diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_av.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_av.c index 791c2d5a73..18dff35d88 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_av.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_av.c @@ -58,7 +58,6 @@ #include "libnl_utils.h" #include "usd.h" #include "usd_queue.h" -#include "usd_dest.h" #include "usdf.h" #include "usdf_av.h" @@ -114,11 +113,27 @@ 
usdf_post_insert_request_error(struct usdf_av_insert *insert, err_entry.data = req - (struct usdf_av_req *)(insert + 1); err_entry.err = -req->avr_status; - usdf_eq_write_internal(av->av_eq, FI_COMPLETE, + usdf_eq_write_internal(av->av_eq, 0, &err_entry, sizeof(err_entry), USDF_EVENT_FLAG_ERROR); } +static int +usdf_av_alloc_dest(struct usdf_dest **dest_o) +{ + struct usdf_dest *dest; + + dest = calloc(1, sizeof(**dest_o)); + if (dest == NULL) { + return -errno; + } + SLIST_INIT(&dest->ds_rdm_rdc_list); + + *dest_o = dest; + return 0; +} + + /* * Called by progression thread to look for AV completions on this domain */ @@ -128,7 +143,7 @@ usdf_av_insert_progress(void *v) int ret; struct usdf_av_insert *insert; struct usdf_fabric *fp; - struct usd_dest *dest; + struct usdf_dest *dest; struct usdf_av_req *req; struct usdf_av_req *tmpreq; struct usd_device_attrs *dap; @@ -142,7 +157,7 @@ usdf_av_insert_progress(void *v) TAILQ_FOREACH_SAFE(req, tmpreq, &insert->avi_req_list, avr_link) { dest = req->avr_dest; - eth = &dest->ds_dest.ds_udp.u_hdr.uh_eth.ether_dhost[0]; + eth = &dest->ds_dest.ds_dest.ds_udp.u_hdr.uh_eth.ether_dhost[0]; ret = usnic_arp_lookup(dap->uda_ifname, req->avr_daddr_be, fp->fab_arp_sockfd, eth); @@ -153,7 +168,7 @@ usdf_av_insert_progress(void *v) if (ret == 0) { ++insert->avi_successes; - *(struct usd_dest **)req->avr_fi_addr = dest; + *(struct usdf_dest **)req->avr_fi_addr = dest; } else { usdf_post_insert_request_error(insert, req); } @@ -282,7 +297,7 @@ usdf_am_insert_async(struct fid_av *fav, const void *addr, size_t count, ret = -FI_ENOMEM; goto fail; } - usd_fill_udp_dest(req->avr_dest, dap, + usd_fill_udp_dest(&req->avr_dest->ds_dest, dap, sin->sin_addr.s_addr, sin->sin_port); TAILQ_INSERT_TAIL(&insert->avi_req_list, req, avr_link); @@ -313,7 +328,8 @@ usdf_am_insert_sync(struct fid_av *fav, const void *addr, size_t count, { const struct sockaddr_in *sin; struct usdf_av *av; - struct usd_dest *dest; + struct usd_dest *u_dest; + struct usdf_dest *dest = dest; // supress uninit int ret_count; int ret; int i; @@ -327,16 +343,21 @@ usdf_am_insert_sync(struct fid_av *fav, const void *addr, size_t count, ret_count = 0; sin = addr; - /* XXX parallelize */ + /* XXX parallelize, this will also eliminate u_dest silliness */ for (i = 0; i < count; i++) { - ret = usd_create_dest(av->av_domain->dom_dev, + ret = usdf_av_alloc_dest(&dest); + if (ret == 0) { + ret = usd_create_dest(av->av_domain->dom_dev, sin->sin_addr.s_addr, sin->sin_port, - &dest); - if (ret != 0) { - fi_addr[i] = FI_ADDR_NOTAVAIL; - } else { + &u_dest); + } + if (ret == 0) { + dest->ds_dest = *u_dest; + free(u_dest); fi_addr[i] = (fi_addr_t)dest; ++ret_count; + } else { + fi_addr[i] = FI_ADDR_NOTAVAIL; } ++sin; } @@ -348,7 +369,7 @@ static int usdf_am_remove(struct fid_av *fav, fi_addr_t *fi_addr, size_t count, uint64_t flags) { - struct usd_dest *dest; + struct usdf_dest *dest; struct usdf_av *av; av = av_ftou(fav); @@ -358,8 +379,8 @@ usdf_am_remove(struct fid_av *fav, fi_addr_t *fi_addr, size_t count, } // XXX - dest = (struct usd_dest *)(uintptr_t)fi_addr; - usd_destroy_dest(dest); + dest = (struct usdf_dest *)(uintptr_t)fi_addr; + free(dest); return 0; } @@ -368,11 +389,11 @@ static int usdf_am_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr, size_t *addrlen) { - struct usd_dest *dest; + struct usdf_dest *dest; struct sockaddr_in sin; size_t copylen; - dest = (struct usd_dest *)(uintptr_t)fi_addr; + dest = (struct usdf_dest *)(uintptr_t)fi_addr; if (*addrlen < sizeof(sin)) { copylen = *addrlen; @@ 
-381,7 +402,7 @@ usdf_am_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr, } sin.sin_family = AF_INET; - usd_expand_dest(dest, &sin.sin_addr.s_addr, &sin.sin_port); + usd_expand_dest(&dest->ds_dest, &sin.sin_addr.s_addr, &sin.sin_port); memcpy(addr, &sin, copylen); *addrlen = sizeof(sin); @@ -518,9 +539,6 @@ usdf_av_open(struct fid_domain *domain, struct fi_av_attr *attr, struct usdf_domain *udp; struct usdf_av *av; - if (attr->name != NULL) { - return -FI_ENOSYS; - } if ((attr->flags & ~(FI_EVENT | FI_READ)) != 0) { return -FI_ENOSYS; } diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_av.h b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_av.h index 557c1ea21c..3a46ccdca0 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_av.h +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_av.h @@ -36,13 +36,26 @@ #ifndef _USDF_AV_H_ #define _USDF_AV_H_ +#include "usd_dest.h" + #define USDF_AV_MAX_ARPS 3 #define USDF_AV_ARP_INTERVAL 1000 +struct usdf_rdm_connection; + +/* + * libfabric version of dest + */ +struct usdf_dest { + struct usd_dest ds_dest; + + SLIST_HEAD(,usdf_rdm_connection) ds_rdm_rdc_list; +}; + /* struct used to track async insert requests */ struct usdf_av_req { fi_addr_t *avr_fi_addr; - struct usd_dest *avr_dest; + struct usdf_dest *avr_dest; int avr_status; uint32_t avr_daddr_be; diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_cm.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_cm.c index c7c0b875fe..2e45f3b751 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_cm.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_cm.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -58,90 +59,401 @@ #include "usnic_direct.h" #include "usdf.h" +#include "usdf_endpoint.h" #include "usdf_dgram.h" -#include "usdf_cm.h" #include "usdf_msg.h" +#include "usdf_av.h" +#include "usdf_cm.h" - -static struct fi_ops_msg usdf_dgram_conn_ops = { - .size = sizeof(struct fi_ops_msg), - .recv = usdf_dgram_recv, - .recvv = usdf_dgram_recvv, - .recvmsg = usdf_dgram_recvmsg, - .send = usdf_dgram_conn_send, - .sendv = usdf_dgram_sendv, - .sendmsg = usdf_dgram_sendmsg, - .inject = usdf_dgram_inject, - .senddata = usdf_dgram_senddata, -}; - -int -usdf_cm_dgram_connect(struct fid_ep *fep, const void *addr, - const void *param, size_t paramlen) +static void +usdf_cm_msg_connreq_cleanup(struct usdf_connreq *crp) { struct usdf_ep *ep; - const struct sockaddr_in *sin; - int ret; + struct usdf_pep *pep; + struct usdf_fabric *fp; - ep = ep_ftou(fep); - sin = addr; - - ret = usd_create_dest(ep->ep_domain->dom_dev, sin->sin_addr.s_addr, - sin->sin_port, &ep->ep_dest); - if (!ret) { - ep->ep_fid.msg = &usdf_dgram_conn_ops; + ep = crp->cr_ep; + pep = crp->cr_pep; + if (pep != NULL) { + fp = pep->pep_fabric; + } else { + fp = ep->ep_domain->dom_fabric; } - return ret; + if (crp->cr_pollitem.pi_rtn != NULL) { + epoll_ctl(fp->fab_epollfd, EPOLL_CTL_DEL, crp->cr_sockfd, NULL); + crp->cr_pollitem.pi_rtn = NULL; + } + if (crp->cr_sockfd != -1) { + close(crp->cr_sockfd); + crp->cr_sockfd = -1; + } + + /* If there is a passive endpoint, recycle the crp */ + if (pep != NULL) { + if (TAILQ_ON_LIST(crp, cr_link)) { + TAILQ_REMOVE(&pep->pep_cr_pending, crp, cr_link); + } + TAILQ_INSERT_TAIL(&pep->pep_cr_free, crp, cr_link); + } else { + free(crp); + } +} + +static int +usdf_cm_msg_accept_complete(struct usdf_connreq *crp) +{ + struct usdf_ep *ep; + struct fi_eq_cm_entry entry; + int ret; + + 
ep = crp->cr_ep; + + /* post EQ entry */ + entry.fid = ep_utofid(ep); + entry.info = NULL; + ret = usdf_eq_write_internal(ep->ep_eq, FI_COMPLETE, &entry, + sizeof(entry), 0); + if (ret != sizeof(entry)) { + usdf_cm_msg_connreq_failed(crp, ret); + return 0; + } + + usdf_cm_msg_connreq_cleanup(crp); + + return 0; } int -usdf_cm_dgram_shutdown(struct fid_ep *ep, uint64_t flags) +usdf_cm_msg_accept(struct fid_ep *fep, const void *param, size_t paramlen) { - return 0; // XXX + struct usdf_ep *ep; + struct usdf_rx *rx; + struct usdf_domain *udp; + struct usdf_fabric *fp; + struct usdf_connreq *crp; + struct usdf_connreq_msg *reqp; + struct usd_qp_impl *qp; + int ret; + int n; + + ep = ep_ftou(fep); + udp = ep->ep_domain; + fp = udp->dom_fabric; + crp = ep->e.msg.ep_connreq; + if (crp == NULL) { + return -FI_ENOTCONN; + } + if (ep->ep_eq == NULL) { + return -FI_ENOEQ; + } + crp->cr_ep = ep; + reqp = (struct usdf_connreq_msg *)crp->cr_data; + + ep->e.msg.ep_lcl_peer_id = ntohs(reqp->creq_peer_id); + + /* start creating the dest early */ + ret = usd_create_dest_with_mac(udp->dom_dev, reqp->creq_ipaddr, + reqp->creq_port, reqp->creq_mac, + &ep->e.msg.ep_dest); + if (ret != 0) { + goto fail; + } + + ret = usdf_ep_msg_get_queues(ep); + if (ret != 0) { + goto fail; + } + rx = ep->ep_rx; + qp = to_qpi(rx->rx_qp); + + /* allocate a peer ID */ + ep->e.msg.ep_rem_peer_id = udp->dom_next_peer; + udp->dom_peer_tab[udp->dom_next_peer] = ep; + ++udp->dom_next_peer; + + crp->cr_ptr = crp->cr_data; + crp->cr_resid = sizeof(*reqp) + paramlen; + + reqp->creq_peer_id = htons(ep->e.msg.ep_rem_peer_id); + reqp->creq_ipaddr = fp->fab_dev_attrs->uda_ipaddr_be; + reqp->creq_port = + qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; + memcpy(reqp->creq_mac, fp->fab_dev_attrs->uda_mac_addr, ETH_ALEN); + reqp->creq_result = htonl(0); + reqp->creq_datalen = htonl(paramlen); + memcpy(reqp->creq_data, param, paramlen); + + n = write(crp->cr_sockfd, crp->cr_ptr, crp->cr_resid); + if (n == -1) { + usdf_cm_msg_connreq_cleanup(crp); + ret = -errno; + goto fail; + } + + crp->cr_resid -= n; + if (crp->cr_resid == 0) { + usdf_cm_msg_accept_complete(crp); + } else { + // XXX set up epoll junk to send rest + } + + return 0; +fail: + free(ep->e.msg.ep_dest); + /* XXX release queues */ + return ret; +} + +/* + * Connection request attempt failed + */ +void +usdf_cm_msg_connreq_failed(struct usdf_connreq *crp, int error) +{ + struct usdf_pep *pep; + struct usdf_ep *ep; + struct usdf_eq *eq; + fid_t fid; + struct fi_eq_err_entry err; + + pep = crp->cr_pep; + ep = crp->cr_ep; + if (ep != NULL) { + fid = ep_utofid(ep); + eq = ep->ep_eq; + ep->ep_domain->dom_peer_tab[ep->e.msg.ep_rem_peer_id] = NULL; + } else { + fid = pep_utofid(pep); + eq = pep->pep_eq; + } + + err.fid = fid; + err.context = NULL; + err.data = 0; + err.err = -error; + err.prov_errno = 0; + err.err_data = NULL; + usdf_eq_write_internal(eq, 0, &err, sizeof(err), USDF_EVENT_FLAG_ERROR); + + usdf_cm_msg_connreq_cleanup(crp); +} + +/* + * read connection request response from the listener + */ +static int +usdf_cm_msg_connect_cb_rd(void *v) +{ + struct usdf_connreq *crp; + struct usdf_ep *ep; + struct usdf_fabric *fp; + struct usdf_domain *udp; + struct usdf_connreq_msg *reqp; + struct fi_eq_cm_entry *entry; + size_t entry_len; + int ret; + + crp = v; + ep = crp->cr_ep; + fp = ep->ep_domain->dom_fabric; + + ret = read(crp->cr_sockfd, crp->cr_ptr, crp->cr_resid); + if (ret == -1) { + usdf_cm_msg_connreq_failed(crp, -errno); + return 0; + } + + crp->cr_resid -= ret; + 
reqp = (struct usdf_connreq_msg *)crp->cr_data; + if (crp->cr_resid == 0 && crp->cr_ptr == crp->cr_data + sizeof(*reqp)) { + reqp->creq_datalen = ntohl(reqp->creq_datalen); + crp->cr_resid = reqp->creq_datalen; + } + + /* if resid is 0 now, completely done */ + if (crp->cr_resid == 0) { + ret = epoll_ctl(fp->fab_epollfd, EPOLL_CTL_DEL, + crp->cr_sockfd, NULL); + close(crp->cr_sockfd); + crp->cr_sockfd = -1; + + entry_len = sizeof(*entry) + reqp->creq_datalen; + entry = malloc(entry_len); + if (entry == NULL) { + usdf_cm_msg_connreq_failed(crp, -errno); + return 0; + } + + udp = ep->ep_domain; + ep->e.msg.ep_lcl_peer_id = ntohs(reqp->creq_peer_id); + ret = usd_create_dest_with_mac(udp->dom_dev, reqp->creq_ipaddr, + reqp->creq_port, reqp->creq_mac, + &ep->e.msg.ep_dest); + if (ret != 0) { + free(entry); + usdf_cm_msg_connreq_failed(crp, ret); + return 0; + } + + entry->fid = ep_utofid(ep); + entry->info = NULL; + memcpy(entry->data, reqp->creq_data, reqp->creq_datalen); + ret = usdf_eq_write_internal(ep->ep_eq, FI_COMPLETE, entry, + entry_len, 0); + free(entry); + if (ret != entry_len) { + free(ep->e.msg.ep_dest); + ep->e.msg.ep_dest = NULL; + usdf_cm_msg_connreq_failed(crp, ret); + return 0; + } + + usdf_cm_msg_connreq_cleanup(crp); + } + return 0; +} + +/* + * Write connection request data to the listener + * Once everything is written, switch over into listening mode to + * capture the listener response. + */ +static int +usdf_cm_msg_connect_cb_wr(void *v) +{ + struct usdf_connreq *crp; + struct usdf_ep *ep; + struct usdf_fabric *fp; + struct epoll_event ev; + int ret; + + crp = v; + ep = crp->cr_ep; + fp = ep->ep_domain->dom_fabric; + + ret = write(crp->cr_sockfd, crp->cr_ptr, crp->cr_resid); + if (ret == -1) { + usdf_cm_msg_connreq_failed(crp, -errno); + return 0; + } + + crp->cr_resid -= ret; + if (crp->cr_resid == 0) { + crp->cr_pollitem.pi_rtn = usdf_cm_msg_connect_cb_rd; + ev.events = EPOLLIN; + ev.data.ptr = &crp->cr_pollitem; + ret = epoll_ctl(fp->fab_epollfd, EPOLL_CTL_MOD, + crp->cr_sockfd, &ev); + if (ret != 0) { + usdf_cm_msg_connreq_failed(crp, -errno); + return 0; + } + + crp->cr_ptr = crp->cr_data; + crp->cr_resid = sizeof(struct usdf_connreq_msg); + } + return 0; } int usdf_cm_msg_connect(struct fid_ep *fep, const void *addr, const void *param, size_t paramlen) { + struct usdf_connreq *crp; struct usdf_ep *ep; + struct usdf_rx *rx; + struct usdf_domain *udp; const struct sockaddr_in *sin; + struct epoll_event ev; + struct usdf_fabric *fp; + struct usdf_connreq_msg *reqp; + struct usd_qp_impl *qp; int ret; ep = ep_ftou(fep); + udp = ep->ep_domain; + fp = udp->dom_fabric; sin = addr; + crp = NULL; - ep->ep_conn_sock = socket(AF_INET, SOCK_STREAM, 0); - if (ep->ep_conn_sock == -1) { + crp = calloc(1, sizeof(*crp) + sizeof(struct usdf_connreq_msg) + + paramlen); + if (crp == NULL) { ret = -errno; goto fail; } - ret = fcntl(ep->ep_conn_sock, F_GETFL, 0); + crp->cr_sockfd = socket(AF_INET, SOCK_STREAM, 0); + if (crp->cr_sockfd == -1) { + ret = -errno; + goto fail; + } + + ret = fcntl(crp->cr_sockfd, F_GETFL, 0); if (ret == -1) { ret = -errno; goto fail; } - ret = fcntl(ep->ep_conn_sock, F_SETFL, ret | O_NONBLOCK); + ret = fcntl(crp->cr_sockfd, F_SETFL, ret | O_NONBLOCK); if (ret == -1) { ret = -errno; goto fail; } - ret = connect(ep->ep_conn_sock, (struct sockaddr *)sin, sizeof(*sin)); + ret = usdf_ep_msg_get_queues(ep); + if (ret != 0) { + goto fail; + } + rx = ep->ep_rx; + qp = to_qpi(rx->rx_qp); + + ret = connect(crp->cr_sockfd, (struct sockaddr *)sin, sizeof(*sin)); if (ret 
!= 0 && errno != EINPROGRESS) { ret = -errno; goto fail; } -printf("connect in progress\n"); + + /* register for notification when connect completes */ + crp->cr_pollitem.pi_rtn = usdf_cm_msg_connect_cb_wr; + crp->cr_pollitem.pi_context = crp; + ev.events = EPOLLOUT; + ev.data.ptr = &crp->cr_pollitem; + ret = epoll_ctl(fp->fab_epollfd, EPOLL_CTL_ADD, crp->cr_sockfd, &ev); + if (ret != 0) { + crp->cr_pollitem.pi_rtn = NULL; + ret = -errno; + goto fail; + } + + /* allocate remote peer ID */ + ep->e.msg.ep_rem_peer_id = udp->dom_next_peer; + udp->dom_peer_tab[udp->dom_next_peer] = ep; + ++udp->dom_next_peer; + + crp->cr_ep = ep; + reqp = (struct usdf_connreq_msg *)crp->cr_data; + crp->cr_ptr = crp->cr_data; + crp->cr_resid = sizeof(*reqp) + paramlen; + + reqp->creq_peer_id = htons(ep->e.msg.ep_rem_peer_id); + reqp->creq_ipaddr = fp->fab_dev_attrs->uda_ipaddr_be; + reqp->creq_port = + qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; + memcpy(reqp->creq_mac, fp->fab_dev_attrs->uda_mac_addr, ETH_ALEN); + reqp->creq_datalen = htonl(paramlen); + memcpy(reqp->creq_data, param, paramlen); return 0; fail: - if (ep->ep_conn_sock != -1) { - close(ep->ep_conn_sock); + if (crp != NULL) { + if (crp->cr_sockfd != -1) { + close(crp->cr_sockfd); + } + free(crp); } + usdf_ep_msg_release_queues(ep); return ret; } @@ -150,3 +462,50 @@ usdf_cm_msg_shutdown(struct fid_ep *ep, uint64_t flags) { return -FI_ENOSYS; } + +/* + * Check a message CQ for completions and progress the send engine as needed, + * create completions for the app if anything needs to be percolated up + */ +int +usdf_cq_msg_poll(struct usd_cq *ucq, struct usd_completion *comp) +{ + return -EAGAIN; +} + +/* + * Return local address of an EP + */ +int usdf_cm_rdm_getname(fid_t fid, void *addr, size_t *addrlen) +{ + struct usdf_ep *ep; + struct usdf_rx *rx; + struct sockaddr_in sin; + size_t copylen; + + ep = ep_fidtou(fid); + rx = ep->ep_rx; + + copylen = sizeof(sin); + if (copylen > *addrlen) { + copylen = *addrlen; + } + *addrlen = sizeof(sin); + + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = + ep->ep_domain->dom_fabric->fab_dev_attrs->uda_ipaddr_be; + if (rx == NULL || rx->rx_qp == NULL) { + sin.sin_port = 0; + } else { + sin.sin_port = to_qpi(rx->rx_qp)->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; + } + memcpy(addr, &sin, copylen); + + if (copylen < sizeof(sin)) { + return -FI_ETOOSMALL; + } else { + return 0; + } +} diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_cm.h b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_cm.h index c391260326..137fa8beb0 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_cm.h +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_cm.h @@ -38,18 +38,24 @@ #include -struct usdf_connreq_msg { - uint32_t creq_data_len; -} __attribute__((packed)); +#define USDF_MAX_CONN_DATA 256 -struct usdf_connresp_msg { - uint32_t cresp_result; - uint32_t cresp_reason; +struct usdf_connreq_msg { + uint32_t creq_peer_id; + uint32_t creq_ipaddr; + uint32_t creq_port; + uint8_t creq_mac[ETH_ALEN]; + uint8_t pad[8 - ETH_ALEN]; + uint32_t creq_result; + uint32_t creq_reason; + uint32_t creq_datalen; + uint8_t creq_data[0]; } __attribute__((packed)); struct usdf_connreq { int cr_sockfd; struct usdf_pep *cr_pep; + struct usdf_ep *cr_ep; TAILQ_ENTRY(usdf_connreq) cr_link; struct usdf_poll_item cr_pollitem; @@ -57,7 +63,12 @@ struct usdf_connreq { uint8_t *cr_ptr; size_t cr_resid; + size_t cr_datalen; uint8_t cr_data[0]; }; 
+void usdf_cm_msg_connreq_failed(struct usdf_connreq *crp, int error); + +int usdf_cm_rdm_getname(fid_t fid, void *addr, size_t *addrlen); + #endif /* _USDF_CM_H_ */ diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_cq.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_cq.c index 0160d7ea43..8fbf8a221f 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_cq.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_cq.c @@ -57,11 +57,14 @@ #include #include #include "fi.h" +#include "fi_enosys.h" #include "usnic_direct.h" #include "usd.h" #include "usdf.h" #include "usdf_av.h" +#include "usdf_progress.h" +#include "usdf_cq.h" static ssize_t usdf_cq_readerr(struct fid_cq *fcq, struct fi_cq_err_entry *entry, @@ -93,12 +96,23 @@ usdf_cq_sread(struct fid_cq *cq, void *buf, size_t count, const void *cond, return -FI_ENOSYS; } -static ssize_t -usdf_cq_read_context(struct fid_cq *fcq, void *buf, size_t count) +/* + * poll a hard CQ + * Since this routine is an inline and is always called with format as + * a constant, I am counting on the compiler optimizing away all the switches + * on format. + */ +static inline ssize_t +usdf_cq_read_common(struct fid_cq *fcq, void *buf, size_t count, + enum fi_cq_format format) { struct usdf_cq *cq; - struct fi_cq_entry *entry; - struct fi_cq_entry *last; + uint8_t *entry; + uint8_t *last; + size_t entry_len; + struct fi_cq_entry *ctx_entry; + struct fi_cq_msg_entry *msg_entry; + struct fi_cq_data_entry *data_entry; ssize_t ret; cq = cq_ftou(fcq); @@ -106,11 +120,26 @@ usdf_cq_read_context(struct fid_cq *fcq, void *buf, size_t count) return -FI_EAVAIL; } + switch (format) { + case FI_CQ_FORMAT_CONTEXT: + entry_len = sizeof(struct fi_cq_entry); + break; + case FI_CQ_FORMAT_MSG: + entry_len = sizeof(struct fi_cq_msg_entry); + break; + case FI_CQ_FORMAT_DATA: + entry_len = sizeof(struct fi_cq_data_entry); + break; + default: + return 0; + } + ret = 0; entry = buf; - last = entry + count; + last = entry + (entry_len * count); + while (entry < last) { - ret = usd_poll_cq(cq->cq_cq, &cq->cq_comp); + ret = usd_poll_cq(cq->c.hard.cq_cq, &cq->cq_comp); if (ret == -EAGAIN) { ret = 0; break; @@ -119,19 +148,56 @@ usdf_cq_read_context(struct fid_cq *fcq, void *buf, size_t count) ret = -FI_EAVAIL; break; } - - entry->op_context = cq->cq_comp.uc_context; - - entry++; + switch (format) { + case FI_CQ_FORMAT_CONTEXT: + ctx_entry = (struct fi_cq_entry *)entry; + ctx_entry->op_context = cq->cq_comp.uc_context; + break; + case FI_CQ_FORMAT_MSG: + msg_entry = (struct fi_cq_msg_entry *)entry; + msg_entry->op_context = cq->cq_comp.uc_context; + msg_entry->flags = 0; + msg_entry->len = cq->cq_comp.uc_bytes; + break; + case FI_CQ_FORMAT_DATA: + data_entry = (struct fi_cq_data_entry *)entry; + data_entry->op_context = cq->cq_comp.uc_context; + data_entry->flags = 0; + data_entry->len = cq->cq_comp.uc_bytes; + data_entry->buf = 0; /* XXX */ + data_entry->data = 0; + break; + default: + return 0; + } + entry += entry_len; } - if (entry > (struct fi_cq_entry *)buf) { - return entry - (struct fi_cq_entry *)buf; + if (entry > (uint8_t *)buf) { + return (entry - (uint8_t *)buf) / entry_len; } else { return ret; } } +static ssize_t +usdf_cq_read_context(struct fid_cq *fcq, void *buf, size_t count) +{ + return usdf_cq_read_common(fcq, buf, count, FI_CQ_FORMAT_CONTEXT); +} + +static ssize_t +usdf_cq_read_msg(struct fid_cq *fcq, void *buf, size_t count) +{ + return usdf_cq_read_common(fcq, buf, count, FI_CQ_FORMAT_MSG); +} + +static ssize_t 
+usdf_cq_read_data(struct fid_cq *fcq, void *buf, size_t count) +{ + return usdf_cq_read_common(fcq, buf, count, FI_CQ_FORMAT_DATA); +} + static ssize_t usdf_cq_readfrom_context(struct fid_cq *fcq, void *buf, size_t count, fi_addr_t *src_addr) @@ -151,7 +217,7 @@ usdf_cq_readfrom_context(struct fid_cq *fcq, void *buf, size_t count, if (cq->cq_comp.uc_status != 0) { return -FI_EAVAIL; } - ucq = to_cqi(cq->cq_cq); + ucq = to_cqi(cq->c.hard.cq_cq); ret = 0; entry = buf; @@ -160,7 +226,7 @@ usdf_cq_readfrom_context(struct fid_cq *fcq, void *buf, size_t count, cq_desc = (struct cq_desc *)((uint8_t *)ucq->ucq_desc_ring + (ucq->ucq_next_desc << 4)); - ret = usd_poll_cq(cq->cq_cq, &cq->cq_comp); + ret = usd_poll_cq(cq->c.hard.cq_cq, &cq->cq_comp); if (ret == -EAGAIN) { ret = 0; break; @@ -174,13 +240,13 @@ usdf_cq_readfrom_context(struct fid_cq *fcq, void *buf, size_t count, index = le16_to_cpu(cq_desc->completed_index) & CQ_DESC_COMP_NDX_MASK; ep = cq->cq_comp.uc_qp->uq_context; - hdr = ep->ep_hdr_ptr[index]; + hdr = ep->e.dg.ep_hdr_ptr[index]; memset(&sin, 0, sizeof(sin)); sin.sin_addr.s_addr = hdr->uh_ip.saddr; sin.sin_port = hdr->uh_udp.source; - ret = fi_av_insert(av_utof(ep->ep_av), &sin, 1, + ret = fi_av_insert(av_utof(ep->e.dg.ep_av), &sin, 1, src_addr, 0, NULL); if (ret != 1) { *src_addr = FI_ADDR_NOTAVAIL; @@ -201,12 +267,174 @@ usdf_cq_readfrom_context(struct fid_cq *fcq, void *buf, size_t count, } } -static ssize_t -usdf_cq_read_msg(struct fid_cq *fcq, void *buf, size_t count) +/***************************************************************** + * "soft" CQ support + *****************************************************************/ + +static inline void +usdf_progress_hard_cq(struct usdf_cq_hard *hcq, enum fi_cq_format format) +{ + int ret; + struct usd_completion comp; + void *entry; + size_t entry_size; + struct fi_cq_entry *ctx_entry; + struct fi_cq_msg_entry *msg_entry; + struct fi_cq_data_entry *data_entry; + struct usdf_cq *cq; + + cq = hcq->cqh_cq; + + do { + ret = usd_poll_cq(hcq->cqh_ucq, &comp); + if (ret == 0) { + entry = cq->c.soft.cq_head; + switch (format) { + case FI_CQ_FORMAT_CONTEXT: + entry_size = sizeof(*ctx_entry); + ctx_entry = (struct fi_cq_entry *)entry; + ctx_entry->op_context = cq->cq_comp.uc_context; + break; + case FI_CQ_FORMAT_MSG: + entry_size = sizeof(*msg_entry); + msg_entry = (struct fi_cq_msg_entry *)entry; + msg_entry->op_context = cq->cq_comp.uc_context; + msg_entry->flags = 0; + msg_entry->len = cq->cq_comp.uc_bytes; + break; + case FI_CQ_FORMAT_DATA: + entry_size = sizeof(*data_entry); + data_entry = (struct fi_cq_data_entry *)entry; + data_entry->op_context = cq->cq_comp.uc_context; + data_entry->flags = 0; + data_entry->len = cq->cq_comp.uc_bytes; + data_entry->buf = 0; /* XXX */ + data_entry->data = 0; + break; + default: + return; + } + + /* update with wrap */ + entry = (uint8_t *)entry + entry_size; + if (entry != cq->c.soft.cq_end) { + cq->c.soft.cq_head = entry; + } else { + cq->c.soft.cq_head = cq->c.soft.cq_comps; + } + } + } while (ret != -EAGAIN); +} + +void +usdf_progress_hard_cq_context(struct usdf_cq_hard *hcq) +{ + usdf_progress_hard_cq(hcq, FI_CQ_FORMAT_CONTEXT); +} + +void +usdf_progress_hard_cq_msg(struct usdf_cq_hard *hcq) +{ + usdf_progress_hard_cq(hcq, FI_CQ_FORMAT_MSG); +} + +void +usdf_progress_hard_cq_data(struct usdf_cq_hard *hcq) +{ + usdf_progress_hard_cq(hcq, FI_CQ_FORMAT_DATA); +} + +static inline void +usdf_cq_post_soft(struct usdf_cq_hard *hcq, void *context, size_t len, + enum fi_cq_format format) +{ + void *entry; 
+ size_t entry_size; + struct fi_cq_entry *ctx_entry; + struct fi_cq_msg_entry *msg_entry; + struct fi_cq_data_entry *data_entry; + struct usdf_cq *cq; + + cq = hcq->cqh_cq; + + entry = cq->c.soft.cq_head; + switch (format) { + case FI_CQ_FORMAT_CONTEXT: + entry_size = sizeof(*ctx_entry); + ctx_entry = (struct fi_cq_entry *)entry; + ctx_entry->op_context = context; + break; + case FI_CQ_FORMAT_MSG: + entry_size = sizeof(*msg_entry); + msg_entry = (struct fi_cq_msg_entry *)entry; + msg_entry->op_context = context; + msg_entry->flags = 0; + msg_entry->len = len; + break; + case FI_CQ_FORMAT_DATA: + entry_size = sizeof(*data_entry); + data_entry = (struct fi_cq_data_entry *)entry; + data_entry->op_context = context; + data_entry->flags = 0; + data_entry->len = len; + data_entry->buf = NULL; + data_entry->data = 0; + break; + default: + return; + } + + /* update with wrap */ + entry = (uint8_t *)entry + entry_size; + if (entry != cq->c.soft.cq_end) { + cq->c.soft.cq_head = entry; + } else { + cq->c.soft.cq_head = cq->c.soft.cq_comps; + } + +} + +void +usdf_cq_post_soft_context(struct usdf_cq_hard *hcq, void *context, size_t len) +{ + usdf_cq_post_soft(hcq, context, len, FI_CQ_FORMAT_CONTEXT); +} + +void +usdf_cq_post_soft_msg(struct usdf_cq_hard *hcq, void *context, size_t len) +{ + usdf_cq_post_soft(hcq, context, len, FI_CQ_FORMAT_MSG); +} + +void +usdf_cq_post_soft_data(struct usdf_cq_hard *hcq, void *context, size_t len) +{ + usdf_cq_post_soft(hcq, context, len, FI_CQ_FORMAT_DATA); +} + +ssize_t +usdf_cq_sread_soft(struct fid_cq *cq, void *buf, size_t count, const void *cond, + int timeout) +{ + return -FI_ENOSYS; +} + +/* + * poll a soft CQ + * This will loop over all the hard CQs within, collecting results. + * Since this routine is an inline and is always called with format as + * a constant, I am counting on the compiler optimizing away all the switches + * on format. + */ +static inline ssize_t +usdf_cq_read_common_soft(struct fid_cq *fcq, void *buf, size_t count, + enum fi_cq_format format) { struct usdf_cq *cq; - struct fi_cq_msg_entry *entry; - struct fi_cq_msg_entry *last; + uint8_t *entry; + uint8_t *last; + void *tail; + size_t entry_len; ssize_t ret; cq = cq_ftou(fcq); @@ -214,11 +442,94 @@ usdf_cq_read_msg(struct fid_cq *fcq, void *buf, size_t count) return -FI_EAVAIL; } + /* progress... */ + usdf_domain_progress(cq->cq_domain); + + switch (format) { + case FI_CQ_FORMAT_CONTEXT: + entry_len = sizeof(struct fi_cq_entry); + break; + case FI_CQ_FORMAT_MSG: + entry_len = sizeof(struct fi_cq_msg_entry); + break; + case FI_CQ_FORMAT_DATA: + entry_len = sizeof(struct fi_cq_data_entry); + break; + default: + return 0; + } + + ret = 0; + entry = buf; + last = entry + (entry_len * count); + tail = cq->c.soft.cq_tail; + + // XXX ... 
handle error comps + while (entry < last && tail != cq->c.soft.cq_head) { + memcpy(entry, tail, entry_len); + entry += entry_len; + + tail = (uint8_t *)tail + entry_len; + if (tail == cq->c.soft.cq_end) { + tail = cq->c.soft.cq_comps; + } + } + cq->c.soft.cq_tail = tail; + + if (entry > (uint8_t *)buf) { + return (entry - (uint8_t *)buf) / entry_len; + } else { + return ret; + } +} + +static ssize_t +usdf_cq_read_context_soft(struct fid_cq *fcq, void *buf, size_t count) +{ + return usdf_cq_read_common_soft(fcq, buf, count, FI_CQ_FORMAT_CONTEXT); +} + +static ssize_t +usdf_cq_read_msg_soft(struct fid_cq *fcq, void *buf, size_t count) +{ + return usdf_cq_read_common_soft(fcq, buf, count, FI_CQ_FORMAT_MSG); +} + +static ssize_t +usdf_cq_read_data_soft(struct fid_cq *fcq, void *buf, size_t count) +{ + return usdf_cq_read_common_soft(fcq, buf, count, FI_CQ_FORMAT_DATA); +} + +static ssize_t +usdf_cq_readfrom_context_soft(struct fid_cq *fcq, void *buf, size_t count, + fi_addr_t *src_addr) +{ + struct usdf_cq *cq; + struct usd_cq_impl *ucq; + struct fi_cq_entry *entry; + struct fi_cq_entry *last; + ssize_t ret; + struct cq_desc *cq_desc; + struct usdf_ep *ep; + struct sockaddr_in sin; + struct usd_udp_hdr *hdr; + uint16_t index; + + cq = cq_ftou(fcq); + if (cq->cq_comp.uc_status != 0) { + return -FI_EAVAIL; + } + ucq = to_cqi(cq->c.hard.cq_cq); + ret = 0; entry = buf; last = entry + count; while (entry < last) { - ret = usd_poll_cq(cq->cq_cq, &cq->cq_comp); + cq_desc = (struct cq_desc *)((uint8_t *)ucq->ucq_desc_ring + + (ucq->ucq_next_desc << 4)); + + ret = usd_poll_cq(cq->c.hard.cq_cq, &cq->cq_comp); if (ret == -EAGAIN) { ret = 0; break; @@ -228,62 +539,40 @@ usdf_cq_read_msg(struct fid_cq *fcq, void *buf, size_t count) break; } + if (cq->cq_comp.uc_type == USD_COMPTYPE_RECV) { + index = le16_to_cpu(cq_desc->completed_index) & + CQ_DESC_COMP_NDX_MASK; + ep = cq->cq_comp.uc_qp->uq_context; + hdr = ep->e.dg.ep_hdr_ptr[index]; + memset(&sin, 0, sizeof(sin)); + + sin.sin_addr.s_addr = hdr->uh_ip.saddr; + sin.sin_port = hdr->uh_udp.source; + + ret = fi_av_insert(av_utof(ep->e.dg.ep_av), &sin, 1, + src_addr, 0, NULL); + if (ret != 1) { + *src_addr = FI_ADDR_NOTAVAIL; + } + ++src_addr; + } + + entry->op_context = cq->cq_comp.uc_context; - entry->flags = 0; - entry->len = cq->cq_comp.uc_bytes; entry++; } - if (entry > (struct fi_cq_msg_entry *)buf) { - return entry - (struct fi_cq_msg_entry *)buf; + if (entry > (struct fi_cq_entry *)buf) { + return entry - (struct fi_cq_entry *)buf; } else { return ret; } } -static ssize_t -usdf_cq_read_data(struct fid_cq *fcq, void *buf, size_t count) -{ - struct usdf_cq *cq; - struct fi_cq_data_entry *entry; - struct fi_cq_data_entry *last; - ssize_t ret; - - cq = cq_ftou(fcq); - if (cq->cq_comp.uc_status != 0) { - return -FI_EAVAIL; - } - - ret = 0; - entry = buf; - last = entry + count; - while (entry < last) { - ret = usd_poll_cq(cq->cq_cq, &cq->cq_comp); - if (ret == -EAGAIN) { - ret = 0; - break; - } - if (cq->cq_comp.uc_status != 0) { - ret = -FI_EAVAIL; - break; - } - - entry->op_context = cq->cq_comp.uc_context; - entry->flags = 0; - entry->len = cq->cq_comp.uc_bytes; - entry->buf = 0; /* XXX */ - entry->data = 0; - - entry++; - } - - if (entry > (struct fi_cq_data_entry *)buf) { - return entry - (struct fi_cq_data_entry *)buf; - } else { - return ret; - } -} +/***************************************************************** + * common CQ support + *****************************************************************/ static const char * usdf_cq_strerror(struct 
fid_cq *eq, int prov_errno, const void *err_data, @@ -294,31 +583,6 @@ usdf_cq_strerror(struct fid_cq *eq, int prov_errno, const void *err_data, return buf; } -static struct fi_ops_cq usdf_cq_context_ops = { - .size = sizeof(struct fi_ops_cq), - .read = usdf_cq_read_context, - .sread = usdf_cq_sread, - .readfrom = usdf_cq_readfrom_context, - .readerr = usdf_cq_readerr, - .strerror = usdf_cq_strerror -}; - -static struct fi_ops_cq usdf_cq_msg_ops = { - .size = sizeof(struct fi_ops_cq), - .read = usdf_cq_read_msg, - .sread = usdf_cq_sread, - .readerr = usdf_cq_readerr, - .strerror = usdf_cq_strerror -}; - -static struct fi_ops_cq usdf_cq_data_ops = { - .size = sizeof(struct fi_ops_cq), - .read = usdf_cq_read_data, - .sread = usdf_cq_sread, - .readerr = usdf_cq_readerr, - .strerror = usdf_cq_strerror -}; - static int usdf_cq_control(fid_t fid, int command, void *arg) { @@ -329,13 +593,35 @@ static int usdf_cq_close(fid_t fid) { struct usdf_cq *cq; + struct usdf_cq_hard *hcq; int ret; cq = container_of(fid, struct usdf_cq, cq_fid.fid); - if (cq->cq_cq) { - ret = usd_destroy_cq(cq->cq_cq); - if (ret != 0) { - return ret; + if (atomic_get(&cq->cq_refcnt) > 0) { + return -FI_EBUSY; + } + + if (usdf_cq_is_soft(cq)) { + while (!TAILQ_EMPTY(&cq->c.soft.cq_list)) { + hcq = TAILQ_FIRST(&cq->c.soft.cq_list); + if (atomic_get(&hcq->cqh_refcnt) > 0) { + return -FI_EBUSY; + } + TAILQ_REMOVE(&cq->c.soft.cq_list, hcq, cqh_link); + if (hcq->cqh_ucq != NULL) { + ret = usd_destroy_cq(hcq->cqh_ucq); + if (ret != 0) { + return ret; + } + } + free(hcq); + } + } else { + if (cq->c.hard.cq_cq) { + ret = usd_destroy_cq(cq->c.hard.cq_cq); + if (ret != 0) { + return ret; + } } } @@ -343,21 +629,206 @@ usdf_cq_close(fid_t fid) return 0; } +static struct fi_ops_cq usdf_cq_context_ops = { + .size = sizeof(struct fi_ops_cq), + .read = usdf_cq_read_context, + .sread = usdf_cq_sread, + .readfrom = usdf_cq_readfrom_context, + .readerr = usdf_cq_readerr, + .strerror = usdf_cq_strerror +}; + +static struct fi_ops_cq usdf_cq_context_soft_ops = { + .size = sizeof(struct fi_ops_cq), + .read = usdf_cq_read_context_soft, + .sread = usdf_cq_sread_soft, + .readfrom = usdf_cq_readfrom_context_soft, + .readerr = usdf_cq_readerr, + .strerror = usdf_cq_strerror +}; + +static struct fi_ops_cq usdf_cq_msg_ops = { + .size = sizeof(struct fi_ops_cq), + .read = usdf_cq_read_msg, + .sread = usdf_cq_sread, + .readfrom = fi_no_cq_readfrom, /* XXX */ + .readerr = usdf_cq_readerr, + .strerror = usdf_cq_strerror +}; + +static struct fi_ops_cq usdf_cq_msg_soft_ops = { + .size = sizeof(struct fi_ops_cq), + .read = usdf_cq_read_msg_soft, + .sread = usdf_cq_sread, + .readfrom = fi_no_cq_readfrom, /* XXX */ + .readerr = usdf_cq_readerr, + .strerror = usdf_cq_strerror +}; + +static struct fi_ops_cq usdf_cq_data_ops = { + .size = sizeof(struct fi_ops_cq), + .read = usdf_cq_read_data, + .sread = usdf_cq_sread, + .readfrom = fi_no_cq_readfrom, /* XXX */ + .readerr = usdf_cq_readerr, + .strerror = usdf_cq_strerror +}; + +static struct fi_ops_cq usdf_cq_data_soft_ops = { + .size = sizeof(struct fi_ops_cq), + .read = usdf_cq_read_data_soft, + .sread = usdf_cq_sread, + .readfrom = fi_no_cq_readfrom, /* XXX */ + .readerr = usdf_cq_readerr, + .strerror = usdf_cq_strerror +}; + static struct fi_ops usdf_cq_fi_ops = { .size = sizeof(struct fi_ops), .close = usdf_cq_close, .control = usdf_cq_control, }; +/* + * Return true is this CQ is in "soft" (emulated) mode + */ +int +usdf_cq_is_soft(struct usdf_cq *cq) +{ + struct fi_ops_cq *soft_ops; + + switch 
(cq->cq_attr.format) { + case FI_CQ_FORMAT_CONTEXT: + soft_ops = &usdf_cq_context_soft_ops; + break; + case FI_CQ_FORMAT_MSG: + soft_ops = &usdf_cq_msg_soft_ops; + break; + case FI_CQ_FORMAT_DATA: + soft_ops = &usdf_cq_data_soft_ops; + break; + default: + return 0; + } + + return cq->cq_fid.ops == soft_ops; +} + +int +usdf_cq_make_soft(struct usdf_cq *cq) +{ + struct fi_ops_cq *hard_ops; + struct fi_ops_cq *soft_ops; + struct usdf_cq_hard *hcq; + struct usd_cq *ucq; + size_t comp_size; + void (*rtn)(struct usdf_cq_hard *hcq); + + switch (cq->cq_attr.format) { + case FI_CQ_FORMAT_CONTEXT: + hard_ops = &usdf_cq_context_ops; + soft_ops = &usdf_cq_context_soft_ops; + comp_size = sizeof(struct fi_cq_entry); + rtn = usdf_progress_hard_cq_context; + break; + case FI_CQ_FORMAT_MSG: + hard_ops = &usdf_cq_msg_ops; + soft_ops = &usdf_cq_msg_soft_ops; + comp_size = sizeof(struct fi_cq_msg_entry); + rtn = usdf_progress_hard_cq_msg; + break; + case FI_CQ_FORMAT_DATA: + hard_ops = &usdf_cq_data_ops; + soft_ops = &usdf_cq_data_soft_ops; + comp_size = sizeof(struct fi_cq_data_entry); + rtn = usdf_progress_hard_cq_data; + break; + default: + return 0; + } + + if (cq->cq_fid.ops == hard_ops) { + + /* save the CQ before we trash the union */ + ucq = cq->c.hard.cq_cq; + + /* fill in the soft part of union */ + TAILQ_INIT(&cq->c.soft.cq_list); + cq->c.soft.cq_comps = calloc(cq->cq_attr.size, comp_size); + if (cq->c.soft.cq_comps == NULL) { + return -FI_ENOMEM; + } + cq->c.soft.cq_end = (void *)((uintptr_t)cq->c.soft.cq_comps + + (cq->cq_attr.size * comp_size)); + cq->c.soft.cq_head = cq->c.soft.cq_comps; + cq->c.soft.cq_tail = cq->c.soft.cq_comps; + + /* need to add hard queue to list? */ + if (ucq != NULL) { + hcq = malloc(sizeof(*hcq)); + if (hcq == NULL) { + free(cq->c.soft.cq_comps); + cq->c.hard.cq_cq = ucq; /* restore */ + return -FI_ENOMEM; + } + + hcq->cqh_cq = cq; + hcq->cqh_ucq = ucq; + hcq->cqh_progress = rtn; + + atomic_init(&hcq->cqh_refcnt, + atomic_get(&cq->cq_refcnt)); + TAILQ_INSERT_HEAD(&cq->c.soft.cq_list, hcq, cqh_link); + } + + cq->cq_fid.ops = soft_ops; + } + return 0; +} + +static int +usdf_cq_process_attr(struct fi_cq_attr *attr, struct usdf_domain *udp) +{ + /* no wait object yet */ + if (attr->wait_obj != FI_WAIT_NONE) { + return -FI_ENOSYS; + } + + /* bound and default size */ + if (attr->size > udp->dom_fabric->fab_dev_attrs->uda_max_cqe) { + return -FI_EINVAL; + } + if (attr->size == 0) { + attr->size = udp->dom_fabric->fab_dev_attrs->uda_max_cqe; + } + + /* default format is FI_CQ_FORMAT_CONTEXT */ + if (attr->format == FI_CQ_FORMAT_UNSPEC) { + + attr->format = FI_CQ_FORMAT_CONTEXT; + } + return 0; +} + +int +usdf_cq_create_cq(struct usdf_cq *cq) +{ + return usd_create_cq(cq->cq_domain->dom_dev, cq->cq_attr.size, -1, + &cq->c.hard.cq_cq); +} + int usdf_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, struct fid_cq **cq_o, void *context) { struct usdf_cq *cq; + struct usdf_domain *udp; int ret; - if (attr->wait_obj != FI_WAIT_NONE) { - return -FI_ENOSYS; + udp = dom_ftou(domain); + ret = usdf_cq_process_attr(attr, udp); + if (ret != 0) { + return ret; } cq = calloc(1, sizeof(*cq)); @@ -365,13 +836,7 @@ usdf_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, return -FI_ENOMEM; } - cq->cq_domain = container_of(domain, struct usdf_domain, dom_fid); - - ret = usd_create_cq(cq->cq_domain->dom_dev, attr->size, -1, &cq->cq_cq); - if (ret != 0) { - goto fail; - } - + cq->cq_domain = udp; cq->cq_fid.fid.fclass = FI_CLASS_CQ; cq->cq_fid.fid.context = context; 
cq->cq_fid.fid.ops = &usdf_cq_fi_ops; @@ -391,13 +856,14 @@ usdf_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, goto fail; } + cq->cq_attr = *attr; *cq_o = &cq->cq_fid; return 0; fail: if (cq != NULL) { - if (cq->cq_cq != NULL) { - usd_destroy_cq(cq->cq_cq); + if (cq->c.hard.cq_cq != NULL) { + usd_destroy_cq(cq->c.hard.cq_cq); } free(cq); } diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_cq.h b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_cq.h new file mode 100644 index 0000000000..cf89dc44cb --- /dev/null +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_cq.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _USDF_CQ_H_ +#define _USDF_CQ_H_ + +int usdf_cq_is_soft(struct usdf_cq *cq); +int usdf_cq_make_soft(struct usdf_cq *cq); +int usdf_cq_create_cq(struct usdf_cq *cq); + +void usdf_progress_hard_cq_context(struct usdf_cq_hard *hcq); +void usdf_progress_hard_cq_msg(struct usdf_cq_hard *hcq); +void usdf_progress_hard_cq_data(struct usdf_cq_hard *hcq); + +void usdf_cq_post_soft_context(struct usdf_cq_hard *hcq, void *context, + size_t len); +void usdf_cq_post_soft_msg(struct usdf_cq_hard *hcq, void *context, + size_t len); +void usdf_cq_post_soft_data(struct usdf_cq_hard *hcq, void *context, + size_t len); + +#endif /* _USDF_CQ_H_ */ diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_dgram.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_dgram.c index fad9771e54..16c87a14df 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_dgram.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_dgram.c @@ -57,11 +57,12 @@ #include #include "fi.h" -#include "usnic_direct.h" #include "usd.h" #include "usd_post.h" + #include "usdf.h" #include "usdf_dgram.h" +#include "usdf_av.h" ssize_t usdf_dgram_recv(struct fid_ep *fep, void *buf, size_t len, @@ -73,11 +74,11 @@ usdf_dgram_recv(struct fid_ep *fep, void *buf, size_t len, uint32_t index; ep = ep_ftou(fep); - qp = to_qpi(ep->ep_qp); + qp = to_qpi(ep->e.dg.ep_qp); index = qp->uq_rq.urq_post_index; rxd.urd_context = context; - rxd.urd_iov[0].iov_base = (uint8_t *)ep->ep_hdr_buf + + rxd.urd_iov[0].iov_base = (uint8_t *)ep->e.dg.ep_hdr_buf + (index * USDF_HDR_BUF_ENTRY) + (USDF_HDR_BUF_ENTRY - sizeof(struct usd_udp_hdr)); rxd.urd_iov[0].iov_len = sizeof(struct usd_udp_hdr); @@ -86,16 +87,16 @@ usdf_dgram_recv(struct fid_ep *fep, void *buf, size_t len, rxd.urd_iov_cnt = 2; rxd.urd_next = NULL; - ep->ep_hdr_ptr[index] = rxd.urd_iov[0].iov_base; + ep->e.dg.ep_hdr_ptr[index] = rxd.urd_iov[0].iov_base; index = (index + 1) & qp->uq_rq.urq_post_index_mask; - ep->ep_hdr_ptr[index] = rxd.urd_iov[0].iov_base; + ep->e.dg.ep_hdr_ptr[index] = rxd.urd_iov[0].iov_base; - return usd_post_recv(ep->ep_qp, &rxd); + return usd_post_recv(ep->e.dg.ep_qp, &rxd); } ssize_t usdf_dgram_recvv(struct fid_ep *fep, const struct iovec *iov, void **desc, - size_t count, fi_addr_t src_addr, void *context) + size_t count, fi_addr_t src_addr, void *context) { struct usdf_ep *ep; struct usd_recv_desc rxd; @@ -104,10 +105,10 @@ usdf_dgram_recvv(struct fid_ep *fep, const struct iovec *iov, void **desc, int i; ep = ep_ftou(fep); - qp = to_qpi(ep->ep_qp); + qp = to_qpi(ep->e.dg.ep_qp); rxd.urd_context = context; - rxd.urd_iov[0].iov_base = ep->ep_hdr_buf + + rxd.urd_iov[0].iov_base = ep->e.dg.ep_hdr_buf + qp->uq_rq.urq_post_index * USDF_HDR_BUF_ENTRY; rxd.urd_iov[0].iov_len = sizeof(struct usd_udp_hdr); memcpy(&rxd.urd_iov[1], iov, sizeof(*iov) * count); @@ -116,23 +117,30 @@ usdf_dgram_recvv(struct fid_ep *fep, const struct iovec *iov, void **desc, index = qp->uq_rq.urq_post_index; for (i = 0; i < count; ++i) { - ep->ep_hdr_ptr[index] = rxd.urd_iov[0].iov_base; + ep->e.dg.ep_hdr_ptr[index] = rxd.urd_iov[0].iov_base; index = (index + 1) & qp->uq_rq.urq_post_index_mask; } - return usd_post_recv(ep->ep_qp, &rxd); + return usd_post_recv(ep->e.dg.ep_qp, &rxd); +} + +ssize_t +usdf_dgram_recvmsg(struct fid_ep *fep, const struct fi_msg *msg, uint64_t flags) +{ + return usdf_dgram_recvv(fep, msg->msg_iov, msg->desc, + msg->iov_count, (fi_addr_t)msg->addr, msg->context); } static inline ssize_t -_usdf_dgram_send(struct usdf_ep *ep, struct 
usd_dest *dest, +_usdf_dgram_send(struct usdf_ep *ep, struct usdf_dest *dest, const void *buf, size_t len, void *context) { if (len <= USD_SEND_MAX_COPY - sizeof(struct usd_udp_hdr)) { - return usd_post_send_one_copy(ep->ep_qp, dest, buf, len, - USD_SF_SIGNAL, context); + return usd_post_send_one_copy(ep->e.dg.ep_qp, + &dest->ds_dest, buf, len, USD_SF_SIGNAL, context); } else { - return usd_post_send_one(ep->ep_qp, dest, buf, len, - USD_SF_SIGNAL, context); + return usd_post_send_one(ep->e.dg.ep_qp, &dest->ds_dest, + buf, len, USD_SF_SIGNAL, context); } } @@ -141,57 +149,170 @@ usdf_dgram_send(struct fid_ep *fep, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, void *context) { struct usdf_ep *ep; - struct usd_dest *dest; + struct usdf_dest *dest; ep = ep_ftou(fep); - dest = (struct usd_dest *)(uintptr_t) dest_addr; + dest = (struct usdf_dest *)(uintptr_t) dest_addr; return _usdf_dgram_send(ep, dest, buf, len, context); } ssize_t -usdf_dgram_conn_send(struct fid_ep *fep, const void *buf, size_t len, - void *desc, fi_addr_t dest_addr, void *context) -{ - struct usdf_ep *ep; - - ep = ep_ftou(fep); - - return _usdf_dgram_send(ep, ep->ep_dest, buf, len, context); -} - -ssize_t -usdf_dgram_senddata(struct fid_ep *ep, const void *buf, size_t len, - void *desc, uint64_t data, fi_addr_t dest_addr, - void *context) +usdf_dgram_senddata(struct fid_ep *fep, const void *buf, size_t len, + void *desc, uint64_t data, fi_addr_t dest_addr, + void *context) { return -FI_ENOSYS; } +static ssize_t +_usdf_dgram_send_iov_copy(struct usdf_ep *ep, struct usd_dest *dest, + const struct iovec *iov, size_t count, void *context) +{ + struct usd_wq *wq; + struct usd_qp_impl *qp; + struct usd_udp_hdr *hdr; + uint32_t last_post; + struct usd_wq_post_info *info; + uint8_t *copybuf; + size_t len; + unsigned i; + + qp = to_qpi(ep->e.dg.ep_qp); + wq = &qp->uq_wq; + copybuf = wq->uwq_copybuf + + wq->uwq_post_index * USD_SEND_MAX_COPY; + + hdr = (struct usd_udp_hdr *)copybuf; + memcpy(hdr, &dest->ds_dest.ds_udp.u_hdr, sizeof(*hdr)); + hdr->uh_udp.source = + qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; + + len = sizeof(*hdr); + for (i = 0; i < count; i++) { + memcpy(copybuf + len, iov[i].iov_base, iov[i].iov_len); + len += iov[i].iov_len; + } + + /* adjust lengths */ + hdr->uh_ip.tot_len = htons(len - sizeof(struct ether_header)); + hdr->uh_udp.len = htons(len - sizeof(struct ether_header) - + sizeof(struct iphdr)); + + last_post = _usd_post_send_one(wq, hdr, len, 1); + + info = &wq->uwq_post_info[last_post]; + info->wp_context = context; + info->wp_len = len; + + return 0; +} + ssize_t -usdf_dgram_sendv(struct fid_ep *ep, const struct iovec *iov, void **desc, +usdf_dgram_sendv(struct fid_ep *fep, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, void *context) { - return -FI_ENOSYS; + struct usdf_ep *ep; + struct usd_dest *dest; + struct usd_wq *wq; + struct usd_qp_impl *qp; + struct usd_udp_hdr *hdr; + uint32_t last_post; + struct usd_wq_post_info *info; + uint8_t *copybuf; + size_t len; + struct iovec send_iov[USDF_DGRAM_MAX_SGE]; + int i; + + ep = ep_ftou(fep); + dest = (struct usd_dest *)(uintptr_t) dest_addr; + + len = 0; + for (i = 0; i < count; i++) { + len += iov[i].iov_len; + } + + if (len + sizeof(struct usd_udp_hdr) > USD_SEND_MAX_COPY) { + qp = to_qpi(ep->e.dg.ep_qp); + wq = &qp->uq_wq; + copybuf = wq->uwq_copybuf + + wq->uwq_post_index * USD_SEND_MAX_COPY; + hdr = (struct usd_udp_hdr *)copybuf; + memcpy(hdr, &dest->ds_dest.ds_udp.u_hdr, sizeof(*hdr)); + + /* 
adjust lengths and insert source port */ + hdr->uh_ip.tot_len = htons(len + sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header)); + hdr->uh_udp.len = htons((sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header) - + sizeof(struct iphdr)) + len); + hdr->uh_udp.source = + qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; + + send_iov[0].iov_base = hdr; + send_iov[0].iov_len = sizeof(*hdr); + memcpy(&send_iov[1], iov, sizeof(struct iovec) * count); + last_post = _usd_post_send_iov(wq, send_iov, count + 1, 1); + info = &wq->uwq_post_info[last_post]; + info->wp_context = context; + info->wp_len = len; + } else { + _usdf_dgram_send_iov_copy(ep, dest, iov, count, context); + } + return 0; } ssize_t -usdf_dgram_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, uint64_t flags) +usdf_dgram_sendmsg(struct fid_ep *fep, const struct fi_msg *msg, uint64_t flags) { - return -FI_ENOSYS; + return usdf_dgram_sendv(fep, msg->msg_iov, msg->desc, msg->iov_count, + (fi_addr_t)msg->addr, msg->context); } ssize_t -usdf_dgram_inject(struct fid_ep *ep, const void *buf, size_t len, +usdf_dgram_inject(struct fid_ep *fep, const void *buf, size_t len, fi_addr_t dest_addr) { - return -FI_ENOSYS; -} + struct usdf_ep *ep; + struct usdf_dest *dest; + struct usd_wq *wq; + struct usd_qp_impl *qp; + struct usd_udp_hdr *hdr; + uint32_t last_post; + struct usd_wq_post_info *info; + uint8_t *copybuf; -ssize_t -usdf_dgram_recvmsg(struct fid_ep *ep, const struct fi_msg *msg, uint64_t flags) -{ - return -FI_ENOSYS; + if (len + sizeof(struct usd_udp_hdr) > USD_SEND_MAX_COPY) { + return -FI_ENOSPC; + } + + ep = ep_ftou(fep); + dest = (struct usdf_dest *)(uintptr_t)dest_addr; + + qp = to_qpi(ep->e.dg.ep_qp); + wq = &qp->uq_wq; + copybuf = wq->uwq_copybuf + + wq->uwq_post_index * USD_SEND_MAX_COPY; + + hdr = (struct usd_udp_hdr *)copybuf; + memcpy(hdr, &dest->ds_dest.ds_dest.ds_udp.u_hdr, sizeof(*hdr)); + hdr->uh_udp.source = + qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; + hdr->uh_ip.tot_len = htons(len + sizeof(*hdr) + - sizeof(struct ether_header)); + hdr->uh_udp.len = htons(len + sizeof(*hdr) - + sizeof(struct ether_header) - + sizeof(struct iphdr)); + + memcpy(hdr + 1, buf, len); + + last_post = _usd_post_send_one(wq, hdr, len + sizeof(*hdr), 1); + + info = &wq->uwq_post_info[last_post]; + info->wp_context = NULL; + info->wp_len = len; + + return 0; } /* @@ -207,19 +328,20 @@ usdf_dgram_prefix_recv(struct fid_ep *fep, void *buf, size_t len, uint32_t index; ep = ep_ftou(fep); - qp = to_qpi(ep->ep_qp); + qp = to_qpi(ep->e.dg.ep_qp); index = qp->uq_rq.urq_post_index; rxd.urd_context = context; rxd.urd_iov[0].iov_base = (uint8_t *)buf + USDF_HDR_BUF_ENTRY - sizeof(struct usd_udp_hdr); - rxd.urd_iov[0].iov_len = len; + rxd.urd_iov[0].iov_len = len - + (USDF_HDR_BUF_ENTRY - sizeof(struct usd_udp_hdr)); rxd.urd_iov_cnt = 1; rxd.urd_next = NULL; - ep->ep_hdr_ptr[index] = rxd.urd_iov[0].iov_base; + ep->e.dg.ep_hdr_ptr[index] = rxd.urd_iov[0].iov_base; - return usd_post_recv(ep->ep_qp, &rxd); + return usd_post_recv(ep->e.dg.ep_qp, &rxd); } ssize_t @@ -233,61 +355,131 @@ usdf_dgram_prefix_recvv(struct fid_ep *fep, const struct iovec *iov, int i; ep = ep_ftou(fep); - qp = to_qpi(ep->ep_qp); + qp = to_qpi(ep->e.dg.ep_qp); rxd.urd_context = context; memcpy(&rxd.urd_iov[0], iov, sizeof(*iov) * count); rxd.urd_iov[0].iov_base = (uint8_t *)rxd.urd_iov[0].iov_base + USDF_HDR_BUF_ENTRY - sizeof(struct usd_udp_hdr); + rxd.urd_iov[0].iov_len -= (USDF_HDR_BUF_ENTRY - + sizeof(struct usd_udp_hdr)); rxd.urd_iov_cnt 
= count; rxd.urd_next = NULL; index = qp->uq_rq.urq_post_index; for (i = 0; i < count; ++i) { - ep->ep_hdr_ptr[index] = rxd.urd_iov[0].iov_base; + ep->e.dg.ep_hdr_ptr[index] = rxd.urd_iov[0].iov_base; index = (index + 1) & qp->uq_rq.urq_post_index_mask; } - return usd_post_recv(ep->ep_qp, &rxd); + return usd_post_recv(ep->e.dg.ep_qp, &rxd); +} + +ssize_t +usdf_dgram_prefix_recvmsg(struct fid_ep *fep, const struct fi_msg *msg, uint64_t flags) +{ + return usdf_dgram_recvv(fep, msg->msg_iov, msg->desc, + msg->iov_count, (fi_addr_t)msg->addr, msg->context); } ssize_t usdf_dgram_prefix_send(struct fid_ep *fep, const void *buf, size_t len, - void *desc, fi_addr_t dest_addr, void *context) + void *desc, fi_addr_t dest_addr, void *context) { - struct usdf_ep *ep; - struct usd_dest *dest; - struct usd_qp_impl *qp; - struct usd_udp_hdr *hdr; - struct usd_wq *wq; - uint32_t last_post; - struct usd_wq_post_info *info; + struct usdf_ep *ep; + struct usdf_dest *dest; + struct usd_qp_impl *qp; + struct usd_udp_hdr *hdr; + struct usd_wq *wq; + uint32_t last_post; + struct usd_wq_post_info *info; - ep = ep_ftou(fep); - dest = (struct usd_dest *)(uintptr_t)dest_addr; + ep = ep_ftou(fep); + dest = (struct usdf_dest *)(uintptr_t)dest_addr; - qp = to_qpi(ep->ep_qp); - wq = &qp->uq_wq; + qp = to_qpi(ep->e.dg.ep_qp); + wq = &qp->uq_wq; - hdr = (struct usd_udp_hdr *) buf - 1; - memcpy(hdr, &dest->ds_dest.ds_udp.u_hdr, sizeof(*hdr)); + hdr = (struct usd_udp_hdr *) buf - 1; + memcpy(hdr, &dest->ds_dest.ds_dest.ds_udp.u_hdr, sizeof(*hdr)); - /* adjust lengths and insert source port */ - hdr->uh_ip.tot_len = htons(len + sizeof(struct usd_udp_hdr) - - sizeof(struct ether_header)); - hdr->uh_udp.len = htons((sizeof(struct usd_udp_hdr) - - sizeof(struct ether_header) - - sizeof(struct iphdr)) + len); - hdr->uh_udp.source = - qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; + /* adjust lengths and insert source port */ + hdr->uh_ip.tot_len = htons(len + sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header)); + hdr->uh_udp.len = htons((sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header) - + sizeof(struct iphdr)) + len); + hdr->uh_udp.source = + qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; - last_post = _usd_post_send_one(wq, hdr, - len + sizeof(struct usd_udp_hdr), 1); + last_post = _usd_post_send_one(wq, hdr, + len + sizeof(struct usd_udp_hdr), 1); - info = &wq->uwq_post_info[last_post]; - info->wp_context = context; - info->wp_len = len; + info = &wq->uwq_post_info[last_post]; + info->wp_context = context; + info->wp_len = len; - return 0; + return 0; } + +ssize_t +usdf_dgram_prefix_sendv(struct fid_ep *fep, const struct iovec *iov, void **desc, + size_t count, fi_addr_t dest_addr, void *context) +{ + struct usdf_ep *ep; + struct usd_dest *dest; + struct usd_wq *wq; + struct usd_qp_impl *qp; + struct usd_udp_hdr *hdr; + uint32_t last_post; + struct usd_wq_post_info *info; + struct iovec send_iov[USDF_DGRAM_MAX_SGE]; + size_t len; + unsigned i; + + ep = ep_ftou(fep); + dest = (struct usd_dest *)(uintptr_t) dest_addr; + + len = 0; + for (i = 0; i < count; i++) { + len += iov[i].iov_len; + } + + if (len + sizeof(struct usd_udp_hdr) > USD_SEND_MAX_COPY) { + qp = to_qpi(ep->e.dg.ep_qp); + wq = &qp->uq_wq; + hdr = (struct usd_udp_hdr *) iov[0].iov_base - 1; + memcpy(hdr, &dest->ds_dest.ds_udp.u_hdr, sizeof(*hdr)); + + /* adjust lengths and insert source port */ + hdr->uh_ip.tot_len = htons(len + sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header)); + hdr->uh_udp.len = 
htons((sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header) - + sizeof(struct iphdr)) + len); + hdr->uh_udp.source = + qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; + + memcpy(send_iov, iov, sizeof(struct iovec) * count); + send_iov[0].iov_base = hdr; + send_iov[0].iov_len += sizeof(*hdr); + + last_post = _usd_post_send_iov(wq, send_iov, count, 1); + info = &wq->uwq_post_info[last_post]; + info->wp_context = context; + info->wp_len = len; + } else { + _usdf_dgram_send_iov_copy(ep, dest, iov, count, context); + } + return 0; +} + +ssize_t +usdf_dgram_prefix_sendmsg(struct fid_ep *fep, const struct fi_msg *msg, uint64_t flags) +{ + return usdf_dgram_prefix_sendv(fep, msg->msg_iov, msg->desc, msg->iov_count, + (fi_addr_t)msg->addr, msg->context); +} + + diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_dgram.h b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_dgram.h index 3549c5623f..edfab76afd 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_dgram.h +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_dgram.h @@ -36,10 +36,8 @@ #ifndef _USDF_DGRAM_H_ #define _USDF_DGRAM_H_ -/* fi_ops_cm for DGRAM */ -int usdf_cm_dgram_connect(struct fid_ep *ep, const void *addr, - const void *param, size_t paramlen); -int usdf_cm_dgram_shutdown(struct fid_ep *ep, uint64_t flags); +#define USDF_DGRAM_MAX_SGE 8 +#define USDF_DGRAM_DFLT_SGE 4 /* fi_ops_msg for DGRAM */ ssize_t usdf_dgram_recv(struct fid_ep *ep, void *buf, size_t len, void *desc, @@ -50,8 +48,6 @@ ssize_t usdf_dgram_recvmsg(struct fid_ep *ep, const struct fi_msg *msg, uint64_t flags); ssize_t usdf_dgram_send(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, void *context); -ssize_t usdf_dgram_conn_send(struct fid_ep *ep, const void *buf, size_t len, - void *desc, fi_addr_t dest_addr, void *context); ssize_t usdf_dgram_sendv(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, void *context); ssize_t usdf_dgram_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, @@ -65,7 +61,13 @@ ssize_t usdf_dgram_prefix_recv(struct fid_ep *ep, void *buf, size_t len, void *desc, fi_addr_t src_addr, void *context); ssize_t usdf_dgram_prefix_recvv(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, void *context); +ssize_t usdf_dgram_prefix_recvmsg(struct fid_ep *fep, const struct fi_msg *msg, + uint64_t flags); ssize_t usdf_dgram_prefix_send(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, void *context); +ssize_t usdf_dgram_prefix_sendv(struct fid_ep *fep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t dest_addr, void *context); +ssize_t usdf_dgram_prefix_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, + uint64_t flags); #endif /* _USDF_DGRAM_H_ */ diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_domain.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_domain.c index a8ae3867db..972a95e1c4 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_domain.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_domain.c @@ -55,6 +55,8 @@ #include "usnic_direct.h" #include "usdf.h" +#include "usdf_rdm.h" +#include "usdf_timer.h" static int usdf_domain_bind(struct fid *fid, struct fid *bfid, uint64_t flags) @@ -78,6 +80,81 @@ usdf_domain_bind(struct fid *fid, struct fid *bfid, uint64_t flags) return 0; } +static void +usdf_dom_rdc_free_data(struct usdf_domain *udp) +{ + struct 
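/*
 * A minimal sketch of the length arithmetic used by the usdf_dgram send
 * paths above: struct usd_udp_hdr evidently stacks an Ethernet, IPv4, and
 * UDP header back to back, so the IPv4 total length must exclude the
 * Ethernet header and the UDP length must additionally exclude the IPv4
 * header.  The struct and helper below are stand-ins using the Linux
 * header field names, not the provider's own definitions.
 */
#include <stddef.h>
#include <arpa/inet.h>
#include <net/ethernet.h>
#include <netinet/ip.h>
#include <netinet/udp.h>

struct example_udp_hdr {            /* stand-in for struct usd_udp_hdr */
	struct ether_header uh_eth;
	struct iphdr        uh_ip;
	struct udphdr       uh_udp;
} __attribute__((packed));

static void
example_set_lengths(struct example_udp_hdr *hdr, size_t payload_len)
{
	/* IPv4 total length: IPv4 header + UDP header + payload */
	hdr->uh_ip.tot_len = htons(payload_len + sizeof(*hdr) -
			sizeof(struct ether_header));
	/* UDP length: UDP header + payload only */
	hdr->uh_udp.len = htons(payload_len + sizeof(*hdr) -
			sizeof(struct ether_header) - sizeof(struct iphdr));
}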
usdf_rdm_connection *rdc; + int i; + + if (udp->dom_rdc_hashtab != NULL) { + + pthread_spin_lock(&udp->dom_progress_lock); + for (i = 0; i < USDF_RDM_HASH_SIZE; ++i) { + rdc = udp->dom_rdc_hashtab[i]; + while (rdc != NULL) { + usdf_timer_reset(udp->dom_fabric, + rdc->dc_timer, 0); + rdc = rdc->dc_hash_next; + } + } + pthread_spin_unlock(&udp->dom_progress_lock); + + /* XXX probably want a timeout here... */ + while (atomic_get(&udp->dom_rdc_free_cnt) < + udp->dom_rdc_total) { + pthread_yield(); + } + + free(udp->dom_rdc_hashtab); + udp->dom_rdc_hashtab = NULL; + } + + while (!SLIST_EMPTY(&udp->dom_rdc_free)) { + rdc = SLIST_FIRST(&udp->dom_rdc_free); + SLIST_REMOVE_HEAD(&udp->dom_rdc_free, dc_addr_link); + usdf_timer_free(udp->dom_fabric, rdc->dc_timer); + free(rdc); + } +} + +static int +usdf_dom_rdc_alloc_data(struct usdf_domain *udp) +{ + struct usdf_rdm_connection *rdc; + int ret; + int i; + + udp->dom_rdc_hashtab = calloc(USDF_RDM_HASH_SIZE, + sizeof(*udp->dom_rdc_hashtab)); + if (udp->dom_rdc_hashtab == NULL) { + return -FI_ENOMEM; + } + SLIST_INIT(&udp->dom_rdc_free); + atomic_init(&udp->dom_rdc_free_cnt, 0); + for (i = 0; i < USDF_RDM_FREE_BLOCK; ++i) { + rdc = calloc(1, sizeof(*rdc)); + if (rdc == NULL) { + return -FI_ENOMEM; + } + ret = usdf_timer_alloc(usdf_rdm_rdc_timeout, rdc, + &rdc->dc_timer); + if (ret != 0) { + free(rdc); + return ret; + } + rdc->dc_flags = USDF_DCS_UNCONNECTED | USDF_DCF_NEW_RX; + rdc->dc_next_rx_seq = 0; + rdc->dc_next_tx_seq = 0; + rdc->dc_last_rx_ack = rdc->dc_next_tx_seq - 1; + TAILQ_INIT(&rdc->dc_wqe_posted); + TAILQ_INIT(&rdc->dc_wqe_sent); + SLIST_INSERT_HEAD(&udp->dom_rdc_free, rdc, dc_addr_link); + atomic_inc(&udp->dom_rdc_free_cnt); + } + udp->dom_rdc_total = USDF_RDM_FREE_BLOCK; + return 0; +} + static int usdf_domain_close(fid_t fid) { @@ -95,11 +172,14 @@ usdf_domain_close(fid_t fid) return ret; } } + usdf_dom_rdc_free_data(udp); if (udp->dom_eq != NULL) { atomic_dec(&udp->dom_eq->eq_refcnt); } atomic_dec(&udp->dom_fabric->fab_refcnt); + LIST_REMOVE(udp, dom_link); + fi_freeinfo(udp->dom_info); free(udp); return 0; @@ -132,6 +212,8 @@ usdf_domain_open(struct fid_fabric *fabric, struct fi_info *info, struct usdf_domain *udp; struct usdf_usnic_info *dp; struct usdf_dev_entry *dep; + struct sockaddr_in *sin; + size_t addrlen; int d; int ret; @@ -143,6 +225,27 @@ usdf_domain_open(struct fid_fabric *fabric, struct fi_info *info, fp = fab_fidtou(fabric); + /* + * Make sure address format is good and matches this fabric + */ + switch (info->addr_format) { + case FI_SOCKADDR: + addrlen = sizeof(struct sockaddr); + break; + case FI_SOCKADDR_IN: + addrlen = sizeof(struct sockaddr_in); + break; + default: + ret = -FI_EINVAL; + goto fail; + } + sin = info->src_addr; + if (info->src_addrlen != addrlen || sin->sin_family != AF_INET || + sin->sin_addr.s_addr != fp->fab_dev_attrs->uda_ipaddr_be) { + ret = -FI_EINVAL; + goto fail; + } + /* steal cached device from info if we can */ dp = __usdf_devinfo; for (d = 0; d < dp->uu_num_devs; ++d) { @@ -169,7 +272,32 @@ usdf_domain_open(struct fid_fabric *fabric, struct fi_info *info, udp->dom_fid.ops = &usdf_domain_ops; udp->dom_fid.mr = &usdf_domain_mr_ops; + ret = pthread_spin_init(&udp->dom_progress_lock, + PTHREAD_PROCESS_PRIVATE); + if (ret != 0) { + ret = -ret; + goto fail; + } + TAILQ_INIT(&udp->dom_tx_ready); + TAILQ_INIT(&udp->dom_hcq_list); + + udp->dom_info = fi_dupinfo(info); + if (udp->dom_info == NULL) { + ret = -FI_ENOMEM; + goto fail; + } + if (udp->dom_info->dest_addr != NULL) { + 
free(udp->dom_info->dest_addr); + udp->dom_info->dest_addr = NULL; + } + + ret = usdf_dom_rdc_alloc_data(udp); + if (ret != 0) { + goto fail; + } + udp->dom_fabric = fp; + LIST_INSERT_HEAD(&fp->fab_domain_list, udp, dom_link); atomic_init(&udp->dom_refcnt, 0); atomic_inc(&fp->fab_refcnt); @@ -178,6 +306,13 @@ usdf_domain_open(struct fid_fabric *fabric, struct fi_info *info, fail: if (udp != NULL) { + if (udp->dom_info != NULL) { + fi_freeinfo(udp->dom_info); + } + if (udp->dom_dev != NULL) { + usd_close(udp->dom_dev); + } + usdf_dom_rdc_free_data(udp); free(udp); } return ret; diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_endpoint.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_endpoint.c index 8a0846795f..8afcfafc2b 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_endpoint.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_endpoint.c @@ -58,90 +58,8 @@ #include "fi.h" #include "fi_enosys.h" -#include "usnic_direct.h" -#include "usd.h" #include "usdf.h" -#include "usdf_av.h" #include "usdf_endpoint.h" -#include "usdf_progress.h" - -static int -usdf_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) -{ - struct usdf_ep *ep; - - ep = ep_fidtou(fid); - - switch (bfid->fclass) { - - case FI_CLASS_AV: - if (ep->ep_av != NULL) { - return -FI_EINVAL; - } - ep->ep_av = av_fidtou(bfid); - break; - - case FI_CLASS_CQ: - if (flags & FI_SEND) { - if (ep->ep_wcq != NULL) { - return -FI_EINVAL; - } - ep->ep_wcq = cq_fidtou(bfid); - } - - if (flags & FI_RECV) { - if (ep->ep_rcq != NULL) { - return -FI_EINVAL; - } - ep->ep_rcq = cq_fidtou(bfid); - } - break; - - case FI_CLASS_EQ: -printf("bind EQ to ep!\n"); - if (ep->ep_eq != NULL) { - return -FI_EINVAL; - } - ep->ep_eq = eq_fidtou(bfid); - atomic_inc(&ep->ep_eq->eq_refcnt); - break; - default: - return -FI_EINVAL; - } - - return 0; -} - -static int -usdf_ep_close(fid_t fid) -{ - struct usdf_ep *ep; - - ep = ep_fidtou(fid); - - if (atomic_get(&ep->ep_refcnt) > 0) { - return -FI_EBUSY; - } - - if (ep->ep_qp != NULL) { - usd_destroy_qp(ep->ep_qp); - } - atomic_dec(&ep->ep_domain->dom_refcnt); - if (ep->ep_eq != NULL) { - atomic_dec(&ep->ep_eq->eq_refcnt); - } - - free(ep); - return 0; -} - -struct fi_ops usdf_ep_ops = { - .size = sizeof(struct fi_ops), - .close = usdf_ep_close, - .bind = usdf_ep_bind, - .control = fi_no_control, - .ops_open = fi_no_ops_open -}; int usdf_ep_port_bind(struct usdf_ep *ep, struct fi_info *info) @@ -151,13 +69,13 @@ usdf_ep_port_bind(struct usdf_ep *ep, struct fi_info *info) int ret; sin = (struct sockaddr_in *)info->src_addr; - ret = bind(ep->ep_sock, (struct sockaddr *)sin, sizeof(*sin)); + ret = bind(ep->e.dg.ep_sock, (struct sockaddr *)sin, sizeof(*sin)); if (ret == -1) { return -errno; } addrlen = sizeof(*sin); - ret = getsockname(ep->ep_sock, (struct sockaddr *)sin, &addrlen); + ret = getsockname(ep->e.dg.ep_sock, (struct sockaddr *)sin, &addrlen); if (ret == -1) { return -errno; } @@ -174,6 +92,8 @@ usdf_endpoint_open(struct fid_domain *domain, struct fi_info *info, return usdf_ep_dgram_open(domain, info, ep_o, context); case FI_EP_MSG: return usdf_ep_msg_open(domain, info, ep_o, context); + case FI_EP_RDM: + return usdf_ep_rdm_open(domain, info, ep_o, context); default: return -FI_ENODEV; } diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_endpoint.h b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_endpoint.h index 4418ed957c..cd911a87e9 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_endpoint.h 
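/*
 * usdf_dom_rdc_alloc_data() above pre-allocates a fixed block of RDM
 * connection objects onto an SLIST free list and counts how many are free,
 * so usdf_dom_rdc_free_data() can wait for every object to be returned
 * before tearing the domain down.  A stripped-down sketch of that pool
 * pattern; the example_* names are placeholders and a plain counter stands
 * in for the atomic used in the patch.
 */
#include <stdlib.h>
#include <sys/queue.h>

struct example_item {
	SLIST_ENTRY(example_item) link;
};

struct example_pool {
	SLIST_HEAD(, example_item) free_list;
	unsigned int free_cnt;
	unsigned int total;
};

/* carve out "count" items up front; teardown waits for free_cnt == total */
static int
example_pool_init(struct example_pool *pool, unsigned int count)
{
	struct example_item *item;
	unsigned int i;

	SLIST_INIT(&pool->free_list);
	pool->free_cnt = 0;
	pool->total = count;
	for (i = 0; i < count; ++i) {
		item = calloc(1, sizeof(*item));
		if (item == NULL)
			return -1;
		SLIST_INSERT_HEAD(&pool->free_list, item, link);
		++pool->free_cnt;
	}
	return 0;
}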
+++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_endpoint.h @@ -41,6 +41,10 @@ int usdf_ep_dgram_open(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context); int usdf_ep_msg_open(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context); +int usdf_ep_rdm_open(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context); +int usdf_ep_msg_get_queues(struct usdf_ep *ep); +void usdf_ep_msg_release_queues(struct usdf_ep *ep); extern struct fi_ops usdf_ep_ops; diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_dgram.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_dgram.c index 2f55248d8f..148114ccb8 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_dgram.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_dgram.c @@ -63,9 +63,11 @@ #include "usdf.h" #include "usdf_endpoint.h" #include "usdf_dgram.h" +#include "usdf_av.h" +#include "usdf_cq.h" static int -usdf_dgram_ep_enable(struct fid_ep *fep) +usdf_ep_dgram_enable(struct fid_ep *fep) { struct usdf_ep *ep; struct usd_filter filt; @@ -75,18 +77,18 @@ usdf_dgram_ep_enable(struct fid_ep *fep) ep = ep_ftou(fep); filt.uf_type = USD_FTY_UDP_SOCK; - filt.uf_filter.uf_udp_sock.u_sock = ep->ep_sock; + filt.uf_filter.uf_udp_sock.u_sock = ep->e.dg.ep_sock; if (ep->ep_caps & USDF_EP_CAP_PIO) { ret = usd_create_qp(ep->ep_domain->dom_dev, USD_QTR_UDP, USD_QTY_PIO, - ep->ep_wcq->cq_cq, - ep->ep_rcq->cq_cq, + ep->e.dg.ep_wcq->c.hard.cq_cq, + ep->e.dg.ep_rcq->c.hard.cq_cq, 127, // XXX 127, // XXX &filt, - &ep->ep_qp); + &ep->e.dg.ep_qp); } else { ret = -EAGAIN; } @@ -95,33 +97,33 @@ usdf_dgram_ep_enable(struct fid_ep *fep) ret = usd_create_qp(ep->ep_domain->dom_dev, USD_QTR_UDP, USD_QTY_NORMAL, - ep->ep_wcq->cq_cq, - ep->ep_rcq->cq_cq, + ep->e.dg.ep_wcq->c.hard.cq_cq, + ep->e.dg.ep_rcq->c.hard.cq_cq, ep->ep_wqe, ep->ep_rqe, &filt, - &ep->ep_qp); + &ep->e.dg.ep_qp); } if (ret != 0) { goto fail; } - ep->ep_qp->uq_context = ep; + ep->e.dg.ep_qp->uq_context = ep; /* * Allocate a memory region big enough to hold a header for each - * RQ entry + * RQ entry */ - uqp = to_qpi(ep->ep_qp); - ep->ep_hdr_ptr = calloc(uqp->uq_rq.urq_num_entries, - sizeof(ep->ep_hdr_ptr[0])); - if (ep->ep_hdr_ptr == NULL) { + uqp = to_qpi(ep->e.dg.ep_qp); + ep->e.dg.ep_hdr_ptr = calloc(uqp->uq_rq.urq_num_entries, + sizeof(ep->e.dg.ep_hdr_ptr[0])); + if (ep->e.dg.ep_hdr_ptr == NULL) { ret = -FI_ENOMEM; goto fail; } ret = usd_alloc_mr(ep->ep_domain->dom_dev, - usd_get_recv_credits(ep->ep_qp) * USDF_HDR_BUF_ENTRY, - &ep->ep_hdr_buf); + usd_get_recv_credits(ep->e.dg.ep_qp) * USDF_HDR_BUF_ENTRY, + &ep->e.dg.ep_hdr_buf); if (ret != 0) { goto fail; } @@ -129,18 +131,143 @@ usdf_dgram_ep_enable(struct fid_ep *fep) return 0; fail: - if (ep->ep_hdr_ptr != NULL) { - free(ep->ep_hdr_ptr); + if (ep->e.dg.ep_hdr_ptr != NULL) { + free(ep->e.dg.ep_hdr_ptr); } - if (ep->ep_qp != NULL) { - usd_destroy_qp(ep->ep_qp); + if (ep->e.dg.ep_qp != NULL) { + usd_destroy_qp(ep->e.dg.ep_qp); } return ret; } +static int +usdf_ep_dgram_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + struct usdf_ep *ep; + struct usdf_cq *cq; + int ret; + + ep = ep_fidtou(fid); + + switch (bfid->fclass) { + + case FI_CLASS_AV: + if (ep->e.dg.ep_av != NULL) { + return -FI_EINVAL; + } + ep->e.dg.ep_av = av_fidtou(bfid); + break; + + case FI_CLASS_CQ: + cq = cq_fidtou(bfid); + + /* actually, could look through CQ list for a hard + * CQ with function usd_poll_cq() and 
use that... XXX + */ + if (usdf_cq_is_soft(cq)) { + return -FI_EINVAL; + } + if (cq->c.hard.cq_cq == NULL) { + ret = usdf_cq_create_cq(cq); + if (ret != 0) { + return ret; + } + } + + if (flags & FI_SEND) { + if (ep->e.dg.ep_wcq != NULL) { + return -FI_EINVAL; + } + ep->e.dg.ep_wcq = cq; + atomic_inc(&cq->cq_refcnt); + } + + if (flags & FI_RECV) { + if (ep->e.dg.ep_rcq != NULL) { + return -FI_EINVAL; + } + ep->e.dg.ep_rcq = cq; + atomic_inc(&cq->cq_refcnt); + } + break; + + case FI_CLASS_EQ: + if (ep->ep_eq != NULL) { + return -FI_EINVAL; + } + ep->ep_eq = eq_fidtou(bfid); + atomic_inc(&ep->ep_eq->eq_refcnt); + break; + default: + return -FI_EINVAL; + } + + return 0; +} + +static void +usdf_ep_dgram_deref_cq(struct usdf_cq *cq) +{ + struct usdf_cq_hard *hcq; + void (*rtn)(struct usdf_cq_hard *hcq); + + if (cq == NULL) { + return; + } + atomic_dec(&cq->cq_refcnt); + + switch (cq->cq_attr.format) { + case FI_CQ_FORMAT_CONTEXT: + rtn = usdf_progress_hard_cq_context; + break; + case FI_CQ_FORMAT_MSG: + rtn = usdf_progress_hard_cq_msg; + break; + case FI_CQ_FORMAT_DATA: + rtn = usdf_progress_hard_cq_data; + break; + default: + return; + } + + if (usdf_cq_is_soft(cq)) { + TAILQ_FOREACH(hcq, &cq->c.soft.cq_list, cqh_link) { + if (hcq->cqh_progress == rtn) { + atomic_dec(&hcq->cqh_refcnt); + return; + } + } + } +} + +static int +usdf_ep_dgram_close(fid_t fid) +{ + struct usdf_ep *ep; + + ep = ep_fidtou(fid); + + if (atomic_get(&ep->ep_refcnt) > 0) { + return -FI_EBUSY; + } + + if (ep->e.dg.ep_qp != NULL) { + usd_destroy_qp(ep->e.dg.ep_qp); + } + atomic_dec(&ep->ep_domain->dom_refcnt); + if (ep->ep_eq != NULL) { + atomic_dec(&ep->ep_eq->eq_refcnt); + } + usdf_ep_dgram_deref_cq(ep->e.dg.ep_wcq); + usdf_ep_dgram_deref_cq(ep->e.dg.ep_rcq); + + free(ep); + return 0; +} + static struct fi_ops_ep usdf_base_dgram_ops = { .size = sizeof(struct fi_ops_ep), - .enable = usdf_dgram_ep_enable, + .enable = usdf_ep_dgram_enable, .cancel = fi_no_cancel, .getopt = fi_no_getopt, .setopt = fi_no_setopt, @@ -165,10 +292,10 @@ static struct fi_ops_msg usdf_dgram_prefix_ops = { .size = sizeof(struct fi_ops_msg), .recv = usdf_dgram_prefix_recv, .recvv = usdf_dgram_prefix_recvv, - .recvmsg = usdf_dgram_recvmsg, - .send = usdf_dgram_send, - .sendv = usdf_dgram_sendv, - .sendmsg = usdf_dgram_sendmsg, + .recvmsg = usdf_dgram_prefix_recvmsg, + .send = usdf_dgram_prefix_send, + .sendv = usdf_dgram_prefix_sendv, + .sendmsg = usdf_dgram_prefix_sendmsg, .inject = usdf_dgram_inject, .senddata = usdf_dgram_senddata, .injectdata = fi_no_msg_injectdata, @@ -176,8 +303,16 @@ static struct fi_ops_msg usdf_dgram_prefix_ops = { static struct fi_ops_cm usdf_cm_dgram_ops = { .size = sizeof(struct fi_ops_cm), - .connect = usdf_cm_dgram_connect, - .shutdown = usdf_cm_dgram_shutdown, + .connect = fi_no_connect, + .shutdown = fi_no_shutdown, +}; + +static struct fi_ops usdf_ep_dgram_ops = { + .size = sizeof(struct fi_ops), + .close = usdf_ep_dgram_close, + .bind = usdf_ep_dgram_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open }; int @@ -199,8 +334,8 @@ usdf_ep_dgram_open(struct fid_domain *domain, struct fi_info *info, return -FI_ENOMEM; } - ep->ep_sock = socket(AF_INET, SOCK_DGRAM, 0); - if (ep->ep_sock == -1) { + ep->e.dg.ep_sock = socket(AF_INET, SOCK_DGRAM, 0); + if (ep->e.dg.ep_sock == -1) { ret = -errno; goto fail; } @@ -216,7 +351,7 @@ usdf_ep_dgram_open(struct fid_domain *domain, struct fi_info *info, ep->ep_fid.fid.fclass = FI_CLASS_EP; ep->ep_fid.fid.context = context; - ep->ep_fid.fid.ops = &usdf_ep_ops; + 
ep->ep_fid.fid.ops = &usdf_ep_dgram_ops; ep->ep_fid.ops = &usdf_base_dgram_ops; ep->ep_fid.cm = &usdf_cm_dgram_ops; ep->ep_domain = udp; @@ -225,12 +360,14 @@ usdf_ep_dgram_open(struct fid_domain *domain, struct fi_info *info, if (info->tx_attr != NULL && info->tx_attr->size != 0) { ep->ep_wqe = info->tx_attr->size; } else { - ep->ep_wqe = udp->dom_dev_attrs.uda_max_send_credits; + ep->ep_wqe = + udp->dom_fabric->fab_dev_attrs->uda_max_send_credits; } if (info->rx_attr != NULL && info->rx_attr->size != 0) { ep->ep_rqe = info->rx_attr->size; } else { - ep->ep_rqe = udp->dom_dev_attrs.uda_max_recv_credits; + ep->ep_rqe = + udp->dom_fabric->fab_dev_attrs->uda_max_recv_credits; } if (ep->ep_mode & FI_MSG_PREFIX) { @@ -252,8 +389,8 @@ usdf_ep_dgram_open(struct fid_domain *domain, struct fi_info *info, fail: if (ep != NULL) { - if (ep->ep_sock != -1) { - close(ep->ep_sock); + if (ep->e.dg.ep_sock != -1) { + close(ep->e.dg.ep_sock); } free(ep); } diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_msg.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_msg.c index 672ebf30c2..59f011c70e 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_msg.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_msg.c @@ -62,10 +62,221 @@ #include "usd.h" #include "usdf.h" #include "usdf_endpoint.h" +#include "usdf_rudp.h" #include "usdf_msg.h" +#include "usdf_cq.h" +#include "usdf_timer.h" static int -usdf_msg_ep_getopt(fid_t fid, int level, int optname, +usdf_tx_msg_enable(struct usdf_tx *tx) +{ + struct usdf_msg_qe *wqe; + struct usdf_domain *udp; + struct usdf_cq_hard *hcq; + struct usd_filter filt; + int ret; + int i; + + udp = tx->tx_domain; + + hcq = tx->t.msg.tx_hcq; + if (hcq == NULL) { + return -FI_ENOCQ; + } + + /* XXX temp until we can allocate WQ and RQ independently */ + filt.uf_type = USD_FTY_UDP; + filt.uf_filter.uf_udp.u_port = 0; + ret = usd_create_qp(udp->dom_dev, + USD_QTR_UDP, + USD_QTY_NORMAL, + hcq->cqh_ucq, + hcq->cqh_ucq, + udp->dom_fabric->fab_dev_attrs->uda_max_send_credits, + udp->dom_fabric->fab_dev_attrs->uda_max_recv_credits, + &filt, + &tx->tx_qp); + if (ret != 0) { + goto fail; + } + tx->tx_qp->uq_context = tx; + + /* msg send queue */ + tx->t.msg.tx_wqe_buf = malloc(tx->tx_attr.size * + sizeof(struct usdf_msg_qe)); + if (tx->t.msg.tx_wqe_buf == NULL) { + ret = -errno; + goto fail; + } + + /* populate free list */ + TAILQ_INIT(&tx->t.msg.tx_free_wqe); + wqe = tx->t.msg.tx_wqe_buf; + for (i = 0; i < tx->tx_attr.size; ++i) { + TAILQ_INSERT_TAIL(&tx->t.msg.tx_free_wqe, wqe, ms_link); + ++wqe; + } + + return 0; + +fail: + if (tx->t.msg.tx_wqe_buf != NULL) { + free(tx->t.msg.tx_wqe_buf); + tx->t.msg.tx_wqe_buf = NULL; + TAILQ_INIT(&tx->t.msg.tx_free_wqe); + } + if (tx->tx_qp != NULL) { + usd_destroy_qp(tx->tx_qp); + } + return ret; +} + +static int +usdf_rx_msg_enable(struct usdf_rx *rx) +{ + struct usdf_domain *udp; + struct usdf_cq_hard *hcq; + struct usdf_msg_qe *rqe; + struct usd_filter filt; + struct usd_qp_impl *qp; + uint8_t *ptr; + size_t mtu; + int ret; + int i; + + udp = rx->rx_domain; + + hcq = rx->r.msg.rx_hcq; + if (hcq == NULL) { + return -FI_ENOCQ; + } + + /* XXX temp until we can allocate WQ and RQ independently */ + filt.uf_type = USD_FTY_UDP; + filt.uf_filter.uf_udp.u_port = 0; + ret = usd_create_qp(udp->dom_dev, + USD_QTR_UDP, + USD_QTY_NORMAL, + hcq->cqh_ucq, + hcq->cqh_ucq, + udp->dom_fabric->fab_dev_attrs->uda_max_send_credits, + udp->dom_fabric->fab_dev_attrs->uda_max_recv_credits, + &filt, + 
&rx->rx_qp); + if (ret != 0) { + goto fail; + } + rx->rx_qp->uq_context = rx; + qp = to_qpi(rx->rx_qp); + + /* receive buffers */ + mtu = rx->rx_domain->dom_fabric->fab_dev_attrs->uda_mtu; + ret = usd_alloc_mr(rx->rx_domain->dom_dev, + qp->uq_rq.urq_num_entries * mtu, + (void **)&rx->r.msg.rx_bufs); + if (ret != 0) { + goto fail; + } + + /* post all the buffers */ + ptr = rx->r.msg.rx_bufs; + for (i = 0; i < qp->uq_rq.urq_num_entries - 1; ++i) { + usdf_msg_post_recv(rx, ptr, mtu); + ptr += mtu; + } + + /* msg recv queue */ + rx->r.msg.rx_rqe_buf = malloc(rx->rx_attr.size * + sizeof(struct usdf_msg_qe)); + if (rx->r.msg.rx_rqe_buf == NULL) { + ret = -errno; + goto fail; + } + + /* populate free list */ + TAILQ_INIT(&rx->r.msg.rx_free_rqe); + rqe = rx->r.msg.rx_rqe_buf; + for (i = 0; i < rx->rx_attr.size; ++i) { + TAILQ_INSERT_TAIL(&rx->r.msg.rx_free_rqe, rqe, ms_link); + ++rqe; + } + + return 0; + +fail: + if (rx->r.msg.rx_rqe_buf != NULL) { + free(rx->r.msg.rx_rqe_buf); + rx->r.msg.rx_rqe_buf = NULL; + TAILQ_INIT(&rx->r.msg.rx_free_rqe); + } + if (rx->r.msg.rx_bufs != NULL) { + usd_free_mr(rx->r.msg.rx_bufs); + rx->r.msg.rx_bufs = NULL; + } + if (rx->rx_qp != NULL) { + usd_destroy_qp(rx->rx_qp); + } + return ret; +} + +/* + * release queue resources + */ +void +usdf_ep_msg_release_queues(struct usdf_ep *ep) +{ + /* XXX */ +} + +/* + * Allocate any missing queue resources for this endpoint + */ +int +usdf_ep_msg_get_queues(struct usdf_ep *ep) +{ + struct usdf_tx *tx; + struct usdf_rx *rx; + int ret; + + /* Must have TX context at this point */ + tx = ep->ep_tx; + if (tx == NULL) { + ret = -FI_EINVAL; + goto fail; + } + if (tx->tx_qp == NULL) { + ret = usdf_tx_msg_enable(tx); + if (ret != 0) { + goto fail; + } + } + + /* Must have RX context at this point */ + rx = ep->ep_rx; + if (rx == NULL) { + ret = -FI_EINVAL; + goto fail; + } + if (rx->rx_qp == NULL) { + ret = usdf_rx_msg_enable(rx); + if (ret != 0) { + goto fail; + } + } + + return 0; +fail: + return ret; +} + +static int +usdf_ep_msg_enable(struct fid_ep *fep) +{ + return usdf_ep_msg_get_queues(ep_ftou(fep)); +} + +static int +usdf_ep_msg_getopt(fid_t fid, int level, int optname, void *optval, size_t *optlen) { struct usdf_ep *ep; @@ -82,7 +293,7 @@ usdf_msg_ep_getopt(fid_t fid, int level, int optname, } static int -usdf_msg_ep_setopt(fid_t fid, int level, int optname, +usdf_ep_msg_setopt(fid_t fid, int level, int optname, const void *optval, size_t optlen) { struct usdf_ep *ep; @@ -98,84 +309,291 @@ usdf_msg_ep_setopt(fid_t fid, int level, int optname, return 0; } -static int -usdf_msg_ep_enable(struct fid_ep *fep) +static ssize_t +usdf_ep_msg_cancel(fid_t fid, void *context) { - struct usdf_ep *ep; - struct usd_filter filt; - struct usd_qp_impl *uqp; + return 0; +} + +int +usdf_msg_fill_tx_attr(struct fi_tx_attr *txattr) +{ + if (txattr->size > USDF_MSG_MAX_CTX_SIZE || + txattr->iov_limit > USDF_MSG_MAX_SGE) { + return -FI_ENODATA; + } + + if (txattr->size == 0) { + txattr->size = USDF_MSG_DFLT_CTX_SIZE; + } + if (txattr->iov_limit == 0) { + txattr->iov_limit = USDF_MSG_DFLT_SGE; + } + return 0; +} + +int +usdf_msg_fill_rx_attr(struct fi_rx_attr *rxattr) +{ + if (rxattr->size > USDF_MSG_MAX_CTX_SIZE || + rxattr->iov_limit > USDF_MSG_MAX_SGE) { + return -FI_ENODATA; + } + + if (rxattr->size == 0) { + rxattr->size = USDF_MSG_DFLT_CTX_SIZE; + } + if (rxattr->iov_limit == 0) { + rxattr->iov_limit = USDF_MSG_DFLT_SGE; + } + return 0; +} + +/* + * Find a hard CQ within this soft CQ that services message EPs + */ +static struct 
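/*
 * usdf_rx_msg_enable() above registers one memory region of
 * num_entries * MTU bytes and posts it to the receive queue one MTU-sized
 * slice at a time.  A simplified sketch of that slicing loop, with a
 * hypothetical post_recv callback standing in for usdf_msg_post_recv().
 */
#include <stddef.h>
#include <stdint.h>

static void
example_post_all(uint8_t *bufs, size_t mtu, unsigned int num_entries,
		void (*post_recv)(void *buf, size_t len))
{
	uint8_t *ptr = bufs;
	unsigned int i;

	/* the patch posts num_entries - 1 slices, leaving one RQ slot unused */
	for (i = 0; i < num_entries - 1; ++i) {
		post_recv(ptr, mtu);
		ptr += mtu;
	}
}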
usdf_cq_hard * +usdf_ep_msg_find_cqh(struct usdf_cq *cq) +{ + struct usdf_cq_hard *hcq; + + TAILQ_FOREACH(hcq, &cq->c.soft.cq_list, cqh_link) { + if (hcq->cqh_progress == usdf_msg_hcq_progress) { + return hcq; + } + } + return NULL; +} + +static int +usdf_ep_msg_bind_cq(struct usdf_ep *ep, struct usdf_cq *cq, uint64_t flags) +{ + struct usdf_cq_hard **hcqp; + struct usdf_cq_hard *hcq; int ret; - ep = ep_ftou(fep); - - filt.uf_type = USD_FTY_UDP_SOCK; - filt.uf_filter.uf_udp_sock.u_sock = ep->ep_sock; - - ret = usd_create_qp(ep->ep_domain->dom_dev, - USD_QTR_UDP, - USD_QTY_NORMAL, - ep->ep_wcq->cq_cq, - ep->ep_rcq->cq_cq, - ep->ep_wqe, - ep->ep_rqe, - &filt, - &ep->ep_qp); - if (ret != 0) { - goto fail; - } - ep->ep_qp->uq_context = ep; - /* - * Allocate a memory region big enough to hold a header for each - * RQ entry + * The CQ is actually bound the RX or TX ctx, not the EP directly */ - uqp = to_qpi(ep->ep_qp); - ep->ep_hdr_ptr = calloc(uqp->uq_rq.urq_num_entries, - sizeof(ep->ep_hdr_ptr[0])); - if (ep->ep_hdr_ptr == NULL) { - ret = -FI_ENOMEM; - goto fail; + if (flags & FI_SEND) { + /* if TX is shared, but bind directly */ + if (ep->ep_tx->tx_fid.fid.fclass == FI_CLASS_STX_CTX) { + return -FI_EINVAL; + } + hcqp = &ep->ep_tx->t.msg.tx_hcq; + } else { + /* if RX is shared, but bind directly */ + if (ep->ep_rx->rx_fid.fid.fclass == FI_CLASS_SRX_CTX) { + return -FI_EINVAL; + } + hcqp = &ep->ep_rx->r.msg.rx_hcq; + } + if (*hcqp != NULL) { + return -FI_EINVAL; } - ret = usd_alloc_mr(ep->ep_domain->dom_dev, - usd_get_recv_credits(ep->ep_qp) * USDF_HDR_BUF_ENTRY, - &ep->ep_hdr_buf); + /* Make sure this CQ is "soft" */ + ret = usdf_cq_make_soft(cq); if (ret != 0) { - goto fail; + return ret; } + /* Use existing msg CQ if present */ + hcq = usdf_ep_msg_find_cqh(cq); + if (hcq == NULL) { + hcq = malloc(sizeof(*hcq)); + if (hcq == NULL) { + return -errno; + } + ret = usd_create_cq(cq->cq_domain->dom_dev, 8195, /* XXX */ + -1, &hcq->cqh_ucq); + if (ret != 0) { + goto fail; + } + hcq->cqh_cq = cq; + atomic_init(&hcq->cqh_refcnt, 0); + hcq->cqh_progress = usdf_msg_hcq_progress; + switch (cq->cq_attr.format) { + default: + case FI_CQ_FORMAT_CONTEXT: + hcq->cqh_post = usdf_cq_post_soft_context; + break; + case FI_CQ_FORMAT_MSG: + hcq->cqh_post = usdf_cq_post_soft_msg; + break; + case FI_CQ_FORMAT_DATA: + hcq->cqh_post = usdf_cq_post_soft_data; + break; + } + TAILQ_INSERT_TAIL(&cq->c.soft.cq_list, hcq, cqh_link); + + /* add to domain progression list */ + TAILQ_INSERT_TAIL(&ep->ep_domain->dom_hcq_list, + hcq, cqh_dom_link); + } + atomic_inc(&hcq->cqh_refcnt); + atomic_inc(&cq->cq_refcnt); + *hcqp = hcq; return 0; fail: - if (ep->ep_hdr_ptr != NULL) { - free(ep->ep_hdr_ptr); - } - if (ep->ep_qp != NULL) { - usd_destroy_qp(ep->ep_qp); + if (hcq != NULL) { + free(hcq); } return ret; } -static ssize_t -usdf_msg_ep_cancel(fid_t fid, void *context) +static int +usdf_ep_msg_bind(struct fid *fid, struct fid *bfid, uint64_t flags) { + struct usdf_ep *ep; + struct usdf_cq *cq; + + ep = ep_fidtou(fid); + + switch (bfid->fclass) { + + case FI_CLASS_CQ: + if (flags & FI_SEND) { + cq = cq_fidtou(bfid); + usdf_ep_msg_bind_cq(ep, cq, FI_SEND); + } + + if (flags & FI_RECV) { + cq = cq_fidtou(bfid); + usdf_ep_msg_bind_cq(ep, cq, FI_RECV); + } + break; + + case FI_CLASS_EQ: + if (ep->ep_eq != NULL) { + return -FI_EINVAL; + } + ep->ep_eq = eq_fidtou(bfid); + atomic_inc(&ep->ep_eq->eq_refcnt); + break; + default: + return -FI_EINVAL; + } + + return 0; +} + +static int +usdf_msg_rx_ctx_close(fid_t fid) +{ + struct usdf_rx 
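/*
 * usdf_ep_msg_bind_cq() above lazily creates at most one hard CQ per soft
 * CQ for message endpoints: it scans the soft CQ's list for an entry whose
 * progress routine matches and only allocates and appends a new one when
 * none is found.  A reduced sketch of that find-or-create lookup over a
 * TAILQ; the example_* names are placeholders.
 */
#include <stdlib.h>
#include <sys/queue.h>

struct example_hcq {
	TAILQ_ENTRY(example_hcq) link;
	void (*progress)(struct example_hcq *hcq);
};

TAILQ_HEAD(example_hcq_list, example_hcq);

static struct example_hcq *
example_find_or_create(struct example_hcq_list *list,
		void (*progress)(struct example_hcq *hcq))
{
	struct example_hcq *hcq;

	/* reuse an existing hard CQ that is progressed the same way */
	TAILQ_FOREACH(hcq, list, link) {
		if (hcq->progress == progress)
			return hcq;
	}

	hcq = calloc(1, sizeof(*hcq));
	if (hcq == NULL)
		return NULL;
	hcq->progress = progress;
	TAILQ_INSERT_TAIL(list, hcq, link);
	return hcq;
}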
*rx; + struct usdf_cq_hard *hcq; + + rx = rx_fidtou(fid); + + if (atomic_get(&rx->rx_refcnt) > 0) { + return -FI_EBUSY; + } + + hcq = rx->r.msg.rx_hcq; + if (hcq != NULL) { + atomic_dec(&hcq->cqh_refcnt); + atomic_dec(&hcq->cqh_cq->cq_refcnt); + } + + if (rx->rx_qp != NULL) { + usd_free_mr(rx->r.msg.rx_bufs); + free(rx->r.msg.rx_rqe_buf); + usd_destroy_qp(rx->rx_qp); + } + atomic_dec(&rx->rx_domain->dom_refcnt); + + free(rx); + + return 0; +} + +static int +usdf_msg_tx_ctx_close(fid_t fid) +{ + struct usdf_tx *tx; + struct usdf_cq_hard *hcq; + + tx = tx_fidtou(fid); + + if (atomic_get(&tx->tx_refcnt) > 0) { + return -FI_EBUSY; + } + + hcq = tx->t.msg.tx_hcq; + if (hcq != NULL) { + atomic_dec(&hcq->cqh_refcnt); + atomic_dec(&hcq->cqh_cq->cq_refcnt); + } + + if (tx->tx_qp != NULL) { + free(tx->t.msg.tx_wqe_buf); + usd_destroy_qp(tx->tx_qp); + } + atomic_dec(&tx->tx_domain->dom_refcnt); + + free(tx); + + return 0; +} + +static int +usdf_ep_msg_close(fid_t fid) +{ + struct usdf_ep *ep; + + ep = ep_fidtou(fid); + + if (atomic_get(&ep->ep_refcnt) > 0) { + return -FI_EBUSY; + } + + if (ep->ep_rx != NULL) { + atomic_dec(&ep->ep_rx->rx_refcnt); + if (rx_utofid(ep->ep_rx)->fclass == FI_CLASS_RX_CTX) { + (void) usdf_msg_rx_ctx_close(rx_utofid(ep->ep_rx)); + } + } + + if (ep->ep_tx != NULL) { + atomic_dec(&ep->ep_tx->tx_refcnt); + if (tx_utofid(ep->ep_tx)->fclass == FI_CLASS_TX_CTX) { + (void) usdf_msg_tx_ctx_close(tx_utofid(ep->ep_tx)); + } + } + + atomic_dec(&ep->ep_domain->dom_refcnt); + if (ep->ep_eq != NULL) { + atomic_dec(&ep->ep_eq->eq_refcnt); + } + usdf_timer_free(ep->ep_domain->dom_fabric, ep->e.msg.ep_ack_timer); + + free(ep); return 0; } static struct fi_ops_ep usdf_base_msg_ops = { .size = sizeof(struct fi_ops_ep), - .enable = usdf_msg_ep_enable, - .cancel = usdf_msg_ep_cancel, - .getopt = usdf_msg_ep_getopt, - .setopt = usdf_msg_ep_setopt, + .enable = usdf_ep_msg_enable, + .cancel = usdf_ep_msg_cancel, + .getopt = usdf_ep_msg_getopt, + .setopt = usdf_ep_msg_setopt, .tx_ctx = fi_no_tx_ctx, .rx_ctx = fi_no_rx_ctx, }; static struct fi_ops_cm usdf_cm_msg_ops = { .size = sizeof(struct fi_ops_cm), + .getname = fi_no_getname, + .getpeer = fi_no_getpeer, .connect = usdf_cm_msg_connect, + .listen = fi_no_listen, + .accept = usdf_cm_msg_accept, + .reject = fi_no_reject, .shutdown = usdf_cm_msg_shutdown, + .join = fi_no_join, + .leave = fi_no_leave, }; static struct fi_ops_msg usdf_msg_ops = { @@ -191,58 +609,130 @@ static struct fi_ops_msg usdf_msg_ops = { .injectdata = fi_no_msg_injectdata, }; +static struct fi_ops usdf_ep_msg_ops = { + .size = sizeof(struct fi_ops), + .close = usdf_ep_msg_close, + .bind = usdf_ep_msg_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open +}; + int usdf_ep_msg_open(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep_o, void *context) { struct usdf_domain *udp; + struct usdf_fabric *fp; + struct usdf_tx *tx; + struct usdf_rx *rx; struct usdf_ep *ep; int ret; - if ((info->caps & ~USDF_DGRAM_CAPS) != 0) { - return -FI_EBADF; + ep = NULL; + rx = NULL; + tx = NULL; + if ((info->caps & ~USDF_MSG_CAPS) != 0) { + return -FI_EBADFLAGS; } udp = dom_ftou(domain); + fp = udp->dom_fabric; - ep = calloc(1, sizeof(*ep)); - if (ep == NULL) { - return -FI_ENOMEM; + /* allocate peer table if not done */ + if (udp->dom_peer_tab == NULL) { + udp->dom_peer_tab = calloc(USDF_MAX_PEERS, sizeof(ep)); } - - ep->ep_sock = socket(AF_INET, SOCK_DGRAM, 0); - if (ep->ep_sock == -1) { + if (udp->dom_peer_tab == NULL) { ret = -errno; goto fail; } - if (info->src_addr != 
NULL) { - if (info->addr_format == FI_SOCKADDR || - info->addr_format == FI_SOCKADDR_IN) { - ret = usdf_ep_port_bind(ep, info); - if (ret != 0) { - goto fail; - } - } + + ep = calloc(1, sizeof(*ep)); + if (ep == NULL) { + ret = -errno; + goto fail; } ep->ep_fid.fid.fclass = FI_CLASS_EP; ep->ep_fid.fid.context = context; - ep->ep_fid.fid.ops = &usdf_ep_ops; + ep->ep_fid.fid.ops = &usdf_ep_msg_ops; ep->ep_fid.ops = &usdf_base_msg_ops; ep->ep_fid.cm = &usdf_cm_msg_ops; ep->ep_fid.msg = &usdf_msg_ops; ep->ep_domain = udp; ep->ep_caps = info->caps; ep->ep_mode = info->mode; - if (info->tx_attr != NULL && info->tx_attr->size != 0) { - ep->ep_wqe = info->tx_attr->size; - } else { - ep->ep_wqe = udp->dom_dev_attrs.uda_max_send_credits; + ep->e.msg.ep_connreq = info->connreq; + + ep->e.msg.ep_seq_credits = USDF_RUDP_SEQ_CREDITS; + TAILQ_INIT(&ep->e.msg.ep_posted_wqe); + TAILQ_INIT(&ep->e.msg.ep_sent_wqe); + --ep->e.msg.ep_last_rx_ack; + + ret = usdf_timer_alloc(usdf_msg_ep_timeout, ep, + &ep->e.msg.ep_ack_timer); + if (ret != 0) { + goto fail; } - if (info->rx_attr != NULL && info->rx_attr->size != 0) { - ep->ep_rqe = info->rx_attr->size; - } else { - ep->ep_rqe = udp->dom_dev_attrs.uda_max_recv_credits; + + /* implicitly create TX context if not to be shared */ + if (info->ep_attr == NULL || + info->ep_attr->tx_ctx_cnt != FI_SHARED_CONTEXT) { + tx = calloc(1, sizeof(*tx)); + if (tx == NULL) { + ret = -errno; + goto fail; + } + tx->tx_fid.fid.fclass = FI_CLASS_TX_CTX; + atomic_init(&tx->tx_refcnt, 0); + tx->tx_domain = udp; + tx->tx_progress = usdf_msg_tx_progress; + atomic_inc(&udp->dom_refcnt); + if (info->tx_attr != NULL) { + ret = usdf_msg_fill_tx_attr(info->tx_attr); + if (ret != 0) { + goto fail; + } + tx->tx_attr = *info->tx_attr; + } else { + ret = usdf_msg_fill_tx_attr(&tx->tx_attr); + } + TAILQ_INIT(&tx->t.msg.tx_free_wqe); + TAILQ_INIT(&tx->t.msg.tx_ep_ready); + TAILQ_INIT(&tx->t.msg.tx_ep_have_acks); + + ep->ep_tx = tx; + atomic_inc(&tx->tx_refcnt); + atomic_inc(&udp->dom_refcnt); + } + TAILQ_INIT(&ep->e.msg.ep_posted_wqe); + + /* implicitly create RX context if not to be shared */ + if (info->ep_attr == NULL || + info->ep_attr->rx_ctx_cnt != FI_SHARED_CONTEXT) { + rx = calloc(1, sizeof(*rx)); + if (rx == NULL) { + ret = -errno; + goto fail; + } + rx->rx_fid.fid.fclass = FI_CLASS_RX_CTX; + atomic_init(&rx->rx_refcnt, 0); + rx->rx_domain = udp; + atomic_inc(&udp->dom_refcnt); + if (info->rx_attr != NULL) { + ret = usdf_msg_fill_rx_attr(info->rx_attr); + if (ret != 0) { + goto fail; + } + rx->rx_attr = *info->rx_attr; + } else { + ret = usdf_msg_fill_rx_attr(&rx->rx_attr); + } + TAILQ_INIT(&rx->r.msg.rx_free_rqe); + TAILQ_INIT(&rx->r.msg.rx_posted_rqe); + + ep->ep_rx = rx; + atomic_inc(&rx->rx_refcnt); } atomic_init(&ep->ep_refcnt, 0); @@ -250,11 +740,18 @@ usdf_ep_msg_open(struct fid_domain *domain, struct fi_info *info, *ep_o = ep_utof(ep); return 0; - fail: + if (rx != NULL) { + free(rx); + atomic_dec(&udp->dom_refcnt); + } + if (tx != NULL) { + free(tx); + atomic_dec(&udp->dom_refcnt); + } if (ep != NULL) { - if (ep->ep_sock != -1) { - close(ep->ep_sock); + if (ep->e.msg.ep_ack_timer != NULL) { + usdf_timer_free(fp, ep->e.msg.ep_ack_timer); } free(ep); } diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_rdm.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_rdm.c new file mode 100644 index 0000000000..7f39b922f8 --- /dev/null +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_rdm.c @@ -0,0 +1,808 @@ +/* + * Copyright (c) 2014, Cisco 
Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "fi.h" +#include "fi_enosys.h" + +#include "usd.h" +#include "usdf.h" +#include "usdf_endpoint.h" +#include "usdf_rudp.h" +#include "usdf_cq.h" +#include "usdf_cm.h" +#include "usdf_av.h" +#include "usdf_timer.h" +#include "usdf_rdm.h" + +static int +usdf_tx_rdm_enable(struct usdf_tx *tx) +{ + struct usdf_rdm_qe *wqe; + struct usdf_domain *udp; + struct usdf_cq_hard *hcq; + struct usd_filter filt; + int ret; + int i; + + udp = tx->tx_domain; + + hcq = tx->t.rdm.tx_hcq; + if (hcq == NULL) { + return -FI_ENOCQ; + } + + /* XXX temp until we can allocate WQ and RQ independently */ + filt.uf_type = USD_FTY_UDP; + filt.uf_filter.uf_udp.u_port = 0; + ret = usd_create_qp(udp->dom_dev, + USD_QTR_UDP, + USD_QTY_NORMAL, + hcq->cqh_ucq, + hcq->cqh_ucq, + udp->dom_fabric->fab_dev_attrs->uda_max_send_credits, + udp->dom_fabric->fab_dev_attrs->uda_max_recv_credits, + &filt, + &tx->tx_qp); + if (ret != 0) { + goto fail; + } + tx->tx_qp->uq_context = tx; + + /* rdm send queue */ + tx->t.rdm.tx_wqe_buf = malloc(tx->tx_attr.size * + sizeof(struct usdf_rdm_qe)); + if (tx->t.rdm.tx_wqe_buf == NULL) { + ret = -errno; + goto fail; + } + + /* populate free list */ + TAILQ_INIT(&tx->t.rdm.tx_free_wqe); + wqe = tx->t.rdm.tx_wqe_buf; + for (i = 0; i < tx->tx_attr.size; ++i) { + TAILQ_INSERT_TAIL(&tx->t.rdm.tx_free_wqe, wqe, rd_link); + ++wqe; + } + + return 0; + +fail: + if (tx->t.rdm.tx_wqe_buf != NULL) { + free(tx->t.rdm.tx_wqe_buf); + tx->t.rdm.tx_wqe_buf = NULL; + TAILQ_INIT(&tx->t.rdm.tx_free_wqe); + } + if (tx->tx_qp != NULL) { + usd_destroy_qp(tx->tx_qp); + } + return ret; +} + +static int +usdf_rx_rdm_enable(struct usdf_rx *rx) +{ + struct 
usdf_domain *udp; + struct usdf_cq_hard *hcq; + struct usdf_rdm_qe *rqe; + struct usd_filter filt; + struct usd_qp_impl *qp; + uint8_t *ptr; + size_t mtu; + int ret; + int i; + + udp = rx->rx_domain; + + hcq = rx->r.rdm.rx_hcq; + if (hcq == NULL) { + return -FI_ENOCQ; + } + + /* XXX temp until we can allocate WQ and RQ independently */ + filt.uf_type = USD_FTY_UDP_SOCK; + filt.uf_filter.uf_udp_sock.u_sock = rx->r.rdm.rx_sock; + ret = usd_create_qp(udp->dom_dev, + USD_QTR_UDP, + USD_QTY_NORMAL, + hcq->cqh_ucq, + hcq->cqh_ucq, + udp->dom_fabric->fab_dev_attrs->uda_max_send_credits, + udp->dom_fabric->fab_dev_attrs->uda_max_recv_credits, + &filt, + &rx->rx_qp); + if (ret != 0) { + goto fail; + } + rx->rx_qp->uq_context = rx; + qp = to_qpi(rx->rx_qp); + + /* receive buffers */ + mtu = rx->rx_domain->dom_fabric->fab_dev_attrs->uda_mtu; + ret = usd_alloc_mr(rx->rx_domain->dom_dev, + qp->uq_rq.urq_num_entries * mtu, + (void **)&rx->r.rdm.rx_bufs); + if (ret != 0) { + goto fail; + } + + /* post all the buffers */ + ptr = rx->r.rdm.rx_bufs; + for (i = 0; i < qp->uq_rq.urq_num_entries - 1; ++i) { + usdf_rdm_post_recv(rx, ptr, mtu); + ptr += mtu; + } + + /* rdm recv queue */ + rx->r.rdm.rx_rqe_buf = malloc(rx->rx_attr.size * + sizeof(struct usdf_rdm_qe)); + if (rx->r.rdm.rx_rqe_buf == NULL) { + ret = -errno; + goto fail; + } + + /* populate free list */ + TAILQ_INIT(&rx->r.rdm.rx_free_rqe); + rqe = rx->r.rdm.rx_rqe_buf; + for (i = 0; i < rx->rx_attr.size; ++i) { + TAILQ_INSERT_TAIL(&rx->r.rdm.rx_free_rqe, rqe, rd_link); + ++rqe; + } + + return 0; + +fail: + if (rx->r.rdm.rx_rqe_buf != NULL) { + free(rx->r.rdm.rx_rqe_buf); + rx->r.rdm.rx_rqe_buf = NULL; + TAILQ_INIT(&rx->r.rdm.rx_free_rqe); + } + if (rx->r.rdm.rx_bufs != NULL) { + usd_free_mr(rx->r.rdm.rx_bufs); + rx->r.rdm.rx_bufs = NULL; + } + if (rx->rx_qp != NULL) { + usd_destroy_qp(rx->rx_qp); + } + return ret; +} + +/* + * release queue resources + */ +void +usdf_ep_rdm_release_queues(struct usdf_ep *ep) +{ + /* XXX */ +} + +/* + * Allocate any missing queue resources for this endpoint + */ +int +usdf_ep_rdm_get_queues(struct usdf_ep *ep) +{ + struct usdf_tx *tx; + struct usdf_rx *rx; + int ret; + + /* Must have TX context at this point */ + tx = ep->ep_tx; + if (tx == NULL) { + ret = -FI_EINVAL; + goto fail; + } + if (tx->tx_qp == NULL) { + ret = usdf_tx_rdm_enable(tx); + if (ret != 0) { + goto fail; + } + } + + /* Must have RX context at this point */ + rx = ep->ep_rx; + if (rx == NULL) { + ret = -FI_EINVAL; + goto fail; + } + if (rx->rx_qp == NULL) { + ret = usdf_rx_rdm_enable(rx); + if (ret != 0) { + goto fail; + } + } + + return 0; +fail: + return ret; +} + +static int +usdf_ep_rdm_enable(struct fid_ep *fep) +{ + return usdf_ep_rdm_get_queues(ep_ftou(fep)); +} + +static int +usdf_ep_rdm_getopt(fid_t fid, int level, int optname, + void *optval, size_t *optlen) +{ + struct usdf_ep *ep; + ep = ep_fidtou(fid); + (void)ep; + + switch (level) { + case FI_OPT_ENDPOINT: + return -FI_ENOPROTOOPT; + default: + return -FI_ENOPROTOOPT; + } + return 0; +} + +static int +usdf_ep_rdm_setopt(fid_t fid, int level, int optname, + const void *optval, size_t optlen) +{ + struct usdf_ep *ep; + ep = ep_fidtou(fid); + (void)ep; + + switch (level) { + case FI_OPT_ENDPOINT: + return -FI_ENOPROTOOPT; + default: + return -FI_ENOPROTOOPT; + } + return 0; +} + +static ssize_t +usdf_ep_rdm_cancel(fid_t fid, void *context) +{ + return 0; +} + +int +usdf_rdm_fill_tx_attr(struct fi_tx_attr *txattr) +{ + if (txattr->size > USDF_RDM_MAX_CTX_SIZE || + txattr->iov_limit > 
USDF_RDM_MAX_SGE) { + return -FI_ENODATA; + } + + if (txattr->size == 0) { + txattr->size = USDF_RDM_DFLT_CTX_SIZE; + } + if (txattr->iov_limit == 0) { + txattr->iov_limit = USDF_RDM_DFLT_SGE; + } + return 0; +} + +int +usdf_rdm_fill_rx_attr(struct fi_rx_attr *rxattr) +{ + if (rxattr->size > USDF_RDM_MAX_CTX_SIZE || + rxattr->iov_limit > USDF_RDM_MAX_SGE) { + return -FI_ENODATA; + } + + if (rxattr->size == 0) { + rxattr->size = USDF_RDM_DFLT_CTX_SIZE; + } + if (rxattr->iov_limit == 0) { + rxattr->iov_limit = USDF_RDM_DFLT_SGE; + } + return 0; +} + +/* + * Find a hard CQ within this soft CQ that services message EPs + */ +static struct usdf_cq_hard * +usdf_ep_rdm_find_cqh(struct usdf_cq *cq) +{ + struct usdf_cq_hard *hcq; + + TAILQ_FOREACH(hcq, &cq->c.soft.cq_list, cqh_link) { + if (hcq->cqh_progress == usdf_rdm_hcq_progress) { + return hcq; + } + } + return NULL; +} + +static int +usdf_ep_rdm_bind_cq(struct usdf_ep *ep, struct usdf_cq *cq, uint64_t flags) +{ + struct usdf_cq_hard **hcqp; + struct usdf_cq_hard *hcq; + int ret; + + /* + * The CQ is actually bound the RX or TX ctx, not the EP directly + */ + if (flags & FI_SEND) { + /* if TX is shared, but bind directly */ + if (ep->ep_tx->tx_fid.fid.fclass == FI_CLASS_STX_CTX) { + return -FI_EINVAL; + } + hcqp = &ep->ep_tx->t.rdm.tx_hcq; + } else { + /* if RX is shared, but bind directly */ + if (ep->ep_rx->rx_fid.fid.fclass == FI_CLASS_SRX_CTX) { + return -FI_EINVAL; + } + hcqp = &ep->ep_rx->r.rdm.rx_hcq; + } + if (*hcqp != NULL) { + return -FI_EINVAL; + } + + /* Make sure this CQ is "soft" */ + ret = usdf_cq_make_soft(cq); + if (ret != 0) { + return ret; + } + + /* Use existing rdm CQ if present */ + hcq = usdf_ep_rdm_find_cqh(cq); + if (hcq == NULL) { + hcq = malloc(sizeof(*hcq)); + if (hcq == NULL) { + return -errno; + } + ret = usd_create_cq(cq->cq_domain->dom_dev, 8195, /* XXX */ + -1, &hcq->cqh_ucq); + if (ret != 0) { + goto fail; + } + hcq->cqh_cq = cq; + atomic_init(&hcq->cqh_refcnt, 0); + hcq->cqh_progress = usdf_rdm_hcq_progress; + switch (cq->cq_attr.format) { + default: + case FI_CQ_FORMAT_CONTEXT: + hcq->cqh_post = usdf_cq_post_soft_context; + break; + case FI_CQ_FORMAT_MSG: + hcq->cqh_post = usdf_cq_post_soft_msg; + break; + case FI_CQ_FORMAT_DATA: + hcq->cqh_post = usdf_cq_post_soft_data; + break; + } + TAILQ_INSERT_TAIL(&cq->c.soft.cq_list, hcq, cqh_link); + + /* add to domain progression list */ + TAILQ_INSERT_TAIL(&ep->ep_domain->dom_hcq_list, + hcq, cqh_dom_link); + } + atomic_inc(&hcq->cqh_refcnt); + atomic_inc(&cq->cq_refcnt); + *hcqp = hcq; + return 0; + +fail: + if (hcq != NULL) { + free(hcq); + } + return ret; +} + +static int +usdf_ep_rdm_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + struct usdf_ep *ep; + struct usdf_cq *cq; + + ep = ep_fidtou(fid); + + switch (bfid->fclass) { + + case FI_CLASS_AV: + if (ep->e.rdm.ep_av != NULL) { + return -FI_EINVAL; + } + ep->e.rdm.ep_av = av_fidtou(bfid); + break; + + + case FI_CLASS_CQ: + if (flags & FI_SEND) { + cq = cq_fidtou(bfid); + usdf_ep_rdm_bind_cq(ep, cq, FI_SEND); + } + + if (flags & FI_RECV) { + cq = cq_fidtou(bfid); + usdf_ep_rdm_bind_cq(ep, cq, FI_RECV); + } + break; + + case FI_CLASS_EQ: + if (ep->ep_eq != NULL) { + return -FI_EINVAL; + } + ep->ep_eq = eq_fidtou(bfid); + atomic_inc(&ep->ep_eq->eq_refcnt); + break; + default: + return -FI_EINVAL; + } + + return 0; +} + +/* + * XXX clean up pending transmits + */ +static int +usdf_rdm_rx_ctx_close(fid_t fid) +{ + struct usdf_rx *rx; + struct usdf_cq_hard *hcq; + + rx = rx_fidtou(fid); + + if 
(atomic_get(&rx->rx_refcnt) > 0) { + return -FI_EBUSY; + } + + hcq = rx->r.rdm.rx_hcq; + if (hcq != NULL) { + atomic_dec(&hcq->cqh_refcnt); + atomic_dec(&hcq->cqh_cq->cq_refcnt); + } + if (rx->r.rdm.rx_sock != -1) { + close(rx->r.rdm.rx_sock); + } + + if (rx->rx_qp != NULL) { + usd_free_mr(rx->r.rdm.rx_bufs); + free(rx->r.rdm.rx_rqe_buf); + usd_destroy_qp(rx->rx_qp); + } + atomic_dec(&rx->rx_domain->dom_refcnt); + + free(rx); + + return 0; +} + +/* + * XXX clean up pending receives + */ +static int +usdf_rdm_tx_ctx_close(fid_t fid) +{ + struct usdf_tx *tx; + struct usdf_cq_hard *hcq; + + tx = tx_fidtou(fid); + + if (atomic_get(&tx->tx_refcnt) > 0) { + return -FI_EBUSY; + } + + hcq = tx->t.rdm.tx_hcq; + if (hcq != NULL) { + atomic_dec(&hcq->cqh_refcnt); + atomic_dec(&hcq->cqh_cq->cq_refcnt); + } + + if (tx->tx_qp != NULL) { + free(tx->t.rdm.tx_wqe_buf); + usd_destroy_qp(tx->tx_qp); + } + atomic_dec(&tx->tx_domain->dom_refcnt); + + free(tx); + + return 0; +} + +int +usdf_rx_rdm_port_bind(struct usdf_rx *rx, struct fi_info *info) +{ + struct sockaddr_in *sin; + struct sockaddr_in src; + socklen_t addrlen; + int ret; + + if (info->src_addr != NULL) { + if (info->addr_format != FI_SOCKADDR && + info->addr_format != FI_SOCKADDR_IN) { + return -FI_EINVAL; + } + sin = (struct sockaddr_in *)info->src_addr; + } else { + memset(&src, 0, sizeof(src)); + sin = &src; + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = + rx->rx_domain->dom_fabric->fab_dev_attrs->uda_ipaddr_be; + } + + rx->r.rdm.rx_sock = socket(AF_INET, SOCK_DGRAM, 0); + if (rx->r.rdm.rx_sock == -1) { + return -errno; + } + ret = bind(rx->r.rdm.rx_sock, (struct sockaddr *)sin, sizeof(*sin)); + if (ret == -1) { + return -errno; + } + + addrlen = sizeof(*sin); + ret = getsockname(rx->r.rdm.rx_sock, (struct sockaddr *)sin, &addrlen); + if (ret == -1) { + return -errno; + } + + return 0; +} + +static int +usdf_ep_rdm_close(fid_t fid) +{ + struct usdf_ep *ep; + + ep = ep_fidtou(fid); + + if (atomic_get(&ep->ep_refcnt) > 0) { + return -FI_EBUSY; + } + + if (ep->ep_rx != NULL) { + atomic_dec(&ep->ep_rx->rx_refcnt); + if (rx_utofid(ep->ep_rx)->fclass == FI_CLASS_RX_CTX) { + (void) usdf_rdm_rx_ctx_close(rx_utofid(ep->ep_rx)); + } + } + + if (ep->ep_tx != NULL) { + atomic_dec(&ep->ep_tx->tx_refcnt); + if (tx_utofid(ep->ep_tx)->fclass == FI_CLASS_TX_CTX) { + (void) usdf_rdm_tx_ctx_close(tx_utofid(ep->ep_tx)); + } + } + + atomic_dec(&ep->ep_domain->dom_refcnt); + if (ep->ep_eq != NULL) { + atomic_dec(&ep->ep_eq->eq_refcnt); + } + + free(ep); + return 0; +} + +static struct fi_ops_ep usdf_base_rdm_ops = { + .size = sizeof(struct fi_ops_ep), + .enable = usdf_ep_rdm_enable, + .cancel = usdf_ep_rdm_cancel, + .getopt = usdf_ep_rdm_getopt, + .setopt = usdf_ep_rdm_setopt, + .tx_ctx = fi_no_tx_ctx, + .rx_ctx = fi_no_rx_ctx, +}; + +static struct fi_ops_cm usdf_cm_rdm_ops = { + .size = sizeof(struct fi_ops_cm), + .getname = usdf_cm_rdm_getname, + .getpeer = fi_no_getpeer, + .connect = fi_no_connect, + .listen = fi_no_listen, + .accept = fi_no_accept, + .reject = fi_no_reject, + .shutdown = fi_no_shutdown, + .join = fi_no_join, + .leave = fi_no_leave, +}; + +static struct fi_ops_msg usdf_rdm_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = usdf_rdm_recv, + .recvv = usdf_rdm_recvv, + .recvmsg = usdf_rdm_recvmsg, + .send = usdf_rdm_send, + .sendv = usdf_rdm_sendv, + .sendmsg = usdf_rdm_sendmsg, + .inject = usdf_rdm_inject, + .senddata = usdf_rdm_senddata, + .injectdata = fi_no_msg_injectdata, +}; + +static struct fi_ops usdf_ep_rdm_ops = { + .size = 
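/*
 * usdf_rx_rdm_port_bind() above reserves a UDP source port the usual way:
 * bind() a datagram socket with the port left at zero, then read back the
 * kernel-assigned port with getsockname().  A self-contained sketch of that
 * sequence; the function name and error handling are illustrative only.
 */
#include <errno.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

static int
example_bind_udp(uint32_t ipaddr_be, struct sockaddr_in *sin_out)
{
	struct sockaddr_in sin;
	socklen_t addrlen;
	int sock;
	int ret;

	sock = socket(AF_INET, SOCK_DGRAM, 0);
	if (sock == -1)
		return -errno;

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = ipaddr_be;   /* device IP, already big-endian */
	if (bind(sock, (struct sockaddr *)&sin, sizeof(sin)) == -1) {
		ret = -errno;
		close(sock);
		return ret;
	}

	addrlen = sizeof(sin);
	if (getsockname(sock, (struct sockaddr *)&sin, &addrlen) == -1) {
		ret = -errno;
		close(sock);
		return ret;
	}

	*sin_out = sin;                    /* sin.sin_port now holds the port */
	return sock;
}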
sizeof(struct fi_ops), + .close = usdf_ep_rdm_close, + .bind = usdf_ep_rdm_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open +}; + +int +usdf_ep_rdm_open(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep_o, void *context) +{ + struct usdf_domain *udp; + struct usdf_tx *tx; + struct usdf_rx *rx; + struct usdf_ep *ep; + int ret; + + ep = NULL; + rx = NULL; + tx = NULL; + if ((info->caps & ~USDF_RDM_CAPS) != 0) { + return -FI_EBADFLAGS; + } + + udp = dom_ftou(domain); + + /* allocate peer table if not done */ + if (udp->dom_peer_tab == NULL) { + udp->dom_peer_tab = calloc(USDF_MAX_PEERS, sizeof(ep)); + } + if (udp->dom_peer_tab == NULL) { + ret = -errno; + goto fail; + } + + ep = calloc(1, sizeof(*ep)); + if (ep == NULL) { + ret = -errno; + goto fail; + } + + ep->ep_fid.fid.fclass = FI_CLASS_EP; + ep->ep_fid.fid.context = context; + ep->ep_fid.fid.ops = &usdf_ep_rdm_ops; + ep->ep_fid.ops = &usdf_base_rdm_ops; + ep->ep_fid.cm = &usdf_cm_rdm_ops; + ep->ep_fid.msg = &usdf_rdm_ops; + ep->ep_domain = udp; + ep->ep_caps = info->caps; + ep->ep_mode = info->mode; + + /* implicitly create TX context if not to be shared */ + if (info->ep_attr == NULL || + info->ep_attr->tx_ctx_cnt != FI_SHARED_CONTEXT) { + tx = calloc(1, sizeof(*tx)); + if (tx == NULL) { + ret = -errno; + goto fail; + } + tx->tx_fid.fid.fclass = FI_CLASS_TX_CTX; + atomic_init(&tx->tx_refcnt, 0); + tx->tx_domain = udp; + tx->tx_progress = usdf_rdm_tx_progress; + atomic_init(&tx->t.rdm.tx_next_msg_id, 1); + atomic_inc(&udp->dom_refcnt); + + if (info->tx_attr != NULL) { + ret = usdf_rdm_fill_tx_attr(info->tx_attr); + if (ret != 0) { + goto fail; + } + tx->tx_attr = *info->tx_attr; + } else { + ret = usdf_rdm_fill_tx_attr(&tx->tx_attr); + } + TAILQ_INIT(&tx->t.rdm.tx_free_wqe); + TAILQ_INIT(&tx->t.rdm.tx_rdc_ready); + TAILQ_INIT(&tx->t.rdm.tx_rdc_have_acks); + + ep->ep_tx = tx; + atomic_inc(&tx->tx_refcnt); + } + + /* implicitly create RX context if not to be shared */ + if (info->ep_attr == NULL || + info->ep_attr->rx_ctx_cnt != FI_SHARED_CONTEXT) { + rx = calloc(1, sizeof(*rx)); + if (rx == NULL) { + ret = -errno; + goto fail; + } + + rx->rx_fid.fid.fclass = FI_CLASS_RX_CTX; + atomic_init(&rx->rx_refcnt, 0); + rx->rx_domain = udp; + rx->r.rdm.rx_tx = tx; + rx->r.rdm.rx_sock = -1; + atomic_inc(&udp->dom_refcnt); + + ret = usdf_rx_rdm_port_bind(rx, info); + if (ret != 0) { + goto fail; + } + + if (info->rx_attr != NULL) { + ret = usdf_rdm_fill_rx_attr(info->rx_attr); + if (ret != 0) { + goto fail; + } + rx->rx_attr = *info->rx_attr; + } else { + ret = usdf_rdm_fill_rx_attr(&rx->rx_attr); + } + TAILQ_INIT(&rx->r.rdm.rx_free_rqe); + TAILQ_INIT(&rx->r.rdm.rx_posted_rqe); + + ep->ep_rx = rx; + atomic_inc(&rx->rx_refcnt); + } + + atomic_init(&ep->ep_refcnt, 0); + atomic_inc(&udp->dom_refcnt); + + *ep_o = ep_utof(ep); + return 0; +fail: + if (rx != NULL) { + if (rx->r.rdm.rx_sock != -1) { + close(rx->r.rdm.rx_sock); + } + free(rx); + atomic_dec(&udp->dom_refcnt); + } + if (tx != NULL) { + free(tx); + atomic_dec(&udp->dom_refcnt); + } + return ret; +} diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_eq.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_eq.c index e8e7555be2..b0fde4fb4f 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_eq.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_eq.c @@ -566,6 +566,9 @@ usdf_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, /* * Allocate and initialize event ring */ + if (attr->size == 0) { + 
attr->size = 1024; // XXX + } eq->eq_ev_ring = calloc(attr->size, sizeof(*eq->eq_ev_ring)); eq->eq_ev_buf = calloc(attr->size, sizeof(*eq->eq_ev_buf)); if (eq->eq_ev_ring == NULL || eq->eq_ev_buf == NULL) { diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_fabric.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_fabric.c index ed258c0ff9..2b35e2fbc4 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_fabric.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_fabric.c @@ -60,6 +60,7 @@ #include #include "fi.h" #include "fi_enosys.h" +#include "prov.h" #include "usnic_direct.h" #include "libnl_utils.h" @@ -68,6 +69,9 @@ #include "fi_usnic.h" #include "usdf_progress.h" #include "usdf_timer.h" +#include "usdf_dgram.h" +#include "usdf_msg.h" +#include "usdf_rdm.h" struct usdf_usnic_info *__usdf_devinfo; @@ -108,8 +112,12 @@ usdf_validate_hints(struct fi_info *hints, struct usd_device_attrs *dap) fattrp = hints->fabric_attr; if (fattrp != NULL) { + if (fattrp->prov_version != 0 && + fattrp->prov_version != USDF_PROV_VERSION) { + return -FI_ENODATA; + } if (fattrp->prov_name != NULL && - strcmp(fattrp->prov_name, USDF_FI_NAME) != 0) { + strcmp(fattrp->prov_name, USDF_PROV_NAME) != 0) { return -FI_ENODATA; } if (fattrp->name != NULL && @@ -122,16 +130,15 @@ usdf_validate_hints(struct fi_info *hints, struct usd_device_attrs *dap) } static int -usdf_fill_addr_info(struct fi_info *fi, struct fi_info *hints, +usdf_fill_addr_info(struct fi_info *fi, uint32_t addr_format, struct sockaddr_in *src, struct sockaddr_in *dest, struct usd_device_attrs *dap) { struct sockaddr_in *sin; int ret; - /* If hints speficied, we already validated requested addr_format */ - if (hints != NULL && hints->addr_format != FI_FORMAT_UNSPEC) { - fi->addr_format = hints->addr_format; + if (addr_format != FI_FORMAT_UNSPEC) { + fi->addr_format = addr_format; } else { fi->addr_format = FI_SOCKADDR_IN; } @@ -192,6 +199,8 @@ usdf_fill_info_dgram( struct fi_tx_attr *txattr; struct fi_rx_attr *rxattr; struct fi_ep_attr *eattrp; + uint32_t addr_format; + size_t entries; int ret; /* check that we are capable of what's requested */ @@ -214,12 +223,14 @@ usdf_fill_info_dgram( if (hints != NULL) { fi->mode = hints->mode & USDF_DGRAM_SUPP_MODE; + addr_format = hints->addr_format; } else { fi->mode = USDF_DGRAM_SUPP_MODE; + addr_format = FI_FORMAT_UNSPEC; } fi->ep_type = FI_EP_DGRAM; - ret = usdf_fill_addr_info(fi, hints, src, dest, dap); + ret = usdf_fill_addr_info(fi, addr_format, src, dest, dap); if (ret != 0) { goto fail; } @@ -234,22 +245,72 @@ usdf_fill_info_dgram( /* TX attrs */ txattr = fi->tx_attr; - txattr->size = dap->uda_max_send_credits; - if (hints != NULL && - hints->tx_attr != NULL && - hints->tx_attr->size != 0 && - hints->tx_attr->size < txattr->size) { - txattr->size = hints->tx_attr->size; + txattr->iov_limit = USDF_DGRAM_DFLT_SGE; + txattr->size = dap->uda_max_send_credits / USDF_DGRAM_DFLT_SGE; + if (hints != NULL && hints->tx_attr != NULL) { + if (hints->tx_attr->iov_limit > USDF_MSG_MAX_SGE) { + ret = -FI_ENODATA; + goto fail; + } + if (hints->tx_attr->iov_limit != 0) { + txattr->iov_limit = hints->tx_attr->iov_limit; + entries = hints->tx_attr->size * txattr->iov_limit; + if (entries > dap->uda_max_send_credits) { + ret = -FI_ENODATA; + goto fail; + } else if (entries == 0) { + txattr->size = dap->uda_max_send_credits / + txattr->iov_limit; + } else { + txattr->size = hints->tx_attr->size; + } + } else if (hints->tx_attr->size != 0) { + txattr->size = 
hints->tx_attr->size; + if (txattr->size > dap->uda_max_send_credits) { + ret = -FI_ENODATA; + goto fail; + } + entries = txattr->size * txattr->iov_limit; + if (entries > dap->uda_max_send_credits) { + txattr->iov_limit = dap->uda_max_send_credits / + txattr->size; + } + } } /* RX attrs */ rxattr = fi->rx_attr; - rxattr->size = dap->uda_max_recv_credits; - if (hints != NULL && - hints->rx_attr != NULL && - hints->rx_attr->size != 0 && - hints->rx_attr->size < rxattr->size) { - rxattr->size = hints->rx_attr->size; + rxattr->iov_limit = USDF_DGRAM_DFLT_SGE; + rxattr->size = dap->uda_max_recv_credits / USDF_DGRAM_DFLT_SGE; + if (hints != NULL && hints->rx_attr != NULL) { + if (hints->rx_attr->iov_limit > USDF_MSG_MAX_SGE) { + ret = -FI_ENODATA; + goto fail; + } + if (hints->rx_attr->iov_limit != 0) { + rxattr->iov_limit = hints->rx_attr->iov_limit; + entries = hints->rx_attr->size * rxattr->iov_limit; + if (entries > dap->uda_max_recv_credits) { + ret = -FI_ENODATA; + goto fail; + } else if (entries == 0) { + rxattr->size = dap->uda_max_recv_credits / + rxattr->iov_limit; + } else { + rxattr->size = hints->rx_attr->size; + } + } else if (hints->rx_attr->size != 0) { + rxattr->size = hints->rx_attr->size; + if (rxattr->size > dap->uda_max_recv_credits) { + ret = -FI_ENODATA; + goto fail; + } + entries = rxattr->size * rxattr->iov_limit; + if (entries > dap->uda_max_recv_credits) { + rxattr->iov_limit = dap->uda_max_recv_credits / + rxattr->size; + } + } } /* endpoint attrs */ @@ -267,7 +328,7 @@ usdf_fill_info_dgram( dattrp = fi->domain_attr; dattrp->threading = FI_THREAD_UNSPEC; dattrp->control_progress = FI_PROGRESS_AUTO; - dattrp->data_progress = FI_PROGRESS_AUTO; + dattrp->data_progress = FI_PROGRESS_MANUAL; /* add to tail of list */ if (*fi_first == NULL) { @@ -301,6 +362,7 @@ usdf_fill_info_msg( struct fi_tx_attr *txattr; struct fi_rx_attr *rxattr; struct fi_ep_attr *eattrp; + uint32_t addr_format; int ret; /* check that we are capable of what's requested */ @@ -323,13 +385,15 @@ usdf_fill_info_msg( if (hints != NULL) { fi->mode = hints->mode & USDF_MSG_SUPP_MODE; + addr_format = hints->addr_format; } else { fi->mode = USDF_MSG_SUPP_MODE; + addr_format = FI_FORMAT_UNSPEC; } fi->ep_type = FI_EP_MSG; - ret = usdf_fill_addr_info(fi, hints, src, dest, dap); + ret = usdf_fill_addr_info(fi, addr_format, src, dest, dap); if (ret != 0) { goto fail; } @@ -344,28 +408,21 @@ usdf_fill_info_msg( /* TX attrs */ txattr = fi->tx_attr; - txattr->size = dap->uda_max_send_credits; - if (hints != NULL && - hints->tx_attr != NULL && - hints->tx_attr->size != 0 && - hints->tx_attr->size < txattr->size) { - txattr->size = hints->tx_attr->size; + if (hints != NULL && hints->tx_attr != NULL) { + *txattr = *hints->tx_attr; } + usdf_msg_fill_tx_attr(txattr); /* RX attrs */ rxattr = fi->rx_attr; - rxattr->size = dap->uda_max_recv_credits; - if (hints != NULL && - hints->rx_attr != NULL && - hints->rx_attr->size != 0 && - hints->rx_attr->size < rxattr->size) { - rxattr->size = hints->rx_attr->size; + if (hints != NULL && hints->rx_attr != NULL) { + *rxattr = *hints->rx_attr; } + usdf_msg_fill_rx_attr(rxattr); /* endpoint attrs */ eattrp = fi->ep_attr; - eattrp->max_msg_size = dap->uda_mtu - - sizeof(struct usd_udp_hdr); + eattrp->max_msg_size = USDF_MSG_MAX_MSG; eattrp->protocol = FI_PROTO_RUDP; eattrp->tx_ctx_cnt = 1; eattrp->rx_ctx_cnt = 1; @@ -374,7 +431,109 @@ usdf_fill_info_msg( dattrp = fi->domain_attr; dattrp->threading = FI_THREAD_UNSPEC; dattrp->control_progress = FI_PROGRESS_AUTO; - dattrp->data_progress 
= FI_PROGRESS_AUTO; + dattrp->data_progress = FI_PROGRESS_MANUAL; + + /* add to tail of list */ + if (*fi_first == NULL) { + *fi_first = fi; + } else { + (*fi_last)->next = fi; + } + *fi_last = fi; + + return 0; + +fail: + if (fi != NULL) { + fi_freeinfo(fi); + } + return ret; +} + +static int +usdf_fill_info_rdm( + struct fi_info *hints, + struct sockaddr_in *src, + struct sockaddr_in *dest, + struct usd_device_attrs *dap, + struct fi_info **fi_first, + struct fi_info **fi_last) +{ + struct fi_info *fi; + struct fi_fabric_attr *fattrp; + struct fi_domain_attr *dattrp; + struct fi_tx_attr *txattr; + struct fi_rx_attr *rxattr; + struct fi_ep_attr *eattrp; + uint32_t addr_format; + int ret; + + /* check that we are capable of what's requested */ + if ((hints->caps & ~USDF_RDM_CAPS) != 0) { + return -FI_ENODATA; + } + + /* app must support these modes */ + if ((hints->mode & USDF_RDM_REQ_MODE) != USDF_RDM_REQ_MODE) { + return -FI_ENODATA; + } + + fi = fi_allocinfo_internal(); + if (fi == NULL) { + ret = -FI_ENOMEM; + goto fail; + } + + fi->caps = USDF_RDM_CAPS; + + if (hints != NULL) { + fi->mode = hints->mode & USDF_RDM_SUPP_MODE; + addr_format = hints->addr_format; + } else { + fi->mode = USDF_RDM_SUPP_MODE; + addr_format = FI_FORMAT_UNSPEC; + } + fi->ep_type = FI_EP_RDM; + + ret = usdf_fill_addr_info(fi, addr_format, src, dest, dap); + if (ret != 0) { + goto fail; + } + + /* fabric attrs */ + fattrp = fi->fabric_attr; + fattrp->name = strdup(dap->uda_devname); + if (fattrp->name == NULL) { + ret = -FI_ENOMEM; + goto fail; + } + + /* TX attrs */ + txattr = fi->tx_attr; + if (hints != NULL && hints->tx_attr != NULL) { + *txattr = *hints->tx_attr; + } + usdf_rdm_fill_tx_attr(txattr); + + /* RX attrs */ + rxattr = fi->rx_attr; + if (hints != NULL && hints->rx_attr != NULL) { + *rxattr = *hints->rx_attr; + } + usdf_rdm_fill_rx_attr(rxattr); + + /* endpoint attrs */ + eattrp = fi->ep_attr; + eattrp->max_msg_size = USDF_RDM_MAX_MSG; + eattrp->protocol = FI_PROTO_RUDP; + eattrp->tx_ctx_cnt = 1; + eattrp->rx_ctx_cnt = 1; + + /* domain attrs */ + dattrp = fi->domain_attr; + dattrp->threading = FI_THREAD_UNSPEC; + dattrp->control_progress = FI_PROGRESS_AUTO; + dattrp->data_progress = FI_PROGRESS_MANUAL; /* add to tail of list */ if (*fi_first == NULL) { @@ -561,6 +720,14 @@ usdf_getinfo(uint32_t version, const char *node, const char *service, goto fail; } } + + if (ep_type == FI_EP_RDM || ep_type == FI_EP_UNSPEC) { + ret = usdf_fill_info_rdm(hints, src, dest, dap, + &fi_first, &fi_last); + if (ret != 0 && ret != -FI_ENODATA) { + goto fail; + } + } } if (fi_first != NULL) { @@ -690,6 +857,17 @@ usdf_fabric_open(struct fi_fabric_attr *fattrp, struct fid_fabric **fabric, } fp->fab_epollfd = -1; fp->fab_arp_sockfd = -1; + LIST_INIT(&fp->fab_domain_list); + + fp->fab_attr.fabric = fab_utof(fp); + fp->fab_attr.name = strdup(fattrp->name); + fp->fab_attr.prov_name = strdup(USDF_PROV_NAME); + fp->fab_attr.prov_version = USDF_PROV_VERSION; + if (fp->fab_attr.name == NULL || + fp->fab_attr.prov_name == NULL) { + ret = -FI_ENOMEM; + goto fail; + } fp->fab_fid.fid.fclass = FI_CLASS_FABRIC; fp->fab_fid.fid.context = context; @@ -726,6 +904,7 @@ usdf_fabric_open(struct fi_fabric_attr *fattrp, struct fid_fabric **fabric, goto fail; } + /* initialize timer subsystem */ ret = usdf_timer_init(fp); if (ret != 0) { goto fail; @@ -746,7 +925,9 @@ usdf_fabric_open(struct fi_fabric_attr *fattrp, struct fid_fabric **fabric, } atomic_init(&fp->fab_refcnt, 0); - *fabric = &fp->fab_fid; + fattrp->fabric = fab_utof(fp); + 
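usdf_fill_info_dgram(), usdf_fill_info_msg(), and the new usdf_fill_info_rdm() all hand results back through the same head/tail pointer pair, so usdf_getinfo() can chain one fi_info entry per supported endpoint type into a single list. A minimal standalone sketch of that append idiom, using a hypothetical struct node instead of the real struct fi_info:

#include <stddef.h>

struct node {
	struct node *next;
	int payload;			/* stand-in for the fi_info fields */
};

/* Append n to a singly linked list tracked by caller-owned head/tail. */
static void list_append(struct node **first, struct node **last,
		struct node *n)
{
	n->next = NULL;
	if (*first == NULL)
		*first = n;		/* empty list: n becomes the head */
	else
		(*last)->next = n;	/* otherwise chain after the tail */
	*last = n;			/* n is always the new tail */
}

Keeping the tail pointer makes each append O(1) no matter how many endpoint types end up being offered.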
fattrp->prov_version = USDF_PROV_VERSION; + *fabric = fab_utof(fp); return 0; fail: @@ -766,20 +947,20 @@ fail: return ret; } +static void usdf_fini(void) +{ +} + static struct fi_provider usdf_ops = { - .name = USDF_FI_NAME, - .version = FI_VERSION(0, 7), + .name = USDF_PROV_NAME, + .version = USDF_PROV_VERSION, + .fi_version = FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), .getinfo = usdf_getinfo, .fabric = usdf_fabric_open, + .cleanup = usdf_fini }; -static void __attribute__((constructor)) -usdf_ini(void) -{ - (void) fi_register(&usdf_ops); -} - -static void __attribute__((destructor)) -usdf_fini(void) +USNIC_INI { + return (&usdf_ops); } diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_msg.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_msg.c index 4fe58a4bd2..ef95a3cc1e 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_msg.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_msg.c @@ -57,81 +57,153 @@ #include #include "fi.h" -#include "usnic_direct.h" #include "usd.h" +#include "usd_post.h" + #include "usdf.h" +#include "usdf_rudp.h" +#include "usdf_msg.h" +#include "usdf_timer.h" +#include "usdf_progress.h" + +static inline void +usdf_msg_ep_ready(struct usdf_ep *ep) +{ + struct usdf_tx *tx; + + tx = ep->ep_tx; + if (!TAILQ_ON_LIST(ep, e.msg.ep_link)) { + + ep->e.msg.ep_fairness_credits = USDF_MSG_FAIRNESS_CREDITS; + TAILQ_INSERT_TAIL(&tx->t.msg.tx_ep_ready, ep, e.msg.ep_link); + + /* Make sure TX is on domain ready list */ + if (!TAILQ_ON_LIST(tx, tx_link)) { + TAILQ_INSERT_TAIL(&tx->tx_domain->dom_tx_ready, + tx, tx_link); + } + } +} + +static inline void +usdf_msg_rewind_qe(struct usdf_msg_qe *qe, size_t rewind, size_t mtu) +{ + size_t cur_resid; + size_t cur_iov; + size_t bytes; + size_t len; + + if (qe->ms_resid == 0) { + bytes = qe->ms_length % mtu; + cur_resid = 0; + } else { + bytes = mtu; + cur_resid = qe->ms_iov_resid; + } + bytes += (rewind - 1) * mtu; + qe->ms_resid += bytes; + + cur_iov = qe->ms_cur_iov; + while (bytes > 0) { + len = qe->ms_iov[cur_iov].iov_len - cur_resid; + if (len >= bytes) { + len = bytes; + cur_resid += len; + } else { + --cur_iov; + cur_resid = 0; + } + bytes -= len; + } + + qe->ms_cur_iov = cur_iov; + qe->ms_cur_ptr = qe->ms_iov[cur_iov].iov_base + + qe->ms_iov[cur_iov].iov_len - cur_resid; + qe->ms_iov_resid = cur_resid; +} + +/* + * semi-native rx buffer post, i want to eventually avoid using the + * vnic_*() calls + */ +static inline int +_usdf_msg_post_recv(struct usdf_rx *rx, void *buf, size_t len) +{ + struct usd_rq *rq; + struct vnic_rq *vrq; + struct rq_enet_desc *desc; + struct usd_qp_impl *qp; + + qp = to_qpi(rx->rx_qp); + rq = &qp->uq_rq; + vrq = &rq->urq_vnic_rq; + + rq->urq_context[rq->urq_post_index] = buf; + rq->urq_post_index = (rq->urq_post_index + 1) + & rq->urq_post_index_mask; + + desc = vnic_rq_next_desc(vrq); + rq_enet_desc_enc(desc, (dma_addr_t) buf, + RQ_ENET_TYPE_ONLY_SOP, len); + wmb(); + vnic_rq_post(vrq, buf, 0, (dma_addr_t) buf, len, 0); + + return 0; +} + +/* + * Allow external access to the inline + */ +int +usdf_msg_post_recv(struct usdf_rx *rx, void *buf, size_t len) +{ + return _usdf_msg_post_recv(rx, buf, len); +} ssize_t usdf_msg_recv(struct fid_ep *fep, void *buf, size_t len, void *desc, fi_addr_t src_addr, void *context) { struct usdf_ep *ep; - struct usd_qp_impl *qp; - struct usd_recv_desc rxd; - uint32_t index; + struct usdf_rx *rx; + struct usdf_msg_qe *rqe; + struct usdf_domain *udp; ep = ep_ftou(fep); - qp = to_qpi(ep->ep_qp); + rx = 
ep->ep_rx; + udp = ep->ep_domain; - index = qp->uq_rq.urq_post_index; - rxd.urd_context = context; - rxd.urd_iov[0].iov_base = (uint8_t *)ep->ep_hdr_buf + - (index * USDF_HDR_BUF_ENTRY) + - (USDF_HDR_BUF_ENTRY - sizeof(struct usd_udp_hdr)); - rxd.urd_iov[0].iov_len = sizeof(struct usd_udp_hdr); - rxd.urd_iov[1].iov_base = buf; - rxd.urd_iov[1].iov_len = len; - rxd.urd_iov_cnt = 2; - rxd.urd_next = NULL; + if (TAILQ_EMPTY(&rx->r.msg.rx_free_rqe)) { + return -FI_EAGAIN; + } - ep->ep_hdr_ptr[index] = rxd.urd_iov[0].iov_base; - index = (index + 1) & qp->uq_rq.urq_post_index_mask; - ep->ep_hdr_ptr[index] = rxd.urd_iov[0].iov_base; + pthread_spin_lock(&udp->dom_progress_lock); - return usd_post_recv(ep->ep_qp, &rxd); + rqe = TAILQ_FIRST(&rx->r.msg.rx_free_rqe); + TAILQ_REMOVE(&rx->r.msg.rx_free_rqe, rqe, ms_link); + + rqe->ms_context = context; + rqe->ms_iov[0].iov_base = buf; + rqe->ms_iov[0].iov_len = len; + rqe->ms_last_iov = 0; + + rqe->ms_cur_iov = 0; + rqe->ms_cur_ptr = buf; + rqe->ms_iov_resid = len; + rqe->ms_length = 0; + + TAILQ_INSERT_TAIL(&rx->r.msg.rx_posted_rqe, rqe, ms_link); + + pthread_spin_unlock(&udp->dom_progress_lock); + + return 0; } ssize_t usdf_msg_recvv(struct fid_ep *fep, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, void *context) { - struct usdf_ep *ep; - struct usd_recv_desc rxd; - struct usd_qp_impl *qp; - uint32_t index; - int i; - - ep = ep_ftou(fep); - qp = to_qpi(ep->ep_qp); - - rxd.urd_context = context; - rxd.urd_iov[0].iov_base = ep->ep_hdr_buf + - qp->uq_rq.urq_post_index * USDF_HDR_BUF_ENTRY; - rxd.urd_iov[0].iov_len = sizeof(struct usd_udp_hdr); - memcpy(&rxd.urd_iov[1], iov, sizeof(*iov) * count); - rxd.urd_iov_cnt = count + 1; - rxd.urd_next = NULL; - - index = qp->uq_rq.urq_post_index; - for (i = 0; i < count; ++i) { - ep->ep_hdr_ptr[index] = rxd.urd_iov[0].iov_base; - index = (index + 1) & qp->uq_rq.urq_post_index_mask; - } - - return usd_post_recv(ep->ep_qp, &rxd); -} - -static inline ssize_t -_usdf_msg_send(struct usdf_ep *ep, struct usd_dest *dest, - const void *buf, size_t len, fi_addr_t dest_addr, void *context) -{ - if (len <= USD_SEND_MAX_COPY - sizeof(struct usd_udp_hdr)) { - return usd_post_send_one_copy(ep->ep_qp, dest, buf, len, - USD_SF_SIGNAL, context); - } else { - return usd_post_send_one(ep->ep_qp, dest, buf, len, - USD_SF_SIGNAL, context); - } + return -FI_ENOSYS; } ssize_t @@ -139,20 +211,48 @@ usdf_msg_send(struct fid_ep *fep, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, void *context) { struct usdf_ep *ep; - struct usd_dest *dest; - int ret; + struct usdf_tx *tx; + struct usdf_msg_qe *wqe; + struct usdf_domain *udp; ep = ep_ftou(fep); + tx = ep->ep_tx; + udp = ep->ep_domain; - dest = (struct usd_dest *)(uintptr_t)dest_addr; - return _usdf_msg_send(ep, dest, buf, len, dest_addr, context); + if (TAILQ_EMPTY(&tx->t.msg.tx_free_wqe)) { + return -FI_EAGAIN; + } - return ret; + pthread_spin_lock(&udp->dom_progress_lock); + + wqe = TAILQ_FIRST(&tx->t.msg.tx_free_wqe); + TAILQ_REMOVE(&tx->t.msg.tx_free_wqe, wqe, ms_link); + + wqe->ms_context = context; + wqe->ms_iov[0].iov_base = (void *)buf; + wqe->ms_iov[0].iov_len = len; + wqe->ms_last_iov = 0; + + wqe->ms_cur_iov = 0; + wqe->ms_cur_ptr = buf; + wqe->ms_iov_resid = len; + wqe->ms_resid = len; + wqe->ms_length = len; + + /* add send to EP, and add EP to TX list if not present */ + TAILQ_INSERT_TAIL(&ep->e.msg.ep_posted_wqe, wqe, ms_link); + usdf_msg_ep_ready(ep); + + pthread_spin_unlock(&udp->dom_progress_lock); + + usdf_domain_progress(udp); + + 
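Both usdf_msg_recv() and usdf_msg_send() above follow the same non-blocking pattern: take a queue entry from a per-context free list, return -FI_EAGAIN when none is available so the caller can retry after completions have been reaped, otherwise fill the entry and move it onto the posted list under the domain progress lock. A stripped-down sketch of that pattern, with a hypothetical entry type and an ordinary pthread mutex standing in for the provider's spinlock:

#include <errno.h>
#include <pthread.h>
#include <sys/queue.h>

struct entry {
	void *context;
	TAILQ_ENTRY(entry) link;
};

TAILQ_HEAD(entry_list, entry);

/* Move one entry from the free list to the posted list, or ask for a retry. */
static int post_one(struct entry_list *free_list, struct entry_list *posted,
		pthread_mutex_t *lock, void *context)
{
	struct entry *e;

	if (TAILQ_EMPTY(free_list))
		return -EAGAIN;		/* caller retries after progressing */

	pthread_mutex_lock(lock);
	e = TAILQ_FIRST(free_list);
	TAILQ_REMOVE(free_list, e, link);
	e->context = context;
	TAILQ_INSERT_TAIL(posted, e, link);
	pthread_mutex_unlock(lock);

	return 0;
}

The emptiness check outside the lock mirrors the provider's fast path; a stricter variant would recheck under the lock before dequeuing.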
return 0; } ssize_t -usdf_msg_senddata(struct fid_ep *ep, const void *buf, size_t len, - void *desc, uint64_t data, fi_addr_t dest_addr, void *context) +usdf_msg_senddata(struct fid_ep *ep, const void *buf, size_t len, void *desc, + uint64_t data, fi_addr_t dest_addr, void *context) { return -FI_ENOSYS; } @@ -183,60 +283,640 @@ usdf_msg_recvmsg(struct fid_ep *ep, const struct fi_msg *msg, uint64_t flags) return -FI_ENOSYS; } -/* - * Versions that rely on user to reserve space for header at start of buffer - */ -ssize_t -usdf_msg_prefix_recv(struct fid_ep *fep, void *buf, size_t len, - void *desc, fi_addr_t src_addr, void *context) +static void +usdf_msg_send_complete(struct usdf_ep *ep, struct usdf_msg_qe *wqe) { - struct usdf_ep *ep; - struct usd_qp_impl *qp; - struct usd_recv_desc rxd; - uint32_t index; + TAILQ_REMOVE(&ep->e.msg.ep_posted_wqe, wqe, ms_link); - ep = ep_ftou(fep); - qp = to_qpi(ep->ep_qp); - - index = qp->uq_rq.urq_post_index; - rxd.urd_context = context; - rxd.urd_iov[0].iov_base = (uint8_t *)buf + - USDF_HDR_BUF_ENTRY - sizeof(struct usd_udp_hdr); - rxd.urd_iov[0].iov_len = len; - rxd.urd_iov_cnt = 1; - rxd.urd_next = NULL; - - ep->ep_hdr_ptr[index] = rxd.urd_iov[0].iov_base; - - return usd_post_recv(ep->ep_qp, &rxd); + wqe->ms_last_seq = ep->e.msg.ep_next_tx_seq - 1; + TAILQ_INSERT_TAIL(&ep->e.msg.ep_sent_wqe, wqe, ms_link); } -ssize_t -usdf_msg_prefix_recvv(struct fid_ep *fep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, void *context) +static inline void +usdf_msg_send_segment(struct usdf_tx *tx, struct usdf_ep *ep) { - struct usdf_ep *ep; - struct usd_recv_desc rxd; - struct usd_qp_impl *qp; + struct usdf_msg_qe *msg; + struct rudp_pkt *hdr; + struct usd_wq *wq; uint32_t index; - int i; + size_t cur_iov; + size_t cur_resid; + size_t resid; + const uint8_t *cur_ptr; + const uint8_t *send_ptr; + size_t sge_len; + uint8_t *ptr; + struct usd_wq_post_info *info; - ep = ep_ftou(fep); - qp = to_qpi(ep->ep_qp); + msg = TAILQ_FIRST(&ep->e.msg.ep_posted_wqe); + wq = &(to_qpi(tx->tx_qp)->uq_wq); - rxd.urd_context = context; - memcpy(&rxd.urd_iov[0], iov, sizeof(*iov) * count); - rxd.urd_iov[0].iov_base = (uint8_t *)rxd.urd_iov[0].iov_base + - USDF_HDR_BUF_ENTRY - sizeof(struct usd_udp_hdr); + index = wq->uwq_post_index; + hdr = (struct rudp_pkt *)(wq->uwq_copybuf + index * USD_SEND_MAX_COPY); - rxd.urd_iov_cnt = count; - rxd.urd_next = NULL; + memcpy(hdr, &ep->e.msg.ep_dest->ds_dest.ds_udp.u_hdr, + sizeof(struct usd_udp_hdr)); + hdr->msg.src_peer_id = htons(ep->e.msg.ep_lcl_peer_id); - index = qp->uq_rq.urq_post_index; - for (i = 0; i < count; ++i) { - ep->ep_hdr_ptr[index] = rxd.urd_iov[0].iov_base; - index = (index + 1) & qp->uq_rq.urq_post_index_mask; + resid = msg->ms_resid; + cur_iov = msg->ms_cur_iov; + cur_ptr = msg->ms_cur_ptr; + cur_resid = msg->ms_iov_resid; + + /* save first seq for message */ + if (cur_iov == 0 && cur_resid == msg->ms_iov[0].iov_len) { + msg->ms_first_seq = ep->e.msg.ep_next_tx_seq; } - return usd_post_recv(ep->ep_qp, &rxd); + if (resid < USD_SEND_MAX_COPY - sizeof(*hdr)) { + hdr->msg.opcode = htons(RUDP_OP_LAST); + hdr->msg.m.rc_data.length = htons(resid); + hdr->msg.m.rc_data.seqno = htons(ep->e.msg.ep_next_tx_seq); + ++ep->e.msg.ep_next_tx_seq; + + ptr = (uint8_t *)(hdr + 1); + while (resid > 0) { + memcpy(ptr, cur_ptr, cur_resid); + ptr += msg->ms_iov_resid; + resid -= msg->ms_iov_resid; + ++cur_iov; + cur_ptr = msg->ms_iov[cur_iov].iov_base; + cur_resid = msg->ms_iov[cur_iov].iov_len; + } + + /* add packet lengths 
*/ + sge_len = resid; + hdr->hdr.uh_ip.tot_len = htons( + sge_len + sizeof(struct rudp_pkt) - + sizeof(struct ether_header)); + hdr->hdr.uh_udp.len = htons( + (sizeof(struct rudp_pkt) - + sizeof(struct ether_header) - + sizeof(struct iphdr)) + sge_len); + + index = _usd_post_send_one(wq, hdr, + resid + sizeof(*hdr), 1); + } else { + struct vnic_wq *vwq; + u_int8_t offload_mode = 0, eop; + u_int16_t mss = 7, header_length = 0, vlan_tag = 0; + u_int8_t vlan_tag_insert = 0, loopback = 0, fcoe_encap = 0; + struct wq_enet_desc *desc; + size_t space; + size_t num_sge; + size_t sent; + + vwq = &wq->uwq_vnic_wq; + desc = wq->uwq_next_desc; + space = ep->ep_domain->dom_fabric->fab_dev_attrs->uda_mtu - + sizeof(*hdr); + num_sge = 1; + + /* encode header desc */ + eop = 0; + wq_enet_desc_enc(desc, (uintptr_t)hdr, sizeof(*hdr), + mss, header_length, offload_mode, eop, 0, fcoe_encap, + vlan_tag_insert, vlan_tag, loopback); + + do { + desc = (struct wq_enet_desc *) + ((uintptr_t)wq->uwq_desc_ring + (index << 4)); + index = (index + 1) & wq->uwq_post_index_mask; + + send_ptr = cur_ptr; + if (cur_resid >= space) { + sge_len = space; + eop = 1; + cur_resid -= sge_len; + cur_ptr += sge_len; + } else { + sge_len = cur_resid; + if (num_sge == USDF_MSG_MAX_SGE - 1 || + cur_resid == resid) { + eop = 1; + } + ++cur_iov; + cur_ptr = msg->ms_iov[cur_iov].iov_base; + cur_resid = msg->ms_iov[cur_iov].iov_len; + } + + wq_enet_desc_enc(desc, (uintptr_t)send_ptr, sge_len, + mss, header_length, offload_mode, eop, eop, + fcoe_encap, vlan_tag_insert, + vlan_tag, loopback); + + ++num_sge; + space -= sge_len; + resid -= sge_len; + } while (space > 0 && num_sge <= USDF_MSG_MAX_SGE && resid > 0); + + /* add packet lengths */ + sent = ep->ep_domain->dom_fabric->fab_dev_attrs->uda_mtu - + space; + hdr->hdr.uh_ip.tot_len = htons( + sent + sizeof(struct rudp_pkt) - + sizeof(struct ether_header)); + hdr->hdr.uh_udp.len = htons( + (sizeof(struct rudp_pkt) - + sizeof(struct ether_header) - + sizeof(struct iphdr)) + sent); +if (0) { +if ((random() % 177) == 0 && resid == 0) { + hdr->hdr.uh_eth.ether_type = 0; +//printf("BORK seq %u\n", ep->e.msg.ep_next_tx_seq); +} +} + + if (resid == 0) { + hdr->msg.opcode = htons(RUDP_OP_LAST); + } else { + hdr->msg.opcode = htons(RUDP_OP_FIRST); + } + hdr->msg.m.rc_data.length = htons(sent); + hdr->msg.m.rc_data.seqno = htons(ep->e.msg.ep_next_tx_seq); + ++ep->e.msg.ep_next_tx_seq; + + wmb(); + iowrite64(index, &vwq->ctrl->posted_index); + + wq->uwq_next_desc = (struct wq_enet_desc *) + ((uintptr_t)wq->uwq_desc_ring + (index << 4)); + wq->uwq_post_index = (index + 1) & wq->uwq_post_index_mask; + wq->uwq_send_credits -= num_sge; + } + + info = &wq->uwq_post_info[index]; + info->wp_context = tx; + info->wp_len = sge_len; + + /* If send complete, remove from send list */ + if (resid == 0) { + usdf_msg_send_complete(ep, msg); + } else { + msg->ms_resid = resid; + msg->ms_iov_resid = cur_resid; + msg->ms_cur_iov = cur_iov; + msg->ms_cur_ptr = cur_ptr; + } + + /* set ACK timer */ + usdf_timer_set(ep->ep_domain->dom_fabric, ep->e.msg.ep_ack_timer, + USDF_RUDP_ACK_TIMEOUT); +} + +static inline void +usdf_msg_send_ack(struct usdf_tx *tx, struct usdf_ep *ep) +{ + struct rudp_pkt *hdr; + struct usd_wq *wq; + uint32_t last_post; + struct usd_wq_post_info *info; + uint16_t seq; + + wq = &(to_qpi(tx->tx_qp)->uq_wq); + + hdr = (struct rudp_pkt *) (wq->uwq_copybuf + + wq->uwq_post_index * USD_SEND_MAX_COPY); + + memcpy(hdr, &ep->e.msg.ep_dest->ds_dest.ds_udp.u_hdr, + sizeof(struct usd_udp_hdr)); + + 
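The "add packet lengths" blocks here and in the data path above recompute two on-wire length fields for every frame: the IP total length (the IP header and everything after it) and the UDP length (the UDP header plus its payload); the Ethernet header is counted by neither, which is why sizeof(struct ether_header) is subtracted. A hypothetical helper showing the same bookkeeping with the standard Linux netinet structs, assuming a plain UDP payload (in the patch, the RUDP message header is simply part of that payload as far as IP and UDP are concerned):

#include <stddef.h>
#include <arpa/inet.h>
#include <netinet/ip.h>
#include <netinet/udp.h>

/* Fill the length fields for a frame carrying 'payload' bytes after UDP. */
static void set_lengths(struct iphdr *ip, struct udphdr *udp, size_t payload)
{
	/* IP total length: IP header + UDP header + payload */
	ip->tot_len = htons(sizeof(*ip) + sizeof(*udp) + payload);
	/* UDP length: UDP header + payload */
	udp->len = htons(sizeof(*udp) + payload);
}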
hdr->msg.src_peer_id = htons(ep->e.msg.ep_lcl_peer_id); + if (ep->e.msg.ep_send_nak) { + hdr->msg.opcode = htons(RUDP_OP_NAK); + seq = ep->e.msg.ep_next_rx_seq; + hdr->msg.m.nak.nak_seq = htons(seq); + ep->e.msg.ep_send_nak = 0; + } else { + hdr->msg.opcode = htons(RUDP_OP_ACK); + seq = ep->e.msg.ep_next_rx_seq - 1; + hdr->msg.m.ack.ack_seq = htons(seq); + } + + /* add packet lengths */ + hdr->hdr.uh_ip.tot_len = htons( + sizeof(struct rudp_pkt) - + sizeof(struct ether_header)); + hdr->hdr.uh_udp.len = htons(sizeof(struct rudp_pkt) - + sizeof(struct ether_header) - sizeof(struct iphdr)); + + last_post = _usd_post_send_one(wq, hdr, sizeof(*hdr), 1); + + info = &wq->uwq_post_info[last_post]; + info->wp_context = tx; + info->wp_len = 0; +} + +/* + * If this TX has sends to do and is not on domain ready list, then + * this completion means we can go back on the domain ready list + */ +static void +usdf_msg_send_completion(struct usd_completion *comp) +{ + struct usdf_tx *tx; + + tx = comp->uc_context; + + if (!TAILQ_EMPTY(&tx->t.msg.tx_ep_ready) && + !TAILQ_ON_LIST(tx, tx_link)) { + TAILQ_INSERT_TAIL(&tx->tx_domain->dom_tx_ready, tx, tx_link); + } +} + +/* + * Keep progressing sends on this queue until: + * a) no more send credits on the queue (it's full) + * or + * b) all endpoints are complete or blocked awaiting ACKs + */ +void +usdf_msg_tx_progress(struct usdf_tx *tx) +{ + struct usdf_ep *ep; + struct usd_qp_impl *qp; + + qp = to_qpi(tx->tx_qp); + while (qp->uq_wq.uwq_send_credits > 1 && + !TAILQ_EMPTY(&tx->t.msg.tx_ep_have_acks)) { + ep = TAILQ_FIRST(&tx->t.msg.tx_ep_have_acks); + TAILQ_REMOVE_MARK(&tx->t.msg.tx_ep_have_acks, + ep, e.msg.ep_ack_link); + + usdf_msg_send_ack(tx, ep); + } + + while (qp->uq_wq.uwq_send_credits > 1 && + !TAILQ_EMPTY(&tx->t.msg.tx_ep_ready)) { + ep = TAILQ_FIRST(&tx->t.msg.tx_ep_ready); + + /* + * Send next segment on this EP. 
This will also remove the + * current send from the EP send list if it completes + */ + usdf_msg_send_segment(tx, ep); + + --ep->e.msg.ep_seq_credits; + if (TAILQ_EMPTY(&ep->e.msg.ep_posted_wqe)) { + TAILQ_REMOVE_MARK(&tx->t.msg.tx_ep_ready, + ep, e.msg.ep_link); + } else { + --ep->e.msg.ep_fairness_credits; + if (ep->e.msg.ep_seq_credits == 0) { + TAILQ_REMOVE_MARK(&tx->t.msg.tx_ep_ready, + ep, e.msg.ep_link); + ep->e.msg.ep_fairness_credits = + USDF_MSG_FAIRNESS_CREDITS; + + /* fairness credits exhausted, go to back of the line */ + } else if (ep->e.msg.ep_fairness_credits == 0) { + TAILQ_REMOVE(&tx->t.msg.tx_ep_ready, + ep, e.msg.ep_link); + TAILQ_INSERT_TAIL(&tx->t.msg.tx_ep_ready, + ep, e.msg.ep_link); + ep->e.msg.ep_fairness_credits = + USDF_MSG_FAIRNESS_CREDITS; + } + } + } +} + +static void inline +usdf_msg_recv_complete(struct usdf_ep *ep, struct usdf_msg_qe *rqe) +{ + struct usdf_cq_hard *hcq; + + hcq = ep->ep_rx->r.msg.rx_hcq; + hcq->cqh_post(hcq, rqe->ms_context, rqe->ms_length); + + TAILQ_INSERT_HEAD(&ep->ep_rx->r.msg.rx_free_rqe, rqe, ms_link); +} + +static inline void +usdf_msg_ep_has_ack(struct usdf_ep *ep) +{ + struct usdf_tx *tx; + struct usdf_domain *udp; + + if (!TAILQ_ON_LIST(ep, e.msg.ep_ack_link)) { + tx = ep->ep_tx; + udp = ep->ep_domain; + TAILQ_INSERT_TAIL(&tx->t.msg.tx_ep_have_acks, ep, + e.msg.ep_ack_link); + /* Add TX to domain list if not present */ + if (!TAILQ_ON_LIST(tx, tx_link)) { + TAILQ_INSERT_TAIL(&udp->dom_tx_ready, tx, tx_link); + } + + } +} + +static inline int +usdf_msg_check_seq(struct usdf_ep *ep, struct rudp_pkt *pkt) +{ + uint16_t seq; + int ret; + + seq = ntohs(pkt->msg.m.rc_data.seqno); + + /* Drop bad seq, send NAK if seq from the future */ + if (seq != ep->e.msg.ep_next_rx_seq) { + if (RUDP_SEQ_GT(seq, ep->e.msg.ep_next_rx_seq)) { + ep->e.msg.ep_send_nak = 1; + } + ret = -1; + } else { + ++ep->e.msg.ep_next_rx_seq; + ret = 0; + } + usdf_msg_ep_has_ack(ep); + + return ret; +} + +static inline void +usdf_msg_process_ack(struct usdf_ep *ep, uint16_t seq) +{ + struct usdf_cq_hard *hcq; + struct usdf_msg_qe *wqe; + uint16_t max_ack; + unsigned credits; + + /* don't try to ACK what we don't think we've sent */ + max_ack = ep->e.msg.ep_next_tx_seq - 1; + if (RUDP_SEQ_GT(seq, max_ack)) { + seq = max_ack; + } + + hcq = ep->ep_tx->t.msg.tx_hcq; + while (!TAILQ_EMPTY(&ep->e.msg.ep_sent_wqe)) { + wqe = TAILQ_FIRST(&ep->e.msg.ep_sent_wqe); + if (RUDP_SEQ_LE(wqe->ms_last_seq, seq)) { + TAILQ_REMOVE(&ep->e.msg.ep_sent_wqe, wqe, ms_link); + hcq->cqh_post(hcq, wqe->ms_context, wqe->ms_length); + + TAILQ_INSERT_HEAD(&ep->ep_tx->t.msg.tx_free_wqe, + wqe, ms_link); + } else { + break; + } + } + + credits = RUDP_SEQ_DIFF(seq, ep->e.msg.ep_last_rx_ack); + if (ep->e.msg.ep_seq_credits == 0 && credits > 0 && + !TAILQ_EMPTY(&ep->e.msg.ep_posted_wqe)) { + usdf_msg_ep_ready(ep); + } + ep->e.msg.ep_seq_credits += credits; + ep->e.msg.ep_last_rx_ack = seq; + + /* If all ACKed, cancel timer, else reset it */ + if (seq == max_ack) { + usdf_timer_cancel(ep->ep_domain->dom_fabric, + ep->e.msg.ep_ack_timer); + } else { + usdf_timer_reset(ep->ep_domain->dom_fabric, + ep->e.msg.ep_ack_timer, USDF_RUDP_ACK_TIMEOUT); + } +} + +static inline void +usdf_process_nak(struct usdf_ep *ep, uint16_t seq) +{ + struct usdf_msg_qe *wqe; + size_t rewind; + + /* Ignore NAKs of future packets */ + if (RUDP_SEQ_GE(seq, ep->e.msg.ep_next_tx_seq)) { + return; + } + + /* + * Move any WQEs that contain NAKed sequences back to the + * posted list. 
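usdf_msg_check_seq(), usdf_msg_process_ack(), and usdf_process_nak() all compare 16-bit RUDP sequence numbers that are allowed to wrap, using the RUDP_SEQ_GT/GE/LE/DIFF macros from usdf_rudp.h (not shown in this hunk). A common way to implement that kind of wraparound-safe comparison, given here only as an illustrative sketch rather than the macros' actual definition:

#include <stdint.h>

/* Signed distance from b to a on the 16-bit circle: > 0 when a is "after" b. */
static inline int16_t seq_delta(uint16_t a, uint16_t b)
{
	return (int16_t)(a - b);
}

static inline int seq_gt(uint16_t a, uint16_t b) { return seq_delta(a, b) > 0; }
static inline int seq_ge(uint16_t a, uint16_t b) { return seq_delta(a, b) >= 0; }
static inline int seq_le(uint16_t a, uint16_t b) { return seq_delta(a, b) <= 0; }

With this scheme 0x0001 still compares as coming after 0xfffe, so ACK and NAK handling keep working across a counter wrap.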
We set ms_resid == 0 here because final set to zero + * is optimized out of the fastpath + */ + while (!TAILQ_EMPTY(&ep->e.msg.ep_sent_wqe)) { + wqe = TAILQ_LAST(&ep->e.msg.ep_sent_wqe, usdf_msg_qe_head); + TAILQ_REMOVE(&ep->e.msg.ep_sent_wqe, wqe, ms_link); + wqe->ms_resid = 0; + TAILQ_INSERT_HEAD(&ep->e.msg.ep_posted_wqe, wqe, ms_link); + } + wqe = TAILQ_FIRST(&ep->e.msg.ep_posted_wqe); + + /* reset WQE to old sequence # */ + if (wqe->ms_resid == 0) { + rewind = RUDP_SEQ_DIFF(wqe->ms_last_seq, seq) + 1; + } else { + rewind = RUDP_SEQ_DIFF(ep->e.msg.ep_next_tx_seq, seq); + } + if (rewind > 0) { + ep->e.msg.ep_seq_credits = USDF_RUDP_SEQ_CREDITS; + ep->e.msg.ep_next_tx_seq = seq; + + usdf_msg_rewind_qe(wqe, rewind, + ep->ep_domain->dom_fabric->fab_dev_attrs->uda_mtu - + sizeof(struct rudp_pkt)); + + usdf_msg_ep_ready(ep); + } +} + +void +usdf_msg_ep_timeout(void *vep) +{ + struct usdf_ep *ep; + struct usdf_domain *udp; + uint16_t nak; + + ep = vep; + udp = ep->ep_domain; + + pthread_spin_lock(&udp->dom_progress_lock); + nak = ep->e.msg.ep_last_rx_ack + 1; + + usdf_process_nak(ep, nak); + pthread_spin_unlock(&udp->dom_progress_lock); +} + +static inline void +usdf_msg_rx_ack(struct usdf_ep *ep, struct rudp_pkt *pkt) +{ + uint16_t seq; + seq = ntohs(pkt->msg.m.ack.ack_seq); + usdf_msg_process_ack(ep, seq); +} + +static inline void +usdf_msg_rx_nak(struct usdf_ep *ep, struct rudp_pkt *pkt) +{ + uint16_t seq; + + seq = ntohs(pkt->msg.m.nak.nak_seq); + usdf_msg_process_ack(ep, seq); + + usdf_process_nak(ep, seq); +} + +/* + * Handle a receive on a queue servicing a message endpoint + */ +static inline void +usdf_msg_handle_recv(struct usdf_domain *udp, struct usd_completion *comp) +{ + struct rudp_pkt *pkt; + struct usdf_msg_qe *rqe; + struct usdf_ep *ep; + struct usd_qp *qp; + struct usdf_rx *rx; + uint32_t peer_id; + uint32_t opcode; + uint8_t *rx_ptr; + uint8_t *rqe_ptr; + size_t cur_iov; + size_t iov_resid; + size_t rxlen; + size_t copylen; + int ret; + + pkt = comp->uc_context; + opcode = ntohs(pkt->msg.opcode); + peer_id = ntohs(pkt->msg.src_peer_id); + if (peer_id > USDF_MAX_PEERS) { + qp = comp->uc_qp; + rx = qp->uq_context; + goto dropit; + } + ep = udp->dom_peer_tab[peer_id]; + if (ep == NULL) { + qp = comp->uc_qp; + rx = qp->uq_context; + goto dropit; + } + rx = ep->ep_rx; + + switch (opcode) { + case RUDP_OP_ACK: + usdf_msg_rx_ack(ep, pkt); + break; + + case RUDP_OP_NAK: + usdf_msg_rx_nak(ep, pkt); + break; + + case RUDP_OP_FIRST: + ret = usdf_msg_check_seq(ep, pkt); + if (ret == -1) { + goto dropit; + } + + rqe = ep->e.msg.ep_cur_recv; + if (rqe == NULL) { + if (TAILQ_EMPTY(&rx->r.msg.rx_posted_rqe)) { + goto dropit; + } + rqe = TAILQ_FIRST(&rx->r.msg.rx_posted_rqe); + TAILQ_REMOVE(&rx->r.msg.rx_posted_rqe, rqe, ms_link); + ep->e.msg.ep_cur_recv = rqe; + } + + rx_ptr = (uint8_t *)(pkt + 1); + rxlen = ntohs(pkt->msg.m.rc_data.length); + rqe->ms_length += rxlen; + rqe_ptr = (uint8_t *)rqe->ms_cur_ptr; + iov_resid = rqe->ms_iov_resid; + cur_iov = rqe->ms_cur_iov; + while (rxlen > 0) { + copylen = MIN(rxlen, iov_resid); + memcpy(rqe_ptr, rx_ptr, copylen); + rx_ptr += copylen; + rxlen -= copylen; + iov_resid -= copylen; + if (iov_resid == 0) { + if (cur_iov == rqe->ms_last_iov) { + break; + } + ++cur_iov; + rqe_ptr = rqe->ms_iov[cur_iov].iov_base; + iov_resid = rqe->ms_iov[cur_iov].iov_len; + } else { + rqe_ptr += copylen; + } + } + break; + + case RUDP_OP_LAST: + ret = usdf_msg_check_seq(ep, pkt); + if (ret == -1) { + goto dropit; + } + + rqe = ep->e.msg.ep_cur_recv; + if (rqe == NULL) 
{ + rqe = TAILQ_FIRST(&rx->r.msg.rx_posted_rqe); + if (rqe == NULL) { + goto dropit; + } + TAILQ_REMOVE(&rx->r.msg.rx_posted_rqe, rqe, ms_link); + ep->e.msg.ep_cur_recv = rqe; + } + + rx_ptr = (uint8_t *)(pkt + 1); + rxlen = ntohs(pkt->msg.m.rc_data.length); + rqe->ms_length += rxlen; + rqe_ptr = (uint8_t *)rqe->ms_cur_ptr; + iov_resid = rqe->ms_iov_resid; + cur_iov = rqe->ms_cur_iov; + while (rxlen > 0) { + copylen = MIN(rxlen, iov_resid); + memcpy(rqe_ptr, rx_ptr, copylen); + rx_ptr += copylen; + rxlen -= copylen; + iov_resid -= copylen; + if (iov_resid == 0) { + if (cur_iov == rqe->ms_last_iov) { + break; + } + ++cur_iov; + rqe_ptr = rqe->ms_iov[cur_iov].iov_base; + iov_resid = rqe->ms_iov[cur_iov].iov_len; + } else { + rqe_ptr += copylen; + } + } + if (rxlen > 0) { + rqe->ms_length -= rxlen; +/* printf("RQE truncated XXX\n"); */ + } else { + usdf_msg_recv_complete(ep, rqe); + } + break; + default: + break; + } + +dropit: + /* repost buffer */ + _usdf_msg_post_recv(rx, pkt, + rx->rx_domain->dom_fabric->fab_dev_attrs->uda_mtu); +} + +/* + * Process message completions + */ +void +usdf_msg_hcq_progress(struct usdf_cq_hard *hcq) +{ + struct usd_completion comp; + + while (usd_poll_cq(hcq->cqh_ucq, &comp) != -EAGAIN) { + switch (comp.uc_type) { + case USD_COMPTYPE_SEND: + usdf_msg_send_completion(&comp); + break; + case USD_COMPTYPE_RECV: + usdf_msg_handle_recv(hcq->cqh_cq->cq_domain, &comp); + break; + } + } } diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_msg.h b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_msg.h index 2e5e7378f6..09c7d3455e 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_msg.h +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_msg.h @@ -36,9 +36,54 @@ #ifndef _USDF_MSG_H_ #define _USDF_MSG_H_ +#define USDF_MSG_CAPS (FI_MSG | FI_SOURCE | FI_SEND | FI_RECV) + +#define USDF_MSG_SUPP_MODE (FI_LOCAL_MR) +#define USDF_MSG_REQ_MODE (FI_LOCAL_MR) + +#define USDF_MSG_MAX_SGE 8 +#define USDF_MSG_DFLT_SGE 8 +#define USDF_MSG_MAX_CTX_SIZE 1024 +#define USDF_MSG_DFLT_CTX_SIZE 128 + +#define USDF_MSG_MAX_MSG UINT_MAX + +#define USDF_MSG_FAIRNESS_CREDITS 16 + +#define USDF_MSG_RUDP_SEQ_CREDITS 256 + +struct usdf_msg_qe { + void *ms_context; + + struct iovec ms_iov[USDF_MSG_MAX_SGE]; + size_t ms_last_iov; + size_t ms_length; + + uint16_t ms_first_seq; + uint16_t ms_last_seq; + + size_t ms_cur_iov; + const uint8_t *ms_cur_ptr; + size_t ms_resid; /* amount remaining in entire msg */ + size_t ms_iov_resid; /* amount remaining in current iov */ + + TAILQ_ENTRY(usdf_msg_qe) ms_link; +}; + +int usdf_msg_post_recv(struct usdf_rx *rx, void *buf, size_t len); +int usdf_msg_fill_tx_attr(struct fi_tx_attr *txattr); +int usdf_msg_fill_rx_attr(struct fi_rx_attr *rxattr); +int usdf_cq_msg_poll(struct usd_cq *ucq, struct usd_completion *comp); +void usdf_msg_ep_timeout(void *vep); + +void usdf_msg_hcq_progress(struct usdf_cq_hard *hcq); +void usdf_msg_tx_progress(struct usdf_tx *tx); + + /* fi_ops_cm for RC */ int usdf_cm_msg_connect(struct fid_ep *ep, const void *addr, const void *param, size_t paramlen); +int usdf_cm_msg_accept(struct fid_ep *fep, const void *param, size_t paramlen); int usdf_cm_msg_shutdown(struct fid_ep *ep, uint64_t flags); /* fi_ops_msg for RC */ diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_pep.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_pep.c index a9cab5dc11..b3647deecd 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_pep.c +++ 
b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_pep.c @@ -58,10 +58,12 @@ #include "fi.h" #include "fi_enosys.h" +#include "fi_usnic.h" #include "usnic_direct.h" #include "usd.h" #include "usdf.h" #include "usdf_cm.h" +#include "usdf_msg.h" int usdf_pep_bind(fid_t fid, fid_t bfid, uint64_t flags) @@ -73,7 +75,6 @@ usdf_pep_bind(fid_t fid, fid_t bfid, uint64_t flags) switch (bfid->fclass) { case FI_CLASS_EQ: -printf("bind EQ!\n"); if (pep->pep_eq != NULL) { return -FI_EINVAL; } @@ -88,12 +89,100 @@ printf("bind EQ!\n"); return 0; } -/* - * Report an error to the PEP's EQ - */ -static void -usdf_pep_accept_error(struct usdf_pep *pep, int error) +static struct fi_info * +usdf_pep_conn_info(struct usdf_connreq *crp) { + struct fi_info *ip; + struct usdf_pep *pep; + struct sockaddr_in *sin; + struct usdf_fabric *fp; + struct usdf_domain *udp; + struct usd_device_attrs *dap; + struct usdf_connreq_msg *reqp; + + pep = crp->cr_pep; + fp = pep->pep_fabric; + udp = LIST_FIRST(&fp->fab_domain_list); + dap = fp->fab_dev_attrs; + reqp = (struct usdf_connreq_msg *)crp->cr_data; + + /* If there is a domain, just copy info from there */ + if (udp != NULL) { + ip = fi_dupinfo(udp->dom_info); + if (ip == NULL) { + return NULL; + } + + /* no domains yet, make an info suitable for creating one */ + } else { + ip = fi_allocinfo_internal(); + if (ip == NULL) { + return NULL; + } + + ip->caps = USDF_MSG_CAPS; + ip->mode = USDF_MSG_SUPP_MODE; + ip->ep_type = FI_EP_MSG; + + ip->addr_format = FI_SOCKADDR_IN; + ip->src_addrlen = sizeof(struct sockaddr_in); + sin = calloc(1, ip->src_addrlen); + if (sin == NULL) { + goto fail; + } + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = dap->uda_ipaddr_be; + ip->src_addr = sin; + + ip->ep_attr->protocol = FI_PROTO_RUDP; + + ip->fabric_attr->fabric = fab_utof(fp); + ip->fabric_attr->name = strdup(fp->fab_attr.name); + ip->fabric_attr->prov_name = strdup(fp->fab_attr.prov_name); + ip->fabric_attr->prov_version = fp->fab_attr.prov_version; + if (ip->fabric_attr->name == NULL || + ip->fabric_attr->prov_name == NULL) { + goto fail; + } + } + + /* fill in dest addr */ + ip->dest_addrlen = ip->src_addrlen; + sin = calloc(1, ip->dest_addrlen); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = reqp->creq_ipaddr; + sin->sin_port = reqp->creq_port; + + ip->connreq = crp; + return ip; +fail: + fi_freeinfo(ip); + return NULL; +} + +/* + * Remove connection request from epoll list if not done already. 
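The passive endpoint drives every connection-request socket through the fabric's single epoll fd: accept() hands back a socket, a poll item carrying a callback is stored in ev.data.ptr when the fd is added, and the fd is removed again with EPOLL_CTL_DEL once the request has been read completely. A self-contained sketch of that registration pattern, using a hypothetical poll_item type rather than the provider's struct usdf_poll_item:

#include <errno.h>
#include <sys/epoll.h>

struct poll_item {
	int (*cb)(void *arg);		/* run when the fd becomes readable */
	void *arg;
};

/* Register fd for read events, stashing the item for the wait loop. */
static int poll_item_add(int epfd, int fd, struct poll_item *item)
{
	struct epoll_event ev = { .events = EPOLLIN, .data.ptr = item };

	if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) == -1)
		return -errno;
	return 0;
}

/* Deregister fd once its request has been consumed (or has failed). */
static int poll_item_del(int epfd, int fd)
{
	if (epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL) == -1)
		return -errno;
	return 0;
}

epoll_wait() returns ev.data.ptr unchanged, so the progression thread only has to cast it back and invoke item->cb(item->arg).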
+ * crp->cr_pollitem.pi_rtn is non-NULL when epoll() is active + */ +static int +usdf_pep_creq_epoll_del(struct usdf_connreq *crp) +{ + int ret; + struct usdf_pep *pep; + + pep = crp->cr_pep; + + if (crp->cr_pollitem.pi_rtn != NULL) { + ret = epoll_ctl(pep->pep_fabric->fab_epollfd, EPOLL_CTL_DEL, + crp->cr_sockfd, NULL); + crp->cr_pollitem.pi_rtn = NULL; + if (ret != 0) { + ret = -errno; + } + } else { + ret = 0; + } + return ret; } static int @@ -102,6 +191,9 @@ usdf_pep_read_connreq(void *v) struct usdf_connreq *crp; struct usdf_pep *pep; struct usdf_connreq_msg *reqp; + struct fi_eq_cm_entry *entry; + size_t entry_len; + int ret; int n; crp = v; @@ -109,25 +201,51 @@ usdf_pep_read_connreq(void *v) n = read(crp->cr_sockfd, crp->cr_ptr, crp->cr_resid); if (n == -1) { - usdf_pep_accept_error(pep, -errno); - // XXX DEL epoll item - close(crp->cr_sockfd); - TAILQ_REMOVE(&pep->pep_cr_pending, crp, cr_link); + usdf_cm_msg_connreq_failed(crp, -errno); return 0; } crp->cr_ptr += n; crp->cr_resid -= n; + reqp = (struct usdf_connreq_msg *)crp->cr_data; + if (crp->cr_resid == 0 && crp->cr_ptr == crp->cr_data + sizeof(*reqp)) { - reqp = (struct usdf_connreq_msg *)crp->cr_data; - crp->cr_resid = ntohl(reqp->creq_data_len); + reqp->creq_datalen = ntohl(reqp->creq_datalen); + crp->cr_resid = reqp->creq_datalen; } /* if resid is 0 now, completely done */ if (crp->cr_resid == 0) { - // DEL epoll_wait - // create CONNREQ EQ entry + ret = usdf_pep_creq_epoll_del(crp); + if (ret != 0) { + usdf_cm_msg_connreq_failed(crp, ret); + return 0; + } + + /* create CONNREQ EQ entry */ + entry_len = sizeof(*entry) + reqp->creq_datalen; + entry = malloc(entry_len); + if (entry == NULL) { + usdf_cm_msg_connreq_failed(crp, -errno); + return 0; + } + + entry->fid = &pep->pep_fid.fid; + entry->info = usdf_pep_conn_info(crp); + if (entry->info == NULL) { + free(entry); + usdf_cm_msg_connreq_failed(crp, -FI_ENOMEM); + return 0; + } + memcpy(entry->data, reqp->creq_data, reqp->creq_datalen); + ret = usdf_eq_write_internal(pep->pep_eq, FI_CONNREQ, entry, + entry_len, 0); + free(entry); + if (ret != entry_len) { + usdf_cm_msg_connreq_failed(crp, ret); + return 0; + } } return 0; @@ -149,21 +267,21 @@ usdf_pep_listen_cb(void *v) socklen = sizeof(sin); s = accept(pep->pep_sock, &sin, &socklen); if (s == -1) { - usdf_pep_accept_error(pep, -errno); + /* ignore early failure */ return 0; } -printf("connreq on %p, s = %d (%x)!\n", pep, s, sin.sin_addr.s_addr); crp = NULL; pthread_spin_lock(&pep->pep_cr_lock); if (!TAILQ_EMPTY(&pep->pep_cr_free)) { crp = TAILQ_FIRST(&pep->pep_cr_free); - TAILQ_REMOVE(&pep->pep_cr_free, crp, cr_link); + TAILQ_REMOVE_MARK(&pep->pep_cr_free, crp, cr_link); + TAILQ_NEXT(crp, cr_link) = NULL; } pthread_spin_unlock(&pep->pep_cr_lock); /* no room for request, just drop it */ if (crp == NULL) { - // send response? + /* XXX send response? 
*/ close(s); return 0; } @@ -181,9 +299,8 @@ printf("connreq on %p, s = %d (%x)!\n", pep, s, sin.sin_addr.s_addr); ret = epoll_ctl(pep->pep_fabric->fab_epollfd, EPOLL_CTL_ADD, crp->cr_sockfd, &ev); if (ret == -1) { - usdf_pep_accept_error(pep, -errno); - close(crp->cr_sockfd); - TAILQ_INSERT_TAIL(&pep->pep_cr_free, crp, cr_link); + crp->cr_pollitem.pi_rtn = NULL; + usdf_cm_msg_connreq_failed(crp, -errno); return 0; } @@ -208,7 +325,7 @@ usdf_pep_listen(struct fid_pep *fpep) ret = -errno; } - pep->pep_pollitem.pi_rtn = &usdf_pep_listen_cb; + pep->pep_pollitem.pi_rtn = usdf_pep_listen_cb; pep->pep_pollitem.pi_context = pep; ev.events = EPOLLIN; ev.data.ptr = &pep->pep_pollitem; @@ -226,12 +343,6 @@ usdf_pep_cancel(fid_t fid, void *context) return -FI_EINVAL; } -int -usdf_pep_accept(struct fid_ep *ep, const void *param, size_t paramlen) -{ - return 0; -} - int usdf_pep_reject(struct fid_pep *pep, fi_connreq_t connreq, const void *param, size_t paramlen) @@ -264,9 +375,6 @@ usdf_pep_grow_backlog(struct usdf_pep *pep) size_t extra; extra = sizeof(struct usdf_connreq_msg) + pep->pep_cr_max_data; - if (extra < sizeof(struct usdf_connresp_msg)) { - extra = sizeof(struct usdf_connresp_msg); - } while (pep->pep_cr_alloced < pep->pep_backlog) { crp = calloc(1, sizeof(*crp) + extra); @@ -316,6 +424,8 @@ static struct fi_ops_ep usdf_pep_base_ops = { .cancel = usdf_pep_cancel, .getopt = fi_no_getopt, .setopt = fi_no_setopt, + .tx_ctx = fi_no_tx_ctx, + .rx_ctx = fi_no_rx_ctx, }; static struct fi_ops_cm usdf_pep_cm_ops = { @@ -324,7 +434,7 @@ static struct fi_ops_cm usdf_pep_cm_ops = { .getpeer = fi_no_getpeer, .connect = fi_no_connect, .listen = usdf_pep_listen, - .accept = usdf_pep_accept, + .accept = fi_no_accept, .reject = usdf_pep_reject, .shutdown = fi_no_shutdown, .join = fi_no_join, diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_progress.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_progress.c index d8c84cd44b..80ee0a3584 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_progress.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_progress.c @@ -111,7 +111,7 @@ usdf_fabric_progression_thread(void *v) } n = epoll_wait(epfd, &ev, 1, sleep_time); - if (n == -1) { + if (fp->fab_exit || (n == -1 && errno != EINTR)) { pthread_exit(NULL); } @@ -126,9 +126,31 @@ usdf_fabric_progression_thread(void *v) /* call timer progress each wakeup */ usdf_timer_progress(fp); - - if (fp->fab_exit) { - pthread_exit(NULL); - } } } + +/* + * Progress operations in this domain + */ +void +usdf_domain_progress(struct usdf_domain *udp) +{ + struct usdf_tx *tx; + struct usdf_cq_hard *hcq; + + /* one big hammer lock... 
*/ + pthread_spin_lock(&udp->dom_progress_lock); + + TAILQ_FOREACH(hcq, &udp->dom_hcq_list, cqh_dom_link) { + hcq->cqh_progress(hcq); + } + + while (!TAILQ_EMPTY(&udp->dom_tx_ready)) { + tx = TAILQ_FIRST(&udp->dom_tx_ready); + TAILQ_REMOVE_MARK(&udp->dom_tx_ready, tx, tx_link); + + tx->tx_progress(tx); + } + + pthread_spin_unlock(&udp->dom_progress_lock); +} diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_progress.h b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_progress.h index 213bc05c6d..5ac184fa00 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_progress.h +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_progress.h @@ -42,9 +42,11 @@ struct usdf_poll_item { }; struct usdf_fabric; +struct usdf_domain; void *usdf_fabric_progression_thread(void *v); int usdf_fabric_wake_thread(struct usdf_fabric *fp); int usdf_fabric_progression_cb(void *v); +void usdf_domain_progress(struct usdf_domain *udp); #endif /* _USDF_PROGRESS_H_ */ diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_rdm.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_rdm.c new file mode 100644 index 0000000000..1b5199ac0a --- /dev/null +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_rdm.c @@ -0,0 +1,1324 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "fi.h" + +#include "usd.h" +#include "usd_post.h" + +#include "usdf.h" +#include "usdf_rudp.h" +#include "usdf_rdm.h" +#include "usdf_timer.h" +#include "usdf_av.h" +#include "usdf_progress.h" + +#define PRINTF if (0) printf + +static inline void +usdf_rdm_rdc_ready(struct usdf_rdm_connection *rdc, struct usdf_tx *tx) +{ + /* skip if we have pending send messages */ + if (!TAILQ_EMPTY(&rdc->dc_wqe_sent)) { +PRINTF("SKIP rdc %p ready due to pending wqe\n", rdc); + return; + } + if (!TAILQ_ON_LIST(rdc, dc_tx_link)) { + rdc->dc_fairness_credits = USDF_RDM_FAIRNESS_CREDITS; + TAILQ_INSERT_TAIL(&tx->t.rdm.tx_rdc_ready, rdc, dc_tx_link); + + /* Make sure TX is on domain ready list */ + if (!TAILQ_ON_LIST(tx, tx_link)) { + TAILQ_INSERT_TAIL(&tx->tx_domain->dom_tx_ready, + tx, tx_link); + } + } +else PRINTF("RDC %p already on list\n", rdc); +} + +static inline uint16_t +usdf_rdm_rdc_hash_helper(uint16_t *ipaddr, uint16_t port) +{ + uint16_t hash_index; + + hash_index = ipaddr[0]; + hash_index ^= ipaddr[1]; + hash_index ^= port; + + return hash_index & USDF_RDM_HASH_MASK; +} + +static inline uint16_t +usdf_rdm_rdc_hash_hdr(struct usd_udp_hdr *hdr) +{ + return usdf_rdm_rdc_hash_helper((uint16_t *)&hdr->uh_ip.saddr, + hdr->uh_udp.source); +} + +static inline int +usdf_rdm_rdc_hdr_match(struct usdf_rdm_connection *rdc, struct usd_udp_hdr *hdr) +{ + return hdr->uh_ip.saddr == rdc->dc_hdr.uh_ip.daddr && + hdr->uh_udp.source == rdc->dc_hdr.uh_udp.dest; +} + +static inline int +usdf_rdm_rdc_addr_match(struct usdf_rdm_connection *rdc, uint16_t *ipaddr, + uint16_t port) +{ + return *(uint32_t *)ipaddr == rdc->dc_hdr.uh_ip.daddr && + port == rdc->dc_hdr.uh_udp.dest; +} + +/* + * Find a matching RDM connection on this domain + */ +static inline struct usdf_rdm_connection * +usdf_rdm_rdc_addr_lookup(struct usdf_domain *udp, uint16_t *ipaddr, + uint16_t port) +{ + uint16_t hash_index; + struct usdf_rdm_connection *rdc; + + hash_index = usdf_rdm_rdc_hash_helper(ipaddr, port); + + rdc = udp->dom_rdc_hashtab[hash_index]; + + while (rdc != NULL) { + if (usdf_rdm_rdc_addr_match(rdc, ipaddr, port)) { + return rdc; + } + rdc = rdc->dc_hash_next; + } + + return NULL; +} + +/* + * Find a matching RDM connection on this domain + */ +static inline struct usdf_rdm_connection * +usdf_rdm_rdc_hdr_lookup(struct usdf_domain *udp, struct usd_udp_hdr *hdr) +{ + uint16_t hash_index; + struct usdf_rdm_connection *rdc; + + hash_index = usdf_rdm_rdc_hash_hdr(hdr); + + rdc = udp->dom_rdc_hashtab[hash_index]; + + while (rdc != NULL) { + if (usdf_rdm_rdc_hdr_match(rdc, hdr)) { + return rdc; + } + rdc = rdc->dc_hash_next; + } + + return NULL; +} + +/* + * Insert rdc into domain hash table + */ +static inline void +usdf_rdm_rdc_insert(struct usdf_domain *udp, struct usdf_rdm_connection *rdc) +{ + uint16_t hash_index; + + hash_index = usdf_rdm_rdc_hash_helper( + (uint16_t *)&rdc->dc_hdr.uh_ip.daddr, + rdc->dc_hdr.uh_udp.dest); +PRINTF("insert rdc %p at %u\n", rdc, hash_index); + + rdc->dc_hash_next = udp->dom_rdc_hashtab[hash_index]; + udp->dom_rdc_hashtab[hash_index] = rdc; +} + +static inline void +usdf_rdm_rdc_remove(struct usdf_domain *udp, struct usdf_rdm_connection *rdc) +{ + uint16_t hash_index; + struct usdf_rdm_connection *prev; + + hash_index = usdf_rdm_rdc_hash_helper( + (uint16_t 
*)&rdc->dc_hdr.uh_ip.daddr, + rdc->dc_hdr.uh_udp.dest); +PRINTF("remove rdc %p from %u\n", rdc, hash_index); + + if (udp->dom_rdc_hashtab[hash_index] == rdc) { + udp->dom_rdc_hashtab[hash_index] = rdc->dc_hash_next; + } else { + prev = udp->dom_rdc_hashtab[hash_index]; + while (prev->dc_hash_next != rdc) { + prev = prev->dc_hash_next; + } + prev->dc_hash_next = rdc->dc_hash_next; + } +} + +/* + * Get a new RDC from domain list. + */ +static inline struct usdf_rdm_connection * +usdf_rdc_alloc(struct usdf_domain *udp) +{ + struct usdf_rdm_connection *rdc; + + if (SLIST_EMPTY(&udp->dom_rdc_free)) { + return NULL; // XXX alloc a new batch + } else { + rdc = SLIST_FIRST(&udp->dom_rdc_free); + SLIST_REMOVE_HEAD(&udp->dom_rdc_free, dc_addr_link); + atomic_dec(&udp->dom_rdc_free_cnt); + } + return rdc; +} + +/* + * Get an RDM connection for this send. If there is a connection for this + * TX queue already attached to this destination, use that. + * If not, check to see if one if in the connection cache (possibly put + * there by receive). If there is not one there either, grab a new one + * and put it in the cache and also attch to this dest. + */ +static inline struct usdf_rdm_connection * +usdf_rdm_rdc_tx_get(struct usdf_dest *dest, struct usdf_ep *ep) +{ + struct usdf_rdm_connection *rdc; + struct usdf_tx *tx; + struct usdf_rx *rx; + struct usd_qp_impl *qp; + struct usdf_domain *udp; + + tx = ep->ep_tx; + rx = ep->ep_rx; + + SLIST_FOREACH(rdc, &dest->ds_rdm_rdc_list, dc_addr_link) { + if (rdc->dc_tx == tx) { + return rdc; + } + } + + udp = tx->tx_domain; + rdc = usdf_rdm_rdc_addr_lookup(udp, + (uint16_t *)&dest->ds_dest.ds_dest.ds_udp.u_hdr.uh_ip.daddr, + dest->ds_dest.ds_dest.ds_udp.u_hdr.uh_udp.dest); + + if (rdc == NULL) { + rdc = usdf_rdc_alloc(udp); + if (rdc == NULL) { + return NULL; + } + memcpy(&rdc->dc_hdr, + &dest->ds_dest.ds_dest.ds_udp.u_hdr, + sizeof(rdc->dc_hdr)); + + qp = to_qpi(rx->rx_qp); + rdc->dc_tx = tx; + rdc->dc_hdr.uh_udp.source = + qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; + + usdf_rdm_rdc_insert(udp, rdc); + + /* start eviction timer */ + usdf_timer_set(tx->tx_domain->dom_fabric, rdc->dc_timer, + USDF_RDM_RDC_TIMEOUT); + } + + /* Add to list for this dest */ + SLIST_INSERT_HEAD(&dest->ds_rdm_rdc_list, rdc, dc_addr_link); + rdc->dc_dest = dest; + rdc->dc_seq_credits = USDF_RUDP_SEQ_CREDITS; + rdc->dc_next_tx_seq = 0; + + return rdc; +} + +/* + * See if there is matching connectoin in hash table. If not, grab a new one. 
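The per-domain connection cache used above is a plain chained hash table: the key is the peer's IPv4 address XOR-folded with its UDP port and masked down to the table size, and collisions are threaded through dc_hash_next. A standalone sketch of the same scheme, with a hypothetical conn type and an arbitrary table size:

#include <stddef.h>
#include <stdint.h>

#define CONN_HASH_SIZE	1024		/* arbitrary here; must be a power of two */
#define CONN_HASH_MASK	(CONN_HASH_SIZE - 1)

struct conn {
	uint32_t ipaddr;		/* peer IP, network order        */
	uint16_t port;			/* peer UDP port, network order  */
	struct conn *hash_next;		/* collision chain               */
};

static struct conn *conn_hashtab[CONN_HASH_SIZE];

static uint16_t conn_hash(uint32_t ipaddr, uint16_t port)
{
	/* XOR-fold the two 16-bit halves of the address with the port */
	return ((uint16_t)ipaddr ^ (uint16_t)(ipaddr >> 16) ^ port) &
		CONN_HASH_MASK;
}

static struct conn *conn_lookup(uint32_t ipaddr, uint16_t port)
{
	struct conn *c = conn_hashtab[conn_hash(ipaddr, port)];

	while (c != NULL && (c->ipaddr != ipaddr || c->port != port))
		c = c->hash_next;
	return c;
}

static void conn_insert(struct conn *c)
{
	uint16_t h = conn_hash(c->ipaddr, c->port);

	c->hash_next = conn_hashtab[h];
	conn_hashtab[h] = c;
}

Lookups that miss fall through to allocating a fresh connection from the domain free list, which is what usdf_rdm_rdc_tx_get() and usdf_rdm_rdc_rx_get() do.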
+ */ +static inline struct usdf_rdm_connection * +usdf_rdm_rdc_rx_get(struct usdf_rx *rx, struct rudp_pkt *pkt) +{ + struct usdf_rdm_connection *rdc; + struct usdf_domain *udp; + struct usdf_tx *tx; + + udp = rx->rx_domain; + tx = rx->r.rdm.rx_tx; + + /* if pkt->msg.src_peer_id != 0, live connection, just look up */ + + rdc = usdf_rdm_rdc_hdr_lookup(udp, &pkt->hdr); + if (rdc == NULL) { + rdc = usdf_rdc_alloc(udp); + if (rdc == NULL) { + return NULL; + } + + memcpy(&rdc->dc_hdr, pkt, sizeof(rdc->dc_hdr)); + memcpy(rdc->dc_hdr.uh_eth.ether_shost, + pkt->hdr.uh_eth.ether_dhost, ETH_ALEN); + memcpy(rdc->dc_hdr.uh_eth.ether_dhost, + pkt->hdr.uh_eth.ether_shost, ETH_ALEN); + rdc->dc_hdr.uh_ip.saddr = pkt->hdr.uh_ip.daddr; + rdc->dc_hdr.uh_ip.daddr = pkt->hdr.uh_ip.saddr; + rdc->dc_hdr.uh_udp.dest = pkt->hdr.uh_udp.source; + rdc->dc_hdr.uh_udp.source = pkt->hdr.uh_udp.dest; + + rdc->dc_next_rx_seq = 0; + rdc->dc_tx = tx; + usdf_rdm_rdc_insert(udp, rdc); + + /* start eviction timer */ + usdf_timer_set(tx->tx_domain->dom_fabric, rdc->dc_timer, + USDF_RDM_RDC_TIMEOUT); + } + return rdc; +} + +/* + * Rewind a queue entry by "rewind" packets + */ +static inline void +usdf_rdm_rewind_qe(struct usdf_rdm_qe *qe, size_t rewind, size_t mtu) +{ + size_t cur_resid; + size_t cur_iov; + size_t bytes; + size_t len; + + if (qe->rd_resid == 0) { + bytes = qe->rd_length % mtu; + cur_resid = 0; + } else { + bytes = mtu; + cur_resid = qe->rd_iov_resid; + } + bytes += (rewind - 1) * mtu; + qe->rd_resid += bytes; + + cur_iov = qe->rd_cur_iov; + while (bytes > 0) { + len = qe->rd_iov[cur_iov].iov_len - cur_resid; + if (len >= bytes) { + len = bytes; + cur_resid += len; + } else { + --cur_iov; + cur_resid = 0; + } + bytes -= len; + } + + qe->rd_cur_iov = cur_iov; + qe->rd_cur_ptr = qe->rd_iov[cur_iov].iov_base + + qe->rd_iov[cur_iov].iov_len - cur_resid; + qe->rd_iov_resid = cur_resid; +} + +/* + * semi-native rx buffer post, i want to eventually avoid using the + * vnic_*() calls + */ +static inline int +_usdf_rdm_post_recv(struct usdf_rx *rx, void *buf, size_t len) +{ + struct usd_rq *rq; + struct vnic_rq *vrq; + struct rq_enet_desc *desc; + struct usd_qp_impl *qp; + + qp = to_qpi(rx->rx_qp); + rq = &qp->uq_rq; + vrq = &rq->urq_vnic_rq; + + rq->urq_context[rq->urq_post_index] = buf; + rq->urq_post_index = (rq->urq_post_index + 1) + & rq->urq_post_index_mask; + + desc = vnic_rq_next_desc(vrq); + rq_enet_desc_enc(desc, (dma_addr_t) buf, + RQ_ENET_TYPE_ONLY_SOP, len); + wmb(); + vnic_rq_post(vrq, buf, 0, (dma_addr_t) buf, len, 0); + + return 0; +} + +/* + * Allow external access to the inline + */ +int +usdf_rdm_post_recv(struct usdf_rx *rx, void *buf, size_t len) +{ + return _usdf_rdm_post_recv(rx, buf, len); +} + +ssize_t +usdf_rdm_recv(struct fid_ep *fep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, void *context) +{ + struct usdf_ep *ep; + struct usdf_rx *rx; + struct usdf_rdm_qe *rqe; + struct usdf_domain *udp; + + ep = ep_ftou(fep); + rx = ep->ep_rx; + udp = ep->ep_domain; + + if (TAILQ_EMPTY(&rx->r.rdm.rx_free_rqe)) { + return -FI_EAGAIN; + } + + pthread_spin_lock(&udp->dom_progress_lock); + + rqe = TAILQ_FIRST(&rx->r.rdm.rx_free_rqe); + TAILQ_REMOVE(&rx->r.rdm.rx_free_rqe, rqe, rd_link); + + rqe->rd_context = context; + rqe->rd_iov[0].iov_base = buf; + rqe->rd_iov[0].iov_len = len; + rqe->rd_last_iov = 0; + + rqe->rd_cur_iov = 0; + rqe->rd_cur_ptr = buf; + rqe->rd_iov_resid = len; + rqe->rd_length = 0; +PRINTF("RECV post rqe=%p len=%lu\n", rqe, len); + + TAILQ_INSERT_TAIL(&rx->r.rdm.rx_posted_rqe, 
rqe, rd_link); + + pthread_spin_unlock(&udp->dom_progress_lock); + + return 0; +} + +ssize_t +usdf_rdm_recvv(struct fid_ep *fep, const struct iovec *iov, void **desc, + size_t count, fi_addr_t src_addr, void *context) +{ + return -FI_ENOSYS; +} + +ssize_t +usdf_rdm_send(struct fid_ep *fep, const void *buf, size_t len, void *desc, + fi_addr_t dest_addr, void *context) +{ + struct usdf_ep *ep; + struct usdf_tx *tx; + struct usdf_rdm_qe *wqe; + struct usdf_domain *udp; + struct usdf_dest *dest; + struct usdf_rdm_connection *rdc; + uint32_t msg_id; + + ep = ep_ftou(fep); + tx = ep->ep_tx; + udp = ep->ep_domain; + dest = (struct usdf_dest *)dest_addr; + + if (TAILQ_EMPTY(&tx->t.rdm.tx_free_wqe)) { + return -FI_EAGAIN; + } + + pthread_spin_lock(&udp->dom_progress_lock); + + rdc = usdf_rdm_rdc_tx_get(dest, ep); + if (rdc == NULL) { + pthread_spin_unlock(&udp->dom_progress_lock); + return -FI_EAGAIN; + } + + wqe = TAILQ_FIRST(&tx->t.rdm.tx_free_wqe); + TAILQ_REMOVE(&tx->t.rdm.tx_free_wqe, wqe, rd_link); + + wqe->rd_context = context; + + msg_id = atomic_inc(&tx->t.rdm.tx_next_msg_id); + wqe->rd_msg_id_be = htonl(msg_id); + + wqe->rd_iov[0].iov_base = (void *)buf; + wqe->rd_iov[0].iov_len = len; + wqe->rd_last_iov = 0; + + wqe->rd_cur_iov = 0; + wqe->rd_cur_ptr = buf; + wqe->rd_iov_resid = len; + wqe->rd_resid = len; + wqe->rd_length = len; + + /* add send to TX list */ + TAILQ_INSERT_TAIL(&rdc->dc_wqe_posted, wqe, rd_link); + usdf_rdm_rdc_ready(rdc, tx); + + pthread_spin_unlock(&udp->dom_progress_lock); +PRINTF("SEND posted len=%lu, ID = %d\n", len, msg_id); + + usdf_domain_progress(udp); + + return 0; +} + +ssize_t +usdf_rdm_senddata(struct fid_ep *ep, const void *buf, size_t len, void *desc, + uint64_t data, fi_addr_t dest_addr, void *context) +{ + return -FI_ENOSYS; +} + +ssize_t +usdf_rdm_sendv(struct fid_ep *ep, const struct iovec *iov, void **desc, + size_t count, fi_addr_t dest_addr, void *context) +{ + return -FI_ENOSYS; +} + +ssize_t +usdf_rdm_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, uint64_t flags) +{ + return -FI_ENOSYS; +} + +ssize_t +usdf_rdm_inject(struct fid_ep *ep, const void *buf, size_t len, + fi_addr_t dest_addr) +{ + return -FI_ENOSYS; +} + +ssize_t +usdf_rdm_recvmsg(struct fid_ep *ep, const struct fi_msg *msg, uint64_t flags) +{ + return -FI_ENOSYS; +} + +/* + * All segments send, stall this TXD until message completely ACKed + */ +static inline void +usdf_rdm_send_sent(struct usdf_tx *tx, struct usdf_rdm_connection *rdc) +{ + struct usdf_rdm_qe *wqe; + + wqe = TAILQ_FIRST(&rdc->dc_wqe_posted); + TAILQ_REMOVE(&rdc->dc_wqe_posted, wqe, rd_link); + TAILQ_INSERT_TAIL(&rdc->dc_wqe_sent, wqe, rd_link); + +#if 0 + /* remove this RDC from TX */ +if (!TAILQ_ON_LIST(rdc, dc_tx_link) abort(); + TAILQ_REMOVE_MARK(&tx->t.rdm.tx_rdc_ready, rdc, dc_tx_link); +#endif +} + +static inline void +usdf_rdm_send_segment(struct usdf_tx *tx, struct usdf_rdm_connection *rdc) +{ + struct rudp_pkt *hdr; + struct usdf_rdm_qe *wqe; + struct usd_qp_impl *qp; + struct usd_wq *wq; + uint32_t index; + size_t cur_iov; + size_t cur_resid; + size_t resid; + const uint8_t *cur_ptr; + const uint8_t *send_ptr; + size_t sent; + uint8_t *ptr; + struct usd_wq_post_info *info; + uint16_t opcode; + + wqe = TAILQ_FIRST(&rdc->dc_wqe_posted); + qp = to_qpi(tx->tx_qp); + wq = &(qp->uq_wq); + + index = wq->uwq_post_index; + hdr = (struct rudp_pkt *)(wq->uwq_copybuf + index * USD_SEND_MAX_COPY); + + memcpy(hdr, &rdc->dc_hdr, sizeof(struct usd_udp_hdr)); + + resid = wqe->rd_resid; + cur_iov = wqe->rd_cur_iov; + 
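usdf_rdm_send() seeds a per-WQE cursor (current iovec index, current pointer, bytes left in the iovec, bytes left in the message) that the segmentation code below advances one MTU-sized chunk per packet, and that the NAK path rewinds. The gather side of such a cursor can be sketched on its own like this, with hypothetical names rather than the provider's rd_* fields:

#include <stddef.h>
#include <string.h>
#include <sys/uio.h>

/* Cursor over a gather list, consumed one chunk at a time. */
struct iov_cursor {
	const struct iovec *iov;
	size_t cnt;			/* number of iovec entries     */
	size_t cur;			/* index of the current entry  */
	const char *ptr;		/* next unread byte            */
	size_t resid;			/* bytes left in current entry */
};

static void cursor_init(struct iov_cursor *c, const struct iovec *iov,
		size_t cnt)
{
	c->iov = iov;
	c->cnt = cnt;
	c->cur = 0;
	c->ptr = cnt != 0 ? iov[0].iov_base : NULL;
	c->resid = cnt != 0 ? iov[0].iov_len : 0;
}

/* Copy up to len bytes from the cursor into dst, advancing the cursor. */
static size_t cursor_copy(struct iov_cursor *c, void *dst, size_t len)
{
	size_t done = 0;
	size_t n;

	while (len > 0 && c->cur < c->cnt) {
		n = c->resid < len ? c->resid : len;
		memcpy((char *)dst + done, c->ptr, n);
		done += n;
		len -= n;
		c->ptr += n;
		c->resid -= n;
		if (c->resid == 0 && ++c->cur < c->cnt) {
			c->ptr = c->iov[c->cur].iov_base;
			c->resid = c->iov[c->cur].iov_len;
		}
	}
	return done;
}

Rewinding after a NAK is the mirror image: add the un-ACKed byte count back onto the residuals and walk the index and pointer backwards, as usdf_rdm_rewind_qe() does.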
cur_ptr = wqe->rd_cur_ptr; + cur_resid = wqe->rd_iov_resid; + + if (cur_ptr == wqe->rd_iov[0].iov_base) { + opcode = RUDP_OP_FIRST; + } else { + opcode = RUDP_OP_MID; + } + + if (resid < USD_SEND_MAX_COPY - sizeof(*hdr)) { + opcode |= RUDP_OP_LAST; + hdr->msg.opcode = htons(opcode); + hdr->msg.msg_id = wqe->rd_msg_id_be; + hdr->msg.m.rc_data.length = htons(resid); + hdr->msg.m.rc_data.seqno = htons(rdc->dc_next_tx_seq); + ++rdc->dc_next_tx_seq; + + ptr = (uint8_t *)(hdr + 1); + sent = resid; + while (resid > 0) { + memcpy(ptr, cur_ptr, cur_resid); + ptr += wqe->rd_iov_resid; + resid -= wqe->rd_iov_resid; + ++cur_iov; + cur_ptr = wqe->rd_iov[cur_iov].iov_base; + cur_resid = wqe->rd_iov[cur_iov].iov_len; + } + + /* add packet lengths */ + hdr->hdr.uh_ip.tot_len = htons( + sent + sizeof(struct rudp_pkt) - + sizeof(struct ether_header)); + hdr->hdr.uh_udp.len = htons( + (sizeof(struct rudp_pkt) - + sizeof(struct ether_header) - + sizeof(struct iphdr)) + sent); +PRINTF("TX 1seg=%lu, s/i = %u/%u\n", sent, ntohs(hdr->msg.m.rc_data.seqno), ntohl(hdr->msg.msg_id)); + + index = _usd_post_send_one(wq, hdr, + sent + sizeof(*hdr), 1); + } else { + struct vnic_wq *vwq; + u_int8_t offload_mode = 0, eop; + u_int16_t mss = 7, header_length = 0, vlan_tag = 0; + u_int8_t vlan_tag_insert = 0, loopback = 0, fcoe_encap = 0; + struct wq_enet_desc *desc; + size_t space; + size_t num_sge; + size_t sge_len; + + vwq = &wq->uwq_vnic_wq; + desc = wq->uwq_next_desc; + space = tx->tx_domain->dom_fabric->fab_dev_attrs->uda_mtu - + sizeof(*hdr); + num_sge = 1; + + /* encode header desc */ + eop = 0; + wq_enet_desc_enc(desc, (uintptr_t)hdr, sizeof(*hdr), + mss, header_length, offload_mode, eop, 0, fcoe_encap, + vlan_tag_insert, vlan_tag, loopback); + + do { + desc = (struct wq_enet_desc *) + ((uintptr_t)wq->uwq_desc_ring + (index << 4)); + index = (index + 1) & wq->uwq_post_index_mask; + + send_ptr = cur_ptr; + if (cur_resid >= space) { + sge_len = space; + eop = 1; + cur_resid -= sge_len; + cur_ptr += sge_len; + } else { + sge_len = cur_resid; + if (num_sge == USDF_RDM_MAX_SGE - 1 || + cur_resid == resid) { + eop = 1; + } + ++cur_iov; + cur_ptr = wqe->rd_iov[cur_iov].iov_base; + cur_resid = wqe->rd_iov[cur_iov].iov_len; + } + + wq_enet_desc_enc(desc, (uintptr_t)send_ptr, sge_len, + mss, header_length, offload_mode, eop, eop, + fcoe_encap, vlan_tag_insert, + vlan_tag, loopback); + + ++num_sge; + space -= sge_len; + resid -= sge_len; + } while (space > 0 && num_sge <= USDF_RDM_MAX_SGE && resid > 0); + + /* add packet lengths */ + sent = tx->tx_domain->dom_fabric->fab_dev_attrs->uda_mtu - + sizeof(*hdr) - space; +//printf("SEND sent=%lu resid=%lu\n", sent, resid); + hdr->hdr.uh_ip.tot_len = htons( + sent + sizeof(struct rudp_pkt) - + sizeof(struct ether_header)); + hdr->hdr.uh_udp.len = htons( + (sizeof(struct rudp_pkt) - + sizeof(struct ether_header) - + sizeof(struct iphdr)) + sent); +if (0) { +if ((random() % 177) == 0 && resid == 0) { + hdr->hdr.uh_eth.ether_type = 0; +//printf("BORK seq %u, ID %u\n", rdc->dc_next_tx_seq, ntohl(wqe->rd_msg_id_be)); +} +} + + if (resid == 0) { + opcode |= RUDP_OP_LAST; + } + hdr->msg.opcode = htons(opcode); + hdr->msg.msg_id = wqe->rd_msg_id_be; + hdr->msg.m.rc_data.length = htons(sent); + hdr->msg.m.rc_data.seqno = htons(rdc->dc_next_tx_seq); + ++rdc->dc_next_tx_seq; +PRINTF("TX sge=%lu, s/i = %u/%u\n", sent, ntohs(hdr->msg.m.rc_data.seqno), ntohl(hdr->msg.msg_id)); + + wmb(); + iowrite64(index, &vwq->ctrl->posted_index); + + wq->uwq_next_desc = (struct wq_enet_desc *) + 
((uintptr_t)wq->uwq_desc_ring + (index << 4)); + wq->uwq_post_index = (index + 1) & wq->uwq_post_index_mask; + wq->uwq_send_credits -= num_sge; + } + + info = &wq->uwq_post_info[index]; + info->wp_context = tx; + info->wp_len = sent; + + /* If send complete, wait for last ack on this message */ + if (resid == 0) { + wqe->rd_resid = 0; + usdf_rdm_send_sent(tx, rdc); + } else { + wqe->rd_resid = resid; + wqe->rd_iov_resid = cur_resid; + wqe->rd_cur_iov = cur_iov; + wqe->rd_cur_ptr = cur_ptr; + } + + /* set ack timer */ + usdf_timer_set(tx->tx_domain->dom_fabric, rdc->dc_timer, + USDF_RUDP_ACK_TIMEOUT); +} + +static inline void +usdf_rdm_send_ack(struct usdf_tx *tx, struct usdf_rdm_connection *rdc) +{ + struct rudp_pkt *hdr; + struct usd_wq *wq; + uint32_t last_post; + struct usd_wq_post_info *info; + uint16_t seq; + + wq = &(to_qpi(tx->tx_qp)->uq_wq); + hdr = (struct rudp_pkt *) (wq->uwq_copybuf + + wq->uwq_post_index * USD_SEND_MAX_COPY); + + memcpy(hdr, &rdc->dc_hdr, sizeof(struct usd_udp_hdr)); + + if (rdc->dc_send_nak) { + hdr->msg.opcode = htons(RUDP_OP_NAK); + seq = rdc->dc_ack_seq + 1; + hdr->msg.m.nak.nak_seq = htons(seq); + rdc->dc_send_nak = 0; +PRINTF("TX NAK seq=%d\n", seq); + } else { + hdr->msg.opcode = htons(RUDP_OP_ACK); + seq = rdc->dc_ack_seq; + hdr->msg.m.ack.ack_seq = htons(seq); +PRINTF("TXACK seq=%u:%u\n", seq, rdc->dc_rx_msg_id); + } + hdr->msg.msg_id = htonl(rdc->dc_ack_msg_id); + + /* add packet lengths */ + hdr->hdr.uh_ip.tot_len = htons( + sizeof(struct rudp_pkt) - + sizeof(struct ether_header)); + hdr->hdr.uh_udp.len = htons(sizeof(struct rudp_pkt) - + sizeof(struct ether_header) - sizeof(struct iphdr)); + + last_post = _usd_post_send_one(wq, hdr, sizeof(*hdr), 1); + + info = &wq->uwq_post_info[last_post]; + info->wp_context = tx; + info->wp_len = 0; +} + +/* + * If this TX has sends to do and is not on domain ready list, then + * this completion means we can go back on the domain ready list + */ +static void +usdf_rdm_send_completion(struct usd_completion *comp) +{ + struct usdf_tx *tx; + + tx = comp->uc_context; + + if (!TAILQ_EMPTY(&tx->t.rdm.tx_rdc_ready) && + !TAILQ_ON_LIST(tx, tx_link)) { + TAILQ_INSERT_TAIL(&tx->tx_domain->dom_tx_ready, tx, tx_link); + } +} + +/* + * Keep progressing sends on this queue until: + * a) no more send credits on the queue (it's full) + * or + * b) all endpoints are complete or blocked awaiting ACKs + */ +void +usdf_rdm_tx_progress(struct usdf_tx *tx) +{ + struct usdf_rdm_connection *rdc; + struct usd_qp_impl *qp; + + qp = to_qpi(tx->tx_qp); + while (qp->uq_wq.uwq_send_credits > 1 && + !TAILQ_EMPTY(&tx->t.rdm.tx_rdc_have_acks)) { + rdc = TAILQ_FIRST(&tx->t.rdm.tx_rdc_have_acks); + TAILQ_REMOVE_MARK(&tx->t.rdm.tx_rdc_have_acks, + rdc, dc_ack_link); + + usdf_rdm_send_ack(tx, rdc); + } + + while (qp->uq_wq.uwq_send_credits > 1 && + !TAILQ_EMPTY(&tx->t.rdm.tx_rdc_ready)) { + rdc = TAILQ_FIRST(&tx->t.rdm.tx_rdc_ready); + + /* + * Send next segment on this connection. This will also + * remove the current WQE from the RDC list if it + * completes. 
+ */ + usdf_rdm_send_segment(tx, rdc); + + --rdc->dc_seq_credits; + if (!TAILQ_EMPTY(&rdc->dc_wqe_sent)) { + TAILQ_REMOVE_MARK(&tx->t.rdm.tx_rdc_ready, + rdc, dc_tx_link); + } else if (TAILQ_EMPTY(&rdc->dc_wqe_posted)) { + TAILQ_REMOVE_MARK(&tx->t.rdm.tx_rdc_ready, + rdc, dc_tx_link); + } else { + --rdc->dc_fairness_credits; + if (rdc->dc_seq_credits == 0) { + TAILQ_REMOVE_MARK(&tx->t.rdm.tx_rdc_ready, + rdc, dc_tx_link); + rdc->dc_fairness_credits = + USDF_RDM_FAIRNESS_CREDITS; + + /* fairness credits exhausted, go to back of the line */ + } else if (rdc->dc_fairness_credits == 0) { + TAILQ_REMOVE(&tx->t.rdm.tx_rdc_ready, + rdc, dc_tx_link); + TAILQ_INSERT_TAIL(&tx->t.rdm.tx_rdc_ready, + rdc, dc_tx_link); + rdc->dc_fairness_credits = + USDF_RDM_FAIRNESS_CREDITS; + } + } + } +} + +static void inline +usdf_rdm_recv_complete(struct usdf_rx *rx, struct usdf_rdm_connection *rdc, + struct usdf_rdm_qe *rqe) +{ + struct usdf_cq_hard *hcq; + +PRINTF("RECV complete ID=%u len=%lu\n", rdc->dc_rx_msg_id, rqe->rd_length); + hcq = rx->r.rdm.rx_hcq; + hcq->cqh_post(hcq, rqe->rd_context, rqe->rd_length); + + TAILQ_INSERT_HEAD(&rx->r.rdm.rx_free_rqe, rqe, rd_link); + + rdc->dc_cur_rqe = NULL; +} + +static inline void +usdf_rdm_rdc_has_ack(struct usdf_rdm_connection *rdc) +{ + struct usdf_tx *tx; + struct usdf_domain *udp; + + if (!TAILQ_ON_LIST(rdc, dc_ack_link)) { + tx = rdc->dc_tx; + udp = tx->tx_domain; + TAILQ_INSERT_TAIL(&tx->t.rdm.tx_rdc_have_acks, rdc, + dc_ack_link); + /* Add TX to domain list if not present */ + if (!TAILQ_ON_LIST(tx, tx_link)) { + TAILQ_INSERT_TAIL(&udp->dom_tx_ready, tx, tx_link); + } + } +} + +static inline void +usdf_set_ack_nak(struct usdf_rdm_connection *rdc, uint32_t msg_id, + uint16_t seq, uint16_t nak) +{ + /* if newly on list or msg_id > cur, use all new values */ + if (!TAILQ_ON_LIST(rdc, dc_ack_link) || + RUDP_MSGID_GT(msg_id, rdc->dc_ack_msg_id)) { + rdc->dc_ack_msg_id = msg_id; + rdc->dc_ack_seq = seq; + rdc->dc_send_nak = nak; + + /* If same msg_id and new seq, use new seq */ + } else if (msg_id == rdc->dc_ack_msg_id && + RUDP_SEQ_GE(seq, rdc->dc_ack_seq)) { + rdc->dc_ack_seq = seq; + rdc->dc_send_nak = nak; + } + + usdf_rdm_rdc_has_ack(rdc); +} + +static inline void +usdf_set_ack(struct usdf_rdm_connection *rdc, uint32_t msg_id, uint16_t seq) +{ + usdf_set_ack_nak(rdc, msg_id, seq, 0); +} + +static inline void +usdf_set_nak(struct usdf_rdm_connection *rdc, uint32_t msg_id, uint16_t seq) +{ + usdf_set_ack_nak(rdc, msg_id, seq, 1); +} + +static inline struct usdf_rdm_qe * +usdf_rdm_check_seq_id(struct usdf_rdm_connection *rdc, struct usdf_rx *rx, + struct rudp_pkt *pkt) +{ + uint16_t seq; + uint32_t msg_id; + int32_t msg_delta; + struct usdf_rdm_qe *rqe; + + seq = ntohs(pkt->msg.m.rc_data.seqno); + msg_id = ntohl(pkt->msg.msg_id); + if (rdc->dc_flags & USDF_DCF_NEW_RX) { + msg_delta = 1; + } else { + msg_delta = RUDP_SEQ_DIFF(msg_id, rdc->dc_rx_msg_id); + } + rqe = rdc->dc_cur_rqe; +PRINTF("RXSEQ %u:%u, msg_delt=%d, rqe=%p\n", seq, msg_id, msg_delta, rqe); + + /* old message ID */ + if (msg_delta < 0) { + return NULL; /* just DROP */ + + /* current message ID */ + } else if (msg_delta == 0) { + if (RUDP_SEQ_LT(seq, rdc->dc_next_rx_seq)) { +PRINTF("old SEQ, ACK %u\n", (uint16_t)(rdc->dc_next_rx_seq)); + usdf_set_ack(rdc, msg_id, rdc->dc_next_rx_seq); + } else if (seq == rdc->dc_next_rx_seq) { +PRINTF("old SEQ, ACK %u\n", (uint16_t)(rdc->dc_next_rx_seq)); + usdf_set_ack(rdc, msg_id, rdc->dc_next_rx_seq); + ++rdc->dc_next_rx_seq; + } else { +PRINTF("future SEQ, NAK 
%u\n", rdc->dc_next_rx_seq); + usdf_set_nak(rdc, msg_id, rdc->dc_next_rx_seq - 1); + rqe = NULL; + } + + /* future message ID */ + } else { + if (rqe != NULL) { + return NULL; /* DROP */ + } else if (seq != 0) { + usdf_set_nak(rdc, msg_id, -1); + } else if (TAILQ_EMPTY(&rx->r.rdm.rx_posted_rqe)) { +printf("RX overrun?????\n"); + usdf_set_nak(rdc, msg_id, -1); + } else { + rqe = TAILQ_FIRST(&rx->r.rdm.rx_posted_rqe); + TAILQ_REMOVE(&rx->r.rdm.rx_posted_rqe, rqe, rd_link); + rdc->dc_flags &= ~USDF_DCF_NEW_RX; + rdc->dc_cur_rqe = rqe; + rdc->dc_rx_msg_id = msg_id; + usdf_set_ack(rdc, msg_id, 0); + rdc->dc_next_rx_seq = 1; +PRINTF("start new msg, rqe=%p\n", rqe); + } + } + return rqe; +} + +static inline void +usdf_rdm_process_ack(struct usdf_rdm_connection *rdc, + struct usdf_tx *tx, uint16_t seq, uint32_t msg_id) +{ + struct usdf_cq_hard *hcq; + struct usdf_rdm_qe *wqe; + struct usdf_fabric *fp; + uint16_t max_ack; + unsigned credits; + + /* find assocoated send, drop if none */ + if (!TAILQ_EMPTY(&rdc->dc_wqe_sent)) { + wqe = TAILQ_FIRST(&rdc->dc_wqe_sent); + } else if (!TAILQ_EMPTY(&rdc->dc_wqe_posted)) { + wqe = TAILQ_FIRST(&rdc->dc_wqe_posted); + } else { +PRINTF("ACK no WQEs\n"); + return; + } + + /* drop if not for this message */ + if (msg_id != ntohl(wqe->rd_msg_id_be)) { +PRINTF("ACK ID %u != %u\n", msg_id, ntohl(wqe->rd_msg_id_be)); + return; + } + + /* don't try to ACK what we don't think we've sent */ + max_ack = rdc->dc_next_tx_seq - 1; +PRINTF("ACK %u max = %u\n", seq, max_ack); + if (RUDP_SEQ_GT(seq, max_ack)) { + seq = max_ack; + } + + credits = RUDP_SEQ_DIFF(seq, rdc->dc_last_rx_ack); + if (rdc->dc_seq_credits == 0 && credits > 0 && + !TAILQ_EMPTY(&rdc->dc_wqe_posted)) { + usdf_rdm_rdc_ready(rdc, tx); + } + rdc->dc_seq_credits += credits; + rdc->dc_last_rx_ack = seq; + + /* + * Look at the current send - if this ACK is for the last sequence we + * have sent and the message is fully sent, post a completion and move + * on to the next send. + */ + fp = tx->tx_domain->dom_fabric; + if (seq == max_ack) { + hcq = tx->t.rdm.tx_hcq; + if (!TAILQ_EMPTY(&rdc->dc_wqe_sent)) { + if (wqe->rd_resid == 0) { + TAILQ_REMOVE(&rdc->dc_wqe_sent, wqe, rd_link); +PRINTF("send ID=%u complete\n", msg_id); + hcq->cqh_post(hcq, wqe->rd_context, + wqe->rd_length); + + TAILQ_INSERT_HEAD(&tx->t.rdm.tx_free_wqe, + wqe, rd_link); + + /* prepare for next message */ + rdc->dc_next_tx_seq = 0; + rdc->dc_last_rx_ack = rdc->dc_next_tx_seq - 1; +PRINTF("posted %s, sent %s\n", TAILQ_EMPTY(&rdc->dc_wqe_posted)?"empty":"occupied", TAILQ_EMPTY(&rdc->dc_wqe_sent)?"empty":"occupied"); + if (!TAILQ_EMPTY(&rdc->dc_wqe_posted)) { + usdf_rdm_rdc_ready(rdc, tx); + } + } + } + + /* revert to eviction timeout */ + usdf_timer_reset(fp, rdc->dc_timer, USDF_RDM_RDC_TIMEOUT); + } else { + usdf_timer_reset(fp, rdc->dc_timer, USDF_RUDP_ACK_TIMEOUT); + } +} + +static inline void +usdf_rdm_process_nak(struct usdf_rdm_connection *rdc, struct usdf_tx *tx, + uint16_t seq, uint32_t msg_id) +{ + struct usdf_rdm_qe *wqe; + struct usdf_fabric *fp; + uint32_t wqe_msg_id; + int rewind; + + /* Ignore NAKs of future packets */ + /* XXX or non-matching msg id */ + + /* In unconnected case, only one msg in flight. 
If wqe_sent != NULL, + * apply to that, else apply to wqe_posted + */ + if (!TAILQ_EMPTY(&rdc->dc_wqe_sent)) { + wqe = TAILQ_FIRST(&rdc->dc_wqe_sent); + wqe_msg_id = ntohl(wqe->rd_msg_id_be); +PRINTF("NAK %u:%u, next = %u:%u\n", seq, msg_id, rdc->dc_next_tx_seq, wqe_msg_id); + if (msg_id != wqe_msg_id) { + return; + } + TAILQ_REMOVE(&rdc->dc_wqe_sent, wqe, rd_link); + TAILQ_INSERT_HEAD(&rdc->dc_wqe_posted, wqe, rd_link); + } else if (!TAILQ_EMPTY(&rdc->dc_wqe_posted)) { + wqe = TAILQ_FIRST(&rdc->dc_wqe_posted); + wqe_msg_id = ntohl(wqe->rd_msg_id_be); +PRINTF("NAK %u:%u, next = %u:%u (posted)\n", seq, msg_id, rdc->dc_next_tx_seq, wqe_msg_id); + if (msg_id != wqe_msg_id) { + return; + } + } else { +PRINTF("NAK Nothing send or posted\n"); + return; + } + + /* reset WQE to old sequence # */ + rewind = RUDP_SEQ_DIFF(rdc->dc_next_tx_seq, seq); +PRINTF("rewind = %d\n", rewind); + if (rewind > 0) { + rdc->dc_seq_credits = USDF_RUDP_SEQ_CREDITS; + rdc->dc_next_tx_seq = seq; + + fp = rdc->dc_tx->tx_domain->dom_fabric; + usdf_rdm_rewind_qe(wqe, rewind, + fp->fab_dev_attrs->uda_mtu - sizeof(struct rudp_pkt)); + + usdf_rdm_rdc_ready(rdc, tx); + } +} + +/* + * RDC timeout could be because of needing to retransmit a packet, or it + * could be cache eviction timer + */ +void +usdf_rdm_rdc_timeout(void *vrdc) +{ + struct usdf_rdm_connection *rdc; + struct usdf_rdm_qe *wqe; + struct usdf_domain *udp; + struct usdf_dest *dest; + uint16_t nak; + + rdc = vrdc; + udp = rdc->dc_tx->tx_domain; +PRINTF("RDC timer fire\n"); + + pthread_spin_lock(&udp->dom_progress_lock); + + if (!TAILQ_EMPTY(&rdc->dc_wqe_sent)) { + wqe = TAILQ_FIRST(&rdc->dc_wqe_sent); + goto gotnak; + } else if (!TAILQ_EMPTY(&rdc->dc_wqe_posted)) { + wqe = TAILQ_FIRST(&rdc->dc_wqe_posted); + goto gotnak; + + /* If inactive, remove from hash list */ + } else if (rdc->dc_cur_rqe == NULL && + !TAILQ_ON_LIST(rdc, dc_tx_link) && + !TAILQ_ON_LIST(rdc, dc_ack_link)) { + + dest = rdc->dc_dest; + if (dest != NULL) { + SLIST_REMOVE(&dest->ds_rdm_rdc_list, rdc, + usdf_rdm_connection, dc_addr_link); + } + + rdc->dc_dest = NULL; + rdc->dc_flags = USDF_DCS_UNCONNECTED | USDF_DCF_NEW_RX; + rdc->dc_next_rx_seq = 0; + usdf_rdm_rdc_remove(udp, rdc); + + SLIST_INSERT_HEAD(&udp->dom_rdc_free, rdc, dc_addr_link); + atomic_inc(&udp->dom_rdc_free_cnt); + + } else { + usdf_timer_set(udp->dom_fabric, rdc->dc_timer, + USDF_RDM_RDC_TIMEOUT); + } + goto done; + +gotnak: + /* wqe set above */ + nak = rdc->dc_last_rx_ack + 1; +PRINTF("TIMEOUT nak=%u:%u\n", nak, ntohl(wqe->rd_msg_id_be)); + usdf_rdm_process_nak(rdc, rdc->dc_tx, nak, ntohl(wqe->rd_msg_id_be)); + +done: + pthread_spin_unlock(&udp->dom_progress_lock); +} + +static inline void +usdf_rdm_rx_ack(struct usdf_rdm_connection *rdc, struct usdf_tx *tx, + struct rudp_pkt *pkt) +{ + uint16_t seq; + uint32_t msg_id; + + seq = ntohs(pkt->msg.m.nak.nak_seq); + msg_id = ntohl(pkt->msg.msg_id); +PRINTF("RXACK %u:%u\n", seq, msg_id); + usdf_rdm_process_ack(rdc, tx, seq, msg_id); +} + +static inline void +usdf_rdm_rx_nak(struct usdf_rdm_connection *rdc, struct usdf_tx *tx, + struct rudp_pkt *pkt) +{ + uint16_t seq; + uint32_t msg_id; + + seq = ntohs(pkt->msg.m.nak.nak_seq); + msg_id = ntohl(pkt->msg.msg_id); + usdf_rdm_process_ack(rdc, tx, seq - 1, msg_id); + + usdf_rdm_process_nak(rdc, tx, seq, msg_id); +} + +/* + * Handle a receive on a queue servicing a message endpoint + */ +static inline void +usdf_rdm_handle_recv(struct usdf_domain *udp, struct usd_completion *comp) +{ + struct rudp_pkt *pkt; + struct usdf_rdm_qe *rqe; + 
struct usdf_rdm_connection *rdc; + struct usd_qp *qp; + struct usdf_rx *rx; + uint32_t opcode; + uint8_t *rx_ptr; + uint8_t *rqe_ptr; + size_t cur_iov; + size_t iov_resid; + size_t rxlen; + size_t copylen; + + qp = comp->uc_qp; + rx = qp->uq_context; + pkt = comp->uc_context; + opcode = ntohs(pkt->msg.opcode); + + rdc = usdf_rdm_rdc_rx_get(rx, pkt); + if (rdc == NULL) { + goto repost; + } +//printf("RX opcode=%u\n", opcode); + + switch (opcode) { + case RUDP_OP_ACK: + usdf_rdm_rx_ack(rdc, rx->r.rdm.rx_tx, pkt); + goto repost; + + case RUDP_OP_NAK: + usdf_rdm_rx_nak(rdc, rx->r.rdm.rx_tx, pkt); + goto repost; + default: + break; + } + + if ((opcode & ~RUDP_OP_DATA_MASK) != 0) { + goto repost; + } + + /* check sequence # and msg_id */ + rqe = usdf_rdm_check_seq_id(rdc, rx, pkt); + if (rqe == NULL) { + goto repost; + } + + /* Consume the data in the packet */ + rxlen = ntohs(pkt->msg.m.rc_data.length); + rqe->rd_length += rxlen; + + rx_ptr = (uint8_t *)(pkt + 1); + rqe_ptr = (uint8_t *)rqe->rd_cur_ptr; + iov_resid = rqe->rd_iov_resid; + cur_iov = rqe->rd_cur_iov; + while (rxlen > 0) { + copylen = MIN(rxlen, iov_resid); + memcpy(rqe_ptr, rx_ptr, copylen); + rx_ptr += copylen; + rxlen -= copylen; + iov_resid -= copylen; + if (iov_resid == 0) { + if (cur_iov == rqe->rd_last_iov) { + break; + } + ++cur_iov; + rqe_ptr = rqe->rd_iov[cur_iov].iov_base; + iov_resid = rqe->rd_iov[cur_iov].iov_len; + } else { + rqe_ptr += copylen; + } + } + + if (rxlen > 0) { + rqe->rd_length -= rxlen; +/* printf("RQE truncated XXX\n"); */ + } else if (opcode & RUDP_OP_LAST) { + usdf_rdm_recv_complete(rx, rdc, rqe); + } + +repost: + /* repost buffer */ + _usdf_rdm_post_recv(rx, pkt, + rx->rx_domain->dom_fabric->fab_dev_attrs->uda_mtu); +} + +/* + * Process message completions + */ +void +usdf_rdm_hcq_progress(struct usdf_cq_hard *hcq) +{ + struct usd_completion comp; + int loop; + + loop = 100; + while (loop-- > 0 && usd_poll_cq(hcq->cqh_ucq, &comp) != -EAGAIN) { + switch (comp.uc_type) { + case USD_COMPTYPE_SEND: + usdf_rdm_send_completion(&comp); + break; + case USD_COMPTYPE_RECV: + usdf_rdm_handle_recv(hcq->cqh_cq->cq_domain, &comp); + break; + } + } +} diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_rdm.h b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_rdm.h new file mode 100644 index 0000000000..921ee1cc7d --- /dev/null +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_rdm.h @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
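The NAK path above rewinds an in-flight send with usdf_rdm_rewind_qe(): the first term of the rewind is the short tail of the message when everything had already been transmitted (rd_resid == 0), otherwise a full MTU, and each remaining rewound segment adds one full MTU. A minimal standalone sketch of that arithmetic, assuming a single-iov message and a hypothetical 1000-byte payload MTU (illustration only, not part of the patch):

#include <assert.h>
#include <stddef.h>

int main(void)
{
	size_t mtu = 1000;     /* hypothetical payload bytes per segment */
	size_t length = 2500;  /* total message length, single iov */
	size_t resid = 0;      /* 0: all three segments (1000/1000/500) were sent */
	size_t rewind = 2;     /* peer NAKed; the last two segments must be resent */
	size_t bytes;

	bytes = (resid == 0) ? (length % mtu) : mtu;  /* 500: the short tail */
	bytes += (rewind - 1) * mtu;                  /* plus one full segment */
	resid += bytes;

	assert(bytes == 1500);
	assert(length - resid == 1000);  /* transmission resumes at offset 1000 */
	return 0;
}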
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _USDF_RDM_H_ +#define _USDF_RDM_H_ + +#define USDF_RDM_CAPS (FI_MSG | FI_SOURCE | FI_SEND | FI_RECV) + +#define USDF_RDM_SUPP_MODE (FI_LOCAL_MR) +#define USDF_RDM_REQ_MODE (FI_LOCAL_MR) + +#define USDF_RDM_MAX_SGE 8 +#define USDF_RDM_DFLT_SGE 8 +#define USDF_RDM_MAX_CTX_SIZE 1024 +#define USDF_RDM_DFLT_CTX_SIZE 128 + +#define USDF_RDM_MAX_MSG UINT_MAX + +#define USDF_RDM_FREE_BLOCK (16 * 1024) +#define USDF_RDM_HASH_SIZE (64 * 1024) +#define USDF_RDM_HASH_MASK (USDF_RDM_HASH_SIZE - 1) +#define USDF_RDM_FAIRNESS_CREDITS 16 + +#define USDF_RDM_RUDP_SEQ_CREDITS 256 + +#define USDF_RDM_RDC_TIMEOUT 1000 /* ms */ + +struct usdf_rdm_qe { + void *rd_context; + uint32_t rd_msg_id_be; + + struct iovec rd_iov[USDF_RDM_MAX_SGE]; + size_t rd_last_iov; + size_t rd_length; + + size_t rd_cur_iov; + const uint8_t *rd_cur_ptr; + size_t rd_resid; /* amount remaining in entire rdm */ + size_t rd_iov_resid; /* amount remaining in current iov */ + + TAILQ_ENTRY(usdf_rdm_qe) rd_link; + + struct usdf_rdm_connection *rd_conn; +}; + +/* + * RDM connection state + */ +enum { + USDF_DCS_UNCONNECTED = 0, + USDF_DCS_CONNECTING = 1, + USDF_DCS_CONNECTED = 2 +}; + +#define USDF_DCF_STATE_BITS 0x03 +#define USDF_DCF_NEW_RX 0x04 + +/* + * We're only connectionless to the app. + * This connection struct is used to manage messages in flight. 
+ */ +struct usdf_rdm_connection { + atomic_t dc_refcnt; + + struct usdf_tx *dc_tx; + struct usd_udp_hdr dc_hdr; + uint16_t dc_flags; + struct usdf_timer_entry *dc_timer; + + /* RX state */ + uint32_t dc_rx_msg_id; + struct usdf_rdm_qe *dc_cur_rqe; + uint16_t dc_next_rx_seq; + uint16_t dc_send_nak; + uint32_t dc_ack_msg_id; + uint16_t dc_ack_seq; + TAILQ_ENTRY(usdf_rdm_connection) dc_ack_link; + + /* TX state */ + struct usdf_dest *dc_dest; + TAILQ_HEAD(,usdf_rdm_qe) dc_wqe_posted; + TAILQ_HEAD(,usdf_rdm_qe) dc_wqe_sent; + uint16_t dc_next_tx_seq; + uint16_t dc_last_rx_ack; + size_t dc_fairness_credits; + size_t dc_seq_credits; + TAILQ_ENTRY(usdf_rdm_connection) dc_tx_link; + + SLIST_ENTRY(usdf_rdm_connection) dc_addr_link; + struct usdf_rdm_connection *dc_hash_next; +}; + +int usdf_rdm_post_recv(struct usdf_rx *rx, void *buf, size_t len); +int usdf_rdm_fill_tx_attr(struct fi_tx_attr *txattr); +int usdf_rdm_fill_rx_attr(struct fi_rx_attr *rxattr); +int usdf_cq_rdm_poll(struct usd_cq *ucq, struct usd_completion *comp); +void usdf_rdm_rdc_timeout(void *vrdc); + +void usdf_rdm_hcq_progress(struct usdf_cq_hard *hcq); +void usdf_rdm_tx_progress(struct usdf_tx *tx); + +/* fi_ops_cm for RC */ +int usdf_cm_rdm_connect(struct fid_ep *ep, const void *addr, + const void *param, size_t paramlen); +int usdf_cm_rdm_accept(struct fid_ep *fep, const void *param, size_t paramlen); +int usdf_cm_rdm_shutdown(struct fid_ep *ep, uint64_t flags); + +/* fi_ops_rdm for RC */ +ssize_t usdf_rdm_recv(struct fid_ep *ep, void *buf, size_t len, void *desc, + fi_addr_t src_addr, void *context); +ssize_t usdf_rdm_recvv(struct fid_ep *ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, void *context); +ssize_t usdf_rdm_recvmsg(struct fid_ep *ep, const struct fi_msg *msg, + uint64_t flags); + +ssize_t usdf_rdm_send(struct fid_ep *ep, const void *buf, size_t len, + void *desc, fi_addr_t src_addr, void *context); +ssize_t usdf_rdm_sendv(struct fid_ep *ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, void *context); +ssize_t usdf_rdm_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, + uint64_t flags); +ssize_t usdf_rdm_senddata(struct fid_ep *ep, const void *buf, size_t len, + void *desc, uint64_t data, fi_addr_t src_addr, void *context); + +ssize_t usdf_rdm_inject(struct fid_ep *ep, const void *buf, size_t len, + fi_addr_t src_addr); + + + +#endif /* _USDF_RDM_H_ */ diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_rudp.h b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_rudp.h new file mode 100644 index 0000000000..e284408dfd --- /dev/null +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_rudp.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
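In struct usdf_rdm_connection above, dc_flags packs the two-bit USDF_DCS_* connection state (selected by USDF_DCF_STATE_BITS) together with boolean flags such as USDF_DCF_NEW_RX, so the state has to be masked out before it can be compared. Hypothetical accessors, shown only as a sketch of that layout (these helpers are not in the patch; they assume the header above is included):

static inline int usdf_rdc_state(const struct usdf_rdm_connection *rdc)
{
	return rdc->dc_flags & USDF_DCF_STATE_BITS;     /* one of USDF_DCS_* */
}

static inline int usdf_rdc_is_new_rx(const struct usdf_rdm_connection *rdc)
{
	return (rdc->dc_flags & USDF_DCF_NEW_RX) != 0;  /* no packets received yet */
}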
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _USDF_RUDP_H_ +#define _USDF_RUDP_H_ + +#include "usnic_direct.h" + +#define USDF_RUDP_SEQ_CREDITS 256 +#define USDF_RUDP_ACK_TIMEOUT 5 /* ms */ + +#define RUDP_SEQ_DIFF(A, B) ((int16_t)((u_int16_t)(A) - (u_int16_t)(B))) +#define RUDP_SEQ_LT(A, B) (RUDP_SEQ_DIFF((A), (B)) < 0) +#define RUDP_SEQ_LE(A, B) (RUDP_SEQ_DIFF((A), (B)) <= 0) +#define RUDP_SEQ_GT(A, B) (RUDP_SEQ_DIFF((A), (B)) > 0) +#define RUDP_SEQ_GE(A, B) (RUDP_SEQ_DIFF((A), (B)) >= 0) + +#define RUDP_MSGID_DIFF(A, B) ((int32_t)((u_int32_t)(A) - (u_int32_t)(B))) +#define RUDP_MSGID_LT(A, B) (RUDP_MSGID_DIFF((A), (B)) < 0) +#define RUDP_MSGID_LE(A, B) (RUDP_MSGID_DIFF((A), (B)) <= 0) +#define RUDP_MSGID_GT(A, B) (RUDP_MSGID_DIFF((A), (B)) > 0) +#define RUDP_MSGID_GE(A, B) (RUDP_MSGID_DIFF((A), (B)) >= 0) + +enum { + /* data messages (a bitmask of FIRST and LAST) */ + RUDP_OP_MID = 0x00, + RUDP_OP_FIRST = 0x01, + RUDP_OP_LAST = 0x02, + RUDP_OP_ONLY = 0x03, + + /* control messages */ + RUDP_OP_CONNECT_REQ = 0x81, + RUDP_OP_CONNECT_RESP = 0x82, + RUDP_OP_NAK = 0x83, + RUDP_OP_ACK = 0x84, +}; + +#define RUDP_OP_DATA_MASK (RUDP_OP_FIRST | RUDP_OP_LAST) + +struct rudp_rc_data_msg { + u_int32_t offset; /* 4 */ + u_int16_t rkey; /* 8 */ + u_int16_t length; /* 10 */ + u_int16_t seqno; /* 12 */ + u_int16_t rdma_id; /* 14 */ +} __attribute__ ((__packed__)); + +struct rudp_msg { + u_int16_t opcode; + u_int16_t src_peer_id; + u_int32_t msg_id; + union { + struct rudp_rc_data_msg rc_data; + struct { + u_int16_t dst_peer_id; + } connect_req; + struct { + u_int16_t dst_peer_id; + } connect_resp; + struct { + u_int16_t ack_seq; + } ack; + struct { + u_int16_t nak_seq; + u_int32_t seq_mask; + } nak; + } __attribute__ ((__packed__)) m; +} __attribute__ ((__packed__)); + + +struct rudp_pkt { + struct usd_udp_hdr hdr; + struct rudp_msg msg; +} __attribute__ ((__packed__)); + + +#endif /* _USDF_RUDP_H_ */ diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_timer.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_timer.c index 7a2ce979e5..35bdceabf9 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_timer.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_timer.c @@ -131,21 +131,21 @@ usdf_timer_cancel(struct usdf_fabric *fp, struct usdf_timer_entry *entry) * be called again until usdf_timer_set() is called again to re-set it. * usdf_timer_set() is safe to call from timer service routine. 
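The RUDP_SEQ_* and RUDP_MSGID_* macros above implement wrap-safe serial-number comparison: subtract in unsigned arithmetic, then reinterpret the difference as a signed value of the same width. A standalone check of the 16-bit case (illustrative sketch, not part of the patch):

#include <assert.h>
#include <stdint.h>

#define SEQ_DIFF(A, B) ((int16_t)((uint16_t)(A) - (uint16_t)(B)))
#define SEQ_LT(A, B)   (SEQ_DIFF((A), (B)) < 0)
#define SEQ_GT(A, B)   (SEQ_DIFF((A), (B)) > 0)

int main(void)
{
	/* 0xfff0 precedes 0x0005 across the wrap, despite being numerically larger */
	assert(SEQ_LT(0xfff0, 0x0005));
	assert(SEQ_GT(0x0005, 0xfff0));
	/* the ordering is only meaningful while the two values stay within
	 * 2^15 of each other; the 256-entry sequence-credit window above
	 * keeps them well inside that */
	return 0;
}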
*/ -int -usdf_timer_set(struct usdf_fabric *fp, struct usdf_timer_entry *entry, +static inline int +_usdf_timer_do_set(struct usdf_fabric *fp, struct usdf_timer_entry *entry, uint32_t ms) { int ret; unsigned bucket; - pthread_spin_lock(&fp->fab_timer_lock); - /* If no timers active, cur_bucket_ms may need catchup */ - if (fp->fab_active_timer_count == 0) { + ++fp->fab_active_timer_count; + if (fp->fab_active_timer_count == 1) { fp->fab_cur_bucket_ms = usdf_get_ms(); ret = usdf_fabric_wake_thread(fp); if (ret != 0) { - goto out; + --fp->fab_active_timer_count; + return ret; } } @@ -156,21 +156,47 @@ usdf_timer_set(struct usdf_fabric *fp, struct usdf_timer_entry *entry, // we could make "overflow" bucket... if (ms >= USDF_NUM_TIMER_BUCKETS) { - ret = -FI_EINVAL; - goto out; + --fp->fab_active_timer_count; + return -FI_EINVAL; } bucket = (fp->fab_cur_bucket + ms) & (USDF_NUM_TIMER_BUCKETS - 1); LIST_INSERT_HEAD(&fp->fab_timer_buckets[bucket], entry, te_link); entry->te_flags |= USDF_TF_QUEUED; - ++fp->fab_active_timer_count; - ret = 0; + return 0; +} -out: +int +usdf_timer_set(struct usdf_fabric *fp, struct usdf_timer_entry *entry, + uint32_t ms) +{ + int ret; + + pthread_spin_lock(&fp->fab_timer_lock); + if (entry->te_flags & USDF_TF_QUEUED) { + ret = 0; + } else { + ret = _usdf_timer_do_set(fp, entry, ms); + } pthread_spin_unlock(&fp->fab_timer_lock); + return ret; } +int +usdf_timer_reset(struct usdf_fabric *fp, struct usdf_timer_entry *entry, + uint32_t ms) +{ + int ret; + + pthread_spin_lock(&fp->fab_timer_lock); + ret = _usdf_timer_do_set(fp, entry, ms); + pthread_spin_unlock(&fp->fab_timer_lock); + + return ret; +} + + static inline void usdf_run_bucket(struct usdf_fabric *fp, struct usdf_timer_bucket *bp) { diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_timer.h b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_timer.h index d33934a87c..aecebc8257 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_timer.h +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_timer.h @@ -61,6 +61,8 @@ void usdf_timer_free(struct usdf_fabric *fp, struct usdf_timer_entry *entry); int usdf_timer_set(struct usdf_fabric *fp, struct usdf_timer_entry *entry, uint32_t timeout); +int usdf_timer_reset(struct usdf_fabric *fp, struct usdf_timer_entry *entry, + uint32_t timeout); void usdf_timer_cancel(struct usdf_fabric *fp, struct usdf_timer_entry *entry); diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd.h b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd.h index e212a59561..b630e83822 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd.h +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd.h @@ -60,7 +60,7 @@ #define USD_SF_ISSET(flags, flagname) \ ((flags >> USD_SFS_##flagname) & 1) -#define USD_SEND_MAX_COPY 1024 +#define USD_SEND_MAX_COPY 992 #define USD_MAX_CQ_GROUP 1024 #define USD_MAX_PRESEND 4 diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_poll.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_poll.c index 7f9235d4c6..125b9e2630 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_poll.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_poll.c @@ -87,11 +87,38 @@ usd_desc_to_rq_comp( CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK; if (bytes_written_flags & CQ_ENET_RQ_DESC_FLAGS_TRUNCATED || (edesc->flags & ipudpok) != ipudpok) { - if (edesc->flags & 
CQ_ENET_RQ_DESC_FLAGS_FCS_OK || - bytes_written != 0) + if (((edesc->flags & CQ_ENET_RQ_DESC_FLAGS_FCS_OK) == 0) && + bytes_written == 0) { + size_t rcvbuf_len; + dma_addr_t bus_addr; + u16 len; + u8 type; + uint16_t i; + + i = q_index; + rcvbuf_len = 0; + do { + rq_enet_desc_dec( (struct rq_enet_desc *) + ((uintptr_t)rq->urq_vnic_rq.ring.descs + (i<<4)), + &bus_addr, &type, &len); + rcvbuf_len += len; + i = (i - 1) & rq->urq_post_index_mask; + } while (type == RQ_ENET_TYPE_NOT_SOP); + + /* + * If only the paddings to meet 64-byte minimum eth frame + * requirement are truncated, do not mark packet as + * error due to truncation. + * The usnic hdr should not be split into multiple receive buffer + */ + if (ntohs(((struct usd_udp_hdr *)bus_addr)->uh_ip.tot_len) + + sizeof(struct ether_header) > rcvbuf_len) + comp->uc_status = USD_COMPSTAT_ERROR_TRUNC; + else + comp->uc_status = USD_COMPSTAT_SUCCESS; + } else { comp->uc_status = USD_COMPSTAT_ERROR_CRC; - else - comp->uc_status = USD_COMPSTAT_ERROR_TRUNC; + } } else { comp->uc_status = USD_COMPSTAT_SUCCESS; } diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_post.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_post.c index d0311cb903..024b5601bb 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_post.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_post.c @@ -100,7 +100,6 @@ usd_post_recv( vnic_rq_post(vrq, iovp[0].iov_base, 0, (dma_addr_t) iovp[0].iov_base, iovp[0].iov_len, 0); - for (i = 1; i < recv_list->urd_iov_cnt; ++i) { rq->urq_context[rq->urq_post_index] = recv_list->urd_context; diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_post.h b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_post.h index 22b2076f4d..77124783cf 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_post.h +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_post.h @@ -43,6 +43,8 @@ #ifndef _USD_POST_H_ #define _USD_POST_H_ +#include + #include "usd.h" #include "usd_util.h" @@ -94,7 +96,6 @@ _usd_post_send_two( struct vnic_wq *vwq; uint32_t index; struct wq_enet_desc *desc; - uint64_t wr; u_int8_t offload_mode = 0, eop; u_int16_t mss = 7, header_length = 0, vlan_tag = 0; u_int8_t vlan_tag_insert = 0, loopback = 0, fcoe_encap = 0; @@ -119,8 +120,7 @@ _usd_post_send_two( vlan_tag_insert, vlan_tag, loopback); wmb(); - wr = vnic_cached_posted_index((dma_addr_t)hdr, hdrlen, index); - iowrite64(wr, &vwq->ctrl->posted_index); + iowrite32(index, &vwq->ctrl->posted_index); wq->uwq_next_desc = (struct wq_enet_desc *) ((uintptr_t)wq->uwq_desc_ring + (index<<4)); @@ -130,4 +130,51 @@ _usd_post_send_two( return index; } +/* + * Consume iov count credits, assumes that iov[0] includes usnic header + */ +static inline uint32_t +_usd_post_send_iov( + struct usd_wq *wq, + const struct iovec *iov, + size_t count, + u_int8_t cq_entry) +{ + struct vnic_wq *vwq; + uint32_t index; + struct wq_enet_desc *desc; + u_int8_t offload_mode = 0; + u_int16_t mss = 7, header_length = 0, vlan_tag = 0; + u_int8_t vlan_tag_insert = 0, loopback = 0, fcoe_encap = 0; + unsigned i; + + vwq = &wq->uwq_vnic_wq; + desc = wq->uwq_next_desc; + index = wq->uwq_post_index; + + for (i = 0; i < count - 1; i++) { + wq_enet_desc_enc(desc, (uintptr_t)(iov[i].iov_base), + iov[i].iov_len, mss, header_length, offload_mode, + 0, 0, fcoe_encap, vlan_tag_insert, vlan_tag, loopback); + desc = (struct wq_enet_desc 
*) ((uintptr_t)wq->uwq_desc_ring + + (index<<4)); + index = (index+1) & wq->uwq_post_index_mask; + } + + wq_enet_desc_enc(desc, (uintptr_t)(iov[i].iov_base), + iov[i].iov_len, mss, header_length, offload_mode, + 1, cq_entry, fcoe_encap, vlan_tag_insert, vlan_tag, loopback); + + wmb(); + + iowrite32(index, &vwq->ctrl->posted_index); + + wq->uwq_next_desc = (struct wq_enet_desc *) + ((uintptr_t)wq->uwq_desc_ring + (index<<4)); + wq->uwq_post_index = (index+1) & wq->uwq_post_index_mask; + wq->uwq_send_credits -= count; + + return index; +} + #endif /* _USD_POST_H_ */ diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_post_udp_normal.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_post_udp_normal.c index 28f98499b2..902236a62c 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_post_udp_normal.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_post_udp_normal.c @@ -158,7 +158,7 @@ usd_post_send_one_prefixed_udp_normal( hdr->uh_udp.len = htons((sizeof(struct usd_udp_hdr) - sizeof(struct ether_header) - sizeof(struct iphdr)) + len); - hdr->uh_udp.source = + hdr->uh_udp.source = qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; last_post = @@ -224,9 +224,59 @@ usd_post_send_two_copy_udp_normal( return 0; } +static int +usd_post_send_iov_udp_normal(struct usd_qp *uqp, + struct usd_dest *dest, const struct iovec* iov, + size_t iov_count, uint32_t flags, void *context) +{ + struct usd_qp_impl *qp; + struct usd_udp_hdr *hdr; + struct usd_wq *wq; + uint32_t last_post; + uint8_t *copybuf; + struct usd_wq_post_info *info; + struct iovec send_iov[USD_SEND_MAX_SGE + 1]; + size_t len; + unsigned i; + + qp = to_qpi(uqp); + wq = &qp->uq_wq; + copybuf = wq->uwq_copybuf + wq->uwq_post_index * USD_SEND_MAX_COPY; + + for (i = 0, len = 0; i < iov_count; i++) { + len += iov[i].iov_len; + } + + hdr = (struct usd_udp_hdr *)copybuf; + memcpy(hdr, &dest->ds_dest.ds_udp.u_hdr, sizeof(*hdr)); + + /* adjust lengths and insert source port */ + hdr->uh_ip.tot_len = htons(len + sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header)); + hdr->uh_udp.len = htons((sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header) - + sizeof(struct iphdr)) + len); + hdr->uh_udp.source = + qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; + + send_iov[0].iov_base = hdr; + send_iov[0].iov_len = sizeof(*hdr); + memcpy(&send_iov[1], iov, sizeof(struct iovec) * iov_count); + + last_post = _usd_post_send_iov(wq, send_iov, iov_count + 1, + USD_SF_ISSET(flags, SIGNAL)); + info = &wq->uwq_post_info[last_post]; + info->wp_context = context; + info->wp_len = len; + + return 0; +} + + struct usd_qp_ops usd_qp_ops_udp_normal = { .qo_post_send_one = usd_post_send_one_udp_normal, .qo_post_send_one_prefixed = usd_post_send_one_prefixed_udp_normal, .qo_post_send_one_copy = usd_post_send_one_copy_udp_normal, .qo_post_send_two_copy = usd_post_send_two_copy_udp_normal, + .qo_post_send_iov = usd_post_send_iov_udp_normal, }; diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_queues.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_queues.c index 93fbb5b4d8..ab0f6a1ed0 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_queues.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usd_queues.c @@ -1116,6 +1116,16 @@ usd_create_qp( } rq->urq_state |= USD_QS_FILTER_ALLOC; + /* Fill in some attrs */ + switch (transport) { + case 
USD_QTR_UDP: + qp->uq_attrs.uqa_hdr_len = sizeof(struct usd_udp_hdr); + break; + case USD_QTR_RAW: + qp->uq_attrs.uqa_hdr_len = 0; + break; + } + /* * Now, do the type-specific configuration */ @@ -1133,16 +1143,6 @@ usd_create_qp( break; } - /* Fill in some attrs */ - switch (transport) { - case USD_QTR_UDP: - qp->uq_attrs.uqa_hdr_len = sizeof(struct usd_udp_hdr); - break; - case USD_QTR_RAW: - qp->uq_attrs.uqa_hdr_len = 0; - break; - } - *uqp_o = to_usdqp(qp); return 0; diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usnic_direct.h b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usnic_direct.h index 5aea5ab8d9..98b9657808 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usnic_direct.h +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usnic_direct/usnic_direct.h @@ -53,6 +53,7 @@ #define USD_MAX_DEVICES 8 #define USD_MAX_DEVNAME 16 #define USD_RECV_MAX_SGE 8 +#define USD_SEND_MAX_SGE 8 enum usd_link_state { USD_LINK_DOWN, @@ -147,6 +148,9 @@ struct usd_qp_ops { int (*qo_post_send_two_copy)(struct usd_qp *qp, struct usd_dest *dest, const void *hdr, size_t hdrlen, const void *pkt, size_t pktlen, uint32_t flags, void *context); + int (*qo_post_send_iov)(struct usd_qp *qp, + struct usd_dest *dest, const struct iovec* iov, + size_t iov_count, uint32_t flags, void *context); }; /* @@ -604,11 +608,16 @@ usd_post_send_two_copy(struct usd_qp *qp, struct usd_dest *dest, /* * Post an N-buffer send * All buffers must be in registered memory. - * Requires iov_len + 1 send credits + * Requires iov_count + 1 send credits */ -int usd_post_send_sge(struct usd_qp *qp, struct usd_dest *dest, - const struct iovec *iov, size_t iov_len, uint32_t flags, void *context); - +static inline int +usd_post_send_iov(struct usd_qp *qp, struct usd_dest *dest, + const struct iovec *iov, size_t iov_count, uint32_t flags, + void *context) +{ + return qp->uq_ops.qo_post_send_iov( + qp, dest, iov, iov_count, flags, context); +} /**************************************************************** * enum-to-string utility functions (for prettyprinting) ****************************************************************/ diff --git a/opal/mca/common/libfabric/libfabric/prov/verbs/src/fi_verbs.c b/opal/mca/common/libfabric/libfabric/prov/verbs/src/fi_verbs.c index a1f426ded0..28f86e1313 100644 --- a/opal/mca/common/libfabric/libfabric/prov/verbs/src/fi_verbs.c +++ b/opal/mca/common/libfabric/libfabric/prov/verbs/src/fi_verbs.c @@ -54,14 +54,28 @@ #include #include #include + #include "fi.h" #include "fi_enosys.h" +#include "prov.h" -#define PROV_NAME "verbs" -#define PROV_VERS FI_VERSION(0,7) +#define VERBS_PROV_NAME "verbs" +#define VERBS_PROV_VERS FI_VERSION(1,0) -#define PROV_WARN(fmt, ...) \ - do { fprintf(stderr, "%s:%s: " fmt, PACKAGE, PROV_NAME, ##__VA_ARGS__); } while (0) +#define VERBS_WARN(fmt, ...) 
\ + do { fprintf(stderr, "%s:%s: " fmt, PACKAGE, VERBS_PROV_NAME, ##__VA_ARGS__); } while (0) + +#define VERBS_MSG_SIZE (1ULL << 31) +#define VERBS_IB_PREFIX "IB-0x" +#define VERBS_IWARP_FABRIC "Ethernet-iWARP" +#define VERBS_ANY_FABRIC "Any RDMA fabric" + +#define VERBS_CAPS (FI_MSG | FI_RMA | FI_ATOMICS | FI_READ | FI_WRITE | \ + FI_SEND | FI_RECV | FI_REMOTE_READ | FI_REMOTE_WRITE | \ + FI_REMOTE_CQ_DATA | FI_REMOTE_COMPLETE) +#define VERBS_MODE (FI_LOCAL_MR | FI_PROV_MR_ATTR) +#define VERBS_MSG_ORDER (FI_ORDER_RAR | FI_ORDER_RAW | FI_ORDER_RAS | \ + FI_ORDER_WAW | FI_ORDER_WAS | FI_ORDER_SAW | FI_ORDER_SAS ) struct fi_ibv_fabric { struct fid_fabric fabric_fid; @@ -119,6 +133,53 @@ static char def_send_sge[16] = "4"; static char def_recv_sge[16] = "4"; static char def_inline_data[16] = "64"; +const struct fi_fabric_attr verbs_fabric_attr = { + .name = VERBS_PROV_NAME, + .prov_version = VERBS_PROV_VERS, +}; + +const struct fi_domain_attr verbs_domain_attr = { + .threading = FI_THREAD_SAFE, + .control_progress = FI_PROGRESS_AUTO, + .data_progress = FI_PROGRESS_AUTO, + .mr_key_size = sizeof_field(struct ibv_sge, lkey), + .cq_data_size = sizeof_field(struct ibv_send_wr, imm_data), + .max_ep_tx_ctx = 1, + .max_ep_rx_ctx = 1, +}; + +const struct fi_ep_attr verbs_ep_attr = { + .protocol_version = 1, + .max_msg_size = VERBS_MSG_SIZE, + .total_buffered_recv = 0, + .msg_prefix_size = 0, + .max_order_raw_size = VERBS_MSG_SIZE, + .max_order_war_size = 0, + .max_order_waw_size = VERBS_MSG_SIZE, + .mem_tag_format = 0, + .msg_order = VERBS_MSG_ORDER, + .tx_ctx_cnt = 1, + .rx_ctx_cnt = 1, +}; + +const struct fi_rx_attr verbs_rx_attr = { + .caps = VERBS_CAPS, + .mode = VERBS_MODE, + .msg_order = VERBS_MSG_ORDER, + .total_buffered_recv = 0, + .size = 256, + .iov_limit = 8, +}; + +const struct fi_tx_attr verbs_tx_attr = { + .caps = VERBS_CAPS, + .mode = VERBS_MODE, + .msg_order = VERBS_MSG_ORDER, + .inject_size = 0, + .size = 256, + .iov_limit = 8, +}; + static int fi_ibv_sockaddr_len(struct sockaddr *addr) { if (!addr) @@ -136,9 +197,174 @@ static int fi_ibv_sockaddr_len(struct sockaddr *addr) } } -static int fi_ibv_check_hints(struct fi_info *hints) +static int fi_ibv_check_fabric_attr(struct fi_fabric_attr *attr) { - switch (hints->ep_type) { + if (attr->name && !(!strcmp(attr->name, VERBS_ANY_FABRIC) || + !strncmp(attr->name, VERBS_IB_PREFIX, strlen(VERBS_IB_PREFIX)) || + !strcmp(attr->name, VERBS_IWARP_FABRIC))) + return -FI_ENODATA; + + if (attr->prov_name && strcmp(attr->prov_name, VERBS_PROV_NAME)) + return -FI_ENODATA; + + if (attr->prov_version > VERBS_PROV_VERS) + return -FI_ENODATA; + + return 0; +} + +static int fi_ibv_check_domain_attr(struct fi_domain_attr *attr) +{ + switch (attr->threading) { + case FI_THREAD_UNSPEC: + case FI_THREAD_SAFE: + case FI_THREAD_PROGRESS: + break; + default: + VERBS_WARN("Invalid threading model\n"); + return -FI_ENODATA; + } + + switch (attr->control_progress) { + case FI_PROGRESS_UNSPEC: + case FI_PROGRESS_AUTO: + case FI_PROGRESS_MANUAL: + break; + default: + VERBS_WARN("Given control progress mode not supported\n"); + return -FI_ENODATA; + } + + switch (attr->data_progress) { + case FI_PROGRESS_UNSPEC: + case FI_PROGRESS_AUTO: + case FI_PROGRESS_MANUAL: + break; + default: + VERBS_WARN("Given data progress mode not supported!\n"); + return -FI_ENODATA; + } + + if (attr->mr_key_size > sizeof_field(struct ibv_sge, lkey)) + return -FI_ENODATA; + + if (attr->cq_data_size > sizeof_field(struct ibv_send_wr, imm_data)) + return -FI_ENODATA; + + return 0; +} + 
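The fi_ibv_check_*() helpers in this hunk all apply the same bitmask rule: capability and ordering bits requested by the application must be a subset of what the provider advertises, while every mode bit the provider requires must be present in what the application offers. In miniature, with made-up constants (illustration only, not part of the patch):

#include <stdint.h>
#include <stdio.h>

#define PROV_CAPS 0x0007ULL  /* bits the provider can deliver (illustrative) */
#define PROV_MODE 0x0010ULL  /* bits the application must accept (illustrative) */

static int check(uint64_t req_caps, uint64_t app_mode)
{
	if (req_caps & ~PROV_CAPS)               /* requested an unsupported cap */
		return -1;
	if ((app_mode & PROV_MODE) != PROV_MODE) /* required mode bit missing */
		return -1;
	return 0;
}

int main(void)
{
	printf("%d\n", check(0x0003, 0x0010)); /*  0: caps are a subset, mode ok */
	printf("%d\n", check(0x0008, 0x0010)); /* -1: unsupported capability bit */
	printf("%d\n", check(0x0003, 0x0000)); /* -1: required mode bit missing  */
	return 0;
}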
+static int fi_ibv_check_ep_attr(struct fi_ep_attr *attr) +{ + switch (attr->protocol) { + case FI_PROTO_UNSPEC: + case FI_PROTO_RDMA_CM_IB_RC: + case FI_PROTO_IWARP: + case FI_PROTO_IB_UD: + break; + default: + return -FI_ENODATA; + } + + if (attr->protocol_version > 1) + return -FI_ENODATA; + + if (attr->max_msg_size > verbs_ep_attr.max_msg_size) + return -FI_ENODATA; + + if (attr->total_buffered_recv) { + VERBS_WARN("Buffered Recv not supported\n"); + return -FI_ENODATA; + } + + if (attr->max_order_raw_size > verbs_ep_attr.max_order_raw_size) { + VERBS_WARN("max_order_raw_size exceeds supported size\n"); + return -FI_ENODATA; + } + + if (attr->max_order_war_size) { + VERBS_WARN("max_order_war_size exceeds supported size\n"); + return -FI_ENODATA; + } + + if (attr->max_order_waw_size > verbs_ep_attr.max_order_waw_size) { + VERBS_WARN("max_order_waw_size exceeds supported size\n"); + return -FI_ENODATA; + } + + if (attr->msg_order & ~(verbs_ep_attr.msg_order)) { + VERBS_WARN("Given msg ordering not supported\n"); + return -FI_ENODATA; + } + + if (attr->tx_ctx_cnt > verbs_ep_attr.tx_ctx_cnt) { + VERBS_WARN("tx_ctx_cnt exceeds supported size\n"); + return -FI_ENODATA; + } + + if (attr->rx_ctx_cnt > verbs_ep_attr.rx_ctx_cnt) { + VERBS_WARN("rx_ctx_cnt exceeds supported size\n"); + return -FI_ENODATA; + } + + return 0; +} + +static int fi_ibv_check_rx_attr(struct fi_rx_attr *attr) +{ + if (attr->caps & ~(verbs_rx_attr.caps)) { + VERBS_WARN("Given rx_attr->caps not supported\n"); + return -FI_ENODATA; + } + + if ((attr->mode & verbs_rx_attr.mode) != verbs_rx_attr.mode) { + VERBS_WARN("Given rx_attr->mode not supported\n"); + return -FI_ENODATA; + } + + if (attr->msg_order & ~(verbs_rx_attr.msg_order)) { + VERBS_WARN("Given rx_attr->msg_order not supported\n"); + return -FI_ENODATA; + } + + if (attr->total_buffered_recv > verbs_rx_attr.total_buffered_recv) { + VERBS_WARN("Given rx_attr->total_buffered_recv exceeds supported size\n"); + return -FI_ENODATA; + } + + return 0; +} + +static int fi_ibv_check_tx_attr(struct fi_tx_attr *attr) +{ + if (attr->caps & ~(verbs_tx_attr.caps)) { + VERBS_WARN("Given tx_attr->caps not supported\n"); + return -FI_ENODATA; + } + + if ((attr->mode & verbs_tx_attr.mode) != verbs_tx_attr.mode) { + VERBS_WARN("Given tx_attr->mode not supported\n"); + return -FI_ENODATA; + } + + if (attr->msg_order & ~(verbs_tx_attr.msg_order)) { + VERBS_WARN("Given tx_attr->msg_order not supported\n"); + return -FI_ENODATA; + } + + if (attr->inject_size > verbs_tx_attr.inject_size) { + VERBS_WARN("Given tx_attr->inject_size exceeds supported size\n"); + return -FI_ENODATA; + } + + return 0; +} + +static int fi_ibv_check_info(struct fi_info *info) +{ + int ret; + + switch (info->ep_type) { case FI_EP_UNSPEC: case FI_EP_MSG: break; @@ -146,24 +372,64 @@ static int fi_ibv_check_hints(struct fi_info *hints) return -FI_ENODATA; } - if (hints->ep_attr) { - switch (hints->ep_attr->protocol) { - case FI_PROTO_UNSPEC: - case FI_PROTO_RDMA_CM_IB_RC: - case FI_PROTO_IWARP: - case FI_PROTO_IB_UD: - break; - default: - return -FI_ENODATA; - } + if (!(info->caps & VERBS_CAPS) && info->caps) + return -FI_ENODATA; + + if (info->fabric_attr) { + ret = fi_ibv_check_fabric_attr(info->fabric_attr); + if (ret) + return ret; } - if (!(hints->caps & (FI_MSG | FI_RMA)) && hints->caps) - return -FI_ENODATA; + if (info->domain_attr) { + ret = fi_ibv_check_domain_attr(info->domain_attr); + if (ret) + return ret; + } - if (hints->fabric_attr && hints->fabric_attr->name && - strcmp(hints->fabric_attr->name, 
"RDMA")) + if (info->ep_attr) { + ret = fi_ibv_check_ep_attr(info->ep_attr); + if (ret) + return ret; + } + + if (info->rx_attr) { + ret = fi_ibv_check_rx_attr(info->rx_attr); + if (ret) + return ret; + } + + if (info->tx_attr) { + ret = fi_ibv_check_tx_attr(info->tx_attr); + if (ret) + return ret; + } + + return 0; +} + +static int fi_ibv_check_dev_limits(struct fi_domain_attr *domain_attr, + struct ibv_device_attr *device_attr) +{ + if (domain_attr->cq_cnt > device_attr->max_cq) { + VERBS_WARN("cq_cnt exceeds supported size\n"); return -FI_ENODATA; + } + + if (domain_attr->ep_cnt > device_attr->max_qp) { + VERBS_WARN("ep_cnt exceeds supported size\n"); + return -FI_ENODATA; + } + + if (domain_attr->tx_ctx_cnt > device_attr->max_qp) { + VERBS_WARN("domain_attr: tx_ctx_cnt exceeds supported size\n"); + return -FI_ENODATA; + } + + if (domain_attr->rx_ctx_cnt > device_attr->max_qp) { + VERBS_WARN("domain_attr: rx_ctx_cnt exceeds supported size\n"); + return -FI_ENODATA; + } return 0; } @@ -210,7 +476,7 @@ static int fi_ibv_fi_to_rai(struct fi_info *fi, uint64_t flags, struct rdma_addr } static int fi_ibv_rai_to_fi(struct rdma_addrinfo *rai, struct fi_info *hints, - struct fi_info *fi) + struct fi_info *fi) { // fi->sa_family = rai->ai_family; if (rai->ai_qp_type == IBV_QPT_RC || rai->ai_port_space == RDMA_PS_TCP) { @@ -241,6 +507,90 @@ static int fi_ibv_rai_to_fi(struct rdma_addrinfo *rai, struct fi_info *hints, return 0; } +static int fi_ibv_fill_info_attr(struct ibv_context *ctx, struct fi_info *hints, + struct fi_info *fi) +{ + struct ibv_device_attr device_attr; + struct ibv_port_attr port_attr; + union ibv_gid gid; + size_t name_len; + int ret; + + *(fi->fabric_attr) = verbs_fabric_attr; + *(fi->domain_attr) = verbs_domain_attr; + *(fi->ep_attr) = verbs_ep_attr; + *(fi->tx_attr) = verbs_tx_attr; + *(fi->rx_attr) = verbs_rx_attr; + + if (!(fi->fabric_attr->prov_name = strdup(VERBS_PROV_NAME))) + return -FI_ENOMEM; + + if (!ctx) { + if (!(fi->fabric_attr->name = strdup(VERBS_ANY_FABRIC))) + return -FI_ENOMEM; + + return 0; + } + + ibv_query_gid(ctx, 1, 0, &gid); + ret = ibv_query_device(ctx, &device_attr); + if (ret) + return -errno; + + ret = ibv_query_port(ctx, 1, &port_attr); + if (ret) + return -errno; + + if (hints && hints->domain_attr) { + ret = fi_ibv_check_dev_limits(hints->domain_attr, &device_attr); + if (ret) + return ret; + } + + switch (ctx->device->transport_type) { + case IBV_TRANSPORT_IB: + name_len = strlen(VERBS_IB_PREFIX) + INET6_ADDRSTRLEN; + if (!(fi->fabric_attr->name = calloc(1, name_len + 1))) + return -FI_ENOMEM; + + snprintf(fi->fabric_attr->name, name_len, VERBS_IB_PREFIX "%lx", + gid.global.subnet_prefix); + break; + case IBV_TRANSPORT_IWARP: + fi->fabric_attr->name = strdup(VERBS_IWARP_FABRIC); + break; + default: + VERBS_WARN("Unknown transport type"); + return -FI_ENODATA; + } + + if (!(fi->domain_attr->name = strdup(ctx->device->name))) + return -FI_ENOMEM; + + fi->domain_attr->cq_cnt = device_attr.max_cq; + fi->domain_attr->ep_cnt = device_attr.max_qp; + fi->domain_attr->tx_ctx_cnt = device_attr.max_qp; + fi->domain_attr->rx_ctx_cnt = device_attr.max_qp; + + switch (ctx->device->transport_type) { + case IBV_TRANSPORT_IWARP: + fi->ep_attr->protocol = FI_PROTO_IWARP; + break; + case IBV_TRANSPORT_IB: + fi->ep_attr->protocol = FI_PROTO_RDMA_CM_IB_RC; + break; + default: + return -FI_ENODATA; + } + + fi->ep_attr->protocol_version = 1; + fi->ep_attr->max_msg_size = port_attr.max_msg_sz; + // TODO Give a real size once verbs provider supports inject + 
fi->ep_attr->inject_size = 0; + + return 0; +} + static int fi_ibv_getepinfo(const char *node, const char *service, uint64_t flags, struct fi_info *hints, @@ -251,7 +601,7 @@ fi_ibv_getepinfo(const char *node, const char *service, int ret; if (hints) { - ret = fi_ibv_check_hints(hints); + ret = fi_ibv_check_info(hints); if (ret) return ret; @@ -282,22 +632,13 @@ fi_ibv_getepinfo(const char *node, const char *service, ret = -errno; goto err2; } - rdma_freeaddrinfo(rai); - if ((*id)->verbs) { - if (!(fi->domain_attr->name = strdup((*id)->verbs->device->name))) { - ret = -FI_ENOMEM; - goto err3; - } - } - - // TODO: Get a real name here - if (!(fi->fabric_attr->name = strdup("RDMA"))) { - ret = -FI_ENOMEM; + ret = fi_ibv_fill_info_attr((*id)->verbs, hints, fi); + if (ret) goto err3; - } *info = fi; + rdma_freeaddrinfo(rai); return 0; err3: @@ -310,7 +651,7 @@ err1: } static int fi_ibv_getinfo(uint32_t version, const char *node, const char *service, - uint64_t flags, struct fi_info *hints, struct fi_info **info) + uint64_t flags, struct fi_info *hints, struct fi_info **info) { struct rdma_cm_id *id; int ret; @@ -1488,6 +1829,7 @@ fi_ibv_eq_readerr(struct fid_eq *eq, struct fi_eq_err_entry *entry, return sizeof(*entry); } +/* TODO: This should copy the listening fi_info as the base */ static struct fi_info * fi_ibv_eq_cm_getinfo(struct fi_ibv_fabric *fab, struct rdma_cm_event *event) { @@ -1498,12 +1840,7 @@ fi_ibv_eq_cm_getinfo(struct fi_ibv_fabric *fab, struct rdma_cm_event *event) return NULL; fi->ep_type = FI_EP_MSG; - fi->caps = FI_MSG | FI_RMA; - if (event->id->verbs->device->transport_type == IBV_TRANSPORT_IWARP) { - fi->ep_attr->protocol = FI_PROTO_IWARP; - } else { - fi->ep_attr->protocol = FI_PROTO_RDMA_CM_IB_RC; - } + fi->caps = VERBS_CAPS; fi->src_addrlen = fi_ibv_sockaddr_len(rdma_get_local_addr(event->id)); if (!(fi->src_addr = malloc(fi->src_addrlen))) @@ -1515,14 +1852,7 @@ fi_ibv_eq_cm_getinfo(struct fi_ibv_fabric *fab, struct rdma_cm_event *event) goto err; memcpy(fi->dest_addr, rdma_get_peer_addr(event->id), fi->dest_addrlen); - if (!(fi->fabric_attr->name = strdup("RDMA"))) - goto err; - if (!(fi->fabric_attr->prov_name = strdup(PROV_NAME))) - goto err; - fi->fabric_attr->prov_version = PROV_VERS; - - if (!(fi->domain_attr->name = strdup(event->id->verbs->device->name))) - goto err; + fi_ibv_fill_info_attr(event->id->verbs, NULL, fi); fi->connreq = event->id; return fi; @@ -2360,8 +2690,10 @@ static struct fi_ops_fabric fi_ibv_ops_fabric = { int fi_ibv_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, void *context) { struct fi_ibv_fabric *fab; + int ret; - if (strcmp(attr->name, "RDMA")) + ret = fi_ibv_check_fabric_attr(attr); + if (ret) return -FI_ENODATA; fab = calloc(1, sizeof(*fab)); @@ -2376,18 +2708,20 @@ int fi_ibv_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, void return 0; } +static void fi_ibv_fini(void) +{ +} + static struct fi_provider fi_ibv_prov = { - .name = PROV_NAME, - .version = PROV_VERS, + .name = VERBS_PROV_NAME, + .version = VERBS_PROV_VERS, + .fi_version = FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), .getinfo = fi_ibv_getinfo, .fabric = fi_ibv_fabric, + .cleanup = fi_ibv_fini }; -static void __attribute__((constructor)) fi_ibv_ini(void) -{ - (void) fi_register(&fi_ibv_prov); -} - -static void __attribute__((destructor)) fi_ibv_fini(void) +VERBS_INI { + return &fi_ibv_prov; } diff --git a/opal/mca/common/libfabric/libfabric/src/fabric.c b/opal/mca/common/libfabric/libfabric/src/fabric.c index dc86fc108e..7d649ec755 
diff --git a/opal/mca/common/libfabric/libfabric/src/fabric.c b/opal/mca/common/libfabric/libfabric/src/fabric.c
index dc86fc108e..7d649ec755 100644
--- a/opal/mca/common/libfabric/libfabric/src/fabric.c
+++ b/opal/mca/common/libfabric/libfabric/src/fabric.c
@@ -44,6 +44,7 @@
 #include
 
 #include "fi.h"
+#include "prov.h"
 
 #ifdef HAVE_LIBDL
 #include <dlfcn.h>
@@ -57,38 +58,47 @@ struct fi_prov {
 	struct fi_provider *provider;
 };
 
-static struct fi_prov *prov_head, *prov_tail;
-
 static struct fi_prov *fi_getprov(const char *prov_name);
 
+static struct fi_prov *prov_head, *prov_tail;
+static volatile int init = 0;
+static pthread_mutex_t ini_lock = PTHREAD_MUTEX_INITIALIZER;
 
-__attribute__((visibility ("default")))
-int fi_register_provider_(uint32_t fi_version, struct fi_provider *provider)
+
+static int fi_register_provider(struct fi_provider *provider)
 {
 	struct fi_prov *prov;
+	int ret;
 
-	if (FI_MAJOR(fi_version) != FI_MAJOR_VERSION ||
-	    FI_MINOR(fi_version) > FI_MINOR_VERSION)
-		return -FI_ENOSYS;
+	if (!provider)
+		return -FI_EINVAL;
+
+	if (FI_MAJOR(provider->fi_version) != FI_MAJOR_VERSION ||
+	    FI_MINOR(provider->fi_version) > FI_MINOR_VERSION) {
+		ret = -FI_ENOSYS;
+		goto cleanup;
+	}
 
-	/* If a provider with this name is already registered:
-	 * - if the new provider has a lower version number, just fail
-	 *   to register it
-	 * - otherwise, just overwrite the old prov entry
-	 * If the provider is a new/unique name, calloc() a new prov entry.
-	 */
 	prov = fi_getprov(provider->name);
 	if (prov) {
-		if (FI_VERSION_GE(prov->provider->version, provider->version))
-			return -FI_EALREADY;
+		/* If we have two versions of the same provider,
+		 * keep the most recent
+		 */
+		if (FI_VERSION_GE(prov->provider->version, provider->version)) {
+			ret = -FI_EALREADY;
+			goto cleanup;
+		}
+		prov->provider->cleanup();
 		prov->provider = provider;
 		return 0;
 	}
 
 	prov = calloc(sizeof *prov, 1);
-	if (!prov)
-		return -FI_ENOMEM;
+	if (!prov) {
+		ret = -FI_ENOMEM;
+		goto cleanup;
+	}
 
 	prov->provider = provider;
 	if (prov_tail)
@@ -97,8 +107,11 @@ int fi_register_provider_(uint32_t fi_version, struct fi_provider *provider)
 		prov_head = prov;
 	prov_tail = prov;
 	return 0;
+
+cleanup:
+	provider->cleanup();
+	return ret;
 }
-default_symver(fi_register_provider_, fi_register_provider);
 
 #ifdef HAVE_LIBDL
 static int lib_filter(const struct dirent *entry)
@@ -111,13 +124,26 @@ static int lib_filter(const struct dirent *entry)
 	else
 		return 0;
 }
+#endif
 
-static void __attribute__((constructor)) fi_ini(void)
+static void fi_ini(void)
 {
+	pthread_mutex_lock(&ini_lock);
+
+	if (init)
+		goto unlock;
+
+	fi_register_provider(VERBS_INIT);
+	fi_register_provider(PSM_INIT);
+	fi_register_provider(SOCKETS_INIT);
+	fi_register_provider(USNIC_INIT);
+
+#ifdef HAVE_LIBDL
 	struct dirent **liblist;
 	int n, want_warn = 0;
 	char *lib, *extdir = getenv("FI_EXTDIR");
 	void *dlhandle;
+	struct fi_provider* (*inif)(void);
 
 	if (extdir) {
 		/* Warn if user specified $FI_EXTDIR, but there's a
@@ -130,7 +156,7 @@ static void __attribute__((constructor)) fi_ini(void)
 	/* If dlopen fails, assume static linking and just
 	   return without error */
 	if (dlopen(NULL, RTLD_NOW) == NULL) {
-		return;
+		goto done;
 	}
 
 	n = scandir(extdir, &liblist, lib_filter, NULL);
@@ -139,13 +165,14 @@ static void __attribute__((constructor)) fi_ini(void)
 			FI_WARN("scandir error reading %s: %s\n",
 				extdir, strerror(errno));
 		}
-		return;
+		goto done;
 	}
 
 	while (n--) {
 		if (asprintf(&lib, "%s/%s", extdir, liblist[n]->d_name) < 0) {
 			FI_WARN("asprintf failed to allocate memory\n");
-			return;
+			free(liblist[n]);
+			goto done;
 		}
 
 		dlhandle = dlopen(lib, RTLD_NOW);
@@ -154,14 +181,26 @@ static void __attribute__((constructor)) fi_ini(void)
 
 			free(liblist[n]);
 			free(lib);
+
+			inif = dlsym(dlhandle, "fi_prov_ini");
+			if (inif == NULL)
+				FI_WARN("dlsym: %s\n", dlerror());
+			else
+				fi_register_provider((inif)());
 	}
 
 	free(liblist);
-}
+done:
 #endif
+	init = 1;
+unlock:
+	pthread_mutex_unlock(&ini_lock);
+}
 
 static void __attribute__((destructor)) fi_fini(void)
 {
+	for (struct fi_prov *prov = prov_head; prov; prov = prov->next)
+		prov->provider->cleanup();
 }
 
 static struct fi_prov *fi_getprov(const char *prov_name)
@@ -182,7 +221,10 @@ int fi_getinfo_(uint32_t version, const char *node, const char *service,
 {
 	struct fi_prov *prov;
 	struct fi_info *tail, *cur;
-	int ret = -ENOSYS;
+	int ret = -FI_ENOSYS;
+
+	if (!init)
+		fi_ini();
 
 	*info = tail = NULL;
 	for (prov = prov_head; prov; prov = prov->next) {
@@ -345,6 +387,9 @@ int fi_fabric_(struct fi_fabric_attr *attr, struct fid_fabric **fabric, void *co
 	if (!attr || !attr->prov_name || !attr->name)
 		return -FI_EINVAL;
 
+	if (!init)
+		fi_ini();
+
 	prov = fi_getprov(attr->prov_name);
 	if (!prov || !prov->provider->fabric)
 		return -FI_ENODEV;
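With registration centralized in fi_ini(), an externally built provider is discovered by dlopen() plus a dlsym() lookup of "fi_prov_ini", which must return the provider's fi_provider structure. The skeleton below is a hedged illustration of that contract, not code from this patch: the "example" names are invented, the rdma/fabric.h and rdma/fi_errno.h include paths are assumed, and it presumes the INI macros in the new prov.h reduce to a plain fi_prov_ini symbol for dynamically loaded builds. Note that cleanup must point at a real function, since fi_register_provider() calls it unconditionally on rejection and again from fi_fini().

    /* Hypothetical out-of-tree provider skeleton (assumed headers/names) */
    #include <rdma/fabric.h>
    #include <rdma/fi_errno.h>

    static int example_getinfo(uint32_t version, const char *node,
                               const char *service, uint64_t flags,
                               struct fi_info *hints, struct fi_info **info)
    {
        return -FI_ENODATA;        /* advertise nothing for now */
    }

    static void example_cleanup(void)
    {
        /* must exist: the core calls cleanup() even when registration fails */
    }

    static struct fi_provider example_prov = {
        .name       = "example",
        .version    = FI_VERSION(0, 2),
        .fi_version = FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION),
        .getinfo    = example_getinfo,
        .cleanup    = example_cleanup,
    };

    /* looked up by fi_ini() via dlsym(dlhandle, "fi_prov_ini") */
    struct fi_provider *fi_prov_ini(void)
    {
        return &example_prov;
    }
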
"%s%sop_flags: [ ", prefix, TAB); fi_tostr_flags(buf, attr->op_flags); - strcat(buf, " ]\n"); + strcatf(buf, " ]\n"); strcatf(buf, "%s%smsg_order: [ ", prefix, TAB); fi_tostr_order(buf, attr->msg_order); - strcat(buf, " ]\n"); + strcatf(buf, " ]\n"); strcatf(buf, "%s%sinject_size: %zd\n", prefix, TAB, attr->inject_size); strcatf(buf, "%s%ssize: %zd\n", prefix, TAB, attr->size); @@ -301,15 +301,15 @@ static void fi_tostr_rx_attr(char *buf, const struct fi_rx_attr *attr, strcatf(buf, "%sfi_rx_attr:\n", prefix); strcatf(buf, "%s%scaps: [ ", prefix, TAB); fi_tostr_caps(buf, attr->caps); - strcat(buf, " ]\n"); + strcatf(buf, " ]\n"); strcatf(buf, "%s%sop_flags: [ ", prefix, TAB); fi_tostr_flags(buf, attr->op_flags); - strcat(buf, " ]\n"); + strcatf(buf, " ]\n"); strcatf(buf, "%s%smsg_order: [ ", prefix, TAB); fi_tostr_order(buf, attr->msg_order); - strcat(buf, " ]\n"); + strcatf(buf, " ]\n"); strcatf(buf, "%s%stotal_buffered_recv: %zd\n", prefix, TAB, attr->total_buffered_recv); strcatf(buf, "%s%ssize: %zd\n", prefix, TAB, attr->size); @@ -337,7 +337,7 @@ static void fi_tostr_ep_attr(char *buf, const struct fi_ep_attr *attr, const cha strcatf(buf, "%s%smsg_order: [ ", prefix, TAB); fi_tostr_order(buf, attr->msg_order); - strcat(buf, " ]\n"); + strcatf(buf, " ]\n"); strcatf(buf, "%s%stx_ctx_cnt: %zd\n", prefix, TAB, attr->tx_ctx_cnt); strcatf(buf, "%s%srx_ctx_cnt: %zd\n", prefix, TAB, attr->rx_ctx_cnt); @@ -390,30 +390,30 @@ static void fi_tostr_fabric_attr(char *buf, const struct fi_fabric_attr *attr, static void fi_tostr_info(char *buf, const struct fi_info *info) { - strcat(buf, "fi_info:\n"); + strcatf(buf, "fi_info:\n"); strcatf(buf, "%scaps: [ ", TAB); fi_tostr_caps(buf, info->caps); - strcat(buf, " ]\n"); + strcatf(buf, " ]\n"); strcatf(buf, "%smode: [ ", TAB); fi_tostr_mode(buf, info->mode); - strcat(buf, " ]\n"); + strcatf(buf, " ]\n"); strcatf(buf, "%sep_type: ", TAB); fi_tostr_ep_type(buf, info->ep_type); - strcat(buf, "\n"); + strcatf(buf, "\n"); strcatf(buf, "%sfi_addr_format: ", TAB); fi_tostr_addr_format(buf, info->addr_format); - strcat(buf, "\n"); + strcatf(buf, "\n"); strcatf(buf, "%ssrc_addrlen: %zd\n", TAB, info->src_addrlen); strcatf(buf, "%sdest_addrlen: %zd\n", TAB, info->dest_addrlen); strcatf(buf, "%ssrc_addr: ", TAB); fi_tostr_addr(buf, info->addr_format, info->src_addr); - strcat(buf, "\n"); + strcatf(buf, "\n"); strcatf(buf, "%sdest_addr: ", TAB); fi_tostr_addr(buf, info->addr_format, info->dest_addr); - strcat(buf, "\n"); + strcatf(buf, "\n"); strcatf(buf, "%sconnreq: %s\n", TAB, info->connreq); fi_tostr_tx_attr(buf, info->tx_attr, TAB); @@ -429,7 +429,7 @@ static void fi_tostr_av_type(char *buf, enum fi_av_type type) CASEENUMSTR(FI_AV_MAP); CASEENUMSTR(FI_AV_TABLE); default: - strcat(buf, "Unknown"); + strcatf(buf, "Unknown"); break; } } @@ -437,7 +437,7 @@ static void fi_tostr_av_type(char *buf, enum fi_av_type type) __attribute__((visibility ("default"))) char *fi_tostr_(const void *data, enum fi_type datatype) { - char *buf; + static char *buf = NULL; uint64_t val64 = *(const uint64_t *) data; uint32_t val32 = *(const uint32_t *) data; int enumval = *(const int *) data; @@ -445,9 +445,12 @@ char *fi_tostr_(const void *data, enum fi_type datatype) if (!data) return NULL; - buf = calloc(4096, sizeof (*buf)); - if (!buf) - return NULL; + if (!buf) { + buf = calloc(BUFSIZ, 1); + if (!buf) + return NULL; + } + buf[0] = '\0'; switch (datatype) { case FI_TYPE_INFO: @@ -499,7 +502,7 @@ char *fi_tostr_(const void *data, enum fi_type datatype) fi_tostr_av_type(buf, 
 		break;
 	default:
-		strcat(buf, "Unknown type");
+		strcatf(buf, "Unknown type");
 		break;
 	}
 	return buf;
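The fi_tostr.c side of the patch swaps every raw strcat() for the strcatf() helper and bounds it with vsnprintf(), so an over-long attribute dump is truncated at BUFSIZ instead of overrunning the (now static, lazily allocated) buffer. The bounded-append pattern in isolation, with a throwaway main() added purely for illustration:

    /* Standalone demo of the bounded append used by the patched strcatf();
     * the fixed BUFSIZ buffer mirrors fi_tostr.c, the main() is invented. */
    #include <stdarg.h>
    #include <stdio.h>
    #include <string.h>

    static char buf[BUFSIZ];

    static void strcatf(char *dest, const char *fmt, ...)
    {
        size_t len = strlen(dest);
        va_list arglist;

        va_start(arglist, fmt);
        /* output (including the terminating NUL) is limited to
         * BUFSIZ - 1 - len bytes, so repeated appends never run
         * past the end of the buffer */
        vsnprintf(&dest[len], BUFSIZ - 1 - len, fmt, arglist);
        va_end(arglist);
    }

    int main(void)
    {
        buf[0] = '\0';
        for (int i = 0; i < 10000; i++)
            strcatf(buf, "FI_MSG, ");      /* plain strcat() would overflow */
        printf("final length: %zu\n", strlen(buf));   /* capped below BUFSIZ */
        return 0;
    }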