From 7d5755f62b0bff72b2bfd3d0158ab8485e264b30 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Sat, 24 Jan 2015 05:48:48 -0800 Subject: [PATCH] libfabric: update to ofiwg/libfabric@b3f7af4c67492d9b49ea0c6d7965560ba1cc8caf Pull down a new embedded copy of libfabric from https://github.com/ofiwg/libfabric. --- .../common/libfabric/libfabric/Makefile.am | 2 + opal/mca/common/libfabric/libfabric/README | 2 +- .../libfabric/libfabric/include/fi_enosys.h | 4 + .../libfabric/libfabric/include/rdma/fabric.h | 5 - .../libfabric/include/rdma/fi_endpoint.h | 19 ++ .../libfabric/libfabric/man/fi_endpoint.3 | 14 +- .../libfabric/libfabric/man/fi_getinfo.3 | 14 +- .../common/libfabric/libfabric/man/fi_msg.3 | 30 ++- .../libfabric/libfabric/man/fi_rx_size_left.3 | 1 + .../libfabric/libfabric/man/fi_tx_size_left.3 | 1 + .../libfabric/libfabric/prov/psm/src/psmx.h | 7 + .../libfabric/prov/psm/src/psmx_atomic.c | 8 +- .../libfabric/prov/psm/src/psmx_cq.c | 3 + .../libfabric/prov/psm/src/psmx_domain.c | 2 + .../libfabric/prov/psm/src/psmx_ep.c | 49 +++++ .../libfabric/prov/psm/src/psmx_mr.c | 46 +++-- .../libfabric/prov/psm/src/psmx_msg.c | 2 + .../libfabric/prov/psm/src/psmx_msg2.c | 2 + .../libfabric/prov/sockets/src/sock.h | 9 +- .../libfabric/prov/sockets/src/sock_av.c | 8 +- .../libfabric/prov/sockets/src/sock_conn.c | 178 +++++++++++------- .../libfabric/prov/sockets/src/sock_dom.c | 9 +- .../libfabric/prov/sockets/src/sock_ep.c | 4 +- .../libfabric/prov/sockets/src/sock_msg.c | 2 + .../prov/sockets/src/sock_progress.c | 35 ++-- .../libfabric/prov/usnic/src/usdf_dgram.c | 74 ++++++++ .../libfabric/prov/usnic/src/usdf_dgram.h | 4 + .../libfabric/prov/usnic/src/usdf_ep_dgram.c | 19 +- .../libfabric/prov/usnic/src/usdf_pep.c | 11 ++ .../libfabric/prov/verbs/src/fi_verbs.c | 80 ++++---- .../common/libfabric/libfabric/src/enosys.c | 8 + .../common/libfabric/libfabric/src/fabric.c | 103 +++++----- 32 files changed, 532 insertions(+), 223 deletions(-) create mode 100644 opal/mca/common/libfabric/libfabric/man/fi_rx_size_left.3 create mode 100644 opal/mca/common/libfabric/libfabric/man/fi_tx_size_left.3 diff --git a/opal/mca/common/libfabric/libfabric/Makefile.am b/opal/mca/common/libfabric/libfabric/Makefile.am index 86385c7150..174c553aaf 100644 --- a/opal/mca/common/libfabric/libfabric/Makefile.am +++ b/opal/mca/common/libfabric/libfabric/Makefile.am @@ -371,6 +371,7 @@ dummy_man_pages = \ man/fi_recvv.3 \ man/fi_reject.3 \ man/fi_rx_addr.3 \ + man/fi_rx_size_left.3 \ man/fi_send.3 \ man/fi_senddata.3 \ man/fi_sendmsg.3 \ @@ -389,6 +390,7 @@ dummy_man_pages = \ man/fi_tsenddata.3 \ man/fi_tsendmsg.3 \ man/fi_tsendv.3 \ + man/fi_tx_size_left.3 \ man/fi_wait.3 \ man/fi_wait_open.3 \ man/fi_write.3 \ diff --git a/opal/mca/common/libfabric/libfabric/README b/opal/mca/common/libfabric/libfabric/README index 5bf3647100..1f2c7411c1 100644 --- a/opal/mca/common/libfabric/libfabric/README +++ b/opal/mca/common/libfabric/libfabric/README @@ -1,7 +1,7 @@ This README is for userspace RDMA fabric library. Version Libfabric v0.0.2 -Released on 2015-01-15 +Released on 2015-01-24 Building ======== diff --git a/opal/mca/common/libfabric/libfabric/include/fi_enosys.h b/opal/mca/common/libfabric/libfabric/include/fi_enosys.h index 1acecaa64f..019d98c1fe 100644 --- a/opal/mca/common/libfabric/libfabric/include/fi_enosys.h +++ b/opal/mca/common/libfabric/libfabric/include/fi_enosys.h @@ -262,6 +262,8 @@ static struct fi_ops_msg X = { .inject = fi_no_msg_inject, .senddata = fi_no_msg_senddata, .injectdata = fi_no_msg_injectdata, + .rx_size_left = fi_no_msg_rx_size_left, + .tx_size_left = fi_no_msg_tx_size_left, }; */ ssize_t fi_no_msg_recv(struct fid_ep *ep, void *buf, size_t len, void *desc, @@ -282,6 +284,8 @@ ssize_t fi_no_msg_senddata(struct fid_ep *ep, const void *buf, size_t len, void uint64_t data, fi_addr_t dest_addr, void *context); ssize_t fi_no_msg_injectdata(struct fid_ep *ep, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr); +ssize_t fi_no_msg_rx_size_left(struct fid_ep *ep); +ssize_t fi_no_msg_tx_size_left(struct fid_ep *ep); /* static struct fi_ops_wait X = { diff --git a/opal/mca/common/libfabric/libfabric/include/rdma/fabric.h b/opal/mca/common/libfabric/libfabric/include/rdma/fabric.h index 29fb2caff8..70ff46b9c3 100644 --- a/opal/mca/common/libfabric/libfabric/include/rdma/fabric.h +++ b/opal/mca/common/libfabric/libfabric/include/rdma/fabric.h @@ -376,11 +376,6 @@ static inline int fi_close(struct fid *fid) return fid->ops->close(fid); } -static inline int fi_bind(struct fid *fid, struct fid *bfid, uint64_t flags) -{ - return fid->ops->bind(fid, bfid, flags); -} - struct fi_alias { struct fid **fid; uint64_t flags; diff --git a/opal/mca/common/libfabric/libfabric/include/rdma/fi_endpoint.h b/opal/mca/common/libfabric/libfabric/include/rdma/fi_endpoint.h index 407c1fd0c1..8503cda30d 100644 --- a/opal/mca/common/libfabric/libfabric/include/rdma/fi_endpoint.h +++ b/opal/mca/common/libfabric/libfabric/include/rdma/fi_endpoint.h @@ -99,6 +99,8 @@ struct fi_ops_msg { uint64_t data, fi_addr_t dest_addr, void *context); ssize_t (*injectdata)(struct fid_ep *ep, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr); + ssize_t (*rx_size_left)(struct fid_ep *ep); + ssize_t (*tx_size_left)(struct fid_ep *ep); }; struct fi_ops_cm; @@ -171,6 +173,11 @@ static inline int fi_ep_bind(struct fid_ep *ep, struct fid *bfid, uint64_t flags return ep->fid.ops->bind(&ep->fid, bfid, flags); } +static inline int fi_pep_bind(struct fid_pep *pep, struct fid *bfid, uint64_t flags) +{ + return pep->fid.ops->bind(&pep->fid, bfid, flags); +} + static inline int fi_scalable_ep_bind(struct fid_sep *sep, struct fid *bfid, uint64_t flags) { return sep->fid.ops->bind(&sep->fid, bfid, flags); @@ -291,6 +298,18 @@ fi_injectdata(struct fid_ep *ep, const void *buf, size_t len, return ep->msg->injectdata(ep, buf, len, data, dest_addr); } +static inline ssize_t +fi_rx_size_left(struct fid_ep *ep) +{ + return ep->msg->rx_size_left(ep); +} + +static inline ssize_t +fi_tx_size_left(struct fid_ep *ep) +{ + return ep->msg->tx_size_left(ep); +} + #else // FABRIC_DIRECT #include #endif diff --git a/opal/mca/common/libfabric/libfabric/man/fi_endpoint.3 b/opal/mca/common/libfabric/libfabric/man/fi_endpoint.3 index 13fae4c910..244d0d2698 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_endpoint.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_endpoint.3 @@ -1,4 +1,4 @@ -.TH fi_endpoint 3 "2015\-01\-08" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_endpoint 3 "2015\-01\-16" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .SH NAME .PP fi_endpoint - Fabric endpoint operations @@ -426,6 +426,13 @@ datagram queue pairs. protocol known as PSM, performance scaled messaging. PSMX is an extended version of the PSM protocol to support the libfabric interfaces. +.PP +\f[I]FI_PROTO_UDP\f[] : The protocol sends and receives UDP datagrams. +For example, an endpoint using \f[I]FI_PROTO_UDP\f[] will be able to +communicate with a remote peer that is using Berkeley +\f[I]SOCK_DGRAM\f[] sockets using \f[I]IPPROTO_UDP\f[]. +.PP +\f[I]FI_PROTO_SOCK_TCP\f[] : The protocol is layered over TCP packets. .SS protocol_version - Protocol Version .PP Identifies which version of the protocol is employed by the provider. @@ -954,8 +961,9 @@ Use of this flag may cause a single posted receive operation to generate multiple completions as messages are placed into the buffer. The placement of received data into the buffer may be subjected to provider specific alignment restrictions. -The buffer will be freed from the endpoint when a message is received -that cannot fit into the remaining free buffer space. +The buffer will be returned to the application\[aq]s control, and an +\f[I]FI_MULTI_RECV\f[] completion will be generated, when a message is +received that cannot fit into the remaining free buffer space. .PP \f[I]FI_BUFFERED_RECV\f[] : If set, the communication interface implementation should attempt to queue inbound data that arrives before diff --git a/opal/mca/common/libfabric/libfabric/man/fi_getinfo.3 b/opal/mca/common/libfabric/libfabric/man/fi_getinfo.3 index b9b94913d5..617dec154e 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_getinfo.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_getinfo.3 @@ -1,4 +1,4 @@ -.TH fi_getinfo 3 "2015\-01\-07" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_getinfo 3 "2015\-01\-20" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .SH NAME .PP fi_getinfo / fi_freeinfo - Obtain / free fabric interface information @@ -11,7 +11,7 @@ fi_getinfo / fi_freeinfo - Obtain / free fabric interface information int\ fi_getinfo(int\ version,\ const\ char\ *node,\ const\ char\ *service, \ \ \ \ \ \ \ \ uint64_t\ flags,\ struct\ fi_info\ *hints,\ struct\ fi_info\ **info); -int\ fi_freeinfo(struct\ fi_info\ *info); +void\ fi_freeinfo(struct\ fi_info\ *info); struct\ fi_info\ *fi_dupinfo(const\ struct\ fi_info\ *info); \f[] @@ -435,10 +435,10 @@ Such received messages will indicate a transfer size of 0 bytes. .PP \f[I]FI_PROV_MR_ATTR\f[] : The provider assigns one or more attributes associated with a memory registration request. -The provider will set this mode if it returns the the memory -registration keys that applications must use, or if it requires that the -MR offset associated with a memory region be the same as the virtual -address of the memory. +The provider will set this mode if it returns the memory registration +keys that applications must use, or if it requires that the MR offset +associated with a memory region be the same as the virtual address of +the memory. .PP Applications that support provider MR attributes will need to exchange MR parameters with remote peers for RMA and atomic operations. @@ -479,7 +479,7 @@ these operations. .PP A provider may support one or more of the following addressing formats. In some cases, a selected addressing format may need to be translated or -mapped into into an address which is native to the fabric. +mapped into an address which is native to the fabric. See \f[C]fi_av\f[](3). .PP \f[I]FI_FORMAT_UNSPEC\f[] : FI_FORMAT_UNSPEC indicates that a provider diff --git a/opal/mca/common/libfabric/libfabric/man/fi_msg.3 b/opal/mca/common/libfabric/libfabric/man/fi_msg.3 index 6f321d7cd4..5d44b89200 100644 --- a/opal/mca/common/libfabric/libfabric/man/fi_msg.3 +++ b/opal/mca/common/libfabric/libfabric/man/fi_msg.3 @@ -1,4 +1,4 @@ -.TH fi_msg 3 "2014\-12\-08" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH fi_msg 3 "2015\-01\-23" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .SH NAME .PP fi_msg - Message data transfer operations @@ -42,6 +42,10 @@ ssize_t\ fi_inject(struct\ fid_ep\ *ep,\ void\ *buf,\ size_t\ len, ssize_t\ fi_senddata(struct\ fid_ep\ *ep,\ void\ *buf,\ size_t\ len, \ \ \ \ void\ *desc,\ uint64_t\ data,\ fi_addr_t\ dest_addr,\ void\ *context); + +ssize_t\ fi_rx_size_left(struct\ fid_ep\ *ep); + +ssize_t\ fi_tx_size_left(struct\ fid_ep\ *ep); \f[] .fi .SH ARGUMENTS @@ -97,6 +101,10 @@ asynchronously. Users should not touch the posted data buffer(s) until the receive operation has completed. .PP +The "size_left" functions -- fi_rx_size_left, fi_tx_size_left -- return +a lower bound on the number of receive/send operations that may be +posted to the given endpoint without returning -FI_EAGAIN. +.PP Completed message operations are reported to the user through one or more event collectors associated with the endpoint. Users provide context which are associated with each operation, and is @@ -173,6 +181,26 @@ The fi_recvmsg call supports posting buffers over both connected and unconnected endpoints, with the ability to control the receive operation per call through the use of flags. The fi_recvmsg function takes a struct fi_msg as input. +.SS fi_rx_size_left +.PP +The fi_rx_size_left call returns a lower bound on the number of receive +operations that may be posted to the given endpoint without that +operation returning -FI_EAGAIN. +Depending on the specific details of the subsequently posted receive +operations (e.g., number of iov entries, which receive function is +called, etc.) +, it may be possible to post more receive operations than originally +indicated by fi_rx_size_left. +.SS fi_tx_size_left +.PP +The fi_tx_size_left call returns a lower bound on the number of send +operations that may be posted to the given endpoint without that +operation returning -FI_EAGAIN. +Depending on the specific details of the subsequently posted send +operations (e.g., number of iov entries, which send function is called, +etc.) +, it may be possible to post more send operations than originally +indicated by fi_tx_size_left. .SH FLAGS .PP The fi_recvmsg and fi_sendmsg calls allow the user to specify flags diff --git a/opal/mca/common/libfabric/libfabric/man/fi_rx_size_left.3 b/opal/mca/common/libfabric/libfabric/man/fi_rx_size_left.3 new file mode 100644 index 0000000000..7c0724c989 --- /dev/null +++ b/opal/mca/common/libfabric/libfabric/man/fi_rx_size_left.3 @@ -0,0 +1 @@ +.so man3/fi_msg.3 diff --git a/opal/mca/common/libfabric/libfabric/man/fi_tx_size_left.3 b/opal/mca/common/libfabric/libfabric/man/fi_tx_size_left.3 new file mode 100644 index 0000000000..7c0724c989 --- /dev/null +++ b/opal/mca/common/libfabric/libfabric/man/fi_tx_size_left.3 @@ -0,0 +1 @@ +.so man3/fi_msg.3 diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx.h b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx.h index 96cd31aab4..cfa550122a 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx.h +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx.h @@ -527,6 +527,11 @@ struct psmx_fid_ep { size_t min_multi_recv; }; +struct psmx_fid_stx { + struct fid_stx stx; + struct psmx_fid_domain *domain; +}; + struct psmx_fid_mr { struct fid_mr mr; struct psmx_fid_domain *domain; @@ -577,6 +582,8 @@ int psmx_wait_open(struct fid_fabric *fabric, struct fi_wait_attr *attr, struct fid_wait **waitset); int psmx_ep_open(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context); +int psmx_stx_ctx(struct fid_domain *domain, struct fi_tx_attr *attr, + struct fid_stx **stx, void *context); int psmx_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, struct fid_cq **cq, void *context); int psmx_av_open(struct fid_domain *domain, struct fi_av_attr *attr, diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_atomic.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_atomic.c index 6c45bdbcb9..abe04a59ee 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_atomic.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_atomic.c @@ -438,6 +438,10 @@ int psmx_am_atomic_handler(psm_am_token_t token, psm_epaddr_t epaddr, key = args[3].u64; datatype = args[4].u32w0; op = args[4].u32w1; + + if (op == FI_ATOMIC_READ) + len = fi_datatype_size(datatype) * count; + assert(len == fi_datatype_size(datatype) * count); mr = psmx_mr_hash_get(key); @@ -994,7 +998,7 @@ ssize_t _psmx_atomic_readwrite(struct fid_ep *ep, return 0; } - if (!buf) + if (!buf && op != FI_ATOMIC_READ) return -EINVAL; if (datatype < 0 || datatype >= FI_DATATYPE_LAST) @@ -1062,7 +1066,7 @@ ssize_t _psmx_atomic_readwrite(struct fid_ep *ep, args[4].u32w1 = op; err = psm_am_request_short((psm_epaddr_t) dest_addr, PSMX_AM_ATOMIC_HANDLER, args, 5, - (void *)buf, len, am_flags, NULL, NULL); + (void *)buf, (buf?len:0), am_flags, NULL, NULL); return 0; } diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_cq.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_cq.c index 631a78e32b..ce4f8d40bf 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_cq.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_cq.c @@ -191,12 +191,14 @@ static struct psmx_cq_event *psmx_cq_create_event_from_status( case FI_CQ_FORMAT_MSG: event->cqe.msg.op_context = op_context; + event->cqe.msg.flags = 0; event->cqe.msg.len = psm_status->nbytes; break; case FI_CQ_FORMAT_DATA: event->cqe.data.op_context = op_context; event->cqe.data.buf = buf; + event->cqe.data.flags = 0; event->cqe.data.len = psm_status->nbytes; if (data) event->cqe.data.data = data; @@ -205,6 +207,7 @@ static struct psmx_cq_event *psmx_cq_create_event_from_status( case FI_CQ_FORMAT_TAGGED: event->cqe.tagged.op_context = op_context; event->cqe.tagged.buf = buf; + event->cqe.tagged.flags = 0; event->cqe.tagged.len = psm_status->nbytes; event->cqe.tagged.tag = psm_status->msg_tag; if (data) diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_domain.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_domain.c index 52aad11712..98def5b6fa 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_domain.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_domain.c @@ -83,6 +83,8 @@ static struct fi_ops_domain psmx_domain_ops = { .endpoint = psmx_ep_open, .cntr_open = psmx_cntr_open, .poll_open = psmx_poll_open, + .stx_ctx = psmx_stx_ctx, + .srx_ctx = fi_no_srx_context, }; int psmx_domain_open(struct fid_fabric *fabric, struct fi_info *info, diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_ep.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_ep.c index 2571ff0ccf..0fe43ab31a 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_ep.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_ep.c @@ -160,6 +160,7 @@ static int psmx_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) struct psmx_fid_av *av; struct psmx_fid_cq *cq; struct psmx_fid_cntr *cntr; + struct psmx_fid_stx *stx; int err; ep = container_of(fid, struct psmx_fid_ep, ep.fid); @@ -222,6 +223,13 @@ static int psmx_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) return err; break; + case FI_CLASS_STX_CTX: + stx = container_of(bfid, + struct psmx_fid_stx, stx.fid); + if (ep->domain != stx->domain) + return -EINVAL; + break; + default: return -ENOSYS; } @@ -361,3 +369,44 @@ int psmx_ep_open(struct fid_domain *domain, struct fi_info *info, return 0; } +/* STX support is essentially no-op since PSM supports only one send/recv + * context and thus always works in shared context mode. + */ + +static int psmx_stx_close(fid_t fid) +{ + struct psmx_fid_stx *stx; + + stx = container_of(fid, struct psmx_fid_stx, stx.fid); + free(stx); + + return 0; +} + +static struct fi_ops psmx_fi_ops_stx = { + .size = sizeof(struct fi_ops), + .close = psmx_stx_close, + .bind = fi_no_bind, + .control = fi_no_control, +}; + +int psmx_stx_ctx(struct fid_domain *domain, struct fi_tx_attr *attr, + struct fid_stx **stx, void *context) +{ + struct psmx_fid_stx *stx_priv; + + psmx_debug("%s\n", __func__); + + stx_priv = (struct psmx_fid_stx *) calloc(1, sizeof *stx_priv); + if (!stx_priv) + return -ENOMEM; + + stx_priv->stx.fid.fclass = FI_CLASS_STX_CTX; + stx_priv->stx.fid.context = context; + stx_priv->stx.fid.ops = &psmx_fi_ops_stx; + stx_priv->domain = container_of(domain, struct psmx_fid_domain, domain.fid); + + *stx = &stx_priv->stx; + return 0; +} + diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_mr.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_mr.c index 9799bcb94e..cac942eda6 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_mr.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_mr.c @@ -270,8 +270,12 @@ static int psmx_mr_reg(struct fid_domain *domain, const void *buf, size_t len, uint64_t key; domain_priv = container_of(domain, struct psmx_fid_domain, domain); - if (!(domain_priv->mode & FI_PROV_MR_ATTR) && psmx_mr_hash_get(requested_key)) + if (flags & FI_MR_KEY) { + if (domain_priv->mode & FI_PROV_MR_ATTR) + return -FI_EBADFLAGS; + if (psmx_mr_hash_get(requested_key)) return -FI_ENOKEY; + } mr_priv = (struct psmx_fid_mr *) calloc(1, sizeof(*mr_priv) + sizeof(struct iovec)); if (!mr_priv) @@ -281,7 +285,7 @@ static int psmx_mr_reg(struct fid_domain *domain, const void *buf, size_t len, mr_priv->mr.fid.context = context; mr_priv->mr.fid.ops = &psmx_fi_ops; mr_priv->mr.mem_desc = mr_priv; - if (!(domain_priv->mode & FI_PROV_MR_ATTR)) { + if (flags & FI_MR_KEY) { key = requested_key; } else { @@ -293,10 +297,12 @@ static int psmx_mr_reg(struct fid_domain *domain, const void *buf, size_t len, mr_priv->domain = domain_priv; mr_priv->access = access; mr_priv->flags = flags; - mr_priv->offset = (flags & FI_MR_OFFSET) ? offset : 0; mr_priv->iov_count = 1; mr_priv->iov[0].iov_base = (void *)buf; mr_priv->iov[0].iov_len = len; + mr_priv->offset = (flags & FI_MR_OFFSET) ? + ((uint64_t)mr_priv->iov[0].iov_base - offset) : + 0; psmx_mr_hash_add(mr_priv); @@ -316,8 +322,12 @@ static int psmx_mr_regv(struct fid_domain *domain, uint64_t key; domain_priv = container_of(domain, struct psmx_fid_domain, domain); - if (!(domain_priv->mode & FI_PROV_MR_ATTR) && psmx_mr_hash_get(requested_key)) + if (flags & FI_MR_KEY) { + if (domain_priv->mode & FI_PROV_MR_ATTR) + return -FI_EBADFLAGS; + if (psmx_mr_hash_get(requested_key)) return -FI_ENOKEY; + } if (count == 0 || iov == NULL) return -EINVAL; @@ -332,7 +342,7 @@ static int psmx_mr_regv(struct fid_domain *domain, mr_priv->mr.fid.context = context; mr_priv->mr.fid.ops = &psmx_fi_ops; mr_priv->mr.mem_desc = mr_priv; - if (!(domain_priv->mode & FI_PROV_MR_ATTR)) { + if (flags & FI_MR_KEY) { key = requested_key; } else { @@ -344,12 +354,14 @@ static int psmx_mr_regv(struct fid_domain *domain, mr_priv->domain = domain_priv; mr_priv->access = access; mr_priv->flags = flags; - mr_priv->offset = (flags & FI_MR_OFFSET) ? offset : 0; mr_priv->iov_count = count; for (i=0; iiov[i] = iov[i]; - psmx_mr_normalize_iov(mr_priv->iov, &mr_priv->iov_count); + mr_priv->offset = (flags & FI_MR_OFFSET) ? + ((uint64_t)mr_priv->iov[0].iov_base - offset) : + 0; + psmx_mr_hash_add(mr_priv); *mr = &mr_priv->mr; @@ -366,8 +378,12 @@ static int psmx_mr_regattr(struct fid_domain *domain, const struct fi_mr_attr *a uint64_t key; domain_priv = container_of(domain, struct psmx_fid_domain, domain); - if (!(domain_priv->mode & FI_PROV_MR_ATTR) && psmx_mr_hash_get(attr->requested_key)) + if (flags & FI_MR_KEY) { + if (domain_priv->mode & FI_PROV_MR_ATTR) + return -FI_EBADFLAGS; + if (psmx_mr_hash_get(attr->requested_key)) return -FI_ENOKEY; + } if (!attr) return -EINVAL; @@ -382,9 +398,10 @@ static int psmx_mr_regattr(struct fid_domain *domain, const struct fi_mr_attr *a return -ENOMEM; mr_priv->mr.fid.fclass = FI_CLASS_MR; + mr_priv->mr.fid.context = attr->context; mr_priv->mr.fid.ops = &psmx_fi_ops; mr_priv->mr.mem_desc = mr_priv; - if (!(domain_priv->mode & FI_PROV_MR_ATTR)) { + if (flags & FI_MR_KEY) { key = attr->requested_key; } else { @@ -394,17 +411,16 @@ static int psmx_mr_regattr(struct fid_domain *domain, const struct fi_mr_attr *a } mr_priv->mr.key = key; mr_priv->domain = domain_priv; - mr_priv->access = FI_READ | FI_WRITE | FI_REMOTE_READ | FI_REMOTE_WRITE; + mr_priv->access = attr->access; mr_priv->flags = flags; - mr_priv->offset = (flags & FI_MR_OFFSET) ? attr->offset : 0; mr_priv->iov_count = attr->iov_count; for (i=0; iiov_count; i++) mr_priv->iov[i] = attr->mr_iov[i]; - - mr_priv->mr.fid.context = attr->context; - mr_priv->access = attr->access; - psmx_mr_normalize_iov(mr_priv->iov, &mr_priv->iov_count); + mr_priv->offset = (flags & FI_MR_OFFSET) ? + ((uint64_t)mr_priv->iov[0].iov_base - attr->offset) : + 0; + psmx_mr_hash_add(mr_priv); *mr = &mr_priv->mr; diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg.c index adaef4943e..2de6baa823 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg.c @@ -359,5 +359,7 @@ struct fi_ops_msg psmx_msg_ops = { .inject = psmx_inject, .senddata = fi_no_msg_senddata, .injectdata = fi_no_msg_injectdata, + .rx_size_left = fi_no_msg_rx_size_left, + .tx_size_left = fi_no_msg_tx_size_left, }; diff --git a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg2.c b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg2.c index ca4b2292c5..d05bbf64bb 100644 --- a/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg2.c +++ b/opal/mca/common/libfabric/libfabric/prov/psm/src/psmx_msg2.c @@ -626,5 +626,7 @@ struct fi_ops_msg psmx_msg2_ops = { .inject = psmx_inject2, .senddata = fi_no_msg_senddata, .injectdata = fi_no_msg_injectdata, + .rx_size_left = fi_no_msg_rx_size_left, + .tx_size_left = fi_no_msg_tx_size_left, }; diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock.h b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock.h index fba3fcf792..b31dcecd64 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock.h +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock.h @@ -136,6 +136,7 @@ struct sock_conn_map { int used; int size; struct sock_domain *domain; + fastlock_t lock; }; struct sock_domain { @@ -151,7 +152,6 @@ struct sock_domain { enum fi_progress progress_mode; struct index_map mr_idm; struct sock_pe *pe; - struct sock_conn_map u_cmap; struct sock_conn_map r_cmap; pthread_t listen_thread; int listening; @@ -848,9 +848,14 @@ int sock_av_compare_addr(struct sock_av *av, struct sock_conn *sock_conn_map_lookup_key(struct sock_conn_map *conn_map, uint16_t key); +uint16_t sock_conn_map_connect(struct sock_domain *dom, + struct sock_conn_map *map, + struct sockaddr_in *addr); +uint16_t sock_conn_map_lookup(struct sock_conn_map *map, + struct sockaddr_in *addr); uint16_t sock_conn_map_match_or_connect(struct sock_domain *dom, struct sock_conn_map *map, - struct sockaddr_in *addr, int match_only); + struct sockaddr_in *addr); int sock_conn_listen(struct sock_domain *domain); int sock_conn_map_clear_pe_entry(struct sock_conn *conn_entry, uint16_t key); void sock_conn_map_destroy(struct sock_conn_map *cmap); diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_av.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_av.c index 9740fa17c5..a758acdffe 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_av.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_av.c @@ -60,9 +60,9 @@ fi_addr_t sock_av_lookup_key(struct sock_av *av, int key) idx = av_addr - &av->table[0]; if (!av->key[idx]) { - av->key[idx] = sock_conn_map_match_or_connect( - av->domain, av->cmap, - (struct sockaddr_in*)&av_addr->addr, 1); + av->key[idx] = sock_conn_map_lookup( + av->cmap, + (struct sockaddr_in*)&av_addr->addr); if (!av->key[idx]) { continue; } @@ -123,7 +123,7 @@ struct sock_conn *sock_av_lookup_addr(struct sock_av *av, if (!av->key[idx]) { av->key[idx] = sock_conn_map_match_or_connect( av->domain, av->cmap, - (struct sockaddr_in*)&av_addr->addr, 0); + (struct sockaddr_in*)&av_addr->addr); if (!av->key[idx]) { SOCK_LOG_ERROR("failed to match or connect to addr %lu\n", addr); errno = EINVAL; diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_conn.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_conn.c index a674932116..ded6f51cfd 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_conn.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_conn.c @@ -53,9 +53,9 @@ #include "sock.h" #include "sock_util.h" -static int _init_map(struct sock_conn_map *map, int init_size) +static int sock_conn_map_init(struct sock_conn_map *map, int init_size) { - map->table = (struct sock_conn*)calloc(init_size, + map->table = (struct sock_conn*)calloc(init_size, sizeof(struct sock_conn)); if (!map->table) return -FI_ENOMEM; @@ -64,7 +64,7 @@ static int _init_map(struct sock_conn_map *map, int init_size) return 0; } -static int _increase_map(struct sock_conn_map *map, int new_size) +static int sock_conn_map_increase(struct sock_conn_map *map, int new_size) { if (map->used + new_size > map->size) { void *_table = realloc(map->table, map->size * sizeof(struct @@ -75,7 +75,6 @@ static int _increase_map(struct sock_conn_map *map, int new_size) map->size = MAX(map->size, new_size) * 2; map->table = (struct sock_conn*) _table; } - return 0; } @@ -103,32 +102,60 @@ struct sock_conn *sock_conn_map_lookup_key(struct sock_conn_map *conn_map, #define SOCK_ADDR_IN_PORT(sa)SOCK_ADDR_IN_PTR(sa)->sin_port #define SOCK_ADDR_IN_ADDR(sa)SOCK_ADDR_IN_PTR(sa)->sin_addr -uint16_t sock_conn_map_match_or_connect(struct sock_domain *dom, - struct sock_conn_map *map, - struct sockaddr_in *addr, int match_only) +uint16_t sock_conn_map_lookup(struct sock_conn_map *map, + struct sockaddr_in *addr) { - int i, conn_fd, arg, optval; - socklen_t optlen; - char sa_ip[INET_ADDRSTRLEN]; + int i; struct sockaddr_in *entry; - struct timeval tv; - fd_set fds; - struct sock_conn *conn; - - /* match */ + fastlock_acquire(&map->lock); for (i=0; i < map->used; i++) { entry = (struct sockaddr_in *)&(map->table[i].addr); if ((SOCK_ADDR_IN_ADDR(entry).s_addr == SOCK_ADDR_IN_ADDR(addr).s_addr) && (SOCK_ADDR_IN_PORT(entry) == SOCK_ADDR_IN_PORT(addr))) { + fastlock_release(&map->lock); return i+1; } } + fastlock_release(&map->lock); + return 0; +} - if (match_only) - return 0; +static int sock_conn_map_insert(struct sock_conn_map *map, + struct sockaddr_in *addr, + int conn_fd) +{ + int index; + fastlock_acquire(&map->lock); + + if (map->size == map->used) { + if (sock_conn_map_increase(map, map->size * 2)) { + fastlock_release(&map->lock); + return 0; + } + } + + index = map->used; + memcpy(&map->table[index].addr, addr, sizeof *addr); + map->table[index].sock_fd = conn_fd; + sock_comm_buffer_init(&map->table[index]); + map->used++; + fastlock_release(&map->lock); + return index + 1; +} + +uint16_t sock_conn_map_connect(struct sock_domain *dom, + struct sock_conn_map *map, + struct sockaddr_in *addr) +{ + int conn_fd, optval, ret; + char sa_ip[INET_ADDRSTRLEN]; + unsigned short reply; + struct timeval tv; + socklen_t optlen; + uint64_t flags; + fd_set fds; - /* no matching entry, connect */ conn_fd = socket(AF_INET, SOCK_STREAM, 0); if (conn_fd < 0) { SOCK_LOG_ERROR("failed to create conn_fd, errno: %d\n", errno); @@ -137,22 +164,13 @@ uint16_t sock_conn_map_match_or_connect(struct sock_domain *dom, optval = 1; setsockopt(conn_fd, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof optval); - optval = 1; - setsockopt(conn_fd, SOL_SOCKET, SO_REUSEPORT, &optval, sizeof optval); - - fcntl(conn_fd, F_SETFL, O_NONBLOCK); + memcpy(sa_ip, inet_ntoa(addr->sin_addr), INET_ADDRSTRLEN); SOCK_LOG_INFO("Connecting to: %s:%d\n", sa_ip, ntohs(((struct sockaddr_in*)addr)->sin_port)); - - if (bind(conn_fd, (struct sockaddr*)&dom->src_addr, - sizeof(struct sockaddr_in)) != 0) { - SOCK_LOG_ERROR("Cannot bind to src_addr\n"); - return 0; - } - - SOCK_LOG_INFO("Binding conn_fd to port: %d\n", - ntohs(((struct sockaddr_in*)&dom->src_addr)->sin_port)); + + flags = fcntl(conn_fd, F_GETFL, 0); + fcntl(conn_fd, F_SETFL, flags | O_NONBLOCK); if (connect(conn_fd, addr, sizeof *addr) < 0) { if (errno == EINPROGRESS) { @@ -167,53 +185,79 @@ uint16_t sock_conn_map_match_or_connect(struct sock_domain *dom, if (optval) { SOCK_LOG_ERROR("failed to connect %d - %s\n", optval, - strerror(optval)); + strerror(optval)); close(conn_fd); return 0; } } else { SOCK_LOG_ERROR("Timeout or error to connect %d - %s\n", optval, - strerror(optval)); + strerror(optval)); close(conn_fd); return 0; } } else { SOCK_LOG_ERROR("Error connecting %d - %s\n", errno, - strerror(errno)); + strerror(errno)); close(conn_fd); return 0; } } + + flags = fcntl(conn_fd, F_GETFL, 0); + flags &= (~O_NONBLOCK); + fcntl(conn_fd, F_SETFL, flags); + + ret = send(conn_fd, + &((struct sockaddr_in*)&dom->src_addr)->sin_port, + sizeof(unsigned short), 0); + if (ret != sizeof(unsigned short)) { + SOCK_LOG_ERROR("Cannot exchange port\n"); + return 0; + } - arg = fcntl(conn_fd, F_GETFL, NULL); - arg &= (~O_NONBLOCK); - fcntl(conn_fd, F_SETFL, arg); + ret = recv(conn_fd, + &reply, sizeof(unsigned short), 0); + if (ret != sizeof(unsigned short)) { + SOCK_LOG_ERROR("Cannot exchange port: %d\n", ret); + return 0; + } - memcpy(&map->table[map->used].addr, addr, sizeof *addr); - map->table[map->used].sock_fd = conn_fd; - - conn = &map->table[map->used]; - sock_comm_buffer_init(conn); - - map->used++; - return map->used; + reply = ntohs(reply); + SOCK_LOG_INFO("Connect response: %d\n", ntohs(reply)); + if (reply == 0) { + ret = sock_conn_map_insert(map, addr, conn_fd); + } else { + ret = sock_conn_map_lookup(map, addr); + close(conn_fd); + } + return ret; } -static void * _sock_conn_listen(void *arg) +uint16_t sock_conn_map_match_or_connect(struct sock_domain *dom, + struct sock_conn_map *map, + struct sockaddr_in *addr) +{ + uint16_t index; + return (index = sock_conn_map_lookup(map, addr)) ? + index : sock_conn_map_connect(dom, map, addr); +} + +static void *_sock_conn_listen(void *arg) { struct sock_domain *domain = (struct sock_domain*) arg; struct sock_conn_map *map = &domain->r_cmap; struct addrinfo *s_res = NULL, *p; struct addrinfo hints; int optval, flags, tmp; - int listen_fd = 0, conn_fd; + int listen_fd = 0, conn_fd, ret; struct sockaddr_in remote; socklen_t addr_size; - struct sock_conn *conn; struct pollfd poll_fds[2]; char service[NI_MAXSERV]; struct sockaddr_in addr; char sa_ip[INET_ADDRSTRLEN]; + unsigned short port; + uint16_t index; memset(&hints, 0, sizeof(hints)); hints.ai_family = AF_INET; @@ -235,13 +279,9 @@ static void * _sock_conn_listen(void *arg) fcntl(listen_fd, F_SETFL, flags | O_NONBLOCK); optval = 1; - setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof - optval); - optval = 1; - setsockopt(listen_fd, SOL_SOCKET, SO_REUSEPORT, &optval, sizeof - optval); - - + setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &optval, + sizeof optval); + if (!bind(listen_fd, s_res->ai_addr, s_res->ai_addrlen)) break; close(listen_fd); @@ -263,7 +303,7 @@ static void * _sock_conn_listen(void *arg) SOCK_LOG_INFO("Bound to port: %d\n", domain->service); } - if (listen(listen_fd, 128)) { + if (listen(listen_fd, 0)) { SOCK_LOG_ERROR("failed to listen socket: %d\n", errno); goto err; } @@ -294,21 +334,25 @@ static void * _sock_conn_listen(void *arg) addr_size = sizeof(struct sockaddr_in); getpeername(conn_fd, &remote, &addr_size); - memcpy(sa_ip, inet_ntoa(remote.sin_addr), INET_ADDRSTRLEN); SOCK_LOG_INFO("ACCEPT: %s, %d\n", sa_ip, ntohs(remote.sin_port)); - /* TODO: lock for multi-threads */ - if ((map->size - map->used) == 0) { - _increase_map(map, map->size*2); - } - memcpy(&map->table[map->used].addr, &remote, addr_size); - map->table[map->used].sock_fd = conn_fd; + ret = recv(conn_fd, &port, sizeof(port), 0); + if (ret != sizeof(port)) + SOCK_LOG_ERROR("Cannot exchange port\n"); - conn = &map->table[map->used]; - sock_comm_buffer_init(conn); + remote.sin_port = port; + SOCK_LOG_INFO("Remote port: %d\n", ntohs(port)); + index = sock_conn_map_lookup(map, &remote); + port = (index) ? 1 : 0; + ret = send(conn_fd, &port, sizeof(port), 0); + if (ret != sizeof(port)) + SOCK_LOG_ERROR("Cannot exchange port\n"); - map->used++; + if (index == 0) + sock_conn_map_insert(map, &remote, conn_fd); + else + close(conn_fd); } close(listen_fd); @@ -322,7 +366,7 @@ err: int sock_conn_listen(struct sock_domain *domain) { - _init_map(&domain->r_cmap, 128); /* TODO: init cmap size */ + sock_conn_map_init(&domain->r_cmap, 128); /* TODO: init cmap size */ pthread_create(&domain->listen_thread, 0, _sock_conn_listen, domain); return 0; } diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_dom.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_dom.c index 816c63708c..f66d72921d 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_dom.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_dom.c @@ -133,10 +133,9 @@ static int sock_dom_close(struct fid *fid) return -FI_EBUSY; } - if (dom->u_cmap.size) - sock_conn_map_destroy(&dom->u_cmap); if (dom->r_cmap.size) sock_conn_map_destroy(&dom->r_cmap); + fastlock_destroy(&dom->r_cmap.lock); sock_pe_finalize(dom->pe); fastlock_destroy(&dom->lock); @@ -267,7 +266,8 @@ static int sock_regattr(struct fid_domain *domain, const struct fi_mr_attr *attr _mr->access = attr->access; _mr->flags = flags; _mr->offset = (flags & FI_MR_OFFSET) ? - attr->offset : (uintptr_t) attr->mr_iov[0].iov_base; + (uintptr_t) attr->mr_iov[0].iov_base + attr->offset : + (uintptr_t) attr->mr_iov[0].iov_base; fastlock_acquire(&dom->lock); key = (dom->info.mode & FI_PROV_MR_ATTR) ? @@ -456,8 +456,7 @@ int sock_domain(struct fid_fabric *fabric, struct fi_info *info, } sock_domain->r_cmap.domain = sock_domain; - sock_domain->u_cmap.domain = sock_domain; - + fastlock_init(&sock_domain->r_cmap.lock); if(socketpair(AF_UNIX, SOCK_STREAM, 0, sock_domain->signal_fds) < 0) goto err; diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep.c index 25f16a49d3..b3d1dfadbb 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_ep.c @@ -952,6 +952,8 @@ int sock_stx_ctx(struct fid_domain *domain, return -FI_ENOMEM; tx_ctx->domain = dom; + tx_ctx->fid.ctx.fid.fclass = FI_CLASS_STX_CTX; + tx_ctx->fid.stx.fid.ops = &sock_ctx_ops; tx_ctx->fid.stx.ops = &sock_ep_ops; atomic_inc(&dom->ref); @@ -1188,7 +1190,7 @@ struct sock_conn *sock_ep_lookup_conn(struct sock_ep *ep) { if (!ep->key) { ep->key = sock_conn_map_match_or_connect( - ep->domain, &ep->domain->r_cmap, ep->dest_addr, 0); + ep->domain, &ep->domain->r_cmap, ep->dest_addr); if (!ep->key) { SOCK_LOG_ERROR("failed to match or connect to addr\n"); errno = EINVAL; diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_msg.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_msg.c index 715e311459..a5c5447e95 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_msg.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_msg.c @@ -317,6 +317,8 @@ struct fi_ops_msg sock_ep_msg_ops = { .sendmsg = sock_ep_sendmsg, .inject = sock_ep_inject, .senddata = sock_ep_senddata, + .rx_size_left = fi_no_msg_rx_size_left, + .tx_size_left = fi_no_msg_tx_size_left, }; static ssize_t sock_ep_trecvmsg(struct fid_ep *ep, diff --git a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_progress.c b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_progress.c index 598bd717f0..a9f287cfd8 100644 --- a/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_progress.c +++ b/opal/mca/common/libfabric/libfabric/prov/sockets/src/sock_progress.c @@ -552,23 +552,26 @@ static int sock_pe_process_rx_write(struct sock_pe *pe, struct sock_rx_ctx *rx_c rem = pe_entry->msg_hdr.msg_len - len; for (i = 0; rem > 0 && i < pe_entry->msg_hdr.dest_iov_len; i++) { - mr = sock_mr_verify_key(rx_ctx->domain, - pe_entry->pe.rx.rx_iov[i].iov.key, - (void*)pe_entry->pe.rx.rx_iov[i].iov.addr, - pe_entry->pe.rx.rx_iov[i].iov.len, - FI_REMOTE_WRITE); - if (!mr) { - SOCK_LOG_ERROR("Remote memory access error: %p, %lu, %lu\n", - (void*)pe_entry->pe.rx.rx_iov[i].iov.addr, - pe_entry->pe.rx.rx_iov[i].iov.len, - pe_entry->pe.rx.rx_iov[i].iov.key); - sock_pe_send_response(pe, pe_entry, 0, SOCK_OP_WRITE_ERROR); - break; + if ((len - pe_entry->done_len) == pe_entry->pe.rx.rx_iov[i].iov.addr) { + mr = sock_mr_verify_key(rx_ctx->domain, + pe_entry->pe.rx.rx_iov[i].iov.key, + (void*)pe_entry->pe.rx.rx_iov[i].iov.addr, + pe_entry->pe.rx.rx_iov[i].iov.len, + FI_REMOTE_WRITE); + if (!mr) { + SOCK_LOG_ERROR("Remote memory access error: %p, %lu, %lu\n", + (void*)pe_entry->pe.rx.rx_iov[i].iov.addr, + pe_entry->pe.rx.rx_iov[i].iov.len, + pe_entry->pe.rx.rx_iov[i].iov.key); + sock_pe_send_response(pe, pe_entry, 0, + SOCK_OP_WRITE_ERROR); + break; + } + + if (mr->flags & FI_MR_OFFSET) + pe_entry->pe.rx.rx_iov[i].iov.addr += mr->offset; } - - if (mr->flags & FI_MR_OFFSET) - pe_entry->pe.rx.rx_iov[i].iov.addr += mr->offset; - + if (sock_pe_recv_field(pe_entry, (void*)pe_entry->pe.rx.rx_iov[i].iov.addr, pe_entry->pe.rx.rx_iov[i].iov.len, len)) diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_dgram.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_dgram.c index 16c87a14df..6b209c7754 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_dgram.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_dgram.c @@ -315,6 +315,44 @@ usdf_dgram_inject(struct fid_ep *fep, const void *buf, size_t len, return 0; } +ssize_t usdf_dgram_rx_size_left(struct fid_ep *fep) +{ + struct usdf_ep *ep; + + if (fep == NULL) + return -FI_EINVAL; + + ep = ep_ftou(fep); + + if (ep->e.dg.ep_qp == NULL) + return -FI_EOPBADSTATE; /* EP not enabled */ + + /* NOTE-SIZE-LEFT: divide by constant right now, rather than keeping + * track of the rx_attr->iov_limit value we gave to the user. This + * sometimes under-reports the number of RX ops that could be posted, + * but it avoids touching a cache line that we don't otherwise need. + * + * sendv/recvv could potentially post iov_limit+1 descriptors + */ + return usd_get_recv_credits(ep->e.dg.ep_qp) / (USDF_DGRAM_DFLT_SGE + 1); +} + +ssize_t usdf_dgram_tx_size_left(struct fid_ep *fep) +{ + struct usdf_ep *ep; + + if (fep == NULL) + return -FI_EINVAL; + + ep = ep_ftou(fep); + + if (ep->e.dg.ep_qp == NULL) + return -FI_EOPBADSTATE; /* EP not enabled */ + + /* see NOTE-SIZE-LEFT */ + return usd_get_send_credits(ep->e.dg.ep_qp) / (USDF_DGRAM_DFLT_SGE + 1); +} + /* * Versions that rely on user to reserve space for header at start of buffer */ @@ -482,4 +520,40 @@ usdf_dgram_prefix_sendmsg(struct fid_ep *fep, const struct fi_msg *msg, uint64_t (fi_addr_t)msg->addr, msg->context); } +ssize_t usdf_dgram_prefix_rx_size_left(struct fid_ep *fep) +{ + struct usdf_ep *ep; + + if (fep == NULL) + return -FI_EINVAL; + + ep = ep_ftou(fep); + + if (ep->e.dg.ep_qp == NULL) + return -FI_EOPBADSTATE; /* EP not enabled */ + + /* prefix_recvv can post up to iov_limit descriptors + * + * also see NOTE-SIZE-LEFT */ + return (usd_get_recv_credits(ep->e.dg.ep_qp) / USDF_DGRAM_DFLT_SGE); +} + +ssize_t usdf_dgram_prefix_tx_size_left(struct fid_ep *fep) +{ + struct usdf_ep *ep; + + if (fep == NULL) + return -FI_EINVAL; + + ep = ep_ftou(fep); + + if (ep->e.dg.ep_qp == NULL) + return -FI_EOPBADSTATE; /* EP not enabled */ + + /* prefix_sendvcan post up to iov_limit descriptors + * + * also see NOTE-SIZE-LEFT */ + return (usd_get_send_credits(ep->e.dg.ep_qp) / USDF_DGRAM_DFLT_SGE); +} + diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_dgram.h b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_dgram.h index edfab76afd..a767041ae6 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_dgram.h +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_dgram.h @@ -56,6 +56,8 @@ ssize_t usdf_dgram_inject(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr); ssize_t usdf_dgram_senddata(struct fid_ep *ep, const void *buf, size_t len, void *desc, uint64_t data, fi_addr_t dest_addr, void *context); +ssize_t usdf_dgram_rx_size_left(struct fid_ep *ep); +ssize_t usdf_dgram_tx_size_left(struct fid_ep *ep); ssize_t usdf_dgram_prefix_recv(struct fid_ep *ep, void *buf, size_t len, void *desc, fi_addr_t src_addr, void *context); @@ -69,5 +71,7 @@ ssize_t usdf_dgram_prefix_sendv(struct fid_ep *fep, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, void *context); ssize_t usdf_dgram_prefix_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, uint64_t flags); +ssize_t usdf_dgram_prefix_rx_size_left(struct fid_ep *ep); +ssize_t usdf_dgram_prefix_tx_size_left(struct fid_ep *ep); #endif /* _USDF_DGRAM_H_ */ diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_dgram.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_dgram.c index 148114ccb8..f0445421e5 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_dgram.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_ep_dgram.c @@ -76,6 +76,15 @@ usdf_ep_dgram_enable(struct fid_ep *fep) ep = ep_ftou(fep); + if (ep->e.dg.ep_wcq == NULL) { + ret = -FI_EOPBADSTATE; + goto fail; + } + if (ep->e.dg.ep_rcq == NULL) { + ret = -FI_EOPBADSTATE; + goto fail; + } + filt.uf_type = USD_FTY_UDP_SOCK; filt.uf_filter.uf_udp_sock.u_sock = ep->e.dg.ep_sock; @@ -90,7 +99,7 @@ usdf_ep_dgram_enable(struct fid_ep *fep) &filt, &ep->e.dg.ep_qp); } else { - ret = -EAGAIN; + ret = -FI_EAGAIN; } if (ret != 0) { @@ -261,6 +270,10 @@ usdf_ep_dgram_close(fid_t fid) usdf_ep_dgram_deref_cq(ep->e.dg.ep_wcq); usdf_ep_dgram_deref_cq(ep->e.dg.ep_rcq); + if (ep->e.dg.ep_sock != -1) { + close(ep->e.dg.ep_sock); + } + free(ep); return 0; } @@ -286,6 +299,8 @@ static struct fi_ops_msg usdf_dgram_ops = { .inject = usdf_dgram_inject, .senddata = usdf_dgram_senddata, .injectdata = fi_no_msg_injectdata, + .rx_size_left = usdf_dgram_rx_size_left, + .tx_size_left = usdf_dgram_tx_size_left, }; static struct fi_ops_msg usdf_dgram_prefix_ops = { @@ -299,6 +314,8 @@ static struct fi_ops_msg usdf_dgram_prefix_ops = { .inject = usdf_dgram_inject, .senddata = usdf_dgram_senddata, .injectdata = fi_no_msg_injectdata, + .rx_size_left = usdf_dgram_prefix_rx_size_left, + .tx_size_left = usdf_dgram_prefix_tx_size_left, }; static struct fi_ops_cm usdf_cm_dgram_ops = { diff --git a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_pep.c b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_pep.c index e58a2d8b9c..0f04d40993 100644 --- a/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_pep.c +++ b/opal/mca/common/libfabric/libfabric/prov/usnic/src/usdf_pep.c @@ -446,6 +446,7 @@ usdf_pep_open(struct fid_fabric *fabric, struct fi_info *info, struct usdf_pep *pep; struct usdf_fabric *fp; int ret; + int optval; if (info->ep_type != FI_EP_MSG) { return -FI_ENODEV; @@ -485,6 +486,16 @@ usdf_pep_open(struct fid_fabric *fabric, struct fi_info *info, goto fail; } + /* set SO_REUSEADDR to prevent annoying "Address already in use" errors + * on successive runs of programs listening on a well known port */ + optval = 1; + ret = setsockopt(pep->pep_sock, SOL_SOCKET, SO_REUSEADDR, &optval, + sizeof(optval)); + if (ret == -1) { + ret = -errno; + goto fail; + } + ret = bind(pep->pep_sock, (struct sockaddr *)info->src_addr, info->src_addrlen); if (ret == -1) { diff --git a/opal/mca/common/libfabric/libfabric/prov/verbs/src/fi_verbs.c b/opal/mca/common/libfabric/libfabric/prov/verbs/src/fi_verbs.c index 7dd35bfbb2..195db04dc9 100644 --- a/opal/mca/common/libfabric/libfabric/prov/verbs/src/fi_verbs.c +++ b/opal/mca/common/libfabric/libfabric/prov/verbs/src/fi_verbs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2014 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -72,7 +72,7 @@ #define VERBS_CAPS (FI_MSG | FI_RMA | FI_ATOMICS | FI_READ | FI_WRITE | \ FI_SEND | FI_RECV | FI_REMOTE_READ | FI_REMOTE_WRITE | \ - FI_REMOTE_CQ_DATA | FI_REMOTE_COMPLETE) + FI_REMOTE_CQ_DATA | FI_REMOTE_SIGNAL) #define VERBS_MODE (FI_LOCAL_MR | FI_PROV_MR_ATTR) #define VERBS_MSG_ORDER (FI_ORDER_RAR | FI_ORDER_RAW | FI_ORDER_RAS | \ FI_ORDER_WAW | FI_ORDER_WAS | FI_ORDER_SAW | FI_ORDER_SAS ) @@ -127,6 +127,7 @@ struct fi_ibv_msg_ep { uint32_t inline_size; }; +static const char *local_node = "localhost"; static char def_send_wr[16] = "384"; static char def_recv_wr[16] = "384"; static char def_send_sge[16] = "4"; @@ -371,7 +372,10 @@ static int fi_ibv_check_info(struct fi_info *info) return -FI_ENODATA; } - if (!(info->caps & VERBS_CAPS) && info->caps) + if (info->caps && (info->caps & ~VERBS_CAPS)) + return -FI_ENODATA; + + if ((info->mode & VERBS_MODE) != VERBS_MODE) return -FI_ENODATA; if (info->fabric_attr) { @@ -433,9 +437,6 @@ static int fi_ibv_check_dev_limits(struct fi_domain_attr *domain_attr, return 0; } -/* - * TODO: this is not the full set of checks which are needed - */ static int fi_ibv_fi_to_rai(struct fi_info *fi, uint64_t flags, struct rdma_addrinfo *rai) { memset(rai, 0, sizeof *rai); @@ -443,30 +444,19 @@ static int fi_ibv_fi_to_rai(struct fi_info *fi, uint64_t flags, struct rdma_addr rai->ai_flags = RAI_PASSIVE; if (flags & FI_NUMERICHOST) rai->ai_flags |= RAI_NUMERICHOST; -// if (fi->flags & FI_FAMILY) -// rai->ai_flags |= RAI_FAMILY; -// rai->ai_family = fi->sa_family; - if (fi->ep_type == FI_EP_MSG || fi->caps & FI_RMA || (fi->ep_attr && - (fi->ep_attr->protocol == FI_PROTO_RDMA_CM_IB_RC || - fi->ep_attr->protocol == FI_PROTO_IWARP))) { - rai->ai_qp_type = IBV_QPT_RC; - rai->ai_port_space = RDMA_PS_TCP; - } else if (fi->ep_type == FI_EP_DGRAM || (fi->ep_attr && - fi->ep_attr->protocol == FI_PROTO_IB_UD)) { - rai->ai_qp_type = IBV_QPT_UD; - rai->ai_port_space = RDMA_PS_UDP; - } + rai->ai_qp_type = IBV_QPT_RC; + rai->ai_port_space = RDMA_PS_TCP; - if (fi->src_addrlen) { + if (fi && fi->src_addrlen) { if (!(rai->ai_src_addr = malloc(fi->src_addrlen))) - return ENOMEM; + return -FI_ENOMEM; memcpy(rai->ai_src_addr, fi->src_addr, fi->src_addrlen); rai->ai_src_len = fi->src_addrlen; } - if (fi->dest_addrlen) { + if (fi && fi->dest_addrlen) { if (!(rai->ai_dst_addr = malloc(fi->dest_addrlen))) - return ENOMEM; + return -FI_ENOMEM; memcpy(rai->ai_dst_addr, fi->dest_addr, fi->dest_addrlen); rai->ai_dst_len = fi->dest_addrlen; } @@ -477,27 +467,20 @@ static int fi_ibv_fi_to_rai(struct fi_info *fi, uint64_t flags, struct rdma_addr static int fi_ibv_rai_to_fi(struct rdma_addrinfo *rai, struct fi_info *hints, struct fi_info *fi) { - // fi->sa_family = rai->ai_family; - if (rai->ai_qp_type == IBV_QPT_RC || rai->ai_port_space == RDMA_PS_TCP) { - fi->caps |= FI_MSG | FI_RMA; - fi->ep_type = FI_EP_MSG; - } else if (rai->ai_qp_type == IBV_QPT_UD || - rai->ai_port_space == RDMA_PS_UDP) { - fi->ep_attr->protocol = FI_PROTO_IB_UD; - fi->caps |= FI_MSG; - fi->ep_type = FI_EP_DGRAM; - } + fi->caps = VERBS_CAPS; + fi->mode = VERBS_MODE; + fi->ep_type = FI_EP_MSG; if (rai->ai_src_len) { if (!(fi->src_addr = malloc(rai->ai_src_len))) - return ENOMEM; + return -FI_ENOMEM; memcpy(fi->src_addr, rai->ai_src_addr, rai->ai_src_len); fi->src_addrlen = rai->ai_src_len; fi->addr_format = FI_SOCKADDR; } if (rai->ai_dst_len) { if (!(fi->dest_addr = malloc(rai->ai_dst_len))) - return ENOMEM; + return -FI_ENOMEM; memcpy(fi->dest_addr, rai->ai_dst_addr, rai->ai_dst_len); fi->dest_addrlen = rai->ai_dst_len; fi->addr_format = FI_SOCKADDR; @@ -603,17 +586,19 @@ fi_ibv_getepinfo(const char *node, const char *service, ret = fi_ibv_check_info(hints); if (ret) return ret; - - ret = fi_ibv_fi_to_rai(hints, flags, &rai_hints); - if (ret) - return ret; - - ret = rdma_getaddrinfo((char *) node, (char *) service, - &rai_hints, &rai); - } else { - ret = rdma_getaddrinfo((char *) node, (char *) service, - NULL, &rai); } + + ret = fi_ibv_fi_to_rai(hints, flags, &rai_hints); + if (ret) + return ret; + + if (!node && !rai_hints.ai_src_addr && !rai_hints.ai_dst_addr) { + node = local_node; + rai_hints.ai_flags |= RAI_PASSIVE; + } + + ret = rdma_getaddrinfo((char *) node, (char *) service, + &rai_hints, &rai); if (ret) return (errno == ENODEV) ? -FI_ENODATA : -errno; @@ -923,6 +908,8 @@ static struct fi_ops_msg fi_ibv_msg_ep_msg_ops = { .inject = fi_no_msg_inject, .senddata = fi_ibv_msg_ep_senddata, .injectdata = fi_no_msg_injectdata, + .rx_size_left = fi_no_msg_rx_size_left, + .tx_size_left = fi_no_msg_tx_size_left, }; static ssize_t @@ -2499,6 +2486,9 @@ static int fi_ibv_open_device_by_name(struct fi_ibv_domain *domain, const char * struct ibv_context **dev_list; int i, ret = -FI_ENODEV; + if (!name) + return -FI_EINVAL; + dev_list = rdma_get_devices(NULL); if (!dev_list) return -errno; diff --git a/opal/mca/common/libfabric/libfabric/src/enosys.c b/opal/mca/common/libfabric/libfabric/src/enosys.c index d293f52908..177dfcb67c 100644 --- a/opal/mca/common/libfabric/libfabric/src/enosys.c +++ b/opal/mca/common/libfabric/libfabric/src/enosys.c @@ -349,6 +349,14 @@ ssize_t fi_no_msg_injectdata(struct fid_ep *ep, const void *buf, size_t len, { return -FI_ENOSYS; } +ssize_t fi_no_msg_rx_size_left(struct fid_ep *ep) +{ + return -FI_ENOSYS; +} +ssize_t fi_no_msg_tx_size_left(struct fid_ep *ep) +{ + return -FI_ENOSYS; +} /* * struct fi_ops_wait diff --git a/opal/mca/common/libfabric/libfabric/src/fabric.c b/opal/mca/common/libfabric/libfabric/src/fabric.c index 42edf2df78..f6ebe13497 100644 --- a/opal/mca/common/libfabric/libfabric/src/fabric.c +++ b/opal/mca/common/libfabric/libfabric/src/fabric.c @@ -136,10 +136,6 @@ static void fi_ini(void) if (init) goto unlock; - fi_register_provider(VERBS_INIT); - fi_register_provider(PSM_INIT); - fi_register_provider(USNIC_INIT); - #ifdef HAVE_LIBDL struct dirent **liblist; int n; @@ -182,8 +178,14 @@ static void fi_ini(void) free(liblist); done: #endif + + fi_register_provider(PSM_INIT); + fi_register_provider(USNIC_INIT); + + fi_register_provider(VERBS_INIT); fi_register_provider(SOCKETS_INIT); init = 1; + unlock: pthread_mutex_unlock(&ini_lock); } @@ -206,50 +208,6 @@ static struct fi_prov *fi_getprov(const char *prov_name) return NULL; } -__attribute__((visibility ("default"))) -int fi_getinfo_(uint32_t version, const char *node, const char *service, - uint64_t flags, struct fi_info *hints, struct fi_info **info) -{ - struct fi_prov *prov; - struct fi_info *tail, *cur; - int ret = -FI_ENOSYS; - - if (!init) - fi_ini(); - - *info = tail = NULL; - for (prov = prov_head; prov; prov = prov->next) { - if (!prov->provider->getinfo) - continue; - - if (hints->fabric_attr && hints->fabric_attr->prov_name && - strcmp(prov->provider->name, hints->fabric_attr->prov_name)) - continue; - - ret = prov->provider->getinfo(version, node, service, flags, - hints, &cur); - if (ret) { - if (ret == -FI_ENODATA) - continue; - break; - } - - if (!*info) - *info = cur; - else - tail->next = cur; - for (tail = cur; tail->next; tail = tail->next) { - tail->fabric_attr->prov_name = strdup(prov->provider->name); - tail->fabric_attr->prov_version = prov->provider->version; - } - tail->fabric_attr->prov_name = strdup(prov->provider->name); - tail->fabric_attr->prov_version = prov->provider->version; - } - - return *info ? 0 : ret; -} -default_symver(fi_getinfo_, fi_getinfo); - __attribute__((visibility ("default"))) void fi_freeinfo_(struct fi_info *info) { @@ -277,6 +235,55 @@ void fi_freeinfo_(struct fi_info *info) } default_symver(fi_freeinfo_, fi_freeinfo); +__attribute__((visibility ("default"))) +int fi_getinfo_(uint32_t version, const char *node, const char *service, + uint64_t flags, struct fi_info *hints, struct fi_info **info) +{ + struct fi_prov *prov; + struct fi_info *tail, *cur; + int ret = -FI_ENOSYS; + + if (!init) + fi_ini(); + + *info = tail = NULL; + for (prov = prov_head; prov; prov = prov->next) { + if (!prov->provider->getinfo) + continue; + + if (hints->fabric_attr && hints->fabric_attr->prov_name && + strcmp(prov->provider->name, hints->fabric_attr->prov_name)) + continue; + + ret = prov->provider->getinfo(version, node, service, flags, + hints, &cur); + if (ret) { + if (ret == -FI_ENODATA) { + continue; + } else { + /* a provider has an error, clean up and bail */ + fi_freeinfo_(*info); + *info = NULL; + return ret; + } + } + + if (!*info) + *info = cur; + else + tail->next = cur; + for (tail = cur; tail->next; tail = tail->next) { + tail->fabric_attr->prov_name = strdup(prov->provider->name); + tail->fabric_attr->prov_version = prov->provider->version; + } + tail->fabric_attr->prov_name = strdup(prov->provider->name); + tail->fabric_attr->prov_version = prov->provider->version; + } + + return *info ? 0 : ret; +} +default_symver(fi_getinfo_, fi_getinfo); + __attribute__((visibility ("default"))) struct fi_info *fi_dupinfo_(const struct fi_info *info) {