Merge pull request #5933 from hppritcha/topic/remove_bfo_pml
remove the bfo pml
Этот коммит содержится в:
Коммит
a435bfe1cf
@ -1,78 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
|
||||
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
AM_CPPFLAGS = -DPML_BFO=1
|
||||
|
||||
dist_ompidata_DATA = \
|
||||
help-mpi-pml-bfo.txt
|
||||
|
||||
EXTRA_DIST = post_configure.sh
|
||||
|
||||
bfo_sources = \
|
||||
pml_bfo.c \
|
||||
pml_bfo.h \
|
||||
pml_bfo_comm.c \
|
||||
pml_bfo_comm.h \
|
||||
pml_bfo_component.c \
|
||||
pml_bfo_component.h \
|
||||
pml_bfo_failover.c \
|
||||
pml_bfo_failover.h \
|
||||
pml_bfo_hdr.h \
|
||||
pml_bfo_iprobe.c \
|
||||
pml_bfo_irecv.c \
|
||||
pml_bfo_isend.c \
|
||||
pml_bfo_progress.c \
|
||||
pml_bfo_rdma.c \
|
||||
pml_bfo_rdma.h \
|
||||
pml_bfo_rdmafrag.c \
|
||||
pml_bfo_rdmafrag.h \
|
||||
pml_bfo_recvfrag.c \
|
||||
pml_bfo_recvfrag.h \
|
||||
pml_bfo_recvreq.c \
|
||||
pml_bfo_recvreq.h \
|
||||
pml_bfo_sendreq.c \
|
||||
pml_bfo_sendreq.h \
|
||||
pml_bfo_start.c
|
||||
|
||||
# If we have CUDA support requested, build the CUDA file also
|
||||
if OPAL_cuda_support
|
||||
bfo_sources += \
|
||||
pml_bfo_cuda.c
|
||||
endif
|
||||
|
||||
if MCA_BUILD_ompi_pml_bfo_DSO
|
||||
component_noinst =
|
||||
component_install = mca_pml_bfo.la
|
||||
else
|
||||
component_noinst = libmca_pml_bfo.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(ompilibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_pml_bfo_la_SOURCES = $(bfo_sources)
|
||||
mca_pml_bfo_la_LDFLAGS = -module -avoid-version
|
||||
mca_pml_bfo_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_pml_bfo_la_SOURCES = $(bfo_sources)
|
||||
libmca_pml_bfo_la_LDFLAGS = -module -avoid-version
|
@ -1,340 +0,0 @@
|
||||
Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
|
||||
BFO DESIGN DOCUMENT
|
||||
This document describes the use and design of the bfo. In addition,
|
||||
there is a section at the end explaining why this functionality was
|
||||
not merged into the ob1 PML.
|
||||
|
||||
1. GENERAL USAGE
|
||||
First, one has to configure the failover code into the openib BTL so
|
||||
that bfo will work correctly. To do this:
|
||||
configure --enable-btl-openib-failover.
|
||||
|
||||
Then, when running one needs to select the bfo PML explicitly.
|
||||
mpirun --mca pml bfo
|
||||
|
||||
Note that one needs to both configure with --enable-btl-openib-failover
|
||||
and run with --mca pml bfo to get the failover support. If one of
|
||||
these two steps is skipped, then the MPI job will just abort in the
|
||||
case of an error like it normally does with the ob1 PML.
|
||||
|
||||
2. GENERAL FUNCTION
|
||||
The bfo failover feature requires two or more openib BTLs in use. In
|
||||
normal operation, it will stripe the communication over the multiple
|
||||
BTLs. When an error is detected, it will stop using the BTL that
|
||||
incurred the error and continue the communication over the remaining
|
||||
BTL. Once a BTL has been mapped out, it cannot be used by the job
|
||||
again, even if the underlying fabric becomes functional again. Only
|
||||
new jobs started after the fabric comes back up will use both BTLs.
|
||||
|
||||
The bfo works in conjunction with changes that were made in the openib
|
||||
BTL. As noted above, those changes need to be configured into the
|
||||
BTL for everything to work properly.
|
||||
|
||||
The bfo only fails over between openib BTLs. It cannot failover from
|
||||
an openib BTL to TCP, for example.
|
||||
|
||||
3. GENERAL DESIGN
|
||||
The bfo (Btl FailOver) PML was designed to work in clusters that have
|
||||
multiple openib BTLs. It was designed to be lightweight so as to
|
||||
avoid any adverse effects on latency. To that end, there is no
|
||||
tracking of fragments or messages in the bfo PML. Rather, it depends
|
||||
on the underlying BTL to notify it of each fragment that has an error.
|
||||
The bfo then decides what needs to be done based on the type of
|
||||
fragment that gets an error.
|
||||
|
||||
No additional sequence numbers were introduced in the bfo. Instead,
|
||||
it makes use of the sequence numbers that exist in the MATCH, RNDV and
|
||||
RGET fragment header. In that way, duplicate fragments that have
|
||||
MATCH information in them can be detected. Other fragments, like PUT
|
||||
and ACK, are never retransmitted so it does not matter that they do
|
||||
not have sequence numbers. The FIN header was a special case in that
|
||||
it was changed to include the MATCH header so that the tag, source,
|
||||
and context fields could be used to check for duplicate FINs.
|
||||
|
||||
Note that the assumption is that the underlying BTL will always issue
|
||||
a callback with an error flag when it thinks a fragment has an error.
|
||||
This means that even after an error is detected on a BTL, the BTL
|
||||
continues to be checked for any other messages that may also complete
|
||||
with an error. This is potentially a unique characteristic of the
|
||||
openib BTL when running over RC connections that allows the BFO to
|
||||
work properly.
|
||||
|
||||
One scenario that is particularly difficult to handle is the case
|
||||
where a fragment has an error but the message actually makes it to the
|
||||
other side. It is because of this that all fragments need to be
|
||||
checked to make sure they are not a duplicate. This scenario also
|
||||
complicates some of the rendezvous protocols as the two sides may not
|
||||
agree where the problem occurred. For example, one can imagine a
|
||||
sender getting an error on a final FIN message, but the FIN message
|
||||
actually arrives at the other side. The receiver thinks the
|
||||
communication is done and moves on. The sender thinks there was a
|
||||
problem, and that the communication needs to restart.
|
||||
|
||||
It is also important to note that a message cannot signal a successful
|
||||
completion and *not* make it to the receiver. This would probably cause
|
||||
the bfo to hang.
|
||||
|
||||
4. ERRORS
|
||||
Errors are detected in the openib BTL layer and propagated to the PML
|
||||
layer. Typically, the errors occur while polling the completion
|
||||
queue, but can happen in other areas as well. When an error occurs,
|
||||
an additional callback is called so the PML can map out the connection
|
||||
for future sending. Then the callback associated with the fragment is
|
||||
called, but with the error field set to OMPI_ERROR. This way, the PML
|
||||
knows that this fragment may not have made it to the remote side.
|
||||
|
||||
The first callback into the PML is via the mca_pml_bfo_error_handler()
|
||||
callback and the PML uses this to remove a connection for future
|
||||
sending. If the error_proc_t field is NULL, then the entire BTL is
|
||||
removed for any future communication. If the error_proc_t is not
|
||||
NULL, then the BTL is only removed for the connection associated with
|
||||
the error_proc_t.
|
||||
|
||||
The second callback is the standard one for a completion event, and
|
||||
this can trigger various activities in the PML. The regular callback
|
||||
function is called but the status is set to OMPI_ERROR. The PML layer
|
||||
detects this and calls some failover specific routines depending on
|
||||
the type of fragment that got the error.
|
||||
|
||||
|
||||
5. RECOVERY OF MATCH FRAGMENTS
|
||||
Note: For a general description of how the various fragments interact,
|
||||
see Appendix 1 at the end of this document.
|
||||
|
||||
In the case of a MATCH fragment, the fragment is simply resent. Care
|
||||
has to be taken with a MATCH fragment that is sent via the standard
|
||||
interface and one that is sent via the sendi interface. In the
|
||||
standard send, the send request is still available and is therefore
|
||||
reset reused to send the MATCH fragment. In the case of the sendi
|
||||
fragment, the send request is gone, so the fragment is regenerated
|
||||
from the information contained within the fragment.
|
||||
|
||||
6. RECOVERY OF RNDV or LARGE MESSAGE RDMA
|
||||
In the case of a large message RDMA transfer or a RNDV transfer where
|
||||
the message consists of several fragments, the restart is a little
|
||||
more complicated. This includes fragments like RNDV, PUT, RGET, FRAG,
|
||||
FIN, and RDMA write and RDMA read completions. In most cases, the
|
||||
requests associated with these fragments are reset and restarted.
|
||||
|
||||
First, it should be pointed out that a new variable was added to the
|
||||
send and receive requests. This variable tracks outstanding send
|
||||
events that have not yet received their completion events. This new
|
||||
variable is used so that a request is not restarted until all the
|
||||
outstanding events have completed. If one does not wait for the
|
||||
outstanding events to complete, then one may restart a request and
|
||||
then a completion event will happen on the wrong request.
|
||||
|
||||
There is a second variable added to each request and that is one that
|
||||
shows whether the request is already in an error state. When a request
|
||||
reaches the state that it has an error flagged on it and the outstanding
|
||||
completion events are down to zero, it can start the restart dance
|
||||
as described below.
|
||||
|
||||
7. SPECIAL CASE FOR FIN FRAGMENT
|
||||
Like the MATCH fragment, the FIN message is also simply resent. Like
|
||||
the sendi MATCH fragment, there may be no request associated with the
|
||||
FIN message when it gets an error, so the fragment is recreated from
|
||||
the information in the fragment. The FIN fragment was modified to
|
||||
have additional information like what is in a MATCH fragment including
|
||||
the context, source, and tag. In this way, we can figure out if the
|
||||
FIN message is a duplicate on the receiving side.
|
||||
|
||||
8. RESTART DANCE
|
||||
When the bfo determines that there are no outstanding completion events,
|
||||
a restart dance is initiated. There are four new PML message types that
|
||||
have been created to participate in the dance.
|
||||
1. RNDVRESTARTNOTIFY
|
||||
2. RECVERRNOTIFY
|
||||
3. RNDVRESTARTACK
|
||||
4. RNDVRESTARTNACK
|
||||
|
||||
When the send request is in an error state and the outstanding
|
||||
completion events is zero, RNDVRESTARTNOTIFY is sent from the sender
|
||||
to the receiver to let it know that the communication needs to be
|
||||
restarted. Upon receipt of the RNDVRESTARTNOTIFY, the receiver first
|
||||
checks to make sure that it is still pointing to a valid receiver
|
||||
request. If so, it marks the receive request in error. It then
|
||||
checks to see if there are any outstanding completion events on the
|
||||
receiver. If there are no outstanding completion events, the receiver
|
||||
sends the RNDVRESTARTACK. If there are outstanding completion events,
|
||||
then the RNDVRESTARTACK gets sent later when a completion event occurs
|
||||
that brings the outstanding event count to zero.
|
||||
|
||||
In the case that the receiver determines that it is no longer looking
|
||||
at a valid receive request, which means the request is complete, the
|
||||
receiver responds with a RNDVRESTARTNACK. While rare, this case can
|
||||
happen for example, when a final FRAG message triggers an error on the
|
||||
sender, but actually makes it to the receiver.
|
||||
|
||||
The RECVERRNOTIFY fragment is used so the receiver can let the sender
|
||||
sender know that it had an error. The sender then waits for all of
|
||||
its completion events, and then sends a RNDVRESTARTNOTIFY.
|
||||
|
||||
All the handling of these new messages is contained in the
|
||||
pml_bfo_failover files.
|
||||
|
||||
9. BTL SUPPORT
|
||||
The openib BTL also supplies a lot of support for the bfo PML. First,
|
||||
fragments can be stored in the BTL during normal operation if
|
||||
resources become scarce. This means that when an error is detected in
|
||||
the BTL, it needs to scour its internal queues for fragments that are
|
||||
destined for the BTL and error them out. The function
|
||||
error_out_all_pending_frags() takes care of this functionality. And
|
||||
some of the fragments stored can be coalesced, so care has to be taken
|
||||
to tease out each message from a coalesced fragment.
|
||||
|
||||
There is also some special code in the BTL to handle some strange
|
||||
occurrences that were observed in the BTL. First, there are times
|
||||
where only one half of the connection gets an error. This can result
|
||||
in a mismatch between what the PML thinks is available to it and can
|
||||
cause hangs. Therefore, when a BTL detects an error, it sends a
|
||||
special message down the working BTL connection to tell the remote
|
||||
side that it needs to be brought down as well.
|
||||
|
||||
Secondly, it has been observed that a message can get stuck in the
|
||||
eager RDMA connection between two BTLs. In this case, an error is
|
||||
detected on one side, but the other side never sees the message.
|
||||
Therefore, a special message is sent to the other side telling it to
|
||||
move along in the eager RDMA connection. This is all somewhat
|
||||
confusing. See the code in the btl_openib_failover.c file for the
|
||||
details.
|
||||
|
||||
10. MERGING
|
||||
Every effort was made to try and merge the bfo PML into the ob1 PML.
|
||||
The idea was that any upgrades to the ob1 PML would automatically make
|
||||
it into the bfo PML and this would enhance maintainability of all the
|
||||
code. However, it was deemed that this merging would cause more
|
||||
problems than it would solve. What was attempted and why the
|
||||
conclusion was made are documented here.
|
||||
|
||||
One can look at the bfo and easily see the differences between it and
|
||||
ob1. All the bfo specific code is surrounded by #if PML_BFO. In
|
||||
addition, there are two additional files in the bfo,
|
||||
pml_bfo_failover.c and pml_bfo_failover.h.
|
||||
|
||||
To merge them, the following was attempted. First, add all the code
|
||||
in #if regions into the ob1 PML. As of this writing, there are 73
|
||||
#ifs that would have to be added into ob1.
|
||||
|
||||
Secondly, remove almost all the pml_bfo files and replace them with
|
||||
links to the ob1 files.
|
||||
|
||||
Third, create a new header file that did name shifting of all the
|
||||
functions so that ob1 and bfo could live together. This also included
|
||||
having to create macros for the names of header files as well. To
|
||||
help illustrate the name shifting issue, here is what the file might
|
||||
look like in the bfo directory.
|
||||
|
||||
/* Need macros for the header files as they are different in the
|
||||
* different PMLs */
|
||||
#define PML "bfo"
|
||||
#define PML_OB1_H "pml_bfo.h"
|
||||
#define PML_OB1_COMM_H "pml_bfo_comm.h"
|
||||
#define PML_OB1_COMPONENT_H "pml_bfo_component.h"
|
||||
#define PML_OB1_HDR_H "pml_bfo_hdr.h"
|
||||
#define PML_OB1_RDMA_H "pml_bfo_rdma.h"
|
||||
#define PML_OB1_RDMAFRAG_H "pml_bfo_rdmafrag.h"
|
||||
#define PML_OB1_RECVFRAG_H "pml_bfo_recvfrag.h"
|
||||
#define PML_OB1_RECVREQ_H "pml_bfo_recvreq.h"
|
||||
#define PML_OB1_SENDREQ_H "pml_bfo_sendreq.h"
|
||||
|
||||
/* Name shifting of functions from ob1 to bfo (incomplete list) */
|
||||
#define mca_pml_ob1 mca_pml_bfo
|
||||
#define mca_pml_ob1_t mca_pml_bfo_t
|
||||
#define mca_pml_ob1_component mca_pml_bfo_component
|
||||
#define mca_pml_ob1_add_procs mca_pml_bfo_add_procs
|
||||
#define mca_pml_ob1_del_procs mca_pml_bfo_del_procs
|
||||
#define mca_pml_ob1_enable mca_pml_bfo_enable
|
||||
#define mca_pml_ob1_progress mca_pml_bfo_progress
|
||||
#define mca_pml_ob1_add_comm mca_pml_bfo_add_comm
|
||||
#define mca_pml_ob1_del_comm mca_pml_bfo_del_comm
|
||||
#define mca_pml_ob1_irecv_init mca_pml_bfo_irecv_init
|
||||
#define mca_pml_ob1_irecv mca_pml_bfo_irecv
|
||||
#define mca_pml_ob1_recv mca_pml_bfo_recv
|
||||
#define mca_pml_ob1_isend_init mca_pml_bfo_isend_init
|
||||
#define mca_pml_ob1_isend mca_pml_bfo_isend
|
||||
#define mca_pml_ob1_send mca_pml_bfo_send
|
||||
#define mca_pml_ob1_iprobe mca_pml_bfo_iprobe
|
||||
[...and much more ...]
|
||||
|
||||
The pml_bfo_hdr.h file was not a link because the changes in it were
|
||||
so extensive. Also the Makefile was kept separate so it could include
|
||||
the additional failover files as well as add a compile directive that
|
||||
would force the files to be compiled as bfo instead of ob1.
|
||||
|
||||
After these changes were made, several independent developers reviewed
|
||||
the results and concluded that making these changes would have too
|
||||
much of a negative impact on ob1 maintenance. First, the code became
|
||||
much harder to read with all the additional #ifdefs. Secondly, the
|
||||
possibility of adding other features, like csum, to ob1 would only
|
||||
make this issue even worse. Therefore, it was decided to keep the bfo
|
||||
PML separate from ob1.
|
||||
|
||||
11. UTILITIES
|
||||
In an ideal world, any bug fixes that are made in the ob1 PML would
|
||||
also be made in the csum and the bfo PMLs. However, that does not
|
||||
always happen. Therefore, there are two new utilities added to the
|
||||
contrib directory.
|
||||
|
||||
check-ob1-revision.pl
|
||||
check-ob1-pml-diffs.pl
|
||||
|
||||
The first one can be run to see if ob1 has changed from its last known
|
||||
state. Here is an example.
|
||||
|
||||
machine =>check-ob1-revision.pl
|
||||
Running svn diff -r24138 ../ompi/mca/pml/ob1
|
||||
No new changes detected in ob1. Everything is fine.
|
||||
|
||||
If there are differences, then one needs to review them and potentially
|
||||
add them to the bfo (and csum also if one feels like it).
|
||||
After that, bump up the value in the script to the latest value.
|
||||
|
||||
The second script allows one to see the differences between the ob1
|
||||
and bfo PML. Here is an example.
|
||||
|
||||
machine =>check-ob1-pml-diffs.pl
|
||||
|
||||
Starting script to check differences between bfo and ob1...
|
||||
Files Compared: pml_ob1.c and pml_bfo.c
|
||||
No differences encountered
|
||||
Files Compared: pml_ob1.h and pml_bfo.h
|
||||
[...snip...]
|
||||
Files Compared: pml_ob1_start.c and pml_bfo_start.c
|
||||
No differences encountered
|
||||
|
||||
There is a lot more in the script that tells how it is used.
|
||||
|
||||
|
||||
Appendix 1: SIMPLE OVERVIEW OF COMMUNICATION PROTOCOLS
|
||||
The drawings below attempt to describe some of the general flow of
|
||||
fragments in the various protocols that are supported in the PMLs.
|
||||
The "read" and "write" are actual RDMA actions and do not pertain to
|
||||
fragments that are sent. As can be inferred, they use FIN messages to
|
||||
indicate their completion.
|
||||
|
||||
|
||||
MATCH PROTOCOL
|
||||
sender >->->-> MATCH >->->-> receiver
|
||||
|
||||
SEND WITH MULTIPLE FRAGMENTS
|
||||
sender >->->-> RNDV >->->-> receiver
|
||||
<-<-<-< ACK <-<-<-<
|
||||
>->->-> FRAG >->->->
|
||||
>->->-> FRAG >->->->
|
||||
>->->-> FRAG >->->->
|
||||
|
||||
RDMA PUT
|
||||
sender >->->-> RNDV >->->-> receiver
|
||||
<-<-<-< PUT <-<-<-<
|
||||
<-<-<-< PUT <-<-<-<
|
||||
>->->-> write >->->->
|
||||
>->->-> FIN >->->->
|
||||
>->->-> write >->->->
|
||||
>->->-> FIN >->->->
|
||||
|
||||
RMA GET
|
||||
sender >->->-> RGET >->->-> receiver
|
||||
<-<-<-< read <-<-<-<
|
||||
<-<-<-< FIN <-<-<-<
|
@ -1,27 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2013 Sandia National Laboratories. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_ompi_pml_bfo_POST_CONFIG(will_build)
|
||||
# ----------------------------------------
|
||||
# The BFO PML requires a BML endpoint tag to compile, so require it.
|
||||
# Require in POST_CONFIG instead of CONFIG so that we only require it
|
||||
# if we're not disabled.
|
||||
AC_DEFUN([MCA_ompi_pml_bfo_POST_CONFIG], [
|
||||
AS_IF([test "$1" = "1"], [OMPI_REQUIRE_ENDPOINT_TAG([BML])])
|
||||
])dnl
|
||||
|
||||
# MCA_ompi_pml_bfo_CONFIG(action-if-can-compile,
|
||||
# [action-if-cant-compile])
|
||||
# ------------------------------------------------
|
||||
# We can always build, unless we were explicitly disabled.
|
||||
AC_DEFUN([MCA_ompi_pml_bfo_CONFIG],[
|
||||
AC_CONFIG_FILES([ompi/mca/pml/bfo/Makefile])
|
||||
[$1]
|
||||
])dnl
|
@ -1,20 +0,0 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
[eager_limit_too_small]
|
||||
The "eager limit" MCA parameter in the %s BTL was set to a value which
|
||||
is too low for Open MPI to function properly. Please re-run your job
|
||||
with a higher eager limit value for this BTL; the exact MCA parameter
|
||||
name and its corresponding minimum value is shown below.
|
||||
|
||||
Local host: %s
|
||||
BTL name: %s
|
||||
BTL eager limit value: %d (set via btl_%s_eager_limit)
|
||||
BTL eager limit minimum: %d
|
||||
MCA parameter name: btl_%s_eager_limit
|
@ -1,7 +0,0 @@
|
||||
#
|
||||
# owner/status file
|
||||
# owner: institution that is responsible for this package
|
||||
# status: e.g. active, maintenance, unmaintained
|
||||
#
|
||||
owner: NVIDIA
|
||||
status: unmaintained
|
@ -1,897 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2006-2008 University of Houston. All rights reserved.
|
||||
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "opal/class/opal_bitmap.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/mca/btl/btl.h"
|
||||
#include "opal/mca/btl/base/base.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/mca/pml/base/base.h"
|
||||
#include "ompi/mca/pml/base/base.h"
|
||||
#include "ompi/mca/bml/base/base.h"
|
||||
#include "ompi/runtime/ompi_cr.h"
|
||||
|
||||
#include "pml_bfo.h"
|
||||
#include "pml_bfo_component.h"
|
||||
#include "pml_bfo_comm.h"
|
||||
#include "pml_bfo_hdr.h"
|
||||
#include "pml_bfo_recvfrag.h"
|
||||
#include "pml_bfo_sendreq.h"
|
||||
#include "pml_bfo_recvreq.h"
|
||||
#include "pml_bfo_rdmafrag.h"
|
||||
#if PML_BFO
|
||||
#include "pml_bfo_failover.h"
|
||||
#endif /* PML_BFO */
|
||||
|
||||
mca_pml_bfo_t mca_pml_bfo = {
|
||||
{
|
||||
mca_pml_bfo_add_procs,
|
||||
mca_pml_bfo_del_procs,
|
||||
mca_pml_bfo_enable,
|
||||
mca_pml_bfo_progress,
|
||||
mca_pml_bfo_add_comm,
|
||||
mca_pml_bfo_del_comm,
|
||||
mca_pml_bfo_irecv_init,
|
||||
mca_pml_bfo_irecv,
|
||||
mca_pml_bfo_recv,
|
||||
mca_pml_bfo_isend_init,
|
||||
mca_pml_bfo_isend,
|
||||
mca_pml_bfo_send,
|
||||
mca_pml_bfo_iprobe,
|
||||
mca_pml_bfo_probe,
|
||||
mca_pml_bfo_start,
|
||||
mca_pml_bfo_improbe,
|
||||
mca_pml_bfo_mprobe,
|
||||
mca_pml_bfo_imrecv,
|
||||
mca_pml_bfo_mrecv,
|
||||
mca_pml_bfo_dump,
|
||||
mca_pml_bfo_ft_event,
|
||||
65535,
|
||||
INT_MAX
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
void mca_pml_bfo_error_handler( struct mca_btl_base_module_t* btl,
|
||||
int32_t flags, ompi_proc_t* errproc,
|
||||
char* btlinfo );
|
||||
|
||||
int mca_pml_bfo_enable(bool enable)
|
||||
{
|
||||
if( false == enable ) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&mca_pml_bfo.lock, opal_mutex_t);
|
||||
|
||||
/* fragments */
|
||||
OBJ_CONSTRUCT(&mca_pml_bfo.rdma_frags, opal_free_list_t);
|
||||
opal_free_list_init( &mca_pml_bfo.rdma_frags,
|
||||
sizeof(mca_pml_bfo_rdma_frag_t),
|
||||
opal_cache_line_size,
|
||||
OBJ_CLASS(mca_pml_bfo_rdma_frag_t),
|
||||
0,opal_cache_line_size,
|
||||
mca_pml_bfo.free_list_num,
|
||||
mca_pml_bfo.free_list_max,
|
||||
mca_pml_bfo.free_list_inc,
|
||||
NULL, 0, NULL, NULL, NULL );
|
||||
|
||||
OBJ_CONSTRUCT(&mca_pml_bfo.recv_frags, opal_free_list_t);
|
||||
opal_free_list_init( &mca_pml_bfo.recv_frags,
|
||||
sizeof(mca_pml_bfo_recv_frag_t) + mca_pml_bfo.unexpected_limit,
|
||||
opal_cache_line_size,
|
||||
OBJ_CLASS(mca_pml_bfo_recv_frag_t),
|
||||
0,opal_cache_line_size,
|
||||
mca_pml_bfo.free_list_num,
|
||||
mca_pml_bfo.free_list_max,
|
||||
mca_pml_bfo.free_list_inc,
|
||||
NULL, 0, NULL, NULL, NULL );
|
||||
|
||||
OBJ_CONSTRUCT(&mca_pml_bfo.pending_pckts, opal_free_list_t);
|
||||
opal_free_list_init( &mca_pml_bfo.pending_pckts,
|
||||
sizeof(mca_pml_bfo_pckt_pending_t),
|
||||
opal_cache_line_size,
|
||||
OBJ_CLASS(mca_pml_bfo_pckt_pending_t),
|
||||
0,opal_cache_line_size,
|
||||
mca_pml_bfo.free_list_num,
|
||||
mca_pml_bfo.free_list_max,
|
||||
mca_pml_bfo.free_list_inc,
|
||||
NULL, 0, NULL, NULL, NULL );
|
||||
|
||||
OBJ_CONSTRUCT(&mca_pml_bfo.buffers, opal_free_list_t);
|
||||
OBJ_CONSTRUCT(&mca_pml_bfo.send_ranges, opal_free_list_t);
|
||||
opal_free_list_init( &mca_pml_bfo.send_ranges,
|
||||
sizeof(mca_pml_bfo_send_range_t) +
|
||||
(mca_pml_bfo.max_send_per_range - 1) * sizeof(mca_pml_bfo_com_btl_t),
|
||||
opal_cache_line_size,
|
||||
OBJ_CLASS(mca_pml_bfo_send_range_t),
|
||||
0,opal_cache_line_size,
|
||||
mca_pml_bfo.free_list_num,
|
||||
mca_pml_bfo.free_list_max,
|
||||
mca_pml_bfo.free_list_inc,
|
||||
NULL, 0, NULL, NULL, NULL );
|
||||
|
||||
/* pending operations */
|
||||
OBJ_CONSTRUCT(&mca_pml_bfo.send_pending, opal_list_t);
|
||||
OBJ_CONSTRUCT(&mca_pml_bfo.recv_pending, opal_list_t);
|
||||
OBJ_CONSTRUCT(&mca_pml_bfo.pckt_pending, opal_list_t);
|
||||
OBJ_CONSTRUCT(&mca_pml_bfo.rdma_pending, opal_list_t);
|
||||
/* missing communicator pending list */
|
||||
OBJ_CONSTRUCT(&mca_pml_bfo.non_existing_communicator_pending, opal_list_t);
|
||||
|
||||
/**
|
||||
* If we get here this is the PML who get selected for the run. We
|
||||
* should get ownership for the send and receive requests list, and
|
||||
* initialize them with the size of our own requests.
|
||||
*/
|
||||
opal_free_list_init( &mca_pml_base_send_requests,
|
||||
sizeof(mca_pml_bfo_send_request_t) +
|
||||
(mca_pml_bfo.max_rdma_per_request - 1) *
|
||||
sizeof(mca_pml_bfo_com_btl_t),
|
||||
opal_cache_line_size,
|
||||
OBJ_CLASS(mca_pml_bfo_send_request_t),
|
||||
0,opal_cache_line_size,
|
||||
mca_pml_bfo.free_list_num,
|
||||
mca_pml_bfo.free_list_max,
|
||||
mca_pml_bfo.free_list_inc,
|
||||
NULL, 0, NULL, NULL, NULL );
|
||||
|
||||
opal_free_list_init( &mca_pml_base_recv_requests,
|
||||
sizeof(mca_pml_bfo_recv_request_t) +
|
||||
(mca_pml_bfo.max_rdma_per_request - 1) *
|
||||
sizeof(mca_pml_bfo_com_btl_t),
|
||||
opal_cache_line_size,
|
||||
OBJ_CLASS(mca_pml_bfo_recv_request_t),
|
||||
0,opal_cache_line_size,
|
||||
mca_pml_bfo.free_list_num,
|
||||
mca_pml_bfo.free_list_max,
|
||||
mca_pml_bfo.free_list_inc,
|
||||
NULL, 0, NULL, NULL, NULL );
|
||||
|
||||
mca_pml_bfo.enabled = true;
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
int mca_pml_bfo_add_comm(ompi_communicator_t* comm)
|
||||
{
|
||||
/* allocate pml specific comm data */
|
||||
mca_pml_bfo_comm_t* pml_comm = OBJ_NEW(mca_pml_bfo_comm_t);
|
||||
opal_list_item_t *item, *next_item;
|
||||
mca_pml_bfo_recv_frag_t* frag;
|
||||
mca_pml_bfo_comm_proc_t* pml_proc;
|
||||
mca_pml_bfo_match_hdr_t* hdr;
|
||||
int i;
|
||||
|
||||
if (NULL == pml_comm) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* should never happen, but it was, so check */
|
||||
if (comm->c_contextid > mca_pml_bfo.super.pml_max_contextid) {
|
||||
OBJ_RELEASE(pml_comm);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
mca_pml_bfo_comm_init_size(pml_comm, comm->c_remote_group->grp_proc_count);
|
||||
comm->c_pml_comm = pml_comm;
|
||||
|
||||
for( i = 0; i < comm->c_remote_group->grp_proc_count; i++ ) {
|
||||
pml_comm->procs[i].ompi_proc = ompi_group_peer_lookup(comm->c_remote_group,i);
|
||||
OBJ_RETAIN(pml_comm->procs[i].ompi_proc);
|
||||
}
|
||||
/* Grab all related messages from the non_existing_communicator pending queue */
|
||||
for( item = opal_list_get_first(&mca_pml_bfo.non_existing_communicator_pending);
|
||||
item != opal_list_get_end(&mca_pml_bfo.non_existing_communicator_pending);
|
||||
item = next_item ) {
|
||||
frag = (mca_pml_bfo_recv_frag_t*)item;
|
||||
next_item = opal_list_get_next(item);
|
||||
hdr = &frag->hdr.hdr_match;
|
||||
|
||||
/* Is this fragment for the current communicator ? */
|
||||
if( frag->hdr.hdr_match.hdr_ctx != comm->c_contextid )
|
||||
continue;
|
||||
|
||||
/* As we now know we work on a fragment for this communicator
|
||||
* we should remove it from the
|
||||
* non_existing_communicator_pending list. */
|
||||
opal_list_remove_item( &mca_pml_bfo.non_existing_communicator_pending,
|
||||
item );
|
||||
|
||||
add_fragment_to_unexpected:
|
||||
|
||||
/* We generate the MSG_ARRIVED event as soon as the PML is aware
|
||||
* of a matching fragment arrival. Independing if it is received
|
||||
* on the correct order or not. This will allow the tools to
|
||||
* figure out if the messages are not received in the correct
|
||||
* order (if multiple network interfaces).
|
||||
*/
|
||||
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm,
|
||||
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
|
||||
|
||||
/* There is no matching to be done, and no lock to be held on the communicator as
|
||||
* we know at this point that the communicator has not yet been returned to the user.
|
||||
* The only required protection is around the non_existing_communicator_pending queue.
|
||||
* We just have to push the fragment into the unexpected list of the corresponding
|
||||
* proc, or into the out-of-order (cant_match) list.
|
||||
*/
|
||||
pml_proc = &(pml_comm->procs[hdr->hdr_src]);
|
||||
|
||||
if( ((uint16_t)hdr->hdr_seq) == ((uint16_t)pml_proc->expected_sequence) ) {
|
||||
/* We're now expecting the next sequence number. */
|
||||
pml_proc->expected_sequence++;
|
||||
opal_list_append( &pml_proc->unexpected_frags, (opal_list_item_t*)frag );
|
||||
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_INSERT_IN_UNEX_Q, comm,
|
||||
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
|
||||
/* And now the ugly part. As some fragments can be inserted in the cant_match list,
|
||||
* every time we succesfully add a fragment in the unexpected list we have to make
|
||||
* sure the next one is not in the cant_match. Otherwise, we will endup in a deadlock
|
||||
* situation as the cant_match is only checked when a new fragment is received from
|
||||
* the network.
|
||||
*/
|
||||
for(frag = (mca_pml_bfo_recv_frag_t *)opal_list_get_first(&pml_proc->frags_cant_match);
|
||||
frag != (mca_pml_bfo_recv_frag_t *)opal_list_get_end(&pml_proc->frags_cant_match);
|
||||
frag = (mca_pml_bfo_recv_frag_t *)opal_list_get_next(frag)) {
|
||||
hdr = &frag->hdr.hdr_match;
|
||||
/* If the message has the next expected seq from that proc... */
|
||||
if(hdr->hdr_seq != pml_proc->expected_sequence)
|
||||
continue;
|
||||
|
||||
opal_list_remove_item(&pml_proc->frags_cant_match, (opal_list_item_t*)frag);
|
||||
goto add_fragment_to_unexpected;
|
||||
}
|
||||
} else {
|
||||
opal_list_append( &pml_proc->frags_cant_match, (opal_list_item_t*)frag );
|
||||
}
|
||||
}
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Tear down the PML-level state attached to a communicator: drop the
 * per-peer ompi_proc references held by the comm structure, release the
 * structure itself, and clear the communicator's back-pointer.
 */
int mca_pml_bfo_del_comm(ompi_communicator_t* comm)
{
    mca_pml_bfo_comm_t* bfo_comm = comm->c_pml_comm;
    int peer;

    /* Each proc slot holds a reference taken when the comm was added. */
    for (peer = 0; peer < comm->c_remote_group->grp_proc_count; peer++) {
        OBJ_RELEASE(bfo_comm->procs[peer].ompi_proc);
    }

    OBJ_RELEASE(comm->c_pml_comm);
    comm->c_pml_comm = NULL;

    return OMPI_SUCCESS;
}
|
||||
|
||||
|
||||
/*
|
||||
* For each proc setup a datastructure that indicates the BTLs
|
||||
* that can be used to reach the destination.
|
||||
*
|
||||
*/
|
||||
|
||||
/*
 * Add the given procs to the PML: verify peers run the same PML, wire
 * them up through the BML, sanity-check every initialized BTL's eager
 * limit against our header size, and register the receive callbacks.
 *
 * Fix vs. original: the early "return rc;" paths taken after
 * OBJ_CONSTRUCT(&reachable, ...) (opal_bitmap_init failure and
 * mca_pml_base_pml_check_selected failure) skipped OBJ_DESTRUCT and
 * leaked the bitmap's storage.  All failure paths now funnel through
 * cleanup_and_return so the bitmap is always destructed.
 */
int mca_pml_bfo_add_procs(ompi_proc_t** procs, size_t nprocs)
{
    opal_bitmap_t reachable;
    int rc;
    opal_list_item_t *item;

    if(nprocs == 0)
        return OMPI_SUCCESS;

    OBJ_CONSTRUCT(&reachable, opal_bitmap_t);
    rc = opal_bitmap_init(&reachable, (int)nprocs);
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;  /* was "return rc": leaked the constructed bitmap */

    /*
     * JJH: Disable this in FT enabled builds since
     * we use a wrapper PML. It will cause this check to
     * return failure as all processes will return the wrapper PML
     * component in use instead of the wrapped PML component underneath.
     */
#if OPAL_ENABLE_FT_CR == 0
    /* make sure remote procs are using the same PML as us */
    if (OMPI_SUCCESS != (rc = mca_pml_base_pml_check_selected("bfo",
                                                              procs,
                                                              nprocs))) {
        goto cleanup_and_return;  /* was "return rc": leaked the constructed bitmap */
    }
#endif

    rc = mca_bml.bml_add_procs( nprocs,
                                procs,
                                &reachable );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    /* Check that values supplied by all initialized btls will work
       for us.  Note that this is the list of all initialized BTLs,
       not the ones used for the just added procs.  This is a little
       overkill and inaccurate, as we may end up not using the BTL in
       question and all add_procs calls after the first one are
       duplicating an already completed check.  But the final
       initialization of the PML occurs before the final
       initialization of the BTLs, and iterating through the in-use
       BTLs requires iterating over the procs, as the BML does not
       expose all currently in use btls. */
    for (item = opal_list_get_first(&mca_btl_base_modules_initialized) ;
         item != opal_list_get_end(&mca_btl_base_modules_initialized) ;
         item = opal_list_get_next(item)) {
        mca_btl_base_selected_module_t *sm =
            (mca_btl_base_selected_module_t*) item;
        /* Every BTL must be able to carry at least one full match header
           in an eager fragment, otherwise the protocol cannot start. */
        if (sm->btl_module->btl_eager_limit < sizeof(mca_pml_bfo_hdr_t)) {
            opal_show_help("help-mpi-pml-bfo.txt", "eager_limit_too_small",
                           true,
                           sm->btl_component->btl_version.mca_component_name,
                           ompi_process_info.nodename,
                           sm->btl_component->btl_version.mca_component_name,
                           sm->btl_module->btl_eager_limit,
                           sm->btl_component->btl_version.mca_component_name,
                           sizeof(mca_pml_bfo_hdr_t),
                           sm->btl_component->btl_version.mca_component_name);
            rc = OMPI_ERR_BAD_PARAM;
            goto cleanup_and_return;
        }
    }

    /* TODO: Move these callback registration to another place */
    rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_MATCH,
                               mca_pml_bfo_recv_frag_callback_match,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDV,
                               mca_pml_bfo_recv_frag_callback_rndv,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RGET,
                               mca_pml_bfo_recv_frag_callback_rget,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_ACK,
                               mca_pml_bfo_recv_frag_callback_ack,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_FRAG,
                               mca_pml_bfo_recv_frag_callback_frag,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_PUT,
                               mca_pml_bfo_recv_frag_callback_put,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_FIN,
                               mca_pml_bfo_recv_frag_callback_fin,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

#if PML_BFO
    rc = mca_pml_bfo_register_callbacks();
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;
#endif /* PML_BFO */
    /* register error handlers */
    rc = mca_bml.bml_register_error((mca_btl_base_module_error_cb_fn_t)mca_pml_bfo_error_handler);
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

 cleanup_and_return:
    OBJ_DESTRUCT(&reachable);

    return rc;
}
|
||||
|
||||
/*
|
||||
* iterate through each proc and notify any PTLs associated
|
||||
* with the proc that it is/has gone away
|
||||
*/
|
||||
|
||||
/*
 * Remove procs from the PML.  The BFO PML keeps no per-proc state of
 * its own here, so the removal is delegated entirely to the BML
 * (note the argument order swap: the BML takes count first).
 */
int mca_pml_bfo_del_procs(ompi_proc_t** procs, size_t nprocs)
{
    int rc = mca_bml.bml_del_procs(nprocs, procs);
    return rc;
}
|
||||
|
||||
/*
|
||||
* diagnostics
|
||||
*/
|
||||
|
||||
/*
 * Diagnostic dump of a communicator's PML state: for every rank on the
 * communicator, print a rank banner and ask each eager-path BTL endpoint
 * to dump itself at the requested verbosity.
 *
 * @param comm     communicator whose per-proc BTL state is dumped
 * @param verbose  verbosity level forwarded to each BTL's btl_dump
 * @return OMPI_SUCCESS always
 */
int mca_pml_bfo_dump(struct ompi_communicator_t* comm, int verbose)
{
    struct mca_pml_comm_t* pml_comm = comm->c_pml_comm;
    int i;

    /* iterate through all procs on communicator */
    for( i = 0; i < (int)pml_comm->num_procs; i++ ) {
        mca_pml_bfo_comm_proc_t* proc = &pml_comm->procs[i];
        /* BML endpoint is cached on the proc under the BML endpoint tag */
        mca_bml_base_endpoint_t* ep = (mca_bml_base_endpoint_t*)proc->ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
        size_t n;

        opal_output(0, "[Rank %d]\n", i);
        /* dump all receive queues */

        /* dump all btls -- only the eager array is walked here */
        for(n=0; n<ep->btl_eager.arr_size; n++) {
            mca_bml_base_btl_t* bml_btl = &ep->btl_eager.bml_btls[n];
            bml_btl->btl->btl_dump(bml_btl->btl, bml_btl->btl_endpoint, verbose);
        }
    }
    return OMPI_SUCCESS;
}
|
||||
|
||||
/*
 * Completion callback attached to outgoing FIN descriptors
 * (see mca_pml_bfo_send_fin, which sets des_cbfunc to this function).
 *
 * In failover (PML_BFO) builds a failed FIN is re-posted instead of being
 * dropped; on success the descriptor's eager BML/BTL pairing is sanity-
 * checked by the failover macro.  In all builds, completion of a FIN is an
 * opportunity to drain any operations that were queued for lack of
 * resources on this BTL.
 */
static void mca_pml_bfo_fin_completion( mca_btl_base_module_t* btl,
                                        struct mca_btl_base_endpoint_t* ep,
                                        struct mca_btl_base_descriptor_t* des,
                                        int status )
{

    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;

#if PML_BFO
    /* Failover path: a failed FIN must be retried, not lost. */
    if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
        mca_pml_bfo_repost_fin(des);
        return;
    }
    MCA_PML_BFO_CHECK_EAGER_BML_BTL_ON_FIN_COMPLETION(bml_btl, btl, des);
#endif /* PML_BFO */
    /* check for pending requests */
    MCA_PML_BFO_PROGRESS_PENDING(bml_btl);
}
|
||||
|
||||
/**
 * Send a FIN to the peer. If we fail to send it (no more available
 * fragments, or the send itself failed), this function automatically adds
 * the FIN to the list of pending FINs, which guarantees that the FIN will
 * be sent later.
 */
|
||||
/*
 * Build and send a FIN control header to `proc` over `bml_btl`.
 * On resource exhaustion (descriptor allocation or send failure) the FIN
 * is queued on mca_pml_bfo.pckt_pending via MCA_PML_BFO_ADD_FIN_TO_PENDING
 * and OMPI_ERR_OUT_OF_RESOURCE is returned; the caller need not retry.
 * In failover (PML_BFO) builds the signature carries the extra matching
 * fields (seq/restartseq/ctx/src) needed to re-identify the request.
 */
int mca_pml_bfo_send_fin( ompi_proc_t* proc,
                          mca_bml_base_btl_t* bml_btl,
                          opal_ptr_t hdr_des,
                          uint8_t order,
#if PML_BFO
                          uint32_t status,
                          uint16_t seq,
                          uint8_t restartseq,
                          uint16_t ctx, uint32_t src)
#else /* PML_BFO */
                          uint32_t status )
#endif /* PML_BFO */
{
    mca_btl_base_descriptor_t* fin;
    mca_pml_bfo_fin_hdr_t* hdr;
    int rc;

    /* High-priority descriptor, exactly one FIN header long; ownership
       passes to the BTL on send. */
    mca_bml_base_alloc(bml_btl, &fin, order, sizeof(mca_pml_bfo_fin_hdr_t),
                       MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);

    if(NULL == fin) {
        /* No descriptor available: defer the FIN for a later retry. */
        MCA_PML_BFO_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    fin->des_cbfunc = mca_pml_bfo_fin_completion;
    fin->des_cbdata = NULL;

    /* fill in header */
    hdr = (mca_pml_bfo_fin_hdr_t*)fin->des_local->seg_addr.pval;
    hdr->hdr_common.hdr_flags = 0;
    hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FIN;
    hdr->hdr_des = hdr_des;
    hdr->hdr_fail = status;
#if PML_BFO
    /* Failover builds stash the peer on the descriptor and embed the
       matching triple so the FIN can be re-routed after a BTL failure. */
    fin->des_cbdata = proc;
    hdr->hdr_match.hdr_seq = seq;
    hdr->hdr_match.hdr_ctx = ctx;
    hdr->hdr_match.hdr_src = src;
    hdr->hdr_match.hdr_common.hdr_flags = restartseq; /* use unused hdr_flags field */
#endif /* PML_BFO */

    bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_FIN, proc);

    /* queue request */
    rc = mca_bml_base_send( bml_btl,
                            fin,
                            MCA_PML_BFO_HDR_TYPE_FIN );
    if( OPAL_LIKELY( rc >= 0 ) ) {
        /* rc == 1 means the send completed inline; resources may have been
           freed, so drain pending work now. */
        if( OPAL_LIKELY( 1 == rc ) ) {
            MCA_PML_BFO_PROGRESS_PENDING(bml_btl);
        }
        return OMPI_SUCCESS;
    }
    /* Send failed: give the descriptor back and queue the FIN for retry. */
    mca_bml_base_free(bml_btl, fin);
    MCA_PML_BFO_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
    return OMPI_ERR_OUT_OF_RESOURCE;
}
|
||||
|
||||
/*
 * Retry FIN/ACK control packets previously queued on
 * mca_pml_bfo.pckt_pending because resources were unavailable.
 * `bml_btl` identifies the BTL on which resources were just freed, not a
 * destination: packets whose peer cannot be reached over this BTL are
 * simply re-queued.  The list size is snapshotted up front so packets we
 * re-append are not re-examined in the same pass.
 */
void mca_pml_bfo_process_pending_packets(mca_bml_base_btl_t* bml_btl)
{
    mca_pml_bfo_pckt_pending_t *pckt;
    int32_t i, rc, s = (int32_t)opal_list_get_size(&mca_pml_bfo.pckt_pending);

    for(i = 0; i < s; i++) {
        mca_bml_base_btl_t *send_dst = NULL;
        /* Pop under the lock; the list is shared with the send path. */
        OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
        pckt = (mca_pml_bfo_pckt_pending_t*)
            opal_list_remove_first(&mca_pml_bfo.pckt_pending);
        OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
        if(NULL == pckt)
            break;
        /* Prefer the packet's recorded BML/BTL if it maps onto the freed
           BTL; otherwise look the BTL up in the peer's eager array. */
        if(pckt->bml_btl != NULL &&
           pckt->bml_btl->btl == bml_btl->btl) {
            send_dst = pckt->bml_btl;
        } else {
            mca_bml_base_endpoint_t* endpoint =
                (mca_bml_base_endpoint_t*) pckt->proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
            send_dst = mca_bml_base_btl_array_find(
                &endpoint->btl_eager, bml_btl->btl);
        }
        if(NULL == send_dst) {
            /* Peer not reachable via this BTL: keep the packet queued. */
            OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
            opal_list_append(&mca_pml_bfo.pckt_pending,
                             (opal_list_item_t*)pckt);
            OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
            continue;
        }

        switch(pckt->hdr.hdr_common.hdr_type) {
        case MCA_PML_BFO_HDR_TYPE_ACK:
            rc = mca_pml_bfo_recv_request_ack_send_btl(pckt->proc,
                                                       send_dst,
                                                       pckt->hdr.hdr_ack.hdr_src_req.lval,
                                                       pckt->hdr.hdr_ack.hdr_dst_req.pval,
                                                       pckt->hdr.hdr_ack.hdr_send_offset,
                                                       pckt->hdr.hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_NORDMA);
            if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
                /* Still no resources: put it back and stop the pass. */
                OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
                opal_list_append(&mca_pml_bfo.pckt_pending,
                                 (opal_list_item_t*)pckt);
                OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
                return;
            }
            break;
        case MCA_PML_BFO_HDR_TYPE_FIN:
            /* send_fin re-queues the packet itself on resource failure, so
               unlike the ACK case there is nothing to re-append here. */
            rc = mca_pml_bfo_send_fin(pckt->proc, send_dst,
                                      pckt->hdr.hdr_fin.hdr_des,
                                      pckt->order,
#if PML_BFO
                                      pckt->hdr.hdr_fin.hdr_fail,
                                      pckt->hdr.hdr_fin.hdr_match.hdr_seq,
                                      pckt->hdr.hdr_fin.hdr_match.hdr_common.hdr_flags,
                                      pckt->hdr.hdr_fin.hdr_match.hdr_ctx,
                                      pckt->hdr.hdr_fin.hdr_match.hdr_src);
#else /* PML_BFO */
                                      pckt->hdr.hdr_fin.hdr_fail);
#endif /* PML_BFO */
            if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
                return;
            }
            break;
        default:
            opal_output(0, "[%s:%d] wrong header type\n",
                        __FILE__, __LINE__);
            break;
        }
        /* We're done with this packet, return it back to the free list */
        MCA_PML_BFO_PCKT_PENDING_RETURN(pckt);
    }
}
|
||||
|
||||
/*
 * Retry deferred RDMA PUT/GET fragments queued on mca_pml_bfo.rdma_pending.
 * Each fragment records its own destination, so no BTL argument is needed.
 * The pass stops early as soon as a retry again runs out of resources;
 * the list size is snapshotted so nothing is examined twice in one pass.
 * Note: only the PUT path bumps frag->retries (presumably so the failover
 * code can compare it against rdma_put_retries_limit -- GETs are not
 * counted).
 */
void mca_pml_bfo_process_pending_rdma(void)
{
    mca_pml_bfo_rdma_frag_t* frag;
    int32_t i, rc, s = (int32_t)opal_list_get_size(&mca_pml_bfo.rdma_pending);

    for(i = 0; i < s; i++) {
        /* Pop under the lock; the list is shared with other paths. */
        OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
        frag = (mca_pml_bfo_rdma_frag_t*)
            opal_list_remove_first(&mca_pml_bfo.rdma_pending);
        OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
        if(NULL == frag)
            break;
        if(frag->rdma_state == MCA_PML_BFO_RDMA_PUT) {
            frag->retries++;
            rc = mca_pml_bfo_send_request_put_frag(frag);
        } else {
            rc = mca_pml_bfo_recv_request_get_frag(frag);
        }
        if(OMPI_ERR_OUT_OF_RESOURCE == rc)
            break;
    }
}
|
||||
|
||||
|
||||
/*
 * BTL error callback registered via mca_bml.bml_register_error().
 * In failover (PML_BFO) builds a non-fatal BTL error is handed to the
 * failover machinery, which attempts recovery on another BTL; anything
 * else (or any error in a non-failover build) aborts the job.
 */
void mca_pml_bfo_error_handler(
        struct mca_btl_base_module_t* btl, int32_t flags,
        ompi_proc_t* errproc, char* btlinfo ) {
#if PML_BFO
    if (flags & MCA_BTL_ERROR_FLAGS_NONFATAL) {
        mca_pml_bfo_failover_error_handler(btl, flags, errproc, btlinfo);
        return;
    }
#endif /* PML_BFO */
    /* Fatal (or failover disabled): no recovery possible, abort. */
    ompi_rte_abort(-1, NULL);
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 0
/* Checkpoint/restart disabled at configure time: the FT hook is a no-op. */
int mca_pml_bfo_ft_event( int state ) {
    return OMPI_SUCCESS;
}
#else
/*
 * Checkpoint/restart fault-tolerance hook.  Dispatches on the CRS state
 * (CHECKPOINT / CONTINUE / RESTART_PRE / RESTART / TERM) both before and
 * after forwarding the event to the BML.  On restart (and on the second
 * CONTINUE pass when "continue like restart" is enabled) the proc table
 * is refreshed, the modex is re-exchanged, and add_procs is re-run so the
 * BTLs rebuild their endpoints.  CONTINUE arrives twice per checkpoint;
 * first_continue_pass toggles so the heavy work runs on the second pass.
 */
int mca_pml_bfo_ft_event( int state )
{
    static bool first_continue_pass = false;
    ompi_proc_t** procs = NULL;
    size_t num_procs;
    int ret, p;

    if(OPAL_CRS_CHECKPOINT == state) {
        if( opal_cr_timing_barrier_enabled ) {
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1);
            if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
                opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
                return ret;
            }
        }

        OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0);
    }
    else if(OPAL_CRS_CONTINUE == state) {
        first_continue_pass = !first_continue_pass;

        if( !first_continue_pass ) {
            if( opal_cr_timing_barrier_enabled ) {
                OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0);
                if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
                    opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
                    return ret;
                }
            }
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
        }

        if (opal_cr_continue_like_restart && !first_continue_pass) {
            /*
             * Get a list of processes
             */
            procs = ompi_proc_all(&num_procs);
            if(NULL == procs) {
                return OMPI_ERR_OUT_OF_RESOURCE;
            }

            /*
             * Refresh the proc structure, and publish our proc info in the modex.
             * NOTE: Do *not* call ompi_proc_finalize as there are many places in
             * the code that point to indv. procs in this strucutre. For our
             * needs here we only need to fix up the modex, bml and pml
             * references.
             */
            if (OMPI_SUCCESS != (ret = ompi_proc_refresh())) {
                opal_output(0,
                            "pml:bfo: ft_event(Restart): proc_refresh Failed %d",
                            ret);
                for(p = 0; p < (int)num_procs; ++p) {
                    OBJ_RELEASE(procs[p]);
                }
                free (procs);
                return ret;
            }
        }
    }
    else if(OPAL_CRS_RESTART_PRE == state ) {
        /* Nothing here */
    }
    else if(OPAL_CRS_RESTART == state ) {
        /*
         * Get a list of processes
         */
        procs = ompi_proc_all(&num_procs);
        if(NULL == procs) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        /*
         * Clean out the modex information since it is invalid now.
         *    ompi_rte_purge_proc_attrs();
         * This happens at the ORTE level, so doing it again here will cause
         * some issues with socket caching.
         */


        /*
         * Refresh the proc structure, and publish our proc info in the modex.
         * NOTE: Do *not* call ompi_proc_finalize as there are many places in
         * the code that point to indv. procs in this strucutre. For our
         * needs here we only need to fix up the modex, bml and pml
         * references.
         */
        if (OMPI_SUCCESS != (ret = ompi_proc_refresh())) {
            opal_output(0,
                        "pml:bfo: ft_event(Restart): proc_refresh Failed %d",
                        ret);
            for(p = 0; p < (int)num_procs; ++p) {
                OBJ_RELEASE(procs[p]);
            }
            free (procs);
            return ret;
        }
    }
    else if(OPAL_CRS_TERM == state ) {
        ;
    }
    else {
        ;
    }

    /* Call the BML
     * BML is expected to call ft_event in
     * - BTL(s)
     * - MPool(s)
     */
    if( OMPI_SUCCESS != (ret = mca_bml.bml_ft_event(state))) {
        opal_output(0, "pml:base: ft_event: BML ft_event function failed: %d\n",
                    ret);
    }

    /* Post-BML phase: finish per-state work now that the lower layers
       have processed the event. */
    if(OPAL_CRS_CHECKPOINT == state) {
        OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P1);

        if( opal_cr_timing_barrier_enabled ) {
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR0);
            /* JJH Cannot barrier here due to progress engine -- ompi_rte_barrier();*/
        }
    }
    else if(OPAL_CRS_CONTINUE == state) {
        if( !first_continue_pass ) {
            if( opal_cr_timing_barrier_enabled ) {
                OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
                if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
                    opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
                    return ret;
                }
            }
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
        }

        if (opal_cr_continue_like_restart && !first_continue_pass) {
            /*
             * Exchange the modex information once again.
             * BTLs will have republished their modex information.
             */
            if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
                opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
                return ret;
            }

            /*
             * Startup the PML stack now that the modex is running again
             * Add the new procs (BTLs redo modex recv's)
             */
            if( OMPI_SUCCESS != (ret = mca_pml_bfo_add_procs(procs, num_procs) ) ) {
                opal_output(0, "pml:bfo: ft_event(Restart): Failed in add_procs (%d)", ret);
                return ret;
            }

            /* Is this barrier necessary ? JJH */
            if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
                opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
                return ret;
            }

            if( NULL != procs ) {
                for(p = 0; p < (int)num_procs; ++p) {
                    OBJ_RELEASE(procs[p]);
                }
                free(procs);
                procs = NULL;
            }
        }
        if( !first_continue_pass ) {
            if( opal_cr_timing_barrier_enabled ) {
                OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
                if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
                    opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
                    return ret;
                }
            }
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
        }
    }
    else if(OPAL_CRS_RESTART_PRE == state ) {
        /* Nothing here */
    }
    else if(OPAL_CRS_RESTART == state ) {
        /*
         * Exchange the modex information once again.
         * BTLs will have republished their modex information.
         */
        if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
            opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
            return ret;
        }

        /*
         * Startup the PML stack now that the modex is running again
         * Add the new procs (BTLs redo modex recv's)
         */
        if( OMPI_SUCCESS != (ret = mca_pml_bfo_add_procs(procs, num_procs) ) ) {
            opal_output(0, "pml:bfo: ft_event(Restart): Failed in add_procs (%d)", ret);
            return ret;
        }

        /* Is this barrier necessary ? JJH */
        if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
            opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
            return ret;
        }

        if( NULL != procs ) {
            for(p = 0; p < (int)num_procs; ++p) {
                OBJ_RELEASE(procs[p]);
            }
            free(procs);
            procs = NULL;
        }
    }
    else if(OPAL_CRS_TERM == state ) {
        ;
    }
    else {
        ;
    }

    return OMPI_SUCCESS;
}
#endif /* OPAL_ENABLE_FT_CR */
|
||||
|
||||
int mca_pml_bfo_com_btl_comp(const void *v1, const void *v2)
|
||||
{
|
||||
const mca_pml_bfo_com_btl_t *b1 = (const mca_pml_bfo_com_btl_t *) v1;
|
||||
const mca_pml_bfo_com_btl_t *b2 = (const mca_pml_bfo_com_btl_t *) v2;
|
||||
|
||||
if(b1->bml_btl->btl_weight < b2->bml_btl->btl_weight)
|
||||
return 1;
|
||||
if(b1->bml_btl->btl_weight > b2->bml_btl->btl_weight)
|
||||
return -1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -1,362 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2013 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*/
|
||||
|
||||
#ifndef MCA_PML_BFO_H
|
||||
#define MCA_PML_BFO_H
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "opal/class/opal_free_list.h"
|
||||
#include "ompi/request/request.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/mca/pml/base/pml_base_request.h"
|
||||
#include "ompi/mca/pml/base/pml_base_bsend.h"
|
||||
#include "ompi/mca/pml/base/pml_base_sendreq.h"
|
||||
#include "ompi/datatype/ompi_datatype.h"
|
||||
#include "pml_bfo_hdr.h"
|
||||
#include "ompi/mca/bml/base/base.h"
|
||||
#include "ompi/proc/proc.h"
|
||||
#include "opal/mca/allocator/base/base.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/**
 * BFO PML module: global state for the "BTL failover" point-to-point
 * messaging layer (MCA parameters, free lists, and pending-work queues).
 */

struct mca_pml_bfo_t {
    mca_pml_base_module_t super;   /* base PML interface (function table) */

    int priority;                      /* component selection priority */
    int free_list_num;                 /* initial size of free list */
    int free_list_max;                 /* maximum size of free list */
    int free_list_inc;                 /* number of elements to grow free list */
    unsigned int send_pipeline_depth;  /* max outstanding send fragments */
    unsigned int recv_pipeline_depth;  /* max outstanding receive fragments */
    unsigned int rdma_put_retries_limit;  /* retry cap for failed RDMA PUTs */
    int max_rdma_per_request;
    int max_send_per_range;
    bool leave_pinned;                 /* keep RDMA registrations cached */
    int leave_pinned_pipeline;

    /* lock queue access */
    opal_mutex_t lock;

    /* free lists */
    opal_free_list_t rdma_frags;
    opal_free_list_t recv_frags;
    opal_free_list_t pending_pckts;
    opal_free_list_t buffers;
    opal_free_list_t send_ranges;

    /* list of pending operations (deferred for lack of resources) */
    opal_list_t pckt_pending;
    opal_list_t send_pending;
    opal_list_t recv_pending;
    opal_list_t rdma_pending;
    /* List of pending fragments without a matching communicator */
    opal_list_t non_existing_communicator_pending;
    bool enabled;                      /* set once the PML is enabled */
    char* allocator_name;              /* name of the buffer allocator component */
    mca_allocator_base_module_t* allocator;
    unsigned int unexpected_limit;     /* size cap for buffered unexpected msgs */
};
typedef struct mca_pml_bfo_t mca_pml_bfo_t;
|
||||
|
||||
extern mca_pml_bfo_t mca_pml_bfo;
|
||||
extern int mca_pml_bfo_output;
|
||||
|
||||
/*
|
||||
* PML interface functions.
|
||||
*/
|
||||
|
||||
extern int mca_pml_bfo_add_comm(
|
||||
struct ompi_communicator_t* comm
|
||||
);
|
||||
|
||||
extern int mca_pml_bfo_del_comm(
|
||||
struct ompi_communicator_t* comm
|
||||
);
|
||||
|
||||
extern int mca_pml_bfo_add_procs(
|
||||
struct ompi_proc_t **procs,
|
||||
size_t nprocs
|
||||
);
|
||||
|
||||
extern int mca_pml_bfo_del_procs(
|
||||
struct ompi_proc_t **procs,
|
||||
size_t nprocs
|
||||
);
|
||||
|
||||
extern int mca_pml_bfo_enable( bool enable );
|
||||
|
||||
extern int mca_pml_bfo_progress(void);
|
||||
|
||||
extern int mca_pml_bfo_iprobe( int dst,
|
||||
int tag,
|
||||
struct ompi_communicator_t* comm,
|
||||
int *matched,
|
||||
ompi_status_public_t* status );
|
||||
|
||||
extern int mca_pml_bfo_probe( int dst,
|
||||
int tag,
|
||||
struct ompi_communicator_t* comm,
|
||||
ompi_status_public_t* status );
|
||||
|
||||
extern int mca_pml_bfo_improbe( int dst,
|
||||
int tag,
|
||||
struct ompi_communicator_t* comm,
|
||||
int *matched,
|
||||
struct ompi_message_t **message,
|
||||
ompi_status_public_t* status );
|
||||
|
||||
extern int mca_pml_bfo_mprobe( int dst,
|
||||
int tag,
|
||||
struct ompi_communicator_t* comm,
|
||||
struct ompi_message_t **message,
|
||||
ompi_status_public_t* status );
|
||||
|
||||
extern int mca_pml_bfo_isend_init( void *buf,
|
||||
size_t count,
|
||||
ompi_datatype_t *datatype,
|
||||
int dst,
|
||||
int tag,
|
||||
mca_pml_base_send_mode_t mode,
|
||||
struct ompi_communicator_t* comm,
|
||||
struct ompi_request_t **request );
|
||||
|
||||
extern int mca_pml_bfo_isend( void *buf,
|
||||
size_t count,
|
||||
ompi_datatype_t *datatype,
|
||||
int dst,
|
||||
int tag,
|
||||
mca_pml_base_send_mode_t mode,
|
||||
struct ompi_communicator_t* comm,
|
||||
struct ompi_request_t **request );
|
||||
|
||||
extern int mca_pml_bfo_send( void *buf,
|
||||
size_t count,
|
||||
ompi_datatype_t *datatype,
|
||||
int dst,
|
||||
int tag,
|
||||
mca_pml_base_send_mode_t mode,
|
||||
struct ompi_communicator_t* comm );
|
||||
|
||||
extern int mca_pml_bfo_irecv_init( void *buf,
|
||||
size_t count,
|
||||
ompi_datatype_t *datatype,
|
||||
int src,
|
||||
int tag,
|
||||
struct ompi_communicator_t* comm,
|
||||
struct ompi_request_t **request );
|
||||
|
||||
extern int mca_pml_bfo_irecv( void *buf,
|
||||
size_t count,
|
||||
ompi_datatype_t *datatype,
|
||||
int src,
|
||||
int tag,
|
||||
struct ompi_communicator_t* comm,
|
||||
struct ompi_request_t **request );
|
||||
|
||||
extern int mca_pml_bfo_recv( void *buf,
|
||||
size_t count,
|
||||
ompi_datatype_t *datatype,
|
||||
int src,
|
||||
int tag,
|
||||
struct ompi_communicator_t* comm,
|
||||
ompi_status_public_t* status );
|
||||
|
||||
extern int mca_pml_bfo_imrecv( void *buf,
|
||||
size_t count,
|
||||
ompi_datatype_t *datatype,
|
||||
struct ompi_message_t **message,
|
||||
struct ompi_request_t **request );
|
||||
|
||||
extern int mca_pml_bfo_mrecv( void *buf,
|
||||
size_t count,
|
||||
ompi_datatype_t *datatype,
|
||||
struct ompi_message_t **message,
|
||||
ompi_status_public_t* status );
|
||||
|
||||
extern int mca_pml_bfo_dump( struct ompi_communicator_t* comm,
|
||||
int verbose );
|
||||
|
||||
extern int mca_pml_bfo_start( size_t count,
|
||||
ompi_request_t** requests );
|
||||
|
||||
extern int mca_pml_bfo_ft_event( int state );
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
struct mca_pml_bfo_pckt_pending_t {
|
||||
opal_free_list_item_t super;
|
||||
ompi_proc_t* proc;
|
||||
mca_pml_bfo_hdr_t hdr;
|
||||
struct mca_bml_base_btl_t *bml_btl;
|
||||
uint8_t order;
|
||||
};
|
||||
typedef struct mca_pml_bfo_pckt_pending_t mca_pml_bfo_pckt_pending_t;
|
||||
OBJ_CLASS_DECLARATION(mca_pml_bfo_pckt_pending_t);
|
||||
|
||||
#define MCA_PML_BFO_PCKT_PENDING_ALLOC(pckt) \
|
||||
do { \
|
||||
opal_free_list_item_t* item; \
|
||||
OPAL_FREE_LIST_WAIT(&mca_pml_bfo.pending_pckts, item); \
|
||||
pckt = (mca_pml_bfo_pckt_pending_t*)item; \
|
||||
} while (0)
|
||||
|
||||
#define MCA_PML_BFO_PCKT_PENDING_RETURN(pckt) \
|
||||
do { \
|
||||
/* return packet */ \
|
||||
OPAL_FREE_LIST_RETURN(&mca_pml_bfo.pending_pckts, \
|
||||
(opal_free_list_item_t*)pckt); \
|
||||
} while(0)
|
||||
|
||||
#define MCA_PML_BFO_ADD_FIN_TO_PENDING(P, D, B, O, S) \
|
||||
do { \
|
||||
mca_pml_bfo_pckt_pending_t *_pckt; \
|
||||
\
|
||||
MCA_PML_BFO_PCKT_PENDING_ALLOC(_pckt); \
|
||||
_pckt->hdr.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FIN; \
|
||||
_pckt->hdr.hdr_fin.hdr_des = (D); \
|
||||
_pckt->hdr.hdr_fin.hdr_fail = (S); \
|
||||
_pckt->proc = (P); \
|
||||
_pckt->bml_btl = (B); \
|
||||
_pckt->order = (O); \
|
||||
OPAL_THREAD_LOCK(&mca_pml_bfo.lock); \
|
||||
opal_list_append(&mca_pml_bfo.pckt_pending, \
|
||||
(opal_list_item_t*)_pckt); \
|
||||
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); \
|
||||
} while(0)
|
||||
|
||||
|
||||
int mca_pml_bfo_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
|
||||
#if PML_BFO
|
||||
opal_ptr_t hdr_des, uint8_t order, uint32_t status,
|
||||
uint16_t seq, uint8_t reqseq, uint16_t ctx, uint32_t src);
|
||||
#else /* PML_BFO */
|
||||
opal_ptr_t hdr_des, uint8_t order, uint32_t status);
|
||||
#endif /* PML_BFO */
|
||||
|
||||
/* This function tries to resend FIN/ACK packets from pckt_pending queue.
|
||||
* Packets are added to the queue when sending of FIN or ACK is failed due to
|
||||
* resource unavailability. bml_btl passed to the function doesn't represents
|
||||
* packet's destination, it represents BTL on which resource was freed, so only
|
||||
* this BTL should be considered for resending packets */
|
||||
void mca_pml_bfo_process_pending_packets(mca_bml_base_btl_t* bml_btl);
|
||||
|
||||
/* This function retries failed PUT/GET operations on frag. When RDMA operation
|
||||
* cannot be accomplished for some reason, frag is put on the rdma_pending list.
|
||||
* Later the operation is retried. The destination of RDMA operation is stored
|
||||
* inside the frag structure */
|
||||
void mca_pml_bfo_process_pending_rdma(void);
|
||||
|
||||
#define MCA_PML_BFO_PROGRESS_PENDING(bml_btl) \
|
||||
do { \
|
||||
if(opal_list_get_size(&mca_pml_bfo.pckt_pending)) \
|
||||
mca_pml_bfo_process_pending_packets(bml_btl); \
|
||||
if(opal_list_get_size(&mca_pml_bfo.recv_pending)) \
|
||||
mca_pml_bfo_recv_request_process_pending(); \
|
||||
if(opal_list_get_size(&mca_pml_bfo.send_pending)) \
|
||||
mca_pml_bfo_send_request_process_pending(bml_btl); \
|
||||
if(opal_list_get_size(&mca_pml_bfo.rdma_pending)) \
|
||||
mca_pml_bfo_process_pending_rdma(); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
* Compute the total number of bytes on supplied descriptor
|
||||
*/
|
||||
/*
 * Compute the total payload byte count of a descriptor whose segments are
 * laid out with stride `seg_size`: sum every segment's seg_len, minus the
 * `hdrlen` header bytes.  The header is subtracted up front by starting
 * the unsigned accumulator at (size_t)-hdrlen (deliberate wrap-around).
 */
static inline int mca_pml_bfo_compute_segment_length (size_t seg_size, void *segments, size_t count,
                                                      size_t hdrlen) {
    size_t idx;
    size_t total = -hdrlen;  /* unsigned wrap: pre-subtract the header */

    for (idx = 0; idx < count; ++idx) {
        const mca_btl_base_segment_t *seg =
            (const mca_btl_base_segment_t *)((char *) segments + idx * seg_size);
        total += seg->seg_len;
    }

    return total;
}
|
||||
|
||||
/*
 * Same as mca_pml_bfo_compute_segment_length but for a plain contiguous
 * array of base segments: total payload = sum(seg_len) - hdrlen, with the
 * header pre-subtracted via unsigned wrap-around of the accumulator.
 */
static inline int mca_pml_bfo_compute_segment_length_base (mca_btl_base_segment_t *segments,
                                                           size_t count, size_t hdrlen) {
    size_t idx;
    size_t total = -hdrlen;  /* unsigned wrap: pre-subtract the header */

    for (idx = 0; idx < count; ++idx) {
        total += segments[idx].seg_len;
    }

    return total;
}
|
||||
|
||||
/* represent BTL chosen for sending request */
|
||||
struct mca_pml_bfo_com_btl_t {
|
||||
mca_bml_base_btl_t *bml_btl;
|
||||
struct mca_mpool_base_registration_t* btl_reg;
|
||||
size_t length;
|
||||
};
|
||||
typedef struct mca_pml_bfo_com_btl_t mca_pml_bfo_com_btl_t;
|
||||
|
||||
int mca_pml_bfo_com_btl_comp(const void *v1, const void *v2);
|
||||
|
||||
/* Calculate what percentage of a message to send through each BTL according to
|
||||
* relative weight */
|
||||
/*
 * Split `size` bytes across `num_btls` BTLs in proportion to each BTL's
 * btl_weight / weight_total, filling btls[i].length.  BTLs are first
 * sorted by descending weight; a BTL whose share would not exceed its
 * eager limit simply takes all remaining bytes.  Rounding leftovers are
 * assigned to the heaviest BTL so the lengths always sum to `size`.
 */
static inline void
mca_pml_bfo_calc_weighted_length( mca_pml_bfo_com_btl_t *btls, int num_btls, size_t size,
                                  double weight_total )
{
    int i;
    size_t length_left;

    /* shortcut for common case for only one BTL */
    if( OPAL_LIKELY(1 == num_btls) ) {
        btls[0].length = size;
        return;
    }

    /* sort BTLs according of their weights so BTLs with smaller weight will
     * not hijack all of the traffic */
    qsort( btls, num_btls, sizeof(mca_pml_bfo_com_btl_t),
           mca_pml_bfo_com_btl_comp );

    for(length_left = size, i = 0; i < num_btls; i++) {
        mca_bml_base_btl_t* bml_btl = btls[i].bml_btl;
        size_t length = 0;
        if( OPAL_UNLIKELY(0 != length_left) ) {
            /* Weighted share, but a BTL whose remaining bytes fit under its
             * eager limit just takes everything that is left. */
            length = (length_left > bml_btl->btl->btl_eager_limit)?
                ((size_t)(size * (bml_btl->btl_weight / weight_total))) :
                length_left;

            /* Clamp: floating-point share may overshoot the remainder. */
            if(length > length_left)
                length = length_left;
            length_left -= length;
        }
        btls[i].length = length;
    }

    /* account for rounding errors */
    btls[0].length += length_left;
}
|
||||
|
||||
#endif
|
@ -1,100 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include <string.h>
|
||||
|
||||
#include "pml_bfo.h"
|
||||
#include "pml_bfo_comm.h"
|
||||
|
||||
|
||||
|
||||
static void mca_pml_bfo_comm_proc_construct(mca_pml_bfo_comm_proc_t* proc)
|
||||
{
|
||||
proc->expected_sequence = 1;
|
||||
proc->ompi_proc = NULL;
|
||||
proc->send_sequence = 0;
|
||||
OBJ_CONSTRUCT(&proc->frags_cant_match, opal_list_t);
|
||||
OBJ_CONSTRUCT(&proc->specific_receives, opal_list_t);
|
||||
OBJ_CONSTRUCT(&proc->unexpected_frags, opal_list_t);
|
||||
}
|
||||
|
||||
|
||||
static void mca_pml_bfo_comm_proc_destruct(mca_pml_bfo_comm_proc_t* proc)
|
||||
{
|
||||
OBJ_DESTRUCT(&proc->frags_cant_match);
|
||||
OBJ_DESTRUCT(&proc->specific_receives);
|
||||
OBJ_DESTRUCT(&proc->unexpected_frags);
|
||||
}
|
||||
|
||||
|
||||
static OBJ_CLASS_INSTANCE(
|
||||
mca_pml_bfo_comm_proc_t,
|
||||
opal_object_t,
|
||||
mca_pml_bfo_comm_proc_construct,
|
||||
mca_pml_bfo_comm_proc_destruct);
|
||||
|
||||
|
||||
static void mca_pml_bfo_comm_construct(mca_pml_bfo_comm_t* comm)
|
||||
{
|
||||
OBJ_CONSTRUCT(&comm->wild_receives, opal_list_t);
|
||||
OBJ_CONSTRUCT(&comm->matching_lock, opal_mutex_t);
|
||||
comm->recv_sequence = 0;
|
||||
comm->procs = NULL;
|
||||
comm->last_probed = 0;
|
||||
comm->num_procs = 0;
|
||||
}
|
||||
|
||||
|
||||
static void mca_pml_bfo_comm_destruct(mca_pml_bfo_comm_t* comm)
|
||||
{
|
||||
size_t i;
|
||||
for(i=0; i<comm->num_procs; i++)
|
||||
OBJ_DESTRUCT((&comm->procs[i]));
|
||||
if(NULL != comm->procs)
|
||||
free(comm->procs);
|
||||
OBJ_DESTRUCT(&comm->wild_receives);
|
||||
OBJ_DESTRUCT(&comm->matching_lock);
|
||||
}
|
||||
|
||||
|
||||
OBJ_CLASS_INSTANCE(
|
||||
mca_pml_bfo_comm_t,
|
||||
opal_object_t,
|
||||
mca_pml_bfo_comm_construct,
|
||||
mca_pml_bfo_comm_destruct);
|
||||
|
||||
|
||||
int mca_pml_bfo_comm_init_size(mca_pml_bfo_comm_t* comm, size_t size)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
/* send message sequence-number support - sender side */
|
||||
comm->procs = (mca_pml_bfo_comm_proc_t*)malloc(sizeof(mca_pml_bfo_comm_proc_t)*size);
|
||||
if(NULL == comm->procs) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
for(i=0; i<size; i++) {
|
||||
OBJ_CONSTRUCT(comm->procs+i, mca_pml_bfo_comm_proc_t);
|
||||
}
|
||||
comm->num_procs = size;
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -1,81 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*/
|
||||
#ifndef MCA_PML_BFO_COMM_H
|
||||
#define MCA_PML_BFO_COMM_H
|
||||
|
||||
#include "opal/threads/mutex.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "ompi/proc/proc.h"
|
||||
BEGIN_C_DECLS
|
||||
|
||||
|
||||
struct mca_pml_bfo_comm_proc_t {
|
||||
opal_object_t super;
|
||||
uint16_t expected_sequence; /**< send message sequence number - receiver side */
|
||||
struct ompi_proc_t* ompi_proc;
|
||||
#if OPAL_ENABLE_MULTI_THREADS
|
||||
volatile int32_t send_sequence; /**< send side sequence number */
|
||||
#else
|
||||
int32_t send_sequence; /**< send side sequence number */
|
||||
#endif
|
||||
opal_list_t frags_cant_match; /**< out-of-order fragment queues */
|
||||
opal_list_t specific_receives; /**< queues of unmatched specific receives */
|
||||
opal_list_t unexpected_frags; /**< unexpected fragment queues */
|
||||
};
|
||||
typedef struct mca_pml_bfo_comm_proc_t mca_pml_bfo_comm_proc_t;
|
||||
|
||||
|
||||
/**
|
||||
* Cached on ompi_communicator_t to hold queues/state
|
||||
* used by the PML<->PTL interface for matching logic.
|
||||
*/
|
||||
struct mca_pml_comm_t {
|
||||
opal_object_t super;
|
||||
#if OPAL_ENABLE_MULTI_THREADS
|
||||
volatile uint32_t recv_sequence; /**< recv request sequence number - receiver side */
|
||||
#else
|
||||
uint32_t recv_sequence; /**< recv request sequence number - receiver side */
|
||||
#endif
|
||||
opal_mutex_t matching_lock; /**< matching lock */
|
||||
opal_list_t wild_receives; /**< queue of unmatched wild (source process not specified) receives */
|
||||
mca_pml_bfo_comm_proc_t* procs;
|
||||
size_t num_procs;
|
||||
size_t last_probed;
|
||||
};
|
||||
typedef struct mca_pml_comm_t mca_pml_bfo_comm_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_pml_bfo_comm_t);
|
||||
|
||||
|
||||
/**
|
||||
* Initialize an instance of mca_pml_bfo_comm_t based on the communicator size.
|
||||
*
|
||||
* @param comm Instance of mca_pml_bfo_comm_t
|
||||
* @param size Size of communicator
|
||||
* @return OMPI_SUCCESS or error status on failure.
|
||||
*/
|
||||
|
||||
extern int mca_pml_bfo_comm_init_size(mca_pml_bfo_comm_t* comm, size_t size);
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
||||
|
@ -1,274 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
#include "mpi.h"
|
||||
#include "ompi/runtime/params.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/mca/pml/base/pml_base_bsend.h"
|
||||
#include "pml_bfo.h"
|
||||
#include "pml_bfo_hdr.h"
|
||||
#include "pml_bfo_sendreq.h"
|
||||
#include "pml_bfo_recvreq.h"
|
||||
#include "pml_bfo_rdmafrag.h"
|
||||
#include "pml_bfo_recvfrag.h"
|
||||
#include "ompi/mca/bml/base/base.h"
|
||||
#include "pml_bfo_component.h"
|
||||
#include "opal/mca/allocator/base/base.h"
|
||||
#include "opal/runtime/opal_params.h"
|
||||
|
||||
OBJ_CLASS_INSTANCE( mca_pml_bfo_pckt_pending_t,
|
||||
ompi_free_list_item_t,
|
||||
NULL,
|
||||
NULL );
|
||||
|
||||
static int mca_pml_bfo_component_register(void);
|
||||
static int mca_pml_bfo_component_open(void);
|
||||
static int mca_pml_bfo_component_close(void);
|
||||
static mca_pml_base_module_t*
|
||||
mca_pml_bfo_component_init( int* priority, bool enable_progress_threads,
|
||||
bool enable_mpi_threads );
|
||||
static int mca_pml_bfo_component_fini(void);
|
||||
int mca_pml_bfo_output = 0;
|
||||
static int mca_pml_bfo_verbose = 0;
|
||||
|
||||
mca_pml_base_component_2_0_0_t mca_pml_bfo_component = {
|
||||
|
||||
/* First, the mca_base_component_t struct containing meta
|
||||
information about the component itself */
|
||||
|
||||
.pmlm_version = {
|
||||
MCA_PML_BASE_VERSION_2_0_0,
|
||||
|
||||
.mca_component_name = "bfo",
|
||||
MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
|
||||
OMPI_RELEASE_VERSION),
|
||||
.mca_open_component = mca_pml_bfo_component_open,
|
||||
.mca_close_component = mca_pml_bfo_component_close,
|
||||
.mca_register_component_params = mca_pml_bfo_component_register,
|
||||
},
|
||||
.pmlm_data = {
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
|
||||
.pmlm_init = mca_pml_bfo_component_init,
|
||||
.pmlm_finalize = mca_pml_bfo_component_fini,
|
||||
};
|
||||
|
||||
void *mca_pml_bfo_seg_alloc( struct mca_mpool_base_module_t* mpool,
|
||||
size_t* size,
|
||||
mca_mpool_base_registration_t** registration);
|
||||
|
||||
void mca_pml_bfo_seg_free( struct mca_mpool_base_module_t* mpool,
|
||||
void* segment );
|
||||
|
||||
static inline int mca_pml_bfo_param_register_int(
|
||||
const char* param_name,
|
||||
int default_value,
|
||||
int *storage)
|
||||
{
|
||||
*storage = default_value;
|
||||
(void) mca_base_component_var_register(&mca_pml_bfo_component.pmlm_version, param_name,
|
||||
NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY, storage);
|
||||
|
||||
return *storage;
|
||||
}
|
||||
|
||||
static inline unsigned int mca_pml_bfo_param_register_uint(
|
||||
const char* param_name,
|
||||
unsigned int default_value,
|
||||
unsigned int *storage)
|
||||
{
|
||||
*storage = default_value;
|
||||
(void) mca_base_component_var_register(&mca_pml_bfo_component.pmlm_version, param_name,
|
||||
NULL, MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY, storage);
|
||||
|
||||
return *storage;
|
||||
}
|
||||
|
||||
static int mca_pml_bfo_component_register(void)
|
||||
{
|
||||
int default_priority;
|
||||
|
||||
#if PML_BFO
|
||||
default_priority = 5;
|
||||
#else /* PML_BFO */
|
||||
default_priority = 20;
|
||||
mca_pml_bfo_param_register_int("priority", 20);
|
||||
#endif /* PML_BFO */
|
||||
|
||||
(void) mca_pml_bfo_param_register_int("verbose", 0, &mca_pml_bfo_verbose);
|
||||
(void) mca_pml_bfo_param_register_int("free_list_num", 4, &mca_pml_bfo.free_list_num);
|
||||
(void) mca_pml_bfo_param_register_int("free_list_max", -1, &mca_pml_bfo.free_list_max);
|
||||
(void) mca_pml_bfo_param_register_int("free_list_inc", 64, &mca_pml_bfo.free_list_inc);
|
||||
(void) mca_pml_bfo_param_register_int("priority", default_priority, &mca_pml_bfo.priority);
|
||||
(void) mca_pml_bfo_param_register_uint("send_pipeline_depth", 3, &mca_pml_bfo.send_pipeline_depth);
|
||||
(void) mca_pml_bfo_param_register_uint("recv_pipeline_depth", 4, &mca_pml_bfo.recv_pipeline_depth);
|
||||
(void) mca_pml_bfo_param_register_uint("rdma_put_retries_limit", 5, &mca_pml_bfo.rdma_put_retries_limit);
|
||||
(void) mca_pml_bfo_param_register_int("max_rdma_per_request", 4, &mca_pml_bfo.max_rdma_per_request);
|
||||
(void) mca_pml_bfo_param_register_int("max_send_per_range", 4, &mca_pml_bfo.max_send_per_range);
|
||||
(void) mca_pml_bfo_param_register_uint("unexpected_limit", 128, &mca_pml_bfo.unexpected_limit);
|
||||
|
||||
mca_pml_bfo.allocator_name = "bucket";
|
||||
(void) mca_base_component_var_register(&mca_pml_bfo_component.pmlm_version,
|
||||
"allocator",
|
||||
"Name of allocator component for unexpected messages",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_pml_bfo.allocator_name);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
static int mca_pml_bfo_component_open(void)
|
||||
{
|
||||
mca_pml_bfo_output = opal_output_open(NULL);
|
||||
opal_output_set_verbosity(mca_pml_bfo_output, mca_pml_bfo_verbose);
|
||||
|
||||
mca_pml_bfo.enabled = false;
|
||||
return mca_base_framework_open(&ompi_bml_base_framework, 0);
|
||||
}
|
||||
|
||||
|
||||
static int mca_pml_bfo_component_close(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (OMPI_SUCCESS != (rc = mca_base_framework_close(&ompi_bml_base_framework))) {
|
||||
return rc;
|
||||
}
|
||||
opal_output_close(mca_pml_bfo_output);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static mca_pml_base_module_t*
|
||||
mca_pml_bfo_component_init( int* priority,
|
||||
bool enable_progress_threads,
|
||||
bool enable_mpi_threads )
|
||||
{
|
||||
mca_allocator_base_component_t* allocator_component;
|
||||
|
||||
opal_output_verbose( 10, mca_pml_bfo_output,
|
||||
"in bfo, my priority is %d\n", mca_pml_bfo.priority);
|
||||
|
||||
if((*priority) > mca_pml_bfo.priority) {
|
||||
*priority = mca_pml_bfo.priority;
|
||||
return NULL;
|
||||
}
|
||||
*priority = mca_pml_bfo.priority;
|
||||
|
||||
allocator_component = mca_allocator_component_lookup( mca_pml_bfo.allocator_name );
|
||||
if(NULL == allocator_component) {
|
||||
opal_output(0, "mca_pml_bfo_component_init: can't find allocator: %s\n", mca_pml_bfo.allocator_name);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
mca_pml_bfo.allocator = allocator_component->allocator_init(true,
|
||||
mca_pml_bfo_seg_alloc,
|
||||
mca_pml_bfo_seg_free, NULL);
|
||||
if(NULL == mca_pml_bfo.allocator) {
|
||||
opal_output(0, "mca_pml_bfo_component_init: unable to initialize allocator\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
if(OMPI_SUCCESS != mca_bml_base_init( enable_progress_threads,
|
||||
enable_mpi_threads)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Set this here (vs in component_open()) because
|
||||
opal_leave_pinned* may have been set after MCA params were
|
||||
read (e.g., by the openib btl) */
|
||||
mca_pml_bfo.leave_pinned = (1 == opal_leave_pinned);
|
||||
mca_pml_bfo.leave_pinned_pipeline = (int) opal_leave_pinned_pipeline;
|
||||
|
||||
return &mca_pml_bfo.super;
|
||||
}
|
||||
|
||||
int mca_pml_bfo_component_fini(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
/* Shutdown BML */
|
||||
if(OMPI_SUCCESS != (rc = mca_bml.bml_finalize()))
|
||||
return rc;
|
||||
|
||||
if(!mca_pml_bfo.enabled)
|
||||
return OMPI_SUCCESS; /* never selected.. return success.. */
|
||||
mca_pml_bfo.enabled = false; /* not anymore */
|
||||
|
||||
OBJ_DESTRUCT(&mca_pml_bfo.rdma_pending);
|
||||
OBJ_DESTRUCT(&mca_pml_bfo.pckt_pending);
|
||||
OBJ_DESTRUCT(&mca_pml_bfo.recv_pending);
|
||||
OBJ_DESTRUCT(&mca_pml_bfo.send_pending);
|
||||
OBJ_DESTRUCT(&mca_pml_bfo.non_existing_communicator_pending);
|
||||
OBJ_DESTRUCT(&mca_pml_bfo.buffers);
|
||||
OBJ_DESTRUCT(&mca_pml_bfo.pending_pckts);
|
||||
OBJ_DESTRUCT(&mca_pml_bfo.recv_frags);
|
||||
OBJ_DESTRUCT(&mca_pml_bfo.rdma_frags);
|
||||
OBJ_DESTRUCT(&mca_pml_bfo.lock);
|
||||
|
||||
if(OMPI_SUCCESS != (rc = mca_pml_bfo.allocator->alc_finalize(mca_pml_bfo.allocator))) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
#if 0
|
||||
if (mca_pml_base_send_requests.fl_num_allocated !=
|
||||
mca_pml_base_send_requests.super.opal_list_length) {
|
||||
opal_output(0, "bfo send requests: %d allocated %d returned\n",
|
||||
mca_pml_base_send_requests.fl_num_allocated,
|
||||
mca_pml_base_send_requests.super.opal_list_length);
|
||||
}
|
||||
if (mca_pml_base_recv_requests.fl_num_allocated !=
|
||||
mca_pml_base_recv_requests.super.opal_list_length) {
|
||||
opal_output(0, "bfo recv requests: %d allocated %d returned\n",
|
||||
mca_pml_base_recv_requests.fl_num_allocated,
|
||||
mca_pml_base_recv_requests.super.opal_list_length);
|
||||
}
|
||||
#endif
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
void *mca_pml_bfo_seg_alloc( struct mca_mpool_base_module_t* mpool,
|
||||
size_t* size,
|
||||
mca_mpool_base_registration_t** registration) {
|
||||
return malloc(*size);
|
||||
}
|
||||
|
||||
void mca_pml_bfo_seg_free( struct mca_mpool_base_module_t* mpool,
|
||||
void* segment ) {
|
||||
free(segment);
|
||||
}
|
@ -1,33 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*/
|
||||
|
||||
#ifndef MCA_PML_BFO_COMPONENT_H
|
||||
#define MCA_PML_BFO_COMPONENT_H
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* PML module functions.
|
||||
*/
|
||||
OMPI_MODULE_DECLSPEC extern mca_pml_base_component_2_0_0_t mca_pml_bfo_component;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
@ -1,157 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "opal/prefetch.h"
|
||||
#include "opal/mca/btl/btl.h"
|
||||
#include "opal/mca/mpool/mpool.h"
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "pml_bfo.h"
|
||||
#include "pml_bfo_hdr.h"
|
||||
#include "pml_bfo_rdmafrag.h"
|
||||
#include "pml_bfo_recvreq.h"
|
||||
#include "pml_bfo_sendreq.h"
|
||||
#include "ompi/mca/bml/base/base.h"
|
||||
#include "ompi/memchecker.h"
|
||||
|
||||
size_t mca_pml_bfo_rdma_cuda_btls(
|
||||
mca_bml_base_endpoint_t* bml_endpoint,
|
||||
unsigned char* base,
|
||||
size_t size,
|
||||
mca_pml_bfo_com_btl_t* rdma_btls);
|
||||
|
||||
int mca_pml_bfo_cuda_need_buffers(void * rreq,
|
||||
mca_btl_base_module_t* btl);
|
||||
|
||||
/**
|
||||
* Handle the CUDA buffer.
|
||||
*/
|
||||
int mca_pml_bfo_send_request_start_cuda(mca_pml_bfo_send_request_t* sendreq,
|
||||
mca_bml_base_btl_t* bml_btl,
|
||||
size_t size) {
|
||||
int rc;
|
||||
sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
|
||||
if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {
|
||||
unsigned char *base;
|
||||
opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base );
|
||||
/* Set flag back */
|
||||
sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA;
|
||||
if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_bfo_rdma_cuda_btls(
|
||||
sendreq->req_endpoint,
|
||||
base,
|
||||
sendreq->req_send.req_bytes_packed,
|
||||
sendreq->req_rdma))) {
|
||||
rc = mca_pml_bfo_send_request_start_rdma(sendreq, bml_btl,
|
||||
sendreq->req_send.req_bytes_packed);
|
||||
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
|
||||
mca_pml_bfo_free_rdma_resources(sendreq);
|
||||
}
|
||||
} else {
|
||||
if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_PUT) {
|
||||
rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size,
|
||||
MCA_PML_BFO_HDR_FLAGS_CONTIG);
|
||||
} else {
|
||||
rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size, 0);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* Do not send anything with first rendezvous message as copying GPU
|
||||
* memory into RNDV message is expensive. */
|
||||
sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA;
|
||||
rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, 0, 0);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
|
||||
size_t mca_pml_bfo_rdma_cuda_btls(
|
||||
mca_bml_base_endpoint_t* bml_endpoint,
|
||||
unsigned char* base,
|
||||
size_t size,
|
||||
mca_pml_bfo_com_btl_t* rdma_btls)
|
||||
{
|
||||
int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
|
||||
double weight_total = 0;
|
||||
int num_btls_used = 0, n;
|
||||
|
||||
/* shortcut when there are no rdma capable btls */
|
||||
if(num_btls == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* check to see if memory is registered */
|
||||
for(n = 0; n < num_btls && num_btls_used < mca_pml_bfo.max_rdma_per_request;
|
||||
n++) {
|
||||
mca_bml_base_btl_t* bml_btl =
|
||||
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n);
|
||||
|
||||
if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) {
|
||||
mca_mpool_base_registration_t* reg = NULL;
|
||||
mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;
|
||||
|
||||
if( NULL != btl_mpool ) {
|
||||
/* register the memory */
|
||||
btl_mpool->mpool_register(btl_mpool, base, size, 0, ®);
|
||||
}
|
||||
|
||||
if(NULL == reg)
|
||||
continue;
|
||||
|
||||
rdma_btls[num_btls_used].bml_btl = bml_btl;
|
||||
rdma_btls[num_btls_used].btl_reg = reg;
|
||||
weight_total += bml_btl->btl_weight;
|
||||
num_btls_used++;
|
||||
}
|
||||
}
|
||||
|
||||
/* if we don't use leave_pinned and all BTLs that already have this memory
|
||||
* registered amount to less then half of available bandwidth - fall back to
|
||||
* pipeline protocol */
|
||||
if(0 == num_btls_used || (!mca_pml_bfo.leave_pinned && weight_total < 0.5))
|
||||
return 0;
|
||||
|
||||
mca_pml_bfo_calc_weighted_length(rdma_btls, num_btls_used, size,
|
||||
weight_total);
|
||||
|
||||
return num_btls_used;
|
||||
}
|
||||
|
||||
int mca_pml_bfo_cuda_need_buffers(void * rreq,
|
||||
mca_btl_base_module_t* btl)
|
||||
{
|
||||
mca_pml_bfo_recv_request_t* recvreq = (mca_pml_bfo_recv_request_t*)rreq;
|
||||
if ((recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA) &&
|
||||
(btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET)) {
|
||||
recvreq->req_recv.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
|
||||
if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) {
|
||||
recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA;
|
||||
return true;
|
||||
} else {
|
||||
recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -1,398 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* Functions that implement failover capabilities.
|
||||
*/
|
||||
|
||||
#ifndef MCA_PML_BFO_FAILOVER_H
|
||||
#define MCA_PML_BFO_FAILOVER_H
|
||||
|
||||
#include "opal/mca/btl/btl.h"
|
||||
#include "pml_bfo_hdr.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
bool mca_pml_bfo_is_duplicate_msg(mca_pml_bfo_comm_proc_t* proc,
|
||||
mca_pml_bfo_match_hdr_t *hdr);
|
||||
bool mca_pml_bfo_is_duplicate_fin(mca_pml_bfo_hdr_t* hdr, mca_btl_base_descriptor_t* rdma,
|
||||
mca_btl_base_module_t* btl);
|
||||
|
||||
mca_pml_bfo_recv_request_t* mca_pml_bfo_get_request(mca_pml_bfo_match_hdr_t *hdr);
|
||||
|
||||
void mca_pml_bfo_send_request_restart(mca_pml_bfo_send_request_t* sendreq,
|
||||
bool repost, mca_btl_base_tag_t tag);
|
||||
void mca_pml_bfo_send_request_rndvrestartnotify(mca_pml_bfo_send_request_t* sendreq,
|
||||
bool repost, mca_btl_base_tag_t tag, int status,
|
||||
mca_btl_base_module_t* btl);
|
||||
|
||||
void
|
||||
mca_pml_bfo_rndvrestartnotify_completion(mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* ep,
|
||||
struct mca_btl_base_descriptor_t* des,
|
||||
int status);
|
||||
void
|
||||
mca_pml_bfo_check_recv_ctl_completion_status(mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_descriptor_t* des,
|
||||
int status);
|
||||
|
||||
/* Reset a receive request to the beginning */
|
||||
void mca_pml_bfo_recv_request_reset(mca_pml_bfo_recv_request_t* recvreq);
|
||||
/* Notify sender that receiver detected an error */
|
||||
void mca_pml_bfo_recv_request_recverrnotify(mca_pml_bfo_recv_request_t* recvreq,
|
||||
mca_btl_base_tag_t tag, int status);
|
||||
/* Ack the RNDVRESTARTNOTIFY message */
|
||||
void mca_pml_bfo_recv_request_rndvrestartack(mca_pml_bfo_recv_request_t* recvreq,
|
||||
mca_btl_base_tag_t tag, int status,
|
||||
mca_btl_base_module_t* btl);
|
||||
/* Nack the RNDVRESTARTNOTIFY message */
|
||||
void mca_pml_bfo_recv_request_rndvrestartnack(mca_btl_base_descriptor_t* olddes,
|
||||
ompi_proc_t* ompi_proc, bool repost);
|
||||
|
||||
void mca_pml_bfo_recv_restart_completion(mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* ep,
|
||||
struct mca_btl_base_descriptor_t* des,
|
||||
int status);
|
||||
void mca_pml_bfo_failover_error_handler(struct mca_btl_base_module_t* btl,
|
||||
int32_t flags, ompi_proc_t *errproc, char *btlname);
|
||||
void mca_pml_bfo_repost_match_fragment(struct mca_btl_base_descriptor_t* des);
|
||||
void mca_pml_bfo_repost_fin(struct mca_btl_base_descriptor_t* des);
|
||||
|
||||
void mca_pml_bfo_map_out_btl(struct mca_btl_base_module_t* btl,
|
||||
ompi_proc_t *errproc, char *btlname);
|
||||
|
||||
extern void mca_pml_bfo_map_out( mca_btl_base_module_t *btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* descriptor,
|
||||
void* cbdata );
|
||||
|
||||
int mca_pml_bfo_register_callbacks(void);
|
||||
|
||||
void mca_pml_bfo_update_rndv_fields(mca_pml_bfo_hdr_t* hdr,
|
||||
mca_pml_bfo_send_request_t*, char *type);
|
||||
|
||||
void mca_pml_bfo_update_bml_btl(mca_bml_base_btl_t** bml_btl, mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_descriptor_t* des);
|
||||
|
||||
void mca_pml_bfo_find_recvreq_eager_bml_btl(mca_bml_base_btl_t** bml_btl,
|
||||
mca_btl_base_module_t* btl,
|
||||
mca_pml_bfo_recv_request_t* recvreq,
|
||||
char* type);
|
||||
|
||||
void mca_pml_bfo_find_sendreq_eager_bml_btl(mca_bml_base_btl_t** bml_btl,
|
||||
mca_btl_base_module_t* btl,
|
||||
mca_pml_bfo_send_request_t* sendreq,
|
||||
char* type);
|
||||
|
||||
void mca_pml_bfo_find_sendreq_rdma_bml_btl(mca_bml_base_btl_t** bml_btl,
|
||||
mca_btl_base_module_t* btl,
|
||||
mca_pml_bfo_send_request_t* sendreq,
|
||||
char* type);
|
||||
|
||||
void mca_pml_bfo_update_eager_bml_btl_recv_ctl(mca_bml_base_btl_t** bml_btl,
|
||||
mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_descriptor_t* des);
|
||||
void mca_pml_bfo_find_recvreq_rdma_bml_btl(mca_bml_base_btl_t** bml_btl,
|
||||
mca_btl_base_module_t* btl,
|
||||
mca_pml_bfo_recv_request_t* recvreq,
|
||||
char* type);
|
||||
|
||||
bool mca_pml_bfo_rndv_completion_status_error(struct mca_btl_base_descriptor_t* des,
|
||||
mca_pml_bfo_send_request_t* sendreq);
|
||||
void mca_pml_bfo_send_ctl_completion_status_error(struct mca_btl_base_descriptor_t* des);
|
||||
|
||||
|
||||
void mca_pml_bfo_completion_sendreq_has_error(mca_pml_bfo_send_request_t* sendreq,
|
||||
int status,
|
||||
mca_btl_base_module_t* btl,
|
||||
int type,
|
||||
char *description);
|
||||
/**
|
||||
* Four new callbacks for the four new message types.
|
||||
*/
|
||||
extern void mca_pml_bfo_recv_frag_callback_rndvrestartnotify( mca_btl_base_module_t *btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* descriptor,
|
||||
void* cbdata );
|
||||
|
||||
extern void mca_pml_bfo_recv_frag_callback_rndvrestartack( mca_btl_base_module_t *btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* descriptor,
|
||||
void* cbdata );
|
||||
|
||||
extern void mca_pml_bfo_recv_frag_callback_rndvrestartnack( mca_btl_base_module_t *btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* descriptor,
|
||||
void* cbdata );
|
||||
|
||||
extern void mca_pml_bfo_recv_frag_callback_recverrnotify( mca_btl_base_module_t *btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* descriptor,
|
||||
void* cbdata );
|
||||
|
||||
/**
|
||||
* A bunch of macros to help isolate failover code from regular ob1 code.
|
||||
*/
|
||||
|
||||
/* Drop any ACK fragments if request is in error state. Do not want
|
||||
* to initiate any more activity. */
|
||||
#define MCA_PML_BFO_ERROR_CHECK_ON_ACK_CALLBACK(sendreq) \
|
||||
if( OPAL_UNLIKELY((sendreq)->req_error)) { \
|
||||
opal_output_verbose(20, mca_pml_bfo_output, \
|
||||
"ACK: received: dropping because request in error, " \
|
||||
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", \
|
||||
(uint16_t)(sendreq)->req_send.req_base.req_sequence, \
|
||||
(sendreq)->req_restartseq, \
|
||||
(void *)(sendreq), (sendreq)->req_recv.pval, \
|
||||
(sendreq)->req_send.req_base.req_peer); \
|
||||
return; \
|
||||
}
|
||||
|
||||
/* Drop any FRAG fragments if request is in error state. Do not want
|
||||
* to initiate any more activity. */
|
||||
#define MCA_PML_BFO_ERROR_CHECK_ON_FRAG_CALLBACK(recvreq) \
|
||||
if( OPAL_UNLIKELY((recvreq)->req_errstate)) { \
|
||||
opal_output_verbose(20, mca_pml_bfo_output, \
|
||||
"FRAG: received: dropping because request in error, " \
|
||||
"PML=%d, src_req=%p, dst_req=%p, peer=%d, offset=%d", \
|
||||
(uint16_t)(recvreq)->req_msgseq, \
|
||||
(recvreq)->remote_req_send.pval, \
|
||||
(void *)(recvreq), \
|
||||
(recvreq)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, \
|
||||
(int)hdr->hdr_frag.hdr_frag_offset); \
|
||||
return; \
|
||||
}
|
||||
|
||||
/* Drop any PUT fragments if request is in error state. Do not want
|
||||
* to initiate any more activity. */
|
||||
#define MCA_PML_BFO_ERROR_CHECK_ON_PUT_CALLBACK(sendreq) \
|
||||
if( OPAL_UNLIKELY((sendreq)->req_error)) { \
|
||||
opal_output_verbose(20, mca_pml_bfo_output, \
|
||||
"PUT: received: dropping because request in error, " \
|
||||
"PML=%d, src_req=%p, dst_req=%p, peer=%d", \
|
||||
(uint16_t)(sendreq)->req_send.req_base.req_sequence, \
|
||||
(void *)(sendreq), (sendreq)->req_recv.pval, \
|
||||
(sendreq)->req_send.req_base.req_peer); \
|
||||
return; \
|
||||
}
|
||||
|
||||
/**
|
||||
* Macros for pml_bfo_recvreq.c file.
|
||||
*/
|
||||
|
||||
/* This can happen if a FIN message arrives after the request was
|
||||
* marked in error. So, just drop the message. Note that the status
|
||||
* field is not being checked. That is because the status field is the
|
||||
* value returned in the FIN hdr.hdr_fail field and may be used for
|
||||
* other things. Note that we allow the various fields to be updated
|
||||
* in case this actually completes the request and the sending side
|
||||
* thinks it is done. */
|
||||
#define MCA_PML_BFO_ERROR_CHECK_ON_FIN_FOR_PUT(recvreq) \
|
||||
if( OPAL_UNLIKELY((recvreq)->req_errstate)) { \
|
||||
opal_output_verbose(20, mca_pml_bfo_output, \
|
||||
"FIN: received on broken request, skipping, " \
|
||||
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", \
|
||||
(recvreq)->req_msgseq, (recvreq)->req_restartseq, \
|
||||
(recvreq)->remote_req_send.pval, (void *)(recvreq), \
|
||||
(recvreq)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); \
|
||||
/* Even though in error, it still might complete. */ \
|
||||
recv_request_pml_complete_check(recvreq); \
|
||||
return; \
|
||||
}
|
||||
|
||||
#define MCA_PML_BFO_ERROR_CHECK_ON_RDMA_READ_COMPLETION(recvreq) \
|
||||
if ((recvreq)->req_errstate) { \
|
||||
opal_output_verbose(30, mca_pml_bfo_output, \
|
||||
"RDMA read: completion failed, error already seen, " \
|
||||
"PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, peer=%d", \
|
||||
(recvreq)->req_msgseq, (recvreq)->req_restartseq, \
|
||||
(unsigned long)(recvreq)->remote_req_send.pval, \
|
||||
(unsigned long)(recvreq), \
|
||||
(recvreq)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); \
|
||||
return; \
|
||||
} else { \
|
||||
opal_output_verbose(30, mca_pml_bfo_output, \
|
||||
"RDMA read: completion failed, sending RECVERRNOTIFY to " \
|
||||
"sender, PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, peer=%d", \
|
||||
(recvreq)->req_msgseq, (recvreq)->req_restartseq, \
|
||||
(unsigned long)(recvreq)->remote_req_send.pval, \
|
||||
(unsigned long)(recvreq), \
|
||||
(recvreq)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); \
|
||||
mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_RGET, status); \
|
||||
}
|
||||
|
||||
#define MCA_PML_BFO_SECOND_ERROR_CHECK_ON_RDMA_READ_COMPLETION(recvreq, status, btl) \
|
||||
/* See if the request has received a RNDVRESTARTNOTIFY */ \
|
||||
if( OPAL_UNLIKELY(recvreq->req_errstate)) { \
|
||||
if (recvreq->req_errstate & RECVREQ_RNDVRESTART_RECVED) { \
|
||||
opal_output_verbose(30, mca_pml_bfo_output, \
|
||||
"RDMA read: completion: recvreq has error, outstanding events=%d " \
|
||||
"PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, status=%d, peer=%d", \
|
||||
recvreq->req_events, recvreq->req_msgseq, recvreq->req_restartseq, \
|
||||
(unsigned long)recvreq->remote_req_send.pval, \
|
||||
(unsigned long)recvreq, status, \
|
||||
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); \
|
||||
if (0 == recvreq->req_events) { \
|
||||
mca_pml_bfo_recv_request_rndvrestartack(recvreq, MCA_PML_BFO_HDR_TYPE_RGET, \
|
||||
status, btl); \
|
||||
} \
|
||||
} \
|
||||
MCA_PML_BFO_RDMA_FRAG_RETURN(frag); \
|
||||
return; \
|
||||
}
|
||||
|
||||
/**
|
||||
* Macros for pml_bfo_sendreq.c file.
|
||||
*/
|
||||
|
||||
/* This macro is called on the sending side after receiving
|
||||
* a PUT message. There is a chance that this PUT message
|
||||
* has shown up and is attempting to modify the state of
|
||||
* the req_state, but the req_state is no longer being tracked
|
||||
* because the RNDV message has turned into a RGET message
|
||||
* because it got an error on the RNDV completion.
|
||||
*/
|
||||
#define MCA_PML_BFO_VERIFY_SENDREQ_REQ_STATE_VALUE(sendreq) \
|
||||
if (sendreq->req_state == -1) { \
|
||||
OPAL_THREAD_ADD_FETCH32(&sendreq->req_state, 1); \
|
||||
}
|
||||
|
||||
/* Now check the error state. This request can be in error if the
|
||||
* RNDV message made it over, but the receiver got an error trying to
|
||||
* send the ACK back and therefore sent a RECVERRNOTIFY message. In
|
||||
* that case, we want to start the restart dance as the receiver has
|
||||
* matched this message already. Only restart if there are no
|
||||
* outstanding events on send request. */
|
||||
#define MCA_PML_BFO_RNDV_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl, type, description) \
|
||||
if( OPAL_UNLIKELY ((sendreq)->req_error)) { \
|
||||
mca_pml_bfo_completion_sendreq_has_error(sendreq, status, \
|
||||
btl, type, description); \
|
||||
return; \
|
||||
}
|
||||
|
||||
/**
|
||||
* This macro is called within the frag completion function in two
|
||||
* places. It is called to see if any errors occur prior to the
|
||||
* completion event on the frag. It is then called a second time
|
||||
* after the scheduling routine is called as the scheduling routine
|
||||
* may have detected that a BTL that was cached on the request had
|
||||
* been removed and therefore marked the request in error. In that
|
||||
* case, the scheduling of fragments can no longer proceed properly,
|
||||
* and if there are no outstanding events, iniated the restart dance.
|
||||
*/
|
||||
#define MCA_PML_BFO_FRAG_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl, type, description) \
|
||||
if( OPAL_UNLIKELY((sendreq)->req_error)) { \
|
||||
mca_pml_bfo_completion_sendreq_has_error(sendreq, status, \
|
||||
btl, type, description); \
|
||||
return; \
|
||||
}
|
||||
|
||||
/* This can happen if a FIN message arrives after the request was
|
||||
* marked in error. So, just drop the message. Note that the status
|
||||
* field is not checked here. That is because that is the value
|
||||
* returned in the FIN hdr.hdr_fail field and may be used for other
|
||||
* things. */
|
||||
#define MCA_PML_BFO_RGET_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, btl, des) \
|
||||
if( OPAL_UNLIKELY(sendreq->req_error)) { \
|
||||
opal_output_verbose(30, mca_pml_bfo_output, \
|
||||
"FIN: received on broken request, skipping, " \
|
||||
"PML=%d, src_req=%lx, dst_req=%lx, peer=%d", \
|
||||
(uint16_t)sendreq->req_send.req_base.req_sequence, \
|
||||
(unsigned long)sendreq, (unsigned long)sendreq->req_recv.pval, \
|
||||
sendreq->req_send.req_base.req_peer); \
|
||||
btl->btl_free(btl, des); \
|
||||
return; \
|
||||
}
|
||||
|
||||
|
||||
/* Check if there has been an error on the send request when we get
|
||||
* a completion event on the RDMA write. */
|
||||
#define MCA_PML_BFO_PUT_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl) \
|
||||
if ( OPAL_UNLIKELY(sendreq->req_error)) { \
|
||||
mca_pml_bfo_completion_sendreq_has_error(sendreq, status, btl, \
|
||||
MCA_PML_BFO_HDR_TYPE_PUT, "RDMA write"); \
|
||||
MCA_PML_BFO_RDMA_FRAG_RETURN(frag); \
|
||||
return; \
|
||||
}
|
||||
|
||||
#define MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, type) \
|
||||
if (0 < sendreq->req_restartseq) { \
|
||||
mca_pml_bfo_update_rndv_fields(hdr, sendreq, type); \
|
||||
}
|
||||
|
||||
/* If a bml_btl gets mapped out, then we need to adjust it based
|
||||
* on the btl from the callback function. These macros are called on
|
||||
* every callback to make sure things are copacetic.
|
||||
*/
|
||||
#define MCA_PML_BFO_CHECK_EAGER_BML_BTL_ON_FIN_COMPLETION(bml_btl, btl, des) \
|
||||
if (bml_btl->btl != btl) { \
|
||||
ompi_proc_t *proc = (ompi_proc_t*) des->des_cbdata; \
|
||||
mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; \
|
||||
bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_eager, btl); \
|
||||
}
|
||||
#define MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, sendreq, type) \
|
||||
if (bml_btl->btl != btl) { \
|
||||
mca_pml_bfo_find_sendreq_eager_bml_btl(&bml_btl, btl, sendreq, type); \
|
||||
}
|
||||
#define MCA_PML_BFO_CHECK_SENDREQ_RDMA_BML_BTL(bml_btl, btl, sendreq, type) \
|
||||
if (bml_btl->btl != btl) { \
|
||||
mca_pml_bfo_find_sendreq_rdma_bml_btl(&bml_btl, btl, sendreq, type); \
|
||||
}
|
||||
|
||||
#define MCA_PML_BFO_CHECK_RECVREQ_EAGER_BML_BTL(bml_btl, btl, recvreq, type) \
|
||||
if (bml_btl->btl != btl) { \
|
||||
mca_pml_bfo_find_recvreq_eager_bml_btl(&bml_btl, btl, recvreq, type); \
|
||||
}
|
||||
|
||||
#define MCA_PML_BFO_CHECK_RECVREQ_RDMA_BML_BTL(bml_btl, btl, recvreq, type) \
|
||||
if (bml_btl->btl != btl) { \
|
||||
mca_pml_bfo_find_recvreq_rdma_bml_btl(&bml_btl, btl, recvreq, type); \
|
||||
}
|
||||
|
||||
#define MCA_PML_BFO_CHECK_RECVREQ_EAGER_BML_BTL_RECV_CTL(bml_btl, btl, des) \
|
||||
if (bml_btl->btl != btl) { \
|
||||
mca_pml_bfo_update_eager_bml_btl_recv_ctl(&bml_btl, btl, des); \
|
||||
}
|
||||
|
||||
#define MCA_PML_BFO_CHECK_FOR_REMOVED_BML(sendreq, frag, btl) \
|
||||
if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) { \
|
||||
opal_output_verbose(30, mca_pml_bfo_output, \
|
||||
"PUT received: no matching BTL to RDMA write to, oustanding " \
|
||||
"events=%d, PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", \
|
||||
sendreq->req_events, \
|
||||
(uint16_t)sendreq->req_send.req_base.req_sequence, \
|
||||
sendreq->req_restartseq, (void *)sendreq, \
|
||||
sendreq->req_recv.pval, sendreq->req_send.req_base.req_peer); \
|
||||
MCA_PML_BFO_RDMA_FRAG_RETURN(frag); \
|
||||
sendreq->req_error++; \
|
||||
if (0 == sendreq->req_events) { \
|
||||
mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, \
|
||||
MCA_PML_BFO_HDR_TYPE_PUT, \
|
||||
OMPI_ERROR, btl); \
|
||||
} \
|
||||
return; \
|
||||
}
|
||||
|
||||
/* This macro checks to see if the cached number of BTLs in the
|
||||
* send request still matches the value from the endpoint.
|
||||
* If it does not, this means that a BTL was removed from the
|
||||
* available list. In this case, start the request over.
|
||||
*/
|
||||
#define MCA_PML_BFO_CHECK_FOR_REMOVED_BTL(sendreq, range) \
|
||||
if ((int)mca_bml_base_btl_array_get_size(&sendreq->req_endpoint->btl_send) \
|
||||
!= range->range_btl_cnt) { \
|
||||
sendreq->req_error++; \
|
||||
return OMPI_ERROR; \
|
||||
}
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
@ -1,539 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*/
|
||||
#ifndef MCA_PML_BFO_HEADER_H
|
||||
#define MCA_PML_BFO_HEADER_H
|
||||
|
||||
#include "ompi_config.h"
|
||||
#ifdef HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif
|
||||
#ifdef HAVE_NETINET_IN_H
|
||||
#include <netinet/in.h>
|
||||
#endif
|
||||
|
||||
#include "opal/types.h"
|
||||
#include "opal/util/arch.h"
|
||||
#include "opal/mca/btl/btl.h"
|
||||
#include "ompi/proc/proc.h"
|
||||
|
||||
#define MCA_PML_BFO_HDR_TYPE_MATCH (MCA_BTL_TAG_PML + 1)
|
||||
#define MCA_PML_BFO_HDR_TYPE_RNDV (MCA_BTL_TAG_PML + 2)
|
||||
#define MCA_PML_BFO_HDR_TYPE_RGET (MCA_BTL_TAG_PML + 3)
|
||||
#define MCA_PML_BFO_HDR_TYPE_ACK (MCA_BTL_TAG_PML + 4)
|
||||
#define MCA_PML_BFO_HDR_TYPE_NACK (MCA_BTL_TAG_PML + 5)
|
||||
#define MCA_PML_BFO_HDR_TYPE_FRAG (MCA_BTL_TAG_PML + 6)
|
||||
#define MCA_PML_BFO_HDR_TYPE_GET (MCA_BTL_TAG_PML + 7)
|
||||
#define MCA_PML_BFO_HDR_TYPE_PUT (MCA_BTL_TAG_PML + 8)
|
||||
#define MCA_PML_BFO_HDR_TYPE_FIN (MCA_BTL_TAG_PML + 9)
|
||||
#if PML_BFO
|
||||
#define MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY (MCA_BTL_TAG_PML + 10)
|
||||
#define MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK (MCA_BTL_TAG_PML + 11)
|
||||
#define MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK (MCA_BTL_TAG_PML + 12)
|
||||
#define MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY (MCA_BTL_TAG_PML + 13)
|
||||
#endif /* PML_BFO */
|
||||
|
||||
#define MCA_PML_BFO_HDR_FLAGS_ACK 1 /* is an ack required */
|
||||
#define MCA_PML_BFO_HDR_FLAGS_NBO 2 /* is the hdr in network byte order */
|
||||
#define MCA_PML_BFO_HDR_FLAGS_PIN 4 /* is user buffer pinned */
|
||||
#define MCA_PML_BFO_HDR_FLAGS_CONTIG 8 /* is user buffer contiguous */
|
||||
#define MCA_PML_BFO_HDR_FLAGS_NORDMA 16 /* rest will be send by copy-in-out */
|
||||
#if PML_BFO
|
||||
#define MCA_PML_BFO_HDR_FLAGS_RESTART 32 /* restart RNDV because of error */
|
||||
#endif /* PML_BFO */
|
||||
|
||||
/**
|
||||
* Common hdr attributes - must be first element in each hdr type
|
||||
*/
|
||||
struct mca_pml_bfo_common_hdr_t {
|
||||
uint8_t hdr_type; /**< type of envelope */
|
||||
uint8_t hdr_flags; /**< flags indicating how fragment should be processed */
|
||||
};
|
||||
typedef struct mca_pml_bfo_common_hdr_t mca_pml_bfo_common_hdr_t;
|
||||
|
||||
#define MCA_PML_BFO_COMMON_HDR_NTOH(h)
|
||||
#define MCA_PML_BFO_COMMON_HDR_HTON(h)
|
||||
|
||||
/**
|
||||
* Header definition for the first fragment, contains the
|
||||
* attributes required to match the corresponding posted receive.
|
||||
*/
|
||||
struct mca_pml_bfo_match_hdr_t {
|
||||
mca_pml_bfo_common_hdr_t hdr_common; /**< common attributes */
|
||||
uint16_t hdr_ctx; /**< communicator index */
|
||||
int32_t hdr_src; /**< source rank */
|
||||
int32_t hdr_tag; /**< user tag */
|
||||
uint16_t hdr_seq; /**< message sequence number */
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
uint8_t hdr_padding[2]; /**< explicitly pad to 16 bytes. Compilers seem to already prefer to do this, but make it explicit just in case */
|
||||
#endif
|
||||
};
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
#define OMPI_PML_BFO_MATCH_HDR_LEN 16
|
||||
#else
|
||||
#define OMPI_PML_BFO_MATCH_HDR_LEN 14
|
||||
#endif
|
||||
|
||||
typedef struct mca_pml_bfo_match_hdr_t mca_pml_bfo_match_hdr_t;
|
||||
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
|
||||
#define MCA_PML_BFO_MATCH_HDR_FILL(h) \
|
||||
do { \
|
||||
(h).hdr_padding[0] = 0; \
|
||||
(h).hdr_padding[1] = 0; \
|
||||
} while(0)
|
||||
#else
|
||||
#define MCA_PML_BFO_MATCH_HDR_FILL(h)
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
|
||||
|
||||
#define MCA_PML_BFO_MATCH_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common); \
|
||||
(h).hdr_ctx = ntohs((h).hdr_ctx); \
|
||||
(h).hdr_src = ntohl((h).hdr_src); \
|
||||
(h).hdr_tag = ntohl((h).hdr_tag); \
|
||||
(h).hdr_seq = ntohs((h).hdr_seq); \
|
||||
} while (0)
|
||||
|
||||
#define MCA_PML_BFO_MATCH_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common); \
|
||||
MCA_PML_BFO_MATCH_HDR_FILL(h); \
|
||||
(h).hdr_ctx = htons((h).hdr_ctx); \
|
||||
(h).hdr_src = htonl((h).hdr_src); \
|
||||
(h).hdr_tag = htonl((h).hdr_tag); \
|
||||
(h).hdr_seq = htons((h).hdr_seq); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
* Header definition for the first fragment when an acknowledgment
|
||||
* is required. This could be the first fragment of a large message
|
||||
* or a short message that requires an ack (synchronous).
|
||||
*/
|
||||
struct mca_pml_bfo_rendezvous_hdr_t {
|
||||
mca_pml_bfo_match_hdr_t hdr_match;
|
||||
uint64_t hdr_msg_length; /**< message length */
|
||||
opal_ptr_t hdr_src_req; /**< pointer to source request - returned in ack */
|
||||
#if PML_BFO
|
||||
opal_ptr_t hdr_dst_req; /**< pointer to dst req */
|
||||
uint8_t hdr_restartseq; /**< restart sequence */
|
||||
#endif /* PML_BFO */
|
||||
};
|
||||
typedef struct mca_pml_bfo_rendezvous_hdr_t mca_pml_bfo_rendezvous_hdr_t;
|
||||
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
|
||||
#define MCA_PML_BFO_RNDV_HDR_FILL(h) \
|
||||
MCA_PML_BFO_MATCH_HDR_FILL((h).hdr_match)
|
||||
#else
|
||||
#define MCA_PML_BFO_RNDV_HDR_FILL(h)
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
|
||||
|
||||
/* Note that hdr_src_req is not put in network byte order because it
|
||||
is never processed by the receiver, other than being copied into
|
||||
the ack header */
|
||||
#define MCA_PML_BFO_RNDV_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_BFO_MATCH_HDR_NTOH((h).hdr_match); \
|
||||
(h).hdr_msg_length = ntoh64((h).hdr_msg_length); \
|
||||
} while (0)
|
||||
|
||||
#define MCA_PML_BFO_RNDV_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_BFO_MATCH_HDR_HTON((h).hdr_match); \
|
||||
MCA_PML_BFO_RNDV_HDR_FILL(h); \
|
||||
(h).hdr_msg_length = hton64((h).hdr_msg_length); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
* Header definition for a combined rdma rendezvous/get
|
||||
*/
|
||||
struct mca_pml_bfo_rget_hdr_t {
|
||||
mca_pml_bfo_rendezvous_hdr_t hdr_rndv;
|
||||
uint32_t hdr_seg_cnt; /**< number of segments for rdma */
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
uint8_t hdr_padding[4];
|
||||
#endif
|
||||
opal_ptr_t hdr_des; /**< source descriptor */
|
||||
};
|
||||
typedef struct mca_pml_bfo_rget_hdr_t mca_pml_bfo_rget_hdr_t;
|
||||
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
|
||||
#define MCA_PML_BFO_RGET_HDR_FILL(h) \
|
||||
do { \
|
||||
MCA_PML_BFO_RNDV_HDR_FILL((h).hdr_rndv); \
|
||||
(h).hdr_padding[0] = 0; \
|
||||
(h).hdr_padding[1] = 0; \
|
||||
(h).hdr_padding[2] = 0; \
|
||||
(h).hdr_padding[3] = 0; \
|
||||
} while(0)
|
||||
#else
|
||||
#define MCA_PML_BFO_RGET_HDR_FILL(h)
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
|
||||
|
||||
#define MCA_PML_BFO_RGET_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_BFO_RNDV_HDR_NTOH((h).hdr_rndv); \
|
||||
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
|
||||
} while (0)
|
||||
|
||||
#define MCA_PML_BFO_RGET_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_BFO_RNDV_HDR_HTON((h).hdr_rndv); \
|
||||
MCA_PML_BFO_RGET_HDR_FILL(h); \
|
||||
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
* Header for subsequent fragments.
|
||||
*/
|
||||
struct mca_pml_bfo_frag_hdr_t {
|
||||
mca_pml_bfo_common_hdr_t hdr_common; /**< common attributes */
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
uint8_t hdr_padding[6];
|
||||
#endif
|
||||
uint64_t hdr_frag_offset; /**< offset into message */
|
||||
opal_ptr_t hdr_src_req; /**< pointer to source request */
|
||||
opal_ptr_t hdr_dst_req; /**< pointer to matched receive */
|
||||
};
|
||||
typedef struct mca_pml_bfo_frag_hdr_t mca_pml_bfo_frag_hdr_t;
|
||||
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
|
||||
#define MCA_PML_BFO_FRAG_HDR_FILL(h) \
|
||||
do { \
|
||||
(h).hdr_padding[0] = 0; \
|
||||
(h).hdr_padding[1] = 0; \
|
||||
(h).hdr_padding[2] = 0; \
|
||||
(h).hdr_padding[3] = 0; \
|
||||
(h).hdr_padding[4] = 0; \
|
||||
(h).hdr_padding[5] = 0; \
|
||||
} while(0)
|
||||
#else
|
||||
#define MCA_PML_BFO_FRAG_HDR_FILL(h)
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
|
||||
|
||||
#define MCA_PML_BFO_FRAG_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common); \
|
||||
(h).hdr_frag_offset = ntoh64((h).hdr_frag_offset); \
|
||||
} while (0)
|
||||
|
||||
#define MCA_PML_BFO_FRAG_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common); \
|
||||
MCA_PML_BFO_FRAG_HDR_FILL(h); \
|
||||
(h).hdr_frag_offset = hton64((h).hdr_frag_offset); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
* Header used to acknowledgment outstanding fragment(s).
|
||||
*/
|
||||
|
||||
struct mca_pml_bfo_ack_hdr_t {
|
||||
mca_pml_bfo_common_hdr_t hdr_common; /**< common attributes */
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
uint8_t hdr_padding[6];
|
||||
#endif
|
||||
opal_ptr_t hdr_src_req; /**< source request */
|
||||
opal_ptr_t hdr_dst_req; /**< matched receive request */
|
||||
uint64_t hdr_send_offset; /**< starting point of copy in/out */
|
||||
};
|
||||
typedef struct mca_pml_bfo_ack_hdr_t mca_pml_bfo_ack_hdr_t;
|
||||
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
|
||||
#define MCA_PML_BFO_ACK_HDR_FILL(h) \
|
||||
do { \
|
||||
(h).hdr_padding[0] = 0; \
|
||||
(h).hdr_padding[1] = 0; \
|
||||
(h).hdr_padding[2] = 0; \
|
||||
(h).hdr_padding[3] = 0; \
|
||||
(h).hdr_padding[4] = 0; \
|
||||
(h).hdr_padding[5] = 0; \
|
||||
} while (0)
|
||||
#else
|
||||
#define MCA_PML_BFO_ACK_HDR_FILL(h)
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
|
||||
|
||||
/* Note that the request headers are not put in NBO because the
|
||||
src_req is already in receiver's byte order and the dst_req is not
|
||||
used by the receiver for anything other than backpointers in return
|
||||
headers */
|
||||
#define MCA_PML_BFO_ACK_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common); \
|
||||
(h).hdr_send_offset = ntoh64((h).hdr_send_offset); \
|
||||
} while (0)
|
||||
|
||||
#define MCA_PML_BFO_ACK_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common); \
|
||||
MCA_PML_BFO_ACK_HDR_FILL(h); \
|
||||
(h).hdr_send_offset = hton64((h).hdr_send_offset); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
* Header used to initiate an RDMA operation.
|
||||
*/
|
||||
|
||||
struct mca_pml_bfo_rdma_hdr_t {
|
||||
mca_pml_bfo_common_hdr_t hdr_common; /**< common attributes */
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
uint8_t hdr_padding[2]; /** two to pad out the hdr to a 4 byte alignment. hdr_req will then be 8 byte aligned after 4 for hdr_seg_cnt */
|
||||
#endif
|
||||
uint32_t hdr_seg_cnt; /**< number of segments for rdma */
|
||||
opal_ptr_t hdr_req; /**< destination request */
|
||||
#if PML_BFO
|
||||
opal_ptr_t hdr_dst_req; /**< pointer to destination request */
|
||||
#endif /* PML_BFO */
|
||||
opal_ptr_t hdr_des; /**< source descriptor */
|
||||
uint64_t hdr_rdma_offset; /**< current offset into user buffer */
|
||||
mca_btl_base_segment_t hdr_segs[1]; /**< list of segments for rdma */
|
||||
};
|
||||
typedef struct mca_pml_bfo_rdma_hdr_t mca_pml_bfo_rdma_hdr_t;
|
||||
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
|
||||
#define MCA_PML_BFO_RDMA_HDR_FILL(h) \
|
||||
do { \
|
||||
(h).hdr_padding[0] = 0; \
|
||||
(h).hdr_padding[1] = 0; \
|
||||
} while(0)
|
||||
#else
|
||||
#define MCA_PML_BFO_RDMA_HDR_FILL(h)
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
|
||||
|
||||
#define MCA_PML_BFO_RDMA_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common); \
|
||||
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
|
||||
(h).hdr_rdma_offset = ntoh64((h).hdr_rdma_offset); \
|
||||
} while (0)
|
||||
|
||||
#define MCA_PML_BFO_RDMA_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common); \
|
||||
MCA_PML_BFO_RDMA_HDR_FILL(h); \
|
||||
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
|
||||
(h).hdr_rdma_offset = hton64((h).hdr_rdma_offset); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
* Header used to complete an RDMA operation.
|
||||
*/
|
||||
|
||||
struct mca_pml_bfo_fin_hdr_t {
|
||||
mca_pml_bfo_common_hdr_t hdr_common; /**< common attributes */
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
uint8_t hdr_padding[2];
|
||||
#endif
|
||||
#if PML_BFO
|
||||
/* Match info is needed to check for duplicate FIN messages. */
|
||||
mca_pml_bfo_match_hdr_t hdr_match;
|
||||
#endif /* PML_BFO */
|
||||
uint32_t hdr_fail; /**< RDMA operation failed */
|
||||
opal_ptr_t hdr_des; /**< completed descriptor */
|
||||
};
|
||||
typedef struct mca_pml_bfo_fin_hdr_t mca_pml_bfo_fin_hdr_t;
|
||||
|
||||
#if PML_BFO
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
|
||||
#define MCA_PML_BFO_FIN_HDR_FILL(h) \
|
||||
do { \
|
||||
(h).hdr_padding[0] = 0; \
|
||||
(h).hdr_padding[1] = 0; \
|
||||
MCA_PML_BFO_MATCH_HDR_FILL((h).hdr_match); \
|
||||
} while (0)
|
||||
#else
|
||||
#define MCA_PML_BFO_FIN_HDR_FILL(h)
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
|
||||
|
||||
#define MCA_PML_BFO_FIN_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common); \
|
||||
MCA_PML_BFO_MATCH_HDR_NTOH((h).hdr_match); \
|
||||
} while (0)
|
||||
|
||||
#define MCA_PML_BFO_FIN_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common); \
|
||||
MCA_PML_BFO_MATCH_HDR_HTON((h).hdr_match); \
|
||||
MCA_PML_BFO_FIN_HDR_FILL(h); \
|
||||
} while (0)
|
||||
#else /* PML_BFO */
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
|
||||
#define MCA_PML_BFO_FIN_HDR_FILL(h) \
|
||||
do { \
|
||||
(h).hdr_padding[0] = 0; \
|
||||
(h).hdr_padding[1] = 0; \
|
||||
} while (0)
|
||||
#else
|
||||
#define MCA_PML_BFO_FIN_HDR_FILL(h)
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
|
||||
|
||||
#define MCA_PML_BFO_FIN_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common); \
|
||||
} while (0)
|
||||
|
||||
#define MCA_PML_BFO_FIN_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common); \
|
||||
MCA_PML_BFO_FIN_HDR_FILL(h); \
|
||||
} while (0)
|
||||
#endif /* PML_BFO */
|
||||
|
||||
#if PML_BFO
|
||||
/**
|
||||
* Header used to restart a rendezvous request.
|
||||
*/
|
||||
struct mca_pml_bfo_restart_hdr_t {
|
||||
mca_pml_bfo_match_hdr_t hdr_match; /**< needed to avoid duplicate messages */
|
||||
uint8_t hdr_restartseq; /**< restart sequence */
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
uint8_t hdr_padding[3];
|
||||
#endif
|
||||
opal_ptr_t hdr_src_req; /**< source request */
|
||||
opal_ptr_t hdr_dst_req; /**< matched receive request */
|
||||
int32_t hdr_dst_rank; /**< needed to send NACK */
|
||||
uint32_t hdr_jobid; /**< needed to send NACK */
|
||||
uint32_t hdr_vpid; /**< needed to send NACK */
|
||||
};
|
||||
typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t;
|
||||
|
||||
/* Only need to put parts of the restart header in NBO. No need
|
||||
to do hdr_src_req and hdr_dst_req as they are only used on the
|
||||
by the process that originated them. */
|
||||
#define MCA_PML_BFO_RESTART_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_BFO_MATCH_HDR_NTOH((h).hdr_match); \
|
||||
(h).hdr_dst_rank = ntohl((h).hdr_dst_rank); \
|
||||
(h).hdr_jobid = ntohl((h).hdr_jobid); \
|
||||
(h).hdr_vpid = ntohl((h).hdr_vpid); \
|
||||
} while (0)
|
||||
|
||||
#define MCA_PML_BFO_RESTART_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_BFO_MATCH_HDR_HTON((h).hdr_match); \
|
||||
(h).hdr_dst_rank = htonl((h).hdr_dst_rank); \
|
||||
(h).hdr_jobid = htonl((h).hdr_jobid); \
|
||||
(h).hdr_vpid = htonl((h).hdr_vpid); \
|
||||
} while (0)
|
||||
|
||||
#endif /* PML_BFO */
|
||||
/**
|
||||
* Union of defined hdr types.
|
||||
*/
|
||||
union mca_pml_bfo_hdr_t {
|
||||
mca_pml_bfo_common_hdr_t hdr_common;
|
||||
mca_pml_bfo_match_hdr_t hdr_match;
|
||||
mca_pml_bfo_rendezvous_hdr_t hdr_rndv;
|
||||
mca_pml_bfo_rget_hdr_t hdr_rget;
|
||||
mca_pml_bfo_frag_hdr_t hdr_frag;
|
||||
mca_pml_bfo_ack_hdr_t hdr_ack;
|
||||
mca_pml_bfo_rdma_hdr_t hdr_rdma;
|
||||
mca_pml_bfo_fin_hdr_t hdr_fin;
|
||||
#if PML_BFO
|
||||
mca_pml_bfo_restart_hdr_t hdr_restart;
|
||||
#endif /* PML_BFO */
|
||||
};
|
||||
typedef union mca_pml_bfo_hdr_t mca_pml_bfo_hdr_t;
|
||||
|
||||
#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
static inline __opal_attribute_always_inline__ void
|
||||
bfo_hdr_ntoh(mca_pml_bfo_hdr_t *hdr, const uint8_t hdr_type)
|
||||
{
|
||||
if(!(hdr->hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_NBO))
|
||||
return;
|
||||
|
||||
switch(hdr_type) {
|
||||
case MCA_PML_BFO_HDR_TYPE_MATCH:
|
||||
MCA_PML_BFO_MATCH_HDR_NTOH(hdr->hdr_match);
|
||||
break;
|
||||
case MCA_PML_BFO_HDR_TYPE_RNDV:
|
||||
MCA_PML_BFO_RNDV_HDR_NTOH(hdr->hdr_rndv);
|
||||
break;
|
||||
case MCA_PML_BFO_HDR_TYPE_RGET:
|
||||
MCA_PML_BFO_RGET_HDR_NTOH(hdr->hdr_rget);
|
||||
break;
|
||||
case MCA_PML_BFO_HDR_TYPE_ACK:
|
||||
MCA_PML_BFO_ACK_HDR_NTOH(hdr->hdr_ack);
|
||||
break;
|
||||
case MCA_PML_BFO_HDR_TYPE_FRAG:
|
||||
MCA_PML_BFO_FRAG_HDR_NTOH(hdr->hdr_frag);
|
||||
break;
|
||||
case MCA_PML_BFO_HDR_TYPE_PUT:
|
||||
MCA_PML_BFO_RDMA_HDR_NTOH(hdr->hdr_rdma);
|
||||
break;
|
||||
case MCA_PML_BFO_HDR_TYPE_FIN:
|
||||
MCA_PML_BFO_FIN_HDR_NTOH(hdr->hdr_fin);
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
break;
|
||||
}
|
||||
}
|
||||
#else
|
||||
#define bfo_hdr_ntoh(h, t) do{}while(0)
|
||||
#endif
|
||||
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
#define bfo_hdr_hton(h, t, p) \
|
||||
bfo_hdr_hton_intr((mca_pml_bfo_hdr_t*)h, t, p)
|
||||
static inline __opal_attribute_always_inline__ void
|
||||
bfo_hdr_hton_intr(mca_pml_bfo_hdr_t *hdr, const uint8_t hdr_type,
|
||||
const ompi_proc_t *proc)
|
||||
{
|
||||
#ifdef WORDS_BIGENDIAN
|
||||
hdr->hdr_common.hdr_flags |= MCA_PML_BFO_HDR_FLAGS_NBO;
|
||||
#else
|
||||
|
||||
if(!(proc->super.proc_arch & OPAL_ARCH_ISBIGENDIAN))
|
||||
return;
|
||||
|
||||
hdr->hdr_common.hdr_flags |= MCA_PML_BFO_HDR_FLAGS_NBO;
|
||||
switch(hdr_type) {
|
||||
case MCA_PML_BFO_HDR_TYPE_MATCH:
|
||||
MCA_PML_BFO_MATCH_HDR_HTON(hdr->hdr_match);
|
||||
break;
|
||||
case MCA_PML_BFO_HDR_TYPE_RNDV:
|
||||
MCA_PML_BFO_RNDV_HDR_HTON(hdr->hdr_rndv);
|
||||
break;
|
||||
case MCA_PML_BFO_HDR_TYPE_RGET:
|
||||
MCA_PML_BFO_RGET_HDR_HTON(hdr->hdr_rget);
|
||||
break;
|
||||
case MCA_PML_BFO_HDR_TYPE_ACK:
|
||||
MCA_PML_BFO_ACK_HDR_HTON(hdr->hdr_ack);
|
||||
break;
|
||||
case MCA_PML_BFO_HDR_TYPE_FRAG:
|
||||
MCA_PML_BFO_FRAG_HDR_HTON(hdr->hdr_frag);
|
||||
break;
|
||||
case MCA_PML_BFO_HDR_TYPE_PUT:
|
||||
MCA_PML_BFO_RDMA_HDR_HTON(hdr->hdr_rdma);
|
||||
break;
|
||||
case MCA_PML_BFO_HDR_TYPE_FIN:
|
||||
MCA_PML_BFO_FIN_HDR_HTON(hdr->hdr_fin);
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
#define bfo_hdr_hton(h, t, p) do{}while(0)
|
||||
#endif
|
||||
#endif
|
@ -1,171 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2013 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "ompi/request/request.h"
|
||||
#include "ompi/message/message.h"
|
||||
#include "pml_bfo_recvreq.h"
|
||||
|
||||
|
||||
int mca_pml_bfo_iprobe(int src,
|
||||
int tag,
|
||||
struct ompi_communicator_t *comm,
|
||||
int *matched, ompi_status_public_t * status)
|
||||
{
|
||||
int rc = OMPI_SUCCESS;
|
||||
mca_pml_bfo_recv_request_t recvreq;
|
||||
|
||||
OBJ_CONSTRUCT( &recvreq, mca_pml_bfo_recv_request_t );
|
||||
recvreq.req_recv.req_base.req_ompi.req_type = OMPI_REQUEST_PML;
|
||||
recvreq.req_recv.req_base.req_type = MCA_PML_REQUEST_IPROBE;
|
||||
|
||||
MCA_PML_BFO_RECV_REQUEST_INIT(&recvreq, NULL, 0, &ompi_mpi_char.dt, src, tag, comm, false);
|
||||
MCA_PML_BFO_RECV_REQUEST_START(&recvreq);
|
||||
|
||||
if( recvreq.req_recv.req_base.req_ompi.req_complete == true ) {
|
||||
if( NULL != status ) {
|
||||
*status = recvreq.req_recv.req_base.req_ompi.req_status;
|
||||
}
|
||||
rc = recvreq.req_recv.req_base.req_ompi.req_status.MPI_ERROR;
|
||||
*matched = 1;
|
||||
} else {
|
||||
*matched = 0;
|
||||
opal_progress();
|
||||
}
|
||||
MCA_PML_BASE_RECV_REQUEST_FINI( &recvreq.req_recv );
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
int mca_pml_bfo_probe(int src,
|
||||
int tag,
|
||||
struct ompi_communicator_t *comm,
|
||||
ompi_status_public_t * status)
|
||||
{
|
||||
int rc = OMPI_SUCCESS;
|
||||
mca_pml_bfo_recv_request_t recvreq;
|
||||
|
||||
OBJ_CONSTRUCT( &recvreq, mca_pml_bfo_recv_request_t );
|
||||
recvreq.req_recv.req_base.req_ompi.req_type = OMPI_REQUEST_PML;
|
||||
recvreq.req_recv.req_base.req_type = MCA_PML_REQUEST_PROBE;
|
||||
|
||||
MCA_PML_BFO_RECV_REQUEST_INIT(&recvreq, NULL, 0, &ompi_mpi_char.dt, src, tag, comm, false);
|
||||
MCA_PML_BFO_RECV_REQUEST_START(&recvreq);
|
||||
|
||||
ompi_request_wait_completion(&recvreq.req_recv.req_base.req_ompi);
|
||||
rc = recvreq.req_recv.req_base.req_ompi.req_status.MPI_ERROR;
|
||||
if (NULL != status) {
|
||||
*status = recvreq.req_recv.req_base.req_ompi.req_status;
|
||||
}
|
||||
|
||||
MCA_PML_BASE_RECV_REQUEST_FINI( &recvreq.req_recv );
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
mca_pml_bfo_improbe(int src,
|
||||
int tag,
|
||||
struct ompi_communicator_t *comm,
|
||||
int *matched,
|
||||
struct ompi_message_t **message,
|
||||
ompi_status_public_t * status)
|
||||
{
|
||||
int rc = OMPI_SUCCESS;
|
||||
mca_pml_bfo_recv_request_t *recvreq;
|
||||
|
||||
*message = ompi_message_alloc();
|
||||
if (NULL == *message) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
|
||||
MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq);
|
||||
if (NULL == recvreq)
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
recvreq->req_recv.req_base.req_type = MCA_PML_REQUEST_IMPROBE;
|
||||
|
||||
/* initialize the request enough to probe and get the status */
|
||||
MCA_PML_BFO_RECV_REQUEST_INIT(recvreq, NULL, 0, &ompi_mpi_char.dt,
|
||||
src, tag, comm, false);
|
||||
MCA_PML_BFO_RECV_REQUEST_START(recvreq);
|
||||
|
||||
if( recvreq->req_recv.req_base.req_ompi.req_complete == true ) {
|
||||
if( NULL != status ) {
|
||||
*status = recvreq->req_recv.req_base.req_ompi.req_status;
|
||||
}
|
||||
*matched = 1;
|
||||
|
||||
(*message)->comm = comm;
|
||||
(*message)->req_ptr = recvreq;
|
||||
(*message)->peer = recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE;
|
||||
(*message)->count = recvreq->req_recv.req_base.req_ompi.req_status._ucount;
|
||||
|
||||
rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR;
|
||||
} else {
|
||||
*matched = 0;
|
||||
|
||||
/* we only free if we didn't match, because we're going to
|
||||
translate the request into a receive request later on if it
|
||||
was matched */
|
||||
MCA_PML_BFO_RECV_REQUEST_RETURN( recvreq );
|
||||
ompi_message_return(*message);
|
||||
*message = MPI_MESSAGE_NULL;
|
||||
|
||||
opal_progress();
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
mca_pml_bfo_mprobe(int src,
|
||||
int tag,
|
||||
struct ompi_communicator_t *comm,
|
||||
struct ompi_message_t **message,
|
||||
ompi_status_public_t * status)
|
||||
{
|
||||
int rc = OMPI_SUCCESS;
|
||||
mca_pml_bfo_recv_request_t *recvreq;
|
||||
|
||||
*message = ompi_message_alloc();
|
||||
if (NULL == *message) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
|
||||
MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq);
|
||||
if (NULL == recvreq)
|
||||
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
recvreq->req_recv.req_base.req_type = MCA_PML_REQUEST_MPROBE;
|
||||
|
||||
/* initialize the request enough to probe and get the status */
|
||||
MCA_PML_BFO_RECV_REQUEST_INIT(recvreq, NULL, 0, &ompi_mpi_char.dt,
|
||||
src, tag, comm, false);
|
||||
MCA_PML_BFO_RECV_REQUEST_START(recvreq);
|
||||
|
||||
ompi_request_wait_completion(&recvreq->req_recv.req_base.req_ompi);
|
||||
rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR;
|
||||
|
||||
if( NULL != status ) {
|
||||
*status = recvreq->req_recv.req_base.req_ompi.req_status;
|
||||
}
|
||||
|
||||
(*message)->comm = comm;
|
||||
(*message)->req_ptr = recvreq;
|
||||
(*message)->peer = recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE;
|
||||
(*message)->count = recvreq->req_recv.req_base.req_ompi.req_status._ucount;
|
||||
|
||||
return rc;
|
||||
}
|
@ -1,308 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2013 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "ompi/request/request.h"
|
||||
#include "pml_bfo_recvreq.h"
|
||||
#include "pml_bfo_recvfrag.h"
|
||||
#include "ompi/peruse/peruse-internal.h"
|
||||
#include "ompi/message/message.h"
|
||||
|
||||
int mca_pml_bfo_irecv_init(void *addr,
|
||||
size_t count,
|
||||
ompi_datatype_t * datatype,
|
||||
int src,
|
||||
int tag,
|
||||
struct ompi_communicator_t *comm,
|
||||
struct ompi_request_t **request)
|
||||
{
|
||||
mca_pml_bfo_recv_request_t *recvreq;
|
||||
MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq);
|
||||
if (NULL == recvreq)
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
|
||||
MCA_PML_BFO_RECV_REQUEST_INIT(recvreq,
|
||||
addr,
|
||||
count, datatype, src, tag, comm, true);
|
||||
|
||||
PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
|
||||
&((recvreq)->req_recv.req_base),
|
||||
PERUSE_RECV);
|
||||
|
||||
*request = (ompi_request_t *) recvreq;
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
int mca_pml_bfo_irecv(void *addr,
|
||||
size_t count,
|
||||
ompi_datatype_t * datatype,
|
||||
int src,
|
||||
int tag,
|
||||
struct ompi_communicator_t *comm,
|
||||
struct ompi_request_t **request)
|
||||
{
|
||||
mca_pml_bfo_recv_request_t *recvreq;
|
||||
MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq);
|
||||
if (NULL == recvreq)
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
|
||||
MCA_PML_BFO_RECV_REQUEST_INIT(recvreq,
|
||||
addr,
|
||||
count, datatype, src, tag, comm, false);
|
||||
|
||||
PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
|
||||
&((recvreq)->req_recv.req_base),
|
||||
PERUSE_RECV);
|
||||
|
||||
MCA_PML_BFO_RECV_REQUEST_START(recvreq);
|
||||
*request = (ompi_request_t *) recvreq;
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int mca_pml_bfo_recv(void *addr,
|
||||
size_t count,
|
||||
ompi_datatype_t * datatype,
|
||||
int src,
|
||||
int tag,
|
||||
struct ompi_communicator_t *comm,
|
||||
ompi_status_public_t * status)
|
||||
{
|
||||
int rc;
|
||||
mca_pml_bfo_recv_request_t *recvreq;
|
||||
MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq);
|
||||
if (NULL == recvreq)
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
|
||||
MCA_PML_BFO_RECV_REQUEST_INIT(recvreq,
|
||||
addr,
|
||||
count, datatype, src, tag, comm, false);
|
||||
|
||||
PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
|
||||
&((recvreq)->req_recv.req_base),
|
||||
PERUSE_RECV);
|
||||
|
||||
MCA_PML_BFO_RECV_REQUEST_START(recvreq);
|
||||
ompi_request_wait_completion(&recvreq->req_recv.req_base.req_ompi);
|
||||
|
||||
if (NULL != status) { /* return status */
|
||||
*status = recvreq->req_recv.req_base.req_ompi.req_status;
|
||||
}
|
||||
rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR;
|
||||
ompi_request_free( (ompi_request_t**)&recvreq );
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Non-blocking matched receive (MPI_Imrecv): convert the probe request
 * carried by 'message' (produced by mprobe/improbe) back into a real
 * receive request for the user's buffer, then drive it forward using the
 * fragment that was already matched.
 *
 * The exact statement order below matters: source/tag/sequence and the
 * matched fragment must be captured from the probe request BEFORE the
 * request is finalized and re-initialized.
 *
 * @param buf      user receive buffer
 * @param count    element count
 * @param datatype element datatype
 * @param message  [in/out] matched message; reset to MPI_MESSAGE_NULL
 * @param request  [out] the (re-used) receive request
 * @return OMPI_SUCCESS
 */
int
mca_pml_bfo_imrecv( void *buf,
                    size_t count,
                    ompi_datatype_t *datatype,
                    struct ompi_message_t **message,
                    struct ompi_request_t **request )
{
    mca_pml_bfo_recv_frag_t* frag;
    mca_pml_bfo_recv_request_t *recvreq;
    mca_pml_bfo_hdr_t *hdr;
    int src, tag;
    ompi_communicator_t *comm;
    mca_pml_bfo_comm_proc_t* proc;
    mca_pml_bfo_comm_t* bfo_comm;
    uint64_t seq;

    /* get the request from the message and the frag from the request
       before we overwrite everything */
    recvreq = (mca_pml_bfo_recv_request_t*) (*message)->req_ptr;
    frag = (mca_pml_bfo_recv_frag_t*) recvreq->req_recv.req_base.req_addr;
    src = recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE;
    tag = recvreq->req_recv.req_base.req_ompi.req_status.MPI_TAG;
    comm = (*message)->comm;
    bfo_comm = recvreq->req_recv.req_base.req_comm->c_pml_comm;
    seq = recvreq->req_recv.req_base.req_sequence;

    /* make the request a recv request again */
    /* The old request kept pointers to comm and the char datatype.
       We're about to release those, but need to make sure comm
       doesn't go out of scope (we don't care about the char datatype
       anymore).  So retain comm, then release the frag, then reinit
       the frag (which will retain comm), then release comm (but the
       frag still has it's ref, so it'll stay in scope).  Make
       sense? */
    OBJ_RETAIN(comm);
    MCA_PML_BASE_RECV_REQUEST_FINI(&recvreq->req_recv);
    recvreq->req_recv.req_base.req_type = MCA_PML_REQUEST_RECV;
    MCA_PML_BFO_RECV_REQUEST_INIT(recvreq,
                                  buf,
                                  count, datatype,
                                  src, tag, comm, false);
    OBJ_RELEASE(comm);

    PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
                             &((recvreq)->req_recv.req_base),
                             PERUSE_RECV);

    /* init/re-init the request */
    recvreq->req_lock = 0;
    recvreq->req_pipeline_depth = 0;
    recvreq->req_bytes_received = 0;
    /* What about req_rdma_cnt ? */
    /* NOTE(review): unlike mrecv below, req_rdma_cnt is not reset here;
       the original author flagged this too — verify before reuse */
    recvreq->req_rdma_idx = 0;
    recvreq->req_pending = false;
    recvreq->req_ack_sent = false;

    MCA_PML_BASE_RECV_START(&recvreq->req_recv.req_base);

    /* Note - sequence number already assigned */
    recvreq->req_recv.req_base.req_sequence = seq;

    proc = &bfo_comm->procs[recvreq->req_recv.req_base.req_peer];
    recvreq->req_recv.req_base.req_proc = proc->ompi_proc;
    prepare_recv_req_converter(recvreq);

    /* we can't go through the match, since we already have the match.
       Cheat and do what REQUEST_START does, but without the frag
       search */
    hdr = (mca_pml_bfo_hdr_t*)frag->segments->seg_addr.pval;
    switch(hdr->hdr_common.hdr_type) {
    case MCA_PML_BFO_HDR_TYPE_MATCH:
        mca_pml_bfo_recv_request_progress_match(recvreq, frag->btl, frag->segments,
                                                frag->num_segments);
        break;
    case MCA_PML_BFO_HDR_TYPE_RNDV:
        mca_pml_bfo_recv_request_progress_rndv(recvreq, frag->btl, frag->segments,
                                               frag->num_segments);
        break;
    case MCA_PML_BFO_HDR_TYPE_RGET:
        mca_pml_bfo_recv_request_progress_rget(recvreq, frag->btl, frag->segments,
                                               frag->num_segments);
        break;
    default:
        /* matched fragment must carry one of the three header types above */
        assert(0);
    }
    MCA_PML_BFO_RECV_FRAG_RETURN(frag);

    /* the message handle is consumed by this call */
    ompi_message_return(*message);
    *message = MPI_MESSAGE_NULL;
    *request = (ompi_request_t *) recvreq;

    return OMPI_SUCCESS;
}
|
||||
|
||||
|
||||
/**
 * Blocking matched receive (MPI_Mrecv): convert the probe request
 * carried by 'message' (produced by mprobe/improbe) back into a real
 * receive request, drive it with the already-matched fragment, wait for
 * completion, and release the request.
 *
 * Statement order matters: source/tag/sequence and the matched fragment
 * are captured from the probe request BEFORE it is finalized/re-inited.
 *
 * @param buf      user receive buffer
 * @param count    element count
 * @param datatype element datatype
 * @param message  [in/out] matched message; reset to MPI_MESSAGE_NULL
 * @param status   [out] completion status (may be NULL)
 * @return the completed request's MPI error code
 */
int
mca_pml_bfo_mrecv( void *buf,
                   size_t count,
                   ompi_datatype_t *datatype,
                   struct ompi_message_t **message,
                   ompi_status_public_t* status )
{
    mca_pml_bfo_recv_frag_t* frag;
    mca_pml_bfo_recv_request_t *recvreq;
    mca_pml_bfo_hdr_t *hdr;
    int src, tag, rc;
    ompi_communicator_t *comm;
    mca_pml_bfo_comm_proc_t* proc;
    mca_pml_bfo_comm_t* bfo_comm;
    uint64_t seq;

    /* get the request from the message and the frag from the request
       before we overwrite everything */
    comm = (*message)->comm;
    recvreq = (mca_pml_bfo_recv_request_t*) (*message)->req_ptr;
    frag = (mca_pml_bfo_recv_frag_t*) recvreq->req_recv.req_base.req_addr;
    src = recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE;
    tag = recvreq->req_recv.req_base.req_ompi.req_status.MPI_TAG;
    seq = recvreq->req_recv.req_base.req_sequence;
    bfo_comm = recvreq->req_recv.req_base.req_comm->c_pml_comm;

    /* make the request a recv request again */
    /* The old request kept pointers to comm and the char datatype.
       We're about to release those, but need to make sure comm
       doesn't go out of scope (we don't care about the char datatype
       anymore).  So retain comm, then release the frag, then reinit
       the frag (which will retain comm), then release comm (but the
       frag still has it's ref, so it'll stay in scope).  Make
       sense? */
    OBJ_RETAIN(comm);
    MCA_PML_BASE_RECV_REQUEST_FINI(&recvreq->req_recv);
    recvreq->req_recv.req_base.req_type = MCA_PML_REQUEST_RECV;
    MCA_PML_BFO_RECV_REQUEST_INIT(recvreq,
                                  buf,
                                  count, datatype,
                                  src, tag, comm, false);
    OBJ_RELEASE(comm);

    PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
                             &((recvreq)->req_recv.req_base),
                             PERUSE_RECV);

    /* init/re-init the request */
    recvreq->req_lock = 0;
    recvreq->req_pipeline_depth = 0;
    recvreq->req_bytes_received = 0;
    recvreq->req_rdma_cnt = 0;
    recvreq->req_rdma_idx = 0;
    recvreq->req_pending = false;

    MCA_PML_BASE_RECV_START(&recvreq->req_recv.req_base);

    /* Note - sequence number already assigned */
    recvreq->req_recv.req_base.req_sequence = seq;

    proc = &bfo_comm->procs[recvreq->req_recv.req_base.req_peer];
    recvreq->req_recv.req_base.req_proc = proc->ompi_proc;
    prepare_recv_req_converter(recvreq);

    /* we can't go through the match, since we already have the match.
       Cheat and do what REQUEST_START does, but without the frag
       search */
    hdr = (mca_pml_bfo_hdr_t*)frag->segments->seg_addr.pval;
    switch(hdr->hdr_common.hdr_type) {
    case MCA_PML_BFO_HDR_TYPE_MATCH:
        mca_pml_bfo_recv_request_progress_match(recvreq, frag->btl, frag->segments,
                                                frag->num_segments);
        break;
    case MCA_PML_BFO_HDR_TYPE_RNDV:
        mca_pml_bfo_recv_request_progress_rndv(recvreq, frag->btl, frag->segments,
                                               frag->num_segments);
        break;
    case MCA_PML_BFO_HDR_TYPE_RGET:
        mca_pml_bfo_recv_request_progress_rget(recvreq, frag->btl, frag->segments,
                                               frag->num_segments);
        break;
    default:
        /* matched fragment must carry one of the three header types above */
        assert(0);
    }

    /* the message handle is consumed by this call */
    ompi_message_return(*message);
    *message = MPI_MESSAGE_NULL;
    /* block until the receive finishes */
    ompi_request_wait_completion(&(recvreq->req_recv.req_base.req_ompi));

    MCA_PML_BFO_RECV_FRAG_RETURN(frag);

    if (NULL != status) {  /* return status */
        *status = recvreq->req_recv.req_base.req_ompi.req_status;
    }
    rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR;
    ompi_request_free( (ompi_request_t**)&recvreq );
    return rc;
}
|
||||
|
@ -1,129 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2013 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "pml_bfo.h"
|
||||
#include "pml_bfo_sendreq.h"
|
||||
#include "pml_bfo_recvreq.h"
|
||||
#include "ompi/peruse/peruse-internal.h"
|
||||
|
||||
int mca_pml_bfo_isend_init(void *buf,
|
||||
size_t count,
|
||||
ompi_datatype_t * datatype,
|
||||
int dst,
|
||||
int tag,
|
||||
mca_pml_base_send_mode_t sendmode,
|
||||
ompi_communicator_t * comm,
|
||||
ompi_request_t ** request)
|
||||
{
|
||||
mca_pml_bfo_send_request_t *sendreq = NULL;
|
||||
MCA_PML_BFO_SEND_REQUEST_ALLOC(comm, dst, sendreq);
|
||||
if (NULL == sendreq)
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
|
||||
MCA_PML_BFO_SEND_REQUEST_INIT(sendreq,
|
||||
buf,
|
||||
count,
|
||||
datatype,
|
||||
dst, tag,
|
||||
comm, sendmode, true);
|
||||
|
||||
PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
|
||||
&(sendreq)->req_send.req_base,
|
||||
PERUSE_SEND);
|
||||
|
||||
*request = (ompi_request_t *) sendreq;
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int mca_pml_bfo_isend(void *buf,
|
||||
size_t count,
|
||||
ompi_datatype_t * datatype,
|
||||
int dst,
|
||||
int tag,
|
||||
mca_pml_base_send_mode_t sendmode,
|
||||
ompi_communicator_t * comm,
|
||||
ompi_request_t ** request)
|
||||
{
|
||||
int rc;
|
||||
mca_pml_bfo_send_request_t *sendreq = NULL;
|
||||
|
||||
MCA_PML_BFO_SEND_REQUEST_ALLOC(comm, dst, sendreq);
|
||||
if (NULL == sendreq)
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
|
||||
MCA_PML_BFO_SEND_REQUEST_INIT(sendreq,
|
||||
buf,
|
||||
count,
|
||||
datatype,
|
||||
dst, tag,
|
||||
comm, sendmode, false);
|
||||
|
||||
PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
|
||||
&(sendreq)->req_send.req_base,
|
||||
PERUSE_SEND);
|
||||
|
||||
MCA_PML_BFO_SEND_REQUEST_START(sendreq, rc);
|
||||
*request = (ompi_request_t *) sendreq;
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
int mca_pml_bfo_send(void *buf,
|
||||
size_t count,
|
||||
ompi_datatype_t * datatype,
|
||||
int dst,
|
||||
int tag,
|
||||
mca_pml_base_send_mode_t sendmode,
|
||||
ompi_communicator_t * comm)
|
||||
{
|
||||
int rc;
|
||||
mca_pml_bfo_send_request_t *sendreq;
|
||||
|
||||
MCA_PML_BFO_SEND_REQUEST_ALLOC(comm, dst, sendreq);
|
||||
if (NULL == sendreq)
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
|
||||
MCA_PML_BFO_SEND_REQUEST_INIT(sendreq,
|
||||
buf,
|
||||
count,
|
||||
datatype,
|
||||
dst, tag,
|
||||
comm, sendmode, false);
|
||||
|
||||
PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
|
||||
&(sendreq)->req_send.req_base,
|
||||
PERUSE_SEND);
|
||||
|
||||
MCA_PML_BFO_SEND_REQUEST_START(sendreq, rc);
|
||||
if (rc != OMPI_SUCCESS) {
|
||||
MCA_PML_BFO_SEND_REQUEST_RETURN( sendreq );
|
||||
return rc;
|
||||
}
|
||||
|
||||
ompi_request_wait_completion(&sendreq->req_send.req_base.req_ompi);
|
||||
|
||||
rc = sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR;
|
||||
ompi_request_free( (ompi_request_t**)&sendreq );
|
||||
return rc;
|
||||
}
|
@ -1,78 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "pml_bfo.h"
|
||||
#include "pml_bfo_sendreq.h"
|
||||
#include "ompi/mca/bml/base/base.h"
|
||||
|
||||
int mca_pml_bfo_progress(void)
|
||||
{
|
||||
int i, queue_length = opal_list_get_size(&mca_pml_bfo.send_pending);
|
||||
int j, completed_requests = 0;
|
||||
bool send_succedded;
|
||||
|
||||
if( OPAL_LIKELY(0 == queue_length) )
|
||||
return 0;
|
||||
|
||||
for( i = 0; i < queue_length; i++ ) {
|
||||
mca_pml_bfo_send_pending_t pending_type = MCA_PML_BFO_SEND_PENDING_NONE;
|
||||
mca_pml_bfo_send_request_t* sendreq;
|
||||
mca_bml_base_endpoint_t* endpoint;
|
||||
|
||||
sendreq = get_request_from_send_pending(&pending_type);
|
||||
if(OPAL_UNLIKELY(NULL == sendreq))
|
||||
break;
|
||||
|
||||
switch(pending_type) {
|
||||
case MCA_PML_BFO_SEND_PENDING_NONE:
|
||||
assert(0);
|
||||
return 0;
|
||||
case MCA_PML_BFO_SEND_PENDING_SCHEDULE:
|
||||
if( mca_pml_bfo_send_request_schedule_exclusive(sendreq) ==
|
||||
OMPI_ERR_OUT_OF_RESOURCE ) {
|
||||
return 0;
|
||||
}
|
||||
completed_requests++;
|
||||
break;
|
||||
case MCA_PML_BFO_SEND_PENDING_START:
|
||||
endpoint = sendreq->req_endpoint;
|
||||
send_succedded = false;
|
||||
for(j = 0; j < (int)mca_bml_base_btl_array_get_size(&endpoint->btl_eager); j++) {
|
||||
mca_bml_base_btl_t* bml_btl;
|
||||
int rc;
|
||||
|
||||
/* select a btl */
|
||||
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
|
||||
rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl);
|
||||
if( OPAL_LIKELY(OMPI_SUCCESS == rc) ) {
|
||||
send_succedded = true;
|
||||
completed_requests++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if( false == send_succedded ) {
|
||||
add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
return completed_requests;
|
||||
}
|
||||
|
@ -1,118 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/mca/bml/bml.h"
|
||||
#include "opal/mca/mpool/mpool.h"
|
||||
#include "pml_bfo.h"
|
||||
#include "pml_bfo_rdma.h"
|
||||
|
||||
/* Use this registration if no registration needed for a BTL instead of NULL.
|
||||
* This will help other code to distinguish case when memory is not registered
|
||||
* from case when registration is not needed */
|
||||
static mca_mpool_base_registration_t pml_bfo_dummy_reg;
|
||||
|
||||
/*
|
||||
* Check to see if memory is registered or can be registered. Build a
|
||||
* set of registrations on the request.
|
||||
*/
|
||||
|
||||
/**
 * Build the list of RDMA-capable BTLs usable for [base, base+size):
 * for each RDMA BTL of the endpoint (round-robin, up to
 * max_rdma_per_request), check whether the memory is already registered
 * (or register it when leave_pinned is set) and record the BTL together
 * with its registration in rdma_btls.
 *
 * @param bml_endpoint peer endpoint whose btl_rdma array is scanned
 * @param base         start of the user buffer
 * @param size         length of the user buffer in bytes
 * @param rdma_btls    [out] array filled with usable (btl, registration)
 * @return number of entries written to rdma_btls (0 => caller should
 *         fall back to the pipeline protocol)
 */
size_t mca_pml_bfo_rdma_btls(
    mca_bml_base_endpoint_t* bml_endpoint,
    unsigned char* base,
    size_t size,
    mca_pml_bfo_com_btl_t* rdma_btls)
{
    int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
    double weight_total = 0;
    int num_btls_used = 0, n;

    /* shortcut when there are no rdma capable btls */
    if(num_btls == 0) {
        return 0;
    }

    /* check to see if memory is registered */
    for(n = 0; n < num_btls && num_btls_used < mca_pml_bfo.max_rdma_per_request;
            n++) {
        /* round-robin scan starting at btl_rdma_index */
        mca_bml_base_btl_t* bml_btl =
            mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma,
                    (bml_endpoint->btl_rdma_index + n) % num_btls);
        /* dummy registration marks "no registration needed" (BTL has no
         * mpool), as opposed to NULL which means "not registered" */
        mca_mpool_base_registration_t* reg = &pml_bfo_dummy_reg;
        mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;

        if( NULL != btl_mpool ) {
            if(!mca_pml_bfo.leave_pinned) {
                /* look through existing registrations */
                btl_mpool->mpool_find(btl_mpool, base, size, &reg);
            } else {
                /* register the memory */
                btl_mpool->mpool_register(btl_mpool, base, size, 0, &reg);
            }

            /* this BTL cannot reach the memory; try the next one */
            if(NULL == reg)
                continue;
        }

        rdma_btls[num_btls_used].bml_btl = bml_btl;
        rdma_btls[num_btls_used].btl_reg = reg;
        weight_total += bml_btl->btl_weight;
        num_btls_used++;
    }

    /* if we don't use leave_pinned and all BTLs that already have this memory
     * registered amount to less then half of available bandwidth - fall back to
     * pipeline protocol */
    if(0 == num_btls_used || (!mca_pml_bfo.leave_pinned && weight_total < 0.5))
        return 0;

    /* split 'size' across the selected BTLs proportionally to bandwidth */
    mca_pml_bfo_calc_weighted_length(rdma_btls, num_btls_used, size,
                                     weight_total);

    /* advance the round-robin start point for the next request */
    bml_endpoint->btl_rdma_index = (bml_endpoint->btl_rdma_index + 1) % num_btls;
    return num_btls_used;
}
|
||||
|
||||
size_t mca_pml_bfo_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint,
|
||||
size_t size,
|
||||
mca_pml_bfo_com_btl_t* rdma_btls )
|
||||
{
|
||||
int i, num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
|
||||
double weight_total = 0;
|
||||
|
||||
for(i = 0; i < num_btls && i < mca_pml_bfo.max_rdma_per_request; i++) {
|
||||
rdma_btls[i].bml_btl =
|
||||
mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
|
||||
if(NULL != rdma_btls[i].bml_btl->btl->btl_mpool)
|
||||
rdma_btls[i].btl_reg = NULL;
|
||||
else
|
||||
rdma_btls[i].btl_reg = &pml_bfo_dummy_reg;
|
||||
|
||||
weight_total += rdma_btls[i].bml_btl->btl_weight;
|
||||
}
|
||||
|
||||
mca_pml_bfo_calc_weighted_length(rdma_btls, i, size, weight_total);
|
||||
|
||||
return i;
|
||||
}
|
@ -1,42 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*/
|
||||
|
||||
#ifndef MCA_PML_BFO_RDMA_H
|
||||
#define MCA_PML_BFO_RDMA_H
|
||||
|
||||
struct mca_bml_base_endpoint_t;
|
||||
|
||||
/*
|
||||
* Of the set of available btls that support RDMA,
|
||||
* find those that already have registrations - or
|
||||
* register if required (for leave_pinned option)
|
||||
*/
|
||||
size_t mca_pml_bfo_rdma_btls(struct mca_bml_base_endpoint_t* endpoint,
|
||||
unsigned char* base, size_t size, struct mca_pml_bfo_com_btl_t* btls);
|
||||
|
||||
/* Choose RDMA BTLs to use for sending of a request by pipeline protocol.
|
||||
* Calculate number of bytes to send through each BTL according to available
|
||||
* bandwidth */
|
||||
size_t mca_pml_bfo_rdma_pipeline_btls(struct mca_bml_base_endpoint_t* endpoint,
|
||||
size_t size, mca_pml_bfo_com_btl_t* rdma_btls);
|
||||
#endif
|
||||
|
@ -1,30 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "pml_bfo.h"
|
||||
#include "pml_bfo_rdmafrag.h"
|
||||
|
||||
|
||||
/* Class instance for RDMA fragments: no constructor/destructor needed,
 * fragments are recycled through the free list.
 * NOTE(review): the parent class is given here as ompi_free_list_item_t
 * while the struct declares its 'super' as opal_free_list_item_t
 * (pml_bfo_rdmafrag.h) — verify the two are layout-compatible aliases. */
OBJ_CLASS_INSTANCE(
    mca_pml_bfo_rdma_frag_t,
    ompi_free_list_item_t,
    NULL,
    NULL);
|
@ -1,75 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2013 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*/
|
||||
|
||||
#ifndef MCA_PML_BFO_RDMAFRAG_H
|
||||
#define MCA_PML_BFO_RDMAFRAG_H
|
||||
|
||||
#include "pml_bfo_hdr.h"
|
||||
#include "opal/mca/mpool/base/base.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/* Direction of an RDMA operation carried by a fragment. */
typedef enum {
    MCA_PML_BFO_RDMA_PUT,
    MCA_PML_BFO_RDMA_GET
} mca_pml_bfo_rdma_state_t;

/**
 * Descriptor for one in-flight RDMA operation. Fragments are pooled in
 * mca_pml_bfo.rdma_frags and recycled via the ALLOC/RETURN macros below.
 */
struct mca_pml_bfo_rdma_frag_t {
    opal_free_list_item_t super;           /* free-list linkage (must be first) */
    mca_bml_base_btl_t* rdma_bml;          /* BML/BTL pair used for the transfer */
#if PML_BFO
    mca_btl_base_module_t* rdma_btl;       /* failover: raw BTL the op was issued on */
#endif /* PML_BFO */
    mca_pml_bfo_hdr_t rdma_hdr;            /* protocol header for this operation */
    mca_pml_bfo_rdma_state_t rdma_state;   /* PUT or GET */
    size_t rdma_length;                    /* bytes to transfer */
    /* raw storage for the remote segment descriptors */
    uint8_t rdma_segs[MCA_BTL_SEG_MAX_SIZE * MCA_BTL_DES_MAX_SEGMENTS];
    void *rdma_req;                        /* owning send/recv request */
    struct mca_bml_base_endpoint_t* rdma_ep; /* peer endpoint */
    opal_convertor_t convertor;            /* datatype conversion state */
    struct mca_mpool_base_registration_t* reg; /* memory registration, if any */
    uint32_t retries;                      /* retry count for failed operations */
};
typedef struct mca_pml_bfo_rdma_frag_t mca_pml_bfo_rdma_frag_t;

OBJ_CLASS_DECLARATION(mca_pml_bfo_rdma_frag_t);


/* Take a fragment from the global pool (blocks until one is available). */
#define MCA_PML_BFO_RDMA_FRAG_ALLOC(frag)                        \
do {                                                             \
    opal_free_list_item_t* item;                                 \
    OPAL_FREE_LIST_WAIT_MT(&mca_pml_bfo.rdma_frags, item);       \
    frag = (mca_pml_bfo_rdma_frag_t*)item;                       \
} while(0)

/* Give a fragment back to the global pool. */
#define MCA_PML_BFO_RDMA_FRAG_RETURN(frag)                       \
do {                                                             \
    /* return fragment */                                        \
    OPAL_FREE_LIST_RETURN_MT(&mca_pml_bfo.rdma_frags,            \
        (opal_free_list_item_t*)frag);                           \
} while(0)
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
||||
|
@ -1,743 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2013 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2006-2008 University of Houston. All rights reserved.
|
||||
* Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/threads/mutex.h"
|
||||
#include "opal/prefetch.h"
|
||||
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/peruse/peruse-internal.h"
|
||||
#include "ompi/memchecker.h"
|
||||
|
||||
#include "pml_bfo.h"
|
||||
#include "pml_bfo_comm.h"
|
||||
#include "pml_bfo_recvfrag.h"
|
||||
#include "pml_bfo_recvreq.h"
|
||||
#include "pml_bfo_sendreq.h"
|
||||
#include "pml_bfo_hdr.h"
|
||||
#if PML_BFO
|
||||
#include "pml_bfo_failover.h"
|
||||
#endif /* PML_BFO */
|
||||
|
||||
OBJ_CLASS_INSTANCE( mca_pml_bfo_buffer_t,
|
||||
ompi_free_list_item_t,
|
||||
NULL,
|
||||
NULL );
|
||||
|
||||
OBJ_CLASS_INSTANCE( mca_pml_bfo_recv_frag_t,
|
||||
opal_list_item_t,
|
||||
NULL,
|
||||
NULL );
|
||||
|
||||
/**
|
||||
* Static functions.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Append a unexpected descriptor to a queue. This function will allocate and
|
||||
* initialize the fragment (if necessary) and then will add it to the specified
|
||||
* queue. The allocated fragment is not returned to the caller.
|
||||
*/
|
||||
static void
|
||||
append_frag_to_list(opal_list_t *queue, mca_btl_base_module_t *btl,
|
||||
mca_pml_bfo_match_hdr_t *hdr, mca_btl_base_segment_t* segments,
|
||||
size_t num_segments, mca_pml_bfo_recv_frag_t* frag)
|
||||
{
|
||||
if(NULL == frag) {
|
||||
MCA_PML_BFO_RECV_FRAG_ALLOC(frag);
|
||||
MCA_PML_BFO_RECV_FRAG_INIT(frag, hdr, segments, num_segments, btl);
|
||||
}
|
||||
opal_list_append(queue, (opal_list_item_t*)frag);
|
||||
}
|
||||
|
||||
/**
|
||||
* Match incoming recv_frags against posted receives.
|
||||
* Supports out of order delivery.
|
||||
*
|
||||
* @param frag_header (IN) Header of received recv_frag.
|
||||
* @param frag_desc (IN) Received recv_frag descriptor.
|
||||
* @param match_made (OUT) Flag indicating wether a match was made.
|
||||
* @param additional_matches (OUT) List of additional matches
|
||||
* @return OMPI_SUCCESS or error status on failure.
|
||||
*/
|
||||
static int mca_pml_bfo_recv_frag_match( mca_btl_base_module_t *btl,
|
||||
mca_pml_bfo_match_hdr_t *hdr,
|
||||
mca_btl_base_segment_t* segments,
|
||||
size_t num_segments,
|
||||
int type);
|
||||
|
||||
static mca_pml_bfo_recv_request_t*
|
||||
match_one(mca_btl_base_module_t *btl,
|
||||
mca_pml_bfo_match_hdr_t *hdr, mca_btl_base_segment_t* segments,
|
||||
size_t num_segments, ompi_communicator_t *comm_ptr,
|
||||
mca_pml_bfo_comm_proc_t *proc,
|
||||
mca_pml_bfo_recv_frag_t* frag);
|
||||
|
||||
void mca_pml_bfo_recv_frag_callback_match(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata )
|
||||
{
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_bfo_match_hdr_t* hdr = (mca_pml_bfo_match_hdr_t*)segments->seg_addr.pval;
|
||||
ompi_communicator_t *comm_ptr;
|
||||
mca_pml_bfo_recv_request_t *match = NULL;
|
||||
mca_pml_bfo_comm_t *comm;
|
||||
mca_pml_bfo_comm_proc_t *proc;
|
||||
size_t num_segments = des->des_local_count;
|
||||
size_t bytes_received = 0;
|
||||
|
||||
assert(num_segments <= MCA_BTL_DES_MAX_SEGMENTS);
|
||||
|
||||
if( OPAL_UNLIKELY(segments->seg_len < OMPI_PML_BFO_MATCH_HDR_LEN) ) {
|
||||
return;
|
||||
}
|
||||
bfo_hdr_ntoh(((mca_pml_bfo_hdr_t*) hdr), MCA_PML_BFO_HDR_TYPE_MATCH);
|
||||
|
||||
/* communicator pointer */
|
||||
comm_ptr = ompi_comm_lookup(hdr->hdr_ctx);
|
||||
if(OPAL_UNLIKELY(NULL == comm_ptr)) {
|
||||
/* This is a special case. A message for a not yet existing
|
||||
* communicator can happens. Instead of doing a matching we
|
||||
* will temporarily add it the a pending queue in the PML.
|
||||
* Later on, when the communicator is completely instantiated,
|
||||
* this pending queue will be searched and all matching fragments
|
||||
* moved to the right communicator.
|
||||
*/
|
||||
append_frag_to_list( &mca_pml_bfo.non_existing_communicator_pending,
|
||||
btl, hdr, segments, num_segments, NULL );
|
||||
return;
|
||||
}
|
||||
comm = (mca_pml_bfo_comm_t *)comm_ptr->c_pml_comm;
|
||||
|
||||
/* source sequence number */
|
||||
proc = &comm->procs[hdr->hdr_src];
|
||||
|
||||
/* We generate the MSG_ARRIVED event as soon as the PML is aware
|
||||
* of a matching fragment arrival. Independing if it is received
|
||||
* on the correct order or not. This will allow the tools to
|
||||
* figure out if the messages are not received in the correct
|
||||
* order (if multiple network interfaces).
|
||||
*/
|
||||
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm_ptr,
|
||||
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
|
||||
|
||||
/* get next expected message sequence number - if threaded
|
||||
* run, lock to make sure that if another thread is processing
|
||||
* a frag from the same message a match is made only once.
|
||||
* Also, this prevents other posted receives (for a pair of
|
||||
* end points) from being processed, and potentially "loosing"
|
||||
* the fragment.
|
||||
*/
|
||||
OPAL_THREAD_LOCK(&comm->matching_lock);
|
||||
|
||||
/* get sequence number of next message that can be processed */
|
||||
if(OPAL_UNLIKELY((((uint16_t) hdr->hdr_seq) != ((uint16_t) proc->expected_sequence)) ||
|
||||
(opal_list_get_size(&proc->frags_cant_match) > 0 ))) {
|
||||
goto slow_path;
|
||||
}
|
||||
|
||||
/* This is the sequence number we were expecting, so we can try
|
||||
* matching it to already posted receives.
|
||||
*/
|
||||
|
||||
/* We're now expecting the next sequence number. */
|
||||
proc->expected_sequence++;
|
||||
|
||||
/* We generate the SEARCH_POSTED_QUEUE only when the message is
|
||||
* received in the correct sequence. Otherwise, we delay the event
|
||||
* generation until we reach the correct sequence number.
|
||||
*/
|
||||
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_BEGIN, comm_ptr,
|
||||
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
|
||||
|
||||
match = match_one(btl, hdr, segments, num_segments, comm_ptr, proc, NULL);
|
||||
|
||||
/* The match is over. We generate the SEARCH_POSTED_Q_END here,
|
||||
* before going into the mca_pml_bfo_check_cantmatch_for_match so
|
||||
* we can make a difference for the searching time for all
|
||||
* messages.
|
||||
*/
|
||||
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_END, comm_ptr,
|
||||
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
|
||||
|
||||
/* release matching lock before processing fragment */
|
||||
OPAL_THREAD_UNLOCK(&comm->matching_lock);
|
||||
|
||||
if(OPAL_LIKELY(match)) {
|
||||
bytes_received = segments->seg_len - OMPI_PML_BFO_MATCH_HDR_LEN;
|
||||
match->req_recv.req_bytes_packed = bytes_received;
|
||||
|
||||
MCA_PML_BFO_RECV_REQUEST_MATCHED(match, hdr);
|
||||
if(match->req_bytes_expected > 0) {
|
||||
struct iovec iov[MCA_BTL_DES_MAX_SEGMENTS];
|
||||
uint32_t iov_count = 1;
|
||||
|
||||
/*
|
||||
* Make user buffer accessable(defined) before unpacking.
|
||||
*/
|
||||
MEMCHECKER(
|
||||
memchecker_call(&opal_memchecker_base_mem_defined,
|
||||
match->req_recv.req_base.req_addr,
|
||||
match->req_recv.req_base.req_count,
|
||||
match->req_recv.req_base.req_datatype);
|
||||
);
|
||||
|
||||
iov[0].iov_len = bytes_received;
|
||||
iov[0].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments->seg_addr.pval +
|
||||
OMPI_PML_BFO_MATCH_HDR_LEN);
|
||||
while (iov_count < num_segments) {
|
||||
bytes_received += segments[iov_count].seg_len;
|
||||
iov[iov_count].iov_len = segments[iov_count].seg_len;
|
||||
iov[iov_count].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments[iov_count].seg_addr.pval);
|
||||
iov_count++;
|
||||
}
|
||||
opal_convertor_unpack( &match->req_recv.req_base.req_convertor,
|
||||
iov,
|
||||
&iov_count,
|
||||
&bytes_received );
|
||||
match->req_bytes_received = bytes_received;
|
||||
/*
|
||||
* Unpacking finished, make the user buffer unaccessable again.
|
||||
*/
|
||||
MEMCHECKER(
|
||||
memchecker_call(&opal_memchecker_base_mem_noaccess,
|
||||
match->req_recv.req_base.req_addr,
|
||||
match->req_recv.req_base.req_count,
|
||||
match->req_recv.req_base.req_datatype);
|
||||
);
|
||||
}
|
||||
|
||||
/* no need to check if complete we know we are.. */
|
||||
/* don't need a rmb as that is for checking */
|
||||
recv_request_pml_complete(match);
|
||||
}
|
||||
return;
|
||||
|
||||
slow_path:
|
||||
OPAL_THREAD_UNLOCK(&comm->matching_lock);
|
||||
#if PML_BFO
|
||||
if (true == mca_pml_bfo_is_duplicate_msg(proc, hdr)) {
|
||||
return;
|
||||
}
|
||||
#endif /* PML_BFO */
|
||||
mca_pml_bfo_recv_frag_match(btl, hdr, segments,
|
||||
num_segments, MCA_PML_BFO_HDR_TYPE_MATCH);
|
||||
}
|
||||
|
||||
|
||||
void mca_pml_bfo_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata )
|
||||
{
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
|
||||
|
||||
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) {
|
||||
return;
|
||||
}
|
||||
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RNDV);
|
||||
mca_pml_bfo_recv_frag_match(btl, &hdr->hdr_match, segments,
|
||||
des->des_local_count, MCA_PML_BFO_HDR_TYPE_RNDV);
|
||||
return;
|
||||
}
|
||||
|
||||
void mca_pml_bfo_recv_frag_callback_rget(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata )
|
||||
{
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
|
||||
|
||||
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) {
|
||||
return;
|
||||
}
|
||||
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RGET);
|
||||
mca_pml_bfo_recv_frag_match(btl, &hdr->hdr_match, segments,
|
||||
des->des_local_count, MCA_PML_BFO_HDR_TYPE_RGET);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void mca_pml_bfo_recv_frag_callback_ack(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata )
|
||||
{
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
|
||||
mca_pml_bfo_send_request_t* sendreq;
|
||||
|
||||
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) {
|
||||
return;
|
||||
}
|
||||
|
||||
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_ACK);
|
||||
sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_ack.hdr_src_req.pval;
|
||||
sendreq->req_recv = hdr->hdr_ack.hdr_dst_req;
|
||||
#if PML_BFO
|
||||
MCA_PML_BFO_ERROR_CHECK_ON_ACK_CALLBACK(sendreq);
|
||||
#endif /* PML_BFO */
|
||||
|
||||
/* if the request should be delivered entirely by copy in/out
|
||||
* then throttle sends */
|
||||
if(hdr->hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_NORDMA)
|
||||
sendreq->req_throttle_sends = true;
|
||||
|
||||
mca_pml_bfo_send_request_copy_in_out(sendreq,
|
||||
hdr->hdr_ack.hdr_send_offset,
|
||||
sendreq->req_send.req_bytes_packed -
|
||||
hdr->hdr_ack.hdr_send_offset);
|
||||
|
||||
if (sendreq->req_state != 0) {
|
||||
/* Typical receipt of an ACK message causes req_state to be
|
||||
* decremented. However, a send request that started as an
|
||||
* RGET request can become a RNDV. For example, when the
|
||||
* receiver determines that its receive buffer is not
|
||||
* contiguous and therefore cannot support the RGET
|
||||
* protocol. A send request that started with the RGET
|
||||
* protocol has req_state == 0 and as such should not be
|
||||
* decremented.
|
||||
*/
|
||||
OPAL_THREAD_ADD_FETCH32(&sendreq->req_state, -1);
|
||||
}
|
||||
|
||||
if(send_request_pml_complete_check(sendreq) == false)
|
||||
mca_pml_bfo_send_request_schedule(sendreq);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void mca_pml_bfo_recv_frag_callback_frag(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata ) {
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
|
||||
mca_pml_bfo_recv_request_t* recvreq;
|
||||
|
||||
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) {
|
||||
return;
|
||||
}
|
||||
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_FRAG);
|
||||
recvreq = (mca_pml_bfo_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
|
||||
#if PML_BFO
|
||||
MCA_PML_BFO_ERROR_CHECK_ON_FRAG_CALLBACK(recvreq);
|
||||
#endif /* PML_BFO */
|
||||
mca_pml_bfo_recv_request_progress_frag(recvreq,btl,segments,des->des_local_count);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void mca_pml_bfo_recv_frag_callback_put(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata ) {
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
|
||||
mca_pml_bfo_send_request_t* sendreq;
|
||||
|
||||
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) {
|
||||
return;
|
||||
}
|
||||
|
||||
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_PUT);
|
||||
sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_rdma.hdr_req.pval;
|
||||
#if PML_BFO
|
||||
MCA_PML_BFO_ERROR_CHECK_ON_PUT_CALLBACK(sendreq);
|
||||
#endif /* PML_BFO */
|
||||
mca_pml_bfo_send_request_put(sendreq,btl,&hdr->hdr_rdma);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void mca_pml_bfo_recv_frag_callback_fin(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata ) {
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
|
||||
mca_btl_base_descriptor_t* rdma;
|
||||
|
||||
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) {
|
||||
return;
|
||||
}
|
||||
|
||||
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_FIN);
|
||||
rdma = (mca_btl_base_descriptor_t*)hdr->hdr_fin.hdr_des.pval;
|
||||
#if PML_BFO
|
||||
if (true == mca_pml_bfo_is_duplicate_fin(hdr, rdma, btl)) {
|
||||
return;
|
||||
}
|
||||
#endif /* PML_BFO */
|
||||
rdma->des_cbfunc(btl, NULL, rdma,
|
||||
hdr->hdr_fin.hdr_fail ? OMPI_ERROR : OMPI_SUCCESS);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
#define PML_MAX_SEQ ~((mca_pml_sequence_t)0);
|
||||
|
||||
static inline mca_pml_bfo_recv_request_t* get_posted_recv(opal_list_t *queue)
|
||||
{
|
||||
if(opal_list_get_size(queue) == 0)
|
||||
return NULL;
|
||||
|
||||
return (mca_pml_bfo_recv_request_t*)opal_list_get_first(queue);
|
||||
}
|
||||
|
||||
static inline mca_pml_bfo_recv_request_t* get_next_posted_recv(
|
||||
opal_list_t *queue,
|
||||
mca_pml_bfo_recv_request_t* req)
|
||||
{
|
||||
opal_list_item_t *i = opal_list_get_next((opal_list_item_t*)req);
|
||||
|
||||
if(opal_list_get_end(queue) == i)
|
||||
return NULL;
|
||||
|
||||
return (mca_pml_bfo_recv_request_t*)i;
|
||||
}
|
||||
|
||||
static mca_pml_bfo_recv_request_t *match_incomming(
|
||||
mca_pml_bfo_match_hdr_t *hdr, mca_pml_bfo_comm_t *comm,
|
||||
mca_pml_bfo_comm_proc_t *proc)
|
||||
{
|
||||
mca_pml_bfo_recv_request_t *specific_recv, *wild_recv;
|
||||
mca_pml_sequence_t wild_recv_seq, specific_recv_seq;
|
||||
int tag = hdr->hdr_tag;
|
||||
|
||||
specific_recv = get_posted_recv(&proc->specific_receives);
|
||||
wild_recv = get_posted_recv(&comm->wild_receives);
|
||||
|
||||
wild_recv_seq = wild_recv ?
|
||||
wild_recv->req_recv.req_base.req_sequence : PML_MAX_SEQ;
|
||||
specific_recv_seq = specific_recv ?
|
||||
specific_recv->req_recv.req_base.req_sequence : PML_MAX_SEQ;
|
||||
|
||||
/* they are equal only if both are PML_MAX_SEQ */
|
||||
while(wild_recv_seq != specific_recv_seq) {
|
||||
mca_pml_bfo_recv_request_t **match;
|
||||
opal_list_t *queue;
|
||||
int req_tag;
|
||||
mca_pml_sequence_t *seq;
|
||||
|
||||
if (OPAL_UNLIKELY(wild_recv_seq < specific_recv_seq)) {
|
||||
match = &wild_recv;
|
||||
queue = &comm->wild_receives;
|
||||
seq = &wild_recv_seq;
|
||||
} else {
|
||||
match = &specific_recv;
|
||||
queue = &proc->specific_receives;
|
||||
seq = &specific_recv_seq;
|
||||
}
|
||||
|
||||
req_tag = (*match)->req_recv.req_base.req_tag;
|
||||
if(req_tag == tag || (req_tag == OMPI_ANY_TAG && tag >= 0)) {
|
||||
opal_list_remove_item(queue, (opal_list_item_t*)(*match));
|
||||
PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_REMOVE_FROM_POSTED_Q,
|
||||
&((*match)->req_recv.req_base), PERUSE_RECV);
|
||||
return *match;
|
||||
}
|
||||
|
||||
*match = get_next_posted_recv(queue, *match);
|
||||
*seq = (*match) ? (*match)->req_recv.req_base.req_sequence : PML_MAX_SEQ;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static mca_pml_bfo_recv_request_t*
|
||||
match_one(mca_btl_base_module_t *btl,
|
||||
mca_pml_bfo_match_hdr_t *hdr, mca_btl_base_segment_t* segments,
|
||||
size_t num_segments, ompi_communicator_t *comm_ptr,
|
||||
mca_pml_bfo_comm_proc_t *proc,
|
||||
mca_pml_bfo_recv_frag_t* frag)
|
||||
{
|
||||
mca_pml_bfo_recv_request_t *match;
|
||||
mca_pml_bfo_comm_t *comm = (mca_pml_bfo_comm_t *)comm_ptr->c_pml_comm;
|
||||
|
||||
do {
|
||||
match = match_incomming(hdr, comm, proc);
|
||||
|
||||
/* if match found, process data */
|
||||
if(OPAL_LIKELY(NULL != match)) {
|
||||
match->req_recv.req_base.req_proc = proc->ompi_proc;
|
||||
|
||||
if(OPAL_UNLIKELY(MCA_PML_REQUEST_PROBE == match->req_recv.req_base.req_type)) {
|
||||
/* complete the probe */
|
||||
mca_pml_bfo_recv_request_matched_probe(match, btl, segments,
|
||||
num_segments);
|
||||
/* attempt to match actual request */
|
||||
continue;
|
||||
} else if (MCA_PML_REQUEST_MPROBE == match->req_recv.req_base.req_type) {
|
||||
/* create a receive frag and associate it with the
|
||||
request, which is then completed so that it can be
|
||||
restarted later during mrecv */
|
||||
mca_pml_bfo_recv_frag_t *tmp;
|
||||
if(NULL == frag) {
|
||||
MCA_PML_BFO_RECV_FRAG_ALLOC(tmp);
|
||||
MCA_PML_BFO_RECV_FRAG_INIT(tmp, hdr, segments, num_segments, btl);
|
||||
} else {
|
||||
tmp = frag;
|
||||
}
|
||||
|
||||
match->req_recv.req_base.req_addr = tmp;
|
||||
mca_pml_bfo_recv_request_matched_probe(match, btl, segments,
|
||||
num_segments);
|
||||
/* this frag is already processed, so we want to break out
|
||||
of the loop and not end up back on the unexpected queue. */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_MSG_MATCH_POSTED_REQ,
|
||||
&(match->req_recv.req_base), PERUSE_RECV);
|
||||
return match;
|
||||
}
|
||||
|
||||
/* if no match found, place on unexpected queue */
|
||||
append_frag_to_list(&proc->unexpected_frags, btl, hdr, segments,
|
||||
num_segments, frag);
|
||||
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_INSERT_IN_UNEX_Q, comm_ptr,
|
||||
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
|
||||
return NULL;
|
||||
} while(true);
|
||||
}
|
||||
|
||||
static mca_pml_bfo_recv_frag_t* check_cantmatch_for_match(mca_pml_bfo_comm_proc_t *proc)
|
||||
{
|
||||
mca_pml_bfo_recv_frag_t *frag;
|
||||
|
||||
/* search the list for a fragment from the send with sequence
|
||||
* number next_msg_seq_expected
|
||||
*/
|
||||
for(frag = (mca_pml_bfo_recv_frag_t*)opal_list_get_first(&proc->frags_cant_match);
|
||||
frag != (mca_pml_bfo_recv_frag_t*)opal_list_get_end(&proc->frags_cant_match);
|
||||
frag = (mca_pml_bfo_recv_frag_t*)opal_list_get_next(frag))
|
||||
{
|
||||
mca_pml_bfo_match_hdr_t* hdr = &frag->hdr.hdr_match;
|
||||
/*
|
||||
* If the message has the next expected seq from that proc...
|
||||
*/
|
||||
if(hdr->hdr_seq != proc->expected_sequence)
|
||||
continue;
|
||||
|
||||
opal_list_remove_item(&proc->frags_cant_match, (opal_list_item_t*)frag);
|
||||
return frag;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* RCS/CTS receive side matching
|
||||
*
|
||||
* @param hdr list of parameters needed for matching
|
||||
* This list is also embeded in frag,
|
||||
* but this allows to save a memory copy when
|
||||
* a match is made in this routine. (IN)
|
||||
* @param frag pointer to receive fragment which we want
|
||||
* to match (IN/OUT). If a match is not made,
|
||||
* hdr is copied to frag.
|
||||
* @param match_made parameter indicating if we matched frag/
|
||||
* hdr (OUT)
|
||||
* @param additional_matches if a match is made with frag, we
|
||||
* may be able to match fragments that previously
|
||||
* have arrived out-of-order. If this is the
|
||||
* case, the associated fragment descriptors are
|
||||
* put on this list for further processing. (OUT)
|
||||
*
|
||||
* @return OMPI error code
|
||||
*
|
||||
* This routine is used to try and match a newly arrived message fragment
|
||||
* to pre-posted receives. The following assumptions are made
|
||||
* - fragments are received out of order
|
||||
* - for long messages, e.g. more than one fragment, a RTS/CTS algorithm
|
||||
* is used.
|
||||
* - 2nd and greater fragments include a receive descriptor pointer
|
||||
* - fragments may be dropped
|
||||
* - fragments may be corrupt
|
||||
* - this routine may be called simultaneously by more than one thread
|
||||
*/
|
||||
static int mca_pml_bfo_recv_frag_match( mca_btl_base_module_t *btl,
|
||||
mca_pml_bfo_match_hdr_t *hdr,
|
||||
mca_btl_base_segment_t* segments,
|
||||
size_t num_segments,
|
||||
int type)
|
||||
{
|
||||
/* local variables */
|
||||
uint16_t next_msg_seq_expected, frag_msg_seq;
|
||||
ompi_communicator_t *comm_ptr;
|
||||
mca_pml_bfo_recv_request_t *match = NULL;
|
||||
mca_pml_bfo_comm_t *comm;
|
||||
mca_pml_bfo_comm_proc_t *proc;
|
||||
mca_pml_bfo_recv_frag_t* frag = NULL;
|
||||
|
||||
/* communicator pointer */
|
||||
comm_ptr = ompi_comm_lookup(hdr->hdr_ctx);
|
||||
if(OPAL_UNLIKELY(NULL == comm_ptr)) {
|
||||
/* This is a special case. A message for a not yet existing
|
||||
* communicator can happens. Instead of doing a matching we
|
||||
* will temporarily add it the a pending queue in the PML.
|
||||
* Later on, when the communicator is completely instantiated,
|
||||
* this pending queue will be searched and all matching fragments
|
||||
* moved to the right communicator.
|
||||
*/
|
||||
append_frag_to_list( &mca_pml_bfo.non_existing_communicator_pending,
|
||||
btl, hdr, segments, num_segments, NULL );
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
comm = (mca_pml_bfo_comm_t *)comm_ptr->c_pml_comm;
|
||||
|
||||
/* source sequence number */
|
||||
frag_msg_seq = hdr->hdr_seq;
|
||||
proc = &comm->procs[hdr->hdr_src];
|
||||
|
||||
/**
|
||||
* We generate the MSG_ARRIVED event as soon as the PML is aware of a matching
|
||||
* fragment arrival. Independing if it is received on the correct order or not.
|
||||
* This will allow the tools to figure out if the messages are not received in the
|
||||
* correct order (if multiple network interfaces).
|
||||
*/
|
||||
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm_ptr,
|
||||
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
|
||||
|
||||
/* get next expected message sequence number - if threaded
|
||||
* run, lock to make sure that if another thread is processing
|
||||
* a frag from the same message a match is made only once.
|
||||
* Also, this prevents other posted receives (for a pair of
|
||||
* end points) from being processed, and potentially "loosing"
|
||||
* the fragment.
|
||||
*/
|
||||
OPAL_THREAD_LOCK(&comm->matching_lock);
|
||||
|
||||
#if PML_BFO
|
||||
if(OPAL_UNLIKELY(hdr->hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_RESTART)) {
|
||||
if (NULL == (match = mca_pml_bfo_get_request(hdr))) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
} else {
|
||||
#endif /* PML_BFO */
|
||||
/* get sequence number of next message that can be processed */
|
||||
next_msg_seq_expected = (uint16_t)proc->expected_sequence;
|
||||
if(OPAL_UNLIKELY(frag_msg_seq != next_msg_seq_expected))
|
||||
goto wrong_seq;
|
||||
|
||||
/*
|
||||
* This is the sequence number we were expecting,
|
||||
* so we can try matching it to already posted
|
||||
* receives.
|
||||
*/
|
||||
|
||||
out_of_order_match:
|
||||
/* We're now expecting the next sequence number. */
|
||||
proc->expected_sequence++;
|
||||
|
||||
/**
|
||||
* We generate the SEARCH_POSTED_QUEUE only when the message is received
|
||||
* in the correct sequence. Otherwise, we delay the event generation until
|
||||
* we reach the correct sequence number.
|
||||
*/
|
||||
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_BEGIN, comm_ptr,
|
||||
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
|
||||
|
||||
match = match_one(btl, hdr, segments, num_segments, comm_ptr, proc, frag);
|
||||
|
||||
/**
|
||||
* The match is over. We generate the SEARCH_POSTED_Q_END here, before going
|
||||
* into the mca_pml_bfo_check_cantmatch_for_match so we can make a difference
|
||||
* for the searching time for all messages.
|
||||
*/
|
||||
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_END, comm_ptr,
|
||||
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
|
||||
|
||||
/* release matching lock before processing fragment */
|
||||
OPAL_THREAD_UNLOCK(&comm->matching_lock);
|
||||
|
||||
#if PML_BFO
|
||||
}
|
||||
#endif /* PML_BFO */
|
||||
if(OPAL_LIKELY(match)) {
|
||||
switch(type) {
|
||||
case MCA_PML_BFO_HDR_TYPE_MATCH:
|
||||
mca_pml_bfo_recv_request_progress_match(match, btl, segments, num_segments);
|
||||
break;
|
||||
case MCA_PML_BFO_HDR_TYPE_RNDV:
|
||||
mca_pml_bfo_recv_request_progress_rndv(match, btl, segments, num_segments);
|
||||
break;
|
||||
case MCA_PML_BFO_HDR_TYPE_RGET:
|
||||
mca_pml_bfo_recv_request_progress_rget(match, btl, segments, num_segments);
|
||||
break;
|
||||
}
|
||||
|
||||
if(OPAL_UNLIKELY(frag))
|
||||
MCA_PML_BFO_RECV_FRAG_RETURN(frag);
|
||||
}
|
||||
|
||||
/*
|
||||
* Now that new message has arrived, check to see if
|
||||
* any fragments on the c_c_frags_cant_match list
|
||||
* may now be used to form new matchs
|
||||
*/
|
||||
if(OPAL_UNLIKELY(opal_list_get_size(&proc->frags_cant_match) > 0)) {
|
||||
OPAL_THREAD_LOCK(&comm->matching_lock);
|
||||
if((frag = check_cantmatch_for_match(proc))) {
|
||||
hdr = &frag->hdr.hdr_match;
|
||||
segments = frag->segments;
|
||||
num_segments = frag->num_segments;
|
||||
btl = frag->btl;
|
||||
type = hdr->hdr_common.hdr_type;
|
||||
goto out_of_order_match;
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&comm->matching_lock);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
wrong_seq:
|
||||
/*
|
||||
* This message comes after the next expected, so it
|
||||
* is ahead of sequence. Save it for later.
|
||||
*/
|
||||
#if PML_BFO
|
||||
if (true == mca_pml_bfo_is_duplicate_msg(proc, hdr)) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
#endif /* PML_BFO */
|
||||
append_frag_to_list(&proc->frags_cant_match, btl, hdr, segments,
|
||||
num_segments, NULL);
|
||||
OPAL_THREAD_UNLOCK(&comm->matching_lock);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
@ -1,172 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2013 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*/
|
||||
|
||||
#ifndef MCA_PML_BFO_RECVFRAG_H
|
||||
#define MCA_PML_BFO_RECVFRAG_H
|
||||
|
||||
#include "pml_bfo_hdr.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
struct mca_pml_bfo_buffer_t {
|
||||
size_t len;
|
||||
void * addr;
|
||||
};
|
||||
typedef struct mca_pml_bfo_buffer_t mca_pml_bfo_buffer_t;
|
||||
|
||||
|
||||
struct mca_pml_bfo_recv_frag_t {
|
||||
opal_free_list_item_t super;
|
||||
mca_pml_bfo_hdr_t hdr;
|
||||
size_t num_segments;
|
||||
mca_btl_base_module_t* btl;
|
||||
mca_btl_base_segment_t segments[MCA_BTL_DES_MAX_SEGMENTS];
|
||||
mca_pml_bfo_buffer_t buffers[MCA_BTL_DES_MAX_SEGMENTS];
|
||||
unsigned char addr[1];
|
||||
};
|
||||
typedef struct mca_pml_bfo_recv_frag_t mca_pml_bfo_recv_frag_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_pml_bfo_recv_frag_t);
|
||||
|
||||
|
||||
#define MCA_PML_BFO_RECV_FRAG_ALLOC(frag) \
|
||||
do { \
|
||||
opal_free_list_item_t* item; \
|
||||
OPAL_FREE_LIST_WAIT_MT(&mca_pml_bfo.recv_frags, item); \
|
||||
frag = (mca_pml_bfo_recv_frag_t*)item; \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define MCA_PML_BFO_RECV_FRAG_INIT(frag, hdr, segs, cnt, btl ) \
|
||||
do { \
|
||||
size_t i, _size; \
|
||||
mca_btl_base_segment_t* macro_segments = frag->segments; \
|
||||
mca_pml_bfo_buffer_t* buffers = frag->buffers; \
|
||||
unsigned char* _ptr = (unsigned char*)frag->addr; \
|
||||
/* init recv_frag */ \
|
||||
frag->btl = btl; \
|
||||
frag->hdr = *(mca_pml_bfo_hdr_t*)hdr; \
|
||||
frag->num_segments = 1; \
|
||||
_size = segs[0].seg_len; \
|
||||
for( i = 1; i < cnt; i++ ) { \
|
||||
_size += segs[i].seg_len; \
|
||||
} \
|
||||
/* copy over data */ \
|
||||
if(_size <= mca_pml_bfo.unexpected_limit ) { \
|
||||
macro_segments[0].seg_addr.pval = frag->addr; \
|
||||
} else { \
|
||||
buffers[0].len = _size; \
|
||||
buffers[0].addr = (char*) \
|
||||
mca_pml_bfo.allocator->alc_alloc( mca_pml_bfo.allocator, \
|
||||
buffers[0].len, \
|
||||
0, NULL); \
|
||||
_ptr = (unsigned char*)(buffers[0].addr); \
|
||||
macro_segments[0].seg_addr.pval = buffers[0].addr; \
|
||||
} \
|
||||
macro_segments[0].seg_len = _size; \
|
||||
for( i = 0; i < cnt; i++ ) { \
|
||||
memcpy( _ptr, segs[i].seg_addr.pval, segs[i].seg_len); \
|
||||
_ptr += segs[i].seg_len; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define MCA_PML_BFO_RECV_FRAG_RETURN(frag) \
|
||||
do { \
|
||||
if( frag->segments[0].seg_len > mca_pml_bfo.unexpected_limit ) { \
|
||||
/* return buffers */ \
|
||||
mca_pml_bfo.allocator->alc_free( mca_pml_bfo.allocator, \
|
||||
frag->buffers[0].addr ); \
|
||||
} \
|
||||
frag->num_segments = 0; \
|
||||
\
|
||||
/* return recv_frag */ \
|
||||
OPAL_FREE_LIST_RETURN(&mca_pml_bfo.recv_frags, \
|
||||
(opal_free_list_item_t*)frag); \
|
||||
} while(0)
|
||||
|
||||
|
||||
/**
|
||||
* Callback from BTL on receipt of a recv_frag (match).
|
||||
*/
|
||||
|
||||
extern void mca_pml_bfo_recv_frag_callback_match( mca_btl_base_module_t *btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* descriptor,
|
||||
void* cbdata );
|
||||
|
||||
/**
|
||||
* Callback from BTL on receipt of a recv_frag (rndv).
|
||||
*/
|
||||
|
||||
extern void mca_pml_bfo_recv_frag_callback_rndv( mca_btl_base_module_t *btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* descriptor,
|
||||
void* cbdata );
|
||||
/**
|
||||
* Callback from BTL on receipt of a recv_frag (rget).
|
||||
*/
|
||||
|
||||
extern void mca_pml_bfo_recv_frag_callback_rget( mca_btl_base_module_t *btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* descriptor,
|
||||
void* cbdata );
|
||||
|
||||
/**
|
||||
* Callback from BTL on receipt of a recv_frag (ack).
|
||||
*/
|
||||
|
||||
extern void mca_pml_bfo_recv_frag_callback_ack( mca_btl_base_module_t *btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* descriptor,
|
||||
void* cbdata );
|
||||
/**
|
||||
* Callback from BTL on receipt of a recv_frag (frag).
|
||||
*/
|
||||
|
||||
extern void mca_pml_bfo_recv_frag_callback_frag( mca_btl_base_module_t *btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* descriptor,
|
||||
void* cbdata );
|
||||
/**
|
||||
* Callback from BTL on receipt of a recv_frag (put).
|
||||
*/
|
||||
|
||||
extern void mca_pml_bfo_recv_frag_callback_put( mca_btl_base_module_t *btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* descriptor,
|
||||
void* cbdata );
|
||||
/**
|
||||
* Callback from BTL on receipt of a recv_frag (fin).
|
||||
*/
|
||||
|
||||
extern void mca_pml_bfo_recv_frag_callback_fin( mca_btl_base_module_t *btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* descriptor,
|
||||
void* cbdata );
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
||||
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -1,449 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2016 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*/
|
||||
#ifndef OMPI_PML_BFO_RECV_REQUEST_H
|
||||
#define OMPI_PML_BFO_RECV_REQUEST_H
|
||||
|
||||
#include "pml_bfo.h"
|
||||
#include "pml_bfo_rdma.h"
|
||||
#include "pml_bfo_rdmafrag.h"
|
||||
#include "ompi/proc/proc.h"
|
||||
#include "ompi/mca/pml/bfo/pml_bfo_comm.h"
|
||||
#include "opal/mca/mpool/base/base.h"
|
||||
#include "ompi/mca/pml/base/pml_base_recvreq.h"
|
||||
#if PML_BFO
|
||||
#define RECVREQ_RECVERRSENT 0x01
|
||||
#define RECVREQ_RNDVRESTART_RECVED 0x02
|
||||
#define RECVREQ_RNDVRESTART_ACKED 0x04
|
||||
#endif /* PML_BFO */
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
struct mca_pml_bfo_recv_request_t {
|
||||
mca_pml_base_recv_request_t req_recv;
|
||||
opal_ptr_t remote_req_send;
|
||||
#if PML_BFO
|
||||
int32_t req_msgseq; /* PML sequence number */
|
||||
int32_t req_events; /* number of outstanding events on request */
|
||||
int32_t req_restartseq; /* sequence number of restarted request */
|
||||
int32_t req_errstate; /* state of request if in error */
|
||||
#endif /* PML_BFO */
|
||||
int32_t req_lock;
|
||||
size_t req_pipeline_depth;
|
||||
size_t req_bytes_received; /**< amount of data transferred into the user buffer */
|
||||
size_t req_bytes_expected; /**< local size of the data as suggested by the user */
|
||||
size_t req_rdma_offset;
|
||||
size_t req_send_offset;
|
||||
uint32_t req_rdma_cnt;
|
||||
uint32_t req_rdma_idx;
|
||||
bool req_pending;
|
||||
bool req_ack_sent; /**< whether ack was sent to the sender */
|
||||
bool req_match_received; /**< Prevent request to be completed prematurely */
|
||||
opal_mutex_t lock;
|
||||
mca_pml_bfo_com_btl_t req_rdma[1];
|
||||
};
|
||||
typedef struct mca_pml_bfo_recv_request_t mca_pml_bfo_recv_request_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_pml_bfo_recv_request_t);
|
||||
|
||||
static inline bool lock_recv_request(mca_pml_bfo_recv_request_t *recvreq)
|
||||
{
|
||||
return OPAL_THREAD_ADD_FETCH32(&recvreq->req_lock, 1) == 1;
|
||||
}
|
||||
|
||||
static inline bool unlock_recv_request(mca_pml_bfo_recv_request_t *recvreq)
|
||||
{
|
||||
return OPAL_THREAD_ADD_FETCH32(&recvreq->req_lock, -1) == 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Allocate a recv request from the modules free list.
|
||||
*
|
||||
* @param rc (OUT) OMPI_SUCCESS or error status on failure.
|
||||
* @return Receive request.
|
||||
*/
|
||||
#define MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq) \
|
||||
do { \
|
||||
ompi_free_list_item_t* item; \
|
||||
OMPI_FREE_LIST_GET_MT(&mca_pml_base_recv_requests, item); \
|
||||
recvreq = (mca_pml_bfo_recv_request_t*)item; \
|
||||
} while(0)
|
||||
|
||||
|
||||
/**
|
||||
* Initialize a receive request with call parameters.
|
||||
*
|
||||
* @param request (IN) Receive request.
|
||||
* @param addr (IN) User buffer.
|
||||
* @param count (IN) Number of elements of indicated datatype.
|
||||
* @param datatype (IN) User defined datatype.
|
||||
* @param src (IN) Source rank w/in the communicator.
|
||||
* @param tag (IN) User defined tag.
|
||||
* @param comm (IN) Communicator.
|
||||
* @param persistent (IN) Is this a ersistent request.
|
||||
*/
|
||||
#define MCA_PML_BFO_RECV_REQUEST_INIT( request, \
|
||||
addr, \
|
||||
count, \
|
||||
datatype, \
|
||||
src, \
|
||||
tag, \
|
||||
comm, \
|
||||
persistent) \
|
||||
do { \
|
||||
MCA_PML_BASE_RECV_REQUEST_INIT( &(request)->req_recv, \
|
||||
addr, \
|
||||
count, \
|
||||
datatype, \
|
||||
src, \
|
||||
tag, \
|
||||
comm, \
|
||||
persistent); \
|
||||
} while(0)
|
||||
|
||||
/**
|
||||
* Mark the request as completed at MPI level for internal purposes.
|
||||
*
|
||||
* @param recvreq (IN) Receive request.
|
||||
*/
|
||||
#define MCA_PML_BFO_RECV_REQUEST_MPI_COMPLETE( recvreq ) \
|
||||
do { \
|
||||
PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE, \
|
||||
&(recvreq->req_recv.req_base), PERUSE_RECV ); \
|
||||
ompi_request_complete( &(recvreq->req_recv.req_base.req_ompi), true ); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
* Free the PML receive request
|
||||
*/
|
||||
#define MCA_PML_BFO_RECV_REQUEST_RETURN(recvreq) \
|
||||
{ \
|
||||
MCA_PML_BASE_RECV_REQUEST_FINI(&(recvreq)->req_recv); \
|
||||
OPAL_FREE_LIST_RETURN( &mca_pml_base_recv_requests, \
|
||||
(opal_free_list_item_t*)(recvreq)); \
|
||||
}
|
||||
|
||||
/**
|
||||
* Complete receive request. Request structure cannot be accessed after calling
|
||||
* this function any more.
|
||||
*
|
||||
* @param recvreq (IN) Receive request.
|
||||
*/
|
||||
static inline void
|
||||
recv_request_pml_complete(mca_pml_bfo_recv_request_t *recvreq)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
if(false == recvreq->req_recv.req_base.req_pml_complete) {
|
||||
|
||||
if(recvreq->req_recv.req_bytes_packed > 0) {
|
||||
PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END,
|
||||
&recvreq->req_recv.req_base, PERUSE_RECV );
|
||||
}
|
||||
|
||||
for(i = 0; i < recvreq->req_rdma_cnt; i++) {
|
||||
mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[i].btl_reg;
|
||||
if( NULL != btl_reg && btl_reg->mpool != NULL) {
|
||||
btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg );
|
||||
}
|
||||
}
|
||||
recvreq->req_rdma_cnt = 0;
|
||||
#if PML_BFO
|
||||
recvreq->req_msgseq -= 100;
|
||||
#endif /* PML_BFO */
|
||||
|
||||
if(true == recvreq->req_recv.req_base.req_free_called) {
|
||||
if( MPI_SUCCESS != recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR ) {
|
||||
ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_REQUEST);
|
||||
}
|
||||
MCA_PML_BFO_RECV_REQUEST_RETURN(recvreq);
|
||||
} else {
|
||||
/* initialize request status */
|
||||
recvreq->req_recv.req_base.req_pml_complete = true;
|
||||
recvreq->req_recv.req_base.req_ompi.req_status._ucount =
|
||||
recvreq->req_bytes_received;
|
||||
if (recvreq->req_recv.req_bytes_packed > recvreq->req_bytes_expected) {
|
||||
recvreq->req_recv.req_base.req_ompi.req_status._ucount =
|
||||
recvreq->req_recv.req_bytes_packed;
|
||||
recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR =
|
||||
MPI_ERR_TRUNCATE;
|
||||
}
|
||||
MCA_PML_BFO_RECV_REQUEST_MPI_COMPLETE(recvreq);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool
|
||||
recv_request_pml_complete_check(mca_pml_bfo_recv_request_t *recvreq)
|
||||
{
|
||||
#if OPAL_ENABLE_MULTI_THREADS
|
||||
opal_atomic_rmb();
|
||||
#endif
|
||||
if(recvreq->req_match_received &&
|
||||
recvreq->req_bytes_received >= recvreq->req_recv.req_bytes_packed &&
|
||||
#if PML_BFO
|
||||
(0 == recvreq->req_events) && lock_recv_request(recvreq)) {
|
||||
#else /* PML_BFO */
|
||||
lock_recv_request(recvreq)) {
|
||||
#endif /* PML_BFO */
|
||||
recv_request_pml_complete(recvreq);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
extern void mca_pml_bfo_recv_req_start(mca_pml_bfo_recv_request_t *req);
|
||||
#define MCA_PML_BFO_RECV_REQUEST_START(r) mca_pml_bfo_recv_req_start(r)
|
||||
|
||||
static inline void prepare_recv_req_converter(mca_pml_bfo_recv_request_t *req)
|
||||
{
|
||||
if( req->req_recv.req_base.req_datatype->super.size | req->req_recv.req_base.req_count ) {
|
||||
opal_convertor_copy_and_prepare_for_recv(
|
||||
req->req_recv.req_base.req_proc->super.proc_convertor,
|
||||
&(req->req_recv.req_base.req_datatype->super),
|
||||
req->req_recv.req_base.req_count,
|
||||
req->req_recv.req_base.req_addr,
|
||||
0,
|
||||
&req->req_recv.req_base.req_convertor);
|
||||
opal_convertor_get_unpacked_size(&req->req_recv.req_base.req_convertor,
|
||||
&req->req_bytes_expected);
|
||||
}
|
||||
}
|
||||
|
||||
#define MCA_PML_BFO_RECV_REQUEST_MATCHED(request, hdr) \
|
||||
recv_req_matched(request, hdr)
|
||||
|
||||
static inline void recv_req_matched(mca_pml_bfo_recv_request_t *req,
|
||||
mca_pml_bfo_match_hdr_t *hdr)
|
||||
{
|
||||
req->req_recv.req_base.req_ompi.req_status.MPI_SOURCE = hdr->hdr_src;
|
||||
req->req_recv.req_base.req_ompi.req_status.MPI_TAG = hdr->hdr_tag;
|
||||
req->req_match_received = true;
|
||||
#if PML_BFO
|
||||
req->req_msgseq = hdr->hdr_seq;
|
||||
#endif /* PML_BFO */
|
||||
#if OPAL_ENABLE_MULTI_THREADS
|
||||
opal_atomic_wmb();
|
||||
#endif
|
||||
if(req->req_recv.req_bytes_packed > 0) {
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
if(MPI_ANY_SOURCE == req->req_recv.req_base.req_peer) {
|
||||
/* non wildcard prepared during post recv */
|
||||
prepare_recv_req_converter(req);
|
||||
}
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT */
|
||||
PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_XFER_BEGIN,
|
||||
&req->req_recv.req_base, PERUSE_RECV);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
|
||||
#define MCA_PML_BFO_RECV_REQUEST_UNPACK( request, \
|
||||
segments, \
|
||||
num_segments, \
|
||||
seg_offset, \
|
||||
data_offset, \
|
||||
bytes_received, \
|
||||
bytes_delivered) \
|
||||
do { \
|
||||
bytes_delivered = 0; \
|
||||
if(request->req_recv.req_bytes_packed > 0) { \
|
||||
struct iovec iov[MCA_BTL_DES_MAX_SEGMENTS]; \
|
||||
uint32_t iov_count = 0; \
|
||||
size_t max_data = bytes_received; \
|
||||
size_t n, offset = seg_offset; \
|
||||
mca_btl_base_segment_t* segment = segments; \
|
||||
\
|
||||
OPAL_THREAD_LOCK(&request->lock); \
|
||||
for( n = 0; n < num_segments; n++, segment++ ) { \
|
||||
if(offset >= segment->seg_len) { \
|
||||
offset -= segment->seg_len; \
|
||||
} else { \
|
||||
iov[iov_count].iov_len = segment->seg_len - offset; \
|
||||
iov[iov_count].iov_base = (IOVBASE_TYPE*) \
|
||||
((unsigned char*)segment->seg_addr.pval + offset); \
|
||||
iov_count++; \
|
||||
offset = 0; \
|
||||
} \
|
||||
} \
|
||||
PERUSE_TRACE_COMM_OMPI_EVENT (PERUSE_COMM_REQ_XFER_CONTINUE, \
|
||||
&(recvreq->req_recv.req_base), max_data, \
|
||||
PERUSE_RECV); \
|
||||
opal_convertor_set_position( &(request->req_recv.req_base.req_convertor), \
|
||||
&data_offset ); \
|
||||
opal_convertor_unpack( &(request)->req_recv.req_base.req_convertor, \
|
||||
iov, \
|
||||
&iov_count, \
|
||||
&max_data ); \
|
||||
bytes_delivered = max_data; \
|
||||
OPAL_THREAD_UNLOCK(&request->lock); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
|
||||
void mca_pml_bfo_recv_request_progress_match(
|
||||
mca_pml_bfo_recv_request_t* req,
|
||||
struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_segment_t* segments,
|
||||
size_t num_segments);
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
|
||||
void mca_pml_bfo_recv_request_progress_frag(
|
||||
mca_pml_bfo_recv_request_t* req,
|
||||
struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_segment_t* segments,
|
||||
size_t num_segments);
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
|
||||
void mca_pml_bfo_recv_request_progress_rndv(
|
||||
mca_pml_bfo_recv_request_t* req,
|
||||
struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_segment_t* segments,
|
||||
size_t num_segments);
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
|
||||
void mca_pml_bfo_recv_request_progress_rget(
|
||||
mca_pml_bfo_recv_request_t* req,
|
||||
struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_segment_t* segments,
|
||||
size_t num_segments);
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
|
||||
void mca_pml_bfo_recv_request_matched_probe(
|
||||
mca_pml_bfo_recv_request_t* req,
|
||||
struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_segment_t* segments,
|
||||
size_t num_segments);
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
|
||||
int mca_pml_bfo_recv_request_schedule_once(
|
||||
mca_pml_bfo_recv_request_t* req, mca_bml_base_btl_t* start_bml_btl);
|
||||
|
||||
static inline int mca_pml_bfo_recv_request_schedule_exclusive(
|
||||
mca_pml_bfo_recv_request_t* req,
|
||||
mca_bml_base_btl_t* start_bml_btl)
|
||||
{
|
||||
int rc;
|
||||
|
||||
do {
|
||||
rc = mca_pml_bfo_recv_request_schedule_once(req, start_bml_btl);
|
||||
if(rc == OMPI_ERR_OUT_OF_RESOURCE)
|
||||
break;
|
||||
} while(!unlock_recv_request(req));
|
||||
|
||||
if(OMPI_SUCCESS == rc)
|
||||
recv_request_pml_complete_check(req);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
static inline void mca_pml_bfo_recv_request_schedule(
|
||||
mca_pml_bfo_recv_request_t* req,
|
||||
mca_bml_base_btl_t* start_bml_btl)
|
||||
{
|
||||
if(!lock_recv_request(req))
|
||||
return;
|
||||
|
||||
(void)mca_pml_bfo_recv_request_schedule_exclusive(req, start_bml_btl);
|
||||
}
|
||||
|
||||
#define MCA_PML_BFO_ADD_ACK_TO_PENDING(P, S, D, O) \
|
||||
do { \
|
||||
mca_pml_bfo_pckt_pending_t *_pckt; \
|
||||
\
|
||||
MCA_PML_BFO_PCKT_PENDING_ALLOC(_pckt); \
|
||||
_pckt->hdr.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_ACK; \
|
||||
_pckt->hdr.hdr_ack.hdr_src_req.lval = (S); \
|
||||
_pckt->hdr.hdr_ack.hdr_dst_req.pval = (D); \
|
||||
_pckt->hdr.hdr_ack.hdr_send_offset = (O); \
|
||||
_pckt->proc = (P); \
|
||||
_pckt->bml_btl = NULL; \
|
||||
OPAL_THREAD_LOCK(&mca_pml_bfo.lock); \
|
||||
opal_list_append(&mca_pml_bfo.pckt_pending, \
|
||||
(opal_list_item_t*)_pckt); \
|
||||
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); \
|
||||
} while(0)
|
||||
|
||||
int mca_pml_bfo_recv_request_ack_send_btl(ompi_proc_t* proc,
|
||||
mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req,
|
||||
uint64_t hdr_rdma_offset, bool nordma);
|
||||
|
||||
static inline int mca_pml_bfo_recv_request_ack_send(ompi_proc_t* proc,
|
||||
uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset,
|
||||
bool nordma)
|
||||
{
|
||||
size_t i;
|
||||
mca_bml_base_btl_t* bml_btl;
|
||||
mca_bml_base_endpoint_t* endpoint =
|
||||
(mca_bml_base_endpoint_t*)proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
|
||||
|
||||
for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
|
||||
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
|
||||
if(mca_pml_bfo_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req,
|
||||
hdr_dst_req, hdr_send_offset, nordma) == OMPI_SUCCESS)
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
MCA_PML_BFO_ADD_ACK_TO_PENDING(proc, hdr_src_req, hdr_dst_req,
|
||||
hdr_send_offset);
|
||||
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
int mca_pml_bfo_recv_request_get_frag(mca_pml_bfo_rdma_frag_t* frag);
|
||||
|
||||
/* This function tries to continue recvreq that stuck due to resource
|
||||
* unavailability. Recvreq is added to recv_pending list if scheduling of put
|
||||
* operation cannot be accomplished for some reason. */
|
||||
void mca_pml_bfo_recv_request_process_pending(void);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
||||
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -1,499 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2016 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef OMPI_PML_BFO_SEND_REQUEST_H
|
||||
#define OMPI_PML_BFO_SEND_REQUEST_H
|
||||
|
||||
#include "opal/mca/btl/btl.h"
|
||||
#include "opal/mca/mpool/base/base.h"
|
||||
#include "ompi/mca/pml/base/pml_base_sendreq.h"
|
||||
#include "pml_bfo_comm.h"
|
||||
#include "pml_bfo_hdr.h"
|
||||
#include "pml_bfo_rdma.h"
|
||||
#include "pml_bfo_rdmafrag.h"
|
||||
#include "ompi/mca/bml/bml.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
typedef enum {
|
||||
MCA_PML_BFO_SEND_PENDING_NONE,
|
||||
MCA_PML_BFO_SEND_PENDING_SCHEDULE,
|
||||
MCA_PML_BFO_SEND_PENDING_START
|
||||
} mca_pml_bfo_send_pending_t;
|
||||
|
||||
struct mca_pml_bfo_send_request_t {
|
||||
mca_pml_base_send_request_t req_send;
|
||||
mca_bml_base_endpoint_t* req_endpoint;
|
||||
opal_ptr_t req_recv;
|
||||
#if PML_BFO
|
||||
int32_t req_events; /* number of outstanding events on request */
|
||||
int32_t req_restartseq; /* sequence number of restarted request */
|
||||
int32_t req_restart; /* state of restarted request */
|
||||
int32_t req_error; /* non-zero when error has occurred on request */
|
||||
#endif /* PML_BFO */
|
||||
int32_t req_state;
|
||||
int32_t req_lock;
|
||||
bool req_throttle_sends;
|
||||
size_t req_pipeline_depth;
|
||||
size_t req_bytes_delivered;
|
||||
uint32_t req_rdma_cnt;
|
||||
mca_pml_bfo_send_pending_t req_pending;
|
||||
opal_mutex_t req_send_range_lock;
|
||||
opal_list_t req_send_ranges;
|
||||
mca_pml_bfo_com_btl_t req_rdma[1];
|
||||
};
|
||||
typedef struct mca_pml_bfo_send_request_t mca_pml_bfo_send_request_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_pml_bfo_send_request_t);
|
||||
|
||||
struct mca_pml_bfo_send_range_t {
|
||||
opal_free_list_item_t base;
|
||||
uint64_t range_send_offset;
|
||||
uint64_t range_send_length;
|
||||
int range_btl_idx;
|
||||
int range_btl_cnt;
|
||||
mca_pml_bfo_com_btl_t range_btls[1];
|
||||
};
|
||||
typedef struct mca_pml_bfo_send_range_t mca_pml_bfo_send_range_t;
|
||||
OBJ_CLASS_DECLARATION(mca_pml_bfo_send_range_t);
|
||||
|
||||
static inline bool lock_send_request(mca_pml_bfo_send_request_t *sendreq)
|
||||
{
|
||||
return OPAL_THREAD_ADD_FETCH32(&sendreq->req_lock, 1) == 1;
|
||||
}
|
||||
|
||||
static inline bool unlock_send_request(mca_pml_bfo_send_request_t *sendreq)
|
||||
{
|
||||
return OPAL_THREAD_ADD_FETCH32(&sendreq->req_lock, -1) == 0;
|
||||
}
|
||||
|
||||
static inline void
|
||||
add_request_to_send_pending(mca_pml_bfo_send_request_t* sendreq,
|
||||
const mca_pml_bfo_send_pending_t type,
|
||||
const bool append)
|
||||
{
|
||||
opal_list_item_t *item = (opal_list_item_t*)sendreq;
|
||||
|
||||
OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
|
||||
sendreq->req_pending = type;
|
||||
if(append)
|
||||
opal_list_append(&mca_pml_bfo.send_pending, item);
|
||||
else
|
||||
opal_list_prepend(&mca_pml_bfo.send_pending, item);
|
||||
|
||||
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
|
||||
}
|
||||
|
||||
static inline mca_pml_bfo_send_request_t*
|
||||
get_request_from_send_pending(mca_pml_bfo_send_pending_t *type)
|
||||
{
|
||||
mca_pml_bfo_send_request_t *sendreq;
|
||||
|
||||
OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
|
||||
sendreq = (mca_pml_bfo_send_request_t*)
|
||||
opal_list_remove_first(&mca_pml_bfo.send_pending);
|
||||
if(sendreq) {
|
||||
*type = sendreq->req_pending;
|
||||
sendreq->req_pending = MCA_PML_BFO_SEND_PENDING_NONE;
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
|
||||
|
||||
return sendreq;
|
||||
}
|
||||
|
||||
#define MCA_PML_BFO_SEND_REQUEST_ALLOC( comm, \
|
||||
dst, \
|
||||
sendreq) \
|
||||
{ \
|
||||
ompi_proc_t *proc = ompi_comm_peer_lookup( comm, dst ); \
|
||||
opal_free_list_item_t* item; \
|
||||
\
|
||||
sendreq = NULL; \
|
||||
if( OPAL_LIKELY(NULL != proc) ) { \
|
||||
OPAL_FREE_LIST_WAIT_MT(&mca_pml_base_send_requests, item); \
|
||||
sendreq = (mca_pml_bfo_send_request_t*)item; \
|
||||
sendreq->req_send.req_base.req_proc = proc; \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
#define MCA_PML_BFO_SEND_REQUEST_INIT( sendreq, \
|
||||
buf, \
|
||||
count, \
|
||||
datatype, \
|
||||
dst, \
|
||||
tag, \
|
||||
comm, \
|
||||
sendmode, \
|
||||
persistent) \
|
||||
{ \
|
||||
MCA_PML_BASE_SEND_REQUEST_INIT(&sendreq->req_send, \
|
||||
buf, \
|
||||
count, \
|
||||
datatype, \
|
||||
dst, \
|
||||
tag, \
|
||||
comm, \
|
||||
sendmode, \
|
||||
persistent, \
|
||||
0); /* convertor_flags */ \
|
||||
(sendreq)->req_recv.pval = NULL; \
|
||||
}
|
||||
|
||||
|
||||
static inline void mca_pml_bfo_free_rdma_resources(mca_pml_bfo_send_request_t* sendreq)
|
||||
{
|
||||
size_t r;
|
||||
|
||||
/* return mpool resources */
|
||||
for(r = 0; r < sendreq->req_rdma_cnt; r++) {
|
||||
struct mca_btl_base_registration_handle_t* handle = sendreq->req_rdma[r].btl_reg;
|
||||
mca_bml_base_btl_t *bml_btl = sendreq->req_rdma[r].bml_btl;
|
||||
|
||||
if( NULL != handle ) {
|
||||
mca_bml_base_deregister_mem (bml_btl, handle);
|
||||
sendreq->req_rdma[r].btl_reg = NULL;
|
||||
}
|
||||
}
|
||||
sendreq->req_rdma_cnt = 0;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Start a send request.
|
||||
*/
|
||||
|
||||
#define MCA_PML_BFO_SEND_REQUEST_START(sendreq, rc) \
|
||||
do { \
|
||||
rc = mca_pml_bfo_send_request_start(sendreq); \
|
||||
} while (0)
|
||||
|
||||
|
||||
/*
|
||||
* Mark a send request as completed at the MPI level.
|
||||
*/
|
||||
|
||||
#define MCA_PML_BFO_SEND_REQUEST_MPI_COMPLETE(sendreq, with_signal) \
|
||||
do { \
|
||||
(sendreq)->req_send.req_base.req_ompi.req_status.MPI_SOURCE = \
|
||||
(sendreq)->req_send.req_base.req_comm->c_my_rank; \
|
||||
(sendreq)->req_send.req_base.req_ompi.req_status.MPI_TAG = \
|
||||
(sendreq)->req_send.req_base.req_tag; \
|
||||
(sendreq)->req_send.req_base.req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS; \
|
||||
(sendreq)->req_send.req_base.req_ompi.req_status._ucount = \
|
||||
(sendreq)->req_send.req_bytes_packed; \
|
||||
PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE, \
|
||||
&(sendreq->req_send.req_base), PERUSE_SEND); \
|
||||
\
|
||||
ompi_request_complete( &((sendreq)->req_send.req_base.req_ompi), (with_signal) ); \
|
||||
} while(0)
|
||||
|
||||
/*
|
||||
* Release resources associated with a request
|
||||
*/
|
||||
|
||||
#define MCA_PML_BFO_SEND_REQUEST_RETURN(sendreq) \
|
||||
do { \
|
||||
MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send)); \
|
||||
OPAL_FREE_LIST_RETURN( &mca_pml_base_send_requests, \
|
||||
(opal_free_list_item_t*)sendreq); \
|
||||
} while(0)
|
||||
|
||||
|
||||
/*
|
||||
* The PML has completed a send request. Note that this request
|
||||
* may have been orphaned by the user or have already completed
|
||||
* at the MPI level.
|
||||
* This function will never be called directly from the upper level, as it
|
||||
* should only be an internal call to the PML.
|
||||
*
|
||||
*/
|
||||
static inline void
|
||||
send_request_pml_complete(mca_pml_bfo_send_request_t *sendreq)
|
||||
{
|
||||
if(false == sendreq->req_send.req_base.req_pml_complete) {
|
||||
if(sendreq->req_send.req_bytes_packed > 0) {
|
||||
PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END,
|
||||
&(sendreq->req_send.req_base), PERUSE_SEND);
|
||||
}
|
||||
|
||||
/* return mpool resources */
|
||||
mca_pml_bfo_free_rdma_resources(sendreq);
|
||||
|
||||
if (sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED &&
|
||||
sendreq->req_send.req_addr != sendreq->req_send.req_base.req_addr) {
|
||||
mca_pml_base_bsend_request_fini((ompi_request_t*)sendreq);
|
||||
}
|
||||
|
||||
sendreq->req_send.req_base.req_pml_complete = true;
|
||||
|
||||
if( !REQUEST_COMPLETE( &((sendreq->req_send).req_base.req_ompi)) ) {
|
||||
/* Should only be called for long messages (maybe synchronous) */
|
||||
MCA_PML_BFO_SEND_REQUEST_MPI_COMPLETE(sendreq, true);
|
||||
} else {
|
||||
if( MPI_SUCCESS != sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR ) {
|
||||
ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_REQUEST);
|
||||
}
|
||||
}
|
||||
#if PML_BFO
|
||||
sendreq->req_send.req_base.req_sequence -= 100;
|
||||
#endif /* PML_BFO */
|
||||
|
||||
if(true == sendreq->req_send.req_base.req_free_called) {
|
||||
MCA_PML_BFO_SEND_REQUEST_RETURN(sendreq);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* returns true if request was completed on PML level */
|
||||
static inline bool
|
||||
send_request_pml_complete_check(mca_pml_bfo_send_request_t *sendreq)
|
||||
{
|
||||
#if OPAL_ENABLE_MULTI_THREADS
|
||||
opal_atomic_rmb();
|
||||
#endif
|
||||
/* if no more events are expected for the request and the whole message is
|
||||
* already sent and send fragment scheduling isn't running in another
|
||||
* thread then complete the request on PML level. From now on, if user
|
||||
* called free on this request, the request structure can be reused for
|
||||
* another request or if the request is persistent it can be restarted */
|
||||
if(sendreq->req_state == 0 &&
|
||||
sendreq->req_bytes_delivered >= sendreq->req_send.req_bytes_packed
|
||||
&& lock_send_request(sendreq)) {
|
||||
send_request_pml_complete(sendreq);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Schedule additional fragments
|
||||
*/
|
||||
int
|
||||
mca_pml_bfo_send_request_schedule_once(mca_pml_bfo_send_request_t*);
|
||||
|
||||
static inline int
|
||||
mca_pml_bfo_send_request_schedule_exclusive(mca_pml_bfo_send_request_t* sendreq)
|
||||
{
|
||||
int rc;
|
||||
do {
|
||||
rc = mca_pml_bfo_send_request_schedule_once(sendreq);
|
||||
if(rc == OMPI_ERR_OUT_OF_RESOURCE)
|
||||
break;
|
||||
} while(!unlock_send_request(sendreq));
|
||||
|
||||
if(OMPI_SUCCESS == rc)
|
||||
send_request_pml_complete_check(sendreq);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
static inline void
|
||||
mca_pml_bfo_send_request_schedule(mca_pml_bfo_send_request_t* sendreq)
|
||||
{
|
||||
/*
|
||||
* Only allow one thread in this routine for a given request.
|
||||
* However, we cannot block callers on a mutex, so simply keep track
|
||||
* of the number of times the routine has been called and run through
|
||||
* the scheduling logic once for every call.
|
||||
*/
|
||||
|
||||
if(!lock_send_request(sendreq))
|
||||
return;
|
||||
|
||||
mca_pml_bfo_send_request_schedule_exclusive(sendreq);
|
||||
}
|
||||
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
int mca_pml_bfo_send_request_start_cuda(
|
||||
mca_pml_bfo_send_request_t* sendreq,
|
||||
mca_bml_base_btl_t* bml_btl,
|
||||
size_t size);
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
|
||||
/**
|
||||
* Start the specified request
|
||||
*/
|
||||
|
||||
int mca_pml_bfo_send_request_start_buffered(
|
||||
mca_pml_bfo_send_request_t* sendreq,
|
||||
mca_bml_base_btl_t* bml_btl,
|
||||
size_t size);
|
||||
|
||||
int mca_pml_bfo_send_request_start_copy(
|
||||
mca_pml_bfo_send_request_t* sendreq,
|
||||
mca_bml_base_btl_t* bml_btl,
|
||||
size_t size);
|
||||
|
||||
int mca_pml_bfo_send_request_start_prepare(
|
||||
mca_pml_bfo_send_request_t* sendreq,
|
||||
mca_bml_base_btl_t* bml_btl,
|
||||
size_t size);
|
||||
|
||||
int mca_pml_bfo_send_request_start_rdma(
|
||||
mca_pml_bfo_send_request_t* sendreq,
|
||||
mca_bml_base_btl_t* bml_btl,
|
||||
size_t size);
|
||||
|
||||
int mca_pml_bfo_send_request_start_rndv(
|
||||
mca_pml_bfo_send_request_t* sendreq,
|
||||
mca_bml_base_btl_t* bml_btl,
|
||||
size_t size,
|
||||
int flags);
|
||||
|
||||
static inline int
|
||||
mca_pml_bfo_send_request_start_btl( mca_pml_bfo_send_request_t* sendreq,
|
||||
mca_bml_base_btl_t* bml_btl )
|
||||
{
|
||||
size_t size = sendreq->req_send.req_bytes_packed;
|
||||
mca_btl_base_module_t* btl = bml_btl->btl;
|
||||
size_t eager_limit = btl->btl_eager_limit - sizeof(mca_pml_bfo_hdr_t);
|
||||
int rc;
|
||||
|
||||
if( OPAL_LIKELY(size <= eager_limit) ) {
|
||||
switch(sendreq->req_send.req_send_mode) {
|
||||
case MCA_PML_BASE_SEND_SYNCHRONOUS:
|
||||
rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size, 0);
|
||||
break;
|
||||
case MCA_PML_BASE_SEND_BUFFERED:
|
||||
rc = mca_pml_bfo_send_request_start_copy(sendreq, bml_btl, size);
|
||||
break;
|
||||
case MCA_PML_BASE_SEND_COMPLETE:
|
||||
rc = mca_pml_bfo_send_request_start_prepare(sendreq, bml_btl, size);
|
||||
break;
|
||||
default:
|
||||
if (size != 0 && bml_btl->btl_flags & MCA_BTL_FLAGS_SEND_INPLACE) {
|
||||
rc = mca_pml_bfo_send_request_start_prepare(sendreq, bml_btl, size);
|
||||
} else {
|
||||
rc = mca_pml_bfo_send_request_start_copy(sendreq, bml_btl, size);
|
||||
}
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
size = eager_limit;
|
||||
if(OPAL_UNLIKELY(btl->btl_rndv_eager_limit < eager_limit))
|
||||
size = btl->btl_rndv_eager_limit;
|
||||
if(sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED) {
|
||||
rc = mca_pml_bfo_send_request_start_buffered(sendreq, bml_btl, size);
|
||||
} else if
|
||||
(opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {
|
||||
unsigned char *base;
|
||||
opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base );
|
||||
|
||||
if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_bfo_rdma_btls(
|
||||
sendreq->req_endpoint,
|
||||
base,
|
||||
sendreq->req_send.req_bytes_packed,
|
||||
sendreq->req_rdma))) {
|
||||
rc = mca_pml_bfo_send_request_start_rdma(sendreq, bml_btl,
|
||||
sendreq->req_send.req_bytes_packed);
|
||||
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
|
||||
mca_pml_bfo_free_rdma_resources(sendreq);
|
||||
}
|
||||
} else {
|
||||
rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size,
|
||||
MCA_PML_BFO_HDR_FLAGS_CONTIG);
|
||||
}
|
||||
} else {
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
if (sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) {
|
||||
return mca_pml_bfo_send_request_start_cuda(sendreq, bml_btl, size);
|
||||
}
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size, 0);
|
||||
}
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
static inline int
|
||||
mca_pml_bfo_send_request_start( mca_pml_bfo_send_request_t* sendreq )
|
||||
{
|
||||
mca_pml_bfo_comm_t* comm = sendreq->req_send.req_base.req_comm->c_pml_comm;
|
||||
mca_bml_base_endpoint_t* endpoint = (mca_bml_base_endpoint_t*)
|
||||
sendreq->req_send.req_base.req_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
|
||||
size_t i;
|
||||
|
||||
if( OPAL_UNLIKELY(endpoint == NULL) ) {
|
||||
return OMPI_ERR_UNREACH;
|
||||
}
|
||||
|
||||
sendreq->req_endpoint = endpoint;
|
||||
sendreq->req_state = 0;
|
||||
sendreq->req_lock = 0;
|
||||
sendreq->req_pipeline_depth = 0;
|
||||
sendreq->req_bytes_delivered = 0;
|
||||
sendreq->req_pending = MCA_PML_BFO_SEND_PENDING_NONE;
|
||||
sendreq->req_send.req_base.req_sequence = OPAL_THREAD_ADD_FETCH32(
|
||||
&comm->procs[sendreq->req_send.req_base.req_peer].send_sequence,1);
|
||||
#if PML_BFO
|
||||
sendreq->req_restartseq = 0; /* counts up restarts */
|
||||
sendreq->req_restart = 0; /* reset in case we restart again */
|
||||
sendreq->req_error = 0; /* clear error state */
|
||||
sendreq->req_events = 0; /* clear events, probably 0 anyways */
|
||||
#endif /* PML_BFO */
|
||||
|
||||
MCA_PML_BASE_SEND_START( &sendreq->req_send.req_base );
|
||||
|
||||
for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
|
||||
mca_bml_base_btl_t* bml_btl;
|
||||
int rc;
|
||||
|
||||
/* select a btl */
|
||||
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
|
||||
rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl);
|
||||
if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc) )
|
||||
return rc;
|
||||
}
|
||||
add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initiate a put scheduled by the receiver.
|
||||
*/
|
||||
|
||||
void mca_pml_bfo_send_request_put( mca_pml_bfo_send_request_t* sendreq,
|
||||
mca_btl_base_module_t* btl,
|
||||
mca_pml_bfo_rdma_hdr_t* hdr );
|
||||
|
||||
int mca_pml_bfo_send_request_put_frag(mca_pml_bfo_rdma_frag_t* frag);
|
||||
|
||||
/* This function tries to continue a sendreq that was stuck because of resource
 * unavailability. A sendreq may be added to the send_pending list if there is
 * no resource to send the initial packet or no resource to schedule data for
 * sending. The reason the sendreq was added to the list is stored inside the
 * sendreq struct, and the appropriate operation is retried when a resource
 * becomes available. The bml_btl passed to the function doesn't represent the
 * sendreq destination; it represents the BTL on which a resource was freed,
 * so only this BTL should be considered for sending packets */
|
||||
void mca_pml_bfo_send_request_process_pending(mca_bml_base_btl_t *bml_btl);
|
||||
|
||||
void mca_pml_bfo_send_request_copy_in_out(mca_pml_bfo_send_request_t *sendreq,
|
||||
uint64_t send_offset, uint64_t send_length);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* OMPI_PML_BFO_SEND_REQUEST_H */
|
@ -1,148 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2016 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "pml_bfo.h"
|
||||
#include "pml_bfo_recvreq.h"
|
||||
#include "pml_bfo_sendreq.h"
|
||||
#include "ompi/memchecker.h"
|
||||
|
||||
|
||||
/**
 * Start (or restart) an array of persistent PML requests (MPI_Start /
 * MPI_Startall back-end).
 *
 * For each entry: NULL and non-PML requests are skipped.  An inactive,
 * pml-complete request is reused as-is.  An active request that has not
 * yet completed at the PML level is marked free-on-completion and a
 * brand-new request of the same kind is allocated in its place (the
 * caller's requests[i] slot is overwritten).  Finally the (old or new)
 * request is started via the send/recv start macros.
 *
 * @param count    number of entries in requests
 * @param requests array of persistent requests built by isend_init/irecv_init
 * @return OMPI_SUCCESS, or the first error encountered (OMPI_ERR_REQUEST
 *         for requests in an unexpected state/type)
 */
int mca_pml_bfo_start(size_t count, ompi_request_t** requests)
{
    int rc;
    size_t i;
    bool reuse_old_request = true;

    for(i=0; i<count; i++) {
        mca_pml_base_request_t *pml_request = (mca_pml_base_request_t*)requests[i];
        if(NULL == pml_request) {
            continue;
        }
        /* ignore requests not owned by the PML (e.g. generalized requests) */
        if (OMPI_REQUEST_PML != requests[i]->req_type) {
            continue;
        }

        /* If the persistent request is currently active - obtain the
         * request lock and verify the status is incomplete. if the
         * pml layer has not completed the request - mark the request
         * as free called - so that it will be freed when the request
         * completes - and create a new request.
         */

#if OPAL_ENABLE_MULTI_THREADS
        /* make prior completion updates from other threads visible
         * before inspecting req_state/req_pml_complete */
        opal_atomic_rmb();
#endif
        reuse_old_request = true;
        switch(pml_request->req_ompi.req_state) {
        case OMPI_REQUEST_INACTIVE:
            if(pml_request->req_pml_complete == true)
                break;          /* fully finished: safe to reuse in place */
            /* otherwise fall through */
        case OMPI_REQUEST_ACTIVE: {

            ompi_request_t *request;
            if (pml_request->req_pml_complete == false) {
                /* free request after it completes */
                pml_request->req_free_called = true;
            } else {
                /* can reuse the existing request */
                break;
            }

            reuse_old_request = false;
            /* allocate a new request of the same flavor, copying the
             * persistent arguments from the old one */
            switch(pml_request->req_type) {
            case MCA_PML_REQUEST_SEND: {
                mca_pml_base_send_mode_t sendmode =
                    ((mca_pml_base_send_request_t*)pml_request)->req_send_mode;
                rc = mca_pml_bfo_isend_init(
                    pml_request->req_addr,
                    pml_request->req_count,
                    pml_request->req_datatype,
                    pml_request->req_peer,
                    pml_request->req_tag,
                    sendmode,
                    pml_request->req_comm,
                    &request);
                break;
            }
            case MCA_PML_REQUEST_RECV:
                rc = mca_pml_bfo_irecv_init(
                    pml_request->req_addr,
                    pml_request->req_count,
                    pml_request->req_datatype,
                    pml_request->req_peer,
                    pml_request->req_tag,
                    pml_request->req_comm,
                    &request);
                break;
            default:
                rc = OMPI_ERR_REQUEST;
                break;
            }
            if(OMPI_SUCCESS != rc)
                return rc;
            /* hand the replacement back to the caller */
            pml_request = (mca_pml_base_request_t*)request;
            requests[i] = request;
            break;
        }
        default:
            return OMPI_ERR_REQUEST;
        }

        /* start the request */
        switch(pml_request->req_type) {
        case MCA_PML_REQUEST_SEND:
        {
            mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)pml_request;
            MEMCHECKER(
                memchecker_call(&opal_memchecker_base_isdefined,
                                pml_request->req_addr, pml_request->req_count,
                                pml_request->req_datatype);
            );
            if( reuse_old_request && (sendreq->req_send.req_bytes_packed != 0) ) {
                size_t offset = 0;
                /**
                 * Reset the convertor in case we're dealing with the original
                 * request, which when completed does not reset the convertor.
                 */
                opal_convertor_set_position( &sendreq->req_send.req_base.req_convertor,
                                             &offset );
            }
            MCA_PML_BFO_SEND_REQUEST_START(sendreq, rc);  /* sets rc */
            if(rc != OMPI_SUCCESS)
                return rc;
            break;
        }
        case MCA_PML_REQUEST_RECV:
        {
            mca_pml_bfo_recv_request_t* recvreq = (mca_pml_bfo_recv_request_t*)pml_request;
            MCA_PML_BFO_RECV_REQUEST_START(recvreq);
            break;
        }
        default:
            return OMPI_ERR_REQUEST;
        }
    }
    return OMPI_SUCCESS;
}
|
||||
|
@ -1 +0,0 @@
|
||||
DIRECT_CALL_HEADER="ompi/mca/pml/bfo/pml_bfo.h"
|
Загрузка…
Ссылка в новой задаче
Block a user