
Merge pull request #5933 from hppritcha/topic/remove_bfo_pml

remove the bfo pml
This commit is contained in:
Howard Pritchard 2018-10-17 09:39:58 -06:00 committed by GitHub
parent 43547ade4c 7d6774acf8
commit a435bfe1cf
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
32 changed files: 0 additions and 11029 deletions


@@ -1,78 +0,0 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
#
# Copyright (c) 2017 IBM Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
AM_CPPFLAGS = -DPML_BFO=1
dist_ompidata_DATA = \
help-mpi-pml-bfo.txt
EXTRA_DIST = post_configure.sh
bfo_sources = \
pml_bfo.c \
pml_bfo.h \
pml_bfo_comm.c \
pml_bfo_comm.h \
pml_bfo_component.c \
pml_bfo_component.h \
pml_bfo_failover.c \
pml_bfo_failover.h \
pml_bfo_hdr.h \
pml_bfo_iprobe.c \
pml_bfo_irecv.c \
pml_bfo_isend.c \
pml_bfo_progress.c \
pml_bfo_rdma.c \
pml_bfo_rdma.h \
pml_bfo_rdmafrag.c \
pml_bfo_rdmafrag.h \
pml_bfo_recvfrag.c \
pml_bfo_recvfrag.h \
pml_bfo_recvreq.c \
pml_bfo_recvreq.h \
pml_bfo_sendreq.c \
pml_bfo_sendreq.h \
pml_bfo_start.c
# If we have CUDA support requested, build the CUDA file also
if OPAL_cuda_support
bfo_sources += \
pml_bfo_cuda.c
endif
if MCA_BUILD_ompi_pml_bfo_DSO
component_noinst =
component_install = mca_pml_bfo.la
else
component_noinst = libmca_pml_bfo.la
component_install =
endif
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_pml_bfo_la_SOURCES = $(bfo_sources)
mca_pml_bfo_la_LDFLAGS = -module -avoid-version
mca_pml_bfo_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la
noinst_LTLIBRARIES = $(component_noinst)
libmca_pml_bfo_la_SOURCES = $(bfo_sources)
libmca_pml_bfo_la_LDFLAGS = -module -avoid-version


@@ -1,340 +0,0 @@
Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
BFO DESIGN DOCUMENT
This document describes the use and design of the bfo. In addition,
there is a section at the end explaining why this functionality was
not merged into the ob1 PML.
1. GENERAL USAGE
First, one has to configure the failover code into the openib BTL so
that bfo will work correctly. To do this:
configure --enable-btl-openib-failover.
Then, when running one needs to select the bfo PML explicitly.
mpirun --mca pml bfo
Note that one needs to both configure with --enable-btl-openib-failover
and run with --mca pml bfo to get the failover support. If one of
these two steps is skipped, then the MPI job will just abort in the
case of an error like it normally does with the ob1 PML.
2. GENERAL FUNCTION
The bfo failover feature requires two or more openib BTLs in use. In
normal operation, it will stripe the communication over the multiple
BTLs. When an error is detected, it will stop using the BTL that
incurred the error and continue the communication over the remaining
BTL. Once a BTL has been mapped out, it cannot be used by the job
again, even if the underlying fabric becomes functional again. Only
new jobs started after the fabric comes back up will use both BTLs.
The bfo works in conjunction with changes that were made in the openib
BTL. As noted above, those changes need to be configured into the
BTL for everything to work properly.
The bfo only fails over between openib BTLs. It cannot failover from
an openib BTL to TCP, for example.
3. GENERAL DESIGN
The bfo (Btl FailOver) PML was designed to work in clusters that have
multiple openib BTLs. It was designed to be lightweight so as to
avoid any adverse effects on latency. To that end, there is no
tracking of fragments or messages in the bfo PML. Rather, it depends
on the underlying BTL to notify it of each fragment that has an error.
The bfo then decides what needs to be done based on the type of
fragment that gets an error.
No additional sequence numbers were introduced in the bfo. Instead,
it makes use of the sequence numbers that exist in the MATCH, RNDV and
RGET fragment header. In that way, duplicate fragments that have
MATCH information in them can be detected. Other fragments, like PUT
and ACK, are never retransmitted so it does not matter that they do
not have sequence numbers. The FIN header was a special case in that
it was changed to include the MATCH header so that the tag, source,
and context fields could be used to check for duplicate FINs.
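To make that duplicate check concrete, here is a minimal sketch of
testing an arriving fragment's 16-bit sequence number against the next
sequence expected from its sender. The helper name and signature are
hypothetical, not the actual bfo matching code.

/* Hypothetical sketch: a fragment whose 16-bit sequence number is
 * older than the next sequence expected from its sender must be a
 * retransmitted duplicate and can be discarded. */
#include <stdbool.h>
#include <stdint.h>

static bool frag_is_duplicate(uint16_t hdr_seq, uint16_t expected_seq)
{
    /* signed 16-bit difference tolerates sequence-number wrap-around */
    return (int16_t)(hdr_seq - expected_seq) < 0;
}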
Note that the assumption is that the underlying BTL will always issue
a callback with an error flag when it thinks a fragment has an error.
This means that even after an error is detected on a BTL, the BTL
continues to be checked for any other messages that may also complete
with an error. This is potentially a unique characteristic of the
openib BTL when running over RC connections that allows the BFO to
work properly.
One scenario that is particularly difficult to handle is the case
where a fragment has an error but the message actually makes it to the
other side. It is because of this that all fragments need to be
checked to make sure they are not a duplicate. This scenario also
complicates some of the rendezvous protocols as the two sides may not
agree where the problem occurred. For example, one can imagine a
sender getting an error on a final FIN message, but the FIN message
actually arrives at the other side. The receiver thinks the
communication is done and moves on. The sender thinks there was a
problem, and that the communication needs to restart.
It is also important to note that a message cannot signal a successful
completion and *not* make it to the receiver. This would probably cause
the bfo to hang.
4. ERRORS
Errors are detected in the openib BTL layer and propagated to the PML
layer. Typically, the errors occur while polling the completion
queue, but can happen in other areas as well. When an error occurs,
an additional callback is called so the PML can map out the connection
for future sending. Then the callback associated with the fragment is
called, but with the error field set to OMPI_ERROR. This way, the PML
knows that this fragment may not have made it to the remote side.
The first callback into the PML is via the mca_pml_bfo_error_handler()
callback and the PML uses this to remove a connection for future
sending. If the error_proc_t field is NULL, then the entire BTL is
removed for any future communication. If the error_proc_t is not
NULL, then the BTL is only removed for the connection associated with
the error_proc_t.
The second callback is the standard one for a completion event, and
this can trigger various activities in the PML. The regular callback
function is called but the status is set to OMPI_ERROR. The PML layer
detects this and calls some failover specific routines depending on
the type of fragment that got the error.
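As a rough illustration of this two-callback flow (all names below are
placeholders for the sketch, not the real PML/BTL API):

/* Hypothetical sketch of the two error paths described above. */
#include <stddef.h>

/* First callback: map out the connection (or the whole BTL when no
 * error_proc_t is supplied) so it is not used for future sends. */
static void sketch_error_handler(void *btl, void *errproc)
{
    if (NULL == errproc) {
        /* remove the entire BTL from future communication */
    } else {
        /* remove the BTL only for the connection tied to errproc */
    }
    (void)btl;
}

/* Second callback: the regular completion callback, invoked with an
 * error status so the PML knows the fragment may not have arrived. */
static void sketch_completion(int status, int frag_type)
{
    if (0 != status) { /* 0 stands in for OMPI_SUCCESS */
        /* dispatch to a failover routine keyed on frag_type */
    }
    (void)frag_type;
}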
5. RECOVERY OF MATCH FRAGMENTS
Note: For a general description of how the various fragments interact,
see Appendix 1 at the end of this document.
In the case of a MATCH fragment, the fragment is simply resent. Care
has to be taken with a MATCH fragment that is sent via the standard
interface and one that is sent via the sendi interface. In the
standard send, the send request is still available and is therefore
reset and reused to send the MATCH fragment. In the case of the sendi
fragment, the send request is gone, so the fragment is regenerated
from the information contained within the fragment.
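Sketched out (hypothetical names, not the real request or fragment
types), the recovery choice looks like this:

/* Hypothetical sketch of MATCH-fragment recovery: reuse the send
 * request when it still exists (standard send); rebuild the fragment
 * from its own contents when it does not (sendi). */
#include <stddef.h>

struct sketch_match_frag {
    void *send_request; /* NULL when the fragment came from sendi */
};

static void resend_match(struct sketch_match_frag *frag)
{
    if (NULL != frag->send_request) {
        /* reset and reuse the existing send request */
    } else {
        /* regenerate the fragment from the information it carries */
    }
}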
6. RECOVERY OF RNDV or LARGE MESSAGE RDMA
In the case of a large message RDMA transfer or a RNDV transfer where
the message consists of several fragments, the restart is a little
more complicated. This includes fragments like RNDV, PUT, RGET, FRAG,
FIN, and RDMA write and RDMA read completions. In most cases, the
requests associated with these fragments are reset and restarted.
First, it should be pointed out that a new variable was added to the
send and receive requests. This variable tracks outstanding send
events that have not yet received their completion events. This new
variable is used so that a request is not restarted until all the
outstanding events have completed. If one does not wait for the
outstanding events to complete, then one may restart a request and
then a completion event will happen on the wrong request.
A second variable added to each request records whether the request is
already in an error state. When a request has an error flagged on it
and its outstanding completion events are down to zero, it can start
the restart dance as described below.
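A minimal sketch of that gate, assuming illustrative field names
(req_events, req_error) rather than the exact request layout:

/* Hypothetical sketch: a request may be restarted only once it is
 * flagged in error AND every outstanding completion event has drained;
 * otherwise a late completion could land on the restarted request. */
#include <stdbool.h>
#include <stdint.h>

struct sketch_request {
    int32_t req_events; /* send events still awaiting completion */
    bool    req_error;  /* request already flagged in error      */
};

static bool ready_to_restart(const struct sketch_request *req)
{
    return req->req_error && 0 == req->req_events;
}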
7. SPECIAL CASE FOR FIN FRAGMENT
Like the MATCH fragment, the FIN message is also simply resent. Like
the sendi MATCH fragment, there may be no request associated with the
FIN message when it gets an error, so the fragment is recreated from
the information in the fragment. The FIN fragment was modified to
have additional information like what is in a MATCH fragment including
the context, source, and tag. In this way, we can figure out if the
FIN message is a duplicate on the receiving side.
8. RESTART DANCE
When the bfo determines that there are no outstanding completion events,
a restart dance is initiated. There are four new PML message types that
have been created to participate in the dance.
1. RNDVRESTARTNOTIFY
2. RECVERRNOTIFY
3. RNDVRESTARTACK
4. RNDVRESTARTNACK
When the send request is in an error state and the outstanding
completion events is zero, RNDVRESTARTNOTIFY is sent from the sender
to the receiver to let it know that the communication needs to be
restarted. Upon receipt of the RNDVRESTARTNOTIFY, the receiver first
checks to make sure that it is still pointing to a valid receiver
request. If so, it marks the receive request in error. It then
checks to see if there are any outstanding completion events on the
receiver. If there are no outstanding completion events, the receiver
sends the RNDVRESTARTACK. If there are outstanding completion events,
then the RNDVRESTARTACK gets sent later when a completion event occurs
that brings the outstanding event count to zero.
In the case that the receiver determines that it is no longer looking
at a valid receive request, which means the request is complete, the
receiver responds with a RNDVRESTARTNACK. While rare, this case can
happen, for example, when a final FRAG message triggers an error on the
sender but actually makes it to the receiver.
The RECVERRNOTIFY fragment is used so the receiver can let the sender
know that it had an error. The sender then waits for all of its
completion events and then sends a RNDVRESTARTNOTIFY.
All the handling of these new messages is contained in the
pml_bfo_failover files.
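The receiver-side reaction to RNDVRESTARTNOTIFY can be summarized in a
short sketch (types and names are illustrative only):

/* Hypothetical sketch of the receiver's choice on RNDVRESTARTNOTIFY. */
#include <stdbool.h>
#include <stdint.h>

enum sketch_reply { SEND_ACK_NOW, SEND_ACK_LATER, SEND_NACK };

struct sketch_recv_state {
    bool    valid;    /* does the notify still name a live request? */
    int32_t events;   /* outstanding completion events              */
    bool    in_error; /* receive request marked in error            */
};

static enum sketch_reply on_restart_notify(struct sketch_recv_state *rs)
{
    if (!rs->valid) {
        /* request already completed (e.g. a final FRAG that errored
         * on the sender but arrived anyway): do not restart */
        return SEND_NACK;
    }
    rs->in_error = true;
    /* ACK now only if all events have drained; otherwise the ACK is
     * sent from the completion that brings the count to zero. */
    return (0 == rs->events) ? SEND_ACK_NOW : SEND_ACK_LATER;
}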
9. BTL SUPPORT
The openib BTL also supplies a lot of support for the bfo PML. First,
fragments can be stored in the BTL during normal operation if
resources become scarce. This means that when an error is detected in
the BTL, it needs to scour its internal queues for fragments that are
destined for the BTL and error them out. The function
error_out_all_pending_frags() takes care of this. Some of the stored
fragments may be coalesced, so care has to be taken to tease out each
message from a coalesced fragment.
There is also some special code to handle some strange occurrences
that were observed in the BTL. First, there are times
where only one half of the connection gets an error. This can result
in a mismatch between what the PML thinks is available to it and can
cause hangs. Therefore, when a BTL detects an error, it sends a
special message down the working BTL connection to tell the remote
side that it needs to be brought down as well.
Secondly, it has been observed that a message can get stuck in the
eager RDMA connection between two BTLs. In this case, an error is
detected on one side, but the other side never sees the message.
Therefore, a special message is sent to the other side telling it to
move along in the eager RDMA connection. This is all somewhat
confusing. See the code in the btl_openib_failover.c file for the
details.
10. MERGING
Every effort was made to merge the bfo PML into the ob1 PML.
The idea was that any upgrades to the ob1 PML would automatically make
it into the bfo PML and this would enhance maintainability of all the
code. However, it was deemed that this merging would cause more
problems than it would solve. What was attempted, and why that
conclusion was reached, is documented here.
One can look at the bfo and easily see the differences between it and
ob1. All the bfo specific code is surrounded by #if PML_BFO. In
addition, there are two additional files in the bfo,
pml_bfo_failover.c and pml_bfo_failover.h.
To merge them, the following was attempted. First, add all the code
in #if regions into the ob1 PML. As of this writing, there are 73
#ifs that would have to be added into ob1.
Secondly, remove almost all the pml_bfo files and replace them with
links to the ob1 files.
Third, create a new header file that name-shifted all of the functions
so that ob1 and bfo could live together. This also required creating
macros for the names of the header files. To help illustrate the
name-shifting issue, here is what the file might look like in the bfo
directory.
/* Need macros for the header files as they are different in the
* different PMLs */
#define PML "bfo"
#define PML_OB1_H "pml_bfo.h"
#define PML_OB1_COMM_H "pml_bfo_comm.h"
#define PML_OB1_COMPONENT_H "pml_bfo_component.h"
#define PML_OB1_HDR_H "pml_bfo_hdr.h"
#define PML_OB1_RDMA_H "pml_bfo_rdma.h"
#define PML_OB1_RDMAFRAG_H "pml_bfo_rdmafrag.h"
#define PML_OB1_RECVFRAG_H "pml_bfo_recvfrag.h"
#define PML_OB1_RECVREQ_H "pml_bfo_recvreq.h"
#define PML_OB1_SENDREQ_H "pml_bfo_sendreq.h"
/* Name shifting of functions from ob1 to bfo (incomplete list) */
#define mca_pml_ob1 mca_pml_bfo
#define mca_pml_ob1_t mca_pml_bfo_t
#define mca_pml_ob1_component mca_pml_bfo_component
#define mca_pml_ob1_add_procs mca_pml_bfo_add_procs
#define mca_pml_ob1_del_procs mca_pml_bfo_del_procs
#define mca_pml_ob1_enable mca_pml_bfo_enable
#define mca_pml_ob1_progress mca_pml_bfo_progress
#define mca_pml_ob1_add_comm mca_pml_bfo_add_comm
#define mca_pml_ob1_del_comm mca_pml_bfo_del_comm
#define mca_pml_ob1_irecv_init mca_pml_bfo_irecv_init
#define mca_pml_ob1_irecv mca_pml_bfo_irecv
#define mca_pml_ob1_recv mca_pml_bfo_recv
#define mca_pml_ob1_isend_init mca_pml_bfo_isend_init
#define mca_pml_ob1_isend mca_pml_bfo_isend
#define mca_pml_ob1_send mca_pml_bfo_send
#define mca_pml_ob1_iprobe mca_pml_bfo_iprobe
[...and much more ...]
The pml_bfo_hdr.h file was not a link because the changes in it were
so extensive. Also the Makefile was kept separate so it could include
the additional failover files as well as add a compile directive that
would force the files to be compiled as bfo instead of ob1.
After these changes were made, several independent developers reviewed
the results and concluded that making these changes would have too
much of a negative impact on ob1 maintenance. First, the code became
much harder to read with all the additional #ifdefs. Secondly, the
possibility of adding other features, like csum, to ob1 would only
make this issue even worse. Therefore, it was decided to keep the bfo
PML separate from ob1.
11. UTILITIES
In an ideal world, any bug fixes that are made in the ob1 PML would
also be made in the csum and the bfo PMLs. However, that does not
always happen. Therefore, there are two new utilities added to the
contrib directory.
check-ob1-revision.pl
check-ob1-pml-diffs.pl
The first one can be run to see if ob1 has changed from its last known
state. Here is an example.
machine =>check-ob1-revision.pl
Running svn diff -r24138 ../ompi/mca/pml/ob1
No new changes detected in ob1. Everything is fine.
If there are differences, then one needs to review them and potentially
add them to the bfo (and csum also if one feels like it).
After that, bump up the value in the script to the latest value.
The second script allows one to see the differences between the ob1
and bfo PML. Here is an example.
machine =>check-ob1-pml-diffs.pl
Starting script to check differences between bfo and ob1...
Files Compared: pml_ob1.c and pml_bfo.c
No differences encountered
Files Compared: pml_ob1.h and pml_bfo.h
[...snip...]
Files Compared: pml_ob1_start.c and pml_bfo_start.c
No differences encountered
There is much more in the script explaining how it is used.
Appendix 1: SIMPLE OVERVIEW OF COMMUNICATION PROTOCOLS
The drawings below attempt to describe some of the general flow of
fragments in the various protocols that are supported in the PMLs.
The "read" and "write" are actual RDMA actions and do not pertain to
fragments that are sent. As can be inferred, they use FIN messages to
indicate their completion.
MATCH PROTOCOL
sender >->->-> MATCH >->->-> receiver
SEND WITH MULTIPLE FRAGMENTS
sender >->->-> RNDV >->->-> receiver
<-<-<-< ACK <-<-<-<
>->->-> FRAG >->->->
>->->-> FRAG >->->->
>->->-> FRAG >->->->
RDMA PUT
sender >->->-> RNDV >->->-> receiver
<-<-<-< PUT <-<-<-<
<-<-<-< PUT <-<-<-<
>->->-> write >->->->
>->->-> FIN >->->->
>->->-> write >->->->
>->->-> FIN >->->->
RDMA GET
sender >->->-> RGET >->->-> receiver
<-<-<-< read <-<-<-<
<-<-<-< FIN <-<-<-<


@@ -1,27 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2013 Sandia National Laboratories. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_ompi_pml_bfo_POST_CONFIG(will_build)
# ----------------------------------------
# The BFO PML requires a BML endpoint tag to compile, so require it.
# Require in POST_CONFIG instead of CONFIG so that we only require it
# if we're not disabled.
AC_DEFUN([MCA_ompi_pml_bfo_POST_CONFIG], [
AS_IF([test "$1" = "1"], [OMPI_REQUIRE_ENDPOINT_TAG([BML])])
])dnl
# MCA_ompi_pml_bfo_CONFIG(action-if-can-compile,
# [action-if-cant-compile])
# ------------------------------------------------
# We can always build, unless we were explicitly disabled.
AC_DEFUN([MCA_ompi_pml_bfo_CONFIG],[
AC_CONFIG_FILES([ompi/mca/pml/bfo/Makefile])
[$1]
])dnl


@@ -1,20 +0,0 @@
# -*- text -*-
#
# Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
[eager_limit_too_small]
The "eager limit" MCA parameter in the %s BTL was set to a value which
is too low for Open MPI to function properly. Please re-run your job
with a higher eager limit value for this BTL; the exact MCA parameter
name and its corresponding minimum value are shown below.
Local host: %s
BTL name: %s
BTL eager limit value: %d (set via btl_%s_eager_limit)
BTL eager limit minimum: %d
MCA parameter name: btl_%s_eager_limit


@@ -1,7 +0,0 @@
#
# owner/status file
# owner: institution that is responsible for this package
# status: e.g. active, maintenance, unmaintained
#
owner: NVIDIA
status: unmaintained


@@ -1,897 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2006-2008 University of Houston. All rights reserved.
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <stdlib.h>
#include <string.h>
#include "opal/class/opal_bitmap.h"
#include "opal/util/output.h"
#include "opal/util/show_help.h"
#include "opal/mca/btl/btl.h"
#include "opal/mca/btl/base/base.h"
#include "opal/mca/pmix/pmix.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/pml/base/base.h"
#include "ompi/mca/pml/base/base.h"
#include "ompi/mca/bml/base/base.h"
#include "ompi/runtime/ompi_cr.h"
#include "pml_bfo.h"
#include "pml_bfo_component.h"
#include "pml_bfo_comm.h"
#include "pml_bfo_hdr.h"
#include "pml_bfo_recvfrag.h"
#include "pml_bfo_sendreq.h"
#include "pml_bfo_recvreq.h"
#include "pml_bfo_rdmafrag.h"
#if PML_BFO
#include "pml_bfo_failover.h"
#endif /* PML_BFO */
mca_pml_bfo_t mca_pml_bfo = {
{
mca_pml_bfo_add_procs,
mca_pml_bfo_del_procs,
mca_pml_bfo_enable,
mca_pml_bfo_progress,
mca_pml_bfo_add_comm,
mca_pml_bfo_del_comm,
mca_pml_bfo_irecv_init,
mca_pml_bfo_irecv,
mca_pml_bfo_recv,
mca_pml_bfo_isend_init,
mca_pml_bfo_isend,
mca_pml_bfo_send,
mca_pml_bfo_iprobe,
mca_pml_bfo_probe,
mca_pml_bfo_start,
mca_pml_bfo_improbe,
mca_pml_bfo_mprobe,
mca_pml_bfo_imrecv,
mca_pml_bfo_mrecv,
mca_pml_bfo_dump,
mca_pml_bfo_ft_event,
65535,
INT_MAX
}
};
void mca_pml_bfo_error_handler( struct mca_btl_base_module_t* btl,
int32_t flags, ompi_proc_t* errproc,
char* btlinfo );
int mca_pml_bfo_enable(bool enable)
{
if( false == enable ) {
return OMPI_SUCCESS;
}
OBJ_CONSTRUCT(&mca_pml_bfo.lock, opal_mutex_t);
/* fragments */
OBJ_CONSTRUCT(&mca_pml_bfo.rdma_frags, opal_free_list_t);
opal_free_list_init( &mca_pml_bfo.rdma_frags,
sizeof(mca_pml_bfo_rdma_frag_t),
opal_cache_line_size,
OBJ_CLASS(mca_pml_bfo_rdma_frag_t),
0,opal_cache_line_size,
mca_pml_bfo.free_list_num,
mca_pml_bfo.free_list_max,
mca_pml_bfo.free_list_inc,
NULL, 0, NULL, NULL, NULL );
OBJ_CONSTRUCT(&mca_pml_bfo.recv_frags, opal_free_list_t);
opal_free_list_init( &mca_pml_bfo.recv_frags,
sizeof(mca_pml_bfo_recv_frag_t) + mca_pml_bfo.unexpected_limit,
opal_cache_line_size,
OBJ_CLASS(mca_pml_bfo_recv_frag_t),
0,opal_cache_line_size,
mca_pml_bfo.free_list_num,
mca_pml_bfo.free_list_max,
mca_pml_bfo.free_list_inc,
NULL, 0, NULL, NULL, NULL );
OBJ_CONSTRUCT(&mca_pml_bfo.pending_pckts, opal_free_list_t);
opal_free_list_init( &mca_pml_bfo.pending_pckts,
sizeof(mca_pml_bfo_pckt_pending_t),
opal_cache_line_size,
OBJ_CLASS(mca_pml_bfo_pckt_pending_t),
0,opal_cache_line_size,
mca_pml_bfo.free_list_num,
mca_pml_bfo.free_list_max,
mca_pml_bfo.free_list_inc,
NULL, 0, NULL, NULL, NULL );
OBJ_CONSTRUCT(&mca_pml_bfo.buffers, opal_free_list_t);
OBJ_CONSTRUCT(&mca_pml_bfo.send_ranges, opal_free_list_t);
opal_free_list_init( &mca_pml_bfo.send_ranges,
sizeof(mca_pml_bfo_send_range_t) +
(mca_pml_bfo.max_send_per_range - 1) * sizeof(mca_pml_bfo_com_btl_t),
opal_cache_line_size,
OBJ_CLASS(mca_pml_bfo_send_range_t),
0,opal_cache_line_size,
mca_pml_bfo.free_list_num,
mca_pml_bfo.free_list_max,
mca_pml_bfo.free_list_inc,
NULL, 0, NULL, NULL, NULL );
/* pending operations */
OBJ_CONSTRUCT(&mca_pml_bfo.send_pending, opal_list_t);
OBJ_CONSTRUCT(&mca_pml_bfo.recv_pending, opal_list_t);
OBJ_CONSTRUCT(&mca_pml_bfo.pckt_pending, opal_list_t);
OBJ_CONSTRUCT(&mca_pml_bfo.rdma_pending, opal_list_t);
/* missing communicator pending list */
OBJ_CONSTRUCT(&mca_pml_bfo.non_existing_communicator_pending, opal_list_t);
/**
* If we get here, this is the PML that was selected for the run. We
* should take ownership of the send and receive request lists, and
* initialize them with the size of our own requests.
*/
opal_free_list_init( &mca_pml_base_send_requests,
sizeof(mca_pml_bfo_send_request_t) +
(mca_pml_bfo.max_rdma_per_request - 1) *
sizeof(mca_pml_bfo_com_btl_t),
opal_cache_line_size,
OBJ_CLASS(mca_pml_bfo_send_request_t),
0,opal_cache_line_size,
mca_pml_bfo.free_list_num,
mca_pml_bfo.free_list_max,
mca_pml_bfo.free_list_inc,
NULL, 0, NULL, NULL, NULL );
opal_free_list_init( &mca_pml_base_recv_requests,
sizeof(mca_pml_bfo_recv_request_t) +
(mca_pml_bfo.max_rdma_per_request - 1) *
sizeof(mca_pml_bfo_com_btl_t),
opal_cache_line_size,
OBJ_CLASS(mca_pml_bfo_recv_request_t),
0,opal_cache_line_size,
mca_pml_bfo.free_list_num,
mca_pml_bfo.free_list_max,
mca_pml_bfo.free_list_inc,
NULL, 0, NULL, NULL, NULL );
mca_pml_bfo.enabled = true;
return OMPI_SUCCESS;
}
int mca_pml_bfo_add_comm(ompi_communicator_t* comm)
{
/* allocate pml specific comm data */
mca_pml_bfo_comm_t* pml_comm = OBJ_NEW(mca_pml_bfo_comm_t);
opal_list_item_t *item, *next_item;
mca_pml_bfo_recv_frag_t* frag;
mca_pml_bfo_comm_proc_t* pml_proc;
mca_pml_bfo_match_hdr_t* hdr;
int i;
if (NULL == pml_comm) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* should never happen, but it was, so check */
if (comm->c_contextid > mca_pml_bfo.super.pml_max_contextid) {
OBJ_RELEASE(pml_comm);
return OMPI_ERR_OUT_OF_RESOURCE;
}
mca_pml_bfo_comm_init_size(pml_comm, comm->c_remote_group->grp_proc_count);
comm->c_pml_comm = pml_comm;
for( i = 0; i < comm->c_remote_group->grp_proc_count; i++ ) {
pml_comm->procs[i].ompi_proc = ompi_group_peer_lookup(comm->c_remote_group,i);
OBJ_RETAIN(pml_comm->procs[i].ompi_proc);
}
/* Grab all related messages from the non_existing_communicator pending queue */
for( item = opal_list_get_first(&mca_pml_bfo.non_existing_communicator_pending);
item != opal_list_get_end(&mca_pml_bfo.non_existing_communicator_pending);
item = next_item ) {
frag = (mca_pml_bfo_recv_frag_t*)item;
next_item = opal_list_get_next(item);
hdr = &frag->hdr.hdr_match;
/* Is this fragment for the current communicator ? */
if( frag->hdr.hdr_match.hdr_ctx != comm->c_contextid )
continue;
/* As we now know we work on a fragment for this communicator
* we should remove it from the
* non_existing_communicator_pending list. */
opal_list_remove_item( &mca_pml_bfo.non_existing_communicator_pending,
item );
add_fragment_to_unexpected:
/* We generate the MSG_ARRIVED event as soon as the PML is aware
* of a matching fragment arrival, independent of whether it is
* received in the correct order or not. This allows the tools to
* figure out if the messages are not received in the correct
* order (e.g. with multiple network interfaces).
*/
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm,
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
/* There is no matching to be done, and no lock to be held on the communicator as
* we know at this point that the communicator has not yet been returned to the user.
* The only required protection is around the non_existing_communicator_pending queue.
* We just have to push the fragment into the unexpected list of the corresponding
* proc, or into the out-of-order (cant_match) list.
*/
pml_proc = &(pml_comm->procs[hdr->hdr_src]);
if( ((uint16_t)hdr->hdr_seq) == ((uint16_t)pml_proc->expected_sequence) ) {
/* We're now expecting the next sequence number. */
pml_proc->expected_sequence++;
opal_list_append( &pml_proc->unexpected_frags, (opal_list_item_t*)frag );
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_INSERT_IN_UNEX_Q, comm,
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
/* And now the ugly part. As some fragments can be inserted in the cant_match list,
* every time we successfully add a fragment in the unexpected list we have to make
* sure the next one is not in the cant_match. Otherwise, we will end up in a deadlock
* situation as the cant_match is only checked when a new fragment is received from
* the network.
*/
for(frag = (mca_pml_bfo_recv_frag_t *)opal_list_get_first(&pml_proc->frags_cant_match);
frag != (mca_pml_bfo_recv_frag_t *)opal_list_get_end(&pml_proc->frags_cant_match);
frag = (mca_pml_bfo_recv_frag_t *)opal_list_get_next(frag)) {
hdr = &frag->hdr.hdr_match;
/* If the message has the next expected seq from that proc... */
if(hdr->hdr_seq != pml_proc->expected_sequence)
continue;
opal_list_remove_item(&pml_proc->frags_cant_match, (opal_list_item_t*)frag);
goto add_fragment_to_unexpected;
}
} else {
opal_list_append( &pml_proc->frags_cant_match, (opal_list_item_t*)frag );
}
}
return OMPI_SUCCESS;
}
int mca_pml_bfo_del_comm(ompi_communicator_t* comm)
{
mca_pml_bfo_comm_t* pml_comm = comm->c_pml_comm;
int i;
for( i = 0; i < comm->c_remote_group->grp_proc_count; i++ ) {
OBJ_RELEASE(pml_comm->procs[i].ompi_proc);
}
OBJ_RELEASE(comm->c_pml_comm);
comm->c_pml_comm = NULL;
return OMPI_SUCCESS;
}
/*
* For each proc setup a datastructure that indicates the BTLs
* that can be used to reach the destination.
*
*/
int mca_pml_bfo_add_procs(ompi_proc_t** procs, size_t nprocs)
{
opal_bitmap_t reachable;
int rc;
opal_list_item_t *item;
if(nprocs == 0)
return OMPI_SUCCESS;
OBJ_CONSTRUCT(&reachable, opal_bitmap_t);
rc = opal_bitmap_init(&reachable, (int)nprocs);
if(OMPI_SUCCESS != rc)
return rc;
/*
* JJH: Disable this in FT enabled builds since
* we use a wrapper PML. It will cause this check to
* return failure as all processes will return the wrapper PML
* component in use instead of the wrapped PML component underneath.
*/
#if OPAL_ENABLE_FT_CR == 0
/* make sure remote procs are using the same PML as us */
if (OMPI_SUCCESS != (rc = mca_pml_base_pml_check_selected("bfo",
procs,
nprocs))) {
return rc;
}
#endif
rc = mca_bml.bml_add_procs( nprocs,
procs,
&reachable );
if(OMPI_SUCCESS != rc)
goto cleanup_and_return;
/* Check that values supplied by all initialized btls will work
for us. Note that this is the list of all initialized BTLs,
not the ones used for the just added procs. This is a little
overkill and inaccurate, as we may end up not using the BTL in
question and all add_procs calls after the first one are
duplicating an already completed check. But the final
initialization of the PML occurs before the final
initialization of the BTLs, and iterating through the in-use
BTLs requires iterating over the procs, as the BML does not
expose all currently in use btls. */
for (item = opal_list_get_first(&mca_btl_base_modules_initialized) ;
item != opal_list_get_end(&mca_btl_base_modules_initialized) ;
item = opal_list_get_next(item)) {
mca_btl_base_selected_module_t *sm =
(mca_btl_base_selected_module_t*) item;
if (sm->btl_module->btl_eager_limit < sizeof(mca_pml_bfo_hdr_t)) {
opal_show_help("help-mpi-pml-bfo.txt", "eager_limit_too_small",
true,
sm->btl_component->btl_version.mca_component_name,
ompi_process_info.nodename,
sm->btl_component->btl_version.mca_component_name,
sm->btl_module->btl_eager_limit,
sm->btl_component->btl_version.mca_component_name,
sizeof(mca_pml_bfo_hdr_t),
sm->btl_component->btl_version.mca_component_name);
rc = OMPI_ERR_BAD_PARAM;
goto cleanup_and_return;
}
}
/* TODO: Move these callback registration to another place */
rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_MATCH,
mca_pml_bfo_recv_frag_callback_match,
NULL );
if(OMPI_SUCCESS != rc)
goto cleanup_and_return;
rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDV,
mca_pml_bfo_recv_frag_callback_rndv,
NULL );
if(OMPI_SUCCESS != rc)
goto cleanup_and_return;
rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RGET,
mca_pml_bfo_recv_frag_callback_rget,
NULL );
if(OMPI_SUCCESS != rc)
goto cleanup_and_return;
rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_ACK,
mca_pml_bfo_recv_frag_callback_ack,
NULL );
if(OMPI_SUCCESS != rc)
goto cleanup_and_return;
rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_FRAG,
mca_pml_bfo_recv_frag_callback_frag,
NULL );
if(OMPI_SUCCESS != rc)
goto cleanup_and_return;
rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_PUT,
mca_pml_bfo_recv_frag_callback_put,
NULL );
if(OMPI_SUCCESS != rc)
goto cleanup_and_return;
rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_FIN,
mca_pml_bfo_recv_frag_callback_fin,
NULL );
if(OMPI_SUCCESS != rc)
goto cleanup_and_return;
#if PML_BFO
rc = mca_pml_bfo_register_callbacks();
if(OMPI_SUCCESS != rc)
goto cleanup_and_return;
#endif /* PML_BFO */
/* register error handlers */
rc = mca_bml.bml_register_error((mca_btl_base_module_error_cb_fn_t)mca_pml_bfo_error_handler);
if(OMPI_SUCCESS != rc)
goto cleanup_and_return;
cleanup_and_return:
OBJ_DESTRUCT(&reachable);
return rc;
}
/*
* iterate through each proc and notify any PTLs associated
* with the proc that it is/has gone away
*/
int mca_pml_bfo_del_procs(ompi_proc_t** procs, size_t nprocs)
{
return mca_bml.bml_del_procs(nprocs, procs);
}
/*
* diagnostics
*/
int mca_pml_bfo_dump(struct ompi_communicator_t* comm, int verbose)
{
struct mca_pml_comm_t* pml_comm = comm->c_pml_comm;
int i;
/* iterate through all procs on communicator */
for( i = 0; i < (int)pml_comm->num_procs; i++ ) {
mca_pml_bfo_comm_proc_t* proc = &pml_comm->procs[i];
mca_bml_base_endpoint_t* ep = (mca_bml_base_endpoint_t*)proc->ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
size_t n;
opal_output(0, "[Rank %d]\n", i);
/* dump all receive queues */
/* dump all btls */
for(n=0; n<ep->btl_eager.arr_size; n++) {
mca_bml_base_btl_t* bml_btl = &ep->btl_eager.bml_btls[n];
bml_btl->btl->btl_dump(bml_btl->btl, bml_btl->btl_endpoint, verbose);
}
}
return OMPI_SUCCESS;
}
static void mca_pml_bfo_fin_completion( mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_descriptor_t* des,
int status )
{
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
#if PML_BFO
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
mca_pml_bfo_repost_fin(des);
return;
}
MCA_PML_BFO_CHECK_EAGER_BML_BTL_ON_FIN_COMPLETION(bml_btl, btl, des);
#endif /* PML_BFO */
/* check for pending requests */
MCA_PML_BFO_PROGRESS_PENDING(bml_btl);
}
/**
* Send a FIN to the peer. If we fail to send this ack (no more available
* fragments or the send failed), this function automatically adds the FIN
* to the list of pending FINs, which guarantees that the FIN will be sent
* later.
*/
int mca_pml_bfo_send_fin( ompi_proc_t* proc,
mca_bml_base_btl_t* bml_btl,
opal_ptr_t hdr_des,
uint8_t order,
#if PML_BFO
uint32_t status,
uint16_t seq,
uint8_t restartseq,
uint16_t ctx, uint32_t src)
#else /* PML_BFO */
uint32_t status )
#endif /* PML_BFO */
{
mca_btl_base_descriptor_t* fin;
mca_pml_bfo_fin_hdr_t* hdr;
int rc;
mca_bml_base_alloc(bml_btl, &fin, order, sizeof(mca_pml_bfo_fin_hdr_t),
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
if(NULL == fin) {
MCA_PML_BFO_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
return OMPI_ERR_OUT_OF_RESOURCE;
}
fin->des_cbfunc = mca_pml_bfo_fin_completion;
fin->des_cbdata = NULL;
/* fill in header */
hdr = (mca_pml_bfo_fin_hdr_t*)fin->des_local->seg_addr.pval;
hdr->hdr_common.hdr_flags = 0;
hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FIN;
hdr->hdr_des = hdr_des;
hdr->hdr_fail = status;
#if PML_BFO
fin->des_cbdata = proc;
hdr->hdr_match.hdr_seq = seq;
hdr->hdr_match.hdr_ctx = ctx;
hdr->hdr_match.hdr_src = src;
hdr->hdr_match.hdr_common.hdr_flags = restartseq; /* use unused hdr_flags field */
#endif /* PML_BFO */
bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_FIN, proc);
/* queue request */
rc = mca_bml_base_send( bml_btl,
fin,
MCA_PML_BFO_HDR_TYPE_FIN );
if( OPAL_LIKELY( rc >= 0 ) ) {
if( OPAL_LIKELY( 1 == rc ) ) {
MCA_PML_BFO_PROGRESS_PENDING(bml_btl);
}
return OMPI_SUCCESS;
}
mca_bml_base_free(bml_btl, fin);
MCA_PML_BFO_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
return OMPI_ERR_OUT_OF_RESOURCE;
}
void mca_pml_bfo_process_pending_packets(mca_bml_base_btl_t* bml_btl)
{
mca_pml_bfo_pckt_pending_t *pckt;
int32_t i, rc, s = (int32_t)opal_list_get_size(&mca_pml_bfo.pckt_pending);
for(i = 0; i < s; i++) {
mca_bml_base_btl_t *send_dst = NULL;
OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
pckt = (mca_pml_bfo_pckt_pending_t*)
opal_list_remove_first(&mca_pml_bfo.pckt_pending);
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
if(NULL == pckt)
break;
if(pckt->bml_btl != NULL &&
pckt->bml_btl->btl == bml_btl->btl) {
send_dst = pckt->bml_btl;
} else {
mca_bml_base_endpoint_t* endpoint =
(mca_bml_base_endpoint_t*) pckt->proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
send_dst = mca_bml_base_btl_array_find(
&endpoint->btl_eager, bml_btl->btl);
}
if(NULL == send_dst) {
OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
opal_list_append(&mca_pml_bfo.pckt_pending,
(opal_list_item_t*)pckt);
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
continue;
}
switch(pckt->hdr.hdr_common.hdr_type) {
case MCA_PML_BFO_HDR_TYPE_ACK:
rc = mca_pml_bfo_recv_request_ack_send_btl(pckt->proc,
send_dst,
pckt->hdr.hdr_ack.hdr_src_req.lval,
pckt->hdr.hdr_ack.hdr_dst_req.pval,
pckt->hdr.hdr_ack.hdr_send_offset,
pckt->hdr.hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_NORDMA);
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
opal_list_append(&mca_pml_bfo.pckt_pending,
(opal_list_item_t*)pckt);
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
return;
}
break;
case MCA_PML_BFO_HDR_TYPE_FIN:
rc = mca_pml_bfo_send_fin(pckt->proc, send_dst,
pckt->hdr.hdr_fin.hdr_des,
pckt->order,
#if PML_BFO
pckt->hdr.hdr_fin.hdr_fail,
pckt->hdr.hdr_fin.hdr_match.hdr_seq,
pckt->hdr.hdr_fin.hdr_match.hdr_common.hdr_flags,
pckt->hdr.hdr_fin.hdr_match.hdr_ctx,
pckt->hdr.hdr_fin.hdr_match.hdr_src);
#else /* PML_BFO */
pckt->hdr.hdr_fin.hdr_fail);
#endif /* PML_BFO */
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
return;
}
break;
default:
opal_output(0, "[%s:%d] wrong header type\n",
__FILE__, __LINE__);
break;
}
/* We're done with this packet, return it back to the free list */
MCA_PML_BFO_PCKT_PENDING_RETURN(pckt);
}
}
void mca_pml_bfo_process_pending_rdma(void)
{
mca_pml_bfo_rdma_frag_t* frag;
int32_t i, rc, s = (int32_t)opal_list_get_size(&mca_pml_bfo.rdma_pending);
for(i = 0; i < s; i++) {
OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
frag = (mca_pml_bfo_rdma_frag_t*)
opal_list_remove_first(&mca_pml_bfo.rdma_pending);
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
if(NULL == frag)
break;
if(frag->rdma_state == MCA_PML_BFO_RDMA_PUT) {
frag->retries++;
rc = mca_pml_bfo_send_request_put_frag(frag);
} else {
rc = mca_pml_bfo_recv_request_get_frag(frag);
}
if(OMPI_ERR_OUT_OF_RESOURCE == rc)
break;
}
}
void mca_pml_bfo_error_handler(
struct mca_btl_base_module_t* btl, int32_t flags,
ompi_proc_t* errproc, char* btlinfo ) {
#if PML_BFO
if (flags & MCA_BTL_ERROR_FLAGS_NONFATAL) {
mca_pml_bfo_failover_error_handler(btl, flags, errproc, btlinfo);
return;
}
#endif /* PML_BFO */
ompi_rte_abort(-1, NULL);
}
#if OPAL_ENABLE_FT_CR == 0
int mca_pml_bfo_ft_event( int state ) {
return OMPI_SUCCESS;
}
#else
int mca_pml_bfo_ft_event( int state )
{
static bool first_continue_pass = false;
ompi_proc_t** procs = NULL;
size_t num_procs;
int ret, p;
if(OPAL_CRS_CHECKPOINT == state) {
if( opal_cr_timing_barrier_enabled ) {
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1);
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
return ret;
}
}
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0);
}
else if(OPAL_CRS_CONTINUE == state) {
first_continue_pass = !first_continue_pass;
if( !first_continue_pass ) {
if( opal_cr_timing_barrier_enabled ) {
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0);
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
return ret;
}
}
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
}
if (opal_cr_continue_like_restart && !first_continue_pass) {
/*
* Get a list of processes
*/
procs = ompi_proc_all(&num_procs);
if(NULL == procs) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
/*
* Refresh the proc structure, and publish our proc info in the modex.
* NOTE: Do *not* call ompi_proc_finalize as there are many places in
* the code that point to individual procs in this structure. For our
* needs here we only need to fix up the modex, bml and pml
* references.
*/
if (OMPI_SUCCESS != (ret = ompi_proc_refresh())) {
opal_output(0,
"pml:bfo: ft_event(Restart): proc_refresh Failed %d",
ret);
for(p = 0; p < (int)num_procs; ++p) {
OBJ_RELEASE(procs[p]);
}
free (procs);
return ret;
}
}
}
else if(OPAL_CRS_RESTART_PRE == state ) {
/* Nothing here */
}
else if(OPAL_CRS_RESTART == state ) {
/*
* Get a list of processes
*/
procs = ompi_proc_all(&num_procs);
if(NULL == procs) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
/*
* Clean out the modex information since it is invalid now.
* ompi_rte_purge_proc_attrs();
* This happens at the ORTE level, so doing it again here will cause
* some issues with socket caching.
*/
/*
* Refresh the proc structure, and publish our proc info in the modex.
* NOTE: Do *not* call ompi_proc_finalize as there are many places in
* the code that point to individual procs in this structure. For our
* needs here we only need to fix up the modex, bml and pml
* references.
*/
if (OMPI_SUCCESS != (ret = ompi_proc_refresh())) {
opal_output(0,
"pml:bfo: ft_event(Restart): proc_refresh Failed %d",
ret);
for(p = 0; p < (int)num_procs; ++p) {
OBJ_RELEASE(procs[p]);
}
free (procs);
return ret;
}
}
else if(OPAL_CRS_TERM == state ) {
;
}
else {
;
}
/* Call the BML
* BML is expected to call ft_event in
* - BTL(s)
* - MPool(s)
*/
if( OMPI_SUCCESS != (ret = mca_bml.bml_ft_event(state))) {
opal_output(0, "pml:base: ft_event: BML ft_event function failed: %d\n",
ret);
}
if(OPAL_CRS_CHECKPOINT == state) {
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P1);
if( opal_cr_timing_barrier_enabled ) {
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR0);
/* JJH Cannot barrier here due to progress engine -- ompi_rte_barrier();*/
}
}
else if(OPAL_CRS_CONTINUE == state) {
if( !first_continue_pass ) {
if( opal_cr_timing_barrier_enabled ) {
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
return ret;
}
}
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
}
if (opal_cr_continue_like_restart && !first_continue_pass) {
/*
* Exchange the modex information once again.
* BTLs will have republished their modex information.
*/
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
return ret;
}
/*
* Startup the PML stack now that the modex is running again
* Add the new procs (BTLs redo modex recv's)
*/
if( OMPI_SUCCESS != (ret = mca_pml_bfo_add_procs(procs, num_procs) ) ) {
opal_output(0, "pml:bfo: ft_event(Restart): Failed in add_procs (%d)", ret);
return ret;
}
/* Is this barrier necessary ? JJH */
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
return ret;
}
if( NULL != procs ) {
for(p = 0; p < (int)num_procs; ++p) {
OBJ_RELEASE(procs[p]);
}
free(procs);
procs = NULL;
}
}
if( !first_continue_pass ) {
if( opal_cr_timing_barrier_enabled ) {
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
return ret;
}
}
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
}
}
else if(OPAL_CRS_RESTART_PRE == state ) {
/* Nothing here */
}
else if(OPAL_CRS_RESTART == state ) {
/*
* Exchange the modex information once again.
* BTLs will have republished their modex information.
*/
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
return ret;
}
/*
* Startup the PML stack now that the modex is running again
* Add the new procs (BTLs redo modex recv's)
*/
if( OMPI_SUCCESS != (ret = mca_pml_bfo_add_procs(procs, num_procs) ) ) {
opal_output(0, "pml:bfo: ft_event(Restart): Failed in add_procs (%d)", ret);
return ret;
}
/* Is this barrier necessary ? JJH */
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
return ret;
}
if( NULL != procs ) {
for(p = 0; p < (int)num_procs; ++p) {
OBJ_RELEASE(procs[p]);
}
free(procs);
procs = NULL;
}
}
else if(OPAL_CRS_TERM == state ) {
;
}
else {
;
}
return OMPI_SUCCESS;
}
#endif /* OPAL_ENABLE_FT_CR */
int mca_pml_bfo_com_btl_comp(const void *v1, const void *v2)
{
const mca_pml_bfo_com_btl_t *b1 = (const mca_pml_bfo_com_btl_t *) v1;
const mca_pml_bfo_com_btl_t *b2 = (const mca_pml_bfo_com_btl_t *) v2;
if(b1->bml_btl->btl_weight < b2->bml_btl->btl_weight)
return 1;
if(b1->bml_btl->btl_weight > b2->bml_btl->btl_weight)
return -1;
return 0;
}


@@ -1,362 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_PML_BFO_H
#define MCA_PML_BFO_H
#include "ompi_config.h"
#include "opal/class/opal_free_list.h"
#include "ompi/request/request.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/pml/base/pml_base_request.h"
#include "ompi/mca/pml/base/pml_base_bsend.h"
#include "ompi/mca/pml/base/pml_base_sendreq.h"
#include "ompi/datatype/ompi_datatype.h"
#include "pml_bfo_hdr.h"
#include "ompi/mca/bml/base/base.h"
#include "ompi/proc/proc.h"
#include "opal/mca/allocator/base/base.h"
BEGIN_C_DECLS
/**
* BFO PML module
*/
struct mca_pml_bfo_t {
mca_pml_base_module_t super;
int priority;
int free_list_num; /* initial size of free list */
int free_list_max; /* maximum size of free list */
int free_list_inc; /* number of elements to grow free list */
unsigned int send_pipeline_depth;
unsigned int recv_pipeline_depth;
unsigned int rdma_put_retries_limit;
int max_rdma_per_request;
int max_send_per_range;
bool leave_pinned;
int leave_pinned_pipeline;
/* lock queue access */
opal_mutex_t lock;
/* free lists */
opal_free_list_t rdma_frags;
opal_free_list_t recv_frags;
opal_free_list_t pending_pckts;
opal_free_list_t buffers;
opal_free_list_t send_ranges;
/* list of pending operations */
opal_list_t pckt_pending;
opal_list_t send_pending;
opal_list_t recv_pending;
opal_list_t rdma_pending;
/* List of pending fragments without a matching communicator */
opal_list_t non_existing_communicator_pending;
bool enabled;
char* allocator_name;
mca_allocator_base_module_t* allocator;
unsigned int unexpected_limit;
};
typedef struct mca_pml_bfo_t mca_pml_bfo_t;
extern mca_pml_bfo_t mca_pml_bfo;
extern int mca_pml_bfo_output;
/*
* PML interface functions.
*/
extern int mca_pml_bfo_add_comm(
struct ompi_communicator_t* comm
);
extern int mca_pml_bfo_del_comm(
struct ompi_communicator_t* comm
);
extern int mca_pml_bfo_add_procs(
struct ompi_proc_t **procs,
size_t nprocs
);
extern int mca_pml_bfo_del_procs(
struct ompi_proc_t **procs,
size_t nprocs
);
extern int mca_pml_bfo_enable( bool enable );
extern int mca_pml_bfo_progress(void);
extern int mca_pml_bfo_iprobe( int dst,
int tag,
struct ompi_communicator_t* comm,
int *matched,
ompi_status_public_t* status );
extern int mca_pml_bfo_probe( int dst,
int tag,
struct ompi_communicator_t* comm,
ompi_status_public_t* status );
extern int mca_pml_bfo_improbe( int dst,
int tag,
struct ompi_communicator_t* comm,
int *matched,
struct ompi_message_t **message,
ompi_status_public_t* status );
extern int mca_pml_bfo_mprobe( int dst,
int tag,
struct ompi_communicator_t* comm,
struct ompi_message_t **message,
ompi_status_public_t* status );
extern int mca_pml_bfo_isend_init( void *buf,
size_t count,
ompi_datatype_t *datatype,
int dst,
int tag,
mca_pml_base_send_mode_t mode,
struct ompi_communicator_t* comm,
struct ompi_request_t **request );
extern int mca_pml_bfo_isend( void *buf,
size_t count,
ompi_datatype_t *datatype,
int dst,
int tag,
mca_pml_base_send_mode_t mode,
struct ompi_communicator_t* comm,
struct ompi_request_t **request );
extern int mca_pml_bfo_send( void *buf,
size_t count,
ompi_datatype_t *datatype,
int dst,
int tag,
mca_pml_base_send_mode_t mode,
struct ompi_communicator_t* comm );
extern int mca_pml_bfo_irecv_init( void *buf,
size_t count,
ompi_datatype_t *datatype,
int src,
int tag,
struct ompi_communicator_t* comm,
struct ompi_request_t **request );
extern int mca_pml_bfo_irecv( void *buf,
size_t count,
ompi_datatype_t *datatype,
int src,
int tag,
struct ompi_communicator_t* comm,
struct ompi_request_t **request );
extern int mca_pml_bfo_recv( void *buf,
size_t count,
ompi_datatype_t *datatype,
int src,
int tag,
struct ompi_communicator_t* comm,
ompi_status_public_t* status );
extern int mca_pml_bfo_imrecv( void *buf,
size_t count,
ompi_datatype_t *datatype,
struct ompi_message_t **message,
struct ompi_request_t **request );
extern int mca_pml_bfo_mrecv( void *buf,
size_t count,
ompi_datatype_t *datatype,
struct ompi_message_t **message,
ompi_status_public_t* status );
extern int mca_pml_bfo_dump( struct ompi_communicator_t* comm,
int verbose );
extern int mca_pml_bfo_start( size_t count,
ompi_request_t** requests );
extern int mca_pml_bfo_ft_event( int state );
END_C_DECLS
struct mca_pml_bfo_pckt_pending_t {
opal_free_list_item_t super;
ompi_proc_t* proc;
mca_pml_bfo_hdr_t hdr;
struct mca_bml_base_btl_t *bml_btl;
uint8_t order;
};
typedef struct mca_pml_bfo_pckt_pending_t mca_pml_bfo_pckt_pending_t;
OBJ_CLASS_DECLARATION(mca_pml_bfo_pckt_pending_t);
#define MCA_PML_BFO_PCKT_PENDING_ALLOC(pckt) \
do { \
opal_free_list_item_t* item; \
OPAL_FREE_LIST_WAIT(&mca_pml_bfo.pending_pckts, item); \
pckt = (mca_pml_bfo_pckt_pending_t*)item; \
} while (0)
#define MCA_PML_BFO_PCKT_PENDING_RETURN(pckt) \
do { \
/* return packet */ \
OPAL_FREE_LIST_RETURN(&mca_pml_bfo.pending_pckts, \
(opal_free_list_item_t*)pckt); \
} while(0)
#define MCA_PML_BFO_ADD_FIN_TO_PENDING(P, D, B, O, S) \
do { \
mca_pml_bfo_pckt_pending_t *_pckt; \
\
MCA_PML_BFO_PCKT_PENDING_ALLOC(_pckt); \
_pckt->hdr.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FIN; \
_pckt->hdr.hdr_fin.hdr_des = (D); \
_pckt->hdr.hdr_fin.hdr_fail = (S); \
_pckt->proc = (P); \
_pckt->bml_btl = (B); \
_pckt->order = (O); \
OPAL_THREAD_LOCK(&mca_pml_bfo.lock); \
opal_list_append(&mca_pml_bfo.pckt_pending, \
(opal_list_item_t*)_pckt); \
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); \
} while(0)
int mca_pml_bfo_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
#if PML_BFO
opal_ptr_t hdr_des, uint8_t order, uint32_t status,
uint16_t seq, uint8_t reqseq, uint16_t ctx, uint32_t src);
#else /* PML_BFO */
opal_ptr_t hdr_des, uint8_t order, uint32_t status);
#endif /* PML_BFO */
/* This function tries to resend FIN/ACK packets from the pckt_pending queue.
* Packets are added to the queue when sending of a FIN or ACK fails due to
* resource unavailability. The bml_btl passed to the function doesn't represent
* the packet's destination; it represents the BTL on which a resource was freed,
* so only this BTL should be considered for resending packets */
void mca_pml_bfo_process_pending_packets(mca_bml_base_btl_t* bml_btl);
/* This function retries failed PUT/GET operations on frag. When RDMA operation
* cannot be accomplished for some reason, frag is put on the rdma_pending list.
* Later the operation is retried. The destination of RDMA operation is stored
* inside the frag structure */
void mca_pml_bfo_process_pending_rdma(void);
#define MCA_PML_BFO_PROGRESS_PENDING(bml_btl) \
do { \
if(opal_list_get_size(&mca_pml_bfo.pckt_pending)) \
mca_pml_bfo_process_pending_packets(bml_btl); \
if(opal_list_get_size(&mca_pml_bfo.recv_pending)) \
mca_pml_bfo_recv_request_process_pending(); \
if(opal_list_get_size(&mca_pml_bfo.send_pending)) \
mca_pml_bfo_send_request_process_pending(bml_btl); \
if(opal_list_get_size(&mca_pml_bfo.rdma_pending)) \
mca_pml_bfo_process_pending_rdma(); \
} while (0)
/*
* Compute the total number of bytes on supplied descriptor
*/
static inline int mca_pml_bfo_compute_segment_length (size_t seg_size, void *segments, size_t count,
size_t hdrlen) {
size_t i, length;
for (i = 0, length = -hdrlen ; i < count ; ++i) {
mca_btl_base_segment_t *segment =
(mca_btl_base_segment_t *)((char *) segments + i * seg_size);
length += segment->seg_len;
}
return length;
}
static inline int mca_pml_bfo_compute_segment_length_base (mca_btl_base_segment_t *segments,
size_t count, size_t hdrlen) {
size_t i, length;
for (i = 0, length = -hdrlen ; i < count ; ++i) {
length += segments[i].seg_len;
}
return length;
}
/* represent BTL chosen for sending request */
struct mca_pml_bfo_com_btl_t {
mca_bml_base_btl_t *bml_btl;
struct mca_mpool_base_registration_t* btl_reg;
size_t length;
};
typedef struct mca_pml_bfo_com_btl_t mca_pml_bfo_com_btl_t;
int mca_pml_bfo_com_btl_comp(const void *v1, const void *v2);
/* Calculate what percentage of a message to send through each BTL according to
* relative weight */
static inline void
mca_pml_bfo_calc_weighted_length( mca_pml_bfo_com_btl_t *btls, int num_btls, size_t size,
double weight_total )
{
int i;
size_t length_left;
/* shortcut for common case for only one BTL */
if( OPAL_LIKELY(1 == num_btls) ) {
btls[0].length = size;
return;
}
/* sort BTLs according of their weights so BTLs with smaller weight will
* not hijack all of the traffic */
qsort( btls, num_btls, sizeof(mca_pml_bfo_com_btl_t),
mca_pml_bfo_com_btl_comp );
for(length_left = size, i = 0; i < num_btls; i++) {
mca_bml_base_btl_t* bml_btl = btls[i].bml_btl;
size_t length = 0;
if( OPAL_UNLIKELY(0 != length_left) ) {
length = (length_left > bml_btl->btl->btl_eager_limit)?
((size_t)(size * (bml_btl->btl_weight / weight_total))) :
length_left;
if(length > length_left)
length = length_left;
length_left -= length;
}
btls[i].length = length;
}
/* account for rounding errors */
btls[0].length += length_left;
}
#endif


@@ -1,100 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <string.h>
#include "pml_bfo.h"
#include "pml_bfo_comm.h"
static void mca_pml_bfo_comm_proc_construct(mca_pml_bfo_comm_proc_t* proc)
{
proc->expected_sequence = 1;
proc->ompi_proc = NULL;
proc->send_sequence = 0;
OBJ_CONSTRUCT(&proc->frags_cant_match, opal_list_t);
OBJ_CONSTRUCT(&proc->specific_receives, opal_list_t);
OBJ_CONSTRUCT(&proc->unexpected_frags, opal_list_t);
}
static void mca_pml_bfo_comm_proc_destruct(mca_pml_bfo_comm_proc_t* proc)
{
OBJ_DESTRUCT(&proc->frags_cant_match);
OBJ_DESTRUCT(&proc->specific_receives);
OBJ_DESTRUCT(&proc->unexpected_frags);
}
static OBJ_CLASS_INSTANCE(
mca_pml_bfo_comm_proc_t,
opal_object_t,
mca_pml_bfo_comm_proc_construct,
mca_pml_bfo_comm_proc_destruct);
static void mca_pml_bfo_comm_construct(mca_pml_bfo_comm_t* comm)
{
OBJ_CONSTRUCT(&comm->wild_receives, opal_list_t);
OBJ_CONSTRUCT(&comm->matching_lock, opal_mutex_t);
comm->recv_sequence = 0;
comm->procs = NULL;
comm->last_probed = 0;
comm->num_procs = 0;
}
static void mca_pml_bfo_comm_destruct(mca_pml_bfo_comm_t* comm)
{
size_t i;
for(i=0; i<comm->num_procs; i++)
OBJ_DESTRUCT((&comm->procs[i]));
if(NULL != comm->procs)
free(comm->procs);
OBJ_DESTRUCT(&comm->wild_receives);
OBJ_DESTRUCT(&comm->matching_lock);
}
OBJ_CLASS_INSTANCE(
mca_pml_bfo_comm_t,
opal_object_t,
mca_pml_bfo_comm_construct,
mca_pml_bfo_comm_destruct);
int mca_pml_bfo_comm_init_size(mca_pml_bfo_comm_t* comm, size_t size)
{
size_t i;
/* send message sequence-number support - sender side */
comm->procs = (mca_pml_bfo_comm_proc_t*)malloc(sizeof(mca_pml_bfo_comm_proc_t)*size);
if(NULL == comm->procs) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
for(i=0; i<size; i++) {
OBJ_CONSTRUCT(comm->procs+i, mca_pml_bfo_comm_proc_t);
}
comm->num_procs = size;
return OMPI_SUCCESS;
}


@@ -1,81 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_PML_BFO_COMM_H
#define MCA_PML_BFO_COMM_H
#include "opal/threads/mutex.h"
#include "opal/class/opal_list.h"
#include "ompi/proc/proc.h"
BEGIN_C_DECLS
struct mca_pml_bfo_comm_proc_t {
opal_object_t super;
uint16_t expected_sequence; /**< send message sequence number - receiver side */
struct ompi_proc_t* ompi_proc;
#if OPAL_ENABLE_MULTI_THREADS
volatile int32_t send_sequence; /**< send side sequence number */
#else
int32_t send_sequence; /**< send side sequence number */
#endif
opal_list_t frags_cant_match; /**< out-of-order fragment queues */
opal_list_t specific_receives; /**< queues of unmatched specific receives */
opal_list_t unexpected_frags; /**< unexpected fragment queues */
};
typedef struct mca_pml_bfo_comm_proc_t mca_pml_bfo_comm_proc_t;
/**
* Cached on ompi_communicator_t to hold queues/state
* used by the PML<->PTL interface for matching logic.
*/
struct mca_pml_comm_t {
opal_object_t super;
#if OPAL_ENABLE_MULTI_THREADS
volatile uint32_t recv_sequence; /**< recv request sequence number - receiver side */
#else
uint32_t recv_sequence; /**< recv request sequence number - receiver side */
#endif
opal_mutex_t matching_lock; /**< matching lock */
opal_list_t wild_receives; /**< queue of unmatched wild (source process not specified) receives */
mca_pml_bfo_comm_proc_t* procs;
size_t num_procs;
size_t last_probed;
};
typedef struct mca_pml_comm_t mca_pml_bfo_comm_t;
OBJ_CLASS_DECLARATION(mca_pml_bfo_comm_t);
/**
* Initialize an instance of mca_pml_bfo_comm_t based on the communicator size.
*
* @param comm Instance of mca_pml_bfo_comm_t
* @param size Size of communicator
* @return OMPI_SUCCESS or error status on failure.
*/
extern int mca_pml_bfo_comm_init_size(mca_pml_bfo_comm_t* comm, size_t size);
END_C_DECLS
#endif
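
To make the matching state above concrete, a hypothetical sketch (not code from this tree) of how the two sequence fields cooperate: the sender stamps each match header from its per-peer send_sequence, while the receiver matches fragments in order against expected_sequence and parks out-of-order arrivals on frags_cant_match:

/* sender side: stamp the next 16-bit sequence number for this peer */
hdr->hdr_seq = (uint16_t)OPAL_THREAD_ADD_FETCH32(&proc->send_sequence, 1);

/* receiver side: only the expected sequence may be matched now */
if ((uint16_t)hdr->hdr_seq != proc->expected_sequence) {
    opal_list_append(&proc->frags_cant_match, (opal_list_item_t *)frag);
} else {
    proc->expected_sequence++;
    /* try specific_receives for this peer, then the communicator-wide
     * wild_receives; unmatched fragments land on unexpected_frags */
}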

View file

@ -1,274 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "opal/mca/event/event.h"
#include "mpi.h"
#include "ompi/runtime/params.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/pml/base/pml_base_bsend.h"
#include "pml_bfo.h"
#include "pml_bfo_hdr.h"
#include "pml_bfo_sendreq.h"
#include "pml_bfo_recvreq.h"
#include "pml_bfo_rdmafrag.h"
#include "pml_bfo_recvfrag.h"
#include "ompi/mca/bml/base/base.h"
#include "pml_bfo_component.h"
#include "opal/mca/allocator/base/base.h"
#include "opal/runtime/opal_params.h"
OBJ_CLASS_INSTANCE( mca_pml_bfo_pckt_pending_t,
ompi_free_list_item_t,
NULL,
NULL );
static int mca_pml_bfo_component_register(void);
static int mca_pml_bfo_component_open(void);
static int mca_pml_bfo_component_close(void);
static mca_pml_base_module_t*
mca_pml_bfo_component_init( int* priority, bool enable_progress_threads,
bool enable_mpi_threads );
static int mca_pml_bfo_component_fini(void);
int mca_pml_bfo_output = 0;
static int mca_pml_bfo_verbose = 0;
mca_pml_base_component_2_0_0_t mca_pml_bfo_component = {
/* First, the mca_base_component_t struct containing meta
information about the component itself */
.pmlm_version = {
MCA_PML_BASE_VERSION_2_0_0,
.mca_component_name = "bfo",
MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
OMPI_RELEASE_VERSION),
.mca_open_component = mca_pml_bfo_component_open,
.mca_close_component = mca_pml_bfo_component_close,
.mca_register_component_params = mca_pml_bfo_component_register,
},
.pmlm_data = {
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
.pmlm_init = mca_pml_bfo_component_init,
.pmlm_finalize = mca_pml_bfo_component_fini,
};
void *mca_pml_bfo_seg_alloc( struct mca_mpool_base_module_t* mpool,
size_t* size,
mca_mpool_base_registration_t** registration);
void mca_pml_bfo_seg_free( struct mca_mpool_base_module_t* mpool,
void* segment );
static inline int mca_pml_bfo_param_register_int(
const char* param_name,
int default_value,
int *storage)
{
*storage = default_value;
(void) mca_base_component_var_register(&mca_pml_bfo_component.pmlm_version, param_name,
NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, storage);
return *storage;
}
static inline unsigned int mca_pml_bfo_param_register_uint(
const char* param_name,
unsigned int default_value,
unsigned int *storage)
{
*storage = default_value;
(void) mca_base_component_var_register(&mca_pml_bfo_component.pmlm_version, param_name,
NULL, MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, storage);
return *storage;
}
static int mca_pml_bfo_component_register(void)
{
int default_priority;
#if PML_BFO
default_priority = 5;
#else /* PML_BFO */
default_priority = 20;
mca_pml_bfo_param_register_int("priority", 20);
#endif /* PML_BFO */
(void) mca_pml_bfo_param_register_int("verbose", 0, &mca_pml_bfo_verbose);
(void) mca_pml_bfo_param_register_int("free_list_num", 4, &mca_pml_bfo.free_list_num);
(void) mca_pml_bfo_param_register_int("free_list_max", -1, &mca_pml_bfo.free_list_max);
(void) mca_pml_bfo_param_register_int("free_list_inc", 64, &mca_pml_bfo.free_list_inc);
(void) mca_pml_bfo_param_register_int("priority", default_priority, &mca_pml_bfo.priority);
(void) mca_pml_bfo_param_register_uint("send_pipeline_depth", 3, &mca_pml_bfo.send_pipeline_depth);
(void) mca_pml_bfo_param_register_uint("recv_pipeline_depth", 4, &mca_pml_bfo.recv_pipeline_depth);
(void) mca_pml_bfo_param_register_uint("rdma_put_retries_limit", 5, &mca_pml_bfo.rdma_put_retries_limit);
(void) mca_pml_bfo_param_register_int("max_rdma_per_request", 4, &mca_pml_bfo.max_rdma_per_request);
(void) mca_pml_bfo_param_register_int("max_send_per_range", 4, &mca_pml_bfo.max_send_per_range);
(void) mca_pml_bfo_param_register_uint("unexpected_limit", 128, &mca_pml_bfo.unexpected_limit);
mca_pml_bfo.allocator_name = "bucket";
(void) mca_base_component_var_register(&mca_pml_bfo_component.pmlm_version,
"allocator",
"Name of allocator component for unexpected messages",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_pml_bfo.allocator_name);
return OMPI_SUCCESS;
}
static int mca_pml_bfo_component_open(void)
{
mca_pml_bfo_output = opal_output_open(NULL);
opal_output_set_verbosity(mca_pml_bfo_output, mca_pml_bfo_verbose);
mca_pml_bfo.enabled = false;
return mca_base_framework_open(&ompi_bml_base_framework, 0);
}
static int mca_pml_bfo_component_close(void)
{
int rc;
if (OMPI_SUCCESS != (rc = mca_base_framework_close(&ompi_bml_base_framework))) {
return rc;
}
opal_output_close(mca_pml_bfo_output);
return OMPI_SUCCESS;
}
static mca_pml_base_module_t*
mca_pml_bfo_component_init( int* priority,
bool enable_progress_threads,
bool enable_mpi_threads )
{
mca_allocator_base_component_t* allocator_component;
opal_output_verbose( 10, mca_pml_bfo_output,
"in bfo, my priority is %d\n", mca_pml_bfo.priority);
if((*priority) > mca_pml_bfo.priority) {
*priority = mca_pml_bfo.priority;
return NULL;
}
*priority = mca_pml_bfo.priority;
allocator_component = mca_allocator_component_lookup( mca_pml_bfo.allocator_name );
if(NULL == allocator_component) {
opal_output(0, "mca_pml_bfo_component_init: can't find allocator: %s\n", mca_pml_bfo.allocator_name);
return NULL;
}
mca_pml_bfo.allocator = allocator_component->allocator_init(true,
mca_pml_bfo_seg_alloc,
mca_pml_bfo_seg_free, NULL);
if(NULL == mca_pml_bfo.allocator) {
opal_output(0, "mca_pml_bfo_component_init: unable to initialize allocator\n");
return NULL;
}
if(OMPI_SUCCESS != mca_bml_base_init( enable_progress_threads,
enable_mpi_threads)) {
return NULL;
}
/* Set this here (vs in component_open()) because
opal_leave_pinned* may have been set after MCA params were
read (e.g., by the openib btl) */
mca_pml_bfo.leave_pinned = (1 == opal_leave_pinned);
mca_pml_bfo.leave_pinned_pipeline = (int) opal_leave_pinned_pipeline;
return &mca_pml_bfo.super;
}
int mca_pml_bfo_component_fini(void)
{
int rc;
/* Shutdown BML */
if(OMPI_SUCCESS != (rc = mca_bml.bml_finalize()))
return rc;
if(!mca_pml_bfo.enabled)
return OMPI_SUCCESS; /* never selected, so return success */
mca_pml_bfo.enabled = false; /* not anymore */
OBJ_DESTRUCT(&mca_pml_bfo.rdma_pending);
OBJ_DESTRUCT(&mca_pml_bfo.pckt_pending);
OBJ_DESTRUCT(&mca_pml_bfo.recv_pending);
OBJ_DESTRUCT(&mca_pml_bfo.send_pending);
OBJ_DESTRUCT(&mca_pml_bfo.non_existing_communicator_pending);
OBJ_DESTRUCT(&mca_pml_bfo.buffers);
OBJ_DESTRUCT(&mca_pml_bfo.pending_pckts);
OBJ_DESTRUCT(&mca_pml_bfo.recv_frags);
OBJ_DESTRUCT(&mca_pml_bfo.rdma_frags);
OBJ_DESTRUCT(&mca_pml_bfo.lock);
if(OMPI_SUCCESS != (rc = mca_pml_bfo.allocator->alc_finalize(mca_pml_bfo.allocator))) {
return rc;
}
#if 0
if (mca_pml_base_send_requests.fl_num_allocated !=
mca_pml_base_send_requests.super.opal_list_length) {
opal_output(0, "bfo send requests: %d allocated %d returned\n",
mca_pml_base_send_requests.fl_num_allocated,
mca_pml_base_send_requests.super.opal_list_length);
}
if (mca_pml_base_recv_requests.fl_num_allocated !=
mca_pml_base_recv_requests.super.opal_list_length) {
opal_output(0, "bfo recv requests: %d allocated %d returned\n",
mca_pml_base_recv_requests.fl_num_allocated,
mca_pml_base_recv_requests.super.opal_list_length);
}
#endif
return OMPI_SUCCESS;
}
void *mca_pml_bfo_seg_alloc( struct mca_mpool_base_module_t* mpool,
size_t* size,
mca_mpool_base_registration_t** registration) {
return malloc(*size);
}
void mca_pml_bfo_seg_free( struct mca_mpool_base_module_t* mpool,
void* segment ) {
free(segment);
}
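
All of the parameters registered above follow the standard MCA naming scheme (framework_component_param), so their defaults can be overridden at launch time; the values below are illustrative only:

mpirun --mca pml bfo --mca pml_bfo_priority 30 --mca pml_bfo_send_pipeline_depth 4 ./a.out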

View file

@ -1,33 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_PML_BFO_COMPONENT_H
#define MCA_PML_BFO_COMPONENT_H
BEGIN_C_DECLS
/*
* PML module functions.
*/
OMPI_MODULE_DECLSPEC extern mca_pml_base_component_2_0_0_t mca_pml_bfo_component;
END_C_DECLS
#endif

View file

@ -1,157 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "opal/prefetch.h"
#include "opal/mca/btl/btl.h"
#include "opal/mca/mpool/mpool.h"
#include "ompi/constants.h"
#include "ompi/mca/pml/pml.h"
#include "pml_bfo.h"
#include "pml_bfo_hdr.h"
#include "pml_bfo_rdmafrag.h"
#include "pml_bfo_recvreq.h"
#include "pml_bfo_sendreq.h"
#include "ompi/mca/bml/base/base.h"
#include "ompi/memchecker.h"
size_t mca_pml_bfo_rdma_cuda_btls(
mca_bml_base_endpoint_t* bml_endpoint,
unsigned char* base,
size_t size,
mca_pml_bfo_com_btl_t* rdma_btls);
int mca_pml_bfo_cuda_need_buffers(void * rreq,
mca_btl_base_module_t* btl);
/**
* Handle the CUDA buffer.
*/
int mca_pml_bfo_send_request_start_cuda(mca_pml_bfo_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl,
size_t size) {
int rc;
sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {
unsigned char *base;
opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base );
/* Set flag back */
sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA;
if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_bfo_rdma_cuda_btls(
sendreq->req_endpoint,
base,
sendreq->req_send.req_bytes_packed,
sendreq->req_rdma))) {
rc = mca_pml_bfo_send_request_start_rdma(sendreq, bml_btl,
sendreq->req_send.req_bytes_packed);
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
mca_pml_bfo_free_rdma_resources(sendreq);
}
} else {
if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_PUT) {
rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size,
MCA_PML_BFO_HDR_FLAGS_CONTIG);
} else {
rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size, 0);
}
}
} else {
/* Do not send anything with first rendezvous message as copying GPU
* memory into RNDV message is expensive. */
sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA;
rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, 0, 0);
}
return rc;
}
size_t mca_pml_bfo_rdma_cuda_btls(
mca_bml_base_endpoint_t* bml_endpoint,
unsigned char* base,
size_t size,
mca_pml_bfo_com_btl_t* rdma_btls)
{
int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
double weight_total = 0;
int num_btls_used = 0, n;
/* shortcut when there are no rdma capable btls */
if(num_btls == 0) {
return 0;
}
/* check to see if memory is registered */
for(n = 0; n < num_btls && num_btls_used < mca_pml_bfo.max_rdma_per_request;
n++) {
mca_bml_base_btl_t* bml_btl =
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n);
if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) {
mca_mpool_base_registration_t* reg = NULL;
mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;
if( NULL != btl_mpool ) {
/* register the memory */
btl_mpool->mpool_register(btl_mpool, base, size, 0, &reg);
}
if(NULL == reg)
continue;
rdma_btls[num_btls_used].bml_btl = bml_btl;
rdma_btls[num_btls_used].btl_reg = reg;
weight_total += bml_btl->btl_weight;
num_btls_used++;
}
}
/* if we don't use leave_pinned and all BTLs that already have this memory
* registered amount to less than half of available bandwidth - fall back to
* pipeline protocol */
if(0 == num_btls_used || (!mca_pml_bfo.leave_pinned && weight_total < 0.5))
return 0;
mca_pml_bfo_calc_weighted_length(rdma_btls, num_btls_used, size,
weight_total);
return num_btls_used;
}
int mca_pml_bfo_cuda_need_buffers(void * rreq,
mca_btl_base_module_t* btl)
{
mca_pml_bfo_recv_request_t* recvreq = (mca_pml_bfo_recv_request_t*)rreq;
if ((recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA) &&
(btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET)) {
recvreq->req_recv.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) {
recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA;
return true;
} else {
recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA;
return false;
}
}
return true;
}
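
The weight test above is what decides between RDMA and the pipeline protocol. A standalone model of that decision, with hypothetical values (not tree code):

#include <stdbool.h>
#include <stddef.h>

/* Model of the fallback test in mca_pml_bfo_rdma_cuda_btls above. */
static size_t rdma_or_pipeline(int num_btls_used, double weight_total,
                               bool leave_pinned)
{
    if (0 == num_btls_used || (!leave_pinned && weight_total < 0.5))
        return 0;                  /* fall back to the pipeline protocol */
    return (size_t)num_btls_used;  /* RDMA across the registered BTLs */
}

/* rdma_or_pipeline(1, 0.4, false) == 0: the only CUDA-GET-capable BTL with
 * the buffer registered carries under half the available bandwidth, so
 * copying through pipeline buffers is expected to win. */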

The diff for this file is not shown because it is too large. Load diff

View file

@ -1,398 +0,0 @@
/*
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
* Functions that implement failover capabilities.
*/
#ifndef MCA_PML_BFO_FAILOVER_H
#define MCA_PML_BFO_FAILOVER_H
#include "opal/mca/btl/btl.h"
#include "pml_bfo_hdr.h"
BEGIN_C_DECLS
bool mca_pml_bfo_is_duplicate_msg(mca_pml_bfo_comm_proc_t* proc,
mca_pml_bfo_match_hdr_t *hdr);
bool mca_pml_bfo_is_duplicate_fin(mca_pml_bfo_hdr_t* hdr, mca_btl_base_descriptor_t* rdma,
mca_btl_base_module_t* btl);
mca_pml_bfo_recv_request_t* mca_pml_bfo_get_request(mca_pml_bfo_match_hdr_t *hdr);
void mca_pml_bfo_send_request_restart(mca_pml_bfo_send_request_t* sendreq,
bool repost, mca_btl_base_tag_t tag);
void mca_pml_bfo_send_request_rndvrestartnotify(mca_pml_bfo_send_request_t* sendreq,
bool repost, mca_btl_base_tag_t tag, int status,
mca_btl_base_module_t* btl);
void
mca_pml_bfo_rndvrestartnotify_completion(mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_descriptor_t* des,
int status);
void
mca_pml_bfo_check_recv_ctl_completion_status(mca_btl_base_module_t* btl,
struct mca_btl_base_descriptor_t* des,
int status);
/* Reset a receive request to the beginning */
void mca_pml_bfo_recv_request_reset(mca_pml_bfo_recv_request_t* recvreq);
/* Notify sender that receiver detected an error */
void mca_pml_bfo_recv_request_recverrnotify(mca_pml_bfo_recv_request_t* recvreq,
mca_btl_base_tag_t tag, int status);
/* Ack the RNDVRESTARTNOTIFY message */
void mca_pml_bfo_recv_request_rndvrestartack(mca_pml_bfo_recv_request_t* recvreq,
mca_btl_base_tag_t tag, int status,
mca_btl_base_module_t* btl);
/* Nack the RNDVRESTARTNOTIFY message */
void mca_pml_bfo_recv_request_rndvrestartnack(mca_btl_base_descriptor_t* olddes,
ompi_proc_t* ompi_proc, bool repost);
void mca_pml_bfo_recv_restart_completion(mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_descriptor_t* des,
int status);
void mca_pml_bfo_failover_error_handler(struct mca_btl_base_module_t* btl,
int32_t flags, ompi_proc_t *errproc, char *btlname);
void mca_pml_bfo_repost_match_fragment(struct mca_btl_base_descriptor_t* des);
void mca_pml_bfo_repost_fin(struct mca_btl_base_descriptor_t* des);
void mca_pml_bfo_map_out_btl(struct mca_btl_base_module_t* btl,
ompi_proc_t *errproc, char *btlname);
extern void mca_pml_bfo_map_out( mca_btl_base_module_t *btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* descriptor,
void* cbdata );
int mca_pml_bfo_register_callbacks(void);
void mca_pml_bfo_update_rndv_fields(mca_pml_bfo_hdr_t* hdr,
mca_pml_bfo_send_request_t*, char *type);
void mca_pml_bfo_update_bml_btl(mca_bml_base_btl_t** bml_btl, mca_btl_base_module_t* btl,
struct mca_btl_base_descriptor_t* des);
void mca_pml_bfo_find_recvreq_eager_bml_btl(mca_bml_base_btl_t** bml_btl,
mca_btl_base_module_t* btl,
mca_pml_bfo_recv_request_t* recvreq,
char* type);
void mca_pml_bfo_find_sendreq_eager_bml_btl(mca_bml_base_btl_t** bml_btl,
mca_btl_base_module_t* btl,
mca_pml_bfo_send_request_t* sendreq,
char* type);
void mca_pml_bfo_find_sendreq_rdma_bml_btl(mca_bml_base_btl_t** bml_btl,
mca_btl_base_module_t* btl,
mca_pml_bfo_send_request_t* sendreq,
char* type);
void mca_pml_bfo_update_eager_bml_btl_recv_ctl(mca_bml_base_btl_t** bml_btl,
mca_btl_base_module_t* btl,
struct mca_btl_base_descriptor_t* des);
void mca_pml_bfo_find_recvreq_rdma_bml_btl(mca_bml_base_btl_t** bml_btl,
mca_btl_base_module_t* btl,
mca_pml_bfo_recv_request_t* recvreq,
char* type);
bool mca_pml_bfo_rndv_completion_status_error(struct mca_btl_base_descriptor_t* des,
mca_pml_bfo_send_request_t* sendreq);
void mca_pml_bfo_send_ctl_completion_status_error(struct mca_btl_base_descriptor_t* des);
void mca_pml_bfo_completion_sendreq_has_error(mca_pml_bfo_send_request_t* sendreq,
int status,
mca_btl_base_module_t* btl,
int type,
char *description);
/**
* Four new callbacks for the four new message types.
*/
extern void mca_pml_bfo_recv_frag_callback_rndvrestartnotify( mca_btl_base_module_t *btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* descriptor,
void* cbdata );
extern void mca_pml_bfo_recv_frag_callback_rndvrestartack( mca_btl_base_module_t *btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* descriptor,
void* cbdata );
extern void mca_pml_bfo_recv_frag_callback_rndvrestartnack( mca_btl_base_module_t *btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* descriptor,
void* cbdata );
extern void mca_pml_bfo_recv_frag_callback_recverrnotify( mca_btl_base_module_t *btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* descriptor,
void* cbdata );
/**
* A bunch of macros to help isolate failover code from regular ob1 code.
*/
/* Drop any ACK fragments if request is in error state. Do not want
* to initiate any more activity. */
#define MCA_PML_BFO_ERROR_CHECK_ON_ACK_CALLBACK(sendreq) \
if( OPAL_UNLIKELY((sendreq)->req_error)) { \
opal_output_verbose(20, mca_pml_bfo_output, \
"ACK: received: dropping because request in error, " \
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", \
(uint16_t)(sendreq)->req_send.req_base.req_sequence, \
(sendreq)->req_restartseq, \
(void *)(sendreq), (sendreq)->req_recv.pval, \
(sendreq)->req_send.req_base.req_peer); \
return; \
}
/* Drop any FRAG fragments if request is in error state. Do not want
* to initiate any more activity. */
#define MCA_PML_BFO_ERROR_CHECK_ON_FRAG_CALLBACK(recvreq) \
if( OPAL_UNLIKELY((recvreq)->req_errstate)) { \
opal_output_verbose(20, mca_pml_bfo_output, \
"FRAG: received: dropping because request in error, " \
"PML=%d, src_req=%p, dst_req=%p, peer=%d, offset=%d", \
(uint16_t)(recvreq)->req_msgseq, \
(recvreq)->remote_req_send.pval, \
(void *)(recvreq), \
(recvreq)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, \
(int)hdr->hdr_frag.hdr_frag_offset); \
return; \
}
/* Drop any PUT fragments if request is in error state. Do not want
* to initiate any more activity. */
#define MCA_PML_BFO_ERROR_CHECK_ON_PUT_CALLBACK(sendreq) \
if( OPAL_UNLIKELY((sendreq)->req_error)) { \
opal_output_verbose(20, mca_pml_bfo_output, \
"PUT: received: dropping because request in error, " \
"PML=%d, src_req=%p, dst_req=%p, peer=%d", \
(uint16_t)(sendreq)->req_send.req_base.req_sequence, \
(void *)(sendreq), (sendreq)->req_recv.pval, \
(sendreq)->req_send.req_base.req_peer); \
return; \
}
/**
* Macros for pml_bfo_recvreq.c file.
*/
/* This can happen if a FIN message arrives after the request was
* marked in error. So, just drop the message. Note that the status
* field is not being checked. That is because the status field is the
* value returned in the FIN hdr.hdr_fail field and may be used for
* other things. Note that we allow the various fields to be updated
* in case this actually completes the request and the sending side
* thinks it is done. */
#define MCA_PML_BFO_ERROR_CHECK_ON_FIN_FOR_PUT(recvreq) \
if( OPAL_UNLIKELY((recvreq)->req_errstate)) { \
opal_output_verbose(20, mca_pml_bfo_output, \
"FIN: received on broken request, skipping, " \
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", \
(recvreq)->req_msgseq, (recvreq)->req_restartseq, \
(recvreq)->remote_req_send.pval, (void *)(recvreq), \
(recvreq)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); \
/* Even though in error, it still might complete. */ \
recv_request_pml_complete_check(recvreq); \
return; \
}
#define MCA_PML_BFO_ERROR_CHECK_ON_RDMA_READ_COMPLETION(recvreq) \
if ((recvreq)->req_errstate) { \
opal_output_verbose(30, mca_pml_bfo_output, \
"RDMA read: completion failed, error already seen, " \
"PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, peer=%d", \
(recvreq)->req_msgseq, (recvreq)->req_restartseq, \
(unsigned long)(recvreq)->remote_req_send.pval, \
(unsigned long)(recvreq), \
(recvreq)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); \
return; \
} else { \
opal_output_verbose(30, mca_pml_bfo_output, \
"RDMA read: completion failed, sending RECVERRNOTIFY to " \
"sender, PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, peer=%d", \
(recvreq)->req_msgseq, (recvreq)->req_restartseq, \
(unsigned long)(recvreq)->remote_req_send.pval, \
(unsigned long)(recvreq), \
(recvreq)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); \
mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_RGET, status); \
}
#define MCA_PML_BFO_SECOND_ERROR_CHECK_ON_RDMA_READ_COMPLETION(recvreq, status, btl) \
/* See if the request has received a RNDVRESTARTNOTIFY */ \
if( OPAL_UNLIKELY(recvreq->req_errstate)) { \
if (recvreq->req_errstate & RECVREQ_RNDVRESTART_RECVED) { \
opal_output_verbose(30, mca_pml_bfo_output, \
"RDMA read: completion: recvreq has error, outstanding events=%d " \
"PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, status=%d, peer=%d", \
recvreq->req_events, recvreq->req_msgseq, recvreq->req_restartseq, \
(unsigned long)recvreq->remote_req_send.pval, \
(unsigned long)recvreq, status, \
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); \
if (0 == recvreq->req_events) { \
mca_pml_bfo_recv_request_rndvrestartack(recvreq, MCA_PML_BFO_HDR_TYPE_RGET, \
status, btl); \
} \
} \
MCA_PML_BFO_RDMA_FRAG_RETURN(frag); \
return; \
}
/**
* Macros for pml_bfo_sendreq.c file.
*/
/* This macro is called on the sending side after receiving
* a PUT message. There is a chance that this PUT message
* has shown up and is attempting to modify req_state, but
* req_state is no longer being tracked because the RNDV message
* turned into an RGET message after an error on the RNDV
* completion.
*/
#define MCA_PML_BFO_VERIFY_SENDREQ_REQ_STATE_VALUE(sendreq) \
if (sendreq->req_state == -1) { \
OPAL_THREAD_ADD_FETCH32(&sendreq->req_state, 1); \
}
/* Now check the error state. This request can be in error if the
* RNDV message made it over, but the receiver got an error trying to
* send the ACK back and therefore sent a RECVERRNOTIFY message. In
* that case, we want to start the restart dance as the receiver has
* matched this message already. Only restart if there are no
* outstanding events on send request. */
#define MCA_PML_BFO_RNDV_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl, type, description) \
if( OPAL_UNLIKELY ((sendreq)->req_error)) { \
mca_pml_bfo_completion_sendreq_has_error(sendreq, status, \
btl, type, description); \
return; \
}
/**
* This macro is called within the frag completion function in two
* places. It is called to see if any errors occur prior to the
* completion event on the frag. It is then called a second time
* after the scheduling routine is called as the scheduling routine
* may have detected that a BTL that was cached on the request had
* been removed and therefore marked the request in error. In that
* case, the scheduling of fragments can no longer proceed properly,
* and if there are no outstanding events, the restart dance is initiated.
*/
#define MCA_PML_BFO_FRAG_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl, type, description) \
if( OPAL_UNLIKELY((sendreq)->req_error)) { \
mca_pml_bfo_completion_sendreq_has_error(sendreq, status, \
btl, type, description); \
return; \
}
/* This can happen if a FIN message arrives after the request was
* marked in error. So, just drop the message. Note that the status
* field is not checked here. That is because that is the value
* returned in the FIN hdr.hdr_fail field and may be used for other
* things. */
#define MCA_PML_BFO_RGET_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, btl, des) \
if( OPAL_UNLIKELY(sendreq->req_error)) { \
opal_output_verbose(30, mca_pml_bfo_output, \
"FIN: received on broken request, skipping, " \
"PML=%d, src_req=%lx, dst_req=%lx, peer=%d", \
(uint16_t)sendreq->req_send.req_base.req_sequence, \
(unsigned long)sendreq, (unsigned long)sendreq->req_recv.pval, \
sendreq->req_send.req_base.req_peer); \
btl->btl_free(btl, des); \
return; \
}
/* Check if there has been an error on the send request when we get
* a completion event on the RDMA write. */
#define MCA_PML_BFO_PUT_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl) \
if ( OPAL_UNLIKELY(sendreq->req_error)) { \
mca_pml_bfo_completion_sendreq_has_error(sendreq, status, btl, \
MCA_PML_BFO_HDR_TYPE_PUT, "RDMA write"); \
MCA_PML_BFO_RDMA_FRAG_RETURN(frag); \
return; \
}
#define MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, type) \
if (0 < sendreq->req_restartseq) { \
mca_pml_bfo_update_rndv_fields(hdr, sendreq, type); \
}
/* If a bml_btl gets mapped out, then we need to adjust it based
* on the btl from the callback function. These macros are called on
* every callback to make sure things are copacetic.
*/
#define MCA_PML_BFO_CHECK_EAGER_BML_BTL_ON_FIN_COMPLETION(bml_btl, btl, des) \
if (bml_btl->btl != btl) { \
ompi_proc_t *proc = (ompi_proc_t*) des->des_cbdata; \
mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; \
bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_eager, btl); \
}
#define MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, sendreq, type) \
if (bml_btl->btl != btl) { \
mca_pml_bfo_find_sendreq_eager_bml_btl(&bml_btl, btl, sendreq, type); \
}
#define MCA_PML_BFO_CHECK_SENDREQ_RDMA_BML_BTL(bml_btl, btl, sendreq, type) \
if (bml_btl->btl != btl) { \
mca_pml_bfo_find_sendreq_rdma_bml_btl(&bml_btl, btl, sendreq, type); \
}
#define MCA_PML_BFO_CHECK_RECVREQ_EAGER_BML_BTL(bml_btl, btl, recvreq, type) \
if (bml_btl->btl != btl) { \
mca_pml_bfo_find_recvreq_eager_bml_btl(&bml_btl, btl, recvreq, type); \
}
#define MCA_PML_BFO_CHECK_RECVREQ_RDMA_BML_BTL(bml_btl, btl, recvreq, type) \
if (bml_btl->btl != btl) { \
mca_pml_bfo_find_recvreq_rdma_bml_btl(&bml_btl, btl, recvreq, type); \
}
#define MCA_PML_BFO_CHECK_RECVREQ_EAGER_BML_BTL_RECV_CTL(bml_btl, btl, des) \
if (bml_btl->btl != btl) { \
mca_pml_bfo_update_eager_bml_btl_recv_ctl(&bml_btl, btl, des); \
}
#define MCA_PML_BFO_CHECK_FOR_REMOVED_BML(sendreq, frag, btl) \
if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) { \
opal_output_verbose(30, mca_pml_bfo_output, \
"PUT received: no matching BTL to RDMA write to, oustanding " \
"events=%d, PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", \
sendreq->req_events, \
(uint16_t)sendreq->req_send.req_base.req_sequence, \
sendreq->req_restartseq, (void *)sendreq, \
sendreq->req_recv.pval, sendreq->req_send.req_base.req_peer); \
MCA_PML_BFO_RDMA_FRAG_RETURN(frag); \
sendreq->req_error++; \
if (0 == sendreq->req_events) { \
mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, \
MCA_PML_BFO_HDR_TYPE_PUT, \
OMPI_ERROR, btl); \
} \
return; \
}
/* This macro checks to see if the cached number of BTLs in the
* send request still matches the value from the endpoint.
* If it does not, this means that a BTL was removed from the
* available list. In this case, start the request over.
*/
#define MCA_PML_BFO_CHECK_FOR_REMOVED_BTL(sendreq, range) \
if ((int)mca_bml_base_btl_array_get_size(&sendreq->req_endpoint->btl_send) \
!= range->range_btl_cnt) { \
sendreq->req_error++; \
return OMPI_ERROR; \
}
END_C_DECLS
#endif
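
As an illustration of how these error-check macros are meant to be dropped into completion callbacks, here is a hypothetical fragment-completion function (a sketch, not code from this tree; the convention that des->des_cbdata carries the send request is assumed):

static void bfo_frag_completion_sketch(mca_btl_base_module_t* btl,
                                       struct mca_btl_base_endpoint_t* ep,
                                       struct mca_btl_base_descriptor_t* des,
                                       int status)
{
    mca_pml_bfo_send_request_t* sendreq =
        (mca_pml_bfo_send_request_t*)des->des_cbdata;
    /* Bail out (and, inside the helper, possibly start the restart dance)
     * before doing any further protocol work on a request in error. */
    MCA_PML_BFO_FRAG_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl,
                                                    MCA_PML_BFO_HDR_TYPE_FRAG,
                                                    "frag completion sketch");
    /* normal completion processing continues only on the error-free path */
}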

View file

@ -1,539 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_PML_BFO_HEADER_H
#define MCA_PML_BFO_HEADER_H
#include "ompi_config.h"
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#include "opal/types.h"
#include "opal/util/arch.h"
#include "opal/mca/btl/btl.h"
#include "ompi/proc/proc.h"
#define MCA_PML_BFO_HDR_TYPE_MATCH (MCA_BTL_TAG_PML + 1)
#define MCA_PML_BFO_HDR_TYPE_RNDV (MCA_BTL_TAG_PML + 2)
#define MCA_PML_BFO_HDR_TYPE_RGET (MCA_BTL_TAG_PML + 3)
#define MCA_PML_BFO_HDR_TYPE_ACK (MCA_BTL_TAG_PML + 4)
#define MCA_PML_BFO_HDR_TYPE_NACK (MCA_BTL_TAG_PML + 5)
#define MCA_PML_BFO_HDR_TYPE_FRAG (MCA_BTL_TAG_PML + 6)
#define MCA_PML_BFO_HDR_TYPE_GET (MCA_BTL_TAG_PML + 7)
#define MCA_PML_BFO_HDR_TYPE_PUT (MCA_BTL_TAG_PML + 8)
#define MCA_PML_BFO_HDR_TYPE_FIN (MCA_BTL_TAG_PML + 9)
#if PML_BFO
#define MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY (MCA_BTL_TAG_PML + 10)
#define MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK (MCA_BTL_TAG_PML + 11)
#define MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK (MCA_BTL_TAG_PML + 12)
#define MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY (MCA_BTL_TAG_PML + 13)
#endif /* PML_BFO */
#define MCA_PML_BFO_HDR_FLAGS_ACK 1 /* is an ack required */
#define MCA_PML_BFO_HDR_FLAGS_NBO 2 /* is the hdr in network byte order */
#define MCA_PML_BFO_HDR_FLAGS_PIN 4 /* is user buffer pinned */
#define MCA_PML_BFO_HDR_FLAGS_CONTIG 8 /* is user buffer contiguous */
#define MCA_PML_BFO_HDR_FLAGS_NORDMA 16 /* rest will be send by copy-in-out */
#if PML_BFO
#define MCA_PML_BFO_HDR_FLAGS_RESTART 32 /* restart RNDV because of error */
#endif /* PML_BFO */
/**
* Common hdr attributes - must be first element in each hdr type
*/
struct mca_pml_bfo_common_hdr_t {
uint8_t hdr_type; /**< type of envelope */
uint8_t hdr_flags; /**< flags indicating how fragment should be processed */
};
typedef struct mca_pml_bfo_common_hdr_t mca_pml_bfo_common_hdr_t;
#define MCA_PML_BFO_COMMON_HDR_NTOH(h)
#define MCA_PML_BFO_COMMON_HDR_HTON(h)
/**
* Header definition for the first fragment, contains the
* attributes required to match the corresponding posted receive.
*/
struct mca_pml_bfo_match_hdr_t {
mca_pml_bfo_common_hdr_t hdr_common; /**< common attributes */
uint16_t hdr_ctx; /**< communicator index */
int32_t hdr_src; /**< source rank */
int32_t hdr_tag; /**< user tag */
uint16_t hdr_seq; /**< message sequence number */
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
uint8_t hdr_padding[2]; /**< explicitly pad to 16 bytes. Compilers seem to already prefer to do this, but make it explicit just in case */
#endif
};
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
#define OMPI_PML_BFO_MATCH_HDR_LEN 16
#else
#define OMPI_PML_BFO_MATCH_HDR_LEN 14
#endif
typedef struct mca_pml_bfo_match_hdr_t mca_pml_bfo_match_hdr_t;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_BFO_MATCH_HDR_FILL(h) \
do { \
(h).hdr_padding[0] = 0; \
(h).hdr_padding[1] = 0; \
} while(0)
#else
#define MCA_PML_BFO_MATCH_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
#define MCA_PML_BFO_MATCH_HDR_NTOH(h) \
do { \
MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common); \
(h).hdr_ctx = ntohs((h).hdr_ctx); \
(h).hdr_src = ntohl((h).hdr_src); \
(h).hdr_tag = ntohl((h).hdr_tag); \
(h).hdr_seq = ntohs((h).hdr_seq); \
} while (0)
#define MCA_PML_BFO_MATCH_HDR_HTON(h) \
do { \
MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common); \
MCA_PML_BFO_MATCH_HDR_FILL(h); \
(h).hdr_ctx = htons((h).hdr_ctx); \
(h).hdr_src = htonl((h).hdr_src); \
(h).hdr_tag = htonl((h).hdr_tag); \
(h).hdr_seq = htons((h).hdr_seq); \
} while (0)
/**
* Header definition for the first fragment when an acknowledgment
* is required. This could be the first fragment of a large message
* or a short message that requires an ack (synchronous).
*/
struct mca_pml_bfo_rendezvous_hdr_t {
mca_pml_bfo_match_hdr_t hdr_match;
uint64_t hdr_msg_length; /**< message length */
opal_ptr_t hdr_src_req; /**< pointer to source request - returned in ack */
#if PML_BFO
opal_ptr_t hdr_dst_req; /**< pointer to dst req */
uint8_t hdr_restartseq; /**< restart sequence */
#endif /* PML_BFO */
};
typedef struct mca_pml_bfo_rendezvous_hdr_t mca_pml_bfo_rendezvous_hdr_t;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_BFO_RNDV_HDR_FILL(h) \
MCA_PML_BFO_MATCH_HDR_FILL((h).hdr_match)
#else
#define MCA_PML_BFO_RNDV_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
/* Note that hdr_src_req is not put in network byte order because it
is never processed by the receiver, other than being copied into
the ack header */
#define MCA_PML_BFO_RNDV_HDR_NTOH(h) \
do { \
MCA_PML_BFO_MATCH_HDR_NTOH((h).hdr_match); \
(h).hdr_msg_length = ntoh64((h).hdr_msg_length); \
} while (0)
#define MCA_PML_BFO_RNDV_HDR_HTON(h) \
do { \
MCA_PML_BFO_MATCH_HDR_HTON((h).hdr_match); \
MCA_PML_BFO_RNDV_HDR_FILL(h); \
(h).hdr_msg_length = hton64((h).hdr_msg_length); \
} while (0)
/**
* Header definition for a combined rdma rendezvous/get
*/
struct mca_pml_bfo_rget_hdr_t {
mca_pml_bfo_rendezvous_hdr_t hdr_rndv;
uint32_t hdr_seg_cnt; /**< number of segments for rdma */
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
uint8_t hdr_padding[4];
#endif
opal_ptr_t hdr_des; /**< source descriptor */
};
typedef struct mca_pml_bfo_rget_hdr_t mca_pml_bfo_rget_hdr_t;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_BFO_RGET_HDR_FILL(h) \
do { \
MCA_PML_BFO_RNDV_HDR_FILL((h).hdr_rndv); \
(h).hdr_padding[0] = 0; \
(h).hdr_padding[1] = 0; \
(h).hdr_padding[2] = 0; \
(h).hdr_padding[3] = 0; \
} while(0)
#else
#define MCA_PML_BFO_RGET_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
#define MCA_PML_BFO_RGET_HDR_NTOH(h) \
do { \
MCA_PML_BFO_RNDV_HDR_NTOH((h).hdr_rndv); \
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
} while (0)
#define MCA_PML_BFO_RGET_HDR_HTON(h) \
do { \
MCA_PML_BFO_RNDV_HDR_HTON((h).hdr_rndv); \
MCA_PML_BFO_RGET_HDR_FILL(h); \
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
} while (0)
/**
* Header for subsequent fragments.
*/
struct mca_pml_bfo_frag_hdr_t {
mca_pml_bfo_common_hdr_t hdr_common; /**< common attributes */
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
uint8_t hdr_padding[6];
#endif
uint64_t hdr_frag_offset; /**< offset into message */
opal_ptr_t hdr_src_req; /**< pointer to source request */
opal_ptr_t hdr_dst_req; /**< pointer to matched receive */
};
typedef struct mca_pml_bfo_frag_hdr_t mca_pml_bfo_frag_hdr_t;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_BFO_FRAG_HDR_FILL(h) \
do { \
(h).hdr_padding[0] = 0; \
(h).hdr_padding[1] = 0; \
(h).hdr_padding[2] = 0; \
(h).hdr_padding[3] = 0; \
(h).hdr_padding[4] = 0; \
(h).hdr_padding[5] = 0; \
} while(0)
#else
#define MCA_PML_BFO_FRAG_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
#define MCA_PML_BFO_FRAG_HDR_NTOH(h) \
do { \
MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common); \
(h).hdr_frag_offset = ntoh64((h).hdr_frag_offset); \
} while (0)
#define MCA_PML_BFO_FRAG_HDR_HTON(h) \
do { \
MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common); \
MCA_PML_BFO_FRAG_HDR_FILL(h); \
(h).hdr_frag_offset = hton64((h).hdr_frag_offset); \
} while (0)
/**
* Header used to acknowledge outstanding fragment(s).
*/
struct mca_pml_bfo_ack_hdr_t {
mca_pml_bfo_common_hdr_t hdr_common; /**< common attributes */
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
uint8_t hdr_padding[6];
#endif
opal_ptr_t hdr_src_req; /**< source request */
opal_ptr_t hdr_dst_req; /**< matched receive request */
uint64_t hdr_send_offset; /**< starting point of copy in/out */
};
typedef struct mca_pml_bfo_ack_hdr_t mca_pml_bfo_ack_hdr_t;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_BFO_ACK_HDR_FILL(h) \
do { \
(h).hdr_padding[0] = 0; \
(h).hdr_padding[1] = 0; \
(h).hdr_padding[2] = 0; \
(h).hdr_padding[3] = 0; \
(h).hdr_padding[4] = 0; \
(h).hdr_padding[5] = 0; \
} while (0)
#else
#define MCA_PML_BFO_ACK_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
/* Note that the request headers are not put in NBO because the
src_req is already in receiver's byte order and the dst_req is not
used by the receiver for anything other than backpointers in return
headers */
#define MCA_PML_BFO_ACK_HDR_NTOH(h) \
do { \
MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common); \
(h).hdr_send_offset = ntoh64((h).hdr_send_offset); \
} while (0)
#define MCA_PML_BFO_ACK_HDR_HTON(h) \
do { \
MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common); \
MCA_PML_BFO_ACK_HDR_FILL(h); \
(h).hdr_send_offset = hton64((h).hdr_send_offset); \
} while (0)
/**
* Header used to initiate an RDMA operation.
*/
struct mca_pml_bfo_rdma_hdr_t {
mca_pml_bfo_common_hdr_t hdr_common; /**< common attributes */
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
uint8_t hdr_padding[2]; /**< two bytes of padding to bring the hdr to 4-byte alignment; hdr_req is then 8-byte aligned after the 4-byte hdr_seg_cnt */
#endif
uint32_t hdr_seg_cnt; /**< number of segments for rdma */
opal_ptr_t hdr_req; /**< destination request */
#if PML_BFO
opal_ptr_t hdr_dst_req; /**< pointer to destination request */
#endif /* PML_BFO */
opal_ptr_t hdr_des; /**< source descriptor */
uint64_t hdr_rdma_offset; /**< current offset into user buffer */
mca_btl_base_segment_t hdr_segs[1]; /**< list of segments for rdma */
};
typedef struct mca_pml_bfo_rdma_hdr_t mca_pml_bfo_rdma_hdr_t;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_BFO_RDMA_HDR_FILL(h) \
do { \
(h).hdr_padding[0] = 0; \
(h).hdr_padding[1] = 0; \
} while(0)
#else
#define MCA_PML_BFO_RDMA_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
#define MCA_PML_BFO_RDMA_HDR_NTOH(h) \
do { \
MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common); \
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
(h).hdr_rdma_offset = ntoh64((h).hdr_rdma_offset); \
} while (0)
#define MCA_PML_BFO_RDMA_HDR_HTON(h) \
do { \
MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common); \
MCA_PML_BFO_RDMA_HDR_FILL(h); \
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
(h).hdr_rdma_offset = hton64((h).hdr_rdma_offset); \
} while (0)
/**
* Header used to complete an RDMA operation.
*/
struct mca_pml_bfo_fin_hdr_t {
mca_pml_bfo_common_hdr_t hdr_common; /**< common attributes */
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
uint8_t hdr_padding[2];
#endif
#if PML_BFO
/* Match info is needed to check for duplicate FIN messages. */
mca_pml_bfo_match_hdr_t hdr_match;
#endif /* PML_BFO */
uint32_t hdr_fail; /**< RDMA operation failed */
opal_ptr_t hdr_des; /**< completed descriptor */
};
typedef struct mca_pml_bfo_fin_hdr_t mca_pml_bfo_fin_hdr_t;
#if PML_BFO
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_BFO_FIN_HDR_FILL(h) \
do { \
(h).hdr_padding[0] = 0; \
(h).hdr_padding[1] = 0; \
MCA_PML_BFO_MATCH_HDR_FILL((h).hdr_match); \
} while (0)
#else
#define MCA_PML_BFO_FIN_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
#define MCA_PML_BFO_FIN_HDR_NTOH(h) \
do { \
MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common); \
MCA_PML_BFO_MATCH_HDR_NTOH((h).hdr_match); \
} while (0)
#define MCA_PML_BFO_FIN_HDR_HTON(h) \
do { \
MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common); \
MCA_PML_BFO_MATCH_HDR_HTON((h).hdr_match); \
MCA_PML_BFO_FIN_HDR_FILL(h); \
} while (0)
#else /* PML_BFO */
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_BFO_FIN_HDR_FILL(h) \
do { \
(h).hdr_padding[0] = 0; \
(h).hdr_padding[1] = 0; \
} while (0)
#else
#define MCA_PML_BFO_FIN_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
#define MCA_PML_BFO_FIN_HDR_NTOH(h) \
do { \
MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common); \
} while (0)
#define MCA_PML_BFO_FIN_HDR_HTON(h) \
do { \
MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common); \
MCA_PML_BFO_FIN_HDR_FILL(h); \
} while (0)
#endif /* PML_BFO */
#if PML_BFO
/**
* Header used to restart a rendezvous request.
*/
struct mca_pml_bfo_restart_hdr_t {
mca_pml_bfo_match_hdr_t hdr_match; /**< needed to avoid duplicate messages */
uint8_t hdr_restartseq; /**< restart sequence */
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
uint8_t hdr_padding[3];
#endif
opal_ptr_t hdr_src_req; /**< source request */
opal_ptr_t hdr_dst_req; /**< matched receive request */
int32_t hdr_dst_rank; /**< needed to send NACK */
uint32_t hdr_jobid; /**< needed to send NACK */
uint32_t hdr_vpid; /**< needed to send NACK */
};
typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t;
/* Only need to put parts of the restart header in NBO. No need
to do hdr_src_req and hdr_dst_req as they are only used
by the process that originated them. */
#define MCA_PML_BFO_RESTART_HDR_NTOH(h) \
do { \
MCA_PML_BFO_MATCH_HDR_NTOH((h).hdr_match); \
(h).hdr_dst_rank = ntohl((h).hdr_dst_rank); \
(h).hdr_jobid = ntohl((h).hdr_jobid); \
(h).hdr_vpid = ntohl((h).hdr_vpid); \
} while (0)
#define MCA_PML_BFO_RESTART_HDR_HTON(h) \
do { \
MCA_PML_BFO_MATCH_HDR_HTON((h).hdr_match); \
(h).hdr_dst_rank = htonl((h).hdr_dst_rank); \
(h).hdr_jobid = htonl((h).hdr_jobid); \
(h).hdr_vpid = htonl((h).hdr_vpid); \
} while (0)
#endif /* PML_BFO */
/**
* Union of defined hdr types.
*/
union mca_pml_bfo_hdr_t {
mca_pml_bfo_common_hdr_t hdr_common;
mca_pml_bfo_match_hdr_t hdr_match;
mca_pml_bfo_rendezvous_hdr_t hdr_rndv;
mca_pml_bfo_rget_hdr_t hdr_rget;
mca_pml_bfo_frag_hdr_t hdr_frag;
mca_pml_bfo_ack_hdr_t hdr_ack;
mca_pml_bfo_rdma_hdr_t hdr_rdma;
mca_pml_bfo_fin_hdr_t hdr_fin;
#if PML_BFO
mca_pml_bfo_restart_hdr_t hdr_restart;
#endif /* PML_BFO */
};
typedef union mca_pml_bfo_hdr_t mca_pml_bfo_hdr_t;
#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT
static inline __opal_attribute_always_inline__ void
bfo_hdr_ntoh(mca_pml_bfo_hdr_t *hdr, const uint8_t hdr_type)
{
if(!(hdr->hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_NBO))
return;
switch(hdr_type) {
case MCA_PML_BFO_HDR_TYPE_MATCH:
MCA_PML_BFO_MATCH_HDR_NTOH(hdr->hdr_match);
break;
case MCA_PML_BFO_HDR_TYPE_RNDV:
MCA_PML_BFO_RNDV_HDR_NTOH(hdr->hdr_rndv);
break;
case MCA_PML_BFO_HDR_TYPE_RGET:
MCA_PML_BFO_RGET_HDR_NTOH(hdr->hdr_rget);
break;
case MCA_PML_BFO_HDR_TYPE_ACK:
MCA_PML_BFO_ACK_HDR_NTOH(hdr->hdr_ack);
break;
case MCA_PML_BFO_HDR_TYPE_FRAG:
MCA_PML_BFO_FRAG_HDR_NTOH(hdr->hdr_frag);
break;
case MCA_PML_BFO_HDR_TYPE_PUT:
MCA_PML_BFO_RDMA_HDR_NTOH(hdr->hdr_rdma);
break;
case MCA_PML_BFO_HDR_TYPE_FIN:
MCA_PML_BFO_FIN_HDR_NTOH(hdr->hdr_fin);
break;
default:
assert(0);
break;
}
}
#else
#define bfo_hdr_ntoh(h, t) do{}while(0)
#endif
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
#define bfo_hdr_hton(h, t, p) \
bfo_hdr_hton_intr((mca_pml_bfo_hdr_t*)h, t, p)
static inline __opal_attribute_always_inline__ void
bfo_hdr_hton_intr(mca_pml_bfo_hdr_t *hdr, const uint8_t hdr_type,
const ompi_proc_t *proc)
{
#ifdef WORDS_BIGENDIAN
hdr->hdr_common.hdr_flags |= MCA_PML_BFO_HDR_FLAGS_NBO;
#else
if(!(proc->super.proc_arch & OPAL_ARCH_ISBIGENDIAN))
return;
hdr->hdr_common.hdr_flags |= MCA_PML_BFO_HDR_FLAGS_NBO;
switch(hdr_type) {
case MCA_PML_BFO_HDR_TYPE_MATCH:
MCA_PML_BFO_MATCH_HDR_HTON(hdr->hdr_match);
break;
case MCA_PML_BFO_HDR_TYPE_RNDV:
MCA_PML_BFO_RNDV_HDR_HTON(hdr->hdr_rndv);
break;
case MCA_PML_BFO_HDR_TYPE_RGET:
MCA_PML_BFO_RGET_HDR_HTON(hdr->hdr_rget);
break;
case MCA_PML_BFO_HDR_TYPE_ACK:
MCA_PML_BFO_ACK_HDR_HTON(hdr->hdr_ack);
break;
case MCA_PML_BFO_HDR_TYPE_FRAG:
MCA_PML_BFO_FRAG_HDR_HTON(hdr->hdr_frag);
break;
case MCA_PML_BFO_HDR_TYPE_PUT:
MCA_PML_BFO_RDMA_HDR_HTON(hdr->hdr_rdma);
break;
case MCA_PML_BFO_HDR_TYPE_FIN:
MCA_PML_BFO_FIN_HDR_HTON(hdr->hdr_fin);
break;
default:
assert(0);
break;
}
#endif
}
#else
#define bfo_hdr_hton(h, t, p) do{}while(0)
#endif
#endif
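
Putting the header types above together, a hypothetical receive-side dispatch (a sketch, not code from this tree; segments is assumed to be the mca_btl_base_segment_t* handed to a BTL callback) reads the common header, normalizes byte order, then branches on hdr_type:

mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;

bfo_hdr_ntoh(hdr, hdr->hdr_common.hdr_type);  /* no-op unless the NBO flag is set */
switch (hdr->hdr_common.hdr_type) {
case MCA_PML_BFO_HDR_TYPE_MATCH:
    /* short message: match against posted receives */
    break;
case MCA_PML_BFO_HDR_TYPE_RNDV:
    /* long message: begin the rendezvous protocol */
    break;
case MCA_PML_BFO_HDR_TYPE_FIN:
    /* RDMA completed; hdr_fail reports failover status */
    break;
default:
    break;
}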

View file

@ -1,171 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "ompi/request/request.h"
#include "ompi/message/message.h"
#include "pml_bfo_recvreq.h"
int mca_pml_bfo_iprobe(int src,
int tag,
struct ompi_communicator_t *comm,
int *matched, ompi_status_public_t * status)
{
int rc = OMPI_SUCCESS;
mca_pml_bfo_recv_request_t recvreq;
OBJ_CONSTRUCT( &recvreq, mca_pml_bfo_recv_request_t );
recvreq.req_recv.req_base.req_ompi.req_type = OMPI_REQUEST_PML;
recvreq.req_recv.req_base.req_type = MCA_PML_REQUEST_IPROBE;
MCA_PML_BFO_RECV_REQUEST_INIT(&recvreq, NULL, 0, &ompi_mpi_char.dt, src, tag, comm, false);
MCA_PML_BFO_RECV_REQUEST_START(&recvreq);
if( recvreq.req_recv.req_base.req_ompi.req_complete == true ) {
if( NULL != status ) {
*status = recvreq.req_recv.req_base.req_ompi.req_status;
}
rc = recvreq.req_recv.req_base.req_ompi.req_status.MPI_ERROR;
*matched = 1;
} else {
*matched = 0;
opal_progress();
}
MCA_PML_BASE_RECV_REQUEST_FINI( &recvreq.req_recv );
return rc;
}
int mca_pml_bfo_probe(int src,
int tag,
struct ompi_communicator_t *comm,
ompi_status_public_t * status)
{
int rc = OMPI_SUCCESS;
mca_pml_bfo_recv_request_t recvreq;
OBJ_CONSTRUCT( &recvreq, mca_pml_bfo_recv_request_t );
recvreq.req_recv.req_base.req_ompi.req_type = OMPI_REQUEST_PML;
recvreq.req_recv.req_base.req_type = MCA_PML_REQUEST_PROBE;
MCA_PML_BFO_RECV_REQUEST_INIT(&recvreq, NULL, 0, &ompi_mpi_char.dt, src, tag, comm, false);
MCA_PML_BFO_RECV_REQUEST_START(&recvreq);
ompi_request_wait_completion(&recvreq.req_recv.req_base.req_ompi);
rc = recvreq.req_recv.req_base.req_ompi.req_status.MPI_ERROR;
if (NULL != status) {
*status = recvreq.req_recv.req_base.req_ompi.req_status;
}
MCA_PML_BASE_RECV_REQUEST_FINI( &recvreq.req_recv );
return rc;
}
int
mca_pml_bfo_improbe(int src,
int tag,
struct ompi_communicator_t *comm,
int *matched,
struct ompi_message_t **message,
ompi_status_public_t * status)
{
int rc = OMPI_SUCCESS;
mca_pml_bfo_recv_request_t *recvreq;
*message = ompi_message_alloc();
if (NULL == *message) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq);
if (NULL == recvreq)
return OMPI_ERR_OUT_OF_RESOURCE;
recvreq->req_recv.req_base.req_type = MCA_PML_REQUEST_IMPROBE;
/* initialize the request enough to probe and get the status */
MCA_PML_BFO_RECV_REQUEST_INIT(recvreq, NULL, 0, &ompi_mpi_char.dt,
src, tag, comm, false);
MCA_PML_BFO_RECV_REQUEST_START(recvreq);
if( recvreq->req_recv.req_base.req_ompi.req_complete == true ) {
if( NULL != status ) {
*status = recvreq->req_recv.req_base.req_ompi.req_status;
}
*matched = 1;
(*message)->comm = comm;
(*message)->req_ptr = recvreq;
(*message)->peer = recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE;
(*message)->count = recvreq->req_recv.req_base.req_ompi.req_status._ucount;
rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR;
} else {
*matched = 0;
/* we only free if we didn't match, because we're going to
translate the request into a receive request later on if it
was matched */
MCA_PML_BFO_RECV_REQUEST_RETURN( recvreq );
ompi_message_return(*message);
*message = MPI_MESSAGE_NULL;
opal_progress();
}
return rc;
}
int
mca_pml_bfo_mprobe(int src,
int tag,
struct ompi_communicator_t *comm,
struct ompi_message_t **message,
ompi_status_public_t * status)
{
int rc = OMPI_SUCCESS;
mca_pml_bfo_recv_request_t *recvreq;
*message = ompi_message_alloc();
if (NULL == *message) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq);
if (NULL == recvreq)
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
recvreq->req_recv.req_base.req_type = MCA_PML_REQUEST_MPROBE;
/* initialize the request enough to probe and get the status */
MCA_PML_BFO_RECV_REQUEST_INIT(recvreq, NULL, 0, &ompi_mpi_char.dt,
src, tag, comm, false);
MCA_PML_BFO_RECV_REQUEST_START(recvreq);
ompi_request_wait_completion(&recvreq->req_recv.req_base.req_ompi);
rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR;
if( NULL != status ) {
*status = recvreq->req_recv.req_base.req_ompi.req_status;
}
(*message)->comm = comm;
(*message)->req_ptr = recvreq;
(*message)->peer = recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE;
(*message)->count = recvreq->req_recv.req_base.req_ompi.req_status._ucount;
return rc;
}
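
For reference, the user-visible pattern that exercises the improbe/mprobe path above is the MPI-3 matched-probe API; a minimal sketch, assuming MPI is already initialized:

#include <mpi.h>
#include <stdlib.h>

/* Probe for any pending message and, if one matched, receive exactly that
 * message through the matched-message handle. */
static void drain_one(MPI_Comm comm)
{
    int flag, count;
    MPI_Message msg;
    MPI_Status st;

    MPI_Improbe(MPI_ANY_SOURCE, MPI_ANY_TAG, comm, &flag, &msg, &st);
    if (flag) {
        MPI_Get_count(&st, MPI_BYTE, &count);
        char* buf = malloc(count ? count : 1);
        MPI_Mrecv(buf, count, MPI_BYTE, &msg, &st);  /* consumes the match */
        free(buf);
    }
}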

View file

@ -1,308 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "ompi/request/request.h"
#include "pml_bfo_recvreq.h"
#include "pml_bfo_recvfrag.h"
#include "ompi/peruse/peruse-internal.h"
#include "ompi/message/message.h"
int mca_pml_bfo_irecv_init(void *addr,
size_t count,
ompi_datatype_t * datatype,
int src,
int tag,
struct ompi_communicator_t *comm,
struct ompi_request_t **request)
{
mca_pml_bfo_recv_request_t *recvreq;
MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq);
if (NULL == recvreq)
return OMPI_ERR_OUT_OF_RESOURCE;
MCA_PML_BFO_RECV_REQUEST_INIT(recvreq,
addr,
count, datatype, src, tag, comm, true);
PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
&((recvreq)->req_recv.req_base),
PERUSE_RECV);
*request = (ompi_request_t *) recvreq;
return OMPI_SUCCESS;
}
int mca_pml_bfo_irecv(void *addr,
size_t count,
ompi_datatype_t * datatype,
int src,
int tag,
struct ompi_communicator_t *comm,
struct ompi_request_t **request)
{
mca_pml_bfo_recv_request_t *recvreq;
MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq);
if (NULL == recvreq)
return OMPI_ERR_OUT_OF_RESOURCE;
MCA_PML_BFO_RECV_REQUEST_INIT(recvreq,
addr,
count, datatype, src, tag, comm, false);
PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
&((recvreq)->req_recv.req_base),
PERUSE_RECV);
MCA_PML_BFO_RECV_REQUEST_START(recvreq);
*request = (ompi_request_t *) recvreq;
return OMPI_SUCCESS;
}
int mca_pml_bfo_recv(void *addr,
size_t count,
ompi_datatype_t * datatype,
int src,
int tag,
struct ompi_communicator_t *comm,
ompi_status_public_t * status)
{
int rc;
mca_pml_bfo_recv_request_t *recvreq;
MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq);
if (NULL == recvreq)
return OMPI_ERR_OUT_OF_RESOURCE;
MCA_PML_BFO_RECV_REQUEST_INIT(recvreq,
addr,
count, datatype, src, tag, comm, false);
PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
&((recvreq)->req_recv.req_base),
PERUSE_RECV);
MCA_PML_BFO_RECV_REQUEST_START(recvreq);
ompi_request_wait_completion(&recvreq->req_recv.req_base.req_ompi);
if (NULL != status) { /* return status */
*status = recvreq->req_recv.req_base.req_ompi.req_status;
}
rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR;
ompi_request_free( (ompi_request_t**)&recvreq );
return rc;
}
int
mca_pml_bfo_imrecv( void *buf,
size_t count,
ompi_datatype_t *datatype,
struct ompi_message_t **message,
struct ompi_request_t **request )
{
mca_pml_bfo_recv_frag_t* frag;
mca_pml_bfo_recv_request_t *recvreq;
mca_pml_bfo_hdr_t *hdr;
int src, tag;
ompi_communicator_t *comm;
mca_pml_bfo_comm_proc_t* proc;
mca_pml_bfo_comm_t* bfo_comm;
uint64_t seq;
/* get the request from the message and the frag from the request
before we overwrite everything */
recvreq = (mca_pml_bfo_recv_request_t*) (*message)->req_ptr;
frag = (mca_pml_bfo_recv_frag_t*) recvreq->req_recv.req_base.req_addr;
src = recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE;
tag = recvreq->req_recv.req_base.req_ompi.req_status.MPI_TAG;
comm = (*message)->comm;
bfo_comm = recvreq->req_recv.req_base.req_comm->c_pml_comm;
seq = recvreq->req_recv.req_base.req_sequence;
/* make the request a recv request again */
/* The old request kept pointers to comm and the char datatype.
We're about to release those, but need to make sure comm
doesn't go out of scope (we don't care about the char datatype
anymore). So retain comm, then release the request, then reinit
the request (which will retain comm), then release comm (but the
request still holds its ref, so comm stays in scope). Make
sense? */
OBJ_RETAIN(comm);
MCA_PML_BASE_RECV_REQUEST_FINI(&recvreq->req_recv);
recvreq->req_recv.req_base.req_type = MCA_PML_REQUEST_RECV;
MCA_PML_BFO_RECV_REQUEST_INIT(recvreq,
buf,
count, datatype,
src, tag, comm, false);
OBJ_RELEASE(comm);
PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
&((recvreq)->req_recv.req_base),
PERUSE_RECV);
/* init/re-init the request */
recvreq->req_lock = 0;
recvreq->req_pipeline_depth = 0;
recvreq->req_bytes_received = 0;
/* What about req_rdma_cnt ? */
recvreq->req_rdma_idx = 0;
recvreq->req_pending = false;
recvreq->req_ack_sent = false;
MCA_PML_BASE_RECV_START(&recvreq->req_recv.req_base);
/* Note - sequence number already assigned */
recvreq->req_recv.req_base.req_sequence = seq;
proc = &bfo_comm->procs[recvreq->req_recv.req_base.req_peer];
recvreq->req_recv.req_base.req_proc = proc->ompi_proc;
prepare_recv_req_converter(recvreq);
/* we can't go through the match, since we already have the match.
Cheat and do what REQUEST_START does, but without the frag
search */
hdr = (mca_pml_bfo_hdr_t*)frag->segments->seg_addr.pval;
switch(hdr->hdr_common.hdr_type) {
case MCA_PML_BFO_HDR_TYPE_MATCH:
mca_pml_bfo_recv_request_progress_match(recvreq, frag->btl, frag->segments,
frag->num_segments);
break;
case MCA_PML_BFO_HDR_TYPE_RNDV:
mca_pml_bfo_recv_request_progress_rndv(recvreq, frag->btl, frag->segments,
frag->num_segments);
break;
case MCA_PML_BFO_HDR_TYPE_RGET:
mca_pml_bfo_recv_request_progress_rget(recvreq, frag->btl, frag->segments,
frag->num_segments);
break;
default:
assert(0);
}
MCA_PML_BFO_RECV_FRAG_RETURN(frag);
ompi_message_return(*message);
*message = MPI_MESSAGE_NULL;
*request = (ompi_request_t *) recvreq;
return OMPI_SUCCESS;
}
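/*
 * Blocking matched receive (MPI_Mrecv). A minimal sketch of the MPI-level
 * calls that reach this path (illustrative only):
 *
 *   MPI_Message msg; MPI_Status st;
 *   MPI_Mprobe(src, tag, comm, &msg, &st);    // match happens here
 *   MPI_Mrecv(buf, count, dtype, &msg, &st);  // serviced by this routine
 *
 * As in imrecv, the request is recycled and the saved fragment replayed;
 * the main difference is the wait for completion before returning.
 */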
int
mca_pml_bfo_mrecv( void *buf,
size_t count,
ompi_datatype_t *datatype,
struct ompi_message_t **message,
ompi_status_public_t* status )
{
mca_pml_bfo_recv_frag_t* frag;
mca_pml_bfo_recv_request_t *recvreq;
mca_pml_bfo_hdr_t *hdr;
int src, tag, rc;
ompi_communicator_t *comm;
mca_pml_bfo_comm_proc_t* proc;
mca_pml_bfo_comm_t* bfo_comm;
uint64_t seq;
/* get the request from the message and the frag from the request
before we overwrite everything */
comm = (*message)->comm;
recvreq = (mca_pml_bfo_recv_request_t*) (*message)->req_ptr;
frag = (mca_pml_bfo_recv_frag_t*) recvreq->req_recv.req_base.req_addr;
src = recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE;
tag = recvreq->req_recv.req_base.req_ompi.req_status.MPI_TAG;
seq = recvreq->req_recv.req_base.req_sequence;
bfo_comm = recvreq->req_recv.req_base.req_comm->c_pml_comm;
/* make the request a recv request again */
/* The old request kept pointers to comm and the char datatype.
We're about to release those, but need to make sure comm
doesn't go out of scope (we don't care about the char datatype
anymore). So retain comm, then release the frag, then reinit
the frag (which will retain comm), then release comm (but the
frag still has its ref, so it'll stay in scope). Make
sense? */
OBJ_RETAIN(comm);
MCA_PML_BASE_RECV_REQUEST_FINI(&recvreq->req_recv);
recvreq->req_recv.req_base.req_type = MCA_PML_REQUEST_RECV;
MCA_PML_BFO_RECV_REQUEST_INIT(recvreq,
buf,
count, datatype,
src, tag, comm, false);
OBJ_RELEASE(comm);
PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
&((recvreq)->req_recv.req_base),
PERUSE_RECV);
/* init/re-init the request */
recvreq->req_lock = 0;
recvreq->req_pipeline_depth = 0;
recvreq->req_bytes_received = 0;
recvreq->req_rdma_cnt = 0;
recvreq->req_rdma_idx = 0;
recvreq->req_pending = false;
MCA_PML_BASE_RECV_START(&recvreq->req_recv.req_base);
/* Note - sequence number already assigned */
recvreq->req_recv.req_base.req_sequence = seq;
proc = &bfo_comm->procs[recvreq->req_recv.req_base.req_peer];
recvreq->req_recv.req_base.req_proc = proc->ompi_proc;
prepare_recv_req_converter(recvreq);
/* we can't go through the match, since we already have the match.
Cheat and do what REQUEST_START does, but without the frag
search */
hdr = (mca_pml_bfo_hdr_t*)frag->segments->seg_addr.pval;
switch(hdr->hdr_common.hdr_type) {
case MCA_PML_BFO_HDR_TYPE_MATCH:
mca_pml_bfo_recv_request_progress_match(recvreq, frag->btl, frag->segments,
frag->num_segments);
break;
case MCA_PML_BFO_HDR_TYPE_RNDV:
mca_pml_bfo_recv_request_progress_rndv(recvreq, frag->btl, frag->segments,
frag->num_segments);
break;
case MCA_PML_BFO_HDR_TYPE_RGET:
mca_pml_bfo_recv_request_progress_rget(recvreq, frag->btl, frag->segments,
frag->num_segments);
break;
default:
assert(0);
}
ompi_message_return(*message);
*message = MPI_MESSAGE_NULL;
ompi_request_wait_completion(&(recvreq->req_recv.req_base.req_ompi));
MCA_PML_BFO_RECV_FRAG_RETURN(frag);
if (NULL != status) { /* return status */
*status = recvreq->req_recv.req_base.req_ompi.req_status;
}
rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR;
ompi_request_free( (ompi_request_t**)&recvreq );
return rc;
}

View file

@@ -1,129 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "pml_bfo.h"
#include "pml_bfo_sendreq.h"
#include "pml_bfo_recvreq.h"
#include "ompi/peruse/peruse-internal.h"
int mca_pml_bfo_isend_init(void *buf,
size_t count,
ompi_datatype_t * datatype,
int dst,
int tag,
mca_pml_base_send_mode_t sendmode,
ompi_communicator_t * comm,
ompi_request_t ** request)
{
mca_pml_bfo_send_request_t *sendreq = NULL;
MCA_PML_BFO_SEND_REQUEST_ALLOC(comm, dst, sendreq);
if (NULL == sendreq)
return OMPI_ERR_OUT_OF_RESOURCE;
MCA_PML_BFO_SEND_REQUEST_INIT(sendreq,
buf,
count,
datatype,
dst, tag,
comm, sendmode, true);
PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
&(sendreq)->req_send.req_base,
PERUSE_SEND);
*request = (ompi_request_t *) sendreq;
return OMPI_SUCCESS;
}
int mca_pml_bfo_isend(void *buf,
size_t count,
ompi_datatype_t * datatype,
int dst,
int tag,
mca_pml_base_send_mode_t sendmode,
ompi_communicator_t * comm,
ompi_request_t ** request)
{
int rc;
mca_pml_bfo_send_request_t *sendreq = NULL;
MCA_PML_BFO_SEND_REQUEST_ALLOC(comm, dst, sendreq);
if (NULL == sendreq)
return OMPI_ERR_OUT_OF_RESOURCE;
MCA_PML_BFO_SEND_REQUEST_INIT(sendreq,
buf,
count,
datatype,
dst, tag,
comm, sendmode, false);
PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
&(sendreq)->req_send.req_base,
PERUSE_SEND);
MCA_PML_BFO_SEND_REQUEST_START(sendreq, rc);
*request = (ompi_request_t *) sendreq;
return rc;
}
int mca_pml_bfo_send(void *buf,
size_t count,
ompi_datatype_t * datatype,
int dst,
int tag,
mca_pml_base_send_mode_t sendmode,
ompi_communicator_t * comm)
{
int rc;
mca_pml_bfo_send_request_t *sendreq;
MCA_PML_BFO_SEND_REQUEST_ALLOC(comm, dst, sendreq);
if (NULL == sendreq)
return OMPI_ERR_OUT_OF_RESOURCE;
MCA_PML_BFO_SEND_REQUEST_INIT(sendreq,
buf,
count,
datatype,
dst, tag,
comm, sendmode, false);
PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
&(sendreq)->req_send.req_base,
PERUSE_SEND);
MCA_PML_BFO_SEND_REQUEST_START(sendreq, rc);
if (rc != OMPI_SUCCESS) {
MCA_PML_BFO_SEND_REQUEST_RETURN( sendreq );
return rc;
}
ompi_request_wait_completion(&sendreq->req_send.req_base.req_ompi);
rc = sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR;
ompi_request_free( (ompi_request_t**)&sendreq );
return rc;
}

View file

@@ -1,78 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "pml_bfo.h"
#include "pml_bfo_sendreq.h"
#include "ompi/mca/bml/base/base.h"
int mca_pml_bfo_progress(void)
{
int i, queue_length = opal_list_get_size(&mca_pml_bfo.send_pending);
int j, completed_requests = 0;
bool send_succeeded;
if( OPAL_LIKELY(0 == queue_length) )
return 0;
for( i = 0; i < queue_length; i++ ) {
mca_pml_bfo_send_pending_t pending_type = MCA_PML_BFO_SEND_PENDING_NONE;
mca_pml_bfo_send_request_t* sendreq;
mca_bml_base_endpoint_t* endpoint;
sendreq = get_request_from_send_pending(&pending_type);
if(OPAL_UNLIKELY(NULL == sendreq))
break;
switch(pending_type) {
case MCA_PML_BFO_SEND_PENDING_NONE:
assert(0);
return 0;
case MCA_PML_BFO_SEND_PENDING_SCHEDULE:
if( mca_pml_bfo_send_request_schedule_exclusive(sendreq) ==
OMPI_ERR_OUT_OF_RESOURCE ) {
return 0;
}
completed_requests++;
break;
case MCA_PML_BFO_SEND_PENDING_START:
endpoint = sendreq->req_endpoint;
send_succeeded = false;
for(j = 0; j < (int)mca_bml_base_btl_array_get_size(&endpoint->btl_eager); j++) {
mca_bml_base_btl_t* bml_btl;
int rc;
/* select a btl */
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl);
if( OPAL_LIKELY(OMPI_SUCCESS == rc) ) {
send_succeeded = true;
completed_requests++;
break;
}
}
if( false == send_succeeded ) {
add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true);
}
}
}
return completed_requests;
}

View file

@@ -1,118 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/bml/bml.h"
#include "opal/mca/mpool/mpool.h"
#include "pml_bfo.h"
#include "pml_bfo_rdma.h"
/* Use this registration, instead of NULL, when no registration is needed
* for a BTL. This helps other code distinguish the case where memory is not
* registered from the case where registration is not needed */
static mca_mpool_base_registration_t pml_bfo_dummy_reg;
/*
* Check to see if memory is registered or can be registered. Build a
* set of registrations on the request.
*/
size_t mca_pml_bfo_rdma_btls(
mca_bml_base_endpoint_t* bml_endpoint,
unsigned char* base,
size_t size,
mca_pml_bfo_com_btl_t* rdma_btls)
{
int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
double weight_total = 0;
int num_btls_used = 0, n;
/* shortcut when there are no rdma capable btls */
if(num_btls == 0) {
return 0;
}
/* check to see if memory is registered */
for(n = 0; n < num_btls && num_btls_used < mca_pml_bfo.max_rdma_per_request;
n++) {
mca_bml_base_btl_t* bml_btl =
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma,
(bml_endpoint->btl_rdma_index + n) % num_btls);
mca_mpool_base_registration_t* reg = &pml_bfo_dummy_reg;
mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;
if( NULL != btl_mpool ) {
if(!mca_pml_bfo.leave_pinned) {
/* look through existing registrations */
btl_mpool->mpool_find(btl_mpool, base, size, &reg);
} else {
/* register the memory */
btl_mpool->mpool_register(btl_mpool, base, size, 0, &reg);
}
if(NULL == reg)
continue;
}
rdma_btls[num_btls_used].bml_btl = bml_btl;
rdma_btls[num_btls_used].btl_reg = reg;
weight_total += bml_btl->btl_weight;
num_btls_used++;
}
/* if we don't use leave_pinned and all BTLs that already have this memory
* registered amount to less than half of the available bandwidth - fall back to
* pipeline protocol */
if(0 == num_btls_used || (!mca_pml_bfo.leave_pinned && weight_total < 0.5))
return 0;
mca_pml_bfo_calc_weighted_length(rdma_btls, num_btls_used, size,
weight_total);
bml_endpoint->btl_rdma_index = (bml_endpoint->btl_rdma_index + 1) % num_btls;
return num_btls_used;
}
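/*
 * Select up to max_rdma_per_request RDMA-capable BTLs for the pipeline
 * protocol and split `size` bytes across them in proportion to each BTL's
 * weight (its share of the aggregate bandwidth).
 */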
size_t mca_pml_bfo_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint,
size_t size,
mca_pml_bfo_com_btl_t* rdma_btls )
{
int i, num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
double weight_total = 0;
for(i = 0; i < num_btls && i < mca_pml_bfo.max_rdma_per_request; i++) {
rdma_btls[i].bml_btl =
mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
if(NULL != rdma_btls[i].bml_btl->btl->btl_mpool)
rdma_btls[i].btl_reg = NULL;
else
rdma_btls[i].btl_reg = &pml_bfo_dummy_reg;
weight_total += rdma_btls[i].bml_btl->btl_weight;
}
mca_pml_bfo_calc_weighted_length(rdma_btls, i, size, weight_total);
return i;
}

View file

@@ -1,42 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_PML_BFO_RDMA_H
#define MCA_PML_BFO_RDMA_H
struct mca_bml_base_endpoint_t;
/*
* Of the set of available btls that support RDMA,
* find those that already have registrations - or
* register if required (for leave_pinned option)
*/
size_t mca_pml_bfo_rdma_btls(struct mca_bml_base_endpoint_t* endpoint,
unsigned char* base, size_t size, struct mca_pml_bfo_com_btl_t* btls);
/* Choose RDMA BTLs to use for sending of a request by pipeline protocol.
* Calculate number of bytes to send through each BTL according to available
* bandwidth */
size_t mca_pml_bfo_rdma_pipeline_btls(struct mca_bml_base_endpoint_t* endpoint,
size_t size, mca_pml_bfo_com_btl_t* rdma_btls);
#endif

View file

@@ -1,30 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "pml_bfo.h"
#include "pml_bfo_rdmafrag.h"
OBJ_CLASS_INSTANCE(
mca_pml_bfo_rdma_frag_t,
ompi_free_list_item_t,
NULL,
NULL);

View file

@@ -1,75 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_PML_BFO_RDMAFRAG_H
#define MCA_PML_BFO_RDMAFRAG_H
#include "pml_bfo_hdr.h"
#include "opal/mca/mpool/base/base.h"
BEGIN_C_DECLS
typedef enum {
MCA_PML_BFO_RDMA_PUT,
MCA_PML_BFO_RDMA_GET
} mca_pml_bfo_rdma_state_t;
struct mca_pml_bfo_rdma_frag_t {
opal_free_list_item_t super;
mca_bml_base_btl_t* rdma_bml;
#if PML_BFO
mca_btl_base_module_t* rdma_btl;
#endif /* PML_BFO */
mca_pml_bfo_hdr_t rdma_hdr;
mca_pml_bfo_rdma_state_t rdma_state;
size_t rdma_length;
uint8_t rdma_segs[MCA_BTL_SEG_MAX_SIZE * MCA_BTL_DES_MAX_SEGMENTS];
void *rdma_req;
struct mca_bml_base_endpoint_t* rdma_ep;
opal_convertor_t convertor;
struct mca_mpool_base_registration_t* reg;
uint32_t retries;
};
typedef struct mca_pml_bfo_rdma_frag_t mca_pml_bfo_rdma_frag_t;
OBJ_CLASS_DECLARATION(mca_pml_bfo_rdma_frag_t);
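/* Allocation is from a free list and may wait (the _WAIT_MT variant blocks
 * until an item becomes available); RETURN simply hands the fragment back. */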
#define MCA_PML_BFO_RDMA_FRAG_ALLOC(frag) \
do { \
opal_free_list_item_t* item; \
OPAL_FREE_LIST_WAIT_MT(&mca_pml_bfo.rdma_frags, item); \
frag = (mca_pml_bfo_rdma_frag_t*)item; \
} while(0)
#define MCA_PML_BFO_RDMA_FRAG_RETURN(frag) \
do { \
/* return fragment */ \
OPAL_FREE_LIST_RETURN_MT(&mca_pml_bfo.rdma_frags, \
(opal_free_list_item_t*)frag); \
} while(0)
END_C_DECLS
#endif

View file

@@ -1,743 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2006-2008 University of Houston. All rights reserved.
* Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#include "ompi_config.h"
#include "opal/class/opal_list.h"
#include "opal/threads/mutex.h"
#include "opal/prefetch.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/peruse/peruse-internal.h"
#include "ompi/memchecker.h"
#include "pml_bfo.h"
#include "pml_bfo_comm.h"
#include "pml_bfo_recvfrag.h"
#include "pml_bfo_recvreq.h"
#include "pml_bfo_sendreq.h"
#include "pml_bfo_hdr.h"
#if PML_BFO
#include "pml_bfo_failover.h"
#endif /* PML_BFO */
OBJ_CLASS_INSTANCE( mca_pml_bfo_buffer_t,
ompi_free_list_item_t,
NULL,
NULL );
OBJ_CLASS_INSTANCE( mca_pml_bfo_recv_frag_t,
opal_list_item_t,
NULL,
NULL );
/**
* Static functions.
*/
/**
* Append an unexpected descriptor to a queue. This function will allocate and
* initialize the fragment (if necessary) and then will add it to the specified
* queue. The allocated fragment is not returned to the caller.
*/
static void
append_frag_to_list(opal_list_t *queue, mca_btl_base_module_t *btl,
mca_pml_bfo_match_hdr_t *hdr, mca_btl_base_segment_t* segments,
size_t num_segments, mca_pml_bfo_recv_frag_t* frag)
{
if(NULL == frag) {
MCA_PML_BFO_RECV_FRAG_ALLOC(frag);
MCA_PML_BFO_RECV_FRAG_INIT(frag, hdr, segments, num_segments, btl);
}
opal_list_append(queue, (opal_list_item_t*)frag);
}
/**
* Match incoming recv_frags against posted receives.
* Supports out of order delivery.
*
* @param frag_header (IN) Header of received recv_frag.
* @param frag_desc (IN) Received recv_frag descriptor.
* @param match_made (OUT) Flag indicating whether a match was made.
* @param additional_matches (OUT) List of additional matches
* @return OMPI_SUCCESS or error status on failure.
*/
static int mca_pml_bfo_recv_frag_match( mca_btl_base_module_t *btl,
mca_pml_bfo_match_hdr_t *hdr,
mca_btl_base_segment_t* segments,
size_t num_segments,
int type);
static mca_pml_bfo_recv_request_t*
match_one(mca_btl_base_module_t *btl,
mca_pml_bfo_match_hdr_t *hdr, mca_btl_base_segment_t* segments,
size_t num_segments, ompi_communicator_t *comm_ptr,
mca_pml_bfo_comm_proc_t *proc,
mca_pml_bfo_recv_frag_t* frag);
void mca_pml_bfo_recv_frag_callback_match(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata )
{
mca_btl_base_segment_t* segments = des->des_local;
mca_pml_bfo_match_hdr_t* hdr = (mca_pml_bfo_match_hdr_t*)segments->seg_addr.pval;
ompi_communicator_t *comm_ptr;
mca_pml_bfo_recv_request_t *match = NULL;
mca_pml_bfo_comm_t *comm;
mca_pml_bfo_comm_proc_t *proc;
size_t num_segments = des->des_local_count;
size_t bytes_received = 0;
assert(num_segments <= MCA_BTL_DES_MAX_SEGMENTS);
if( OPAL_UNLIKELY(segments->seg_len < OMPI_PML_BFO_MATCH_HDR_LEN) ) {
return;
}
bfo_hdr_ntoh(((mca_pml_bfo_hdr_t*) hdr), MCA_PML_BFO_HDR_TYPE_MATCH);
/* communicator pointer */
comm_ptr = ompi_comm_lookup(hdr->hdr_ctx);
if(OPAL_UNLIKELY(NULL == comm_ptr)) {
/* This is a special case. A message for a not yet existing
* communicator can happen. Instead of doing a matching we
* will temporarily add it to a pending queue in the PML.
* Later on, when the communicator is completely instantiated,
* this pending queue will be searched and all matching fragments
* moved to the right communicator.
*/
append_frag_to_list( &mca_pml_bfo.non_existing_communicator_pending,
btl, hdr, segments, num_segments, NULL );
return;
}
comm = (mca_pml_bfo_comm_t *)comm_ptr->c_pml_comm;
/* source sequence number */
proc = &comm->procs[hdr->hdr_src];
/* We generate the MSG_ARRIVED event as soon as the PML is aware
* of a matching fragment arrival, independent of whether it is
* received in the correct order or not. This will allow the tools to
* figure out if the messages are not received in the correct
* order (if multiple network interfaces).
*/
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm_ptr,
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
/* get next expected message sequence number - if threaded
* run, lock to make sure that if another thread is processing
* a frag from the same message a match is made only once.
* Also, this prevents other posted receives (for a pair of
* end points) from being processed, and potentially "losing"
* the fragment.
*/
OPAL_THREAD_LOCK(&comm->matching_lock);
/* get sequence number of next message that can be processed */
if(OPAL_UNLIKELY((((uint16_t) hdr->hdr_seq) != ((uint16_t) proc->expected_sequence)) ||
(opal_list_get_size(&proc->frags_cant_match) > 0 ))) {
goto slow_path;
}
/* This is the sequence number we were expecting, so we can try
* matching it to already posted receives.
*/
/* We're now expecting the next sequence number. */
proc->expected_sequence++;
/* We generate the SEARCH_POSTED_QUEUE only when the message is
* received in the correct sequence. Otherwise, we delay the event
* generation until we reach the correct sequence number.
*/
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_BEGIN, comm_ptr,
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
match = match_one(btl, hdr, segments, num_segments, comm_ptr, proc, NULL);
/* The match is over. We generate the SEARCH_POSTED_Q_END here,
* before going into the mca_pml_bfo_check_cantmatch_for_match so
* we can make a difference for the searching time for all
* messages.
*/
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_END, comm_ptr,
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
/* release matching lock before processing fragment */
OPAL_THREAD_UNLOCK(&comm->matching_lock);
if(OPAL_LIKELY(match)) {
bytes_received = segments->seg_len - OMPI_PML_BFO_MATCH_HDR_LEN;
match->req_recv.req_bytes_packed = bytes_received;
MCA_PML_BFO_RECV_REQUEST_MATCHED(match, hdr);
if(match->req_bytes_expected > 0) {
struct iovec iov[MCA_BTL_DES_MAX_SEGMENTS];
uint32_t iov_count = 1;
/*
* Make the user buffer accessible (defined) before unpacking.
*/
MEMCHECKER(
memchecker_call(&opal_memchecker_base_mem_defined,
match->req_recv.req_base.req_addr,
match->req_recv.req_base.req_count,
match->req_recv.req_base.req_datatype);
);
iov[0].iov_len = bytes_received;
iov[0].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments->seg_addr.pval +
OMPI_PML_BFO_MATCH_HDR_LEN);
while (iov_count < num_segments) {
bytes_received += segments[iov_count].seg_len;
iov[iov_count].iov_len = segments[iov_count].seg_len;
iov[iov_count].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments[iov_count].seg_addr.pval);
iov_count++;
}
opal_convertor_unpack( &match->req_recv.req_base.req_convertor,
iov,
&iov_count,
&bytes_received );
match->req_bytes_received = bytes_received;
/*
* Unpacking finished, make the user buffer inaccessible again.
*/
MEMCHECKER(
memchecker_call(&opal_memchecker_base_mem_noaccess,
match->req_recv.req_base.req_addr,
match->req_recv.req_base.req_count,
match->req_recv.req_base.req_datatype);
);
}
/* no need to check if complete - we know we are */
/* don't need an rmb as that is for checking */
recv_request_pml_complete(match);
}
return;
slow_path:
OPAL_THREAD_UNLOCK(&comm->matching_lock);
#if PML_BFO
if (true == mca_pml_bfo_is_duplicate_msg(proc, hdr)) {
return;
}
#endif /* PML_BFO */
mca_pml_bfo_recv_frag_match(btl, hdr, segments,
num_segments, MCA_PML_BFO_HDR_TYPE_MATCH);
}
void mca_pml_bfo_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata )
{
mca_btl_base_segment_t* segments = des->des_local;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) {
return;
}
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RNDV);
mca_pml_bfo_recv_frag_match(btl, &hdr->hdr_match, segments,
des->des_local_count, MCA_PML_BFO_HDR_TYPE_RNDV);
return;
}
void mca_pml_bfo_recv_frag_callback_rget(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata )
{
mca_btl_base_segment_t* segments = des->des_local;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) {
return;
}
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RGET);
mca_pml_bfo_recv_frag_match(btl, &hdr->hdr_match, segments,
des->des_local_count, MCA_PML_BFO_HDR_TYPE_RGET);
return;
}
void mca_pml_bfo_recv_frag_callback_ack(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata )
{
mca_btl_base_segment_t* segments = des->des_local;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
mca_pml_bfo_send_request_t* sendreq;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) {
return;
}
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_ACK);
sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_ack.hdr_src_req.pval;
sendreq->req_recv = hdr->hdr_ack.hdr_dst_req;
#if PML_BFO
MCA_PML_BFO_ERROR_CHECK_ON_ACK_CALLBACK(sendreq);
#endif /* PML_BFO */
/* if the request should be delivered entirely by copy in/out
* then throttle sends */
if(hdr->hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_NORDMA)
sendreq->req_throttle_sends = true;
mca_pml_bfo_send_request_copy_in_out(sendreq,
hdr->hdr_ack.hdr_send_offset,
sendreq->req_send.req_bytes_packed -
hdr->hdr_ack.hdr_send_offset);
if (sendreq->req_state != 0) {
/* Typical receipt of an ACK message causes req_state to be
* decremented. However, a send request that started as an
* RGET request can become a RNDV. For example, when the
* receiver determines that its receive buffer is not
* contiguous and therefore cannot support the RGET
* protocol. A send request that started with the RGET
* protocol has req_state == 0 and as such should not be
* decremented.
*/
OPAL_THREAD_ADD_FETCH32(&sendreq->req_state, -1);
}
if(send_request_pml_complete_check(sendreq) == false)
mca_pml_bfo_send_request_schedule(sendreq);
return;
}
void mca_pml_bfo_recv_frag_callback_frag(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
mca_pml_bfo_recv_request_t* recvreq;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) {
return;
}
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_FRAG);
recvreq = (mca_pml_bfo_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
#if PML_BFO
MCA_PML_BFO_ERROR_CHECK_ON_FRAG_CALLBACK(recvreq);
#endif /* PML_BFO */
mca_pml_bfo_recv_request_progress_frag(recvreq,btl,segments,des->des_local_count);
return;
}
void mca_pml_bfo_recv_frag_callback_put(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
mca_pml_bfo_send_request_t* sendreq;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) {
return;
}
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_PUT);
sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_rdma.hdr_req.pval;
#if PML_BFO
MCA_PML_BFO_ERROR_CHECK_ON_PUT_CALLBACK(sendreq);
#endif /* PML_BFO */
mca_pml_bfo_send_request_put(sendreq,btl,&hdr->hdr_rdma);
return;
}
void mca_pml_bfo_recv_frag_callback_fin(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
mca_btl_base_descriptor_t* rdma;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) {
return;
}
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_FIN);
rdma = (mca_btl_base_descriptor_t*)hdr->hdr_fin.hdr_des.pval;
#if PML_BFO
if (true == mca_pml_bfo_is_duplicate_fin(hdr, rdma, btl)) {
return;
}
#endif /* PML_BFO */
rdma->des_cbfunc(btl, NULL, rdma,
hdr->hdr_fin.hdr_fail ? OMPI_ERROR : OMPI_SUCCESS);
return;
}
#define PML_MAX_SEQ (~(mca_pml_sequence_t)0)
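/*
 * Matching helpers. Posted receives live on two queues kept in posting
 * order: the communicator-wide wild_receives (MPI_ANY_SOURCE) and the
 * per-peer specific_receives. match_incoming below walks both queues in
 * parallel, always advancing the one with the older sequence number, so a
 * wild and a specific receive are honored in the order they were posted.
 * PML_MAX_SEQ serves as the "no more posted receives" sentinel.
 */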
static inline mca_pml_bfo_recv_request_t* get_posted_recv(opal_list_t *queue)
{
if(opal_list_get_size(queue) == 0)
return NULL;
return (mca_pml_bfo_recv_request_t*)opal_list_get_first(queue);
}
static inline mca_pml_bfo_recv_request_t* get_next_posted_recv(
opal_list_t *queue,
mca_pml_bfo_recv_request_t* req)
{
opal_list_item_t *i = opal_list_get_next((opal_list_item_t*)req);
if(opal_list_get_end(queue) == i)
return NULL;
return (mca_pml_bfo_recv_request_t*)i;
}
static mca_pml_bfo_recv_request_t *match_incoming(
mca_pml_bfo_match_hdr_t *hdr, mca_pml_bfo_comm_t *comm,
mca_pml_bfo_comm_proc_t *proc)
{
mca_pml_bfo_recv_request_t *specific_recv, *wild_recv;
mca_pml_sequence_t wild_recv_seq, specific_recv_seq;
int tag = hdr->hdr_tag;
specific_recv = get_posted_recv(&proc->specific_receives);
wild_recv = get_posted_recv(&comm->wild_receives);
wild_recv_seq = wild_recv ?
wild_recv->req_recv.req_base.req_sequence : PML_MAX_SEQ;
specific_recv_seq = specific_recv ?
specific_recv->req_recv.req_base.req_sequence : PML_MAX_SEQ;
/* they are equal only if both are PML_MAX_SEQ */
while(wild_recv_seq != specific_recv_seq) {
mca_pml_bfo_recv_request_t **match;
opal_list_t *queue;
int req_tag;
mca_pml_sequence_t *seq;
if (OPAL_UNLIKELY(wild_recv_seq < specific_recv_seq)) {
match = &wild_recv;
queue = &comm->wild_receives;
seq = &wild_recv_seq;
} else {
match = &specific_recv;
queue = &proc->specific_receives;
seq = &specific_recv_seq;
}
req_tag = (*match)->req_recv.req_base.req_tag;
if(req_tag == tag || (req_tag == OMPI_ANY_TAG && tag >= 0)) {
opal_list_remove_item(queue, (opal_list_item_t*)(*match));
PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_REMOVE_FROM_POSTED_Q,
&((*match)->req_recv.req_base), PERUSE_RECV);
return *match;
}
*match = get_next_posted_recv(queue, *match);
*seq = (*match) ? (*match)->req_recv.req_base.req_sequence : PML_MAX_SEQ;
}
return NULL;
}
static mca_pml_bfo_recv_request_t*
match_one(mca_btl_base_module_t *btl,
mca_pml_bfo_match_hdr_t *hdr, mca_btl_base_segment_t* segments,
size_t num_segments, ompi_communicator_t *comm_ptr,
mca_pml_bfo_comm_proc_t *proc,
mca_pml_bfo_recv_frag_t* frag)
{
mca_pml_bfo_recv_request_t *match;
mca_pml_bfo_comm_t *comm = (mca_pml_bfo_comm_t *)comm_ptr->c_pml_comm;
do {
match = match_incoming(hdr, comm, proc);
/* if match found, process data */
if(OPAL_LIKELY(NULL != match)) {
match->req_recv.req_base.req_proc = proc->ompi_proc;
if(OPAL_UNLIKELY(MCA_PML_REQUEST_PROBE == match->req_recv.req_base.req_type)) {
/* complete the probe */
mca_pml_bfo_recv_request_matched_probe(match, btl, segments,
num_segments);
/* attempt to match actual request */
continue;
} else if (MCA_PML_REQUEST_MPROBE == match->req_recv.req_base.req_type) {
/* create a receive frag and associate it with the
request, which is then completed so that it can be
restarted later during mrecv */
mca_pml_bfo_recv_frag_t *tmp;
if(NULL == frag) {
MCA_PML_BFO_RECV_FRAG_ALLOC(tmp);
MCA_PML_BFO_RECV_FRAG_INIT(tmp, hdr, segments, num_segments, btl);
} else {
tmp = frag;
}
match->req_recv.req_base.req_addr = tmp;
mca_pml_bfo_recv_request_matched_probe(match, btl, segments,
num_segments);
/* this frag is already processed, so we want to break out
of the loop and not end up back on the unexpected queue. */
return NULL;
}
PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_MSG_MATCH_POSTED_REQ,
&(match->req_recv.req_base), PERUSE_RECV);
return match;
}
/* if no match found, place on unexpected queue */
append_frag_to_list(&proc->unexpected_frags, btl, hdr, segments,
num_segments, frag);
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_INSERT_IN_UNEX_Q, comm_ptr,
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
return NULL;
} while(true);
}
static mca_pml_bfo_recv_frag_t* check_cantmatch_for_match(mca_pml_bfo_comm_proc_t *proc)
{
mca_pml_bfo_recv_frag_t *frag;
/* search the list for a fragment from the send with sequence
* number next_msg_seq_expected
*/
for(frag = (mca_pml_bfo_recv_frag_t*)opal_list_get_first(&proc->frags_cant_match);
frag != (mca_pml_bfo_recv_frag_t*)opal_list_get_end(&proc->frags_cant_match);
frag = (mca_pml_bfo_recv_frag_t*)opal_list_get_next(frag))
{
mca_pml_bfo_match_hdr_t* hdr = &frag->hdr.hdr_match;
/*
* If the message has the next expected seq from that proc...
*/
if(hdr->hdr_seq != proc->expected_sequence)
continue;
opal_list_remove_item(&proc->frags_cant_match, (opal_list_item_t*)frag);
return frag;
}
return NULL;
}
/**
* RTS/CTS receive side matching
*
* @param hdr list of parameters needed for matching
* This list is also embedded in frag,
* but this allows to save a memory copy when
* a match is made in this routine. (IN)
* @param frag pointer to receive fragment which we want
* to match (IN/OUT). If a match is not made,
* hdr is copied to frag.
* @param match_made parameter indicating if we matched frag/
* hdr (OUT)
* @param additional_matches if a match is made with frag, we
* may be able to match fragments that previously
* have arrived out-of-order. If this is the
* case, the associated fragment descriptors are
* put on this list for further processing. (OUT)
*
* @return OMPI error code
*
* This routine is used to try and match a newly arrived message fragment
* to pre-posted receives. The following assumptions are made
* - fragments are received out of order
* - for long messages, e.g. more than one fragment, an RTS/CTS algorithm
* is used.
* - 2nd and greater fragments include a receive descriptor pointer
* - fragments may be dropped
* - fragments may be corrupt
* - this routine may be called simultaneously by more than one thread
*/
static int mca_pml_bfo_recv_frag_match( mca_btl_base_module_t *btl,
mca_pml_bfo_match_hdr_t *hdr,
mca_btl_base_segment_t* segments,
size_t num_segments,
int type)
{
/* local variables */
uint16_t next_msg_seq_expected, frag_msg_seq;
ompi_communicator_t *comm_ptr;
mca_pml_bfo_recv_request_t *match = NULL;
mca_pml_bfo_comm_t *comm;
mca_pml_bfo_comm_proc_t *proc;
mca_pml_bfo_recv_frag_t* frag = NULL;
/* communicator pointer */
comm_ptr = ompi_comm_lookup(hdr->hdr_ctx);
if(OPAL_UNLIKELY(NULL == comm_ptr)) {
/* This is a special case. A message for a not yet existing
* communicator can happen. Instead of doing a matching we
* will temporarily add it to a pending queue in the PML.
* Later on, when the communicator is completely instantiated,
* this pending queue will be searched and all matching fragments
* moved to the right communicator.
*/
append_frag_to_list( &mca_pml_bfo.non_existing_communicator_pending,
btl, hdr, segments, num_segments, NULL );
return OMPI_SUCCESS;
}
comm = (mca_pml_bfo_comm_t *)comm_ptr->c_pml_comm;
/* source sequence number */
frag_msg_seq = hdr->hdr_seq;
proc = &comm->procs[hdr->hdr_src];
/**
* We generate the MSG_ARRIVED event as soon as the PML is aware of a matching
* fragment arrival, independent of whether it is received in the correct order or not.
* This will allow the tools to figure out if the messages are not received in the
* correct order (if multiple network interfaces).
*/
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm_ptr,
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
/* get next expected message sequence number - if threaded
* run, lock to make sure that if another thread is processing
* a frag from the same message a match is made only once.
* Also, this prevents other posted receives (for a pair of
* end points) from being processed, and potentially "losing"
* the fragment.
*/
OPAL_THREAD_LOCK(&comm->matching_lock);
#if PML_BFO
if(OPAL_UNLIKELY(hdr->hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_RESTART)) {
if (NULL == (match = mca_pml_bfo_get_request(hdr))) {
return OMPI_SUCCESS;
}
} else {
#endif /* PML_BFO */
/* get sequence number of next message that can be processed */
next_msg_seq_expected = (uint16_t)proc->expected_sequence;
if(OPAL_UNLIKELY(frag_msg_seq != next_msg_seq_expected))
goto wrong_seq;
/*
* This is the sequence number we were expecting,
* so we can try matching it to already posted
* receives.
*/
out_of_order_match:
/* We're now expecting the next sequence number. */
proc->expected_sequence++;
/**
* We generate the SEARCH_POSTED_QUEUE only when the message is received
* in the correct sequence. Otherwise, we delay the event generation until
* we reach the correct sequence number.
*/
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_BEGIN, comm_ptr,
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
match = match_one(btl, hdr, segments, num_segments, comm_ptr, proc, frag);
/**
* The match is over. We generate the SEARCH_POSTED_Q_END here, before going
* into the mca_pml_bfo_check_cantmatch_for_match so we can make a difference
* for the searching time for all messages.
*/
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_END, comm_ptr,
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
/* release matching lock before processing fragment */
OPAL_THREAD_UNLOCK(&comm->matching_lock);
#if PML_BFO
}
#endif /* PML_BFO */
if(OPAL_LIKELY(match)) {
switch(type) {
case MCA_PML_BFO_HDR_TYPE_MATCH:
mca_pml_bfo_recv_request_progress_match(match, btl, segments, num_segments);
break;
case MCA_PML_BFO_HDR_TYPE_RNDV:
mca_pml_bfo_recv_request_progress_rndv(match, btl, segments, num_segments);
break;
case MCA_PML_BFO_HDR_TYPE_RGET:
mca_pml_bfo_recv_request_progress_rget(match, btl, segments, num_segments);
break;
}
if(OPAL_UNLIKELY(frag))
MCA_PML_BFO_RECV_FRAG_RETURN(frag);
}
/*
* Now that new message has arrived, check to see if
* any fragments on the frags_cant_match list
* may now be used to form new matches
*/
if(OPAL_UNLIKELY(opal_list_get_size(&proc->frags_cant_match) > 0)) {
OPAL_THREAD_LOCK(&comm->matching_lock);
if((frag = check_cantmatch_for_match(proc))) {
hdr = &frag->hdr.hdr_match;
segments = frag->segments;
num_segments = frag->num_segments;
btl = frag->btl;
type = hdr->hdr_common.hdr_type;
goto out_of_order_match;
}
OPAL_THREAD_UNLOCK(&comm->matching_lock);
}
return OMPI_SUCCESS;
wrong_seq:
/*
* This message comes after the next expected, so it
* is ahead of sequence. Save it for later.
*/
#if PML_BFO
if (true == mca_pml_bfo_is_duplicate_msg(proc, hdr)) {
return OMPI_SUCCESS;
}
#endif /* PML_BFO */
append_frag_to_list(&proc->frags_cant_match, btl, hdr, segments,
num_segments, NULL);
OPAL_THREAD_UNLOCK(&comm->matching_lock);
return OMPI_SUCCESS;
}

View file

@@ -1,172 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_PML_BFO_RECVFRAG_H
#define MCA_PML_BFO_RECVFRAG_H
#include "pml_bfo_hdr.h"
BEGIN_C_DECLS
struct mca_pml_bfo_buffer_t {
size_t len;
void * addr;
};
typedef struct mca_pml_bfo_buffer_t mca_pml_bfo_buffer_t;
struct mca_pml_bfo_recv_frag_t {
opal_free_list_item_t super;
mca_pml_bfo_hdr_t hdr;
size_t num_segments;
mca_btl_base_module_t* btl;
mca_btl_base_segment_t segments[MCA_BTL_DES_MAX_SEGMENTS];
mca_pml_bfo_buffer_t buffers[MCA_BTL_DES_MAX_SEGMENTS];
unsigned char addr[1];
};
typedef struct mca_pml_bfo_recv_frag_t mca_pml_bfo_recv_frag_t;
OBJ_CLASS_DECLARATION(mca_pml_bfo_recv_frag_t);
#define MCA_PML_BFO_RECV_FRAG_ALLOC(frag) \
do { \
opal_free_list_item_t* item; \
OPAL_FREE_LIST_WAIT_MT(&mca_pml_bfo.recv_frags, item); \
frag = (mca_pml_bfo_recv_frag_t*)item; \
} while(0)
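/* INIT copies the incoming segment data into the fragment: into the inline
 * addr[] buffer when the total size fits under unexpected_limit, otherwise
 * into a buffer obtained from the PML allocator (freed again in RETURN). */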
#define MCA_PML_BFO_RECV_FRAG_INIT(frag, hdr, segs, cnt, btl ) \
do { \
size_t i, _size; \
mca_btl_base_segment_t* macro_segments = frag->segments; \
mca_pml_bfo_buffer_t* buffers = frag->buffers; \
unsigned char* _ptr = (unsigned char*)frag->addr; \
/* init recv_frag */ \
frag->btl = btl; \
frag->hdr = *(mca_pml_bfo_hdr_t*)hdr; \
frag->num_segments = 1; \
_size = segs[0].seg_len; \
for( i = 1; i < cnt; i++ ) { \
_size += segs[i].seg_len; \
} \
/* copy over data */ \
if(_size <= mca_pml_bfo.unexpected_limit ) { \
macro_segments[0].seg_addr.pval = frag->addr; \
} else { \
buffers[0].len = _size; \
buffers[0].addr = (char*) \
mca_pml_bfo.allocator->alc_alloc( mca_pml_bfo.allocator, \
buffers[0].len, \
0, NULL); \
_ptr = (unsigned char*)(buffers[0].addr); \
macro_segments[0].seg_addr.pval = buffers[0].addr; \
} \
macro_segments[0].seg_len = _size; \
for( i = 0; i < cnt; i++ ) { \
memcpy( _ptr, segs[i].seg_addr.pval, segs[i].seg_len); \
_ptr += segs[i].seg_len; \
} \
} while(0)
#define MCA_PML_BFO_RECV_FRAG_RETURN(frag) \
do { \
if( frag->segments[0].seg_len > mca_pml_bfo.unexpected_limit ) { \
/* return buffers */ \
mca_pml_bfo.allocator->alc_free( mca_pml_bfo.allocator, \
frag->buffers[0].addr ); \
} \
frag->num_segments = 0; \
\
/* return recv_frag */ \
OPAL_FREE_LIST_RETURN(&mca_pml_bfo.recv_frags, \
(opal_free_list_item_t*)frag); \
} while(0)
/**
* Callback from BTL on receipt of a recv_frag (match).
*/
extern void mca_pml_bfo_recv_frag_callback_match( mca_btl_base_module_t *btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* descriptor,
void* cbdata );
/**
* Callback from BTL on receipt of a recv_frag (rndv).
*/
extern void mca_pml_bfo_recv_frag_callback_rndv( mca_btl_base_module_t *btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* descriptor,
void* cbdata );
/**
* Callback from BTL on receipt of a recv_frag (rget).
*/
extern void mca_pml_bfo_recv_frag_callback_rget( mca_btl_base_module_t *btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* descriptor,
void* cbdata );
/**
* Callback from BTL on receipt of a recv_frag (ack).
*/
extern void mca_pml_bfo_recv_frag_callback_ack( mca_btl_base_module_t *btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* descriptor,
void* cbdata );
/**
* Callback from BTL on receipt of a recv_frag (frag).
*/
extern void mca_pml_bfo_recv_frag_callback_frag( mca_btl_base_module_t *btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* descriptor,
void* cbdata );
/**
* Callback from BTL on receipt of a recv_frag (put).
*/
extern void mca_pml_bfo_recv_frag_callback_put( mca_btl_base_module_t *btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* descriptor,
void* cbdata );
/**
* Callback from BTL on receipt of a recv_frag (fin).
*/
extern void mca_pml_bfo_recv_frag_callback_fin( mca_btl_base_module_t *btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* descriptor,
void* cbdata );
END_C_DECLS
#endif

Diff not shown due to its large size.

View file

@@ -1,449 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2016 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef OMPI_PML_BFO_RECV_REQUEST_H
#define OMPI_PML_BFO_RECV_REQUEST_H
#include "pml_bfo.h"
#include "pml_bfo_rdma.h"
#include "pml_bfo_rdmafrag.h"
#include "ompi/proc/proc.h"
#include "ompi/mca/pml/bfo/pml_bfo_comm.h"
#include "opal/mca/mpool/base/base.h"
#include "ompi/mca/pml/base/pml_base_recvreq.h"
#if PML_BFO
#define RECVREQ_RECVERRSENT 0x01
#define RECVREQ_RNDVRESTART_RECVED 0x02
#define RECVREQ_RNDVRESTART_ACKED 0x04
#endif /* PML_BFO */
BEGIN_C_DECLS
struct mca_pml_bfo_recv_request_t {
mca_pml_base_recv_request_t req_recv;
opal_ptr_t remote_req_send;
#if PML_BFO
int32_t req_msgseq; /* PML sequence number */
int32_t req_events; /* number of outstanding events on request */
int32_t req_restartseq; /* sequence number of restarted request */
int32_t req_errstate; /* state of request if in error */
#endif /* PML_BFO */
int32_t req_lock;
size_t req_pipeline_depth;
size_t req_bytes_received; /**< amount of data transferred into the user buffer */
size_t req_bytes_expected; /**< local size of the data as suggested by the user */
size_t req_rdma_offset;
size_t req_send_offset;
uint32_t req_rdma_cnt;
uint32_t req_rdma_idx;
bool req_pending;
bool req_ack_sent; /**< whether ack was sent to the sender */
bool req_match_received; /**< Prevent request to be completed prematurely */
opal_mutex_t lock;
mca_pml_bfo_com_btl_t req_rdma[1];
};
typedef struct mca_pml_bfo_recv_request_t mca_pml_bfo_recv_request_t;
OBJ_CLASS_DECLARATION(mca_pml_bfo_recv_request_t);
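/*
 * req_lock is a counter-based scheduling lock: the thread that raises the
 * counter from 0 to 1 becomes the (only) scheduler; unlock returns true only
 * when the counter drops back to 0, i.e. no other thread signalled more work
 * while we held the lock, so the exclusive loops below know when to re-run.
 */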
static inline bool lock_recv_request(mca_pml_bfo_recv_request_t *recvreq)
{
return OPAL_THREAD_ADD_FETCH32(&recvreq->req_lock, 1) == 1;
}
static inline bool unlock_recv_request(mca_pml_bfo_recv_request_t *recvreq)
{
return OPAL_THREAD_ADD_FETCH32(&recvreq->req_lock, -1) == 0;
}
/**
* Allocate a recv request from the modules free list.
*
* @param rc (OUT) OMPI_SUCCESS or error status on failure.
* @return Receive request.
*/
#define MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq) \
do { \
ompi_free_list_item_t* item; \
OMPI_FREE_LIST_GET_MT(&mca_pml_base_recv_requests, item); \
recvreq = (mca_pml_bfo_recv_request_t*)item; \
} while(0)
/**
* Initialize a receive request with call parameters.
*
* @param request (IN) Receive request.
* @param addr (IN) User buffer.
* @param count (IN) Number of elements of indicated datatype.
* @param datatype (IN) User defined datatype.
* @param src (IN) Source rank w/in the communicator.
* @param tag (IN) User defined tag.
* @param comm (IN) Communicator.
* @param persistent (IN) Is this a persistent request.
*/
#define MCA_PML_BFO_RECV_REQUEST_INIT( request, \
addr, \
count, \
datatype, \
src, \
tag, \
comm, \
persistent) \
do { \
MCA_PML_BASE_RECV_REQUEST_INIT( &(request)->req_recv, \
addr, \
count, \
datatype, \
src, \
tag, \
comm, \
persistent); \
} while(0)
/**
* Mark the request as completed at MPI level for internal purposes.
*
* @param recvreq (IN) Receive request.
*/
#define MCA_PML_BFO_RECV_REQUEST_MPI_COMPLETE( recvreq ) \
do { \
PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE, \
&(recvreq->req_recv.req_base), PERUSE_RECV ); \
ompi_request_complete( &(recvreq->req_recv.req_base.req_ompi), true ); \
} while (0)
/*
* Free the PML receive request
*/
#define MCA_PML_BFO_RECV_REQUEST_RETURN(recvreq) \
{ \
MCA_PML_BASE_RECV_REQUEST_FINI(&(recvreq)->req_recv); \
OPAL_FREE_LIST_RETURN( &mca_pml_base_recv_requests, \
(opal_free_list_item_t*)(recvreq)); \
}
/**
* Complete receive request. Request structure cannot be accessed after calling
* this function any more.
*
* @param recvreq (IN) Receive request.
*/
static inline void
recv_request_pml_complete(mca_pml_bfo_recv_request_t *recvreq)
{
size_t i;
if(false == recvreq->req_recv.req_base.req_pml_complete) {
if(recvreq->req_recv.req_bytes_packed > 0) {
PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END,
&recvreq->req_recv.req_base, PERUSE_RECV );
}
for(i = 0; i < recvreq->req_rdma_cnt; i++) {
mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[i].btl_reg;
if( NULL != btl_reg && btl_reg->mpool != NULL) {
btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg );
}
}
recvreq->req_rdma_cnt = 0;
#if PML_BFO
recvreq->req_msgseq -= 100;
#endif /* PML_BFO */
if(true == recvreq->req_recv.req_base.req_free_called) {
if( MPI_SUCCESS != recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR ) {
ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_REQUEST);
}
MCA_PML_BFO_RECV_REQUEST_RETURN(recvreq);
} else {
/* initialize request status */
recvreq->req_recv.req_base.req_pml_complete = true;
recvreq->req_recv.req_base.req_ompi.req_status._ucount =
recvreq->req_bytes_received;
if (recvreq->req_recv.req_bytes_packed > recvreq->req_bytes_expected) {
recvreq->req_recv.req_base.req_ompi.req_status._ucount =
recvreq->req_recv.req_bytes_packed;
recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR =
MPI_ERR_TRUNCATE;
}
MCA_PML_BFO_RECV_REQUEST_MPI_COMPLETE(recvreq);
}
}
}
static inline bool
recv_request_pml_complete_check(mca_pml_bfo_recv_request_t *recvreq)
{
#if OPAL_ENABLE_MULTI_THREADS
opal_atomic_rmb();
#endif
if(recvreq->req_match_received &&
recvreq->req_bytes_received >= recvreq->req_recv.req_bytes_packed &&
#if PML_BFO
(0 == recvreq->req_events) && lock_recv_request(recvreq)) {
#else /* PML_BFO */
lock_recv_request(recvreq)) {
#endif /* PML_BFO */
recv_request_pml_complete(recvreq);
return true;
}
return false;
}
extern void mca_pml_bfo_recv_req_start(mca_pml_bfo_recv_request_t *req);
#define MCA_PML_BFO_RECV_REQUEST_START(r) mca_pml_bfo_recv_req_start(r)
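/* Prepare the request's convertor for unpacking into the user buffer and
 * derive req_bytes_expected from it; skipped only when both the datatype
 * size and the element count are zero. */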
static inline void prepare_recv_req_converter(mca_pml_bfo_recv_request_t *req)
{
if( req->req_recv.req_base.req_datatype->super.size | req->req_recv.req_base.req_count ) {
opal_convertor_copy_and_prepare_for_recv(
req->req_recv.req_base.req_proc->super.proc_convertor,
&(req->req_recv.req_base.req_datatype->super),
req->req_recv.req_base.req_count,
req->req_recv.req_base.req_addr,
0,
&req->req_recv.req_base.req_convertor);
opal_convertor_get_unpacked_size(&req->req_recv.req_base.req_convertor,
&req->req_bytes_expected);
}
}
#define MCA_PML_BFO_RECV_REQUEST_MATCHED(request, hdr) \
recv_req_matched(request, hdr)
static inline void recv_req_matched(mca_pml_bfo_recv_request_t *req,
mca_pml_bfo_match_hdr_t *hdr)
{
req->req_recv.req_base.req_ompi.req_status.MPI_SOURCE = hdr->hdr_src;
req->req_recv.req_base.req_ompi.req_status.MPI_TAG = hdr->hdr_tag;
req->req_match_received = true;
#if PML_BFO
req->req_msgseq = hdr->hdr_seq;
#endif /* PML_BFO */
#if OPAL_ENABLE_MULTI_THREADS
opal_atomic_wmb();
#endif
if(req->req_recv.req_bytes_packed > 0) {
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
if(MPI_ANY_SOURCE == req->req_recv.req_base.req_peer) {
/* non-wildcard receives were prepared during post; wildcard ones are prepared here */
prepare_recv_req_converter(req);
}
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT */
PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_XFER_BEGIN,
&req->req_recv.req_base, PERUSE_RECV);
}
}
/**
* Unpack received segment data (starting at seg_offset) into the user
* buffer at data_offset, reporting the number of bytes delivered.
*/
#define MCA_PML_BFO_RECV_REQUEST_UNPACK( request, \
segments, \
num_segments, \
seg_offset, \
data_offset, \
bytes_received, \
bytes_delivered) \
do { \
bytes_delivered = 0; \
if(request->req_recv.req_bytes_packed > 0) { \
struct iovec iov[MCA_BTL_DES_MAX_SEGMENTS]; \
uint32_t iov_count = 0; \
size_t max_data = bytes_received; \
size_t n, offset = seg_offset; \
mca_btl_base_segment_t* segment = segments; \
\
OPAL_THREAD_LOCK(&request->lock); \
for( n = 0; n < num_segments; n++, segment++ ) { \
if(offset >= segment->seg_len) { \
offset -= segment->seg_len; \
} else { \
iov[iov_count].iov_len = segment->seg_len - offset; \
iov[iov_count].iov_base = (IOVBASE_TYPE*) \
((unsigned char*)segment->seg_addr.pval + offset); \
iov_count++; \
offset = 0; \
} \
} \
PERUSE_TRACE_COMM_OMPI_EVENT (PERUSE_COMM_REQ_XFER_CONTINUE, \
&(recvreq->req_recv.req_base), max_data, \
PERUSE_RECV); \
opal_convertor_set_position( &(request->req_recv.req_base.req_convertor), \
&data_offset ); \
opal_convertor_unpack( &(request)->req_recv.req_base.req_convertor, \
iov, \
&iov_count, \
&max_data ); \
bytes_delivered = max_data; \
OPAL_THREAD_UNLOCK(&request->lock); \
} \
} while (0)
/**
* Progress a receive matched from an eager MATCH fragment.
*/
void mca_pml_bfo_recv_request_progress_match(
mca_pml_bfo_recv_request_t* req,
struct mca_btl_base_module_t* btl,
mca_btl_base_segment_t* segments,
size_t num_segments);
/**
* Progress a receive on arrival of a FRAG (pipeline data) fragment.
*/
void mca_pml_bfo_recv_request_progress_frag(
mca_pml_bfo_recv_request_t* req,
struct mca_btl_base_module_t* btl,
mca_btl_base_segment_t* segments,
size_t num_segments);
/**
* Progress a receive matched from a RNDV (rendezvous) fragment.
*/
void mca_pml_bfo_recv_request_progress_rndv(
mca_pml_bfo_recv_request_t* req,
struct mca_btl_base_module_t* btl,
mca_btl_base_segment_t* segments,
size_t num_segments);
/**
* Progress a receive matched from an RGET (RDMA get) fragment.
*/
void mca_pml_bfo_recv_request_progress_rget(
mca_pml_bfo_recv_request_t* req,
struct mca_btl_base_module_t* btl,
mca_btl_base_segment_t* segments,
size_t num_segments);
/**
* Complete a probe request that matched this fragment.
*/
void mca_pml_bfo_recv_request_matched_probe(
mca_pml_bfo_recv_request_t* req,
struct mca_btl_base_module_t* btl,
mca_btl_base_segment_t* segments,
size_t num_segments);
/**
* Run one pass of the receive-side RDMA scheduler for this request.
*/
int mca_pml_bfo_recv_request_schedule_once(
mca_pml_bfo_recv_request_t* req, mca_bml_base_btl_t* start_bml_btl);
static inline int mca_pml_bfo_recv_request_schedule_exclusive(
mca_pml_bfo_recv_request_t* req,
mca_bml_base_btl_t* start_bml_btl)
{
int rc;
do {
rc = mca_pml_bfo_recv_request_schedule_once(req, start_bml_btl);
if(rc == OMPI_ERR_OUT_OF_RESOURCE)
break;
} while(!unlock_recv_request(req));
if(OMPI_SUCCESS == rc)
recv_request_pml_complete_check(req);
return rc;
}
static inline void mca_pml_bfo_recv_request_schedule(
mca_pml_bfo_recv_request_t* req,
mca_bml_base_btl_t* start_bml_btl)
{
if(!lock_recv_request(req))
return;
(void)mca_pml_bfo_recv_request_schedule_exclusive(req, start_bml_btl);
}
#define MCA_PML_BFO_ADD_ACK_TO_PENDING(P, S, D, O) \
do { \
mca_pml_bfo_pckt_pending_t *_pckt; \
\
MCA_PML_BFO_PCKT_PENDING_ALLOC(_pckt); \
_pckt->hdr.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_ACK; \
_pckt->hdr.hdr_ack.hdr_src_req.lval = (S); \
_pckt->hdr.hdr_ack.hdr_dst_req.pval = (D); \
_pckt->hdr.hdr_ack.hdr_send_offset = (O); \
_pckt->proc = (P); \
_pckt->bml_btl = NULL; \
OPAL_THREAD_LOCK(&mca_pml_bfo.lock); \
opal_list_append(&mca_pml_bfo.pckt_pending, \
(opal_list_item_t*)_pckt); \
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); \
} while(0)
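/* Send an ACK for a rendezvous message: try each eager BTL in turn and, if
 * none can take it right now, park the packet on pckt_pending (macro above)
 * so the progress engine can retry it later. */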
int mca_pml_bfo_recv_request_ack_send_btl(ompi_proc_t* proc,
mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req,
uint64_t hdr_rdma_offset, bool nordma);
static inline int mca_pml_bfo_recv_request_ack_send(ompi_proc_t* proc,
uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset,
bool nordma)
{
size_t i;
mca_bml_base_btl_t* bml_btl;
mca_bml_base_endpoint_t* endpoint =
(mca_bml_base_endpoint_t*)proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
if(mca_pml_bfo_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req,
hdr_dst_req, hdr_send_offset, nordma) == OMPI_SUCCESS)
return OMPI_SUCCESS;
}
MCA_PML_BFO_ADD_ACK_TO_PENDING(proc, hdr_src_req, hdr_dst_req,
hdr_send_offset);
return OMPI_ERR_OUT_OF_RESOURCE;
}
int mca_pml_bfo_recv_request_get_frag(mca_pml_bfo_rdma_frag_t* frag);
/* This function tries to continue recvreqs that are stuck due to resource
* unavailability. A recvreq is added to the recv_pending list if scheduling
* of the put operation cannot be accomplished for some reason. */
void mca_pml_bfo_recv_request_process_pending(void);
END_C_DECLS
#endif

Diff not shown due to its large size.

View file

@@ -1,499 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2016 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OMPI_PML_BFO_SEND_REQUEST_H
#define OMPI_PML_BFO_SEND_REQUEST_H
#include "opal/mca/btl/btl.h"
#include "opal/mca/mpool/base/base.h"
#include "ompi/mca/pml/base/pml_base_sendreq.h"
#include "pml_bfo_comm.h"
#include "pml_bfo_hdr.h"
#include "pml_bfo_rdma.h"
#include "pml_bfo_rdmafrag.h"
#include "ompi/mca/bml/bml.h"
BEGIN_C_DECLS
typedef enum {
MCA_PML_BFO_SEND_PENDING_NONE,
MCA_PML_BFO_SEND_PENDING_SCHEDULE,
MCA_PML_BFO_SEND_PENDING_START
} mca_pml_bfo_send_pending_t;
struct mca_pml_bfo_send_request_t {
mca_pml_base_send_request_t req_send;
mca_bml_base_endpoint_t* req_endpoint;
opal_ptr_t req_recv;
#if PML_BFO
int32_t req_events; /* number of outstanding events on request */
int32_t req_restartseq; /* sequence number of restarted request */
int32_t req_restart; /* state of restarted request */
int32_t req_error; /* non-zero when error has occurred on request */
#endif /* PML_BFO */
int32_t req_state;
int32_t req_lock;
bool req_throttle_sends;
size_t req_pipeline_depth;
size_t req_bytes_delivered;
uint32_t req_rdma_cnt;
mca_pml_bfo_send_pending_t req_pending;
opal_mutex_t req_send_range_lock;
opal_list_t req_send_ranges;
mca_pml_bfo_com_btl_t req_rdma[1];
};
typedef struct mca_pml_bfo_send_request_t mca_pml_bfo_send_request_t;
OBJ_CLASS_DECLARATION(mca_pml_bfo_send_request_t);
struct mca_pml_bfo_send_range_t {
opal_free_list_item_t base;
uint64_t range_send_offset;
uint64_t range_send_length;
int range_btl_idx;
int range_btl_cnt;
mca_pml_bfo_com_btl_t range_btls[1];
};
typedef struct mca_pml_bfo_send_range_t mca_pml_bfo_send_range_t;
OBJ_CLASS_DECLARATION(mca_pml_bfo_send_range_t);
static inline bool lock_send_request(mca_pml_bfo_send_request_t *sendreq)
{
return OPAL_THREAD_ADD_FETCH32(&sendreq->req_lock, 1) == 1;
}
static inline bool unlock_send_request(mca_pml_bfo_send_request_t *sendreq)
{
return OPAL_THREAD_ADD_FETCH32(&sendreq->req_lock, -1) == 0;
}
static inline void
add_request_to_send_pending(mca_pml_bfo_send_request_t* sendreq,
const mca_pml_bfo_send_pending_t type,
const bool append)
{
opal_list_item_t *item = (opal_list_item_t*)sendreq;
OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
sendreq->req_pending = type;
if(append)
opal_list_append(&mca_pml_bfo.send_pending, item);
else
opal_list_prepend(&mca_pml_bfo.send_pending, item);
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
}
static inline mca_pml_bfo_send_request_t*
get_request_from_send_pending(mca_pml_bfo_send_pending_t *type)
{
mca_pml_bfo_send_request_t *sendreq;
OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
sendreq = (mca_pml_bfo_send_request_t*)
opal_list_remove_first(&mca_pml_bfo.send_pending);
if(sendreq) {
*type = sendreq->req_pending;
sendreq->req_pending = MCA_PML_BFO_SEND_PENDING_NONE;
}
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
return sendreq;
}
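/*
 * Sketch (hypothetical, heavily simplified) of how the send_pending list
 * above is meant to be drained when a BTL frees resources: pop entries,
 * inspect the recorded reason, and retry the matching operation.
 */
#include <stdio.h>

enum pending_type { PENDING_NONE, PENDING_SCHEDULE, PENDING_START };

/* stand-ins for get_request_from_send_pending() and the retry calls */
static int queue[] = { PENDING_START, PENDING_SCHEDULE };
static int head = 0, tail = 2;

static int pop_pending(void) {
    return head < tail ? queue[head++] : PENDING_NONE;
}

int main(void) {
    int type;
    while ((type = pop_pending()) != PENDING_NONE) {
        switch (type) {
        case PENDING_START:    puts("retry request start");     break;
        case PENDING_SCHEDULE: puts("retry fragment schedule"); break;
        }
    }
    return 0;
}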
#define MCA_PML_BFO_SEND_REQUEST_ALLOC( comm, \
dst, \
sendreq) \
{ \
ompi_proc_t *proc = ompi_comm_peer_lookup( comm, dst ); \
opal_free_list_item_t* item; \
\
sendreq = NULL; \
if( OPAL_LIKELY(NULL != proc) ) { \
OPAL_FREE_LIST_WAIT_MT(&mca_pml_base_send_requests, item); \
sendreq = (mca_pml_bfo_send_request_t*)item; \
sendreq->req_send.req_base.req_proc = proc; \
} \
}
#define MCA_PML_BFO_SEND_REQUEST_INIT( sendreq, \
buf, \
count, \
datatype, \
dst, \
tag, \
comm, \
sendmode, \
persistent) \
{ \
MCA_PML_BASE_SEND_REQUEST_INIT(&sendreq->req_send, \
buf, \
count, \
datatype, \
dst, \
tag, \
comm, \
sendmode, \
persistent, \
0); /* convertor_flags */ \
(sendreq)->req_recv.pval = NULL; \
}
static inline void mca_pml_bfo_free_rdma_resources(mca_pml_bfo_send_request_t* sendreq)
{
size_t r;
/* return mpool resources */
for(r = 0; r < sendreq->req_rdma_cnt; r++) {
struct mca_btl_base_registration_handle_t* handle = sendreq->req_rdma[r].btl_reg;
mca_bml_base_btl_t *bml_btl = sendreq->req_rdma[r].bml_btl;
if( NULL != handle ) {
mca_bml_base_deregister_mem (bml_btl, handle);
sendreq->req_rdma[r].btl_reg = NULL;
}
}
sendreq->req_rdma_cnt = 0;
}
/**
* Start a send request.
*/
#define MCA_PML_BFO_SEND_REQUEST_START(sendreq, rc) \
do { \
rc = mca_pml_bfo_send_request_start(sendreq); \
} while (0)
/*
* Mark a send request as completed at the MPI level.
*/
#define MCA_PML_BFO_SEND_REQUEST_MPI_COMPLETE(sendreq, with_signal) \
do { \
(sendreq)->req_send.req_base.req_ompi.req_status.MPI_SOURCE = \
(sendreq)->req_send.req_base.req_comm->c_my_rank; \
(sendreq)->req_send.req_base.req_ompi.req_status.MPI_TAG = \
(sendreq)->req_send.req_base.req_tag; \
(sendreq)->req_send.req_base.req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS; \
(sendreq)->req_send.req_base.req_ompi.req_status._ucount = \
(sendreq)->req_send.req_bytes_packed; \
PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE, \
&(sendreq->req_send.req_base), PERUSE_SEND); \
\
ompi_request_complete( &((sendreq)->req_send.req_base.req_ompi), (with_signal) ); \
} while(0)
/*
* Release resources associated with a request
*/
#define MCA_PML_BFO_SEND_REQUEST_RETURN(sendreq) \
do { \
MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send)); \
OPAL_FREE_LIST_RETURN( &mca_pml_base_send_requests, \
(opal_free_list_item_t*)sendreq); \
} while(0)
/*
* The PML has completed a send request. Note that this request
* may have been orphaned by the user or have already completed
* at the MPI level.
* This function will never be called directly from the upper level, as it
* should only be an internal call to the PML.
*
*/
static inline void
send_request_pml_complete(mca_pml_bfo_send_request_t *sendreq)
{
if(false == sendreq->req_send.req_base.req_pml_complete) {
if(sendreq->req_send.req_bytes_packed > 0) {
PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END,
&(sendreq->req_send.req_base), PERUSE_SEND);
}
/* return mpool resources */
mca_pml_bfo_free_rdma_resources(sendreq);
if (sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED &&
sendreq->req_send.req_addr != sendreq->req_send.req_base.req_addr) {
mca_pml_base_bsend_request_fini((ompi_request_t*)sendreq);
}
sendreq->req_send.req_base.req_pml_complete = true;
if( !REQUEST_COMPLETE( &((sendreq->req_send).req_base.req_ompi)) ) {
/* Should only be called for long messages (maybe synchronous) */
MCA_PML_BFO_SEND_REQUEST_MPI_COMPLETE(sendreq, true);
} else {
if( MPI_SUCCESS != sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR ) {
ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_REQUEST);
}
}
#if PML_BFO
sendreq->req_send.req_base.req_sequence -= 100;
#endif /* PML_BFO */
if(true == sendreq->req_send.req_base.req_free_called) {
MCA_PML_BFO_SEND_REQUEST_RETURN(sendreq);
}
}
}
/* returns true if request was completed on PML level */
static inline bool
send_request_pml_complete_check(mca_pml_bfo_send_request_t *sendreq)
{
#if OPAL_ENABLE_MULTI_THREADS
opal_atomic_rmb();
#endif
    /* if no more events are expected for the request, the whole message has
     * already been sent, and send fragment scheduling isn't running in
     * another thread, then complete the request at the PML level. From now
     * on, if the user called free on this request, the request structure can
     * be reused for another request, or, if the request is persistent, it
     * can be restarted */
if(sendreq->req_state == 0 &&
sendreq->req_bytes_delivered >= sendreq->req_send.req_bytes_packed
&& lock_send_request(sendreq)) {
send_request_pml_complete(sendreq);
return true;
}
return false;
}
/**
* Schedule additional fragments
*/
int
mca_pml_bfo_send_request_schedule_once(mca_pml_bfo_send_request_t*);
static inline int
mca_pml_bfo_send_request_schedule_exclusive(mca_pml_bfo_send_request_t* sendreq)
{
int rc;
do {
rc = mca_pml_bfo_send_request_schedule_once(sendreq);
if(rc == OMPI_ERR_OUT_OF_RESOURCE)
break;
} while(!unlock_send_request(sendreq));
if(OMPI_SUCCESS == rc)
send_request_pml_complete_check(sendreq);
return rc;
}
static inline void
mca_pml_bfo_send_request_schedule(mca_pml_bfo_send_request_t* sendreq)
{
/*
* Only allow one thread in this routine for a given request.
* However, we cannot block callers on a mutex, so simply keep track
* of the number of times the routine has been called and run through
* the scheduling logic once for every call.
*/
if(!lock_send_request(sendreq))
return;
mca_pml_bfo_send_request_schedule_exclusive(sendreq);
}
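/*
 * Self-contained sketch (hypothetical names, C11 atomics) of the counting
 * "lock" used by lock/unlock_send_request() and the schedule routine above:
 * the first caller takes the lock, and callers that arrive while it is held
 * merely bump the counter instead of blocking; the holder then re-runs the
 * scheduling logic once per absorbed caller before the count drops to zero.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int req_lock = 0;

/* add-and-fetch semantics, mirroring OPAL_THREAD_ADD_FETCH32 */
static int try_lock(void) { return atomic_fetch_add(&req_lock, 1) + 1 == 1; }
static int unlock(void)   { return atomic_fetch_sub(&req_lock, 1) - 1 == 0; }

static void schedule(void) {
    if (!try_lock())
        return;                  /* the current holder will run our pass */
    do {
        puts("one scheduling pass");
    } while (!unlock());         /* repeat once per caller we absorbed */
}

int main(void) {
    schedule();
    return 0;
}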
#if OPAL_CUDA_SUPPORT
int mca_pml_bfo_send_request_start_cuda(
mca_pml_bfo_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl,
size_t size);
#endif /* OPAL_CUDA_SUPPORT */
/**
* Start the specified request
*/
int mca_pml_bfo_send_request_start_buffered(
mca_pml_bfo_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl,
size_t size);
int mca_pml_bfo_send_request_start_copy(
mca_pml_bfo_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl,
size_t size);
int mca_pml_bfo_send_request_start_prepare(
mca_pml_bfo_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl,
size_t size);
int mca_pml_bfo_send_request_start_rdma(
mca_pml_bfo_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl,
size_t size);
int mca_pml_bfo_send_request_start_rndv(
mca_pml_bfo_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl,
size_t size,
int flags);
static inline int
mca_pml_bfo_send_request_start_btl( mca_pml_bfo_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl )
{
size_t size = sendreq->req_send.req_bytes_packed;
mca_btl_base_module_t* btl = bml_btl->btl;
size_t eager_limit = btl->btl_eager_limit - sizeof(mca_pml_bfo_hdr_t);
int rc;
if( OPAL_LIKELY(size <= eager_limit) ) {
switch(sendreq->req_send.req_send_mode) {
case MCA_PML_BASE_SEND_SYNCHRONOUS:
rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size, 0);
break;
case MCA_PML_BASE_SEND_BUFFERED:
rc = mca_pml_bfo_send_request_start_copy(sendreq, bml_btl, size);
break;
case MCA_PML_BASE_SEND_COMPLETE:
rc = mca_pml_bfo_send_request_start_prepare(sendreq, bml_btl, size);
break;
default:
if (size != 0 && bml_btl->btl_flags & MCA_BTL_FLAGS_SEND_INPLACE) {
rc = mca_pml_bfo_send_request_start_prepare(sendreq, bml_btl, size);
} else {
rc = mca_pml_bfo_send_request_start_copy(sendreq, bml_btl, size);
}
break;
}
} else {
size = eager_limit;
if(OPAL_UNLIKELY(btl->btl_rndv_eager_limit < eager_limit))
size = btl->btl_rndv_eager_limit;
if(sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED) {
rc = mca_pml_bfo_send_request_start_buffered(sendreq, bml_btl, size);
} else if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {
unsigned char *base;
opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base );
if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_bfo_rdma_btls(
sendreq->req_endpoint,
base,
sendreq->req_send.req_bytes_packed,
sendreq->req_rdma))) {
rc = mca_pml_bfo_send_request_start_rdma(sendreq, bml_btl,
sendreq->req_send.req_bytes_packed);
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
mca_pml_bfo_free_rdma_resources(sendreq);
}
} else {
rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size,
MCA_PML_BFO_HDR_FLAGS_CONTIG);
}
} else {
#if OPAL_CUDA_SUPPORT
if (sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) {
return mca_pml_bfo_send_request_start_cuda(sendreq, bml_btl, size);
}
#endif /* OPAL_CUDA_SUPPORT */
rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size, 0);
}
}
return rc;
}
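/*
 * Self-contained sketch (hypothetical numbers and names) of the size-based
 * protocol selection above: messages that fit under the eager limit, minus
 * the BFO header, go out in a single eager fragment; larger ones start a
 * rendezvous whose first fragment may be further capped by the BTL's
 * btl_rndv_eager_limit.
 */
#include <stdio.h>
#include <stddef.h>

static size_t first_frag(size_t msg, size_t eager_limit, size_t rndv_limit,
                         size_t hdr) {
    size_t limit = eager_limit - hdr;   /* payload room in an eager fragment */
    if (msg <= limit)
        return msg;                     /* eager: whole message at once */
    if (rndv_limit < limit)             /* rendezvous first-fragment cap */
        limit = rndv_limit;
    return limit;
}

int main(void) {
    printf("small: %zu bytes\n", first_frag(1000, 4096, 2048, 64));   /* 1000 */
    printf("large: %zu bytes\n", first_frag(100000, 4096, 2048, 64)); /* 2048 */
    return 0;
}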
static inline int
mca_pml_bfo_send_request_start( mca_pml_bfo_send_request_t* sendreq )
{
mca_pml_bfo_comm_t* comm = sendreq->req_send.req_base.req_comm->c_pml_comm;
mca_bml_base_endpoint_t* endpoint = (mca_bml_base_endpoint_t*)
sendreq->req_send.req_base.req_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
size_t i;
if( OPAL_UNLIKELY(endpoint == NULL) ) {
return OMPI_ERR_UNREACH;
}
sendreq->req_endpoint = endpoint;
sendreq->req_state = 0;
sendreq->req_lock = 0;
sendreq->req_pipeline_depth = 0;
sendreq->req_bytes_delivered = 0;
sendreq->req_pending = MCA_PML_BFO_SEND_PENDING_NONE;
sendreq->req_send.req_base.req_sequence = OPAL_THREAD_ADD_FETCH32(
&comm->procs[sendreq->req_send.req_base.req_peer].send_sequence,1);
#if PML_BFO
sendreq->req_restartseq = 0; /* counts up restarts */
sendreq->req_restart = 0; /* reset in case we restart again */
sendreq->req_error = 0; /* clear error state */
sendreq->req_events = 0; /* clear events, probably 0 anyways */
#endif /* PML_BFO */
MCA_PML_BASE_SEND_START( &sendreq->req_send.req_base );
for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
mca_bml_base_btl_t* bml_btl;
int rc;
/* select a btl */
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl);
if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc) )
return rc;
}
add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true);
return OMPI_SUCCESS;
}
/**
* Initiate a put scheduled by the receiver.
*/
void mca_pml_bfo_send_request_put( mca_pml_bfo_send_request_t* sendreq,
mca_btl_base_module_t* btl,
mca_pml_bfo_rdma_hdr_t* hdr );
int mca_pml_bfo_send_request_put_frag(mca_pml_bfo_rdma_frag_t* frag);
/* This function tries to continue a sendreq that was stuck because of resource
 * unavailability. A sendreq may be added to the send_pending list when there is
 * no resource to send the initial packet or no resource to schedule data for
 * sending. The reason the sendreq was added to the list is stored inside the
 * sendreq struct, and the appropriate operation is retried when a resource
 * becomes available. The bml_btl passed to this function does not represent the
 * sendreq destination; it represents the BTL on which a resource was freed, so
 * only this BTL should be considered for sending packets */
void mca_pml_bfo_send_request_process_pending(mca_bml_base_btl_t *bml_btl);
void mca_pml_bfo_send_request_copy_in_out(mca_pml_bfo_send_request_t *sendreq,
uint64_t send_offset, uint64_t send_length);
END_C_DECLS
#endif /* OMPI_PML_BFO_SEND_REQUEST_H */

View file

@@ -1,148 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2016 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "pml_bfo.h"
#include "pml_bfo_recvreq.h"
#include "pml_bfo_sendreq.h"
#include "ompi/memchecker.h"
int mca_pml_bfo_start(size_t count, ompi_request_t** requests)
{
int rc;
size_t i;
bool reuse_old_request = true;
for(i=0; i<count; i++) {
mca_pml_base_request_t *pml_request = (mca_pml_base_request_t*)requests[i];
if(NULL == pml_request) {
continue;
}
if (OMPI_REQUEST_PML != requests[i]->req_type) {
continue;
}
/* If the persistent request is currently active, obtain the
 * request lock and verify that its status is incomplete. If the
 * PML layer has not completed the request, mark the request as
 * free-called, so that it will be freed when the request
 * completes, and create a new request.
 */
#if OPAL_ENABLE_MULTI_THREADS
opal_atomic_rmb();
#endif
reuse_old_request = true;
switch(pml_request->req_ompi.req_state) {
case OMPI_REQUEST_INACTIVE:
if(pml_request->req_pml_complete == true)
break;
/* otherwise fall through */
case OMPI_REQUEST_ACTIVE: {
ompi_request_t *request;
if (pml_request->req_pml_complete == false) {
/* free request after it completes */
pml_request->req_free_called = true;
} else {
/* can reuse the existing request */
break;
}
reuse_old_request = false;
/* allocate a new request */
switch(pml_request->req_type) {
case MCA_PML_REQUEST_SEND: {
mca_pml_base_send_mode_t sendmode =
((mca_pml_base_send_request_t*)pml_request)->req_send_mode;
rc = mca_pml_bfo_isend_init(
pml_request->req_addr,
pml_request->req_count,
pml_request->req_datatype,
pml_request->req_peer,
pml_request->req_tag,
sendmode,
pml_request->req_comm,
&request);
break;
}
case MCA_PML_REQUEST_RECV:
rc = mca_pml_bfo_irecv_init(
pml_request->req_addr,
pml_request->req_count,
pml_request->req_datatype,
pml_request->req_peer,
pml_request->req_tag,
pml_request->req_comm,
&request);
break;
default:
rc = OMPI_ERR_REQUEST;
break;
}
if(OMPI_SUCCESS != rc)
return rc;
pml_request = (mca_pml_base_request_t*)request;
requests[i] = request;
break;
}
default:
return OMPI_ERR_REQUEST;
}
/* start the request */
switch(pml_request->req_type) {
case MCA_PML_REQUEST_SEND:
{
mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)pml_request;
MEMCHECKER(
memchecker_call(&opal_memchecker_base_isdefined,
pml_request->req_addr, pml_request->req_count,
pml_request->req_datatype);
);
if( reuse_old_request && (sendreq->req_send.req_bytes_packed != 0) ) {
size_t offset = 0;
/**
 * Reset the convertor in case we are dealing with the original
 * request, which does not reset its convertor on completion.
 */
opal_convertor_set_position( &sendreq->req_send.req_base.req_convertor,
&offset );
}
MCA_PML_BFO_SEND_REQUEST_START(sendreq, rc);
if(rc != OMPI_SUCCESS)
return rc;
break;
}
case MCA_PML_REQUEST_RECV:
{
mca_pml_bfo_recv_request_t* recvreq = (mca_pml_bfo_recv_request_t*)pml_request;
MCA_PML_BFO_RECV_REQUEST_START(recvreq);
break;
}
default:
return OMPI_ERR_REQUEST;
}
}
return OMPI_SUCCESS;
}
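/*
 * Usage sketch (not part of the original file): the MPI-level persistent
 * request pattern that ultimately drives mca_pml_bfo_start() above. Each
 * MPI_Start lands in the PML's start function, which reuses the request
 * when possible and reallocates it when it is still busy. Standard MPI API;
 * buildable with mpicc and runnable on two ranks.
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv) {
    int rank, buf = 42;
    MPI_Request req;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank == 0) {
        /* persistent send: init once, start/wait many times */
        MPI_Send_init(&buf, 1, MPI_INT, 1, 0, MPI_COMM_WORLD, &req);
        for (int i = 0; i < 3; i++) {
            MPI_Start(&req);
            MPI_Wait(&req, MPI_STATUS_IGNORE);
        }
        MPI_Request_free(&req);
    } else if (rank == 1) {
        for (int i = 0; i < 3; i++)
            MPI_Recv(&buf, 1, MPI_INT, 0, 0, MPI_COMM_WORLD,
                     MPI_STATUS_IGNORE);
    }
    MPI_Finalize();
    return 0;
}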

View file

@@ -1 +0,0 @@
DIRECT_CALL_HEADER="ompi/mca/pml/bfo/pml_bfo.h"