From 17f51a0389720054522a23a2334cbc335a156222 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 23 Mar 2009 23:52:05 +0000 Subject: [PATCH] Add a new PML module that acts as a "mini-dr" - when requested, it performs a dr-like checksum on messages for BTLs that require it, as specified by MCA params. Add two new configure options that specify: 1. when to add padding to the openib control header - this *only* happens when the configure option is specified 2. when to use the dr-like checksum as opposed to the memcpy checksum. Not selectable at runtime - to eliminate performance impacts, this is a configure-only option. Also removed an unused checksum version from opal/util/crc.h. The new component still needs a little cleanup and some sync with recent ob1 bug fixes. It was created as a separate module to avoid performance hits in ob1 itself, though most of the code is duplicative. The component is only selectable by either specifying it directly, or configuring with the dr-like checksum -and- setting -mca pml_csum_enable_csum 1. Modify the LANL platform files to take advantage of the new module. This commit was SVN r20846. 
--- config/ompi_configure_options.m4 | 35 + .../platform/lanl/rr-class/debug-nopanasas | 2 + .../lanl/rr-class/debug-nopanasas.conf | 18 +- contrib/platform/lanl/rr-class/debug-panasas | 4 +- .../platform/lanl/rr-class/debug-panasas.conf | 18 +- contrib/platform/lanl/rr-class/debug.conf | 99 -- .../lanl/rr-class/optimized-nopanasas | 2 + .../lanl/rr-class/optimized-nopanasas.conf | 17 +- .../platform/lanl/rr-class/optimized-panasas | 4 +- .../lanl/rr-class/optimized-panasas.conf | 17 +- contrib/platform/lanl/rr-class/optimized.conf | 97 -- contrib/platform/lanl/tlcc/debug.conf | 99 -- contrib/platform/lanl/tlcc/optimized.conf | 97 -- ompi/datatype/datatype_checksum.h | 17 + ompi/mca/btl/openib/btl_openib_frag.h | 8 +- ompi/mca/pml/csum/Makefile.am | 63 + ompi/mca/pml/csum/configure.params | 24 + ompi/mca/pml/csum/help-pml-csum.txt | 31 + ompi/mca/pml/csum/pml_csum.c | 801 ++++++++++ ompi/mca/pml/csum/pml_csum.h | 328 ++++ ompi/mca/pml/csum/pml_csum_comm.c | 98 ++ ompi/mca/pml/csum/pml_csum_comm.h | 83 + ompi/mca/pml/csum/pml_csum_component.c | 263 ++++ ompi/mca/pml/csum/pml_csum_component.h | 32 + ompi/mca/pml/csum/pml_csum_endpoint.c | 26 + ompi/mca/pml/csum/pml_csum_endpoint.h | 30 + ompi/mca/pml/csum/pml_csum_hdr.h | 455 ++++++ ompi/mca/pml/csum/pml_csum_iprobe.c | 75 + ompi/mca/pml/csum/pml_csum_irecv.c | 112 ++ ompi/mca/pml/csum/pml_csum_isend.c | 130 ++ ompi/mca/pml/csum/pml_csum_progress.c | 77 + ompi/mca/pml/csum/pml_csum_rdma.c | 124 ++ ompi/mca/pml/csum/pml_csum_rdma.h | 41 + ompi/mca/pml/csum/pml_csum_rdmafrag.c | 29 + ompi/mca/pml/csum/pml_csum_rdmafrag.h | 71 + ompi/mca/pml/csum/pml_csum_recvfrag.c | 799 ++++++++++ ompi/mca/pml/csum/pml_csum_recvfrag.h | 176 +++ ompi/mca/pml/csum/pml_csum_recvreq.c | 1110 ++++++++++++++ ompi/mca/pml/csum/pml_csum_recvreq.h | 427 ++++++ ompi/mca/pml/csum/pml_csum_sendreq.c | 1354 +++++++++++++++++ ompi/mca/pml/csum/pml_csum_sendreq.h | 515 +++++++ ompi/mca/pml/csum/pml_csum_start.c | 142 ++ 
ompi/mca/pml/csum/post_configure.sh | 1 + opal/util/crc.h | 16 + 44 files changed, 7561 insertions(+), 406 deletions(-) delete mode 100644 contrib/platform/lanl/rr-class/debug.conf delete mode 100644 contrib/platform/lanl/rr-class/optimized.conf delete mode 100644 contrib/platform/lanl/tlcc/debug.conf delete mode 100644 contrib/platform/lanl/tlcc/optimized.conf create mode 100644 ompi/mca/pml/csum/Makefile.am create mode 100644 ompi/mca/pml/csum/configure.params create mode 100644 ompi/mca/pml/csum/help-pml-csum.txt create mode 100644 ompi/mca/pml/csum/pml_csum.c create mode 100644 ompi/mca/pml/csum/pml_csum.h create mode 100644 ompi/mca/pml/csum/pml_csum_comm.c create mode 100644 ompi/mca/pml/csum/pml_csum_comm.h create mode 100644 ompi/mca/pml/csum/pml_csum_component.c create mode 100644 ompi/mca/pml/csum/pml_csum_component.h create mode 100644 ompi/mca/pml/csum/pml_csum_endpoint.c create mode 100644 ompi/mca/pml/csum/pml_csum_endpoint.h create mode 100644 ompi/mca/pml/csum/pml_csum_hdr.h create mode 100644 ompi/mca/pml/csum/pml_csum_iprobe.c create mode 100644 ompi/mca/pml/csum/pml_csum_irecv.c create mode 100644 ompi/mca/pml/csum/pml_csum_isend.c create mode 100644 ompi/mca/pml/csum/pml_csum_progress.c create mode 100644 ompi/mca/pml/csum/pml_csum_rdma.c create mode 100644 ompi/mca/pml/csum/pml_csum_rdma.h create mode 100644 ompi/mca/pml/csum/pml_csum_rdmafrag.c create mode 100644 ompi/mca/pml/csum/pml_csum_rdmafrag.h create mode 100644 ompi/mca/pml/csum/pml_csum_recvfrag.c create mode 100644 ompi/mca/pml/csum/pml_csum_recvfrag.h create mode 100644 ompi/mca/pml/csum/pml_csum_recvreq.c create mode 100644 ompi/mca/pml/csum/pml_csum_recvreq.h create mode 100644 ompi/mca/pml/csum/pml_csum_sendreq.c create mode 100644 ompi/mca/pml/csum/pml_csum_sendreq.h create mode 100644 ompi/mca/pml/csum/pml_csum_start.c create mode 100644 ompi/mca/pml/csum/post_configure.sh diff --git a/config/ompi_configure_options.m4 b/config/ompi_configure_options.m4 index 
9167853911..2fde241ff3 100644 --- a/config/ompi_configure_options.m4 +++ b/config/ompi_configure_options.m4 @@ -12,6 +12,9 @@ dnl Copyright (c) 2004-2005 The Regents of the University of California. dnl All rights reserved. dnl Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved. dnl Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. +dnl Copyright (c) 2009 IBM Corporation. All rights reserved. +dnl Copyright (c) 2009 Los Alamos National Security, LLC. All rights +dnl reserved. dnl $COPYRIGHT$ dnl dnl Additional copyrights may follow @@ -725,5 +728,37 @@ AC_DEFINE_UNQUOTED([OPAL_IDENT_STRING], ["$with_ident_string"], [ident string for Open MPI]) AC_MSG_RESULT([$with_ident_string]) +# +# Add padding to OpenIB header +# +AC_MSG_CHECKING([whether to add padding to the openib control header]) +AC_ARG_WITH([openib-control-hdr-padding], + [AC_HELP_STRING([--with-openib-control-hdr-padding], + [Add padding bytes to the openib control header])]) +if test "$with_openib_control_hdr_padding" = "yes"; then + AC_MSG_RESULT([yes]) + ompi_openib_pad_hdr=1 +else + AC_MSG_RESULT([no]) + ompi_openib_pad_hdr=0 +fi +AC_DEFINE_UNQUOTED([OMPI_OPENIB_PAD_HDR], + [$ompi_openib_pad_hdr], + [Add padding bytes to the openib control header]) + + +# +# Use alternative checksum algorithm +# +AC_MSG_CHECKING([whether to use an alternative checksum algo for messages]) +AC_ARG_WITH([dst-checksum], + [AC_HELP_STRING([--with-dst-checksum], + [Use an alternative checksum algorithm for messages])]) +if test "$with_dst_checksum" = "yes"; then + AC_MSG_RESULT([yes]) + CFLAGS="-DOMPI_CSUM_DST $CFLAGS" +else + AC_MSG_RESULT([no]) +fi ]) diff --git a/contrib/platform/lanl/rr-class/debug-nopanasas b/contrib/platform/lanl/rr-class/debug-nopanasas index 5200737e29..3c5464f075 100644 --- a/contrib/platform/lanl/rr-class/debug-nopanasas +++ b/contrib/platform/lanl/rr-class/debug-nopanasas @@ -28,3 +28,5 @@ with_io_romio_flags=--with-file-system=ufs+nfs with_threads=posix with_valgrind=no 
LDFLAGS=-L/opt/PBS/lib64 +with_openib_control_hdr_padding=yes + diff --git a/contrib/platform/lanl/rr-class/debug-nopanasas.conf b/contrib/platform/lanl/rr-class/debug-nopanasas.conf index 2abe556cc1..7b03b8500f 100644 --- a/contrib/platform/lanl/rr-class/debug-nopanasas.conf +++ b/contrib/platform/lanl/rr-class/debug-nopanasas.conf @@ -71,18 +71,30 @@ orte_tmpdir_base = /tmp ## from inadvertent job executions orte_allocation_required = 1 +## MPI behavior +mpi_leave_pinned = 0 +pml = csum +pml_csum_enable_csum = 1 +btl_openib_flags = 50 + +## Protect looped collectives +coll_sync_priority = 100 +coll_sync_barrier_before = 1000 + +## Activate hierarchical collectives +coll_hierarch_priority = 90 + ## Add the interface for out-of-band communication ## and set it up -oob_tcp_if_include=ib0,eth0 +oob_tcp_if_include=ib0 oob_tcp_peer_retries = 10 oob_tcp_disable_family = IPv6 oob_tcp_listen_mode = listen_thread oob_tcp_sndbuf = 32768 oob_tcp_rcvbuf = 32768 - ## Define the MPI interconnects -btl = sm,openib,tcp,self +btl = sm,openib,self ## Setup OpenIB btl_openib_want_fork_support = 0 diff --git a/contrib/platform/lanl/rr-class/debug-panasas b/contrib/platform/lanl/rr-class/debug-panasas index 11bcd0136a..21fe88fbd1 100644 --- a/contrib/platform/lanl/rr-class/debug-panasas +++ b/contrib/platform/lanl/rr-class/debug-panasas @@ -28,4 +28,6 @@ with_io_romio_flags=--with-file-system=ufs+nfs+panfs with_threads=posix with_valgrind=no LDFLAGS=-L/opt/PBS/lib64 -CFLAGS=-I/opt/panfs/include +CFLAGS="-I/opt/panfs/include" +with_openib_control_hdr_padding=yes + diff --git a/contrib/platform/lanl/rr-class/debug-panasas.conf b/contrib/platform/lanl/rr-class/debug-panasas.conf index 2abe556cc1..7b03b8500f 100644 --- a/contrib/platform/lanl/rr-class/debug-panasas.conf +++ b/contrib/platform/lanl/rr-class/debug-panasas.conf @@ -71,18 +71,30 @@ orte_tmpdir_base = /tmp ## from inadvertent job executions orte_allocation_required = 1 +## MPI behavior +mpi_leave_pinned = 0 +pml = csum 
+pml_csum_enable_csum = 1 +btl_openib_flags = 50 + +## Protect looped collectives +coll_sync_priority = 100 +coll_sync_barrier_before = 1000 + +## Activate hierarchical collectives +coll_hierarch_priority = 90 + ## Add the interface for out-of-band communication ## and set it up -oob_tcp_if_include=ib0,eth0 +oob_tcp_if_include=ib0 oob_tcp_peer_retries = 10 oob_tcp_disable_family = IPv6 oob_tcp_listen_mode = listen_thread oob_tcp_sndbuf = 32768 oob_tcp_rcvbuf = 32768 - ## Define the MPI interconnects -btl = sm,openib,tcp,self +btl = sm,openib,self ## Setup OpenIB btl_openib_want_fork_support = 0 diff --git a/contrib/platform/lanl/rr-class/debug.conf b/contrib/platform/lanl/rr-class/debug.conf deleted file mode 100644 index 2abe556cc1..0000000000 --- a/contrib/platform/lanl/rr-class/debug.conf +++ /dev/null @@ -1,99 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# This is the default system-wide MCA parameters defaults file. -# Specifically, the MCA parameter "mca_param_files" defaults to a -# value of -# "$HOME/.openmpi/mca-params.conf:$sysconf/openmpi-mca-params.conf" -# (this file is the latter of the two). So if the default value of -# mca_param_files is not changed, this file is used to set system-wide -# MCA parameters. This file can therefore be used to set system-wide -# default MCA parameters for all users. 
Of course, users can override -# these values if they want, but this file is an excellent location -# for setting system-specific MCA parameters for those users who don't -# know / care enough to investigate the proper values for them. - -# Note that this file is only applicable where it is visible (in a -# filesystem sense). Specifically, MPI processes each read this file -# during their startup to determine what default values for MCA -# parameters should be used. mpirun does not bundle up the values in -# this file from the node where it was run and send them to all nodes; -# the default value decisions are effectively distributed. Hence, -# these values are only applicable on nodes that "see" this file. If -# $sysconf is a directory on a local disk, it is likely that changes -# to this file will need to be propagated to other nodes. If $sysconf -# is a directory that is shared via a networked filesystem, changes to -# this file will be visible to all nodes that share this $sysconf. - -# The format is straightforward: one per line, mca_param_name = -# rvalue. Quoting is ignored (so if you use quotes or escape -# characters, they'll be included as part of the value). For example: - -# Disable run-time MPI parameter checking -# mpi_param_check = 0 - -# Note that the value "~/" will be expanded to the current user's home -# directory. For example: - -# Change component loading path -# component_path = /usr/local/lib/openmpi:~/my_openmpi_components - -# See "ompi_info --param all all" for a full listing of Open MPI MCA -# parameters available and their default values. 
-# - -# Basic behavior to smooth startup -mca_component_show_load_errors = 0 -orte_abort_timeout = 10 -opal_set_max_sys_limits = 1 - -## Protect the shared file systems -orte_no_session_dirs = /panfs,/scratch,/users,/usr/projects -orte_tmpdir_base = /tmp - -## Require an allocation to run - protects the frontend -## from inadvertent job executions -orte_allocation_required = 1 - -## Add the interface for out-of-band communication -## and set it up -oob_tcp_if_include=ib0,eth0 -oob_tcp_peer_retries = 10 -oob_tcp_disable_family = IPv6 -oob_tcp_listen_mode = listen_thread -oob_tcp_sndbuf = 32768 -oob_tcp_rcvbuf = 32768 - - -## Define the MPI interconnects -btl = sm,openib,tcp,self - -## Setup OpenIB -btl_openib_want_fork_support = 0 -btl_openib_cpc_include = oob -#btl_openib_receive_queues = P,128,256,64,32,32:S,2048,1024,128,32:S,12288,1024,128,32:S,65536,1024,128,32 - -## Enable cpu affinity -mpi_paffinity_alone = 1 - -## Setup MPI options -mpi_show_handle_leaks = 1 -mpi_warn_on_fork = 1 -mpi_abort_print_stack = 1 - diff --git a/contrib/platform/lanl/rr-class/optimized-nopanasas b/contrib/platform/lanl/rr-class/optimized-nopanasas index 562c1b78cf..c07c2a6c35 100644 --- a/contrib/platform/lanl/rr-class/optimized-nopanasas +++ b/contrib/platform/lanl/rr-class/optimized-nopanasas @@ -28,3 +28,5 @@ with_io_romio_flags=--with-file-system=ufs+nfs with_threads=posix with_valgrind=no LDFLAGS=-L/opt/PBS/lib64 +with_openib_control_hdr_padding=yes + diff --git a/contrib/platform/lanl/rr-class/optimized-nopanasas.conf b/contrib/platform/lanl/rr-class/optimized-nopanasas.conf index 39e65c8780..2a1944f79e 100644 --- a/contrib/platform/lanl/rr-class/optimized-nopanasas.conf +++ b/contrib/platform/lanl/rr-class/optimized-nopanasas.conf @@ -71,9 +71,22 @@ orte_tmpdir_base = /tmp ## from inadvertent job executions orte_allocation_required = 1 +## MPI behavior +mpi_leave_pinned = 0 +pml = csum +pml_csum_enable_csum = 1 +btl_openib_flags = 50 + +## Protect looped collectives 
+coll_sync_priority = 100 +coll_sync_barrier_before = 1000 + +## Activate hierarchical collectives +coll_hierarch_priority = 90 + ## Add the interface for out-of-band communication ## and set it up -oob_tcp_if_include=ib0,eth0 +oob_tcp_if_include=ib0 oob_tcp_peer_retries = 10 oob_tcp_disable_family = IPv6 oob_tcp_listen_mode = listen_thread @@ -81,7 +94,7 @@ oob_tcp_sndbuf = 32768 oob_tcp_rcvbuf = 32768 ## Define the MPI interconnects -btl = sm,openib,tcp,self +btl = sm,openib,self ## Setup OpenIB btl_openib_want_fork_support = 0 diff --git a/contrib/platform/lanl/rr-class/optimized-panasas b/contrib/platform/lanl/rr-class/optimized-panasas index b75a55160c..0f002f2f22 100644 --- a/contrib/platform/lanl/rr-class/optimized-panasas +++ b/contrib/platform/lanl/rr-class/optimized-panasas @@ -28,4 +28,6 @@ with_io_romio_flags=--with-file-system=ufs+nfs+panfs with_threads=posix with_valgrind=no LDFLAGS=-L/opt/PBS/lib64 -CFLAGS=-I/opt/panfs/include +CFLAGS="-I/opt/panfs/include" +with_openib_control_hdr_padding=yes + diff --git a/contrib/platform/lanl/rr-class/optimized-panasas.conf b/contrib/platform/lanl/rr-class/optimized-panasas.conf index 39e65c8780..2a1944f79e 100644 --- a/contrib/platform/lanl/rr-class/optimized-panasas.conf +++ b/contrib/platform/lanl/rr-class/optimized-panasas.conf @@ -71,9 +71,22 @@ orte_tmpdir_base = /tmp ## from inadvertent job executions orte_allocation_required = 1 +## MPI behavior +mpi_leave_pinned = 0 +pml = csum +pml_csum_enable_csum = 1 +btl_openib_flags = 50 + +## Protect looped collectives +coll_sync_priority = 100 +coll_sync_barrier_before = 1000 + +## Activate hierarchical collectives +coll_hierarch_priority = 90 + ## Add the interface for out-of-band communication ## and set it up -oob_tcp_if_include=ib0,eth0 +oob_tcp_if_include=ib0 oob_tcp_peer_retries = 10 oob_tcp_disable_family = IPv6 oob_tcp_listen_mode = listen_thread @@ -81,7 +94,7 @@ oob_tcp_sndbuf = 32768 oob_tcp_rcvbuf = 32768 ## Define the MPI interconnects -btl = 
sm,openib,tcp,self +btl = sm,openib,self ## Setup OpenIB btl_openib_want_fork_support = 0 diff --git a/contrib/platform/lanl/rr-class/optimized.conf b/contrib/platform/lanl/rr-class/optimized.conf deleted file mode 100644 index 39e65c8780..0000000000 --- a/contrib/platform/lanl/rr-class/optimized.conf +++ /dev/null @@ -1,97 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# This is the default system-wide MCA parameters defaults file. -# Specifically, the MCA parameter "mca_param_files" defaults to a -# value of -# "$HOME/.openmpi/mca-params.conf:$sysconf/openmpi-mca-params.conf" -# (this file is the latter of the two). So if the default value of -# mca_param_files is not changed, this file is used to set system-wide -# MCA parameters. This file can therefore be used to set system-wide -# default MCA parameters for all users. Of course, users can override -# these values if they want, but this file is an excellent location -# for setting system-specific MCA parameters for those users who don't -# know / care enough to investigate the proper values for them. - -# Note that this file is only applicable where it is visible (in a -# filesystem sense). Specifically, MPI processes each read this file -# during their startup to determine what default values for MCA -# parameters should be used. 
mpirun does not bundle up the values in -# this file from the node where it was run and send them to all nodes; -# the default value decisions are effectively distributed. Hence, -# these values are only applicable on nodes that "see" this file. If -# $sysconf is a directory on a local disk, it is likely that changes -# to this file will need to be propagated to other nodes. If $sysconf -# is a directory that is shared via a networked filesystem, changes to -# this file will be visible to all nodes that share this $sysconf. - -# The format is straightforward: one per line, mca_param_name = -# rvalue. Quoting is ignored (so if you use quotes or escape -# characters, they'll be included as part of the value). For example: - -# Disable run-time MPI parameter checking -# mpi_param_check = 0 - -# Note that the value "~/" will be expanded to the current user's home -# directory. For example: - -# Change component loading path -# component_path = /usr/local/lib/openmpi:~/my_openmpi_components - -# See "ompi_info --param all all" for a full listing of Open MPI MCA -# parameters available and their default values. 
-# - -# Basic behavior to smooth startup -mca_component_show_load_errors = 0 -orte_abort_timeout = 10 -opal_set_max_sys_limits = 1 - -## Protect the shared file systems -orte_no_session_dirs = /panfs,/scratch,/users,/usr/projects -orte_tmpdir_base = /tmp - -## Require an allocation to run - protects the frontend -## from inadvertent job executions -orte_allocation_required = 1 - -## Add the interface for out-of-band communication -## and set it up -oob_tcp_if_include=ib0,eth0 -oob_tcp_peer_retries = 10 -oob_tcp_disable_family = IPv6 -oob_tcp_listen_mode = listen_thread -oob_tcp_sndbuf = 32768 -oob_tcp_rcvbuf = 32768 - -## Define the MPI interconnects -btl = sm,openib,tcp,self - -## Setup OpenIB -btl_openib_want_fork_support = 0 -btl_openib_cpc_include = oob -#btl_openib_receive_queues = P,128,256,64,32,32:S,2048,1024,128,32:S,12288,1024,128,32:S,65536,1024,128,32 - -## Enable cpu affinity -mpi_paffinity_alone = 1 - -## Setup MPI options -mpi_show_handle_leaks = 0 -mpi_warn_on_fork = 1 - diff --git a/contrib/platform/lanl/tlcc/debug.conf b/contrib/platform/lanl/tlcc/debug.conf deleted file mode 100644 index 1652a178cf..0000000000 --- a/contrib/platform/lanl/tlcc/debug.conf +++ /dev/null @@ -1,99 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# This is the default system-wide MCA parameters defaults file. 
-# Specifically, the MCA parameter "mca_param_files" defaults to a -# value of -# "$HOME/.openmpi/mca-params.conf:$sysconf/openmpi-mca-params.conf" -# (this file is the latter of the two). So if the default value of -# mca_param_files is not changed, this file is used to set system-wide -# MCA parameters. This file can therefore be used to set system-wide -# default MCA parameters for all users. Of course, users can override -# these values if they want, but this file is an excellent location -# for setting system-specific MCA parameters for those users who don't -# know / care enough to investigate the proper values for them. - -# Note that this file is only applicable where it is visible (in a -# filesystem sense). Specifically, MPI processes each read this file -# during their startup to determine what default values for MCA -# parameters should be used. mpirun does not bundle up the values in -# this file from the node where it was run and send them to all nodes; -# the default value decisions are effectively distributed. Hence, -# these values are only applicable on nodes that "see" this file. If -# $sysconf is a directory on a local disk, it is likely that changes -# to this file will need to be propagated to other nodes. If $sysconf -# is a directory that is shared via a networked filesystem, changes to -# this file will be visible to all nodes that share this $sysconf. - -# The format is straightforward: one per line, mca_param_name = -# rvalue. Quoting is ignored (so if you use quotes or escape -# characters, they'll be included as part of the value). For example: - -# Disable run-time MPI parameter checking -# mpi_param_check = 0 - -# Note that the value "~/" will be expanded to the current user's home -# directory. For example: - -# Change component loading path -# component_path = /usr/local/lib/openmpi:~/my_openmpi_components - -# See "ompi_info --param all all" for a full listing of Open MPI MCA -# parameters available and their default values. 
-# - -# Basic behavior to smooth startup -mca_component_show_load_errors = 0 -orte_abort_timeout = 10 -opal_set_max_sys_limits = 1 - -## Protect the shared file systems -orte_no_session_dirs = /panfs,/scratch,/users,/usr/projects -orte_tmpdir_base = /tmp - -## Require an allocation to run - protects the frontend -## from inadvertent job executions -orte_allocation_required = 1 - -## Add the interface for out-of-band communication -## and set it up -oob_tcp_if_include=ib0 -oob_tcp_peer_retries = 10 -oob_tcp_disable_family = IPv6 -oob_tcp_listen_mode = listen_thread -oob_tcp_sndbuf = 32768 -oob_tcp_rcvbuf = 32768 - - -## Define the MPI interconnects -btl = sm,openib,self - -## Setup OpenIB -btl_openib_want_fork_support = 0 -btl_openib_cpc_include = oob -#btl_openib_receive_queues = P,128,256,64,32,32:S,2048,1024,128,32:S,12288,1024,128,32:S,65536,1024,128,32 - -## Enable cpu affinity -mpi_paffinity_alone = 1 - -## Setup MPI options -mpi_show_handle_leaks = 1 -mpi_warn_on_fork = 1 -mpi_abort_print_stack = 1 - diff --git a/contrib/platform/lanl/tlcc/optimized.conf b/contrib/platform/lanl/tlcc/optimized.conf deleted file mode 100644 index d9091ec44a..0000000000 --- a/contrib/platform/lanl/tlcc/optimized.conf +++ /dev/null @@ -1,97 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# This is the default system-wide MCA parameters defaults file. 
-# Specifically, the MCA parameter "mca_param_files" defaults to a -# value of -# "$HOME/.openmpi/mca-params.conf:$sysconf/openmpi-mca-params.conf" -# (this file is the latter of the two). So if the default value of -# mca_param_files is not changed, this file is used to set system-wide -# MCA parameters. This file can therefore be used to set system-wide -# default MCA parameters for all users. Of course, users can override -# these values if they want, but this file is an excellent location -# for setting system-specific MCA parameters for those users who don't -# know / care enough to investigate the proper values for them. - -# Note that this file is only applicable where it is visible (in a -# filesystem sense). Specifically, MPI processes each read this file -# during their startup to determine what default values for MCA -# parameters should be used. mpirun does not bundle up the values in -# this file from the node where it was run and send them to all nodes; -# the default value decisions are effectively distributed. Hence, -# these values are only applicable on nodes that "see" this file. If -# $sysconf is a directory on a local disk, it is likely that changes -# to this file will need to be propagated to other nodes. If $sysconf -# is a directory that is shared via a networked filesystem, changes to -# this file will be visible to all nodes that share this $sysconf. - -# The format is straightforward: one per line, mca_param_name = -# rvalue. Quoting is ignored (so if you use quotes or escape -# characters, they'll be included as part of the value). For example: - -# Disable run-time MPI parameter checking -# mpi_param_check = 0 - -# Note that the value "~/" will be expanded to the current user's home -# directory. For example: - -# Change component loading path -# component_path = /usr/local/lib/openmpi:~/my_openmpi_components - -# See "ompi_info --param all all" for a full listing of Open MPI MCA -# parameters available and their default values. 
-# - -# Basic behavior to smooth startup -mca_component_show_load_errors = 0 -orte_abort_timeout = 10 -opal_set_max_sys_limits = 1 - -## Protect the shared file systems -orte_no_session_dirs = /panfs,/scratch,/users,/usr/projects -orte_tmpdir_base = /tmp - -## Require an allocation to run - protects the frontend -## from inadvertent job executions -orte_allocation_required = 1 - -## Add the interface for out-of-band communication -## and set it up -oob_tcp_if_include=ib0 -oob_tcp_peer_retries = 10 -oob_tcp_disable_family = IPv6 -oob_tcp_listen_mode = listen_thread -oob_tcp_sndbuf = 32768 -oob_tcp_rcvbuf = 32768 - -## Define the MPI interconnects -btl = sm,openib,self - -## Setup OpenIB -btl_openib_want_fork_support = 0 -btl_openib_cpc_include = oob -#btl_openib_receive_queues = P,128,256,64,32,32:S,2048,1024,128,32:S,12288,1024,128,32:S,65536,1024,128,32 - -## Enable cpu affinity -mpi_paffinity_alone = 1 - -## Setup MPI options -mpi_show_handle_leaks = 0 -mpi_warn_on_fork = 1 - diff --git a/ompi/datatype/datatype_checksum.h b/ompi/datatype/datatype_checksum.h index 529eaf5de3..0c77a512e6 100644 --- a/ompi/datatype/datatype_checksum.h +++ b/ompi/datatype/datatype_checksum.h @@ -5,6 +5,9 @@ * reserved. * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2009 IBM Corporation. All rights reserved. + * Copyright (c) 2009 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -21,10 +24,24 @@ #if defined(CHECKSUM) +#if defined (OMPI_CSUM_DST) +#define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \ +do { \ + volatile uint32_t __csum; \ + __csum = (CONVERTOR)->checksum; \ + (CONVERTOR)->checksum += OPAL_CSUM_BCOPY_PARTIAL( (SRC), (DST), (BLENGTH), (BLENGTH), &(CONVERTOR)->csum_ui1, &(CONVERTOR)->csum_ui2 ); \ + __csum += OPAL_CSUM_PARTIAL( (DST), (BLENGTH), &(CONVERTOR)->csum_ui1, &(CONVERTOR)->csum_ui2); \ + if (__csum != (CONVERTOR)->checksum) { \ + opal_output(0, "%s:%d:csum2: Invalid \'MEMCPY_CSUM check\' - dst csum:0x%04x != src csum:0x%04x\n", __FILE__, __LINE__, __csum, (CONVERTOR)->checksum); \ + } \ +} while (0) + +#else #define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \ do { \ (CONVERTOR)->checksum += OPAL_CSUM_BCOPY_PARTIAL( (SRC), (DST), (BLENGTH), (BLENGTH), &(CONVERTOR)->csum_ui1, &(CONVERTOR)->csum_ui2 ); \ } while (0) +#endif #define COMPUTE_CSUM( SRC, BLENGTH, CONVERTOR ) \ do { \ diff --git a/ompi/mca/btl/openib/btl_openib_frag.h b/ompi/mca/btl/openib/btl_openib_frag.h index 64dff84f18..b9aa6d9ad9 100644 --- a/ompi/mca/btl/openib/btl_openib_frag.h +++ b/ompi/mca/btl/openib/btl_openib_frag.h @@ -9,7 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2009 IBM Corporation. All rights reserved. + * Copyright (c) 2006-2009 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. 
* $COPYRIGHT$ @@ -119,7 +120,10 @@ typedef struct mca_btl_openib_footer_t mca_btl_openib_footer_t; #define MCA_BTL_OPENIB_CONTROL_CTS 3 struct mca_btl_openib_control_header_t { - uint8_t type; + uint8_t type; +#if OMPI_OPENIB_PAD_HDR + uint8_t padding[15]; +#endif }; typedef struct mca_btl_openib_control_header_t mca_btl_openib_control_header_t; diff --git a/ompi/mca/pml/csum/Makefile.am b/ompi/mca/pml/csum/Makefile.am new file mode 100644 index 0000000000..d549f7cb88 --- /dev/null +++ b/ompi/mca/pml/csum/Makefile.am @@ -0,0 +1,63 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. 
+# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +dist_pkgdata_DATA = help-pml-csum.txt + +EXTRA_DIST = post_configure.sh pml_csum_endpoint.c pml_csum_endpoint.h + +csum_sources = \ + pml_csum.c \ + pml_csum.h \ + pml_csum_comm.c \ + pml_csum_comm.h \ + pml_csum_component.c \ + pml_csum_component.h \ + pml_csum_hdr.h \ + pml_csum_iprobe.c \ + pml_csum_irecv.c \ + pml_csum_isend.c \ + pml_csum_progress.c \ + pml_csum_rdma.c \ + pml_csum_rdma.h \ + pml_csum_rdmafrag.c \ + pml_csum_rdmafrag.h \ + pml_csum_recvfrag.c \ + pml_csum_recvfrag.h \ + pml_csum_recvreq.c \ + pml_csum_recvreq.h \ + pml_csum_sendreq.c \ + pml_csum_sendreq.h \ + pml_csum_start.c + +if OMPI_BUILD_pml_csum_DSO +component_noinst = +component_install = mca_pml_csum.la +else +component_noinst = libmca_pml_csum.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_pml_csum_la_SOURCES = $(csum_sources) +mca_pml_csum_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_pml_csum_la_SOURCES = $(csum_sources) +libmca_pml_csum_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/pml/csum/configure.params b/ompi/mca/pml/csum/configure.params new file mode 100644 index 0000000000..3513f8d956 --- /dev/null +++ b/ompi/mca/pml/csum/configure.params @@ -0,0 +1,24 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2007 Los Alamos National Security, LLC. All rights +# reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module + +PARAM_CONFIG_FILES="Makefile" diff --git a/ompi/mca/pml/csum/help-pml-csum.txt b/ompi/mca/pml/csum/help-pml-csum.txt new file mode 100644 index 0000000000..0494ea915d --- /dev/null +++ b/ompi/mca/pml/csum/help-pml-csum.txt @@ -0,0 +1,31 @@ +# -*- text -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2009 IBM Corporation. All rights reserved. +# Copyright (c) 2009 Los Alamos National Security, LLC. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English general help file for Open MPI. +# +[pml:checksum-not-enabled] +Warning: This build of Open MPI was specifically configured +with support for the alternate checksum algorithm, but the +support was not enabled by the proper MCA parameter. You should +set pml_csum_enable_csum to enable checksum operation. + +While your application will be allowed to proceed, please be +advised that you will not be protected from data errors. diff --git a/ompi/mca/pml/csum/pml_csum.c b/ompi/mca/pml/csum/pml_csum.c new file mode 100644 index 0000000000..db797ebddf --- /dev/null +++ b/ompi/mca/pml/csum/pml_csum.c @@ -0,0 +1,801 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. 
+ * Copyright (c) 2004-2009 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2006-2008 University of Houston. All rights reserved. + * Copyright (c) 2009 IBM Corporation. All rights reserved. + * Copyright (c) 2009 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include +#include + +#include "opal/class/opal_bitmap.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/pml/base/base.h" +#include "ompi/mca/btl/btl.h" +#include "ompi/mca/pml/base/base.h" +#include "ompi/mca/btl/base/base.h" +#include "pml_csum.h" +#include "pml_csum_component.h" +#include "pml_csum_comm.h" +#include "pml_csum_hdr.h" +#include "pml_csum_recvfrag.h" +#include "pml_csum_sendreq.h" +#include "pml_csum_recvreq.h" +#include "pml_csum_rdmafrag.h" +#include "ompi/mca/bml/base/base.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/grpcomm/grpcomm.h" +#include "orte/util/name_fns.h" +#include "orte/runtime/orte_globals.h" +#include "opal/util/crc.h" +#include "ompi/runtime/ompi_cr.h" +#include "ompi/runtime/ompi_module_exchange.h" + +mca_pml_csum_t mca_pml_csum = { + { + mca_pml_csum_add_procs, + mca_pml_csum_del_procs, + mca_pml_csum_enable, + mca_pml_csum_progress, + mca_pml_csum_add_comm, + mca_pml_csum_del_comm, + mca_pml_csum_irecv_init, + mca_pml_csum_irecv, + mca_pml_csum_recv, + mca_pml_csum_isend_init, + mca_pml_csum_isend, + mca_pml_csum_send, + mca_pml_csum_iprobe, + mca_pml_csum_probe, + mca_pml_csum_start, + mca_pml_csum_dump, + mca_pml_csum_ft_event, + 32768, + INT_MAX + } +}; + + +void 
mca_pml_csum_error_handler( struct mca_btl_base_module_t* btl, + int32_t flags ); + +int mca_pml_csum_enable(bool enable) +{ + if( false == enable ) { + return OMPI_SUCCESS; + } + + OBJ_CONSTRUCT(&mca_pml_csum.lock, opal_mutex_t); + + /* fragments */ + OBJ_CONSTRUCT(&mca_pml_csum.rdma_frags, ompi_free_list_t); + ompi_free_list_init_new( &mca_pml_csum.rdma_frags, + sizeof(mca_pml_csum_rdma_frag_t), + CACHE_LINE_SIZE, + OBJ_CLASS(mca_pml_csum_rdma_frag_t), + 0,CACHE_LINE_SIZE, + mca_pml_csum.free_list_num, + mca_pml_csum.free_list_max, + mca_pml_csum.free_list_inc, + NULL ); + + OBJ_CONSTRUCT(&mca_pml_csum.recv_frags, ompi_free_list_t); + + ompi_free_list_init_new( &mca_pml_csum.recv_frags, + sizeof(mca_pml_csum_recv_frag_t) + mca_pml_csum.unexpected_limit, + CACHE_LINE_SIZE, + OBJ_CLASS(mca_pml_csum_recv_frag_t), + 0,CACHE_LINE_SIZE, + mca_pml_csum.free_list_num, + mca_pml_csum.free_list_max, + mca_pml_csum.free_list_inc, + NULL ); + + OBJ_CONSTRUCT(&mca_pml_csum.pending_pckts, ompi_free_list_t); + ompi_free_list_init_new( &mca_pml_csum.pending_pckts, + sizeof(mca_pml_csum_pckt_pending_t), + CACHE_LINE_SIZE, + OBJ_CLASS(mca_pml_csum_pckt_pending_t), + 0,CACHE_LINE_SIZE, + mca_pml_csum.free_list_num, + mca_pml_csum.free_list_max, + mca_pml_csum.free_list_inc, + NULL ); + + + OBJ_CONSTRUCT(&mca_pml_csum.buffers, ompi_free_list_t); + OBJ_CONSTRUCT(&mca_pml_csum.send_ranges, ompi_free_list_t); + ompi_free_list_init_new( &mca_pml_csum.send_ranges, + sizeof(mca_pml_csum_send_range_t) + + (mca_pml_csum.max_send_per_range - 1) * sizeof(mca_pml_csum_com_btl_t), + CACHE_LINE_SIZE, + OBJ_CLASS(mca_pml_csum_send_range_t), + 0,CACHE_LINE_SIZE, + mca_pml_csum.free_list_num, + mca_pml_csum.free_list_max, + mca_pml_csum.free_list_inc, + NULL ); + + /* pending operations */ + OBJ_CONSTRUCT(&mca_pml_csum.send_pending, opal_list_t); + OBJ_CONSTRUCT(&mca_pml_csum.recv_pending, opal_list_t); + OBJ_CONSTRUCT(&mca_pml_csum.pckt_pending, opal_list_t); + 
OBJ_CONSTRUCT(&mca_pml_csum.rdma_pending, opal_list_t); + /* missing communicator pending list */ + OBJ_CONSTRUCT(&mca_pml_csum.non_existing_communicator_pending, opal_list_t); + + /** + * If we get here this is the PML who get selected for the run. We + * should get ownership for the send and receive requests list, and + * initialize them with the size of our own requests. + */ + ompi_free_list_init_new( &mca_pml_base_send_requests, + sizeof(mca_pml_csum_send_request_t) + + (mca_pml_csum.max_rdma_per_request - 1) * + sizeof(mca_pml_csum_com_btl_t), + CACHE_LINE_SIZE, + OBJ_CLASS(mca_pml_csum_send_request_t), + 0,CACHE_LINE_SIZE, + mca_pml_csum.free_list_num, + mca_pml_csum.free_list_max, + mca_pml_csum.free_list_inc, + NULL ); + + ompi_free_list_init_new( &mca_pml_base_recv_requests, + sizeof(mca_pml_csum_recv_request_t) + + (mca_pml_csum.max_rdma_per_request - 1) * + sizeof(mca_pml_csum_com_btl_t), + CACHE_LINE_SIZE, + OBJ_CLASS(mca_pml_csum_recv_request_t), + 0,CACHE_LINE_SIZE, + mca_pml_csum.free_list_num, + mca_pml_csum.free_list_max, + mca_pml_csum.free_list_inc, + NULL ); + + mca_pml_csum.enabled = true; + return OMPI_SUCCESS; +} + +int mca_pml_csum_add_comm(ompi_communicator_t* comm) +{ + /* allocate pml specific comm data */ + mca_pml_csum_comm_t* pml_comm = OBJ_NEW(mca_pml_csum_comm_t); + opal_list_item_t *item, *next_item; + mca_pml_csum_recv_frag_t* frag; + mca_pml_csum_comm_proc_t* pml_proc; + mca_pml_csum_match_hdr_t* hdr; + int i; + + if (NULL == pml_comm) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + mca_pml_csum_comm_init_size(pml_comm, comm->c_remote_group->grp_proc_count); + comm->c_pml_comm = pml_comm; + + for( i = 0; i < comm->c_remote_group->grp_proc_count; i++ ) { + pml_comm->procs[i].ompi_proc = ompi_group_peer_lookup(comm->c_remote_group,i); + } + /* Grab all related messages from the non_existing_communicator pending queue */ + for( item = opal_list_get_first(&mca_pml_csum.non_existing_communicator_pending); + item != 
opal_list_get_end(&mca_pml_csum.non_existing_communicator_pending); + item = next_item ) { + frag = (mca_pml_csum_recv_frag_t*)item; + next_item = opal_list_get_next(item); + hdr = &frag->hdr.hdr_match; + + /* Is this fragment for the current communicator ? */ + if( frag->hdr.hdr_match.hdr_ctx != comm->c_contextid ) + continue; + + /* As we now know we work on a fragment for this communicator + * we should remove it from the + * non_existing_communicator_pending list. */ + opal_list_remove_item( &mca_pml_csum.non_existing_communicator_pending, + item ); + + add_fragment_to_unexpected: + + /* We generate the MSG_ARRIVED event as soon as the PML is aware + * of a matching fragment arrival. Independing if it is received + * on the correct order or not. This will allow the tools to + * figure out if the messages are not received in the correct + * order (if multiple network interfaces). + */ + PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm, + hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); + + /* There is no matching to be done, and no lock to be held on the communicator as + * we know at this point that the communicator has not yet been returned to the user. + * The only required protection is around the non_existing_communicator_pending queue. + * We just have to push the fragment into the unexpected list of the corresponding + * proc, or into the out-of-order (cant_match) list. + */ + pml_proc = &(pml_comm->procs[hdr->hdr_src]); + + if( ((uint16_t)hdr->hdr_seq) == ((uint16_t)pml_proc->expected_sequence) ) { + /* We're now expecting the next sequence number. */ + pml_proc->expected_sequence++; + opal_list_append( &pml_proc->unexpected_frags, (opal_list_item_t*)frag ); + PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_INSERT_IN_UNEX_Q, comm, + hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); + /* And now the ugly part. 
As some fragments can be inserted in the cant_match list, + * every time we succesfully add a fragment in the unexpected list we have to make + * sure the next one is not in the cant_match. Otherwise, we will endup in a deadlock + * situation as the cant_match is only checked when a new fragment is received from + * the network. + */ + for(frag = (mca_pml_csum_recv_frag_t *)opal_list_get_first(&pml_proc->frags_cant_match); + frag != (mca_pml_csum_recv_frag_t *)opal_list_get_end(&pml_proc->frags_cant_match); + frag = (mca_pml_csum_recv_frag_t *)opal_list_get_next(frag)) { + hdr = &frag->hdr.hdr_match; + /* If the message has the next expected seq from that proc... */ + if(hdr->hdr_seq != pml_proc->expected_sequence) + continue; + + opal_list_remove_item(&pml_proc->frags_cant_match, (opal_list_item_t*)frag); + goto add_fragment_to_unexpected; + } + } else { + opal_list_append( &pml_proc->frags_cant_match, (opal_list_item_t*)frag ); + } + } + return OMPI_SUCCESS; +} + +int mca_pml_csum_del_comm(ompi_communicator_t* comm) +{ + OBJ_RELEASE(comm->c_pml_comm); + comm->c_pml_comm = NULL; + return OMPI_SUCCESS; +} + + +/* + * For each proc setup a datastructure that indicates the BTLs + * that can be used to reach the destination. + * + */ + +int mca_pml_csum_add_procs(ompi_proc_t** procs, size_t nprocs) +{ + opal_bitmap_t reachable; + int rc; + size_t i; + + if(nprocs == 0) + return OMPI_SUCCESS; + + /* we don't have any endpoint data we need to cache on the + ompi_proc_t, so set proc_pml to NULL */ + for (i = 0 ; i < nprocs ; ++i) { + procs[i]->proc_pml = NULL; + } + + OBJ_CONSTRUCT(&reachable, opal_bitmap_t); + rc = opal_bitmap_init(&reachable, (int)nprocs); + if(OMPI_SUCCESS != rc) + return rc; + + /* + * JJH: Disable this in FT enabled builds since + * we use a wrapper PML. It will cause this check to + * return failure as all processes will return the wrapper PML + * component in use instead of the wrapped PML component underneath. 
+ */ +#if OPAL_ENABLE_FT == 0 + /* make sure remote procs are using the same PML as us */ + if (OMPI_SUCCESS != (rc = mca_pml_base_pml_check_selected("csum", + procs, + nprocs))) { + return rc; + } +#endif + + rc = mca_bml.bml_add_procs( nprocs, + procs, + &reachable ); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + + rc = mca_bml.bml_register( MCA_PML_CSUM_HDR_TYPE_MATCH, + mca_pml_csum_recv_frag_callback_match, + NULL ); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + + rc = mca_bml.bml_register( MCA_PML_CSUM_HDR_TYPE_RNDV, + mca_pml_csum_recv_frag_callback_rndv, + NULL ); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + + rc = mca_bml.bml_register( MCA_PML_CSUM_HDR_TYPE_RGET, + mca_pml_csum_recv_frag_callback_rget, + NULL ); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + + rc = mca_bml.bml_register( MCA_PML_CSUM_HDR_TYPE_ACK, + mca_pml_csum_recv_frag_callback_ack, + NULL ); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + + rc = mca_bml.bml_register( MCA_PML_CSUM_HDR_TYPE_FRAG, + mca_pml_csum_recv_frag_callback_frag, + NULL ); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + + rc = mca_bml.bml_register( MCA_PML_CSUM_HDR_TYPE_PUT, + mca_pml_csum_recv_frag_callback_put, + NULL ); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + + rc = mca_bml.bml_register( MCA_PML_CSUM_HDR_TYPE_FIN, + mca_pml_csum_recv_frag_callback_fin, + NULL ); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + + /* register error handlers */ + rc = mca_bml.bml_register_error(mca_pml_csum_error_handler); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + + cleanup_and_return: + OBJ_DESTRUCT(&reachable); + + return rc; +} + +/* + * iterate through each proc and notify any PTLs associated + * with the proc that it is/has gone away + */ + +int mca_pml_csum_del_procs(ompi_proc_t** procs, size_t nprocs) +{ + return mca_bml.bml_del_procs(nprocs, procs); +} + +/* + * diagnostics + */ + +int mca_pml_csum_dump(struct ompi_communicator_t* comm, int verbose) 
+{ + struct mca_pml_comm_t* pml_comm = comm->c_pml_comm; + int i; + + /* iterate through all procs on communicator */ + for( i = 0; i < (int)pml_comm->num_procs; i++ ) { + mca_pml_csum_comm_proc_t* proc = &pml_comm->procs[i]; + mca_bml_base_endpoint_t* ep = (mca_bml_base_endpoint_t*)proc->ompi_proc->proc_bml; + size_t n; + + opal_output(0, "[Rank %d]\n", i); + /* dump all receive queues */ + + /* dump all btls */ + for(n=0; nbtl_eager.arr_size; n++) { + mca_bml_base_btl_t* bml_btl = &ep->btl_eager.bml_btls[n]; + bml_btl->btl->btl_dump(bml_btl->btl, bml_btl->btl_endpoint, verbose); + } + } + return OMPI_SUCCESS; +} + +static void mca_pml_csum_fin_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ + + mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context; + + /* check for pending requests */ + MCA_PML_CSUM_PROGRESS_PENDING(bml_btl); +} + +int mca_pml_csum_send_fin( ompi_proc_t* proc, + mca_bml_base_btl_t* bml_btl, + void *hdr_des, + uint8_t order, + uint32_t status ) +{ + mca_btl_base_descriptor_t* fin; + mca_pml_csum_fin_hdr_t* hdr; + int rc; + bool do_csum = mca_pml_csum.enable_csum && + (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM); + mca_bml_base_alloc(bml_btl, &fin, order, sizeof(mca_pml_csum_fin_hdr_t), + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + + if(NULL == fin) { + MCA_PML_CSUM_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status); + return OMPI_ERR_OUT_OF_RESOURCE; + } + fin->des_cbfunc = mca_pml_csum_fin_completion; + fin->des_cbdata = NULL; + + /* fill in header */ + hdr = (mca_pml_csum_fin_hdr_t*)fin->des_src->seg_addr.pval; + hdr->hdr_common.hdr_flags = 0; + hdr->hdr_common.hdr_type = MCA_PML_CSUM_HDR_TYPE_FIN; + hdr->hdr_common.hdr_csum = 0; + hdr->hdr_des.pval = hdr_des; + hdr->hdr_fail = status; + hdr->hdr_common.hdr_csum = (do_csum ? 
+ opal_csum16(hdr, sizeof(mca_pml_csum_fin_hdr_t)) : OPAL_CSUM_ZERO); + if(do_csum) { + OMPI_CSUM_CSUM_DEBUG((0, "%s: Sending \'FIN\' with header csum:0x%04x\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hdr->hdr_common.hdr_csum)); + } + + csum_hdr_hton(hdr, MCA_PML_CSUM_HDR_TYPE_FIN, proc); + + /* queue request */ + rc = mca_bml_base_send( bml_btl, + fin, + MCA_PML_CSUM_HDR_TYPE_FIN ); + if( OPAL_LIKELY( rc >= 0 ) ) { + if( OPAL_LIKELY( 1 == rc ) ) { + MCA_PML_CSUM_PROGRESS_PENDING(bml_btl); + } + return OMPI_SUCCESS; + } + mca_bml_base_free(bml_btl, fin); + MCA_PML_CSUM_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status); + return OMPI_ERR_OUT_OF_RESOURCE; +} + +void mca_pml_csum_process_pending_packets(mca_bml_base_btl_t* bml_btl) +{ + mca_pml_csum_pckt_pending_t *pckt; + int32_t i, rc, s = (int32_t)opal_list_get_size(&mca_pml_csum.pckt_pending); + + for(i = 0; i < s; i++) { + mca_bml_base_btl_t *send_dst = NULL; + OPAL_THREAD_LOCK(&mca_pml_csum.lock); + pckt = (mca_pml_csum_pckt_pending_t*) + opal_list_remove_first(&mca_pml_csum.pckt_pending); + OPAL_THREAD_UNLOCK(&mca_pml_csum.lock); + if(NULL == pckt) + break; + if(pckt->bml_btl != NULL && + pckt->bml_btl->btl == bml_btl->btl) { + send_dst = pckt->bml_btl; + } else { + send_dst = mca_bml_base_btl_array_find( + &pckt->proc->proc_bml->btl_eager, bml_btl->btl); + } + if(NULL == send_dst) { + OPAL_THREAD_LOCK(&mca_pml_csum.lock); + opal_list_append(&mca_pml_csum.pckt_pending, + (opal_list_item_t*)pckt); + OPAL_THREAD_UNLOCK(&mca_pml_csum.lock); + continue; + } + + switch(pckt->hdr.hdr_common.hdr_type) { + case MCA_PML_CSUM_HDR_TYPE_ACK: + rc = mca_pml_csum_recv_request_ack_send_btl(pckt->proc, + send_dst, + pckt->hdr.hdr_ack.hdr_src_req.lval, + pckt->hdr.hdr_ack.hdr_dst_req.pval, + pckt->hdr.hdr_ack.hdr_send_offset, + pckt->hdr.hdr_common.hdr_flags & MCA_PML_CSUM_HDR_FLAGS_NORDMA); + MCA_PML_CSUM_PCKT_PENDING_RETURN(pckt); + if(OMPI_ERR_OUT_OF_RESOURCE == rc) { + MCA_PML_CSUM_ADD_ACK_TO_PENDING(pckt->proc, + 
pckt->hdr.hdr_ack.hdr_src_req.lval, + pckt->hdr.hdr_ack.hdr_dst_req.pval, + pckt->hdr.hdr_ack.hdr_send_offset); + return; + } + break; + case MCA_PML_CSUM_HDR_TYPE_FIN: + rc = mca_pml_csum_send_fin(pckt->proc, send_dst, + pckt->hdr.hdr_fin.hdr_des.pval, + pckt->order, + pckt->hdr.hdr_fin.hdr_fail); + MCA_PML_CSUM_PCKT_PENDING_RETURN(pckt); + if(OMPI_ERR_OUT_OF_RESOURCE == rc) + return; + break; + default: + opal_output(0, "[%s:%d] wrong header type\n", + __FILE__, __LINE__); + break; + } + } +} + +void mca_pml_csum_process_pending_rdma(void) +{ + mca_pml_csum_rdma_frag_t* frag; + int32_t i, rc, s = (int32_t)opal_list_get_size(&mca_pml_csum.rdma_pending); + + for(i = 0; i < s; i++) { + OPAL_THREAD_LOCK(&mca_pml_csum.lock); + frag = (mca_pml_csum_rdma_frag_t*) + opal_list_remove_first(&mca_pml_csum.rdma_pending); + OPAL_THREAD_UNLOCK(&mca_pml_csum.lock); + if(NULL == frag) + break; + if(frag->rdma_state == MCA_PML_CSUM_RDMA_PUT) { + frag->retries++; + rc = mca_pml_csum_send_request_put_frag(frag); + } else { + rc = mca_pml_csum_recv_request_get_frag(frag); + } + if(OMPI_ERR_OUT_OF_RESOURCE == rc) + break; + } +} + + +void mca_pml_csum_error_handler( + struct mca_btl_base_module_t* btl, + int32_t flags) { + orte_errmgr.abort(-1, NULL); +} + +#if OPAL_ENABLE_FT == 0 +int mca_pml_csum_ft_event( int state ) { + return OMPI_SUCCESS; +} +#else +int mca_pml_csum_ft_event( int state ) +{ + static bool first_continue_pass = false; + ompi_proc_t** procs = NULL; + size_t num_procs; + int ret, p; + + if(OPAL_CRS_CHECKPOINT == state) { + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1); + orte_grpcomm.barrier(); + } + + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0); + } + else if(OPAL_CRS_CONTINUE == state) { + first_continue_pass = !first_continue_pass; + + if( !first_continue_pass ) { + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0); + orte_grpcomm.barrier(); + } + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2); + } + + if( 
ompi_cr_continue_like_restart && !first_continue_pass ) { + /* + * Get a list of processes + */ + procs = ompi_proc_all(&num_procs); + if(NULL == procs) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* + * Refresh the proc structure, and publish our proc info in the modex. + * NOTE: Do *not* call ompi_proc_finalize as there are many places in + * the code that point to indv. procs in this strucutre. For our + * needs here we only need to fix up the modex, bml and pml + * references. + */ + if (OMPI_SUCCESS != (ret = ompi_proc_refresh())) { + opal_output(0, + "pml:csum: ft_event(Restart): proc_refresh Failed %d", + ret); + for(p = 0; p < (int)num_procs; ++p) { + OBJ_RELEASE(procs[p]); + } + free (procs); + return ret; + } + } + } + else if(OPAL_CRS_RESTART_PRE == state ) { + /* Nothing here */ + } + else if(OPAL_CRS_RESTART == state ) { + /* + * Get a list of processes + */ + procs = ompi_proc_all(&num_procs); + if(NULL == procs) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* + * Clean out the modex information since it is invalid now. + * orte_grpcomm.purge_proc_attrs(); + * This happens at the ORTE level, so doing it again here will cause + * some issues with socket caching. + */ + + + /* + * Refresh the proc structure, and publish our proc info in the modex. + * NOTE: Do *not* call ompi_proc_finalize as there are many places in + * the code that point to indv. procs in this strucutre. For our + * needs here we only need to fix up the modex, bml and pml + * references. 
+ */ + if (OMPI_SUCCESS != (ret = ompi_proc_refresh())) { + opal_output(0, + "pml:csum: ft_event(Restart): proc_refresh Failed %d", + ret); + for(p = 0; p < (int)num_procs; ++p) { + OBJ_RELEASE(procs[p]); + } + free (procs); + return ret; + } + } + else if(OPAL_CRS_TERM == state ) { + ; + } + else { + ; + } + + /* Call the BML + * BML is expected to call ft_event in + * - BTL(s) + * - MPool(s) + */ + if( OMPI_SUCCESS != (ret = mca_bml.bml_ft_event(state))) { + opal_output(0, "pml:base: ft_event: BML ft_event function failed: %d\n", + ret); + } + + if(OPAL_CRS_CHECKPOINT == state) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P1); + + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR0); + /* JJH Cannot barrier here due to progress engine -- orte_grpcomm.barrier();*/ + } + } + else if(OPAL_CRS_CONTINUE == state) { + if( !first_continue_pass ) { + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1); + orte_grpcomm.barrier(); + } + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3); + } + + if( ompi_cr_continue_like_restart && !first_continue_pass ) { + /* + * Exchange the modex information once again. + * BTLs will have republished their modex information. + */ + if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(NULL))) { + opal_output(0, + "pml:csum: ft_event(Restart): Failed orte_grpcomm.modex() = %d", + ret); + return ret; + } + + /* + * Startup the PML stack now that the modex is running again + * Add the new procs (BTLs redo modex recv's) + */ + if( OMPI_SUCCESS != (ret = mca_pml_csum_add_procs(procs, num_procs) ) ) { + opal_output(0, "pml:csum: ft_event(Restart): Failed in add_procs (%d)", ret); + return ret; + } + + /* Is this barrier necessary ? 
JJH */ + if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) { + opal_output(0, "pml:csum: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret); + return ret; + } + + if( NULL != procs ) { + for(p = 0; p < (int)num_procs; ++p) { + OBJ_RELEASE(procs[p]); + } + free(procs); + procs = NULL; + } + } + if( !first_continue_pass ) { + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2); + orte_grpcomm.barrier(); + } + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1); + } + } + else if(OPAL_CRS_RESTART_PRE == state ) { + /* Nothing here */ + } + else if(OPAL_CRS_RESTART == state ) { + /* + * Exchange the modex information once again. + * BTLs will have republished their modex information. + */ + if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(NULL))) { + opal_output(0, + "pml:csum: ft_event(Restart): Failed orte_grpcomm.modex() = %d", + ret); + return ret; + } + + /* + * Startup the PML stack now that the modex is running again + * Add the new procs (BTLs redo modex recv's) + */ + if( OMPI_SUCCESS != (ret = mca_pml_csum_add_procs(procs, num_procs) ) ) { + opal_output(0, "pml:csum: ft_event(Restart): Failed in add_procs (%d)", ret); + return ret; + } + + /* Is this barrier necessary ? 
JJH */ + if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) { + opal_output(0, "pml:csum: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret); + return ret; + } + + if( NULL != procs ) { + for(p = 0; p < (int)num_procs; ++p) { + OBJ_RELEASE(procs[p]); + } + free(procs); + procs = NULL; + } + } + else if(OPAL_CRS_TERM == state ) { + ; + } + else { + ; + } + + return OMPI_SUCCESS; +} +#endif /* OPAL_ENABLE_FT */ + +int mca_pml_csum_com_btl_comp(const void *v1, const void *v2) +{ + const mca_pml_csum_com_btl_t *b1 = (const mca_pml_csum_com_btl_t *) v1; + const mca_pml_csum_com_btl_t *b2 = (const mca_pml_csum_com_btl_t *) v2; + + if(b1->bml_btl->btl_weight < b2->bml_btl->btl_weight) + return 1; + if(b1->bml_btl->btl_weight > b2->bml_btl->btl_weight) + return -1; + + return 0; +} + diff --git a/ompi/mca/pml/csum/pml_csum.h b/ompi/mca/pml/csum/pml_csum.h new file mode 100644 index 0000000000..9d42302dca --- /dev/null +++ b/ompi/mca/pml/csum/pml_csum.h @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2007 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2009 IBM Corporation. All rights reserved. + * Copyright (c) 2009 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ *  @file
+ *
+ *  Module-wide state, PML entry-point prototypes and the pending-packet
+ *  helpers of the csum PML.
+ */
+
+#ifndef MCA_PML_CSUM_H
+#define MCA_PML_CSUM_H
+
+#include "ompi_config.h"
+#include "opal/threads/threads.h"
+#include "ompi/class/ompi_free_list.h"
+#include "ompi/request/request.h"
+#include "ompi/mca/pml/pml.h"
+#include "ompi/mca/pml/base/pml_base_request.h"
+#include "ompi/mca/pml/base/pml_base_bsend.h"
+#include "ompi/mca/pml/base/pml_base_sendreq.h"
+#include "ompi/mca/btl/btl.h"
+#include "ompi/datatype/datatype.h"
+#include "pml_csum_hdr.h"
+#include "ompi/mca/bml/base/base.h"
+#include "ompi/proc/proc.h"
+#include "ompi/mca/allocator/base/base.h"
+
+/* Debug trace helper: x is a complete, parenthesized opal_output argument
+ * list; it expands to nothing unless OMPI_CSUM_DEBUG is set. */
+#if OMPI_CSUM_DEBUG
+#define OMPI_CSUM_CSUM_DEBUG(x) opal_output x
+#else
+#define OMPI_CSUM_CSUM_DEBUG(x)
+#endif
+
+BEGIN_C_DECLS
+
+/**
+ * CSUM PML module
+ */
+
+struct mca_pml_csum_t {
+    mca_pml_base_module_t super;
+
+    int free_list_num;      /* initial size of free list */
+    int free_list_max;      /* maximum size of free list */
+    int free_list_inc;      /* number of elements to grow free list */
+    size_t send_pipeline_depth;
+    size_t recv_pipeline_depth;
+    size_t rdma_put_retries_limit;
+    int max_rdma_per_request;
+    int max_send_per_range;
+    bool leave_pinned;
+    int leave_pinned_pipeline;
+
+    /* lock queue access */
+    opal_mutex_t lock;
+
+    /* free lists */
+    ompi_free_list_t rdma_frags;
+    ompi_free_list_t recv_frags;
+    ompi_free_list_t pending_pckts;
+    ompi_free_list_t buffers;
+    ompi_free_list_t send_ranges;
+
+    /* list of pending operations */
+    opal_list_t pckt_pending;
+    opal_list_t send_pending;
+    opal_list_t recv_pending;
+    opal_list_t rdma_pending;
+    /* List of pending fragments without a matching communicator */
+    opal_list_t non_existing_communicator_pending;
+    bool enabled;
+    char* allocator_name;
+    mca_allocator_base_module_t* allocator;
+    uint32_t unexpected_limit;
+
+    /* Enable or disable checksum */
+    bool enable_csum;
+};
+typedef struct mca_pml_csum_t mca_pml_csum_t;
+
+extern mca_pml_csum_t mca_pml_csum;
+
+/*
+ * PML interface functions.
+ */
+
+extern int mca_pml_csum_add_comm(
+    struct ompi_communicator_t* comm
+);
+
+extern int mca_pml_csum_del_comm(
+    struct ompi_communicator_t* comm
+);
+
+extern int mca_pml_csum_add_procs(
+    struct ompi_proc_t **procs,
+    size_t nprocs
+);
+
+extern int mca_pml_csum_del_procs(
+    struct ompi_proc_t **procs,
+    size_t nprocs
+);
+
+extern int mca_pml_csum_enable( bool enable );
+
+extern int mca_pml_csum_progress(void);
+
+extern int mca_pml_csum_iprobe( int dst,
+                                int tag,
+                                struct ompi_communicator_t* comm,
+                                int *matched,
+                                ompi_status_public_t* status );
+
+extern int mca_pml_csum_probe( int dst,
+                               int tag,
+                               struct ompi_communicator_t* comm,
+                               ompi_status_public_t* status );
+
+extern int mca_pml_csum_isend_init( void *buf,
+                                    size_t count,
+                                    ompi_datatype_t *datatype,
+                                    int dst,
+                                    int tag,
+                                    mca_pml_base_send_mode_t mode,
+                                    struct ompi_communicator_t* comm,
+                                    struct ompi_request_t **request );
+
+extern int mca_pml_csum_isend( void *buf,
+                               size_t count,
+                               ompi_datatype_t *datatype,
+                               int dst,
+                               int tag,
+                               mca_pml_base_send_mode_t mode,
+                               struct ompi_communicator_t* comm,
+                               struct ompi_request_t **request );
+
+extern int mca_pml_csum_send( void *buf,
+                              size_t count,
+                              ompi_datatype_t *datatype,
+                              int dst,
+                              int tag,
+                              mca_pml_base_send_mode_t mode,
+                              struct ompi_communicator_t* comm );
+
+extern int mca_pml_csum_irecv_init( void *buf,
+                                    size_t count,
+                                    ompi_datatype_t *datatype,
+                                    int src,
+                                    int tag,
+                                    struct ompi_communicator_t* comm,
+                                    struct ompi_request_t **request );
+
+extern int mca_pml_csum_irecv( void *buf,
+                               size_t count,
+                               ompi_datatype_t *datatype,
+                               int src,
+                               int tag,
+                               struct ompi_communicator_t* comm,
+                               struct ompi_request_t **request );
+
+extern int mca_pml_csum_recv( void *buf,
+                              size_t count,
+                              ompi_datatype_t *datatype,
+                              int src,
+                              int tag,
+                              struct ompi_communicator_t* comm,
+                              ompi_status_public_t* status );
+
+extern int mca_pml_csum_dump( struct ompi_communicator_t* comm,
+                              int verbose );
+
+extern int mca_pml_csum_start( size_t count,
+                               ompi_request_t** requests );
+
+extern int mca_pml_csum_ft_event( int state );
+
+END_C_DECLS
+
+/* A control packet (FIN or ACK header) that could not be sent and is
+ * parked on mca_pml_csum.pckt_pending until it can be retried. */
+struct mca_pml_csum_pckt_pending_t {
+    ompi_free_list_item_t super;
+    ompi_proc_t* proc;
+    mca_pml_csum_hdr_t hdr;
+    struct mca_bml_base_btl_t *bml_btl;
+    uint8_t order;
+};
+typedef struct mca_pml_csum_pckt_pending_t mca_pml_csum_pckt_pending_t;
+OBJ_CLASS_DECLARATION(mca_pml_csum_pckt_pending_t);
+
+/* Take a pending-packet item from the pending_pckts free list (via
+ * OMPI_FREE_LIST_WAIT); rc receives the free-list status. */
+#define MCA_PML_CSUM_PCKT_PENDING_ALLOC(pckt,rc)                    \
+do {                                                                \
+    ompi_free_list_item_t* item;                                    \
+    OMPI_FREE_LIST_WAIT(&mca_pml_csum.pending_pckts, item, rc);     \
+    pckt = (mca_pml_csum_pckt_pending_t*)item;                      \
+} while (0)
+
+/* Give a pending-packet item back to the free list; the caller must not
+ * touch pckt afterwards. */
+#define MCA_PML_CSUM_PCKT_PENDING_RETURN(pckt)                      \
+do {                                                                \
+    /* return packet */                                             \
+    OMPI_FREE_LIST_RETURN(&mca_pml_csum.pending_pckts,              \
+        (ompi_free_list_item_t*)pckt);                              \
+} while(0)
+
+/* Queue a FIN for later retry: P = destination proc, D = descriptor
+ * pointer carried in the FIN, B = bml_btl to prefer, O = ordering tag,
+ * S = status carried in hdr_fail. */
+#define MCA_PML_CSUM_ADD_FIN_TO_PENDING(P, D, B, O, S)              \
+    do {                                                            \
+        mca_pml_csum_pckt_pending_t *_pckt;                         \
+        int _rc;                                                    \
+                                                                    \
+        MCA_PML_CSUM_PCKT_PENDING_ALLOC(_pckt,_rc);                 \
+        _pckt->hdr.hdr_common.hdr_type = MCA_PML_CSUM_HDR_TYPE_FIN; \
+        _pckt->hdr.hdr_fin.hdr_des.pval = (D);                      \
+        _pckt->hdr.hdr_fin.hdr_fail = (S);                          \
+        _pckt->proc = (P);                                          \
+        _pckt->bml_btl = (B);                                       \
+        _pckt->order = (O);                                         \
+        OPAL_THREAD_LOCK(&mca_pml_csum.lock);                       \
+        opal_list_append(&mca_pml_csum.pckt_pending,                \
+                (opal_list_item_t*)_pckt);                          \
+        OPAL_THREAD_UNLOCK(&mca_pml_csum.lock);                     \
+    } while(0)
+
+
+int mca_pml_csum_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
+        void *hdr_des, uint8_t order, uint32_t status);
+
+/* This function tries to resend FIN/ACK packets from the pckt_pending queue.
+ * Packets are added to the queue when sending a FIN or ACK fails due to
+ * resource unavailability. The bml_btl passed to the function does not
+ * represent the packet's destination; it represents the BTL on which a
+ * resource was freed, so only this BTL should be considered for resending
+ * packets. */
+void mca_pml_csum_process_pending_packets(mca_bml_base_btl_t* bml_btl);
+
+/* This function retries failed PUT/GET operations on frag. When an RDMA
+ * operation cannot be accomplished for some reason, frag is put on the
+ * rdma_pending list. Later the operation is retried. The destination of the
+ * RDMA operation is stored inside the frag structure. */
+void mca_pml_csum_process_pending_rdma(void);
+
+/* Retry every kind of pending work whose queue is currently non-empty. */
+#define MCA_PML_CSUM_PROGRESS_PENDING(bml_btl)                      \
+    do {                                                            \
+        if(opal_list_get_size(&mca_pml_csum.pckt_pending))          \
+            mca_pml_csum_process_pending_packets(bml_btl);          \
+        if(opal_list_get_size(&mca_pml_csum.recv_pending))          \
+            mca_pml_csum_recv_request_process_pending();            \
+        if(opal_list_get_size(&mca_pml_csum.send_pending))          \
+            mca_pml_csum_send_request_process_pending(bml_btl);     \
+        if(opal_list_get_size(&mca_pml_csum.rdma_pending))          \
+            mca_pml_csum_process_pending_rdma();                    \
+    } while (0)
+
+/*
+ * Compute the total number of bytes on supplied descriptor
+ *
+ * NOTE: the segment lengths are added to `length` (it is not reset here),
+ * then hdrlen is subtracted, so the caller is expected to initialize
+ * `length` before using the macro.
+ */
+#define MCA_PML_CSUM_COMPUTE_SEGMENT_LENGTH(segments, count, hdrlen, length) \
+do {                                                                        \
+   size_t i;                                                                \
+                                                                            \
+   for( i = 0; i < count; i++ ) {                                           \
+       length += segments[i].seg_len;                                       \
+   }                                                                        \
+   length -= hdrlen;                                                        \
+} while(0)
+
+/* represent BTL chosen for sending request */
+struct mca_pml_csum_com_btl_t {
+    mca_bml_base_btl_t *bml_btl;
+    struct mca_mpool_base_registration_t* btl_reg;
+    size_t length;
+};
+typedef struct mca_pml_csum_com_btl_t mca_pml_csum_com_btl_t;
+
+int mca_pml_csum_com_btl_comp(const void *v1, const void *v2);
+
+/* Calculate what percentage of a message to send through each BTL according to
+ * relative weight.
+ *
+ * On return btls[i].length holds the number of bytes assigned to BTL i and
+ * the lengths sum to size. The btls array is re-ordered (sorted with
+ * mca_pml_csum_com_btl_comp) when num_btls > 1. */
+static inline void
+mca_pml_csum_calc_weighted_length( mca_pml_csum_com_btl_t *btls, int num_btls, size_t size,
+                                   double weight_total )
+{
+    int i;
+    size_t length_left;
+
+    /* shortcut for common case for only one BTL */
+    if( OPAL_LIKELY(1 == num_btls) ) {
+        btls[0].length = size;
+        return;
+    }
+
+    /* sort BTLs according to their weights so BTLs with smaller weight will
+     * not hijack all of the traffic */
+    qsort( btls, num_btls, sizeof(mca_pml_csum_com_btl_t),
+           mca_pml_csum_com_btl_comp );
+
+    for(length_left = size, i = 0; i < num_btls; i++) {
+        mca_bml_base_btl_t* bml_btl = btls[i].bml_btl;
+        size_t length = 0;
+        if( OPAL_UNLIKELY(0 != length_left) ) {
+            /* above the BTL's eager limit take the weighted share of the
+             * whole message, otherwise take everything that is left */
+            length = (length_left > bml_btl->btl->btl_eager_limit)?
+                ((size_t)(size * (bml_btl->btl_weight / weight_total))) :
+                length_left;
+
+            if(length > length_left)
+                length = length_left;
+            length_left -= length;
+        }
+        btls[i].length = length;
+    }
+
+    /* account for rounding errors */
+    btls[0].length += length_left;
+}
+
+#endif
diff --git a/ompi/mca/pml/csum/pml_csum_comm.c b/ompi/mca/pml/csum/pml_csum_comm.c
new file mode 100644
index 0000000000..9769becc2c
--- /dev/null
+++ b/ompi/mca/pml/csum/pml_csum_comm.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2006 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+#include <stdlib.h>
+#include <string.h>
+
+#include "pml_csum.h"
+#include "pml_csum_comm.h"
+
+
+
+/* Constructor for the per-peer matching state: sequence counters are reset
+ * (expected_sequence starts at 1, send_sequence at 0) and the three fragment /
+ * receive queues are constructed empty. */
+static void mca_pml_csum_comm_proc_construct(mca_pml_csum_comm_proc_t* proc)
+{
+    proc->expected_sequence = 1;
+    proc->ompi_proc = NULL;
+    proc->send_sequence = 0;
+    OBJ_CONSTRUCT(&proc->frags_cant_match, opal_list_t);
+    OBJ_CONSTRUCT(&proc->specific_receives, opal_list_t);
+    OBJ_CONSTRUCT(&proc->unexpected_frags, opal_list_t);
+}
+
+
+/* Destructor for the per-peer matching state: releases the three queues. */
+static void mca_pml_csum_comm_proc_destruct(mca_pml_csum_comm_proc_t* proc)
+{
+    OBJ_DESTRUCT(&proc->frags_cant_match);
+    OBJ_DESTRUCT(&proc->specific_receives);
+    OBJ_DESTRUCT(&proc->unexpected_frags);
+}
+
+
+static OBJ_CLASS_INSTANCE(
+    mca_pml_csum_comm_proc_t,
+    opal_object_t,
+    mca_pml_csum_comm_proc_construct,
+    mca_pml_csum_comm_proc_destruct);
+
+
+/* Constructor for the per-communicator state. The procs array is left empty
+ * until mca_pml_csum_comm_init_size() is called. */
+static void mca_pml_csum_comm_construct(mca_pml_csum_comm_t* comm)
+{
+    OBJ_CONSTRUCT(&comm->wild_receives, opal_list_t);
+    OBJ_CONSTRUCT(&comm->matching_lock, opal_mutex_t);
+    comm->recv_sequence = 0;
+    comm->procs = NULL;
+    comm->num_procs = 0;
+}
+
+
+/* Destructor for the per-communicator state: destructs every per-peer entry,
+ * releases the array and then the queue and lock. */
+static void mca_pml_csum_comm_destruct(mca_pml_csum_comm_t* comm)
+{
+    size_t i;
+    for(i = 0; i < comm->num_procs; i++) {
+        OBJ_DESTRUCT(&comm->procs[i]);
+    }
+    /* free(NULL) is a no-op, so no guard is needed; reset the fields so a
+     * stale pointer or count can never be observed afterwards */
+    free(comm->procs);
+    comm->procs = NULL;
+    comm->num_procs = 0;
+    OBJ_DESTRUCT(&comm->wild_receives);
+    OBJ_DESTRUCT(&comm->matching_lock);
+}
+
+
+OBJ_CLASS_INSTANCE(
+    mca_pml_csum_comm_t,
+    opal_object_t,
+    mca_pml_csum_comm_construct,
+    mca_pml_csum_comm_destruct);
+
+
+/* Allocate and construct `size` per-peer entries for `comm`.
+ * Returns OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE when the array cannot be
+ * allocated (in which case num_procs is left untouched). */
+int mca_pml_csum_comm_init_size(mca_pml_csum_comm_t* comm, size_t size)
+{
+    size_t i;
+
+    /* send message sequence-number support - sender side */
+    comm->procs = (mca_pml_csum_comm_proc_t*)malloc(sizeof(mca_pml_csum_comm_proc_t)*size);
+    if(NULL == comm->procs) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+    for(i = 0; i < size; i++) {
+        OBJ_CONSTRUCT(comm->procs+i, mca_pml_csum_comm_proc_t);
+    }
+    comm->num_procs = size;
+    return OMPI_SUCCESS;
+}
+
+
diff --git a/ompi/mca/pml/csum/pml_csum_comm.h
b/ompi/mca/pml/csum/pml_csum_comm.h
new file mode 100644
index 0000000000..17ac7032ea
--- /dev/null
+++ b/ompi/mca/pml/csum/pml_csum_comm.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2006 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ *  @file
+ *  Per-communicator and per-peer matching state of the csum PML.
+ */
+#ifndef MCA_PML_CSUM_COMM_H
+#define MCA_PML_CSUM_COMM_H
+
+#include "opal/threads/mutex.h"
+#include "opal/class/opal_list.h"
+#include "ompi/proc/proc.h"
+#if defined(c_plusplus) || defined(__cplusplus)
+extern "C" {
+#endif
+
+
+/**
+ *  Matching state kept for one peer process of a communicator.
+ */
+struct mca_pml_csum_comm_proc_t {
+    opal_object_t super;
+    uint16_t expected_sequence;    /**< send message sequence number - receiver side */
+    struct ompi_proc_t* ompi_proc;
+#if OMPI_HAVE_THREAD_SUPPORT
+    volatile int32_t send_sequence; /**< send side sequence number */
+#else
+    int32_t send_sequence; /**< send side sequence number */
+#endif
+    opal_list_t frags_cant_match;  /**< out-of-order fragment queues */
+    opal_list_t specific_receives; /**< queues of unmatched specific receives */
+    opal_list_t unexpected_frags;  /**< unexpected fragment queues */
+};
+typedef struct mca_pml_csum_comm_proc_t mca_pml_csum_comm_proc_t;
+
+
+/**
+ *  Cached on ompi_communicator_t to hold queues/state
+ *  used by the PML<->PTL interface for matching logic.
+ */
+struct mca_pml_comm_t {
+    opal_object_t super;
+#if OMPI_HAVE_THREAD_SUPPORT
+    volatile uint32_t recv_sequence;  /**< recv request sequence number - receiver side */
+#else
+    uint32_t recv_sequence;  /**< recv request sequence number - receiver side */
+#endif
+    opal_mutex_t matching_lock;   /**< matching lock */
+    opal_list_t wild_receives;    /**< queue of unmatched wild (source process not specified) receives */
+    mca_pml_csum_comm_proc_t* procs; /**< array of num_procs per-peer entries */
+    size_t num_procs;                /**< number of entries in procs */
+};
+typedef struct mca_pml_comm_t mca_pml_csum_comm_t;
+
+OBJ_CLASS_DECLARATION(mca_pml_csum_comm_t);
+
+
+/**
+ * Initialize an instance of mca_pml_csum_comm_t based on the communicator size.
+ *
+ * @param  comm   Instance of mca_pml_csum_comm_t
+ * @param  size   Size of communicator
+ * @return        OMPI_SUCCESS or error status on failure.
+ */
+
+extern int mca_pml_csum_comm_init_size(mca_pml_csum_comm_t* comm, size_t size);
+
+#if defined(c_plusplus) || defined(__cplusplus)
+}
+#endif
+#endif
+
diff --git a/ompi/mca/pml/csum/pml_csum_component.c b/ompi/mca/pml/csum/pml_csum_component.c
new file mode 100644
index 0000000000..19b9d270af
--- /dev/null
+++ b/ompi/mca/pml/csum/pml_csum_component.c
@@ -0,0 +1,263 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2009 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2009      IBM Corporation.  All rights reserved.
+ * Copyright (c) 2009      Los Alamos National Security, LLC.  All rights
+ *                         reserved.
+ * Copyright (c) 2007-2009 Cisco Systems, Inc.  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+#include "opal/sys/cache.h"
+#include "opal/event/event.h"
+#include "mpi.h"
+#include "ompi/runtime/params.h"
+#include "ompi/datatype/convertor.h"
+#include "ompi/mca/pml/pml.h"
+#include "opal/mca/base/mca_base_param.h"
+
+#include "orte/util/show_help.h"
+
+#include "ompi/mca/pml/base/pml_base_bsend.h"
+#include "pml_csum.h"
+#include "pml_csum_hdr.h"
+#include "pml_csum_sendreq.h"
+#include "pml_csum_recvreq.h"
+#include "pml_csum_rdmafrag.h"
+#include "pml_csum_recvfrag.h"
+#include "ompi/mca/bml/base/base.h"
+#include "pml_csum_component.h"
+#include "ompi/mca/allocator/base/base.h"
+
+OBJ_CLASS_INSTANCE( mca_pml_csum_pckt_pending_t,
+                    ompi_free_list_item_t,
+                    NULL,
+                    NULL );
+
+static int mca_pml_csum_component_open(void);
+static int mca_pml_csum_component_close(void);
+static mca_pml_base_module_t*
+mca_pml_csum_component_init( int* priority, bool enable_progress_threads,
+                             bool enable_mpi_threads );
+static int mca_pml_csum_component_fini(void);
+
+mca_pml_base_component_2_0_0_t mca_pml_csum_component = {
+
+    /* First, the mca_base_component_t struct containing meta
+       information about the component itself */
+
+    {
+        MCA_PML_BASE_VERSION_2_0_0,
+
+        "csum", /* MCA component name */
+        OMPI_MAJOR_VERSION,  /* MCA component major version */
+        OMPI_MINOR_VERSION,  /* MCA component minor version */
+        OMPI_RELEASE_VERSION,  /* MCA component release version */
+        mca_pml_csum_component_open,  /* component open */
+        mca_pml_csum_component_close  /* component close */
+    },
+    {
+        /* The component is checkpoint ready */
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    },
+
+    mca_pml_csum_component_init,  /* component init */
+    mca_pml_csum_component_fini   /* component finalize */
+
+};
+
+void *mca_pml_csum_seg_alloc( struct mca_mpool_base_module_t* mpool,
+                              size_t* size,
+                              mca_mpool_base_registration_t** registration);
+
+void mca_pml_csum_seg_free( struct mca_mpool_base_module_t* mpool,
+                            void* segment );
+
+/* Register the integer MCA parameter pml_csum_<param_name> with the given
+ * default and return its current value (the default when the lookup leaves
+ * param_value untouched). */
+static inline int mca_pml_csum_param_register_int(
+    const char* param_name,
+    int default_value)
+{
+    int id = mca_base_param_register_int("pml","csum",param_name,NULL,default_value);
+    int param_value = default_value;
+    mca_base_param_lookup_int(id,&param_value);
+    return param_value;
+}
+
+/* Component open: read all MCA parameters, set up the allocator used for
+ * unexpected messages, push the checksum switch to the convertor layer and
+ * open the BML framework. Returns OMPI_ERROR when the allocator cannot be
+ * found or initialized, otherwise the result of mca_bml_base_open(). */
+static int mca_pml_csum_component_open(void)
+{
+    mca_allocator_base_component_t* allocator_component;
+
+    mca_pml_csum.free_list_num =
+        mca_pml_csum_param_register_int("free_list_num", 4);
+    mca_pml_csum.free_list_max =
+        mca_pml_csum_param_register_int("free_list_max", -1);
+    mca_pml_csum.free_list_inc =
+        mca_pml_csum_param_register_int("free_list_inc", 64);
+    mca_pml_csum.send_pipeline_depth =
+        mca_pml_csum_param_register_int("send_pipeline_depth", 3);
+    mca_pml_csum.recv_pipeline_depth =
+        mca_pml_csum_param_register_int("recv_pipeline_depth", 4);
+    mca_pml_csum.rdma_put_retries_limit =
+        mca_pml_csum_param_register_int("rdma_put_retries_limit", 5);
+    mca_pml_csum.max_rdma_per_request =
+        mca_pml_csum_param_register_int("max_rdma_per_request", 4);
+    mca_pml_csum.max_send_per_range =
+        mca_pml_csum_param_register_int("max_send_per_range", 4);
+
+    mca_pml_csum.unexpected_limit =
+        mca_pml_csum_param_register_int("unexpected_limit", 128);
+
+    mca_base_param_reg_string(&mca_pml_csum_component.pmlm_version,
+                              "allocator",
+                              "Name of allocator component for unexpected messages",
+                              false, false,
+                              "bucket",
+                              &mca_pml_csum.allocator_name);
+
+    allocator_component = mca_allocator_component_lookup( mca_pml_csum.allocator_name );
+    if(NULL == allocator_component) {
+        opal_output(0, "mca_pml_csum_component_open: can't find allocator: %s\n", mca_pml_csum.allocator_name);
+        return OMPI_ERROR;
+    }
+
+    mca_pml_csum.allocator = allocator_component->allocator_init(true,
+                                                                 mca_pml_csum_seg_alloc,
+                                                                 mca_pml_csum_seg_free, NULL);
+    if(NULL == mca_pml_csum.allocator) {
+        opal_output(0, "mca_pml_csum_component_open: unable to initialize allocator\n");
+        return OMPI_ERROR;
+    }
+
+    /* default is not to checksum all data */
+    mca_pml_csum.enable_csum =
+        mca_pml_csum_param_register_int("enable_csum", 0);
+    ompi_convertor_checksum_enable(mca_pml_csum.enable_csum);
+
+    mca_pml_csum.enabled = false;
+    return mca_bml_base_open();
+}
+
+
+/* Component close: close the BML framework and release the allocator name
+ * string. A BML close failure is returned as-is. */
+static int mca_pml_csum_component_close(void)
+{
+    int rc;
+
+    if (OMPI_SUCCESS != (rc = mca_bml_base_close())) {
+        return rc;
+    }
+    /* free(NULL) is a no-op; clear the pointer so it cannot dangle */
+    free(mca_pml_csum.allocator_name);
+    mca_pml_csum.allocator_name = NULL;
+
+    return OMPI_SUCCESS;
+}
+
+
+/* Component init (selection query). Returns the module with priority 100 only
+ * when built with OMPI_CSUM_DST and enable_csum is set; in every other case
+ * priority is 0 and NULL is returned. */
+static mca_pml_base_module_t*
+mca_pml_csum_component_init( int* priority,
+                             bool enable_progress_threads,
+                             bool enable_mpi_threads )
+{
+#if defined (OMPI_CSUM_DST)
+    /* if the alternative csum was defined and enable_csum set, then we must
+     * be selected
+     */
+    if (!mca_pml_csum.enable_csum) {
+        *priority = 0;
+        orte_show_help("help-pml-csum.txt", "pml:checksum-not-enabled", true);
+        return NULL;
+    }
+
+    *priority = 100;
+
+    if(OMPI_SUCCESS != mca_bml_base_init( enable_progress_threads,
+                                          enable_mpi_threads)) {
+        *priority = 0;
+        return NULL;
+    }
+
+    /* Set this here (vs in component_open()) because
+       ompi_mpi_leave_pinned* may have been set after MCA params were
+       read (e.g., by the openib btl) */
+    mca_pml_csum.leave_pinned = (1 == ompi_mpi_leave_pinned);
+    mca_pml_csum.leave_pinned_pipeline = (int) ompi_mpi_leave_pinned_pipeline;
+
+    return &mca_pml_csum.super;
+#else
+    /* built without the alternative checksum: never selectable */
+    *priority = 0;
+    return NULL;
+#endif
+}
+
+/* Component finalize: shut down the BML and, if this PML had been enabled,
+ * destruct its queues, free lists and lock and finalize the allocator. */
+static int mca_pml_csum_component_fini(void)
+{
+    int rc;
+
+    /* Shutdown BML */
+    if(OMPI_SUCCESS != (rc = mca_bml.bml_finalize()))
+        return rc;
+
+    if(!mca_pml_csum.enabled)
+        return OMPI_SUCCESS; /* never selected.. return success.. */
+    mca_pml_csum.enabled = false;  /* not anymore */
+
+    OBJ_DESTRUCT(&mca_pml_csum.rdma_pending);
+    OBJ_DESTRUCT(&mca_pml_csum.pckt_pending);
+    OBJ_DESTRUCT(&mca_pml_csum.recv_pending);
+    OBJ_DESTRUCT(&mca_pml_csum.send_pending);
+    OBJ_DESTRUCT(&mca_pml_csum.non_existing_communicator_pending);
+    OBJ_DESTRUCT(&mca_pml_csum.buffers);
+    OBJ_DESTRUCT(&mca_pml_csum.pending_pckts);
+    OBJ_DESTRUCT(&mca_pml_csum.recv_frags);
+    OBJ_DESTRUCT(&mca_pml_csum.rdma_frags);
+    OBJ_DESTRUCT(&mca_pml_csum.lock);
+
+    if(OMPI_SUCCESS != (rc = mca_pml_csum.allocator->alc_finalize(mca_pml_csum.allocator))) {
+        return rc;
+    }
+
+#if 0
+    if (mca_pml_base_send_requests.fl_num_allocated !=
+        mca_pml_base_send_requests.super.opal_list_length) {
+        opal_output(0, "csum send requests: %d allocated %d returned\n",
+                    mca_pml_base_send_requests.fl_num_allocated,
+                    mca_pml_base_send_requests.super.opal_list_length);
+    }
+    if (mca_pml_base_recv_requests.fl_num_allocated !=
+        mca_pml_base_recv_requests.super.opal_list_length) {
+        opal_output(0, "csum recv requests: %d allocated %d returned\n",
+                    mca_pml_base_recv_requests.fl_num_allocated,
+                    mca_pml_base_recv_requests.super.opal_list_length);
+    }
+#endif
+
+    return OMPI_SUCCESS;
+}
+
+/* Segment allocation callback handed to the allocator: plain malloc() of
+ * *size bytes; mpool and registration are not used. */
+void *mca_pml_csum_seg_alloc( struct mca_mpool_base_module_t* mpool,
+                              size_t* size,
+                              mca_mpool_base_registration_t** registration) {
+    return malloc(*size);
+}
+
+/* Segment release callback handed to the allocator: plain free(). */
+void mca_pml_csum_seg_free( struct mca_mpool_base_module_t* mpool,
+                            void* segment ) {
+    free(segment);
+}
diff --git a/ompi/mca/pml/csum/pml_csum_component.h b/ompi/mca/pml/csum/pml_csum_component.h
new file mode 100644
index 0000000000..b1f3a83620
--- /dev/null
+++ b/ompi/mca/pml/csum/pml_csum_component.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2006 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.
All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ *  @file
+ *  Declaration of the csum PML component structure.
+ */
+
+#ifndef MCA_PML_CSUM_COMPONENT_H
+#define MCA_PML_CSUM_COMPONENT_H
+
+#include "ompi_config.h"
+/* provides mca_pml_base_component_2_0_0_t so this header is self-contained */
+#include "ompi/mca/pml/pml.h"
+
+BEGIN_C_DECLS
+/*
+ * PML module functions.
+ */
+OMPI_MODULE_DECLSPEC extern mca_pml_base_component_2_0_0_t mca_pml_csum_component;
+
+END_C_DECLS
+#endif
diff --git a/ompi/mca/pml/csum/pml_csum_endpoint.c b/ompi/mca/pml/csum/pml_csum_endpoint.c
new file mode 100644
index 0000000000..3caacbdd92
--- /dev/null
+++ b/ompi/mca/pml/csum/pml_csum_endpoint.c
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#include <string.h>
+
+#include "ompi/mca/pml/pml.h"
+#include "pml_csum_endpoint.h"
+
+
diff --git a/ompi/mca/pml/csum/pml_csum_endpoint.h b/ompi/mca/pml/csum/pml_csum_endpoint.h
new file mode 100644
index 0000000000..4f7fb2eb2a
--- /dev/null
+++ b/ompi/mca/pml/csum/pml_csum_endpoint.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ *  @file
+ *  Placeholder header: it currently declares nothing of its own.
+ */
+#ifndef MCA_PML_CSUM_ENDPOINT_H
+#define MCA_PML_CSUM_ENDPOINT_H
+
+#include "ompi/mca/btl/btl.h"
+
+BEGIN_C_DECLS
+
+END_C_DECLS
+#endif
+
diff --git a/ompi/mca/pml/csum/pml_csum_hdr.h b/ompi/mca/pml/csum/pml_csum_hdr.h
new file mode 100644
index 0000000000..7f196c9442
--- /dev/null
+++ b/ompi/mca/pml/csum/pml_csum_hdr.h
@@ -0,0 +1,455 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2009      IBM Corporation.  All rights reserved.
+ * Copyright (c) 2009      Los Alamos National Security, LLC.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ *  @file
+ *  Wire headers of the csum PML and their byte-order conversion helpers.
+ */
+#ifndef MCA_PML_CSUM_HEADER_H
+#define MCA_PML_CSUM_HEADER_H
+
+#include "ompi_config.h"
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+#ifdef HAVE_NETINET_IN_H
+#include <netinet/in.h>
+#endif
+
+#include "opal/types.h"
+#include "opal/util/arch.h"
+#include "ompi/mca/btl/btl.h"
+#include "ompi/proc/proc.h"
+
+#define MCA_PML_CSUM_HDR_TYPE_MATCH     (MCA_BTL_TAG_PML + 1)
+#define MCA_PML_CSUM_HDR_TYPE_RNDV      (MCA_BTL_TAG_PML + 2)
+#define MCA_PML_CSUM_HDR_TYPE_RGET      (MCA_BTL_TAG_PML + 3)
+#define MCA_PML_CSUM_HDR_TYPE_ACK       (MCA_BTL_TAG_PML + 4)
+#define MCA_PML_CSUM_HDR_TYPE_NACK      (MCA_BTL_TAG_PML + 5)
+#define MCA_PML_CSUM_HDR_TYPE_FRAG      (MCA_BTL_TAG_PML + 6)
+#define MCA_PML_CSUM_HDR_TYPE_GET       (MCA_BTL_TAG_PML + 7)
+#define MCA_PML_CSUM_HDR_TYPE_PUT       (MCA_BTL_TAG_PML + 8)
+#define MCA_PML_CSUM_HDR_TYPE_FIN       (MCA_BTL_TAG_PML + 9)
+
+#define MCA_PML_CSUM_HDR_FLAGS_ACK     1  /* is an ack required */
+#define MCA_PML_CSUM_HDR_FLAGS_NBO     2  /* is the hdr in network byte order */
+#define MCA_PML_CSUM_HDR_FLAGS_PIN     4  /* is user buffer pinned */
+#define MCA_PML_CSUM_HDR_FLAGS_CONTIG  8  /* is user buffer contiguous */
+#define MCA_PML_CSUM_HDR_FLAGS_NORDMA  16 /* rest will be sent by copy-in-out */
+
+/**
+ * Common hdr attributes - must be first element in each hdr type
+ */
+struct mca_pml_csum_common_hdr_t {
+    uint8_t hdr_type;  /**< type of envelope */
+    uint8_t hdr_flags; /**< flags indicating how fragment should be processed */
+    uint16_t hdr_csum; /**< checksum over header */
+};
+typedef struct mca_pml_csum_common_hdr_t mca_pml_csum_common_hdr_t;
+
+/* Only the 16-bit header checksum needs swapping; type and flags are single
+ * bytes. Note that both expansions already end in a semicolon. */
+#define MCA_PML_CSUM_COMMON_HDR_NTOH(h) (h).hdr_csum = ntohs((h).hdr_csum);
+#define MCA_PML_CSUM_COMMON_HDR_HTON(h) (h).hdr_csum = htons((h).hdr_csum);
+
+/**
+ *  Header definition for the first fragment, contains the
+ *  attributes required to match the corresponding posted receive.
+ */
+struct mca_pml_csum_match_hdr_t {
+    mca_pml_csum_common_hdr_t hdr_common;   /**< common attributes */
+    uint16_t hdr_ctx;                       /**< communicator index */
+    uint16_t hdr_seq;                       /**< message sequence number */
+    int32_t  hdr_src;                       /**< source rank */
+    int32_t  hdr_tag;                       /**< user tag */
+    uint32_t hdr_csum;                      /**< checksum over data */
+};
+#define OMPI_PML_CSUM_MATCH_HDR_LEN  20
+
+typedef struct mca_pml_csum_match_hdr_t mca_pml_csum_match_hdr_t;
+
+/* The match header declares no hdr_padding member, so there is nothing to
+ * zero. The macro is kept (empty) so the HTON macros of all header types
+ * keep the same shape. */
+#define MCA_PML_CSUM_MATCH_HDR_FILL(h)
+
+#define MCA_PML_CSUM_MATCH_HDR_NTOH(h) \
+do { \
+    MCA_PML_CSUM_COMMON_HDR_NTOH((h).hdr_common); \
+    (h).hdr_ctx = ntohs((h).hdr_ctx); \
+    (h).hdr_src = ntohl((h).hdr_src); \
+    (h).hdr_tag = ntohl((h).hdr_tag); \
+    (h).hdr_seq = ntohs((h).hdr_seq); \
+    (h).hdr_csum = ntohl((h).hdr_csum); \
+} while (0)
+
+#define MCA_PML_CSUM_MATCH_HDR_HTON(h) \
+do { \
+    MCA_PML_CSUM_COMMON_HDR_HTON((h).hdr_common); \
+    MCA_PML_CSUM_MATCH_HDR_FILL(h); \
+    (h).hdr_ctx = htons((h).hdr_ctx); \
+    (h).hdr_src = htonl((h).hdr_src); \
+    (h).hdr_tag = htonl((h).hdr_tag); \
+    (h).hdr_seq = htons((h).hdr_seq); \
+    (h).hdr_csum = htonl((h).hdr_csum); \
+} while (0)
+
+/**
+ * Header definition for the first fragment when an acknowledgment
+ * is required. This could be the first fragment of a large message
+ * or a short message that requires an ack (synchronous).
+ */
+struct mca_pml_csum_rendezvous_hdr_t {
+    mca_pml_csum_match_hdr_t hdr_match;
+    uint64_t hdr_msg_length;            /**< message length */
+    ompi_ptr_t hdr_src_req;             /**< pointer to source request - returned in ack */
+};
+typedef struct mca_pml_csum_rendezvous_hdr_t mca_pml_csum_rendezvous_hdr_t;
+
+#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT && OMPI_ENABLE_DEBUG
+#define MCA_PML_CSUM_RNDV_HDR_FILL(h) \
+    MCA_PML_CSUM_MATCH_HDR_FILL((h).hdr_match)
+#else
+#define MCA_PML_CSUM_RNDV_HDR_FILL(h)
+#endif  /* OMPI_ENABLE_HETEROGENEOUS_SUPPORT && OMPI_ENABLE_DEBUG */
+
+/* Note that hdr_src_req is not put in network byte order because it
+   is never processed by the receiver, other than being copied into
+   the ack header */
+#define MCA_PML_CSUM_RNDV_HDR_NTOH(h) \
+    do { \
+        MCA_PML_CSUM_MATCH_HDR_NTOH((h).hdr_match); \
+        (h).hdr_msg_length = ntoh64((h).hdr_msg_length); \
+    } while (0)
+
+#define MCA_PML_CSUM_RNDV_HDR_HTON(h) \
+    do { \
+        MCA_PML_CSUM_MATCH_HDR_HTON((h).hdr_match); \
+        MCA_PML_CSUM_RNDV_HDR_FILL(h); \
+        (h).hdr_msg_length = hton64((h).hdr_msg_length); \
+    } while (0)
+
+/**
+ * Header definition for a combined rdma rendezvous/get
+ */
+struct mca_pml_csum_rget_hdr_t {
+    mca_pml_csum_rendezvous_hdr_t hdr_rndv;
+    uint32_t hdr_seg_cnt;                     /**< number of segments for rdma */
+#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
+    uint8_t hdr_padding[4];
+#endif
+    ompi_ptr_t hdr_des;                       /**< source descriptor */
+    mca_btl_base_segment_t hdr_segs[1];       /**< list of segments for rdma */
+};
+typedef struct mca_pml_csum_rget_hdr_t mca_pml_csum_rget_hdr_t;
+
+#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT && OMPI_ENABLE_DEBUG
+#define MCA_PML_CSUM_RGET_HDR_FILL(h) \
+do { \
+    MCA_PML_CSUM_RNDV_HDR_FILL((h).hdr_rndv); \
+    (h).hdr_padding[0] = 0; \
+    (h).hdr_padding[1] = 0; \
+    (h).hdr_padding[2] = 0; \
+    (h).hdr_padding[3] = 0; \
+} while(0)
+#else
+#define MCA_PML_CSUM_RGET_HDR_FILL(h)
+#endif  /* OMPI_ENABLE_HETEROGENEOUS_SUPPORT && OMPI_ENABLE_DEBUG */
+
+#define MCA_PML_CSUM_RGET_HDR_NTOH(h) \
+    do { \
+        MCA_PML_CSUM_RNDV_HDR_NTOH((h).hdr_rndv); \
+        (h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
+    } while (0)
+
+#define MCA_PML_CSUM_RGET_HDR_HTON(h) \
+    do { \
+        MCA_PML_CSUM_RNDV_HDR_HTON((h).hdr_rndv); \
+        MCA_PML_CSUM_RGET_HDR_FILL(h); \
+        (h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
+    } while (0)
+
+/**
+ *  Header for subsequent fragments.
+ */
+struct mca_pml_csum_frag_hdr_t {
+    mca_pml_csum_common_hdr_t hdr_common;     /**< common attributes */
+    uint32_t hdr_csum;                        /**< checksum over data */
+    uint64_t hdr_frag_offset;                 /**< offset into message */
+    ompi_ptr_t hdr_src_req;                   /**< pointer to source request */
+    ompi_ptr_t hdr_dst_req;                   /**< pointer to matched receive */
+};
+typedef struct mca_pml_csum_frag_hdr_t mca_pml_csum_frag_hdr_t;
+
+/* The fragment header declares no hdr_padding member (hdr_csum follows the
+ * common header directly), so there is nothing to zero. */
+#define MCA_PML_CSUM_FRAG_HDR_FILL(h)
+
+#define MCA_PML_CSUM_FRAG_HDR_NTOH(h) \
+    do { \
+        MCA_PML_CSUM_COMMON_HDR_NTOH((h).hdr_common); \
+        (h).hdr_csum = ntohl((h).hdr_csum); \
+        (h).hdr_frag_offset = ntoh64((h).hdr_frag_offset); \
+    } while (0)
+
+#define MCA_PML_CSUM_FRAG_HDR_HTON(h) \
+    do { \
+        MCA_PML_CSUM_COMMON_HDR_HTON((h).hdr_common); \
+        (h).hdr_csum = htonl((h).hdr_csum); \
+        MCA_PML_CSUM_FRAG_HDR_FILL(h); \
+        (h).hdr_frag_offset = hton64((h).hdr_frag_offset); \
+    } while (0)
+
+/**
+ *  Header used to acknowledge outstanding fragment(s).
+ */
+
+struct mca_pml_csum_ack_hdr_t {
+    mca_pml_csum_common_hdr_t hdr_common;      /**< common attributes */
+#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
+    /* NOTE(review): the common header is 4 bytes here, so 6 bytes of padding
+     * end at offset 10 rather than on an 8-byte boundary - confirm whether
+     * 4 was intended. */
+    uint8_t hdr_padding[6];
+#endif
+    ompi_ptr_t hdr_src_req;                   /**< source request */
+    ompi_ptr_t hdr_dst_req;                   /**< matched receive request */
+    uint64_t hdr_send_offset;                 /**< starting point of copy in/out */
+};
+typedef struct mca_pml_csum_ack_hdr_t mca_pml_csum_ack_hdr_t;
+
+#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT && OMPI_ENABLE_DEBUG
+#define MCA_PML_CSUM_ACK_HDR_FILL(h) \
+do { \
+    (h).hdr_padding[0] = 0; \
+    (h).hdr_padding[1] = 0; \
+    (h).hdr_padding[2] = 0; \
+    (h).hdr_padding[3] = 0; \
+    (h).hdr_padding[4] = 0; \
+    (h).hdr_padding[5] = 0; \
+} while (0)
+#else
+#define MCA_PML_CSUM_ACK_HDR_FILL(h)
+#endif  /* OMPI_ENABLE_HETEROGENEOUS_SUPPORT && OMPI_ENABLE_DEBUG */
+
+/* Note that the request headers are not put in NBO because the
+   src_req is already in receiver's byte order and the dst_req is not
+   used by the receiver for anything other than backpointers in return
+   headers */
+#define MCA_PML_CSUM_ACK_HDR_NTOH(h) \
+    do { \
+        MCA_PML_CSUM_COMMON_HDR_NTOH((h).hdr_common); \
+        (h).hdr_send_offset = ntoh64((h).hdr_send_offset); \
+    } while (0)
+
+#define MCA_PML_CSUM_ACK_HDR_HTON(h) \
+    do { \
+        MCA_PML_CSUM_COMMON_HDR_HTON((h).hdr_common); \
+        MCA_PML_CSUM_ACK_HDR_FILL(h); \
+        (h).hdr_send_offset = hton64((h).hdr_send_offset); \
+    } while (0)
+
+/**
+ *  Header used to initiate an RDMA operation.
+ */
+
+struct mca_pml_csum_rdma_hdr_t {
+    mca_pml_csum_common_hdr_t hdr_common;      /**< common attributes */
+    uint32_t hdr_seg_cnt;                     /**< number of segments for rdma */
+    ompi_ptr_t hdr_req;                       /**< destination request */
+    ompi_ptr_t hdr_des;                       /**< source descriptor */
+    uint64_t hdr_rdma_offset;                 /**< current offset into user buffer */
+    mca_btl_base_segment_t hdr_segs[1];       /**< list of segments for rdma */
+};
+typedef struct mca_pml_csum_rdma_hdr_t mca_pml_csum_rdma_hdr_t;
+
+/* The RDMA header declares no hdr_padding member (hdr_seg_cnt follows the
+ * common header directly), so there is nothing to zero. */
+#define MCA_PML_CSUM_RDMA_HDR_FILL(h)
+
+#define MCA_PML_CSUM_RDMA_HDR_NTOH(h) \
+    do { \
+        MCA_PML_CSUM_COMMON_HDR_NTOH((h).hdr_common); \
+        (h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
+        (h).hdr_rdma_offset = ntoh64((h).hdr_rdma_offset); \
+    } while (0)
+
+#define MCA_PML_CSUM_RDMA_HDR_HTON(h) \
+    do { \
+        MCA_PML_CSUM_COMMON_HDR_HTON((h).hdr_common); \
+        MCA_PML_CSUM_RDMA_HDR_FILL(h); \
+        (h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
+        (h).hdr_rdma_offset = hton64((h).hdr_rdma_offset); \
+    } while (0)
+
+/**
+ *  Header used to complete an RDMA operation.
+ */
+
+struct mca_pml_csum_fin_hdr_t {
+    mca_pml_csum_common_hdr_t hdr_common;      /**< common attributes */
+#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
+    /* NOTE(review): same remark as for the ack header - 4 + 6 bytes is not a
+     * multiple of 8; confirm the intended padding size. */
+    uint8_t hdr_padding[6];
+#endif
+    ompi_ptr_t hdr_des;                       /**< completed descriptor */
+    uint32_t hdr_fail;                        /**< RDMA operation failed */
+};
+typedef struct mca_pml_csum_fin_hdr_t mca_pml_csum_fin_hdr_t;
+
+#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT && OMPI_ENABLE_DEBUG
+#define MCA_PML_CSUM_FIN_HDR_FILL(h) \
+do { \
+    (h).hdr_padding[0] = 0; \
+    (h).hdr_padding[1] = 0; \
+    (h).hdr_padding[2] = 0; \
+    (h).hdr_padding[3] = 0; \
+    (h).hdr_padding[4] = 0; \
+    (h).hdr_padding[5] = 0; \
+} while (0)
+#else
+#define MCA_PML_CSUM_FIN_HDR_FILL(h)
+#endif  /* OMPI_ENABLE_HETEROGENEOUS_SUPPORT && OMPI_ENABLE_DEBUG */
+
+#define MCA_PML_CSUM_FIN_HDR_NTOH(h) \
+    do { \
+        MCA_PML_CSUM_COMMON_HDR_NTOH((h).hdr_common); \
+    } while (0)
+
+#define MCA_PML_CSUM_FIN_HDR_HTON(h) \
+    do { \
+        MCA_PML_CSUM_COMMON_HDR_HTON((h).hdr_common); \
+        MCA_PML_CSUM_FIN_HDR_FILL(h); \
+    } while (0)
+
+/**
+ * Union of defined hdr types.
+ */
+union mca_pml_csum_hdr_t {
+    mca_pml_csum_common_hdr_t hdr_common;
+    mca_pml_csum_match_hdr_t hdr_match;
+    mca_pml_csum_rendezvous_hdr_t hdr_rndv;
+    mca_pml_csum_rget_hdr_t hdr_rget;
+    mca_pml_csum_frag_hdr_t hdr_frag;
+    mca_pml_csum_ack_hdr_t hdr_ack;
+    mca_pml_csum_rdma_hdr_t hdr_rdma;
+    mca_pml_csum_fin_hdr_t hdr_fin;
+};
+typedef union mca_pml_csum_hdr_t mca_pml_csum_hdr_t;
+
+/* Convert a received header to host byte order, in place. Nothing is done
+ * unless the sender set MCA_PML_CSUM_HDR_FLAGS_NBO. On big-endian hosts, or
+ * without heterogeneous support, the whole call compiles to nothing. Header
+ * types without a case below (NACK, GET) hit the assert. */
+#if !defined(WORDS_BIGENDIAN) && OMPI_ENABLE_HETEROGENEOUS_SUPPORT
+static inline __opal_attribute_always_inline__ void
+csum_hdr_ntoh(mca_pml_csum_hdr_t *hdr, const uint8_t hdr_type)
+{
+    if(!(hdr->hdr_common.hdr_flags & MCA_PML_CSUM_HDR_FLAGS_NBO))
+        return;
+
+    switch(hdr_type) {
+        case MCA_PML_CSUM_HDR_TYPE_MATCH:
+            MCA_PML_CSUM_MATCH_HDR_NTOH(hdr->hdr_match);
+            break;
+        case MCA_PML_CSUM_HDR_TYPE_RNDV:
+            MCA_PML_CSUM_RNDV_HDR_NTOH(hdr->hdr_rndv);
+            break;
+        case MCA_PML_CSUM_HDR_TYPE_RGET:
+            MCA_PML_CSUM_RGET_HDR_NTOH(hdr->hdr_rget);
+            break;
+        case MCA_PML_CSUM_HDR_TYPE_ACK:
+            MCA_PML_CSUM_ACK_HDR_NTOH(hdr->hdr_ack);
+            break;
+        case MCA_PML_CSUM_HDR_TYPE_FRAG:
+            MCA_PML_CSUM_FRAG_HDR_NTOH(hdr->hdr_frag);
+            break;
+        case MCA_PML_CSUM_HDR_TYPE_PUT:
+            MCA_PML_CSUM_RDMA_HDR_NTOH(hdr->hdr_rdma);
+            break;
+        case MCA_PML_CSUM_HDR_TYPE_FIN:
+            MCA_PML_CSUM_FIN_HDR_NTOH(hdr->hdr_fin);
+            break;
+        default:
+            assert(0);
+            break;
+    }
+}
+#else
+#define csum_hdr_ntoh(h, t) do{}while(0)
+#endif
+
+/* Convert a header to network byte order before sending, in place.
+ * Big-endian hosts only set the NBO flag. Little-endian hosts convert, and
+ * set the flag, only when the destination proc is big-endian. Without
+ * heterogeneous support the call compiles to nothing. */
+#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
+#define csum_hdr_hton(h, t, p) \
+    csum_hdr_hton_intr((mca_pml_csum_hdr_t*)(h), (t), (p))
+static inline __opal_attribute_always_inline__ void
+csum_hdr_hton_intr(mca_pml_csum_hdr_t *hdr, const uint8_t hdr_type,
+        const ompi_proc_t *proc)
+{
+#ifdef WORDS_BIGENDIAN
+    hdr->hdr_common.hdr_flags |= MCA_PML_CSUM_HDR_FLAGS_NBO;
+#else
+
+    if(!(proc->proc_arch & OPAL_ARCH_ISBIGENDIAN))
+        return;
+
+    hdr->hdr_common.hdr_flags |= MCA_PML_CSUM_HDR_FLAGS_NBO;
+    switch(hdr_type) {
+        case MCA_PML_CSUM_HDR_TYPE_MATCH:
+            MCA_PML_CSUM_MATCH_HDR_HTON(hdr->hdr_match);
+            break;
+        case MCA_PML_CSUM_HDR_TYPE_RNDV:
+            MCA_PML_CSUM_RNDV_HDR_HTON(hdr->hdr_rndv);
+            break;
+        case MCA_PML_CSUM_HDR_TYPE_RGET:
+            MCA_PML_CSUM_RGET_HDR_HTON(hdr->hdr_rget);
+            break;
+        case MCA_PML_CSUM_HDR_TYPE_ACK:
+            MCA_PML_CSUM_ACK_HDR_HTON(hdr->hdr_ack);
+            break;
+        case MCA_PML_CSUM_HDR_TYPE_FRAG:
+            MCA_PML_CSUM_FRAG_HDR_HTON(hdr->hdr_frag);
+            break;
+        case MCA_PML_CSUM_HDR_TYPE_PUT:
+            MCA_PML_CSUM_RDMA_HDR_HTON(hdr->hdr_rdma);
+            break;
+        case MCA_PML_CSUM_HDR_TYPE_FIN:
+            MCA_PML_CSUM_FIN_HDR_HTON(hdr->hdr_fin);
+            break;
+        default:
+            assert(0);
+            break;
+    }
+#endif
+}
+#else
+#define csum_hdr_hton(h, t, p) do{}while(0)
+#endif
+#endif
diff --git a/ompi/mca/pml/csum/pml_csum_iprobe.c b/ompi/mca/pml/csum/pml_csum_iprobe.c
new file mode 100644
index 0000000000..0ad995a103
--- /dev/null
+++ b/ompi/mca/pml/csum/pml_csum_iprobe.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2009      Sun Microsystems, Inc.  All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "ompi/request/request.h" +#include "pml_csum_recvreq.h" + + +int mca_pml_csum_iprobe(int src, + int tag, + struct ompi_communicator_t *comm, + int *matched, ompi_status_public_t * status) +{ + int rc = OMPI_SUCCESS; + mca_pml_csum_recv_request_t recvreq; + + OBJ_CONSTRUCT( &recvreq, mca_pml_csum_recv_request_t ); + recvreq.req_recv.req_base.req_ompi.req_type = OMPI_REQUEST_PML; + recvreq.req_recv.req_base.req_type = MCA_PML_REQUEST_IPROBE; + + MCA_PML_CSUM_RECV_REQUEST_INIT(&recvreq, NULL, 0, &ompi_mpi_char.dt, src, tag, comm, true); + MCA_PML_CSUM_RECV_REQUEST_START(&recvreq); + + if( recvreq.req_recv.req_base.req_ompi.req_complete == true ) { + if( NULL != status ) { + *status = recvreq.req_recv.req_base.req_ompi.req_status; + } + *matched = 1; + } else { + *matched = 0; + opal_progress(); + } + MCA_PML_BASE_RECV_REQUEST_FINI( &recvreq.req_recv ); + return rc; +} + + +int mca_pml_csum_probe(int src, + int tag, + struct ompi_communicator_t *comm, + ompi_status_public_t * status) +{ + mca_pml_csum_recv_request_t recvreq; + + OBJ_CONSTRUCT( &recvreq, mca_pml_csum_recv_request_t ); + recvreq.req_recv.req_base.req_ompi.req_type = OMPI_REQUEST_PML; + recvreq.req_recv.req_base.req_type = MCA_PML_REQUEST_PROBE; + + MCA_PML_CSUM_RECV_REQUEST_INIT(&recvreq, NULL, 0, &ompi_mpi_char.dt, src, tag, comm, true); + MCA_PML_CSUM_RECV_REQUEST_START(&recvreq); + + ompi_request_wait_completion(&recvreq.req_recv.req_base.req_ompi); + + if (NULL != status) { + *status = recvreq.req_recv.req_base.req_ompi.req_status; + } + MCA_PML_BASE_RECV_REQUEST_FINI( &recvreq.req_recv ); + return OMPI_SUCCESS; +} diff --git a/ompi/mca/pml/csum/pml_csum_irecv.c b/ompi/mca/pml/csum/pml_csum_irecv.c new file mode 100644 index 0000000000..417da9053f --- /dev/null +++ b/ompi/mca/pml/csum/pml_csum_irecv.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University 
and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "ompi/request/request.h" +#include "pml_csum_recvreq.h" +#include "ompi/peruse/peruse-internal.h" + +int mca_pml_csum_irecv_init(void *addr, + size_t count, + ompi_datatype_t * datatype, + int src, + int tag, + struct ompi_communicator_t *comm, + struct ompi_request_t **request) +{ + int rc; + mca_pml_csum_recv_request_t *recvreq; + MCA_PML_CSUM_RECV_REQUEST_ALLOC(recvreq, rc); + if (NULL == recvreq) + return rc; + + MCA_PML_CSUM_RECV_REQUEST_INIT(recvreq, + addr, + count, datatype, src, tag, comm, true); + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &((recvreq)->req_recv.req_base), + PERUSE_RECV); + + *request = (ompi_request_t *) recvreq; + return OMPI_SUCCESS; +} + +int mca_pml_csum_irecv(void *addr, + size_t count, + ompi_datatype_t * datatype, + int src, + int tag, + struct ompi_communicator_t *comm, + struct ompi_request_t **request) +{ + int rc; + + mca_pml_csum_recv_request_t *recvreq; + MCA_PML_CSUM_RECV_REQUEST_ALLOC(recvreq, rc); + if (NULL == recvreq) + return rc; + + MCA_PML_CSUM_RECV_REQUEST_INIT(recvreq, + addr, + count, datatype, src, tag, comm, false); + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &((recvreq)->req_recv.req_base), + PERUSE_RECV); + + MCA_PML_CSUM_RECV_REQUEST_START(recvreq); + *request = (ompi_request_t *) recvreq; + return OMPI_SUCCESS; +} + + +int mca_pml_csum_recv(void 
*addr, + size_t count, + ompi_datatype_t * datatype, + int src, + int tag, + struct ompi_communicator_t *comm, + ompi_status_public_t * status) +{ + int rc; + mca_pml_csum_recv_request_t *recvreq; + MCA_PML_CSUM_RECV_REQUEST_ALLOC(recvreq, rc); + if (NULL == recvreq) + return rc; + + MCA_PML_CSUM_RECV_REQUEST_INIT(recvreq, + addr, + count, datatype, src, tag, comm, false); + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &((recvreq)->req_recv.req_base), + PERUSE_RECV); + + MCA_PML_CSUM_RECV_REQUEST_START(recvreq); + ompi_request_wait_completion(&recvreq->req_recv.req_base.req_ompi); + + if (NULL != status) { /* return status */ + *status = recvreq->req_recv.req_base.req_ompi.req_status; + } + rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR; + ompi_request_free( (ompi_request_t**)&recvreq ); + return rc; +} diff --git a/ompi/mca/pml/csum/pml_csum_isend.c b/ompi/mca/pml/csum/pml_csum_isend.c new file mode 100644 index 0000000000..341263cd79 --- /dev/null +++ b/ompi/mca/pml/csum/pml_csum_isend.c @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2007 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "pml_csum.h" +#include "pml_csum_sendreq.h" +#include "pml_csum_recvreq.h" +#include "ompi/peruse/peruse-internal.h" + +int mca_pml_csum_isend_init(void *buf, + size_t count, + ompi_datatype_t * datatype, + int dst, + int tag, + mca_pml_base_send_mode_t sendmode, + ompi_communicator_t * comm, + ompi_request_t ** request) +{ + int rc; + + mca_pml_csum_send_request_t *sendreq = NULL; + MCA_PML_CSUM_SEND_REQUEST_ALLOC(comm, dst, sendreq, rc); + if (rc != OMPI_SUCCESS) + return rc; + + MCA_PML_CSUM_SEND_REQUEST_INIT(sendreq, + buf, + count, + datatype, + dst, tag, + comm, sendmode, true); + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &(sendreq)->req_send.req_base, + PERUSE_SEND); + + *request = (ompi_request_t *) sendreq; + return OMPI_SUCCESS; +} + + +int mca_pml_csum_isend(void *buf, + size_t count, + ompi_datatype_t * datatype, + int dst, + int tag, + mca_pml_base_send_mode_t sendmode, + ompi_communicator_t * comm, + ompi_request_t ** request) +{ + int rc; + mca_pml_csum_send_request_t *sendreq = NULL; + + MCA_PML_CSUM_SEND_REQUEST_ALLOC(comm, dst, sendreq, rc); + if (rc != OMPI_SUCCESS) + return rc; + + MCA_PML_CSUM_SEND_REQUEST_INIT(sendreq, + buf, + count, + datatype, + dst, tag, + comm, sendmode, false); + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &(sendreq)->req_send.req_base, + PERUSE_SEND); + + MCA_PML_CSUM_SEND_REQUEST_START(sendreq, rc); + *request = (ompi_request_t *) sendreq; + return rc; +} + + +int mca_pml_csum_send(void *buf, + size_t count, + ompi_datatype_t * datatype, + int dst, + int tag, + mca_pml_base_send_mode_t sendmode, + ompi_communicator_t * comm) +{ + int rc; + mca_pml_csum_send_request_t *sendreq; + + MCA_PML_CSUM_SEND_REQUEST_ALLOC(comm, dst, sendreq, rc); + if (rc != OMPI_SUCCESS) + return rc; + + MCA_PML_CSUM_SEND_REQUEST_INIT(sendreq, + buf, + count, + datatype, + dst, tag, + comm, sendmode, 
false); + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &(sendreq)->req_send.req_base, + PERUSE_SEND); + + MCA_PML_CSUM_SEND_REQUEST_START(sendreq, rc); + if (rc != OMPI_SUCCESS) { + MCA_PML_CSUM_SEND_REQUEST_RETURN( sendreq ); + return rc; + } + + ompi_request_wait_completion(&sendreq->req_send.req_base.req_ompi); + + rc = sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR; + ompi_request_free( (ompi_request_t**)&sendreq ); + return rc; +} diff --git a/ompi/mca/pml/csum/pml_csum_progress.c b/ompi/mca/pml/csum/pml_csum_progress.c new file mode 100644 index 0000000000..896e1a4e3f --- /dev/null +++ b/ompi/mca/pml/csum/pml_csum_progress.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2008 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "pml_csum.h" +#include "pml_csum_sendreq.h" +#include "ompi/mca/bml/base/base.h" + +int mca_pml_csum_progress(void) +{ + int i, queue_length = opal_list_get_size(&mca_pml_csum.send_pending); + int j, completed_requests = 0; + bool send_succedded; + + if( OPAL_LIKELY(0 == queue_length) ) + return 0; + + for( i = 0; i < queue_length; i++ ) { + mca_pml_csum_send_pending_t pending_type = MCA_PML_CSUM_SEND_PENDING_NONE; + mca_pml_csum_send_request_t* sendreq; + mca_bml_base_endpoint_t* endpoint; + + sendreq = get_request_from_send_pending(&pending_type); + if(OPAL_UNLIKELY(NULL == sendreq)) + break; + + switch(pending_type) { + case MCA_PML_CSUM_SEND_PENDING_NONE: + assert(0); + return 0; + case MCA_PML_CSUM_SEND_PENDING_SCHEDULE: + if( mca_pml_csum_send_request_schedule_exclusive(sendreq) == + OMPI_ERR_OUT_OF_RESOURCE ) { + return 0; + } + completed_requests++; + break; + case MCA_PML_CSUM_SEND_PENDING_START: + endpoint = sendreq->req_endpoint; + send_succedded = false; + for(j = 0; j < (int)mca_bml_base_btl_array_get_size(&endpoint->btl_eager); j++) { + mca_bml_base_btl_t* bml_btl; + int rc; + + /* select a btl */ + bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); + rc = mca_pml_csum_send_request_start_btl(sendreq, bml_btl); + if( OPAL_LIKELY(OMPI_SUCCESS == rc) ) { + send_succedded = true; + completed_requests++; + break; + } + } + if( false == send_succedded ) { + add_request_to_send_pending(sendreq, MCA_PML_CSUM_SEND_PENDING_START, true); + } + } + } + return completed_requests; +} + diff --git a/ompi/mca/pml/csum/pml_csum_rdma.c b/ompi/mca/pml/csum/pml_csum_rdma.c new file mode 100644 index 0000000000..52c1e3b5e3 --- /dev/null +++ b/ompi/mca/pml/csum/pml_csum_rdma.c @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. 
All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ + +#include "ompi_config.h" +#include "ompi/constants.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/btl/btl.h" +#include "ompi/mca/bml/bml.h" +#include "orte/types.h" +#include "ompi/mca/mpool/mpool.h" +#include "pml_csum.h" +#include "pml_csum_rdma.h" + +/* Use this registration if no registration needed for a BTL instead of NULL. + * This will help other code to distinguish case when memory is not registered + * from case when registration is not needed */ +static mca_mpool_base_registration_t pml_csum_dummy_reg; + +/* + * Check to see if memory is registered or can be registered. Build a + * set of registrations on the request. 
+ */ + +size_t mca_pml_csum_rdma_btls( + mca_bml_base_endpoint_t* bml_endpoint, + unsigned char* base, + size_t size, + mca_pml_csum_com_btl_t* rdma_btls) +{ + int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma); + double weight_total = 0; + int num_btls_used = 0, n; + + /* shortcut when there are no rdma capable btls */ + if(num_btls == 0) { + return 0; + } + + /* check to see if memory is registered */ + for(n = 0; n < num_btls && num_btls_used < mca_pml_csum.max_rdma_per_request; + n++) { + mca_bml_base_btl_t* bml_btl = + mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma, + (bml_endpoint->btl_rdma_index + n) % num_btls); + mca_mpool_base_registration_t* reg = NULL; + mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool; + + if(NULL != btl_mpool) { + if(!mca_pml_csum.leave_pinned) { + /* look through existing registrations */ + btl_mpool->mpool_find(btl_mpool, base, size, ®); + } else { + /* register the memory */ + btl_mpool->mpool_register(btl_mpool, base, size, 0, ®); + } + + if(NULL == reg) + bml_btl = NULL; /* skip it */ + } else { + /* if registration is not required use dummy registration */ + reg = &pml_csum_dummy_reg; + } + + if(bml_btl != NULL) { + rdma_btls[num_btls_used].bml_btl = bml_btl; + rdma_btls[num_btls_used].btl_reg = reg; + weight_total += bml_btl->btl_weight; + num_btls_used++; + } + } + + /* if we don't use leave_pinned and all BTLs that already have this memory + * registered amount to less then half of available bandwidth - fall back to + * pipeline protocol */ + if(0 == num_btls_used || (!mca_pml_csum.leave_pinned && weight_total < 0.5)) + return 0; + + mca_pml_csum_calc_weighted_length(rdma_btls, num_btls_used, size, + weight_total); + + bml_endpoint->btl_rdma_index = (bml_endpoint->btl_rdma_index + 1) % num_btls; + return num_btls_used; +} + +size_t mca_pml_csum_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint, + size_t size, + mca_pml_csum_com_btl_t* rdma_btls ) +{ + int i, num_btls = 
mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma); + double weight_total = 0; + + for(i = 0; i < num_btls && i < mca_pml_csum.max_rdma_per_request; i++) { + rdma_btls[i].bml_btl = + mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma); + if(NULL != rdma_btls[i].bml_btl->btl->btl_mpool) + rdma_btls[i].btl_reg = NULL; + else + rdma_btls[i].btl_reg = &pml_csum_dummy_reg; + + weight_total += rdma_btls[i].bml_btl->btl_weight; + } + + mca_pml_csum_calc_weighted_length(rdma_btls, i, size, weight_total); + + return i; +} diff --git a/ompi/mca/pml/csum/pml_csum_rdma.h b/ompi/mca/pml/csum/pml_csum_rdma.h new file mode 100644 index 0000000000..109ef181ad --- /dev/null +++ b/ompi/mca/pml/csum/pml_csum_rdma.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ + +#ifndef MCA_PML_CSUM_RDMA_H +#define MCA_PML_CSUM_RDMA_H + +struct mca_bml_base_endpoint_t; + +/* + * Of the set of available btls that support RDMA, + * find those that already have registrations - or + * register if required (for leave_pinned option) + */ +size_t mca_pml_csum_rdma_btls(struct mca_bml_base_endpoint_t* endpoint, + unsigned char* base, size_t size, struct mca_pml_csum_com_btl_t* btls); + +/* Choose RDMA BTLs to use for sending of a request by pipeline protocol. 
+ * Calculate number of bytes to send through each BTL according to available + * bandwidth */ +size_t mca_pml_csum_rdma_pipeline_btls(struct mca_bml_base_endpoint_t* endpoint, + size_t size, mca_pml_csum_com_btl_t* rdma_btls); +#endif + diff --git a/ompi/mca/pml/csum/pml_csum_rdmafrag.c b/ompi/mca/pml/csum/pml_csum_rdmafrag.c new file mode 100644 index 0000000000..438efb29e9 --- /dev/null +++ b/ompi/mca/pml/csum/pml_csum_rdmafrag.c @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "pml_csum.h" +#include "pml_csum_rdmafrag.h" + + +OBJ_CLASS_INSTANCE( + mca_pml_csum_rdma_frag_t, + ompi_free_list_item_t, + NULL, + NULL); diff --git a/ompi/mca/pml/csum/pml_csum_rdmafrag.h b/ompi/mca/pml/csum/pml_csum_rdmafrag.h new file mode 100644 index 0000000000..c0e5c55df1 --- /dev/null +++ b/ompi/mca/pml/csum/pml_csum_rdmafrag.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ + +#ifndef MCA_PML_CSUM_RDMAFRAG_H +#define MCA_PML_CSUM_RDMAFRAG_H + +#include "ompi/mca/btl/btl.h" +#include "pml_csum_hdr.h" + +BEGIN_C_DECLS + +typedef enum { + MCA_PML_CSUM_RDMA_PUT, + MCA_PML_CSUM_RDMA_GET +} mca_pml_csum_rdma_state_t; + +struct mca_pml_csum_rdma_frag_t { + ompi_free_list_item_t super; + mca_bml_base_btl_t* rdma_bml; + mca_pml_csum_hdr_t rdma_hdr; + mca_pml_csum_rdma_state_t rdma_state; + size_t rdma_length; + mca_btl_base_segment_t rdma_segs[MCA_BTL_DES_MAX_SEGMENTS]; + void *rdma_req; + struct mca_bml_base_endpoint_t* rdma_ep; + ompi_convertor_t convertor; + mca_mpool_base_registration_t* reg; + uint32_t retries; +}; +typedef struct mca_pml_csum_rdma_frag_t mca_pml_csum_rdma_frag_t; + +OBJ_CLASS_DECLARATION(mca_pml_csum_rdma_frag_t); + + +#define MCA_PML_CSUM_RDMA_FRAG_ALLOC(frag,rc) \ +do { \ + ompi_free_list_item_t* item; \ + OMPI_FREE_LIST_WAIT(&mca_pml_csum.rdma_frags, item, rc); \ + frag = (mca_pml_csum_rdma_frag_t*)item; \ +} while(0) + +#define MCA_PML_CSUM_RDMA_FRAG_RETURN(frag) \ +do { \ + /* return fragment */ \ + OMPI_FREE_LIST_RETURN(&mca_pml_csum.rdma_frags, \ + (ompi_free_list_item_t*)frag); \ +} while(0) + + +END_C_DECLS + +#endif + diff --git a/ompi/mca/pml/csum/pml_csum_recvfrag.c b/ompi/mca/pml/csum/pml_csum_recvfrag.c new file mode 100644 index 0000000000..83cb41a43b --- /dev/null +++ b/ompi/mca/pml/csum/pml_csum_recvfrag.c @@ -0,0 +1,799 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2008 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. 
+ * All rights reserved. + * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2006-2008 University of Houston. All rights reserved. + * Copyright (c) 2009 IBM Corporation. All rights reserved. + * Copyright (c) 2009 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + */ + +#include "ompi_config.h" +#include "ompi/constants.h" + +#include "opal/class/opal_list.h" +#include "opal/util/crc.h" +#include "opal/threads/mutex.h" +#include "opal/prefetch.h" +#include "opal/util/output.h" + +#include "orte/util/name_fns.h" +#include "orte/runtime/orte_globals.h" + +#include "ompi/communicator/communicator.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/pml/base/base.h" +#include "ompi/peruse/peruse-internal.h" +#include "ompi/memchecker.h" +#include "orte/mca/errmgr/errmgr.h" + +#include "pml_csum.h" +#include "pml_csum_comm.h" +#include "pml_csum_recvfrag.h" +#include "pml_csum_recvreq.h" +#include "pml_csum_sendreq.h" +#include "pml_csum_hdr.h" + +OBJ_CLASS_INSTANCE( mca_pml_csum_buffer_t, + ompi_free_list_item_t, + NULL, + NULL ); + +OBJ_CLASS_INSTANCE( mca_pml_csum_recv_frag_t, + opal_list_item_t, + NULL, + NULL ); + +/** + * Static functions. + */ + +/** + * Append a unexpected descriptor to a queue. This function will allocate and + * initialize the fragment (if necessary) and the will added to the specified + * queue. The frag will be updated to the allocated fragment if necessary. 
+ */ +static void +append_frag_to_list(opal_list_t *queue, mca_btl_base_module_t *btl, + mca_pml_csum_match_hdr_t *hdr, mca_btl_base_segment_t* segments, + size_t num_segments, mca_pml_csum_recv_frag_t* frag) +{ + int rc; + + if(NULL == frag) { + MCA_PML_CSUM_RECV_FRAG_ALLOC(frag, rc); + MCA_PML_CSUM_RECV_FRAG_INIT(frag, hdr, segments, num_segments, btl); + } + opal_list_append(queue, (opal_list_item_t*)frag); +} + +/** + * Match incoming recv_frags against posted receives. + * Supports out of order delivery. + * + * @param frag_header (IN) Header of received recv_frag. + * @param frag_desc (IN) Received recv_frag descriptor. + * @param match_made (OUT) Flag indicating wether a match was made. + * @param additional_matches (OUT) List of additional matches + * @return OMPI_SUCCESS or error status on failure. + */ +static int mca_pml_csum_recv_frag_match( mca_btl_base_module_t *btl, + mca_pml_csum_match_hdr_t *hdr, + mca_btl_base_segment_t* segments, + size_t num_segments, + int type); + +static mca_pml_csum_recv_request_t *match_one(mca_btl_base_module_t *btl, + mca_pml_csum_match_hdr_t *hdr, mca_btl_base_segment_t* segments, + size_t num_segments, ompi_communicator_t *comm_ptr, + mca_pml_csum_comm_proc_t *proc, + mca_pml_csum_recv_frag_t* frag); + +void mca_pml_csum_recv_frag_callback_match(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) { + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_csum_match_hdr_t* hdr = (mca_pml_csum_match_hdr_t*)segments->seg_addr.pval; + ompi_communicator_t *comm_ptr; + mca_pml_csum_recv_request_t *match = NULL; + mca_pml_csum_comm_t *comm; + mca_pml_csum_comm_proc_t *proc; + mca_pml_csum_recv_frag_t* frag = NULL; + size_t num_segments = des->des_dst_cnt; + size_t bytes_received = 0; + uint16_t csum_received, csum; + uint32_t csum_data; + bool do_csum = btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM; + + if( OPAL_UNLIKELY(segments->seg_len < OMPI_PML_CSUM_MATCH_HDR_LEN) ) { + 
return; + } + csum_hdr_ntoh(((mca_pml_csum_hdr_t*) hdr), MCA_PML_CSUM_HDR_TYPE_MATCH); + + if (do_csum) { + csum_received = hdr->hdr_common.hdr_csum; + hdr->hdr_common.hdr_csum = 0; + csum = opal_csum16(hdr, sizeof(mca_pml_csum_match_hdr_t)); + hdr->hdr_common.hdr_csum = csum_received; + + if (csum_received != csum) { + opal_output(0, "%s:%s:%d: Invalid \'match header\' - received csum:0x%04x != computed csum:0x%04x\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, csum_received, csum); + orte_errmgr.abort(-1,NULL); + } + } + + /* communicator pointer */ + comm_ptr = ompi_comm_lookup(hdr->hdr_ctx); + if(OPAL_UNLIKELY(NULL == comm_ptr)) { + /* This is a special case. A message for a not yet existing + * communicator can happens. Instead of doing a matching we + * will temporarily add it the a pending queue in the PML. + * Later on, when the communicator is completely instantiated, + * this pending queue will be searched and all matching fragments + * moved to the right communicator. + */ + append_frag_to_list( &mca_pml_csum.non_existing_communicator_pending, + btl, hdr, segments, num_segments, frag ); + return; + } + comm = (mca_pml_csum_comm_t *)comm_ptr->c_pml_comm; + + /* source sequence number */ + proc = &comm->procs[hdr->hdr_src]; + + /* We generate the MSG_ARRIVED event as soon as the PML is aware + * of a matching fragment arrival. Independing if it is received + * on the correct order or not. This will allow the tools to + * figure out if the messages are not received in the correct + * order (if multiple network interfaces). + */ + PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm_ptr, + hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); + + /* get next expected message sequence number - if threaded + * run, lock to make sure that if another thread is processing + * a frag from the same message a match is made only once. 
+ * Also, this prevents other posted receives (for a pair of + * end points) from being processed, and potentially "loosing" + * the fragment. + */ + OPAL_THREAD_LOCK(&comm->matching_lock); + + /* get sequence number of next message that can be processed */ + if(OPAL_UNLIKELY((((uint16_t) hdr->hdr_seq) != ((uint16_t) proc->expected_sequence)) || + (opal_list_get_size(&proc->frags_cant_match) > 0 ))) { + goto slow_path; + } + + /* This is the sequence number we were expecting, so we can try + * matching it to already posted receives. + */ + + /* We're now expecting the next sequence number. */ + proc->expected_sequence++; + + /* We generate the SEARCH_POSTED_QUEUE only when the message is + * received in the correct sequence. Otherwise, we delay the event + * generation until we reach the correct sequence number. + */ + PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_BEGIN, comm_ptr, + hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); + + match = match_one(btl, hdr, segments, num_segments, comm_ptr, proc, frag); + + /* The match is over. We generate the SEARCH_POSTED_Q_END here, + * before going into the mca_pml_csum_check_cantmatch_for_match so + * we can make a difference for the searching time for all + * messages. + */ + PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_END, comm_ptr, + hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); + + /* release matching lock before processing fragment */ + OPAL_THREAD_UNLOCK(&comm->matching_lock); + + if(OPAL_LIKELY(match)) { + bytes_received = segments->seg_len - OMPI_PML_CSUM_MATCH_HDR_LEN; + match->req_recv.req_bytes_packed = bytes_received; + + MCA_PML_CSUM_RECV_REQUEST_MATCHED(match, hdr); + if(match->req_bytes_delivered > 0) { + struct iovec iov[2]; + uint32_t iov_count = 1; + + /* + * Make user buffer accessable(defined) before unpacking. 
+ */ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + match->req_recv.req_base.req_addr, + match->req_recv.req_base.req_count, + match->req_recv.req_base.req_datatype); + ); + + iov[0].iov_len = bytes_received; + iov[0].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments->seg_addr.pval + + OMPI_PML_CSUM_MATCH_HDR_LEN); + while (iov_count < num_segments) { + bytes_received += segments[iov_count].seg_len; + iov[iov_count].iov_len = segments[iov_count].seg_len; + iov[iov_count].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments[iov_count].seg_addr.pval); + iov_count++; + } + ompi_convertor_unpack( &match->req_recv.req_base.req_convertor, + iov, + &iov_count, + &bytes_received ); + match->req_bytes_received = bytes_received; + /* + * Unpacking finished, make the user buffer unaccessable again. + */ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_noaccess, + match->req_recv.req_base.req_addr, + match->req_recv.req_base.req_count, + match->req_recv.req_base.req_datatype); + ); + } + if (do_csum) { + csum_data = (bytes_received > 0) ? match->req_recv.req_base.req_convertor.checksum : 0; + + OPAL_OUTPUT_VERBOSE((1, mca_pml_base_output, + "%s Received \'match\' with data csum:0x%x, header csum:0x%04x, size:%lu\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hdr->hdr_csum, csum, (unsigned long)bytes_received)); + + if (csum_data != hdr->hdr_csum) { + opal_output(0, "%s:%s:%d: Invalid \'match data\' - received csum:0x%x != computed csum:0x%x\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, hdr->hdr_csum, csum_data); + orte_errmgr.abort(-1,NULL); + } + } + + /* no need to check if complete we know we are.. 
*/ + /* don't need a rmb as that is for checking */ + recv_request_pml_complete(match); + } + return; + + slow_path: + OPAL_THREAD_UNLOCK(&comm->matching_lock); + mca_pml_csum_recv_frag_match(btl, hdr, segments, + num_segments, MCA_PML_CSUM_HDR_TYPE_MATCH); +} + + +void mca_pml_csum_recv_frag_callback_rndv(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) { + + + + + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_csum_hdr_t* hdr = (mca_pml_csum_hdr_t*)segments->seg_addr.pval; + uint16_t csum_received, csum; + bool do_csum = btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM; + + if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_csum_common_hdr_t)) ) { + return; + } + csum_hdr_ntoh(hdr, MCA_PML_CSUM_HDR_TYPE_RNDV); + + if (do_csum) { + csum_received = hdr->hdr_common.hdr_csum; + hdr->hdr_common.hdr_csum = 0; + csum = opal_csum16(hdr, sizeof(mca_pml_csum_rendezvous_hdr_t)); + hdr->hdr_common.hdr_csum = csum_received; + if (csum_received != csum) { + opal_output(0, "%s:%s:%d: Invalid \'rndv header\' - received csum:0x%04x != computed csum:0x%04x\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, csum_received, csum); + orte_errmgr.abort(-1,NULL); + } + } + mca_pml_csum_recv_frag_match(btl, &hdr->hdr_match, segments, + des->des_dst_cnt, MCA_PML_CSUM_HDR_TYPE_RNDV); + return; +} + +void mca_pml_csum_recv_frag_callback_rget(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) { + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_csum_hdr_t* hdr = (mca_pml_csum_hdr_t*)segments->seg_addr.pval; + + if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_csum_common_hdr_t)) ) { + return; + } + csum_hdr_ntoh(hdr, MCA_PML_CSUM_HDR_TYPE_RGET); + mca_pml_csum_recv_frag_match(btl, &hdr->hdr_match, segments, + des->des_dst_cnt, MCA_PML_CSUM_HDR_TYPE_RGET); + return; +} + + + +void mca_pml_csum_recv_frag_callback_ack(mca_btl_base_module_t* btl, + 
mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) { + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_csum_hdr_t* hdr = (mca_pml_csum_hdr_t*)segments->seg_addr.pval; + mca_pml_csum_send_request_t* sendreq; + uint16_t csum_received, csum; + bool do_csum = btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM; + + if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_csum_common_hdr_t)) ) { + return; + } + + csum_hdr_ntoh(hdr, MCA_PML_CSUM_HDR_TYPE_ACK); + + if (do_csum) { + csum_received = hdr->hdr_common.hdr_csum; + hdr->hdr_common.hdr_csum = 0; + csum = opal_csum16(hdr, sizeof(mca_pml_csum_ack_hdr_t)); + hdr->hdr_common.hdr_csum = csum_received; + OPAL_OUTPUT_VERBOSE((0, mca_pml_base_output, + "%s Received \'ACK\' with header csum:0x%04x\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), csum)); + if (csum_received != csum) { + opal_output(0, "%s:%s:%d: Invalid \'ACK header\' - received csum:0x%04x != computed csum:0x%04x\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, csum_received, csum); + orte_errmgr.abort(-1,NULL); + } + } + sendreq = (mca_pml_csum_send_request_t*)hdr->hdr_ack.hdr_src_req.pval; + sendreq->req_recv = hdr->hdr_ack.hdr_dst_req; + + /* if the request should be delivered entirely by copy in/out + * then throttle sends */ + if(hdr->hdr_common.hdr_flags & MCA_PML_CSUM_HDR_FLAGS_NORDMA) + sendreq->req_throttle_sends = true; + + mca_pml_csum_send_request_copy_in_out(sendreq, + hdr->hdr_ack.hdr_send_offset, + sendreq->req_send.req_bytes_packed - + hdr->hdr_ack.hdr_send_offset); + + OPAL_THREAD_ADD32(&sendreq->req_state, -1); + + if(send_request_pml_complete_check(sendreq) == false) + mca_pml_csum_send_request_schedule(sendreq); + + return; +} + +void mca_pml_csum_recv_frag_callback_frag(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) { + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_csum_hdr_t* hdr = (mca_pml_csum_hdr_t*)segments->seg_addr.pval; + 
mca_pml_csum_recv_request_t* recvreq; + uint16_t csum_received, csum; + bool do_csum = mca_pml_csum.enable_csum && + (btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM); + + if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_csum_common_hdr_t)) ) { + return; + } + csum_hdr_ntoh(hdr, MCA_PML_CSUM_HDR_TYPE_FRAG); + + if(do_csum) { + csum_received = hdr->hdr_common.hdr_csum; + hdr->hdr_common.hdr_csum = 0; + csum = opal_csum16(hdr, sizeof(mca_pml_csum_frag_hdr_t)); + hdr->hdr_common.hdr_csum = csum_received; + if(csum_received != csum) { + opal_output(0, "%s:%s:%d: Invalid \'frag header\' - received csum:0x%04x != computed csum:0x%04x\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, csum_received, csum); + orte_errmgr.abort(-1,NULL); + } + } + recvreq = (mca_pml_csum_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval; + mca_pml_csum_recv_request_progress_frag(recvreq,btl,segments,des->des_dst_cnt); + + return; +} + + +void mca_pml_csum_recv_frag_callback_put(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) { + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_csum_hdr_t* hdr = (mca_pml_csum_hdr_t*)segments->seg_addr.pval; + mca_pml_csum_send_request_t* sendreq; + uint16_t csum_received, csum; + bool do_csum = mca_pml_csum.enable_csum && + (btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM); + + if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_csum_common_hdr_t)) ) { + return; + } + + csum_hdr_ntoh(hdr, MCA_PML_CSUM_HDR_TYPE_PUT); + + if(do_csum) { + csum_received = hdr->hdr_common.hdr_csum; + hdr->hdr_common.hdr_csum = 0; + csum = opal_csum16(hdr, sizeof(mca_pml_csum_rdma_hdr_t)); + hdr->hdr_common.hdr_csum = csum_received; + OPAL_OUTPUT_VERBOSE((0, mca_pml_base_output, + "%s Received \'PUT\' with header csum:0x%04x\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), csum)); + if(csum_received != csum) { + opal_output(0, "%s:%s:%d: Invalid \'PUT header\' - received csum:0x%04x != computed csum:0x%04x\n", + 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, csum_received, csum); + orte_errmgr.abort(-1,NULL); + } + } + sendreq = (mca_pml_csum_send_request_t*)hdr->hdr_rdma.hdr_req.pval; + mca_pml_csum_send_request_put(sendreq,btl,&hdr->hdr_rdma); + + return; +} + + +void mca_pml_csum_recv_frag_callback_fin(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) { + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_csum_hdr_t* hdr = (mca_pml_csum_hdr_t*)segments->seg_addr.pval; + mca_btl_base_descriptor_t* rdma; + uint16_t csum_received, csum; + bool do_csum = mca_pml_csum.enable_csum && + (btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM); + + if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_csum_common_hdr_t)) ) { + return; + } + + csum_hdr_ntoh(hdr, MCA_PML_CSUM_HDR_TYPE_FIN); + + if(do_csum) { + csum_received = hdr->hdr_common.hdr_csum; + hdr->hdr_common.hdr_csum = 0; + csum = opal_csum16(hdr, sizeof(mca_pml_csum_fin_hdr_t)); + hdr->hdr_common.hdr_csum = csum_received; + OPAL_OUTPUT_VERBOSE((0, mca_pml_base_output, + "%s Received \'FIN\' with header csum:0x%04x\n",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),csum)); + if(csum_received != csum) { + opal_output(0, "%s:%s:%d: Invalid \'FIN header\' - received csum:0x%04x != computed csum:0x%04x\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, csum_received, csum); + orte_errmgr.abort(-1,NULL); + } + } + rdma = (mca_btl_base_descriptor_t*)hdr->hdr_fin.hdr_des.pval; + rdma->des_cbfunc(btl, NULL, rdma, + hdr->hdr_fin.hdr_fail ? 
OMPI_ERROR : OMPI_SUCCESS); + + return; +} + + + +#define PML_MAX_SEQ (~((mca_pml_sequence_t)0)) /* sequence value used when a posted-receive queue has no entry; no trailing ';' so it expands to a plain expression */ + +static inline mca_pml_csum_recv_request_t* get_posted_recv(opal_list_t *queue) +{ /* first request on queue, or NULL when the queue is empty */ + if(opal_list_get_size(queue) == 0) + return NULL; + + return (mca_pml_csum_recv_request_t*)opal_list_get_first(queue); +} + +static inline mca_pml_csum_recv_request_t* get_next_posted_recv( + opal_list_t *queue, + mca_pml_csum_recv_request_t* req) +{ /* request following req on queue, or NULL once the list end is reached */ + opal_list_item_t *i = opal_list_get_next((opal_list_item_t*)req); + + if(opal_list_get_end(queue) == i) + return NULL; + + return (mca_pml_csum_recv_request_t*)i; +} + +static mca_pml_csum_recv_request_t *match_incomming( + mca_pml_csum_match_hdr_t *hdr, mca_pml_csum_comm_t *comm, + mca_pml_csum_comm_proc_t *proc) +{ + mca_pml_csum_recv_request_t *specific_recv, *wild_recv; + mca_pml_sequence_t wild_recv_seq, specific_recv_seq; + int tag = hdr->hdr_tag; + + specific_recv = get_posted_recv(&proc->specific_receives); + wild_recv = get_posted_recv(&comm->wild_receives); + + wild_recv_seq = wild_recv ? + wild_recv->req_recv.req_base.req_sequence : PML_MAX_SEQ; + specific_recv_seq = specific_recv ? 
+ specific_recv->req_recv.req_base.req_sequence : PML_MAX_SEQ; + + /* the two sequences are equal only if both are PML_MAX_SEQ, i.e. neither queue has a candidate left */ + while(wild_recv_seq != specific_recv_seq) { + mca_pml_csum_recv_request_t **match; + opal_list_t *queue; + int req_tag; + mca_pml_sequence_t *seq; + + if (OPAL_UNLIKELY(wild_recv_seq < specific_recv_seq)) { /* examine the candidate with the smaller sequence number first */ + match = &wild_recv; + queue = &comm->wild_receives; + seq = &wild_recv_seq; + } else { + match = &specific_recv; + queue = &proc->specific_receives; + seq = &specific_recv_seq; + } + + req_tag = (*match)->req_recv.req_base.req_tag; + if(req_tag == tag || (req_tag == OMPI_ANY_TAG && tag >= 0)) { /* exact tag, or wildcard receive with a non-negative message tag */ + opal_list_remove_item(queue, (opal_list_item_t*)(*match)); + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_REMOVE_FROM_POSTED_Q, + &((*match)->req_recv.req_base), PERUSE_RECV); + return *match; + } + + *match = get_next_posted_recv(queue, *match); + *seq = (*match) ? (*match)->req_recv.req_base.req_sequence : PML_MAX_SEQ; + } + + return NULL; +} + +static mca_pml_csum_recv_request_t *match_one(mca_btl_base_module_t *btl, + mca_pml_csum_match_hdr_t *hdr, mca_btl_base_segment_t* segments, + size_t num_segments, ompi_communicator_t *comm_ptr, + mca_pml_csum_comm_proc_t *proc, + mca_pml_csum_recv_frag_t* frag) +{ + mca_pml_csum_recv_request_t *match; + mca_pml_csum_comm_t *comm = (mca_pml_csum_comm_t *)comm_ptr->c_pml_comm; + + do { + match = match_incomming(hdr, comm, proc); + + /* no posted receive matched: park the fragment and stop */ + if(OPAL_UNLIKELY(NULL == match)) { + /* place the fragment on the unexpected queue */ + append_frag_to_list(&proc->unexpected_frags, btl, hdr, segments, + num_segments, frag); + PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_INSERT_IN_UNEX_Q, comm_ptr, + hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); + return NULL; + } + + match->req_recv.req_base.req_proc = proc->ompi_proc; + + if(MCA_PML_REQUEST_PROBE == match->req_recv.req_base.req_type) { + /* complete the probe */ + mca_pml_csum_recv_request_matched_probe(match, btl, segments, + num_segments); + /* 
attempt to match actual request */ + continue; + } + + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_MSG_MATCH_POSTED_REQ, + &(match->req_recv.req_base), PERUSE_RECV); + break; + } while(true); + + return match; +} + +static mca_pml_csum_recv_frag_t *check_cantmatch_for_match( + mca_pml_csum_comm_proc_t *proc) +{ + /* local parameters */ + mca_pml_csum_recv_frag_t *frag; + + /* search frags_cant_match for a fragment whose sequence + * number equals proc->expected_sequence + */ + for(frag = (mca_pml_csum_recv_frag_t *) + opal_list_get_first(&proc->frags_cant_match); + frag != (mca_pml_csum_recv_frag_t *) + opal_list_get_end(&proc->frags_cant_match); + frag = (mca_pml_csum_recv_frag_t *) + opal_list_get_next(frag)) + { + mca_pml_csum_match_hdr_t* hdr = &frag->hdr.hdr_match; + /* + * Skip fragments that do not carry the next expected seq from that proc... + */ + if(hdr->hdr_seq != proc->expected_sequence) + continue; + + opal_list_remove_item(&proc->frags_cant_match, (opal_list_item_t*)frag); + return frag; + } + + return NULL; +} + +/** + * RCS/CTS receive side matching + * + * @param hdr list of parameters needed for matching + * This list is also embeded in frag, + * but this allows to save a memory copy when + * a match is made in this routine. (IN) + * @param frag pointer to receive fragment which we want + * to match (IN/OUT). If a match is not made, + * hdr is copied to frag. + * @param match_made parameter indicating if we matched frag/ + * hdr (OUT) + * @param additional_matches if a match is made with frag, we + * may be able to match fragments that previously + * have arrived out-of-order. If this is the + * case, the associated fragment descriptors are + * put on this list for further processing. (OUT) + * + * @return OMPI error code + * + * This routine is used to try and match a newly arrived message fragment + * to pre-posted receives. The following assumptions are made + * - fragments are received out of order + * - for long messages, e.g. 
more than one fragment, a RTS/CTS algorithm + * is used. + * - 2nd and greater fragments include a receive descriptor pointer + * - fragments may be dropped + * - fragments may be corrupt + * - this routine may be called simultaneously by more than one thread + */ +static int mca_pml_csum_recv_frag_match( mca_btl_base_module_t *btl, + mca_pml_csum_match_hdr_t *hdr, + mca_btl_base_segment_t* segments, + size_t num_segments, + int type) +{ + /* local variables */ + uint16_t next_msg_seq_expected, frag_msg_seq; + ompi_communicator_t *comm_ptr; + mca_pml_csum_recv_request_t *match = NULL; + mca_pml_csum_comm_t *comm; + mca_pml_csum_comm_proc_t *proc; + mca_pml_csum_recv_frag_t* frag = NULL; + + /* look up the communicator from the context id in the header */ + comm_ptr = ompi_comm_lookup(hdr->hdr_ctx); + if(OPAL_UNLIKELY(NULL == comm_ptr)) { + /* This is a special case. A message for a not yet existing + * communicator can happen. Instead of matching it, we + * will temporarily add it to a pending queue in the PML. + * Later on, when the communicator is completely instantiated, + * this pending queue will be searched and all matching fragments + * moved to the right communicator. + */ + append_frag_to_list( &mca_pml_csum.non_existing_communicator_pending, + btl, hdr, segments, num_segments, frag ); + return OMPI_SUCCESS; + } + comm = (mca_pml_csum_comm_t *)comm_ptr->c_pml_comm; + + /* source sequence number and per-source matching state */ + frag_msg_seq = hdr->hdr_seq; + proc = &comm->procs[hdr->hdr_src]; + + /** + * We generate the MSG_ARRIVED event as soon as the PML is aware of a matching + * fragment arrival. Independing if it is received on the correct order or not. + * This will allow the tools to figure out if the messages are not received in the + * correct order (if multiple network interfaces). 
+ */ + PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm_ptr, + hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); + + /* get next expected message sequence number - if threaded + * run, lock to make sure that if another thread is processing + * a frag from the same message a match is made only once. + * Also, this prevents other posted receives (for a pair of + * end points) from being processed, and potentially "losing" + * the fragment. + */ + OPAL_THREAD_LOCK(&comm->matching_lock); + + /* get sequence number of next message that can be processed */ + next_msg_seq_expected = (uint16_t)proc->expected_sequence; + if(OPAL_UNLIKELY(frag_msg_seq != next_msg_seq_expected)) + goto wrong_seq; + + /* + * This is the sequence number we were expecting, + * so we can try matching it to already posted + * receives. + */ + +out_of_order_match: + /* We're now expecting the next sequence number. */ + proc->expected_sequence++; + + /** + * We generate the SEARCH_POSTED_QUEUE only when the message is received + * in the correct sequence. Otherwise, we delay the event generation until + * we reach the correct sequence number. + */ + PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_BEGIN, comm_ptr, + hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); + + match = match_one(btl, hdr, segments, num_segments, comm_ptr, proc, frag); + + /** + * The match is over. We generate the SEARCH_POSTED_Q_END here, before going + * into the mca_pml_csum_check_cantmatch_for_match so we can make a difference + * for the searching time for all messages. 
+ */ + PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_END, comm_ptr, + hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); + + /* release matching lock before processing fragment */ + OPAL_THREAD_UNLOCK(&comm->matching_lock); + + if(OPAL_LIKELY(match)) { + switch(type) { /* only MATCH, RNDV and RGET are dispatched; other header types are not handled here */ + case MCA_PML_CSUM_HDR_TYPE_MATCH: + mca_pml_csum_recv_request_progress_match(match, btl, segments, num_segments); + break; + case MCA_PML_CSUM_HDR_TYPE_RNDV: + mca_pml_csum_recv_request_progress_rndv(match, btl, segments, num_segments); + break; + case MCA_PML_CSUM_HDR_TYPE_RGET: + mca_pml_csum_recv_request_progress_rget(match, btl, segments, num_segments); + break; + } + + if(OPAL_UNLIKELY(frag)) + MCA_PML_CSUM_RECV_FRAG_RETURN(frag); + } + + /* + * Now that a new message has arrived, check to see if + * any fragments on the frags_cant_match list + * may now be used to form new matches + */ + if(OPAL_UNLIKELY(opal_list_get_size(&proc->frags_cant_match) > 0)) { + OPAL_THREAD_LOCK(&comm->matching_lock); + if((frag = check_cantmatch_for_match(proc))) { + hdr = &frag->hdr.hdr_match; + segments = frag->segments; + num_segments = frag->num_segments; + btl = frag->btl; + type = hdr->hdr_common.hdr_type; + goto out_of_order_match; + } + OPAL_THREAD_UNLOCK(&comm->matching_lock); + } + + return OMPI_SUCCESS; +wrong_seq: + /* + * This message comes after the next expected, so it + * is ahead of sequence. Save it for later. + */ + append_frag_to_list(&proc->frags_cant_match, btl, hdr, segments, + num_segments, NULL); + OPAL_THREAD_UNLOCK(&comm->matching_lock); + return OMPI_SUCCESS; +} diff --git a/ompi/mca/pml/csum/pml_csum_recvfrag.h b/ompi/mca/pml/csum/pml_csum_recvfrag.h new file mode 100644 index 0000000000..71f711f079 --- /dev/null +++ b/ompi/mca/pml/csum/pml_csum_recvfrag.h @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. 
+ * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2009 IBM Corporation. All rights reserved. + * Copyright (c) 2009 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ + +#ifndef MCA_PML_CSUM_RECVFRAG_H +#define MCA_PML_CSUM_RECVFRAG_H + +#include "ompi/mca/btl/btl.h" +#include "ompi/mca/bml/bml.h" +#include "pml_csum_hdr.h" + +BEGIN_C_DECLS + +struct mca_pml_csum_buffer_t { + size_t len; + void * addr; +}; +typedef struct mca_pml_csum_buffer_t mca_pml_csum_buffer_t; + + +struct mca_pml_csum_recv_frag_t { + ompi_free_list_item_t super; + mca_pml_csum_hdr_t hdr; + size_t num_segments; + mca_btl_base_module_t* btl; + mca_btl_base_segment_t segments[MCA_BTL_DES_MAX_SEGMENTS]; + mca_pml_csum_buffer_t buffers[MCA_BTL_DES_MAX_SEGMENTS]; + unsigned char addr[1]; +}; +typedef struct mca_pml_csum_recv_frag_t mca_pml_csum_recv_frag_t; + +OBJ_CLASS_DECLARATION(mca_pml_csum_recv_frag_t); + + +#define MCA_PML_CSUM_RECV_FRAG_ALLOC(frag,rc) \ +do { \ + ompi_free_list_item_t* item; \ + OMPI_FREE_LIST_WAIT(&mca_pml_csum.recv_frags, item, rc); \ + frag = (mca_pml_csum_recv_frag_t*)item; \ +} while(0) + + +#define MCA_PML_CSUM_RECV_FRAG_INIT(frag, hdr, segs, cnt, btl ) \ +do { \ + size_t i, _size; \ + mca_btl_base_segment_t* macro_segments = frag->segments; \ + mca_pml_csum_buffer_t* buffers = frag->buffers; \ + unsigned char* _ptr = (unsigned char*)frag->addr; \ + /* init recv_frag */ \ + frag->btl = btl; \ + frag->hdr = *(mca_pml_csum_hdr_t*)hdr; \ + frag->num_segments = 1; \ + _size = 
segs[0].seg_len; \ + for( i = 1; i < cnt; i++ ) { \ + _size += segs[i].seg_len; \ + } \ + /* copy over data */ \ + if(_size <= mca_pml_csum.unexpected_limit ) { \ + macro_segments[0].seg_addr.pval = frag->addr; \ + } else { \ + buffers[0].len = _size; \ + buffers[0].addr = (char*) \ + mca_pml_csum.allocator->alc_alloc( mca_pml_csum.allocator, \ + buffers[0].len, \ + 0, NULL); \ + _ptr = (unsigned char*)(buffers[0].addr); \ + macro_segments[0].seg_addr.pval = buffers[0].addr; \ + } \ + macro_segments[0].seg_len = _size; \ + for( i = 0; i < cnt; i++ ) { \ + memcpy( _ptr, segs[i].seg_addr.pval, segs[i].seg_len); \ + _ptr += segs[i].seg_len; \ + } \ + } while(0) + + +#define MCA_PML_CSUM_RECV_FRAG_RETURN(frag) \ +do { \ + if( frag->segments[0].seg_len > mca_pml_csum.unexpected_limit ) { \ + /* return buffers */ \ + mca_pml_csum.allocator->alc_free( mca_pml_csum.allocator, \ + frag->buffers[0].addr ); \ + } \ + frag->num_segments = 0; \ + \ + /* return recv_frag */ \ + OMPI_FREE_LIST_RETURN(&mca_pml_csum.recv_frags, \ + (ompi_free_list_item_t*)frag); \ + } while(0) + + +/** + * Callback from BTL on receipt of a recv_frag (match). + */ + +extern void mca_pml_csum_recv_frag_callback_match( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); + +/** + * Callback from BTL on receipt of a recv_frag (rndv). + */ + +extern void mca_pml_csum_recv_frag_callback_rndv( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); +/** + * Callback from BTL on receipt of a recv_frag (rget). + */ + +extern void mca_pml_csum_recv_frag_callback_rget( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); + +/** + * Callback from BTL on receipt of a recv_frag (ack). 
+ */ + +extern void mca_pml_csum_recv_frag_callback_ack( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); +/** + * Callback from BTL on receipt of a recv_frag (frag). + */ + +extern void mca_pml_csum_recv_frag_callback_frag( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); +/** + * Callback from BTL on receipt of a recv_frag (put). + */ + +extern void mca_pml_csum_recv_frag_callback_put( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); +/** + * Callback from BTL on receipt of a recv_frag (fin). + */ + +extern void mca_pml_csum_recv_frag_callback_fin( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); + + +END_C_DECLS + +#endif + diff --git a/ompi/mca/pml/csum/pml_csum_recvreq.c b/ompi/mca/pml/csum/pml_csum_recvreq.c new file mode 100644 index 0000000000..f5143b042f --- /dev/null +++ b/ompi/mca/pml/csum/pml_csum_recvreq.c @@ -0,0 +1,1110 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2008 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2009 IBM Corporation. All rights reserved. + * Copyright (c) 2009 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "opal/util/arch.h" +#include "opal/util/crc.h" +#include "opal/util/output.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/util/name_fns.h" +#include "orte/runtime/orte_globals.h" + +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/bml/bml.h" +#include "ompi/mca/btl/btl.h" +#include "ompi/mca/mpool/mpool.h" +#include "ompi/mca/bml/base/base.h" +#include "ompi/memchecker.h" +#include "ompi/mca/pml/base/base.h" + +#include "pml_csum_comm.h" +#include "pml_csum_recvreq.h" +#include "pml_csum_recvfrag.h" +#include "pml_csum_sendreq.h" +#include "pml_csum_rdmafrag.h" + +void mca_pml_csum_recv_request_process_pending(void) +{ + mca_pml_csum_recv_request_t* recvreq; + int i, s = (int)opal_list_get_size(&mca_pml_csum.recv_pending); + + for(i = 0; i < s; i++) { + OPAL_THREAD_LOCK(&mca_pml_csum.lock); + recvreq = (mca_pml_csum_recv_request_t*) + opal_list_remove_first(&mca_pml_csum.recv_pending); + OPAL_THREAD_UNLOCK(&mca_pml_csum.lock); + if( OPAL_UNLIKELY(NULL == recvreq) ) + break; + recvreq->req_pending = false; + if(mca_pml_csum_recv_request_schedule_exclusive(recvreq, NULL) == + OMPI_ERR_OUT_OF_RESOURCE) + break; + } +} + +static int mca_pml_csum_recv_request_free(struct ompi_request_t** request) +{ + mca_pml_csum_recv_request_t* recvreq = *(mca_pml_csum_recv_request_t**)request; + + assert( false == recvreq->req_recv.req_base.req_free_called ); + + OPAL_THREAD_LOCK(&ompi_request_lock); + recvreq->req_recv.req_base.req_free_called = true; + + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_NOTIFY, + &(recvreq->req_recv.req_base), PERUSE_RECV ); + + if( true == recvreq->req_recv.req_base.req_pml_complete ) { + /* make buffer defined when the request is compeleted, + and before releasing the objects. 
*/ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + recvreq->req_recv.req_base.req_addr, + recvreq->req_recv.req_base.req_count, + recvreq->req_recv.req_base.req_datatype); + ); + + MCA_PML_CSUM_RECV_REQUEST_RETURN( recvreq ); + } + + OPAL_THREAD_UNLOCK(&ompi_request_lock); + *request = MPI_REQUEST_NULL; + return OMPI_SUCCESS; +} + +static int mca_pml_csum_recv_request_cancel(struct ompi_request_t* ompi_request, int complete) +{ + mca_pml_csum_recv_request_t* request = (mca_pml_csum_recv_request_t*)ompi_request; + mca_pml_csum_comm_t* comm = request->req_recv.req_base.req_comm->c_pml_comm; + + if( true == ompi_request->req_complete ) { /* way too late to cancel this one */ + /* + * Receive request completed, make user buffer accessible. + */ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + request->req_recv.req_base.req_addr, + request->req_recv.req_base.req_count, + request->req_recv.req_base.req_datatype); + ); + return OMPI_SUCCESS; + } + + /* The rest should be protected behind the match logic lock */ + OPAL_THREAD_LOCK(&comm->matching_lock); + if( OMPI_ANY_TAG == ompi_request->req_status.MPI_TAG ) { /* the match has not been done yet */ + if( request->req_recv.req_base.req_peer == OMPI_ANY_SOURCE ) { + opal_list_remove_item( &comm->wild_receives, (opal_list_item_t*)request ); + } else { + mca_pml_csum_comm_proc_t* proc = comm->procs + request->req_recv.req_base.req_peer; + opal_list_remove_item(&proc->specific_receives, (opal_list_item_t*)request); + } + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_REMOVE_FROM_POSTED_Q, + &(request->req_recv.req_base), PERUSE_RECV ); + /** + * As now the PML is done with this request we have to force the pml_complete + * to true. Otherwise, the request will never be freed. 
+ */ + request->req_recv.req_base.req_pml_complete = true; + } + OPAL_THREAD_UNLOCK(&comm->matching_lock); + + OPAL_THREAD_LOCK(&ompi_request_lock); + ompi_request->req_status._cancelled = true; + /* This macro will set the req_complete to true so the MPI Test/Wait* functions + * on this request will be able to complete. As the status is marked as + * cancelled the cancel state will be detected. + */ + MCA_PML_CSUM_RECV_REQUEST_MPI_COMPLETE(request); + OPAL_THREAD_UNLOCK(&ompi_request_lock); + /* + * Receive request cancelled, make user buffer accessable. + */ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + request->req_recv.req_base.req_addr, + request->req_recv.req_base.req_count, + request->req_recv.req_base.req_datatype); + ); + return OMPI_SUCCESS; +} + +static void mca_pml_csum_recv_request_construct(mca_pml_csum_recv_request_t* request) +{ + request->req_recv.req_base.req_type = MCA_PML_REQUEST_RECV; + request->req_recv.req_base.req_ompi.req_free = mca_pml_csum_recv_request_free; + request->req_recv.req_base.req_ompi.req_cancel = mca_pml_csum_recv_request_cancel; + request->req_rdma_cnt = 0; + OBJ_CONSTRUCT(&request->lock, opal_mutex_t); +} + +OBJ_CLASS_INSTANCE( + mca_pml_csum_recv_request_t, + mca_pml_base_recv_request_t, + mca_pml_csum_recv_request_construct, + NULL); + + +/* + * Release resources. 
+ */ + +static void mca_pml_csum_recv_ctl_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ + mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; + + MCA_PML_CSUM_PROGRESS_PENDING(bml_btl); +} + +/* + * Put operation has completed remotely - update request status + */ + +static void mca_pml_csum_put_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ + mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; + mca_pml_csum_recv_request_t* recvreq = (mca_pml_csum_recv_request_t*)des->des_cbdata; + size_t bytes_received = 0; + + if( OPAL_LIKELY(status == OMPI_SUCCESS) ) { + MCA_PML_CSUM_COMPUTE_SEGMENT_LENGTH( des->des_dst, des->des_dst_cnt, + 0, bytes_received ); + } + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,-1); + + mca_bml_base_free(bml_btl, des); + + /* check completion status */ + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received); + if(recv_request_pml_complete_check(recvreq) == false && + recvreq->req_rdma_offset < recvreq->req_send_offset) { + /* schedule additional rdma operations */ + mca_pml_csum_recv_request_schedule(recvreq, bml_btl); + } + MCA_PML_CSUM_PROGRESS_PENDING(bml_btl); +} + +/* + * + */ + +int mca_pml_csum_recv_request_ack_send_btl( + ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl, + uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset, + bool nordma) +{ + mca_btl_base_descriptor_t* des; + mca_pml_csum_ack_hdr_t* ack; + int rc; + bool do_csum = bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM; + + /* allocate descriptor */ + mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, + sizeof(mca_pml_csum_ack_hdr_t), + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + if( OPAL_UNLIKELY(NULL == des) ) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* fill out header */ + ack = 
(mca_pml_csum_ack_hdr_t*)des->des_src->seg_addr.pval; + ack->hdr_common.hdr_type = MCA_PML_CSUM_HDR_TYPE_ACK; + ack->hdr_common.hdr_flags = nordma ? MCA_PML_CSUM_HDR_FLAGS_NORDMA : 0; + ack->hdr_common.hdr_csum = 0; + ack->hdr_src_req.lval = hdr_src_req; + ack->hdr_dst_req.pval = hdr_dst_req; + ack->hdr_send_offset = hdr_send_offset; + ack->hdr_common.hdr_csum = (do_csum ? + opal_csum16(ack, sizeof(mca_pml_csum_ack_hdr_t)) : OPAL_CSUM_ZERO); + + OPAL_OUTPUT_VERBOSE((1, mca_pml_base_output, + "%s Sending \'ACK\' with header csum:0x%04x\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ack->hdr_common.hdr_csum)); + + csum_hdr_hton(ack, MCA_PML_CSUM_HDR_TYPE_ACK, proc); + + /* initialize descriptor */ + des->des_cbfunc = mca_pml_csum_recv_ctl_completion; + + rc = mca_bml_base_send(bml_btl, des, MCA_PML_CSUM_HDR_TYPE_ACK); + if( OPAL_LIKELY( rc >= 0 ) ) { + if( OPAL_LIKELY( 1 == rc ) ) { + MCA_PML_CSUM_PROGRESS_PENDING(bml_btl); + } + return OMPI_SUCCESS; + } + mca_bml_base_free(bml_btl, des); + return OMPI_ERR_OUT_OF_RESOURCE; +} + +static int mca_pml_csum_recv_request_ack( + mca_pml_csum_recv_request_t* recvreq, + mca_pml_csum_rendezvous_hdr_t* hdr, + size_t bytes_received) +{ + ompi_proc_t* proc = (ompi_proc_t*)recvreq->req_recv.req_base.req_proc; + mca_bml_base_endpoint_t* bml_endpoint = NULL; + + bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; + + /* by default copy everything */ + recvreq->req_send_offset = bytes_received; + if(hdr->hdr_msg_length > bytes_received) { + size_t rdma_num = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma); + /* + * lookup request buffer to determine if memory is already + * registered. 
+ */ + + if(ompi_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == 0 && + hdr->hdr_match.hdr_common.hdr_flags & MCA_PML_CSUM_HDR_FLAGS_CONTIG && + rdma_num != 0) { + unsigned char *base; + ompi_convertor_get_current_pointer( &recvreq->req_recv.req_base.req_convertor, (void**)&(base) ); + + if(hdr->hdr_match.hdr_common.hdr_flags & MCA_PML_CSUM_HDR_FLAGS_PIN) + recvreq->req_rdma_cnt = mca_pml_csum_rdma_btls(bml_endpoint, + base, recvreq->req_recv.req_bytes_packed, + recvreq->req_rdma ); + else + recvreq->req_rdma_cnt = 0; + + /* memory is already registered on both sides */ + if (recvreq->req_rdma_cnt != 0) { + recvreq->req_send_offset = hdr->hdr_msg_length; + /* are rdma devices available for long rdma protocol */ + } else if(bml_endpoint->btl_send_limit < hdr->hdr_msg_length) { + /* use convertor to figure out the rdma offset for this request */ + recvreq->req_send_offset = hdr->hdr_msg_length - + bml_endpoint->btl_pipeline_send_length; + + if(recvreq->req_send_offset < bytes_received) + recvreq->req_send_offset = bytes_received; + + /* use convertor to figure out the rdma offset for this + * request */ + ompi_convertor_set_position(&recvreq->req_recv.req_base.req_convertor, + &recvreq->req_send_offset); + + recvreq->req_rdma_cnt = + mca_pml_csum_rdma_pipeline_btls(bml_endpoint, + recvreq->req_send_offset - bytes_received, + recvreq->req_rdma); + } + } + /* nothing to send by copy in/out - no need to ack */ + if(recvreq->req_send_offset == hdr->hdr_msg_length) + return OMPI_SUCCESS; + } + /* let the schedule function know there is no need to set the ACK flag */ + recvreq->req_ack_sent = true; + return mca_pml_csum_recv_request_ack_send(proc, hdr->hdr_src_req.lval, + recvreq, recvreq->req_send_offset, + recvreq->req_send_offset == bytes_received); +} + + +/** + * Return resources used by the RDMA + */ + +static void mca_pml_csum_rget_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + 
int status ) +{ + mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; + mca_pml_csum_rdma_frag_t* frag = (mca_pml_csum_rdma_frag_t*)des->des_cbdata; + mca_pml_csum_recv_request_t* recvreq = (mca_pml_csum_recv_request_t*)frag->rdma_req; + + /* check completion status */ + if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { + /* TSW - FIX */ + ORTE_ERROR_LOG(status); + orte_errmgr.abort(-1, NULL); + } + + mca_pml_csum_send_fin(recvreq->req_recv.req_base.req_proc, + bml_btl, + frag->rdma_hdr.hdr_rget.hdr_des.pval, + des->order, 0); + + /* is receive request complete */ + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length); + recv_request_pml_complete_check(recvreq); + + MCA_PML_CSUM_RDMA_FRAG_RETURN(frag); + + MCA_PML_CSUM_PROGRESS_PENDING(bml_btl); +} + + +/* + * + */ +int mca_pml_csum_recv_request_get_frag( mca_pml_csum_rdma_frag_t* frag ) +{ + mca_pml_csum_recv_request_t* recvreq = (mca_pml_csum_recv_request_t*)frag->rdma_req; + mca_bml_base_btl_t* bml_btl = frag->rdma_bml; + mca_btl_base_descriptor_t* descriptor; + size_t save_size = frag->rdma_length; + int rc; + + /* prepare descriptor */ + mca_bml_base_prepare_dst( bml_btl, + NULL, + &recvreq->req_recv.req_base.req_convertor, + MCA_BTL_NO_ORDER, + 0, + &frag->rdma_length, + MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, + &descriptor ); + if( OPAL_UNLIKELY(NULL == descriptor) ) { + frag->rdma_length = save_size; + OPAL_THREAD_LOCK(&mca_pml_csum.lock); + opal_list_append(&mca_pml_csum.rdma_pending, (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&mca_pml_csum.lock); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + descriptor->des_src = frag->rdma_segs; + descriptor->des_src_cnt = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt; + descriptor->des_cbfunc = mca_pml_csum_rget_completion; + descriptor->des_cbdata = frag; + + PERUSE_TRACE_COMM_OMPI_EVENT(PERUSE_COMM_REQ_XFER_CONTINUE, + &(recvreq->req_recv.req_base), + frag->rdma_length, PERUSE_RECV); + + /* queue up get request */ + rc = 
mca_bml_base_get(bml_btl,descriptor); + if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { + if(OMPI_ERR_OUT_OF_RESOURCE == rc) { + mca_bml_base_free(bml_btl, descriptor); + OPAL_THREAD_LOCK(&mca_pml_csum.lock); + opal_list_append(&mca_pml_csum.rdma_pending, + (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&mca_pml_csum.lock); + return OMPI_ERR_OUT_OF_RESOURCE; + } else { + ORTE_ERROR_LOG(rc); + orte_errmgr.abort(-1, NULL); + } + } + + return OMPI_SUCCESS; +} + + + + +/* + * Update the recv request status to reflect the number of bytes + * received and actually delivered to the application. + */ + +void mca_pml_csum_recv_request_progress_frag( mca_pml_csum_recv_request_t* recvreq, + mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments ) +{ + size_t bytes_received = 0; + size_t bytes_delivered __opal_attribute_unused__; /* is being set to zero in MCA_PML_CSUM_RECV_REQUEST_UNPACK */ + size_t data_offset = 0; + mca_pml_csum_hdr_t* hdr = (mca_pml_csum_hdr_t*)segments->seg_addr.pval; + uint32_t csum = OPAL_CSUM_ZERO; + bool do_csum = btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM; + + MCA_PML_CSUM_COMPUTE_SEGMENT_LENGTH( segments, num_segments, + 0, bytes_received ); + bytes_received -= sizeof(mca_pml_csum_frag_hdr_t); + data_offset = hdr->hdr_frag.hdr_frag_offset; + /* + * Make user buffer accessable(defined) before unpacking. + */ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + recvreq->req_recv.req_base.req_addr, + recvreq->req_recv.req_base.req_count, + recvreq->req_recv.req_base.req_datatype); + ); + MCA_PML_CSUM_RECV_REQUEST_UNPACK( recvreq, + segments, + num_segments, + sizeof(mca_pml_csum_frag_hdr_t), + data_offset, + bytes_received, + bytes_delivered ); + /* + * Unpacking finished, make the user buffer unaccessable again. 
+ */ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_noaccess, + recvreq->req_recv.req_base.req_addr, + recvreq->req_recv.req_base.req_count, + recvreq->req_recv.req_base.req_datatype); + ); + + if (do_csum) { + csum = (bytes_received > 0)? + recvreq->req_recv.req_base.req_convertor.checksum : 0; + OPAL_OUTPUT_VERBOSE((0, mca_pml_base_output, + "%s Received \'frag\' with data csum:0x%x, header csum:0x%04x, size:%lu\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), csum, hdr->hdr_common.hdr_csum, (unsigned long)bytes_received)); + if(csum != hdr->hdr_frag.hdr_csum) { + opal_output(0, "%s:%s:%d: Invalid \'frag data\' - received csum:0x%x != computed csum:0x%x\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, hdr->hdr_frag.hdr_csum, csum); + orte_errmgr.abort(-1,NULL); + } + } + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received); + /* check completion status */ + if(recv_request_pml_complete_check(recvreq) == false && + recvreq->req_rdma_offset < recvreq->req_send_offset) { + /* schedule additional rdma operations */ + mca_pml_csum_recv_request_schedule(recvreq, NULL); + } +} + +/* + * Update the recv request status to reflect the number of bytes + * received and actually delivered to the application. 
+ */ + +void mca_pml_csum_recv_request_progress_rget( mca_pml_csum_recv_request_t* recvreq, + mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments ) +{ + size_t bytes_received = 0; + mca_pml_csum_rget_hdr_t* hdr = (mca_pml_csum_rget_hdr_t*)segments->seg_addr.pval; + mca_bml_base_endpoint_t* bml_endpoint = NULL; + mca_pml_csum_rdma_frag_t* frag; + size_t i, size = 0; + int rc; + + MCA_PML_CSUM_COMPUTE_SEGMENT_LENGTH( segments, num_segments, + 0, bytes_received ); + recvreq->req_recv.req_bytes_packed = hdr->hdr_rndv.hdr_msg_length; + + MCA_PML_CSUM_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_rndv.hdr_match); + + + /* if receive buffer is not contiguous we can't just RDMA read into it, so + * fall back to copy in/out protocol. It is a pity because buffer on the + * sender side is already registered. We need to be smarter here, perhaps + * do couple of RDMA reads */ + if(ompi_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) { + mca_pml_csum_recv_request_ack(recvreq, &hdr->hdr_rndv, 0); + return; + } + + MCA_PML_CSUM_RDMA_FRAG_ALLOC(frag,rc); + if( OPAL_UNLIKELY(NULL == frag) ) { + /* GLB - FIX */ + ORTE_ERROR_LOG(rc); + orte_errmgr.abort(-1, NULL); + } + + /* lookup bml datastructures */ + bml_endpoint = (mca_bml_base_endpoint_t*)recvreq->req_recv.req_base.req_proc->proc_bml; + + /* allocate/initialize a fragment */ + for(i = 0; i < hdr->hdr_seg_cnt; i++) { + frag->rdma_segs[i] = hdr->hdr_segs[i]; +#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT + if ((recvreq->req_recv.req_base.req_proc->proc_arch & OPAL_ARCH_ISBIGENDIAN) != + (ompi_proc_local()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) { + size += opal_swap_bytes4(hdr->hdr_segs[i].seg_len); + } else +#endif + { + size += hdr->hdr_segs[i].seg_len; + } + } + frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl); + if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) { + opal_output(0, "[%s:%d] invalid bml for rdma get", __FILE__, __LINE__); + orte_errmgr.abort(-1, 
NULL); + } + frag->rdma_hdr.hdr_rget = *hdr; + frag->rdma_req = recvreq; + frag->rdma_ep = bml_endpoint; + frag->rdma_length = size; + frag->rdma_state = MCA_PML_CSUM_RDMA_GET; + frag->reg = NULL; + + mca_pml_csum_recv_request_get_frag(frag); + return; +} + +/* + * Update the recv request status to reflect the number of bytes + * received and actually delivered to the application. + */ + +void mca_pml_csum_recv_request_progress_rndv( mca_pml_csum_recv_request_t* recvreq, + mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments ) +{ + size_t bytes_received = 0; + size_t bytes_delivered __opal_attribute_unused__; /* is being set to zero in MCA_PML_CSUM_RECV_REQUEST_UNPACK */ + size_t data_offset = 0; + mca_pml_csum_hdr_t* hdr = (mca_pml_csum_hdr_t*)segments->seg_addr.pval; + uint32_t csum = OPAL_CSUM_ZERO; + bool do_csum = btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM; + + MCA_PML_CSUM_COMPUTE_SEGMENT_LENGTH( segments, num_segments, + 0, bytes_received ); + + bytes_received -= sizeof(mca_pml_csum_rendezvous_hdr_t); + recvreq->req_recv.req_bytes_packed = hdr->hdr_rndv.hdr_msg_length; + recvreq->remote_req_send = hdr->hdr_rndv.hdr_src_req; + recvreq->req_rdma_offset = bytes_received; + MCA_PML_CSUM_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_match); + mca_pml_csum_recv_request_ack(recvreq, &hdr->hdr_rndv, bytes_received); + /** + * The PUT protocol do not attach any data to the original request. + * Therefore, we might want to avoid unpacking if there is nothing to + * unpack. 
+ */ + if( 0 < bytes_received ) { + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + recvreq->req_recv.req_base.req_addr, + recvreq->req_recv.req_base.req_count, + recvreq->req_recv.req_base.req_datatype); + ); + MCA_PML_CSUM_RECV_REQUEST_UNPACK( recvreq, + segments, + num_segments, + sizeof(mca_pml_csum_rendezvous_hdr_t), + data_offset, + bytes_received, + bytes_delivered ); + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_noaccess, + recvreq->req_recv.req_base.req_addr, + recvreq->req_recv.req_base.req_count, + recvreq->req_recv.req_base.req_datatype); + ); + } + if (do_csum) { + csum = (bytes_received > 0)? + recvreq->req_recv.req_base.req_convertor.checksum : 0; + OPAL_OUTPUT_VERBOSE((1, mca_pml_base_output, + "%s Received \'rndv\' with data csum:0x%x, header csum:0x%04x, size:%lu\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), csum, hdr->hdr_common.hdr_csum, (unsigned long)bytes_received)); + if (csum != hdr->hdr_match.hdr_csum) { + opal_output(0, "%s:%s:%d: Invalid \'rndv data\' - received csum:0x%x != computed csum:0x%x\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, hdr->hdr_match.hdr_csum, csum); + orte_errmgr.abort(-1,NULL); + } + } + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received); + /* check completion status */ + if(recv_request_pml_complete_check(recvreq) == false && + recvreq->req_rdma_offset < recvreq->req_send_offset) { + /* schedule additional rdma operations */ + mca_pml_csum_recv_request_schedule(recvreq, NULL); + } +} + +/* + * Update the recv request status to reflect the number of bytes + * received and actually delivered to the application. 
+ */ +void mca_pml_csum_recv_request_progress_match( mca_pml_csum_recv_request_t* recvreq, + mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments ) +{ + size_t bytes_received = 0; + size_t bytes_delivered __opal_attribute_unused__; /* is being set to zero in MCA_PML_CSUM_RECV_REQUEST_UNPACK */ + size_t data_offset = 0; + mca_pml_csum_hdr_t* hdr = (mca_pml_csum_hdr_t*)segments->seg_addr.pval; + uint32_t csum = OPAL_CSUM_ZERO; + bool do_csum = btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM; + + MCA_PML_CSUM_COMPUTE_SEGMENT_LENGTH( segments, num_segments, + 0, bytes_received ); + bytes_received -= OMPI_PML_CSUM_MATCH_HDR_LEN; + recvreq->req_recv.req_bytes_packed = bytes_received; + + MCA_PML_CSUM_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_match); + /* + * Make user buffer accessable(defined) before unpacking. + */ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + recvreq->req_recv.req_base.req_addr, + recvreq->req_recv.req_base.req_count, + recvreq->req_recv.req_base.req_datatype); + ); + MCA_PML_CSUM_RECV_REQUEST_UNPACK( recvreq, + segments, + num_segments, + OMPI_PML_CSUM_MATCH_HDR_LEN, + data_offset, + bytes_received, + bytes_delivered); + if (do_csum) { + csum = (bytes_received > 0) ? recvreq->req_recv.req_base.req_convertor.checksum : 0; + OPAL_OUTPUT_VERBOSE((1, mca_pml_base_output, + "%s Received \'match\' with data csum:0x%x, header csum:0x%04x, size:%lu\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), csum, hdr->hdr_common.hdr_csum, (unsigned long)bytes_received)); + if (csum != hdr->hdr_match.hdr_csum) { + opal_output(0, "%s:%s:%d: Invalid \'match data\' - received csum:0x%x != computed csum:0x%x\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, hdr->hdr_match.hdr_csum, csum); + orte_errmgr.abort(-1,NULL); + } + } + + /* + * Unpacking finished, make the user buffer unaccessable again. 
+ */ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_noaccess, + recvreq->req_recv.req_base.req_addr, + recvreq->req_recv.req_base.req_count, + recvreq->req_recv.req_base.req_datatype); + ); + + /* + * No need for atomic here, as we know there is only one fragment + * for this request. + */ + recvreq->req_bytes_received += bytes_received; + recv_request_pml_complete(recvreq); +} + + +/** + * Handle completion of a probe request + */ + +void mca_pml_csum_recv_request_matched_probe( mca_pml_csum_recv_request_t* recvreq, + mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments ) +{ + size_t bytes_packed = 0; + mca_pml_csum_hdr_t* hdr = (mca_pml_csum_hdr_t*)segments->seg_addr.pval; + + switch(hdr->hdr_common.hdr_type) { + case MCA_PML_CSUM_HDR_TYPE_MATCH: + + MCA_PML_CSUM_COMPUTE_SEGMENT_LENGTH( segments, num_segments, + OMPI_PML_CSUM_MATCH_HDR_LEN, + bytes_packed ); + break; + + case MCA_PML_CSUM_HDR_TYPE_RNDV: + case MCA_PML_CSUM_HDR_TYPE_RGET: + + bytes_packed = hdr->hdr_rndv.hdr_msg_length; + break; + } + + /* set completion status */ + recvreq->req_recv.req_base.req_ompi.req_status.MPI_TAG = hdr->hdr_match.hdr_tag; + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE = hdr->hdr_match.hdr_src; + recvreq->req_bytes_received = bytes_packed; + recvreq->req_bytes_delivered = bytes_packed; + recv_request_pml_complete(recvreq); +} + + +/* + * Schedule RDMA protocol. 
+ * +*/ + +int mca_pml_csum_recv_request_schedule_once( mca_pml_csum_recv_request_t* recvreq, + mca_bml_base_btl_t *start_bml_btl ) +{ + mca_bml_base_btl_t* bml_btl; + int num_tries = recvreq->req_rdma_cnt, num_fail = 0; + size_t i, prev_bytes_remaining = 0; + size_t bytes_remaining = recvreq->req_send_offset - + recvreq->req_rdma_offset; + bool do_csum; + + /* if starting bml_btl is provided schedule next fragment on it first */ + if(start_bml_btl != NULL) { + for(i = 0; i < recvreq->req_rdma_cnt; i++) { + if(recvreq->req_rdma[i].bml_btl != start_bml_btl) + continue; + /* something left to be send? */ + if( OPAL_LIKELY(recvreq->req_rdma[i].length) ) + recvreq->req_rdma_idx = i; + break; + } + } + + while(bytes_remaining > 0 && + recvreq->req_pipeline_depth < mca_pml_csum.recv_pipeline_depth) { + size_t hdr_size; + size_t size; + mca_pml_csum_rdma_hdr_t* hdr; + mca_btl_base_descriptor_t* dst; + mca_btl_base_descriptor_t* ctl; + mca_mpool_base_registration_t * reg = NULL; + mca_btl_base_module_t* btl; + int rc, rdma_idx; + + if(prev_bytes_remaining == bytes_remaining) { + if(++num_fail == num_tries) { + OPAL_THREAD_LOCK(&mca_pml_csum.lock); + if(false == recvreq->req_pending) { + opal_list_append(&mca_pml_csum.recv_pending, + (opal_list_item_t*)recvreq); + recvreq->req_pending = true; + } + OPAL_THREAD_UNLOCK(&mca_pml_csum.lock); + return OMPI_ERR_OUT_OF_RESOURCE; + } + } else { + num_fail = 0; + prev_bytes_remaining = bytes_remaining; + } + + do { + rdma_idx = recvreq->req_rdma_idx; + bml_btl = recvreq->req_rdma[rdma_idx].bml_btl; + reg = recvreq->req_rdma[rdma_idx].btl_reg; + size = recvreq->req_rdma[rdma_idx].length; + if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt) + recvreq->req_rdma_idx = 0; + } while(!size); + btl = bml_btl->btl; + + /* makes sure that we don't exceed BTL max rdma size + * if memory is not pinned already */ + if( (NULL == reg) && (btl->btl_rdma_pipeline_frag_size != 0) && + (size > btl->btl_rdma_pipeline_frag_size)) { + size = 
btl->btl_rdma_pipeline_frag_size; + } + + /* take lock to protect converter against concurrent access + * from unpack */ + OPAL_THREAD_LOCK(&recvreq->lock); + ompi_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, + &recvreq->req_rdma_offset ); + + /* prepare a descriptor for RDMA */ + mca_bml_base_prepare_dst(bml_btl, reg, + &recvreq->req_recv.req_base.req_convertor, + MCA_BTL_NO_ORDER, 0, &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, &dst); + OPAL_THREAD_UNLOCK(&recvreq->lock); + + if(OPAL_UNLIKELY(dst == NULL)) { + continue; + } + + dst->des_cbfunc = mca_pml_csum_put_completion; + dst->des_cbdata = recvreq; + + /* prepare a descriptor for rdma control message */ + hdr_size = sizeof(mca_pml_csum_rdma_hdr_t); + if(dst->des_dst_cnt > 1) { + hdr_size += (sizeof(mca_btl_base_segment_t) * + (dst->des_dst_cnt-1)); + } + + mca_bml_base_alloc(bml_btl, &ctl, MCA_BTL_NO_ORDER, hdr_size, + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + + if( OPAL_UNLIKELY(NULL == ctl) ) { + mca_bml_base_free(bml_btl,dst); + continue; + } + ctl->des_cbfunc = mca_pml_csum_recv_ctl_completion; + + /* fill in rdma header */ + hdr = (mca_pml_csum_rdma_hdr_t*)ctl->des_src->seg_addr.pval; + hdr->hdr_common.hdr_type = MCA_PML_CSUM_HDR_TYPE_PUT; + hdr->hdr_common.hdr_flags = + (!recvreq->req_ack_sent) ? MCA_PML_CSUM_HDR_TYPE_ACK : 0; + hdr->hdr_common.hdr_csum = 0; + hdr->hdr_req = recvreq->remote_req_send; + hdr->hdr_des.pval = dst; + hdr->hdr_rdma_offset = recvreq->req_rdma_offset; + hdr->hdr_seg_cnt = dst->des_dst_cnt; + + for( i = 0; i < dst->des_dst_cnt; i++ ) { + hdr->hdr_segs[i].seg_addr.lval = ompi_ptr_ptol(dst->des_dst[i].seg_addr.pval); + hdr->hdr_segs[i].seg_len = dst->des_dst[i].seg_len; + hdr->hdr_segs[i].seg_key.key64 = dst->des_dst[i].seg_key.key64; + } + + if(!recvreq->req_ack_sent) + recvreq->req_ack_sent = true; + + do_csum = bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM; + hdr->hdr_common.hdr_csum = (do_csum ? 
+ opal_csum16(hdr, sizeof(mca_pml_csum_rdma_hdr_t)) : OPAL_CSUM_ZERO); + + OPAL_OUTPUT_VERBOSE((1, mca_pml_base_output, + "%s Sending \'PUT\' with header csum:0x%04x\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hdr->hdr_common.hdr_csum)); + + csum_hdr_hton(hdr, MCA_PML_CSUM_HDR_TYPE_PUT, recvreq->req_recv.req_base.req_proc); + + PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE, + &(recvreq->req_recv.req_base), size, + PERUSE_RECV); + + /* send rdma request to peer */ + rc = mca_bml_base_send(bml_btl, ctl, MCA_PML_CSUM_HDR_TYPE_PUT); + if( OPAL_LIKELY( rc >= 0 ) ) { + /* update request state */ + recvreq->req_rdma_offset += size; + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth, 1); + recvreq->req_rdma[rdma_idx].length -= size; + bytes_remaining -= size; + if( OPAL_LIKELY( 1 == rc ) ) { + /* The send is completed, trigger the callback */ + MCA_PML_CSUM_PROGRESS_PENDING(bml_btl); + } + } else { + mca_bml_base_free(bml_btl,ctl); + mca_bml_base_free(bml_btl,dst); + } + } + + return OMPI_SUCCESS; +} + +#define IS_PROB_REQ(R) \ + ((MCA_PML_REQUEST_IPROBE == (R)->req_recv.req_base.req_type) || \ + (MCA_PML_REQUEST_PROBE == (R)->req_recv.req_base.req_type)) + +static inline void append_recv_req_to_queue(opal_list_t *queue, + mca_pml_csum_recv_request_t *req) +{ + if(OPAL_UNLIKELY(req->req_recv.req_base.req_type == MCA_PML_REQUEST_IPROBE)) + return; + + opal_list_append(queue, (opal_list_item_t*)req); + + /** + * We don't want to generate this kind of event for MPI_Probe. Hopefully, + * the compiler will optimize out the empty if loop in the case where PERUSE + * support is not required by the user. + */ + if(req->req_recv.req_base.req_type != MCA_PML_REQUEST_PROBE) { + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_INSERT_IN_POSTED_Q, + &(req->req_recv.req_base), PERUSE_RECV); + } +} + +/* + * this routine tries to match a posted receive. If a match is found, + * it places the request in the appropriate matched receive list. 
This + * function has to be called with the communicator matching lock held. +*/ +static mca_pml_csum_recv_frag_t* +recv_req_match_specific_proc( const mca_pml_csum_recv_request_t *req, + mca_pml_csum_comm_proc_t *proc ) +{ + opal_list_t* unexpected_frags = &proc->unexpected_frags; + opal_list_item_t *i; + mca_pml_csum_recv_frag_t* frag; + int tag = req->req_recv.req_base.req_tag; + + if(opal_list_get_size(unexpected_frags) == 0) + return NULL; + + if( OMPI_ANY_TAG == tag ) { + for (i = opal_list_get_first(unexpected_frags); + i != opal_list_get_end(unexpected_frags); + i = opal_list_get_next(i)) { + frag = (mca_pml_csum_recv_frag_t*)i; + + if( frag->hdr.hdr_match.hdr_tag >= 0 ) + return frag; + } + } else { + for (i = opal_list_get_first(unexpected_frags); + i != opal_list_get_end(unexpected_frags); + i = opal_list_get_next(i)) { + frag = (mca_pml_csum_recv_frag_t*)i; + + if( frag->hdr.hdr_match.hdr_tag == tag ) + return frag; + } + } + return NULL; +} + +/* + * this routine is used to try and match a wild posted receive - where + * wild is determined by the value assigned to the source process +*/ +static mca_pml_csum_recv_frag_t* +recv_req_match_wild( mca_pml_csum_recv_request_t* req, + mca_pml_csum_comm_proc_t **p) +{ + mca_pml_csum_comm_t* comm = req->req_recv.req_base.req_comm->c_pml_comm; + mca_pml_csum_comm_proc_t* proc = comm->procs; + size_t proc_count = comm->num_procs, i; + + /* + * Loop over all the outstanding messages to find one that matches. + * There is an outer loop over lists of messages from each + * process, then an inner loop over the messages from the + * process. 
+ */ + for (i = 0; i < proc_count; i++) { + mca_pml_csum_recv_frag_t* frag; + + /* loop over messages from the current proc */ + if((frag = recv_req_match_specific_proc(req, &proc[i]))) { + *p = &proc[i]; + req->req_recv.req_base.req_proc = proc[i].ompi_proc; + prepare_recv_req_converter(req); + return frag; /* match found */ + } + } + + *p = NULL; + return NULL; +} + + +void mca_pml_csum_recv_req_start(mca_pml_csum_recv_request_t *req) +{ + mca_pml_csum_comm_t* comm = req->req_recv.req_base.req_comm->c_pml_comm; + mca_pml_csum_comm_proc_t* proc; + mca_pml_csum_recv_frag_t* frag; + opal_list_t *queue; + mca_pml_csum_hdr_t* hdr; + + /* init/re-init the request */ + req->req_lock = 0; + req->req_pipeline_depth = 0; + req->req_bytes_received = 0; + req->req_bytes_delivered = 0; + /* What about req_rdma_cnt ? */ + req->req_rdma_idx = 0; + req->req_pending = false; + req->req_ack_sent = false; + + MCA_PML_BASE_RECV_START(&req->req_recv.req_base); + + OPAL_THREAD_LOCK(&comm->matching_lock); + /** + * The laps of time between the ACTIVATE event and the SEARCH_UNEX one include + * the cost of the request lock. + */ + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_SEARCH_UNEX_Q_BEGIN, + &(req->req_recv.req_base), PERUSE_RECV); + + /* assign sequence number */ + req->req_recv.req_base.req_sequence = comm->recv_sequence++; + + /* attempt to match posted recv */ + if(req->req_recv.req_base.req_peer == OMPI_ANY_SOURCE) { + frag = recv_req_match_wild(req, &proc); + queue = &comm->wild_receives; +#if !OMPI_ENABLE_HETEROGENEOUS_SUPPORT + /* As we are in a homogeneous environment we know that all remote + * architectures are exactly the same as the local one. Therefore, + * we can safely construct the convertor based on the proc + * information of rank 0. 
+ */ + if( NULL == frag ) { + req->req_recv.req_base.req_proc = ompi_proc_local_proc; + prepare_recv_req_converter(req); + } +#endif /* !OMPI_ENABLE_HETEROGENEOUS_SUPPORT */ + } else { + proc = &comm->procs[req->req_recv.req_base.req_peer]; + req->req_recv.req_base.req_proc = proc->ompi_proc; + frag = recv_req_match_specific_proc(req, proc); + queue = &proc->specific_receives; + /* wild cardrecv will be prepared on match */ + prepare_recv_req_converter(req); + } + + if(OPAL_UNLIKELY(NULL == frag)) { + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_SEARCH_UNEX_Q_END, + &(req->req_recv.req_base), PERUSE_RECV); + /* We didn't find any matches. Record this irecv so we can match + it when the message comes in. */ + append_recv_req_to_queue(queue, req); + req->req_match_received = false; + OPAL_THREAD_UNLOCK(&comm->matching_lock); + } else { + if(OPAL_LIKELY(!IS_PROB_REQ(req))) { + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_MATCH_UNEX, + &(req->req_recv.req_base), PERUSE_RECV); + + PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_REMOVE_FROM_UNEX_Q, + req->req_recv.req_base.req_comm, + req->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, + req->req_recv.req_base.req_ompi.req_status.MPI_TAG, + PERUSE_RECV); + + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_SEARCH_UNEX_Q_END, + &(req->req_recv.req_base), PERUSE_RECV); + + opal_list_remove_item(&proc->unexpected_frags, + (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&comm->matching_lock); + + hdr = (mca_pml_csum_hdr_t*)frag->segments->seg_addr.pval; + switch(hdr->hdr_common.hdr_type) { + case MCA_PML_CSUM_HDR_TYPE_MATCH: + mca_pml_csum_recv_request_progress_match(req, frag->btl, frag->segments, + frag->num_segments); + break; + case MCA_PML_CSUM_HDR_TYPE_RNDV: + mca_pml_csum_recv_request_progress_rndv(req, frag->btl, frag->segments, + frag->num_segments); + break; + case MCA_PML_CSUM_HDR_TYPE_RGET: + mca_pml_csum_recv_request_progress_rget(req, frag->btl, frag->segments, + frag->num_segments); + break; + default: + assert(0); + } + + 
MCA_PML_CSUM_RECV_FRAG_RETURN(frag); + + } else { + OPAL_THREAD_UNLOCK(&comm->matching_lock); + mca_pml_csum_recv_request_matched_probe(req, frag->btl, + frag->segments, frag->num_segments); + } + } +} diff --git a/ompi/mca/pml/csum/pml_csum_recvreq.h b/ompi/mca/pml/csum/pml_csum_recvreq.h new file mode 100644 index 0000000000..d863641239 --- /dev/null +++ b/ompi/mca/pml/csum/pml_csum_recvreq.h @@ -0,0 +1,427 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2008 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef OMPI_PML_CSUM_RECV_REQUEST_H +#define OMPI_PML_CSUM_RECV_REQUEST_H + +#include "pml_csum.h" +#include "pml_csum_rdma.h" +#include "pml_csum_rdmafrag.h" +#include "ompi/proc/proc.h" +#include "ompi/mca/pml/csum/pml_csum_comm.h" +#include "ompi/mca/mpool/base/base.h" +#include "ompi/mca/pml/base/pml_base_recvreq.h" +#include "ompi/datatype/datatype.h" + +BEGIN_C_DECLS + +struct mca_pml_csum_recv_request_t { + mca_pml_base_recv_request_t req_recv; + ompi_ptr_t remote_req_send; + int32_t req_lock; + size_t req_pipeline_depth; + size_t req_bytes_received; /**< amount of data transferred into the user buffer */ + size_t req_bytes_delivered; /**< local size of the data as suggested by the user */ + size_t req_rdma_offset; + size_t req_send_offset; + uint32_t req_rdma_cnt; + uint32_t req_rdma_idx; + bool req_pending; + bool req_ack_sent; /**< whether ack was sent to the sender */ + bool 
req_match_received; /**< Prevent request to be completed prematurely */ + opal_mutex_t lock; + mca_pml_csum_com_btl_t req_rdma[1]; +}; +typedef struct mca_pml_csum_recv_request_t mca_pml_csum_recv_request_t; + +OBJ_CLASS_DECLARATION(mca_pml_csum_recv_request_t); + +static inline bool lock_recv_request(mca_pml_csum_recv_request_t *recvreq) +{ + return OPAL_THREAD_ADD32(&recvreq->req_lock, 1) == 1; +} + +static inline bool unlock_recv_request(mca_pml_csum_recv_request_t *recvreq) +{ + return OPAL_THREAD_ADD32(&recvreq->req_lock, -1) == 0; +} + +/** + * Allocate a recv request from the modules free list. + * + * @param rc (OUT) OMPI_SUCCESS or error status on failure. + * @return Receive request. + */ +#define MCA_PML_CSUM_RECV_REQUEST_ALLOC(recvreq, rc) \ +do { \ + ompi_free_list_item_t* item; \ + rc = OMPI_SUCCESS; \ + OMPI_FREE_LIST_GET(&mca_pml_base_recv_requests, item, rc); \ + recvreq = (mca_pml_csum_recv_request_t*)item; \ +} while(0) + + +/** + * Initialize a receive request with call parameters. + * + * @param request (IN) Receive request. + * @param addr (IN) User buffer. + * @param count (IN) Number of elements of indicated datatype. + * @param datatype (IN) User defined datatype. + * @param src (IN) Source rank w/in the communicator. + * @param tag (IN) User defined tag. + * @param comm (IN) Communicator. + * @param persistent (IN) Is this a ersistent request. + */ +#define MCA_PML_CSUM_RECV_REQUEST_INIT( request, \ + addr, \ + count, \ + datatype, \ + src, \ + tag, \ + comm, \ + persistent) \ +do { \ + MCA_PML_BASE_RECV_REQUEST_INIT( &(request)->req_recv, \ + addr, \ + count, \ + datatype, \ + src, \ + tag, \ + comm, \ + persistent); \ +} while(0) + +/** + * Mark the request as completed at MPI level for internal purposes. + * + * @param recvreq (IN) Receive request. 
+ */ +#define MCA_PML_CSUM_RECV_REQUEST_MPI_COMPLETE( recvreq ) \ + do { \ + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE, \ + &(recvreq->req_recv.req_base), PERUSE_RECV ); \ + ompi_request_complete( &(recvreq->req_recv.req_base.req_ompi), true ); \ + } while (0) + +/* + * Free the PML receive request + */ +#define MCA_PML_CSUM_RECV_REQUEST_RETURN(recvreq) \ + { \ + MCA_PML_BASE_RECV_REQUEST_FINI(&(recvreq)->req_recv); \ + OMPI_FREE_LIST_RETURN( &mca_pml_base_recv_requests, \ + (ompi_free_list_item_t*)(recvreq)); \ + } + +/** + * Complete receive request. Request structure cannot be accessed after calling + * this function any more. + * + * @param recvreq (IN) Receive request. + */ +static inline void +recv_request_pml_complete(mca_pml_csum_recv_request_t *recvreq) +{ + size_t i; + + assert(false == recvreq->req_recv.req_base.req_pml_complete); + + if(recvreq->req_recv.req_bytes_packed > 0) { + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END, + &recvreq->req_recv.req_base, PERUSE_RECV ); + } + + for(i = 0; i < recvreq->req_rdma_cnt; i++) { + mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[i].btl_reg; + if( NULL != btl_reg && btl_reg->mpool != NULL) { + btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg ); + } + } + recvreq->req_rdma_cnt = 0; + + OPAL_THREAD_LOCK(&ompi_request_lock); + if(true == recvreq->req_recv.req_base.req_free_called) { + MCA_PML_CSUM_RECV_REQUEST_RETURN(recvreq); + } else { + /* initialize request status */ + recvreq->req_recv.req_base.req_pml_complete = true; + recvreq->req_recv.req_base.req_ompi.req_status._count = + (int)recvreq->req_bytes_received; + if (recvreq->req_recv.req_bytes_packed > recvreq->req_bytes_delivered) { + recvreq->req_recv.req_base.req_ompi.req_status._count = + (int)recvreq->req_recv.req_bytes_packed; + recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR = + MPI_ERR_TRUNCATE; + } + MCA_PML_CSUM_RECV_REQUEST_MPI_COMPLETE(recvreq); + } + OPAL_THREAD_UNLOCK(&ompi_request_lock); +} + +static 
inline bool +recv_request_pml_complete_check(mca_pml_csum_recv_request_t *recvreq) +{ + opal_atomic_rmb(); + + if(recvreq->req_match_received && + recvreq->req_bytes_received >= recvreq->req_recv.req_bytes_packed && + lock_recv_request(recvreq)) { + recv_request_pml_complete(recvreq); + return true; + } + + return false; +} + +extern void mca_pml_csum_recv_req_start(mca_pml_csum_recv_request_t *req); +#define MCA_PML_CSUM_RECV_REQUEST_START(r) mca_pml_csum_recv_req_start(r) + +static inline void prepare_recv_req_converter(mca_pml_csum_recv_request_t *req) +{ + mca_bml_base_endpoint_t* endpoint = + req->req_recv.req_base.req_proc->proc_bml; + bool do_csum = mca_pml_csum.enable_csum && + (endpoint->btl_flags_or & MCA_BTL_FLAGS_NEED_CSUM); + + if( req->req_recv.req_base.req_datatype->size | req->req_recv.req_base.req_count ) { + ompi_convertor_copy_and_prepare_for_recv( + req->req_recv.req_base.req_proc->proc_convertor, + req->req_recv.req_base.req_datatype, + req->req_recv.req_base.req_count, + req->req_recv.req_base.req_addr, + (do_csum ? 
/**
 * Unpack the payload of a received message into the user buffer.
 *
 * Builds an iovec over the received segments, skipping the first
 * seg_offset bytes (the protocol header), positions the request's
 * convertor at data_offset and unpacks under the request lock. Nothing is
 * unpacked when the request's req_bytes_packed is 0.
 *
 * Two fixes over the previous version:
 *  - the body now uses the @a request argument everywhere; it used to
 *    reference a caller variable literally named `recvreq` in the PERUSE
 *    trace, so it only built when the caller's variable had that name
 *    (NOTE(review): presumably unnoticed because the trace expands to
 *    nothing without PERUSE support - confirm);
 *  - the header offset is consumed exactly once; it used to stay non-zero
 *    after the first partially skipped segment, so the same number of
 *    bytes was skipped again at the start of every following segment.
 *
 * @param request         (IN)     Receive request (pointer).
 * @param segments        (IN)     Array of received segments.
 * @param num_segments    (IN)     Number of entries in segments.
 * @param seg_offset      (IN)     Bytes to skip at the start of the
 *                                 segment data.
 * @param data_offset     (IN/OUT) lvalue: position in the packed stream
 *                                 where this payload belongs; passed by
 *                                 address to ompi_convertor_set_position.
 * @param bytes_received  (IN)     Payload bytes available for unpacking.
 * @param bytes_delivered (OUT)    lvalue: set to 0, then to the byte count
 *                                 reported back by ompi_convertor_unpack.
 */
#define MCA_PML_CSUM_RECV_REQUEST_UNPACK( request,                                \
                                          segments,                               \
                                          num_segments,                           \
                                          seg_offset,                             \
                                          data_offset,                            \
                                          bytes_received,                         \
                                          bytes_delivered)                        \
do {                                                                              \
    (bytes_delivered) = 0;                                                        \
    if((request)->req_recv.req_bytes_packed > 0) {                                \
        struct iovec iov[MCA_BTL_DES_MAX_SEGMENTS];                               \
        uint32_t iov_count = 0;                                                   \
        size_t max_data = (bytes_received);                                       \
        size_t n, offset = (seg_offset);                                          \
        mca_btl_base_segment_t* segment = (segments);                             \
                                                                                  \
        OPAL_THREAD_LOCK(&(request)->lock);                                       \
        for( n = 0; n < (num_segments); n++, segment++ ) {                        \
            if(offset >= segment->seg_len) {                                      \
                /* segment holds header bytes only */                             \
                offset -= segment->seg_len;                                       \
            } else {                                                              \
                iov[iov_count].iov_len = segment->seg_len - offset;               \
                iov[iov_count].iov_base = (IOVBASE_TYPE*)                         \
                    ((unsigned char*)segment->seg_addr.pval + offset);            \
                iov_count++;                                                      \
                /* header fully skipped: later segments are pure payload */       \
                offset = 0;                                                       \
            }                                                                     \
        }                                                                         \
        PERUSE_TRACE_COMM_OMPI_EVENT (PERUSE_COMM_REQ_XFER_CONTINUE,              \
                                      &((request)->req_recv.req_base), max_data,  \
                                      PERUSE_RECV);                               \
        ompi_convertor_set_position( &((request)->req_recv.req_base.req_convertor), \
                                     &(data_offset) );                            \
        ompi_convertor_unpack( &(request)->req_recv.req_base.req_convertor,       \
                               iov,                                               \
                               &iov_count,                                        \
                               &max_data );                                       \
        (bytes_delivered) = max_data;                                             \
        OPAL_THREAD_UNLOCK(&(request)->lock);                                     \
    }                                                                             \
} while (0)
start_bml_btl); +} + +#define MCA_PML_CSUM_ADD_ACK_TO_PENDING(P, S, D, O) \ + do { \ + mca_pml_csum_pckt_pending_t *_pckt; \ + int _rc; \ + \ + MCA_PML_CSUM_PCKT_PENDING_ALLOC(_pckt,_rc); \ + _pckt->hdr.hdr_common.hdr_type = MCA_PML_CSUM_HDR_TYPE_ACK; \ + _pckt->hdr.hdr_ack.hdr_src_req.lval = (S); \ + _pckt->hdr.hdr_ack.hdr_dst_req.pval = (D); \ + _pckt->hdr.hdr_ack.hdr_send_offset = (O); \ + _pckt->proc = (P); \ + _pckt->bml_btl = NULL; \ + OPAL_THREAD_LOCK(&mca_pml_csum.lock); \ + opal_list_append(&mca_pml_csum.pckt_pending, \ + (opal_list_item_t*)_pckt); \ + OPAL_THREAD_UNLOCK(&mca_pml_csum.lock); \ + } while(0) + +int mca_pml_csum_recv_request_ack_send_btl(ompi_proc_t* proc, + mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req, + uint64_t hdr_rdma_offset, bool nordma); + +static inline int mca_pml_csum_recv_request_ack_send(ompi_proc_t* proc, + uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset, + bool nordma) +{ + size_t i; + mca_bml_base_btl_t* bml_btl; + mca_bml_base_endpoint_t* endpoint = + (mca_bml_base_endpoint_t*)proc->proc_bml; + + for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) { + bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); + if(mca_pml_csum_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req, + hdr_dst_req, hdr_send_offset, nordma) == OMPI_SUCCESS) + return OMPI_SUCCESS; + } + + MCA_PML_CSUM_ADD_ACK_TO_PENDING(proc, hdr_src_req, hdr_dst_req, + hdr_send_offset); + + return OMPI_ERR_OUT_OF_RESOURCE; +} + +int mca_pml_csum_recv_request_get_frag(mca_pml_csum_rdma_frag_t* frag); + +/* This function tries to continue recvreq that stuck due to resource + * unavailability. Recvreq is added to recv_pending list if scheduling of put + * operation cannot be accomplished for some reason. 
*/ +void mca_pml_csum_recv_request_process_pending(void); + +END_C_DECLS + +#endif + diff --git a/ompi/mca/pml/csum/pml_csum_sendreq.c b/ompi/mca/pml/csum/pml_csum_sendreq.c new file mode 100644 index 0000000000..753c7394cf --- /dev/null +++ b/ompi/mca/pml/csum/pml_csum_sendreq.c @@ -0,0 +1,1354 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2008 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2009 IBM Corporation. All rights reserved. + * Copyright (c) 2009 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+
+#include "ompi_config.h"
+#include "ompi/constants.h"
+
+#include "opal/util/crc.h"
+#include "opal/prefetch.h"
+#include "opal/util/output.h"
+
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/util/name_fns.h"
+#include "orte/runtime/orte_globals.h"
+
+#include "ompi/mca/pml/pml.h"
+#include "ompi/mca/btl/btl.h"
+#include "ompi/mca/mpool/mpool.h"
+#include "ompi/mca/bml/base/base.h"
+#include "ompi/memchecker.h"
+#include "ompi/mca/pml/base/base.h"
+
+#include "pml_csum.h"
+#include "pml_csum_hdr.h"
+#include "pml_csum_sendreq.h"
+#include "pml_csum_rdmafrag.h"
+#include "pml_csum_recvreq.h"
+
+OBJ_CLASS_INSTANCE(mca_pml_csum_send_range_t, ompi_free_list_item_t,
+        NULL, NULL);
+
+/**
+ * Retry send requests parked on mca_pml_csum.send_pending.
+ *
+ * At most as many requests as were pending on entry are examined, so a
+ * request that is put back is not retried in the same call.
+ *
+ * @param bml_btl  BTL on which resources became available.
+ */
+void mca_pml_csum_send_request_process_pending(mca_bml_base_btl_t *bml_btl)
+{
+    /* opal_list_get_size() yields a size_t; do not narrow it to int */
+    size_t i, s = opal_list_get_size(&mca_pml_csum.send_pending);
+
+    /* advance pending requests */
+    for(i = 0; i < s; i++) {
+        mca_pml_csum_send_pending_t pending_type = MCA_PML_CSUM_SEND_PENDING_NONE;
+        mca_pml_csum_send_request_t* sendreq;
+        mca_bml_base_btl_t *send_dst;
+
+        sendreq = get_request_from_send_pending(&pending_type);
+        if(OPAL_UNLIKELY(NULL == sendreq))
+            break;
+
+        switch(pending_type) {
+        case MCA_PML_CSUM_SEND_PENDING_SCHEDULE:
+            if(mca_pml_csum_send_request_schedule_exclusive(sendreq) ==
+                    OMPI_ERR_OUT_OF_RESOURCE) {
+                return;
+            }
+            break;
+        case MCA_PML_CSUM_SEND_PENDING_START:
+            send_dst = mca_bml_base_btl_array_find(
+                    &sendreq->req_endpoint->btl_eager, bml_btl->btl);
+            if( (NULL == send_dst) ||
+                (mca_pml_csum_send_request_start_btl(sendreq, send_dst) ==
+                 OMPI_ERR_OUT_OF_RESOURCE) ) {
+                /* prepend to the pending list to minimize reordering in case
+                 * send_dst != 0 */
+                add_request_to_send_pending(sendreq,
+                        MCA_PML_CSUM_SEND_PENDING_START, NULL == send_dst);
+                /* if no destination try next request otherwise give up,
+                 * no more resources on this btl */
+                if(send_dst != NULL)
+                    return;
+            }
+            break;
+        default:
+            opal_output(0, "[%s:%d] wrong send request type\n",
+                        __FILE__, __LINE__);
+            break;
+        }
+    }
+}
+
+/*
+ * The free call marks the final stage in a request life-cycle. Starting from
+ * this point the request is completed at both PML and user level, and can be
+ * used for other p2p communications. Therefore, in the case of the CSUM PML
+ * it should be added to the free request list.
+ *
+ * The request is returned to the free list here only when it is already
+ * complete at the PML level; *request is always reset to MPI_REQUEST_NULL.
+ */
+static int mca_pml_csum_send_request_free(struct ompi_request_t** request)
+{
+    mca_pml_csum_send_request_t* sendreq = *(mca_pml_csum_send_request_t**)request;
+
+    assert( false == sendreq->req_send.req_base.req_free_called );
+
+    OPAL_THREAD_LOCK(&ompi_request_lock);
+    sendreq->req_send.req_base.req_free_called = true;
+
+    PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_NOTIFY,
+                             &(sendreq->req_send.req_base), PERUSE_SEND );
+
+    if( true == sendreq->req_send.req_base.req_pml_complete ) {
+        /* make buffer defined when the request is completed,
+           and before releasing the objects. */
+        MEMCHECKER(
+            memchecker_call(&opal_memchecker_base_mem_defined,
+                            sendreq->req_send.req_base.req_addr,
+                            sendreq->req_send.req_base.req_count,
+                            sendreq->req_send.req_base.req_datatype);
+        );
+
+        MCA_PML_CSUM_SEND_REQUEST_RETURN( sendreq );
+    }
+
+    OPAL_THREAD_UNLOCK(&ompi_request_lock);
+
+    *request = MPI_REQUEST_NULL;
+    return OMPI_SUCCESS;
+}
+
+/* Cancel hook for send requests: a no-op that reports success. */
+static int mca_pml_csum_send_request_cancel(struct ompi_request_t* request, int complete)
+{
+    /* we don't cancel send requests by now */
+    return OMPI_SUCCESS;
+}
+
+/* Object constructor: install the free/cancel hooks and create the
+ * send-range list together with its lock. */
+static void mca_pml_csum_send_request_construct(mca_pml_csum_send_request_t* req)
+{
+    req->req_send.req_base.req_type = MCA_PML_REQUEST_SEND;
+    req->req_send.req_base.req_ompi.req_free = mca_pml_csum_send_request_free;
+    req->req_send.req_base.req_ompi.req_cancel = mca_pml_csum_send_request_cancel;
+    req->req_rdma_cnt = 0;
+    req->req_throttle_sends = false;
+    OBJ_CONSTRUCT(&req->req_send_ranges, opal_list_t);
+    OBJ_CONSTRUCT(&req->req_send_range_lock, opal_mutex_t);
+}
+
+/* Object destructor: release what the constructor created. */
+static void mca_pml_csum_send_request_destruct(mca_pml_csum_send_request_t* req)
+{
+    OBJ_DESTRUCT(&req->req_send_ranges);
+    OBJ_DESTRUCT(&req->req_send_range_lock);
+}
+
+OBJ_CLASS_INSTANCE( mca_pml_csum_send_request_t,
+                    mca_pml_base_send_request_t,
+                    mca_pml_csum_send_request_construct,
+                    mca_pml_csum_send_request_destruct );
+
+/**
+ * Completion of a short message - nothing left to schedule.
+ */
+
+static inline void
+mca_pml_csum_match_fast_completion_free( struct mca_btl_base_module_t* btl,
+                                        struct mca_btl_base_endpoint_t* ep,
+                                        struct mca_btl_base_descriptor_t* des )
+{
+    mca_pml_csum_send_request_t* sendreq = (mca_pml_csum_send_request_t*)des->des_cbdata;
+    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
+
+    /* a non-empty message begins and ends its transfer right here */
+    if( sendreq->req_send.req_bytes_packed > 0 ) {
+        PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN,
+                                 &(sendreq->req_send.req_base), PERUSE_SEND );
+        PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END,
+                                 &(sendreq->req_send.req_base), PERUSE_SEND);
+    }
+
+    /*
+     * We are on the fast path, so there is no need to lock the request, as at
+     * this point there is only one reference to it. Moreover, there is no
+     * need to signal anything, as nobody is waiting on it.
+     */
+    MCA_PML_CSUM_SEND_REQUEST_MPI_COMPLETE(sendreq, false);
+    sendreq->req_send.req_base.req_pml_complete = true;
+
+    /* check for pending requests */
+    MCA_PML_CSUM_PROGRESS_PENDING(bml_btl);
+}
+
+/* Shared tail of the match completion paths: complete the request at the
+ * PML level, then retry anything that was waiting for resources. */
+static inline void
+mca_pml_csum_match_completion_free_request( mca_bml_base_btl_t* bml_btl,
+                                           mca_pml_csum_send_request_t* sendreq )
+{
+    if( sendreq->req_send.req_bytes_packed > 0 ) {
+        PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN,
+                                 &(sendreq->req_send.req_base), PERUSE_SEND );
+    }
+
+    /* signal request completion */
+    send_request_pml_complete(sendreq);
+
+    /* check for pending requests */
+    MCA_PML_CSUM_PROGRESS_PENDING(bml_btl);
+}
+
+/* BTL completion callback for a match (eager) fragment.  A failed send is
+ * fatal. */
+static void
+mca_pml_csum_match_completion_free( struct mca_btl_base_module_t* btl,
+                                   struct mca_btl_base_endpoint_t* ep,
+                                   struct mca_btl_base_descriptor_t* des,
+                                   int status )
+{
+    mca_pml_csum_send_request_t* sendreq = (mca_pml_csum_send_request_t*)des->des_cbdata;
+    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
+
+    /* check completion status */
+    if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
+        /* TSW - FIX */
+        opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+    }
+    mca_pml_csum_match_completion_free_request( bml_btl, sendreq );
+}
+
+/* Shared tail of the rendezvous completion paths: account for the bytes
+ * delivered with the first fragment, drop one unit of req_state and check
+ * whether the request is now complete. */
+static inline void
+mca_pml_csum_rndv_completion_request( mca_bml_base_btl_t* bml_btl,
+                                     mca_pml_csum_send_request_t* sendreq,
+                                     size_t req_bytes_delivered )
+{
+    if( sendreq->req_send.req_bytes_packed > 0 ) {
+        PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN,
+                                 &(sendreq->req_send.req_base), PERUSE_SEND );
+    }
+
+    OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
+
+    /* advance the request */
+    OPAL_THREAD_ADD32(&sendreq->req_state, -1);
+
+    send_request_pml_complete_check(sendreq);
+
+    /* check for pending requests */
+    MCA_PML_CSUM_PROGRESS_PENDING(bml_btl);
+}
+
+/*
+ * Completion of the first fragment of a long message that
+ * requires an acknowledgement
+ */
+static void
+mca_pml_csum_rndv_completion( mca_btl_base_module_t* btl,
+                             struct mca_btl_base_endpoint_t* ep,
+                             struct mca_btl_base_descriptor_t* des,
+                             int status )
+{
+    mca_pml_csum_send_request_t* sendreq = (mca_pml_csum_send_request_t*)des->des_cbdata;
+    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
+    size_t req_bytes_delivered = 0;
+
+    /* check completion status */
+    if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
+        /* TSW - FIX */
+        opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+    }
+
+    /* count bytes of user data actually delivered. As the rndv completion only
+     * happens in one thread, the increase of the req_bytes_delivered does not
+     * have to be atomic.
+     */
+    MCA_PML_CSUM_COMPUTE_SEGMENT_LENGTH( des->des_src,
+                                        des->des_src_cnt,
+                                        sizeof(mca_pml_csum_rendezvous_hdr_t),
+                                        req_bytes_delivered );
+
+    mca_pml_csum_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered );
+}
+
+
+/**
+ * Completion of a get request.
+ *
+ * A failed get is fatal, exactly like a failed send in the sibling
+ * completion callbacks: counting the bytes of a failed transfer as
+ * delivered would complete the request without the data having moved.
+ */
+
+static void
+mca_pml_csum_rget_completion( mca_btl_base_module_t* btl,
+                             struct mca_btl_base_endpoint_t* ep,
+                             struct mca_btl_base_descriptor_t* des,
+                             int status )
+{
+    mca_pml_csum_send_request_t* sendreq = (mca_pml_csum_send_request_t*)des->des_cbdata;
+    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
+    size_t req_bytes_delivered = 0;
+
+    /* check completion status */
+    if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
+        /* TSW - FIX */
+        opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+    }
+
+    /* count bytes of user data actually delivered and check for request completion */
+    MCA_PML_CSUM_COMPUTE_SEGMENT_LENGTH( des->des_src, des->des_src_cnt,
+                                        0, req_bytes_delivered );
+    OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
+
+    send_request_pml_complete_check(sendreq);
+    /* free the descriptor */
+    mca_bml_base_free(bml_btl, des);
+    MCA_PML_CSUM_PROGRESS_PENDING(bml_btl);
+}
+
+
+/**
+ * Completion of a control message - return resources.
+ */
+
+static void
+mca_pml_csum_send_ctl_completion( mca_btl_base_module_t* btl,
+                                 struct mca_btl_base_endpoint_t* ep,
+                                 struct mca_btl_base_descriptor_t* des,
+                                 int status )
+{
+    /* the descriptor context carries the BML BTL the message went out on;
+     * nothing else has to be done for a control message beyond giving
+     * parked requests another chance */
+    mca_bml_base_btl_t* ctl_btl = (mca_bml_base_btl_t*)des->des_context;
+
+    MCA_PML_CSUM_PROGRESS_PENDING(ctl_btl);
+}
+
+/**
+ * Completion of additional fragments of a large message - may need
+ * to schedule additional fragments.
+ */
+
+static void
+mca_pml_csum_frag_completion( mca_btl_base_module_t* btl,
+                             struct mca_btl_base_endpoint_t* ep,
+                             struct mca_btl_base_descriptor_t* des,
+                             int status )
+{
+    mca_bml_base_btl_t* frag_btl = (mca_bml_base_btl_t*)des->des_context;
+    mca_pml_csum_send_request_t* req = (mca_pml_csum_send_request_t*)des->des_cbdata;
+    size_t delivered = 0;
+
+    /* a fragment that failed to go out is fatal */
+    if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
+        /* TSW - FIX */
+        opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+    }
+
+    /* user payload = sum of the segment lengths minus the fragment header */
+    MCA_PML_CSUM_COMPUTE_SEGMENT_LENGTH( des->des_src,
+                                        des->des_src_cnt,
+                                        sizeof(mca_pml_csum_frag_hdr_t),
+                                        delivered );
+
+    /* one fragment less in flight, `delivered' more bytes on the wire */
+    OPAL_THREAD_ADD_SIZE_T(&req->req_pipeline_depth, -1);
+    OPAL_THREAD_ADD_SIZE_T(&req->req_bytes_delivered, delivered);
+
+    /* keep the pipeline going unless the request just completed */
+    if( !send_request_pml_complete_check(req) ) {
+        mca_pml_csum_send_request_schedule(req);
+    }
+
+    /* check for pending requests */
+    MCA_PML_CSUM_PROGRESS_PENDING(frag_btl);
+}
+
+/**
+ * Buffer the entire message and mark as complete.
+ *
+ * The first `size' bytes are packed into a BTL descriptor behind a
+ * rendezvous header; the remainder is packed into a bsend buffer and the
+ * convertor is re-initialised over that buffer.  The request is completed
+ * at the MPI level before the rendezvous fragment is sent.
+ *
+ * @param sendreq  send request to start.
+ * @param bml_btl  BTL used for the rendezvous fragment.
+ * @param size     number of bytes to send eagerly with the header.
+ * @return OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE when no descriptor is
+ *         available, or the error reported by pack/bsend-alloc/send.
+ */
+
+int mca_pml_csum_send_request_start_buffered(
+    mca_pml_csum_send_request_t* sendreq,
+    mca_bml_base_btl_t* bml_btl,
+    size_t size)
+{
+    mca_btl_base_descriptor_t* des;
+    mca_btl_base_segment_t* segment;
+    mca_pml_csum_hdr_t* hdr;
+    struct iovec iov;
+    unsigned int iov_count;
+    size_t max_data, req_bytes_delivered;
+    int rc;
+    /* same gate as the other start routines of this file */
+    bool do_csum = mca_pml_csum.enable_csum &&
+        (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);
+
+    /* allocate descriptor */
+    mca_bml_base_alloc(bml_btl, &des,
+                       MCA_BTL_NO_ORDER,
+                       sizeof(mca_pml_csum_rendezvous_hdr_t) + size,
+                       MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
+    if( OPAL_UNLIKELY(NULL == des) ) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+    segment = des->des_src;
+
+    /* pack the data into the BTL supplied buffer */
+    iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval +
+                                   sizeof(mca_pml_csum_rendezvous_hdr_t));
+    iov.iov_len = size;
+    iov_count = 1;
+    max_data = size;
+    if((rc = ompi_convertor_pack( &sendreq->req_send.req_base.req_convertor,
+                                  &iov,
+                                  &iov_count,
+                                  &max_data)) < 0) {
+        mca_bml_base_free(bml_btl, des);
+        return rc;
+    }
+    req_bytes_delivered = max_data;
+
+    /* build rendezvous header */
+    hdr = (mca_pml_csum_hdr_t*)segment->seg_addr.pval;
+    hdr->hdr_common.hdr_flags = 0;
+    hdr->hdr_common.hdr_type = MCA_PML_CSUM_HDR_TYPE_RNDV;
+    /* the header checksum is computed with its own field zeroed */
+    hdr->hdr_common.hdr_csum = 0;
+    hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
+    hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
+    hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
+    hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
+    hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
+    hdr->hdr_rndv.hdr_src_req.pval = sendreq;
+    /* Data checksum of the eager part: read it now, i.e. after the eager
+     * bytes were packed and before the remainder is packed below.  Both
+     * checksum fields were previously left uninitialised here, although
+     * the same RNDV header is checksummed in start_rndv(). */
+    hdr->hdr_match.hdr_csum = (size > 0 && do_csum ?
+             sendreq->req_send.req_base.req_convertor.checksum : OPAL_CSUM_ZERO);
+    hdr->hdr_common.hdr_csum = (do_csum ?
+             opal_csum16(hdr, sizeof(mca_pml_csum_rendezvous_hdr_t)) : OPAL_CSUM_ZERO);
+
+    csum_hdr_hton(hdr, MCA_PML_CSUM_HDR_TYPE_RNDV,
+                  sendreq->req_send.req_base.req_proc);
+
+    /* update lengths */
+    segment->seg_len = sizeof(mca_pml_csum_rendezvous_hdr_t) + max_data;
+
+    des->des_cbfunc = mca_pml_csum_rndv_completion;
+    des->des_cbdata = sendreq;
+
+    /* buffer the remainder of the message */
+    rc = mca_pml_base_bsend_request_alloc((ompi_request_t*)sendreq);
+    if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
+        mca_bml_base_free(bml_btl, des);
+        return rc;
+    }
+
+    iov.iov_base = (IOVBASE_TYPE*)(((unsigned char*)sendreq->req_send.req_addr) + max_data);
+    iov.iov_len = max_data = sendreq->req_send.req_bytes_packed - max_data;
+    /* iov_count is an in/out argument: re-arm it for the second pack */
+    iov_count = 1;
+
+    if((rc = ompi_convertor_pack( &sendreq->req_send.req_base.req_convertor,
+                                  &iov,
+                                  &iov_count,
+                                  &max_data)) < 0) {
+        mca_bml_base_free(bml_btl, des);
+        return rc;
+    }
+
+    /* re-init convertor for packed data */
+    ompi_convertor_prepare_for_send( &sendreq->req_send.req_base.req_convertor,
+                                     MPI_BYTE,
+                                     sendreq->req_send.req_bytes_packed,
+                                     sendreq->req_send.req_addr );
+
+    /* wait for ack and completion */
+    sendreq->req_state = 2;
+
+    /* request is complete at mpi level */
+    OPAL_THREAD_LOCK(&ompi_request_lock);
+    MCA_PML_CSUM_SEND_REQUEST_MPI_COMPLETE(sendreq, true);
+    OPAL_THREAD_UNLOCK(&ompi_request_lock);
+
+    /* send */
+    rc = mca_bml_base_send(bml_btl, des, MCA_PML_CSUM_HDR_TYPE_RNDV);
+    if( OPAL_LIKELY( rc >= 0 ) ) {
+        /* rc == 1: the completion callback will not be invoked, so run
+         * the completion work here */
+        if( OPAL_LIKELY( 1 == rc ) ) {
+            mca_pml_csum_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered);
+        }
+        return OMPI_SUCCESS;
+    }
+    mca_bml_base_free(bml_btl, des );
+    return rc;
+}
+
+
+/**
+ * We work on a buffered request with a size smaller than the eager size
+ * or the BTL is not able to send the data IN_PLACE. Request a segment
+ * that is used for initial hdr and any eager data. This is used only
+ * from the _START macro.
+ *
+ * @param sendreq  send request to start.
+ * @param bml_btl  BTL to send on.
+ * @param size     number of payload bytes to send with the match header.
+ * @return OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE when the send should be
+ *         queued and retried, or another error from the BTL.
+ */
+int mca_pml_csum_send_request_start_copy( mca_pml_csum_send_request_t* sendreq,
+                                         mca_bml_base_btl_t* bml_btl,
+                                         size_t size )
+{
+    mca_btl_base_descriptor_t* des = NULL;
+    mca_btl_base_segment_t* segment;
+    mca_pml_csum_hdr_t* hdr;
+    struct iovec iov;
+    unsigned int iov_count;
+    size_t max_data = size;
+    int rc;
+    /* honour the component-wide switch like every other start routine */
+    bool do_csum = mca_pml_csum.enable_csum &&
+        (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);
+
+    /* The send-immediate path hands the convertor to the BTL, which packs
+     * the payload only after the header below has been built.  The data
+     * checksum is read from the convertor, so it is not available yet at
+     * that point; take the regular descriptor path whenever a payload
+     * checksum is required.
+     * NOTE(review): assumes the convertor accumulates its checksum while
+     * packing - confirm against the convertor implementation. */
+    if( NULL != bml_btl->btl->btl_sendi && !(do_csum && size > 0) ) {
+        mca_pml_csum_match_hdr_t match;
+        match.hdr_common.hdr_flags = 0;
+        match.hdr_common.hdr_type = MCA_PML_CSUM_HDR_TYPE_MATCH;
+        match.hdr_common.hdr_csum = 0;
+        match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
+        match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
+        match.hdr_tag = sendreq->req_send.req_base.req_tag;
+        match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
+        match.hdr_csum = (size > 0 && do_csum ?
+                          sendreq->req_send.req_base.req_convertor.checksum : OPAL_CSUM_ZERO);
+        match.hdr_common.hdr_csum = (do_csum ?
+                          opal_csum16(&match, sizeof(mca_pml_csum_match_hdr_t)) : OPAL_CSUM_ZERO);
+
+        csum_hdr_hton(&match, MCA_PML_CSUM_HDR_TYPE_MATCH,
+                      sendreq->req_send.req_base.req_proc);
+
+        /* try to send immediately */
+        rc = mca_bml_base_sendi( bml_btl, &sendreq->req_send.req_base.req_convertor,
+                                 &match, OMPI_PML_CSUM_MATCH_HDR_LEN,
+                                 size, MCA_BTL_NO_ORDER,
+                                 MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
+                                 MCA_PML_CSUM_HDR_TYPE_MATCH,
+                                 &des);
+        if( OPAL_LIKELY(OMPI_SUCCESS == rc) ) {
+            /* signal request completion */
+            send_request_pml_complete(sendreq);
+
+            /* check for pending requests */
+            MCA_PML_CSUM_PROGRESS_PENDING(bml_btl);
+            return OMPI_SUCCESS;
+        }
+        /* otherwise fall through: the BTL may have handed back a
+         * descriptor in `des' to use for a regular send */
+    } else {
+        /* allocate descriptor */
+        mca_bml_base_alloc( bml_btl, &des,
+                            MCA_BTL_NO_ORDER,
+                            OMPI_PML_CSUM_MATCH_HDR_LEN + size,
+                            MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
+    }
+    if( OPAL_UNLIKELY(NULL == des) ) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    segment = des->des_src;
+
+    if(size > 0) {
+        /* pack the data into the supplied buffer */
+        iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval +
+                                       OMPI_PML_CSUM_MATCH_HDR_LEN);
+        iov.iov_len  = size;
+        iov_count    = 1;
+        /*
+         * Before copying the user buffer, make the target part
+         * accessible.
+         */
+        MEMCHECKER(
+            memchecker_call(&opal_memchecker_base_mem_defined,
+                            sendreq->req_send.req_base.req_addr,
+                            sendreq->req_send.req_base.req_count,
+                            sendreq->req_send.req_base.req_datatype);
+        );
+        (void)ompi_convertor_pack( &sendreq->req_send.req_base.req_convertor,
+                                   &iov, &iov_count, &max_data );
+        /*
+         * Packing finished, make the user buffer inaccessible.
+         */
+        MEMCHECKER(
+            memchecker_call(&opal_memchecker_base_mem_noaccess,
+                            sendreq->req_send.req_base.req_addr,
+                            sendreq->req_send.req_base.req_count,
+                            sendreq->req_send.req_base.req_datatype);
+        );
+    }
+
+
+    /* build match header */
+    hdr = (mca_pml_csum_hdr_t*)segment->seg_addr.pval;
+    hdr->hdr_common.hdr_flags = 0;
+    hdr->hdr_common.hdr_type = MCA_PML_CSUM_HDR_TYPE_MATCH;
+    hdr->hdr_common.hdr_csum = 0;
+    hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
+    hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
+    hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
+    hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
+    /* data checksum is read after the payload was packed above */
+    hdr->hdr_match.hdr_csum = (size > 0 && do_csum ?
+             sendreq->req_send.req_base.req_convertor.checksum : OPAL_CSUM_ZERO);
+    hdr->hdr_common.hdr_csum = (do_csum ?
+             opal_csum16(hdr, sizeof(mca_pml_csum_match_hdr_t)) : OPAL_CSUM_ZERO);
+
+    OPAL_OUTPUT_VERBOSE((1, mca_pml_base_output,
+                         "%s Sending \'match\' with data csum:0x%x, header csum:0x%04x, size:%lu \n",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hdr->hdr_match.hdr_csum, hdr->hdr_common.hdr_csum, (unsigned long)max_data));
+
+    csum_hdr_hton(hdr, MCA_PML_CSUM_HDR_TYPE_MATCH,
+                  sendreq->req_send.req_base.req_proc);
+
+    /* update lengths */
+    segment->seg_len = OMPI_PML_CSUM_MATCH_HDR_LEN + max_data;
+
+    /* short message */
+    des->des_cbdata = sendreq;
+    des->des_cbfunc = mca_pml_csum_match_completion_free;
+
+    /* send */
+    rc = mca_bml_base_send_status(bml_btl, des, MCA_PML_CSUM_HDR_TYPE_MATCH);
+    if( OPAL_LIKELY( rc >= OMPI_SUCCESS ) ) {
+        /* rc == 1: no completion callback will follow, complete here */
+        if( OPAL_LIKELY( 1 == rc ) ) {
+            mca_pml_csum_match_completion_free_request( bml_btl, sendreq );
+        }
+        return OMPI_SUCCESS;
+    }
+    switch(rc) {
+        case OMPI_ERR_RESOURCE_BUSY:
+            /* No more resources. Allow the upper level to queue the send */
+            rc = OMPI_ERR_OUT_OF_RESOURCE;
+            break;
+        default:
+            mca_bml_base_free(bml_btl, des);
+            break;
+    }
+    return rc;
+}
+
+/**
+ * BTL can send directly from user buffer so allow the BTL
+ * to prepare the segment list. Start sending a small message.
+ *
+ * @param sendreq  send request to start.
+ * @param bml_btl  BTL to send on.
+ * @param size     number of payload bytes; updated by the BTL when the
+ *                 source descriptor is prepared.
+ * @return OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE when no descriptor is
+ *         available, or the error reported by the send.
+ */
+
+int mca_pml_csum_send_request_start_prepare( mca_pml_csum_send_request_t* sendreq,
+                                            mca_bml_base_btl_t* bml_btl,
+                                            size_t size )
+{
+    mca_btl_base_descriptor_t* des;
+    mca_btl_base_segment_t* segment;
+    mca_pml_csum_hdr_t* hdr;
+    int rc;
+    bool do_csum = mca_pml_csum.enable_csum &&
+        (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);
+
+    /* prepare descriptor */
+    mca_bml_base_prepare_src( bml_btl,
+                              NULL,
+                              &sendreq->req_send.req_base.req_convertor,
+                              MCA_BTL_NO_ORDER,
+                              OMPI_PML_CSUM_MATCH_HDR_LEN,
+                              &size,
+                              MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
+                              &des );
+    if( OPAL_UNLIKELY(NULL == des) ) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+    segment = des->des_src;
+
+    /* build match header */
+    hdr = (mca_pml_csum_hdr_t*)segment->seg_addr.pval;
+    hdr->hdr_common.hdr_flags = 0;
+    hdr->hdr_common.hdr_type = MCA_PML_CSUM_HDR_TYPE_MATCH;
+    hdr->hdr_common.hdr_csum = 0;
+    hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
+    hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
+    hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
+    hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
+    /* Fill the checksum fields the same way start_copy() and start_rndv()
+     * do; they were previously left uninitialised in this routine.
+     * NOTE(review): the data checksum is only meaningful if the BTL ran
+     * the payload through the convertor in prepare_src - confirm for
+     * BTLs that send in place. */
+    hdr->hdr_match.hdr_csum = (size > 0 && do_csum ?
+             sendreq->req_send.req_base.req_convertor.checksum : OPAL_CSUM_ZERO);
+    hdr->hdr_common.hdr_csum = (do_csum ?
+             opal_csum16(hdr, sizeof(mca_pml_csum_match_hdr_t)) : OPAL_CSUM_ZERO);
+
+    csum_hdr_hton(hdr, MCA_PML_CSUM_HDR_TYPE_MATCH,
+                  sendreq->req_send.req_base.req_proc);
+
+    /* short message */
+    des->des_cbfunc = mca_pml_csum_match_completion_free;
+    des->des_cbdata = sendreq;
+
+    /* send */
+    rc = mca_bml_base_send(bml_btl, des, MCA_PML_CSUM_HDR_TYPE_MATCH);
+    if( OPAL_LIKELY( rc >= 0 ) ) {
+        /* rc == 1: no completion callback will follow, complete here */
+        if( OPAL_LIKELY( 1 == rc ) ) {
+            mca_pml_csum_match_completion_free_request( bml_btl, sendreq );
+        }
+        return OMPI_SUCCESS;
+    }
+    mca_bml_base_free(bml_btl, des );
+    return rc;
+}
+
+
+/**
+ * We have contiguous data that is registered - schedule across
+ * available nics.
+ *
+ * With exactly one RDMA BTL that supports GET, an RGET header describing
+ * the source segments is sent and the peer fetches the data.  Otherwise
+ * a bare rendezvous header is sent and the receiver schedules RDMA
+ * put(s) for the whole message.
+ *
+ * @param sendreq  send request to start (req_rdma[] already populated).
+ * @param bml_btl  ignored; the BTL of req_rdma[0] is used instead.
+ * @param size     number of bytes to expose through the source descriptor.
+ * @return OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE when no descriptor is
+ *         available, or the error reported by the send.
+ */
+
+int mca_pml_csum_send_request_start_rdma( mca_pml_csum_send_request_t* sendreq,
+                                         mca_bml_base_btl_t* bml_btl,
+                                         size_t size )
+{
+    /*
+     * When req_rdma array is constructed the first element of the array always
+     * assigned different btl in round robin fashion (if there are more than
+     * one RDMA capable BTLs). This way round robin distribution of RDMA
+     * operation is achieved.
+     */
+
+    mca_btl_base_descriptor_t* des;
+    /* PML-owned source descriptor of the RGET path; stays NULL otherwise */
+    mca_btl_base_descriptor_t* src = NULL;
+    mca_btl_base_segment_t* segment;
+    mca_pml_csum_hdr_t* hdr;
+    bool need_local_cb = false;
+    int rc;
+    bool do_csum;
+
+    bml_btl = sendreq->req_rdma[0].bml_btl;
+
+    do_csum = mca_pml_csum.enable_csum && (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);
+
+    if((sendreq->req_rdma_cnt == 1) && (bml_btl->btl_flags & MCA_BTL_FLAGS_GET)) {
+        mca_mpool_base_registration_t* reg = sendreq->req_rdma[0].btl_reg;
+        size_t i;
+        size_t old_position = sendreq->req_send.req_base.req_convertor.bConverted;
+
+        MEMCHECKER(
+            memchecker_call(&opal_memchecker_base_mem_defined,
+                            sendreq->req_send.req_base.req_addr,
+                            sendreq->req_send.req_base.req_count,
+                            sendreq->req_send.req_base.req_datatype);
+        );
+        /* prepare source descriptor/segment(s) */
+        /* PML owns this descriptor and will free it in */
+        /*  get_completion */
+        mca_bml_base_prepare_src( bml_btl,
+                                  reg,
+                                  &sendreq->req_send.req_base.req_convertor,
+                                  MCA_BTL_NO_ORDER,
+                                  0,
+                                  &size,
+                                  0,
+                                  &src );
+        MEMCHECKER(
+            memchecker_call(&opal_memchecker_base_mem_noaccess,
+                            sendreq->req_send.req_base.req_addr,
+                            sendreq->req_send.req_base.req_count,
+                            sendreq->req_send.req_base.req_datatype);
+        );
+        if( OPAL_UNLIKELY(NULL == src) ) {
+            ompi_convertor_set_position(&sendreq->req_send.req_base.req_convertor,
+                                        &old_position);
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        }
+        src->des_cbfunc = mca_pml_csum_rget_completion;
+        src->des_cbdata = sendreq;
+
+        /* allocate space for get hdr + segment list */
+        mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER,
+                           sizeof(mca_pml_csum_rget_hdr_t) +
+                           (sizeof(mca_btl_base_segment_t) * (src->des_src_cnt-1)),
+                           MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
+        if( OPAL_UNLIKELY(NULL == des) ) {
+            ompi_convertor_set_position( &sendreq->req_send.req_base.req_convertor,
+                                         &old_position );
+            mca_bml_base_free(bml_btl, src);
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        }
+        segment = des->des_src;
+
+        /* build match header */
+        /* NOTE(review): hdr_common.hdr_csum / hdr_match.hdr_csum are not
+         * set for RGET headers; whether the receiver verifies them for
+         * this header type is not visible here - confirm. */
+        hdr = (mca_pml_csum_hdr_t*)segment->seg_addr.pval;
+        hdr->hdr_common.hdr_flags = MCA_PML_CSUM_HDR_FLAGS_CONTIG|MCA_PML_CSUM_HDR_FLAGS_PIN;
+        hdr->hdr_common.hdr_type = MCA_PML_CSUM_HDR_TYPE_RGET;
+        hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
+        hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
+        hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
+        hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
+        hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
+        hdr->hdr_rndv.hdr_src_req.pval = sendreq;
+        hdr->hdr_rget.hdr_des.pval = src;
+        hdr->hdr_rget.hdr_seg_cnt = src->des_src_cnt;
+
+        csum_hdr_hton(hdr, MCA_PML_CSUM_HDR_TYPE_RGET,
+                      sendreq->req_send.req_base.req_proc);
+
+        for( i = 0; i < src->des_src_cnt; i++ ) {
+            hdr->hdr_rget.hdr_segs[i].seg_addr.lval = ompi_ptr_ptol(src->des_src[i].seg_addr.pval);
+            hdr->hdr_rget.hdr_segs[i].seg_len       = src->des_src[i].seg_len;
+            hdr->hdr_rget.hdr_segs[i].seg_key.key64 = src->des_src[i].seg_key.key64;
+        }
+
+        des->des_cbfunc = mca_pml_csum_send_ctl_completion;
+
+        /**
+         * Well, it's a get so we will not know when the peer get the data anyway.
+         * If we generate the PERUSE event here, at least we will know when do we
+         * sent the GET message ...
+         */
+        if( sendreq->req_send.req_bytes_packed > 0 ) {
+            PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN,
+                                     &(sendreq->req_send.req_base), PERUSE_SEND );
+        }
+
+    } else {
+
+        /* allocate a rendezvous header - don't eager send any data
+         * receiver will schedule rdma put(s) of the entire message
+         */
+
+        mca_bml_base_alloc(bml_btl, &des,
+                           MCA_BTL_NO_ORDER,
+                           sizeof(mca_pml_csum_rendezvous_hdr_t),
+                           MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
+        if( OPAL_UNLIKELY(NULL == des)) {
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        }
+        segment = des->des_src;
+
+        /* build hdr */
+        hdr = (mca_pml_csum_hdr_t*)segment->seg_addr.pval;
+        hdr->hdr_common.hdr_flags = MCA_PML_CSUM_HDR_FLAGS_CONTIG|MCA_PML_CSUM_HDR_FLAGS_PIN;
+        hdr->hdr_common.hdr_type = MCA_PML_CSUM_HDR_TYPE_RNDV;
+        hdr->hdr_common.hdr_csum = OPAL_CSUM_ZERO;
+        hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
+        hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
+        hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
+        hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
+        hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
+        hdr->hdr_rndv.hdr_src_req.pval = sendreq;
+        /* no payload travels with this header, hence no data checksum */
+        hdr->hdr_match.hdr_csum = OPAL_CSUM_ZERO;
+        hdr->hdr_common.hdr_csum = (do_csum ?
+             opal_csum16(hdr, sizeof(mca_pml_csum_rendezvous_hdr_t)) : OPAL_CSUM_ZERO);
+
+        OPAL_OUTPUT_VERBOSE((1, mca_pml_base_output,
+                             "%s Sending \'rndv\'(initiate RDMA PUT) with data csum:0x%x, header csum:0x%04x, size:%lu\n",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hdr->hdr_match.hdr_csum, hdr->hdr_common.hdr_csum, (unsigned long)0));
+
+        csum_hdr_hton(hdr, MCA_PML_CSUM_HDR_TYPE_RNDV,
+                      sendreq->req_send.req_base.req_proc);
+
+        /* update lengths with number of bytes actually packed */
+        segment->seg_len = sizeof(mca_pml_csum_rendezvous_hdr_t);
+
+        /* first fragment of a long message */
+        des->des_cbfunc = mca_pml_csum_rndv_completion;
+        need_local_cb = true;
+
+        /* wait for ack and completion */
+        sendreq->req_state = 2;
+    }
+
+    des->des_cbdata = sendreq;
+
+    /* send */
+    rc = mca_bml_base_send(bml_btl, des, hdr->hdr_common.hdr_type);
+    if( OPAL_LIKELY( rc >= 0 ) ) {
+        if( OPAL_LIKELY( 1 == rc ) && (true == need_local_cb)) {
+            mca_pml_csum_rndv_completion_request( bml_btl, sendreq, 0 );
+        }
+        return OMPI_SUCCESS;
+    }
+    mca_bml_base_free(bml_btl, des);
+    if( NULL != src ) {
+        /* RGET path: the peer never learns about the source descriptor,
+         * so rget_completion - its only other release point - will not
+         * run.  Release it here instead of leaking it. */
+        mca_bml_base_free(bml_btl, src);
+    }
+    return rc;
+}
+
+
+/**
+ * Rendezvous is required. Not doing rdma so eager send up to
+ * the btls eager limit.
+ */
+
+int mca_pml_csum_send_request_start_rndv( mca_pml_csum_send_request_t* sendreq,
+                                          mca_bml_base_btl_t* bml_btl,
+                                          size_t size,
+                                          int flags )
+{
+    mca_btl_base_descriptor_t* des;
+    mca_btl_base_segment_t* segment;
+    mca_pml_csum_hdr_t* hdr;
+    int rc;
+    bool do_csum = mca_pml_csum.enable_csum &&
+        (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);
+
+    /* prepare descriptor: header only when there is no payload, otherwise
+     * header plus up to 'size' bytes packed by the convertor */
+    if(size == 0) {
+        mca_bml_base_alloc( bml_btl,
+                            &des,
+                            MCA_BTL_NO_ORDER,
+                            sizeof(mca_pml_csum_rendezvous_hdr_t),
+                            MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP );
+    } else {
+        MEMCHECKER(
+            memchecker_call(&opal_memchecker_base_mem_defined,
+                            sendreq->req_send.req_base.req_addr,
+                            sendreq->req_send.req_base.req_count,
+                            sendreq->req_send.req_base.req_datatype);
+        );
+        mca_bml_base_prepare_src( bml_btl,
+                                  NULL,
+                                  &sendreq->req_send.req_base.req_convertor,
+                                  MCA_BTL_NO_ORDER,
+                                  sizeof(mca_pml_csum_rendezvous_hdr_t),
+                                  &size,
+                                  MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
+                                  &des );
+        MEMCHECKER(
+            memchecker_call(&opal_memchecker_base_mem_noaccess,
+                            sendreq->req_send.req_base.req_addr,
+                            sendreq->req_send.req_base.req_count,
+                            sendreq->req_send.req_base.req_datatype);
+        );
+    }
+
+    if( OPAL_UNLIKELY(NULL == des) ) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+    segment = des->des_src;
+
+    /* build hdr; both checksum fields are OPAL_CSUM_ZERO unless do_csum is set */
+    hdr = (mca_pml_csum_hdr_t*)segment->seg_addr.pval;
+    hdr->hdr_common.hdr_flags = flags;
+    hdr->hdr_common.hdr_type = MCA_PML_CSUM_HDR_TYPE_RNDV;
+    hdr->hdr_common.hdr_csum = 0;
+    hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
+    hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
+    hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
+    hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
+    hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
+    hdr->hdr_rndv.hdr_src_req.pval = sendreq;
+    hdr->hdr_match.hdr_csum = (size > 0 && do_csum ?
+        sendreq->req_send.req_base.req_convertor.checksum : OPAL_CSUM_ZERO);
+    hdr->hdr_common.hdr_csum = (do_csum ?
+        opal_csum16(hdr, sizeof(mca_pml_csum_rendezvous_hdr_t)) : OPAL_CSUM_ZERO);
+
+    OPAL_OUTPUT_VERBOSE((1, mca_pml_base_output,
+        "%s Sending \'rndv\' with data csum:0x%x, header csum:0x%04x, size:%lu\n",
+        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hdr->hdr_match.hdr_csum, hdr->hdr_common.hdr_csum, (unsigned long)size));
+
+    csum_hdr_hton(hdr, MCA_PML_CSUM_HDR_TYPE_RNDV,
+        sendreq->req_send.req_base.req_proc);
+
+    /* first fragment of a long message */
+    des->des_cbdata = sendreq;
+    des->des_cbfunc = mca_pml_csum_rndv_completion;
+
+    /* wait for ack and completion */
+    sendreq->req_state = 2;
+
+    /* send; a return value of 1 triggers the completion handling right here */
+    rc = mca_bml_base_send(bml_btl, des, MCA_PML_CSUM_HDR_TYPE_RNDV);
+    if( OPAL_LIKELY( rc >= 0 ) ) {
+        if( OPAL_LIKELY( 1 == rc ) ) {
+            mca_pml_csum_rndv_completion_request( bml_btl, sendreq, size );
+        }
+        return OMPI_SUCCESS;
+    }
+    mca_bml_base_free(bml_btl, des );
+    return rc;
+}
+/* Queue send_length bytes starting at send_offset as a new send range, split across the endpoint's send BTLs by weight. */
+void mca_pml_csum_send_request_copy_in_out( mca_pml_csum_send_request_t *sendreq,
+        uint64_t send_offset,
+        uint64_t send_length )
+{
+    mca_pml_csum_send_range_t *sr;
+    ompi_free_list_item_t *i;
+    mca_bml_base_endpoint_t* bml_endpoint = sendreq->req_endpoint;
+    int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
+    int rc = OMPI_SUCCESS, n;
+    double weight_total = 0;
+
+    if( OPAL_UNLIKELY(0 == send_length) )
+        return;
+
+    OMPI_FREE_LIST_WAIT(&mca_pml_csum.send_ranges, i, rc);
+
+    sr = (mca_pml_csum_send_range_t*)i;
+
+    sr->range_send_offset = send_offset;
+    sr->range_send_length = send_length;
+    sr->range_btl_idx = 0;
+
+    /* use at most max_send_per_range BTLs and accumulate their weights */
+    for(n = 0; n < num_btls && n < mca_pml_csum.max_send_per_range; n++) {
+        sr->range_btls[n].bml_btl =
+            mca_bml_base_btl_array_get_next(&bml_endpoint->btl_send);
+        weight_total += sr->range_btls[n].bml_btl->btl_weight;
+    }
+
+    sr->range_btl_cnt = n;
+    mca_pml_csum_calc_weighted_length(sr->range_btls, n, send_length,
+            weight_total);
+
+    OPAL_THREAD_LOCK(&sendreq->req_send_range_lock);
+    opal_list_append(&sendreq->req_send_ranges, (opal_list_item_t*)sr);
+    OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock);
+}
+/* Return the first queued send range, or NULL when the list is empty; does not take req_send_range_lock itself. */
+static inline mca_pml_csum_send_range_t *
+get_send_range_nolock(mca_pml_csum_send_request_t* sendreq)
+{
+    opal_list_item_t *item;
+
+    item = opal_list_get_first(&sendreq->req_send_ranges);
+
+    if(opal_list_get_end(&sendreq->req_send_ranges) == item)
+        return NULL;
+
+    return (mca_pml_csum_send_range_t*)item;
+}
+/* Same as get_send_range_nolock(), performed under req_send_range_lock. */
+static inline mca_pml_csum_send_range_t *
+get_send_range(mca_pml_csum_send_request_t* sendreq)
+{
+    mca_pml_csum_send_range_t *range;
+
+    OPAL_THREAD_LOCK(&sendreq->req_send_range_lock);
+    range = get_send_range_nolock(sendreq);
+    OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock);
+
+    return range;
+}
+/* Unlink 'range', return it to the send_ranges free list and hand back the next queued range (NULL if none). */
+static inline mca_pml_csum_send_range_t *
+get_next_send_range(mca_pml_csum_send_request_t* sendreq,
+        mca_pml_csum_send_range_t *range)
+{
+    OPAL_THREAD_LOCK(&sendreq->req_send_range_lock);
+    opal_list_remove_item(&sendreq->req_send_ranges, (opal_list_item_t *)range);
+    OMPI_FREE_LIST_RETURN(&mca_pml_csum.send_ranges, &range->base);
+    range = get_send_range_nolock(sendreq);
+    OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock);
+
+    return range;
+}
+
+/**
+ * Schedule pipeline of send descriptors for the given request.
+ * Up to the rdma threshold. If this is a send based protocol,
+ * the rdma threshold is the end of the message. Otherwise, schedule
+ * fragments up to the threshold to overlap initial registration/setup
+ * costs of the rdma. Only one thread can be inside this function.
+ */
+
+int
+mca_pml_csum_send_request_schedule_once(mca_pml_csum_send_request_t* sendreq)
+{
+    size_t prev_bytes_remaining = 0;
+    mca_pml_csum_send_range_t *range;
+    int num_fail = 0;
+    bool do_csum;
+
+    /* check pipeline_depth here before attempting to get any locks */
+    if(true == sendreq->req_throttle_sends &&
+       sendreq->req_pipeline_depth >= mca_pml_csum.send_pipeline_depth)
+        return OMPI_SUCCESS;
+
+    range = get_send_range(sendreq);
+
+    while(range && (false == sendreq->req_throttle_sends ||
+            sendreq->req_pipeline_depth < mca_pml_csum.send_pipeline_depth)) {
+        mca_pml_csum_frag_hdr_t* hdr;
+        mca_btl_base_descriptor_t* des;
+        int rc, btl_idx;
+        size_t size, offset, data_remaining = 0;
+        mca_bml_base_btl_t* bml_btl;
+
+        assert(range->range_send_length != 0);
+        /* num_fail counts consecutive passes in which the range did not shrink */
+        if(prev_bytes_remaining == range->range_send_length)
+            num_fail++;
+        else
+            num_fail = 0;
+
+        prev_bytes_remaining = range->range_send_length;
+        /* as many consecutive no-progress passes as BTLs in the range: give up for now */
+        if( OPAL_UNLIKELY(num_fail == range->range_btl_cnt) ) {
+            assert(sendreq->req_pending == MCA_PML_CSUM_SEND_PENDING_NONE);
+            add_request_to_send_pending(sendreq,
+                    MCA_PML_CSUM_SEND_PENDING_SCHEDULE, true);
+            /* Note that request remains locked. send_request_process_pending()
+             * function will call schedule_exclusive() directly without taking
+             * the lock */
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        }
+        /* pick, round-robin, the next BTL of the range that still has bytes assigned */
+cannot_pack:
+        do {
+            btl_idx = range->range_btl_idx;
+            if(++range->range_btl_idx == range->range_btl_cnt)
+                range->range_btl_idx = 0;
+        } while(!range->range_btls[btl_idx].length);
+
+        bml_btl = range->range_btls[btl_idx].bml_btl;
+        /* If there is remaining data from another BTL that was too small
+         * for the convertor to pack then send it through this BTL */
+        range->range_btls[btl_idx].length += data_remaining;
+        size = range->range_btls[btl_idx].length;
+
+        /* makes sure that we don't exceed BTL max send size */
+        if(bml_btl->btl->btl_max_send_size != 0) {
+            size_t max_send_size = bml_btl->btl->btl_max_send_size -
+                sizeof(mca_pml_csum_frag_hdr_t);
+
+            if (size > max_send_size) {
+                size = max_send_size;
+            }
+        }
+
+        /* pack into a descriptor */
+        offset = (size_t)range->range_send_offset;
+        ompi_convertor_set_position(&sendreq->req_send.req_base.req_convertor,
+                &offset);
+        range->range_send_offset = (uint64_t)offset;
+
+        data_remaining = size;
+        MEMCHECKER(
+            memchecker_call(&opal_memchecker_base_mem_defined,
+                            sendreq->req_send.req_base.req_addr,
+                            sendreq->req_send.req_base.req_count,
+                            sendreq->req_send.req_base.req_datatype);
+        );
+        mca_bml_base_prepare_src(bml_btl, NULL,
+                &sendreq->req_send.req_base.req_convertor,
+                MCA_BTL_NO_ORDER,
+                sizeof(mca_pml_csum_frag_hdr_t),
+                &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK, &des);
+        MEMCHECKER(
+            memchecker_call(&opal_memchecker_base_mem_noaccess,
+                            sendreq->req_send.req_base.req_addr,
+                            sendreq->req_send.req_base.req_count,
+                            sendreq->req_send.req_base.req_datatype);
+        );
+
+        if( OPAL_UNLIKELY(des == NULL || size == 0) ) {
+            if(des) {
+                /* Convertor can't pack this chunk. Append to another chunk
+                 * from other BTL */
+                mca_bml_base_free(bml_btl, des);
+                range->range_btls[btl_idx].length -= data_remaining;
+                goto cannot_pack;
+            }
+            continue;
+        }
+
+        des->des_cbfunc = mca_pml_csum_frag_completion;
+        des->des_cbdata = sendreq;
+
+        do_csum = mca_pml_csum.enable_csum &&
+            (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);
+
+        /* setup header; checksum fields stay OPAL_CSUM_ZERO unless do_csum is set */
+        hdr = (mca_pml_csum_frag_hdr_t*)des->des_src->seg_addr.pval;
+        hdr->hdr_common.hdr_flags = 0;
+        hdr->hdr_common.hdr_type = MCA_PML_CSUM_HDR_TYPE_FRAG;
+        hdr->hdr_common.hdr_csum = 0;
+        hdr->hdr_frag_offset = range->range_send_offset;
+        hdr->hdr_src_req.pval = sendreq;
+        hdr->hdr_dst_req = sendreq->req_recv;
+        hdr->hdr_csum = (size > 0 && do_csum ?
+            sendreq->req_send.req_base.req_convertor.checksum : OPAL_CSUM_ZERO);
+        hdr->hdr_common.hdr_csum = (do_csum ?
+            opal_csum16(hdr, sizeof(mca_pml_csum_frag_hdr_t)) : OPAL_CSUM_ZERO);
+
+        OPAL_OUTPUT_VERBOSE((1, mca_pml_base_output,
+            "%s Sending \'frag\' with data csum:0x%x, header csum:0x%04x, size:%lu\n",
+            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hdr->hdr_csum, hdr->hdr_common.hdr_csum, (unsigned long)size));
+
+        csum_hdr_hton(hdr, MCA_PML_CSUM_HDR_TYPE_FRAG,
+            sendreq->req_send.req_base.req_proc);
+
+#if OMPI_WANT_PERUSE
+        PERUSE_TRACE_COMM_OMPI_EVENT(PERUSE_COMM_REQ_XFER_CONTINUE,
+            &(sendreq->req_send.req_base), size, PERUSE_SEND);
+#endif /* OMPI_WANT_PERUSE */
+
+        /* initiate send - note that this may complete before the call returns */
+        rc = mca_bml_base_send(bml_btl, des, MCA_PML_CSUM_HDR_TYPE_FRAG);
+        if( OPAL_LIKELY(rc >= 0) ) {
+            /* update state */
+            range->range_btls[btl_idx].length -= size;
+            range->range_send_length -= size;
+            range->range_send_offset += size;
+            OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, 1);
+            if(range->range_send_length == 0) {
+                range = get_next_send_range(sendreq, range);
+                prev_bytes_remaining = 0;
+            }
+        } else {
+            mca_bml_base_free(bml_btl,des);
+        }
+    }
+
+    return OMPI_SUCCESS;
+}
+
+
+/**
+ * An RDMA put
operation has completed: + * (1) Update request status and if required set completed + * (2) Send FIN control message to the destination + */ + +static void mca_pml_csum_put_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ + mca_pml_csum_rdma_frag_t* frag = (mca_pml_csum_rdma_frag_t*)des->des_cbdata; + mca_pml_csum_send_request_t* sendreq = (mca_pml_csum_send_request_t*)frag->rdma_req; + mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context; + + /* check completion status */ + if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { + /* TSW - FIX */ + ORTE_ERROR_LOG(status); + orte_errmgr.abort(-1, NULL); + } + + mca_pml_csum_send_fin(sendreq->req_send.req_base.req_proc, + bml_btl, + frag->rdma_hdr.hdr_rdma.hdr_des.pval, + des->order, 0); + + /* check for request completion */ + OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length); + + send_request_pml_complete_check(sendreq); + + MCA_PML_CSUM_RDMA_FRAG_RETURN(frag); + + MCA_PML_CSUM_PROGRESS_PENDING(bml_btl); +} + +int mca_pml_csum_send_request_put_frag( mca_pml_csum_rdma_frag_t* frag ) +{ + mca_mpool_base_registration_t* reg = NULL; + mca_bml_base_btl_t* bml_btl = frag->rdma_bml; + mca_btl_base_descriptor_t* des; + size_t save_size = frag->rdma_length; + int rc; + + /* setup descriptor */ + mca_bml_base_prepare_src( bml_btl, + reg, + &frag->convertor, + MCA_BTL_NO_ORDER, + 0, + &frag->rdma_length, + MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, + &des ); + + if( OPAL_UNLIKELY(NULL == des) ) { + if(frag->retries < mca_pml_csum.rdma_put_retries_limit) { + size_t offset = (size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset; + frag->rdma_length = save_size; + ompi_convertor_set_position(&frag->convertor, &offset); + OPAL_THREAD_LOCK(&mca_pml_csum.lock); + opal_list_append(&mca_pml_csum.rdma_pending, (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&mca_pml_csum.lock); + } else { + mca_pml_csum_send_request_t *sendreq = + 
(mca_pml_csum_send_request_t*)frag->rdma_req; + + /* tell receiver to unregister memory */ + mca_pml_csum_send_fin(sendreq->req_send.req_base.req_proc, + bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des.pval, + MCA_BTL_NO_ORDER, 1); + + /* send fragment by copy in/out */ + mca_pml_csum_send_request_copy_in_out(sendreq, + frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, frag->rdma_length); + /* if a pointer to a receive request is not set it means that + * ACK was not yet received. Don't schedule sends before ACK */ + if(NULL != sendreq->req_recv.pval) + mca_pml_csum_send_request_schedule(sendreq); + } + return OMPI_ERR_OUT_OF_RESOURCE; + } + + des->des_dst = frag->rdma_segs; + des->des_dst_cnt = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt; + des->des_cbfunc = mca_pml_csum_put_completion; + des->des_cbdata = frag; + + PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE, + &(((mca_pml_csum_send_request_t*)frag->rdma_req)->req_send.req_base), save_size, PERUSE_SEND ); + + rc = mca_bml_base_put(bml_btl, des); + if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { + mca_bml_base_free(bml_btl, des); + frag->rdma_length = save_size; + if(OMPI_ERR_OUT_OF_RESOURCE == rc) { + OPAL_THREAD_LOCK(&mca_pml_csum.lock); + opal_list_append(&mca_pml_csum.rdma_pending, (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&mca_pml_csum.lock); + return OMPI_ERR_OUT_OF_RESOURCE; + } else { + /* TSW - FIX */ + ORTE_ERROR_LOG(rc); + orte_errmgr.abort(-1, NULL); + } + } + return OMPI_SUCCESS; +} + +/** + * Receiver has scheduled an RDMA operation: + * (1) Allocate an RDMA fragment to maintain the state of the operation + * (2) Call BTL prepare_src to pin/prepare source buffers + * (3) Queue the RDMA put + */ + +void mca_pml_csum_send_request_put( mca_pml_csum_send_request_t* sendreq, + mca_btl_base_module_t* btl, + mca_pml_csum_rdma_hdr_t* hdr ) +{ + mca_bml_base_endpoint_t *bml_endpoint = sendreq->req_endpoint; + mca_pml_csum_rdma_frag_t* frag; + int rc; + size_t i, size = 0; + + if(hdr->hdr_common.hdr_flags & 
MCA_PML_CSUM_HDR_TYPE_ACK) { + OPAL_THREAD_ADD32(&sendreq->req_state, -1); + } + + MCA_PML_CSUM_RDMA_FRAG_ALLOC(frag, rc); + + if( OPAL_UNLIKELY(NULL == frag) ) { + /* TSW - FIX */ + ORTE_ERROR_LOG(rc); + orte_errmgr.abort(-1, NULL); + } + + /* setup fragment */ + for( i = 0; i < hdr->hdr_seg_cnt; i++ ) { + frag->rdma_segs[i].seg_addr.lval = hdr->hdr_segs[i].seg_addr.lval; + frag->rdma_segs[i].seg_len = hdr->hdr_segs[i].seg_len; + frag->rdma_segs[i].seg_key.key64 = hdr->hdr_segs[i].seg_key.key64; + +#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT + if ((sendreq->req_send.req_base.req_proc->proc_arch & OPAL_ARCH_ISBIGENDIAN) != + (ompi_proc_local()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) { + size += opal_swap_bytes4(frag->rdma_segs[i].seg_len); + } else +#endif + { + size += frag->rdma_segs[i].seg_len; + } + } + + frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl); + frag->rdma_hdr.hdr_rdma = *hdr; + frag->rdma_req = sendreq; + frag->rdma_ep = bml_endpoint; + frag->rdma_length = size; + frag->rdma_state = MCA_PML_CSUM_RDMA_PUT; + frag->reg = NULL; + frag->retries = 0; + + /* lookup the corresponding registration */ + for(i=0; ireq_rdma_cnt; i++) { + if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) { + frag->reg = sendreq->req_rdma[i].btl_reg; + break; + } + } + + /* RDMA writes may proceed in parallel to send and to each other, so + * create clone of the convertor for each RDMA fragment + */ + size = hdr->hdr_rdma_offset; + ompi_convertor_clone_with_position(&sendreq->req_send.req_base.req_convertor, + &frag->convertor, 0, &size); + + mca_pml_csum_send_request_put_frag(frag); +} + diff --git a/ompi/mca/pml/csum/pml_csum_sendreq.h b/ompi/mca/pml/csum/pml_csum_sendreq.h new file mode 100644 index 0000000000..77a5ee59be --- /dev/null +++ b/ompi/mca/pml/csum/pml_csum_sendreq.h @@ -0,0 +1,515 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. 
+ * Copyright (c) 2004-2008 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2009      IBM Corporation.  All rights reserved.
+ * Copyright (c) 2009      Los Alamos National Security, LLC.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef OMPI_PML_CSUM_SEND_REQUEST_H
+#define OMPI_PML_CSUM_SEND_REQUEST_H
+
+#include "ompi/mca/btl/btl.h"
+#include "ompi/mca/pml/base/pml_base_sendreq.h"
+#include "ompi/mca/mpool/base/base.h"
+#include "pml_csum_comm.h"
+#include "pml_csum_hdr.h"
+#include "pml_csum_rdma.h"
+#include "pml_csum_rdmafrag.h"
+#include "ompi/datatype/convertor.h"
+#include "ompi/mca/bml/bml.h"
+
+BEGIN_C_DECLS
+/* Reason a send request sits on mca_pml_csum.send_pending (NONE when it is not queued). */
+typedef enum {
+    MCA_PML_CSUM_SEND_PENDING_NONE,
+    MCA_PML_CSUM_SEND_PENDING_SCHEDULE,
+    MCA_PML_CSUM_SEND_PENDING_START
+} mca_pml_csum_send_pending_t;
+/* Per-message send state of the csum PML. */
+struct mca_pml_csum_send_request_t {
+    mca_pml_base_send_request_t req_send;
+    mca_bml_base_endpoint_t* req_endpoint;
+    ompi_ptr_t req_recv;               /* stays NULL until the receiver's ACK has arrived */
+    int32_t req_state;                 /* must reach 0 before the request can complete at PML level */
+    int32_t req_lock;                  /* counter driven by lock_send_request()/unlock_send_request() */
+    bool req_throttle_sends;           /* when true, scheduling honours send_pipeline_depth */
+    size_t req_pipeline_depth;
+    size_t req_bytes_delivered;
+    uint32_t req_rdma_cnt;             /* number of valid entries in req_rdma[] */
+    mca_pml_csum_send_pending_t req_pending;
+    opal_mutex_t req_send_range_lock;  /* held while req_send_ranges is read or modified */
+    opal_list_t req_send_ranges;       /* queued mca_pml_csum_send_range_t items */
+    mca_pml_csum_com_btl_t req_rdma[1];
+};
+typedef struct mca_pml_csum_send_request_t mca_pml_csum_send_request_t;
+
+OBJ_CLASS_DECLARATION(mca_pml_csum_send_request_t);
+/* A contiguous part of the message still to be sent, split over a set of BTLs. */
+struct mca_pml_csum_send_range_t {
+    ompi_free_list_item_t base;
+    uint64_t range_send_offset;        /* offset of the next byte to send */
+    uint64_t range_send_length;        /* bytes of the range not yet sent */
+    int range_btl_idx;                 /* round-robin cursor into range_btls[] */
+    int range_btl_cnt;                 /* number of valid entries in range_btls[] */
+    mca_pml_csum_com_btl_t range_btls[1];
+};
+typedef struct mca_pml_csum_send_range_t mca_pml_csum_send_range_t;
+OBJ_CLASS_DECLARATION(mca_pml_csum_send_range_t); + +static inline bool lock_send_request(mca_pml_csum_send_request_t *sendreq) +{ + return OPAL_THREAD_ADD32(&sendreq->req_lock, 1) == 1; +} + +static inline bool unlock_send_request(mca_pml_csum_send_request_t *sendreq) +{ + return OPAL_THREAD_ADD32(&sendreq->req_lock, -1) == 0; +} + +static inline void +add_request_to_send_pending(mca_pml_csum_send_request_t* sendreq, + const mca_pml_csum_send_pending_t type, + const bool append) +{ + opal_list_item_t *item = (opal_list_item_t*)sendreq; + + OPAL_THREAD_LOCK(&mca_pml_csum.lock); + sendreq->req_pending = type; + if(append) + opal_list_append(&mca_pml_csum.send_pending, item); + else + opal_list_prepend(&mca_pml_csum.send_pending, item); + + OPAL_THREAD_UNLOCK(&mca_pml_csum.lock); +} + +static inline mca_pml_csum_send_request_t* +get_request_from_send_pending(mca_pml_csum_send_pending_t *type) +{ + mca_pml_csum_send_request_t *sendreq; + + OPAL_THREAD_LOCK(&mca_pml_csum.lock); + sendreq = (mca_pml_csum_send_request_t*) + opal_list_remove_first(&mca_pml_csum.send_pending); + if(sendreq) { + *type = sendreq->req_pending; + sendreq->req_pending = MCA_PML_CSUM_SEND_PENDING_NONE; + } + OPAL_THREAD_UNLOCK(&mca_pml_csum.lock); + + return sendreq; +} + +#define MCA_PML_CSUM_SEND_REQUEST_ALLOC( comm, \ + dst, \ + sendreq, \ + rc) \ + { \ + ompi_proc_t *proc = ompi_comm_peer_lookup( comm, dst ); \ + ompi_free_list_item_t* item; \ + \ + rc = OMPI_ERR_OUT_OF_RESOURCE; \ + if( OPAL_LIKELY(NULL != proc) ) { \ + rc = OMPI_SUCCESS; \ + OMPI_FREE_LIST_WAIT(&mca_pml_base_send_requests, item, rc); \ + sendreq = (mca_pml_csum_send_request_t*)item; \ + sendreq->req_send.req_base.req_proc = proc; \ + } \ + } + + +#define MCA_PML_CSUM_SEND_REQUEST_INIT( sendreq, \ + buf, \ + count, \ + datatype, \ + dst, \ + tag, \ + comm, \ + sendmode, \ + persistent) \ + { \ + MCA_PML_CSUM_BASE_SEND_REQUEST_INIT(&sendreq->req_send, \ + buf, \ + count, \ + datatype, \ + dst, \ + tag, \ + comm, \ + 
sendmode, \ + persistent); \ + (sendreq)->req_recv.pval = NULL; \ + } + +#define MCA_PML_CSUM_BASE_SEND_REQUEST_INIT( request, \ + addr, \ + count, \ + datatype, \ + peer, \ + tag, \ + comm, \ + mode, \ + persistent) \ + { \ + mca_bml_base_endpoint_t* endpoint = \ + sendreq->req_send.req_base.req_proc->proc_bml; \ + bool do_csum = mca_pml_csum.enable_csum && \ + (endpoint->btl_flags_or & MCA_BTL_FLAGS_NEED_CSUM); \ + /* increment reference counts */ \ + OBJ_RETAIN(comm); \ + \ + OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, persistent); \ + (request)->req_base.req_ompi.req_mpi_object.comm = comm; \ + (request)->req_addr = addr; \ + (request)->req_send_mode = mode; \ + (request)->req_base.req_addr = addr; \ + (request)->req_base.req_count = count; \ + (request)->req_base.req_datatype = datatype; \ + (request)->req_base.req_peer = (int32_t)peer; \ + (request)->req_base.req_tag = (int32_t)tag; \ + (request)->req_base.req_comm = comm; \ + /* (request)->req_base.req_proc is set on request allocation */ \ + (request)->req_base.req_pml_complete = OPAL_INT_TO_BOOL(persistent); \ + (request)->req_base.req_free_called = false; \ + (request)->req_base.req_ompi.req_status._cancelled = 0; \ + (request)->req_bytes_packed = 0; \ + \ + /* initialize datatype convertor for this request */ \ + if( count > 0 ) { \ + OBJ_RETAIN(datatype); \ + /* We will create a convertor specialized for the */ \ + /* remote architecture and prepared with the datatype. */ \ + ompi_convertor_copy_and_prepare_for_send( \ + (request)->req_base.req_proc->proc_convertor, \ + (request)->req_base.req_datatype, \ + (request)->req_base.req_count, \ + (request)->req_base.req_addr, \ + (do_csum ? 
CONVERTOR_WITH_CHECKSUM: 0), \ + &(request)->req_base.req_convertor ); \ + ompi_convertor_get_packed_size( &(request)->req_base.req_convertor, \ + &((request)->req_bytes_packed) );\ + } \ + } + +static inline void mca_pml_csum_free_rdma_resources(mca_pml_csum_send_request_t* sendreq) +{ + size_t r; + + /* return mpool resources */ + for(r = 0; r < sendreq->req_rdma_cnt; r++) { + mca_mpool_base_registration_t* reg = sendreq->req_rdma[r].btl_reg; + if( NULL != reg && reg->mpool != NULL ) { + reg->mpool->mpool_deregister(reg->mpool, reg); + } + } + sendreq->req_rdma_cnt = 0; +} + + +/** + * Start a send request. + */ + +#define MCA_PML_CSUM_SEND_REQUEST_START(sendreq, rc) \ + do { \ + rc = mca_pml_csum_send_request_start(sendreq); \ + } while (0) + + +/* + * Mark a send request as completed at the MPI level. + */ + +#define MCA_PML_CSUM_SEND_REQUEST_MPI_COMPLETE(sendreq, with_signal) \ +do { \ + (sendreq)->req_send.req_base.req_ompi.req_status.MPI_SOURCE = \ + (sendreq)->req_send.req_base.req_comm->c_my_rank; \ + (sendreq)->req_send.req_base.req_ompi.req_status.MPI_TAG = \ + (sendreq)->req_send.req_base.req_tag; \ + (sendreq)->req_send.req_base.req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS; \ + (sendreq)->req_send.req_base.req_ompi.req_status._count = \ + (int)(sendreq)->req_send.req_bytes_packed; \ + ompi_request_complete( &((sendreq)->req_send.req_base.req_ompi), (with_signal) ); \ + \ + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE, \ + &(sendreq->req_send.req_base), PERUSE_SEND); \ +} while(0) + +/* + * Release resources associated with a request + */ + +#define MCA_PML_CSUM_SEND_REQUEST_RETURN(sendreq) \ + do { \ + /* Let the base handle the reference counts */ \ + MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send)); \ + OMPI_FREE_LIST_RETURN( &mca_pml_base_send_requests, \ + (ompi_free_list_item_t*)sendreq); \ + } while(0) + + +/* + * The PML has completed a send request. 
Note that this request
+ * may have been orphaned by the user or have already completed
+ * at the MPI level.
+ * This function will never be called directly from the upper level, as it
+ * should only be an internal call to the PML.
+ *
+ */
+static inline void
+send_request_pml_complete(mca_pml_csum_send_request_t *sendreq)
+{
+    assert(false == sendreq->req_send.req_base.req_pml_complete);
+
+    if(sendreq->req_send.req_bytes_packed > 0) {
+        PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END,
+                                 &(sendreq->req_send.req_base), PERUSE_SEND);
+    }
+
+    /* return mpool resources */
+    mca_pml_csum_free_rdma_resources(sendreq);
+
+    /* buffered send whose data was copied to a separate buffer: finish the bsend */
+    if (sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED &&
+        sendreq->req_send.req_addr != sendreq->req_send.req_base.req_addr) {
+        mca_pml_base_bsend_request_fini((ompi_request_t*)sendreq);
+    }
+
+    OPAL_THREAD_LOCK(&ompi_request_lock);
+    if(false == sendreq->req_send.req_base.req_ompi.req_complete) {
+        /* Should only be called for long messages (maybe synchronous) */
+        MCA_PML_CSUM_SEND_REQUEST_MPI_COMPLETE(sendreq, true);
+    }
+    sendreq->req_send.req_base.req_pml_complete = true;
+
+    /* the user already freed the request: give it back to the free list */
+    if(sendreq->req_send.req_base.req_free_called) {
+        MCA_PML_CSUM_SEND_REQUEST_RETURN(sendreq);
+    }
+    OPAL_THREAD_UNLOCK(&ompi_request_lock);
+}
+
+/* returns true if request was completed on PML level */
+static inline bool
+send_request_pml_complete_check(mca_pml_csum_send_request_t *sendreq)
+{
+    opal_atomic_rmb();
+    /* if no more events are expected for the request and the whole message is
+     * already sent and send fragment scheduling isn't running in another
+     * thread then complete the request on PML level. From now on, if user
+     * called free on this request, the request structure can be reused for
+     * another request or if the request is persistent it can be restarted */
+    if(sendreq->req_state == 0 &&
+       sendreq->req_bytes_delivered >= sendreq->req_send.req_bytes_packed
+       && lock_send_request(sendreq)) {
+        send_request_pml_complete(sendreq);
+        return true;
+    }
+
+    return false;
+}
+
+/**
+ *  Schedule additional fragments
+ */
+int
+mca_pml_csum_send_request_schedule_once(mca_pml_csum_send_request_t*);
+/* Run schedule_once() until unlock_send_request() reports the lock count back at 0, stopping early on out-of-resource. */
+static inline int
+mca_pml_csum_send_request_schedule_exclusive(mca_pml_csum_send_request_t* sendreq)
+{
+    int rc;
+    do {
+        rc = mca_pml_csum_send_request_schedule_once(sendreq);
+        if(rc == OMPI_ERR_OUT_OF_RESOURCE)
+            break;
+    } while(!unlock_send_request(sendreq));
+
+    if(OMPI_SUCCESS == rc)
+        send_request_pml_complete_check(sendreq);
+
+    return rc;
+}
+/* Entry point for scheduling: only the caller that acquires the request lock runs the scheduler. */
+static inline void
+mca_pml_csum_send_request_schedule(mca_pml_csum_send_request_t* sendreq)
+{
+    /*
+     * Only allow one thread in this routine for a given request.
+     * However, we cannot block callers on a mutex, so simply keep track
+     * of the number of times the routine has been called and run through
+     * the scheduling logic once for every call.
+     */
+
+    if(!lock_send_request(sendreq))
+        return;
+
+    mca_pml_csum_send_request_schedule_exclusive(sendreq);
+}
+
+/**
+ *  Start the specified request
+ */
+
+int mca_pml_csum_send_request_start_buffered(
+    mca_pml_csum_send_request_t* sendreq,
+    mca_bml_base_btl_t* bml_btl,
+    size_t size);
+
+int mca_pml_csum_send_request_start_copy(
+    mca_pml_csum_send_request_t* sendreq,
+    mca_bml_base_btl_t* bml_btl,
+    size_t size);
+
+int mca_pml_csum_send_request_start_prepare(
+    mca_pml_csum_send_request_t* sendreq,
+    mca_bml_base_btl_t* bml_btl,
+    size_t size);
+
+int mca_pml_csum_send_request_start_rdma(
+    mca_pml_csum_send_request_t* sendreq,
+    mca_bml_base_btl_t* bml_btl,
+    size_t size);
+
+int mca_pml_csum_send_request_start_rndv(
+    mca_pml_csum_send_request_t* sendreq,
+    mca_bml_base_btl_t* bml_btl,
+    size_t size,
+    int flags);
+/* Choose how to send the first fragment over bml_btl: eager copy/prepare, buffered, RDMA or rendezvous. */
+static inline int
+mca_pml_csum_send_request_start_btl( mca_pml_csum_send_request_t* sendreq,
+                                     mca_bml_base_btl_t* bml_btl )
+{
+    size_t size = sendreq->req_send.req_bytes_packed;
+    mca_btl_base_module_t* btl = bml_btl->btl;
+    size_t eager_limit = btl->btl_eager_limit - sizeof(mca_pml_csum_hdr_t);
+    int rc;
+
+    if( OPAL_LIKELY(size <= eager_limit) ) {
+        /* whole message fits in one eager fragment: pick by send mode */
+        switch(sendreq->req_send.req_send_mode) {
+        case MCA_PML_BASE_SEND_SYNCHRONOUS:
+            rc = mca_pml_csum_send_request_start_rndv(sendreq, bml_btl, size, 0);
+            break;
+        case MCA_PML_BASE_SEND_BUFFERED:
+            rc = mca_pml_csum_send_request_start_copy(sendreq, bml_btl, size);
+            break;
+        case MCA_PML_BASE_SEND_COMPLETE:
+            rc = mca_pml_csum_send_request_start_prepare(sendreq, bml_btl, size);
+            break;
+        default:
+            if (size != 0 && bml_btl->btl_flags & MCA_BTL_FLAGS_SEND_INPLACE) {
+                rc = mca_pml_csum_send_request_start_prepare(sendreq, bml_btl, size);
+            } else {
+                rc = mca_pml_csum_send_request_start_copy(sendreq, bml_btl, size);
+            }
+            break;
+        }
+    } else {
+        /* long message: first fragment is capped by the (rndv) eager limit */
+        size = eager_limit;
+        if(OPAL_UNLIKELY(btl->btl_rndv_eager_limit < eager_limit))
+            size = btl->btl_rndv_eager_limit;
+        if(sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED) {
+            rc = mca_pml_csum_send_request_start_buffered(sendreq, bml_btl, size);
+        } else if
+                (ompi_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {
+            unsigned char *base;
+            ompi_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base );
+
+            if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_csum_rdma_btls(
+                                                              sendreq->req_endpoint,
+                                                              base,
+                                                              sendreq->req_send.req_bytes_packed,
+                                                              sendreq->req_rdma))) {
+                rc = mca_pml_csum_send_request_start_rdma(sendreq, bml_btl,
+                                                          sendreq->req_send.req_bytes_packed);
+                if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
+                    mca_pml_csum_free_rdma_resources(sendreq);
+                }
+            } else {
+                rc = mca_pml_csum_send_request_start_rndv(sendreq, bml_btl, size,
+                                                          MCA_PML_CSUM_HDR_FLAGS_CONTIG);
+            }
+        } else {
+            rc = mca_pml_csum_send_request_start_rndv(sendreq, bml_btl, size, 0);
+        }
+    }
+
+    return rc;
+}
+/* Reset per-send state, take the next sequence number and try each eager BTL; if all are out of resources the request is queued as pending. */
+static inline int
+mca_pml_csum_send_request_start( mca_pml_csum_send_request_t* sendreq )
+{
+    mca_pml_csum_comm_t* comm = sendreq->req_send.req_base.req_comm->c_pml_comm;
+    mca_bml_base_endpoint_t* endpoint = (mca_bml_base_endpoint_t*)
+        sendreq->req_send.req_base.req_proc->proc_bml;
+    size_t i;
+
+    if( OPAL_UNLIKELY(endpoint == NULL) ) {
+        return OMPI_ERR_UNREACH;
+    }
+
+    sendreq->req_endpoint = endpoint;
+    sendreq->req_state = 0;
+    sendreq->req_lock = 0;
+    sendreq->req_pipeline_depth = 0;
+    sendreq->req_bytes_delivered = 0;
+    sendreq->req_pending = MCA_PML_CSUM_SEND_PENDING_NONE;
+    sendreq->req_send.req_base.req_sequence = OPAL_THREAD_ADD32(
+        &comm->procs[sendreq->req_send.req_base.req_peer].send_sequence,1);
+
+    MCA_PML_BASE_SEND_START( &sendreq->req_send.req_base );
+
+    for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
+        mca_bml_base_btl_t* bml_btl;
+        int rc;
+
+        /* select a btl */
+        bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
+        rc = mca_pml_csum_send_request_start_btl(sendreq, bml_btl);
+        if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc) )
+            return rc;
+    }
+    add_request_to_send_pending(sendreq, MCA_PML_CSUM_SEND_PENDING_START, true);
+
+    return OMPI_SUCCESS;
+}
+
+/**
+ *  Initiate a put scheduled by the receiver.
+ */
+
+void mca_pml_csum_send_request_put( mca_pml_csum_send_request_t* sendreq,
+                                    mca_btl_base_module_t* btl,
+                                    mca_pml_csum_rdma_hdr_t* hdr );
+
+int mca_pml_csum_send_request_put_frag(mca_pml_csum_rdma_frag_t* frag);
+
+/* This function tries to continue a sendreq that was stuck because of resource
+ * unavailability. A sendreq may be added to the send_pending list if there is
+ * no resource to send the initial packet or there is no resource to schedule
+ * data for sending. The reason the sendreq was added to the list is stored in
+ * the sendreq struct and the appropriate operation is retried when the resource
+ * becomes available. bml_btl passed to the function doesn't represent the sendreq
+ * destination, it represents the BTL on which a resource was freed, so only this
+ * BTL should be considered for sending packets */
+void mca_pml_csum_send_request_process_pending(mca_bml_base_btl_t *bml_btl);
+
+void mca_pml_csum_send_request_copy_in_out(mca_pml_csum_send_request_t *sendreq,
+                uint64_t send_offset, uint64_t send_length);
+
+END_C_DECLS
+
+#endif  /* OMPI_PML_CSUM_SEND_REQUEST_H */
diff --git a/ompi/mca/pml/csum/pml_csum_start.c b/ompi/mca/pml/csum/pml_csum_start.c
new file mode 100644
index 0000000000..47949ffeeb
--- /dev/null
+++ b/ompi/mca/pml/csum/pml_csum_start.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2007 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2006      Cisco Systems, Inc.  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#include "pml_csum.h"
+#include "pml_csum_recvreq.h"
+#include "pml_csum_sendreq.h"
+#include "ompi/memchecker.h"
+
+
+/**
+ * Start (or restart) a set of persistent PML requests.
+ *
+ * Entries that are NULL or whose req_type is not OMPI_REQUEST_PML are
+ * skipped.  A request that the PML layer has not yet completed is marked
+ * req_free_called and replaced in requests[] by a new request created
+ * with the same buffer, count, datatype, peer, tag and communicator (and
+ * send mode, for sends); the replacement is the one that gets started.
+ *
+ * @param count     number of entries in requests
+ * @param requests  array of requests; an entry may be overwritten with a
+ *                  newly created request
+ * @return OMPI_SUCCESS once every entry has been handled; OMPI_ERR_REQUEST
+ *         for an unexpected request state or type; otherwise the error
+ *         returned while creating or starting a request.  Processing stops
+ *         at the first error, so earlier entries may already be started.
+ */
+int mca_pml_csum_start(size_t count, ompi_request_t** requests)
+{
+    int rc;
+    size_t i;
+    bool reuse_old_request = true;
+
+    for(i=0; i<count; i++) {
+        mca_pml_base_request_t *pml_request = (mca_pml_base_request_t*)requests[i];
+        if(NULL == pml_request) {
+            /* empty slot - nothing to start */
+            continue;
+        }
+        if (OMPI_REQUEST_PML != requests[i]->req_type) {
+            /* only PML requests are handled here */
+            continue;
+        }
+
+        /* If the persistent request is currently active - obtain the
+         * request lock and verify the status is incomplete. if the
+         * pml layer has not completed the request - mark the request
+         * as free called - so that it will be freed when the request
+         * completes - and create a new request.
+         */
+
+        reuse_old_request = true;
+        switch(pml_request->req_ompi.req_state) {
+            case OMPI_REQUEST_INACTIVE:
+                if(pml_request->req_pml_complete == true)
+                    break;
+                /* otherwise fall through */
+            case OMPI_REQUEST_ACTIVE: {
+
+                ompi_request_t *request;
+                OPAL_THREAD_LOCK(&ompi_request_lock);
+                if (pml_request->req_pml_complete == false) {
+                    /* free request after it completes */
+                    pml_request->req_free_called = true;
+                } else {
+                    /* can reuse the existing request */
+                    OPAL_THREAD_UNLOCK(&ompi_request_lock);
+                    break;
+                }
+
+                reuse_old_request = false;
+                /* allocate a new request */
+                switch(pml_request->req_type) {
+                    case MCA_PML_REQUEST_SEND: {
+                         mca_pml_base_send_mode_t sendmode =
+                             ((mca_pml_base_send_request_t*)pml_request)->req_send_mode;
+                         rc = mca_pml_csum_isend_init(
+                              pml_request->req_addr,
+                              pml_request->req_count,
+                              pml_request->req_datatype,
+                              pml_request->req_peer,
+                              pml_request->req_tag,
+                              sendmode,
+                              pml_request->req_comm,
+                              &request);
+                         break;
+                    }
+                    case MCA_PML_REQUEST_RECV:
+                         rc = mca_pml_csum_irecv_init(
+                              pml_request->req_addr,
+                              pml_request->req_count,
+                              pml_request->req_datatype,
+                              pml_request->req_peer,
+                              pml_request->req_tag,
+                              pml_request->req_comm,
+                              &request);
+                         break;
+                    default:
+                         rc = OMPI_ERR_REQUEST;
+                         break;
+                }
+                OPAL_THREAD_UNLOCK(&ompi_request_lock);
+                if(OMPI_SUCCESS != rc)
+                    return rc;
+                /* from here on work on the replacement and hand it back to the caller */
+                pml_request = (mca_pml_base_request_t*)request;
+                requests[i] = request;
+                break;
+            }
+            default:
+                return OMPI_ERR_REQUEST;
+        }
+
+        /* start the request */
+        switch(pml_request->req_type) {
+            case MCA_PML_REQUEST_SEND:
+            {
+                mca_pml_csum_send_request_t* sendreq = (mca_pml_csum_send_request_t*)pml_request;
+                if( reuse_old_request && (sendreq->req_send.req_bytes_packed != 0) ) {
+                    size_t offset = 0;
+                    /**
+                     * Reset the convertor in case we're dealing with the original
+                     * request, which when completed does not reset the convertor.
+                     */
+                    ompi_convertor_set_position( &sendreq->req_send.req_base.req_convertor,
+                                                 &offset );
+                }
+                MCA_PML_CSUM_SEND_REQUEST_START(sendreq, rc);
+                if(rc != OMPI_SUCCESS)
+                    return rc;
+                break;
+            }
+            case MCA_PML_REQUEST_RECV:
+            {
+                mca_pml_csum_recv_request_t* recvreq = (mca_pml_csum_recv_request_t*)pml_request;
+                MCA_PML_CSUM_RECV_REQUEST_START(recvreq);
+                break;
+            }
+            default:
+                return OMPI_ERR_REQUEST;
+        }
+    }
+    return OMPI_SUCCESS;
+}
+
diff --git a/ompi/mca/pml/csum/post_configure.sh b/ompi/mca/pml/csum/post_configure.sh
new file mode 100644
index 0000000000..68d1a0516f
--- /dev/null
+++ b/ompi/mca/pml/csum/post_configure.sh
@@ -0,0 +1 @@
+DIRECT_CALL_HEADER="ompi/mca/pml/csum/pml_csum.h"
diff --git a/opal/util/crc.h b/opal/util/crc.h
index c6eb419437..aea948190d 100644
--- a/opal/util/crc.h
+++ b/opal/util/crc.h
@@ -9,6 +9,9 @@
  *                         University of Stuttgart.  All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
+ * Copyright (c) 2009      IBM Corporation.  All rights reserved.
+ * Copyright (c) 2009      Los Alamos National Security, LLC.  All rights
+ *                         reserved.
* $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -102,6 +105,30 @@ opal_csum(const void *  source, size_t csumlen)
     return opal_csum_partial(source, csumlen, &lastPartialLong, &lastPartialLength);
 }
 
+/**
+ * Compute a 16-bit additive checksum over a byte buffer.
+ *
+ * Each of the csumlen bytes at source is read as an unsigned char and added
+ * to a uint16_t accumulator, so the result is the byte sum modulo 65536.
+ *
+ * @param source   buffer to checksum; only read, never written
+ * @param csumlen  number of bytes to sum; 0 yields a checksum of 0
+ * @return         the accumulated 16-bit sum
+ */
+static inline uint16_t
+opal_csum16 (const void * source, size_t csumlen)
+{
+    /* keep the const qualifier: the buffer is only ever read */
+    const unsigned char *src = (const unsigned char *) source;
+    uint16_t csum = 0;
+    size_t i;
+
+    for(i = 0; i < csumlen; i++) {
+        csum += *src++;
+    }
+    return csum;
+}
+
 OPAL_DECLSPEC unsigned int
 opal_uicsum_partial (
     const void *  source,