1
1

Add a new PML module that acts as a "mini-dr" - when requested, it performs a dr-like checksum on messages for BTLs that require it, as specified by MCA params.

Add two new configure options that specify:

1. when to add padding to the openib control header - this *only* happens when the configure option is specified

2. when to use the dr-like checksum as opposed to the memcpy checksum. Not selectable at runtime - to eliminate performance impacts, this is a configure-only option

Also removed an unused checksum version from opal/util/crc.h.

The new component still needs a little cleanup and some sync with recent ob1 bug fixes. It was created as a separate module to avoid performance hits in ob1 itself, though most of the code is duplicative. The component is only selectable by either specifying it directly, or by configuring with the dr-like checksum -and- setting -mca pml_csum_enable_csum 1.

Modify the LANL platform files to take advantage of the new module.

This commit was SVN r20846.
Этот коммит содержится в:
Ralph Castain 2009-03-23 23:52:05 +00:00
родитель fb2b41d40a
Коммит 17f51a0389
44 изменённых файлов: 7561 добавлений и 406 удалений

Просмотреть файл

@ -12,6 +12,9 @@ dnl Copyright (c) 2004-2005 The Regents of the University of California.
dnl All rights reserved.
dnl Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
dnl Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
dnl Copyright (c) 2009 IBM Corporation. All rights reserved.
dnl Copyright (c) 2009 Los Alamos National Security, LLC. All rights
dnl reserved.
dnl $COPYRIGHT$
dnl
dnl Additional copyrights may follow
@ -725,5 +728,37 @@ AC_DEFINE_UNQUOTED([OPAL_IDENT_STRING], ["$with_ident_string"],
[ident string for Open MPI])
AC_MSG_RESULT([$with_ident_string])
#
# Add padding to OpenIB header
#
AC_MSG_CHECKING([whether to add padding to the openib control header])
AC_ARG_WITH([openib-control-hdr-padding],
[AC_HELP_STRING([--with-openib-control-hdr-padding],
[Add padding bytes to the openib control header])])
if test "$with_openib_control_hdr_padding" = "yes"; then
AC_MSG_RESULT([yes])
ompi_openib_pad_hdr=1
else
AC_MSG_RESULT([no])
ompi_openib_pad_hdr=0
fi
AC_DEFINE_UNQUOTED([OMPI_OPENIB_PAD_HDR],
[$ompi_openib_pad_hdr],
[Add padding bytes to the openib control header])
#
# Use alternative checksum algorithm
#
AC_MSG_CHECKING([whether to use an alternative checksum algo for messages])
AC_ARG_WITH([dst-checksum],
[AC_HELP_STRING([--with-dst-checksum],
[Use an alternative checksum algorithm for messages])])
if test "$with_dst_checksum" = "yes"; then
AC_MSG_RESULT([yes])
CFLAGS="-DOMPI_CSUM_DST $CFLAGS"
else
AC_MSG_RESULT([no])
fi
])

Просмотреть файл

@ -28,3 +28,5 @@ with_io_romio_flags=--with-file-system=ufs+nfs
with_threads=posix
with_valgrind=no
LDFLAGS=-L/opt/PBS/lib64
with_openib_control_hdr_padding=yes

Просмотреть файл

@ -71,18 +71,30 @@ orte_tmpdir_base = /tmp
## from inadvertent job executions
orte_allocation_required = 1
## MPI behavior
mpi_leave_pinned = 0
pml = csum
pml_csum_enable_csum = 1
btl_openib_flags = 50
## Protect looped collectives
coll_sync_priority = 100
coll_sync_barrier_before = 1000
## Activate hierarchical collectives
coll_hierarch_priority = 90
## Add the interface for out-of-band communication
## and set it up
oob_tcp_if_include=ib0,eth0
oob_tcp_if_include=ib0
oob_tcp_peer_retries = 10
oob_tcp_disable_family = IPv6
oob_tcp_listen_mode = listen_thread
oob_tcp_sndbuf = 32768
oob_tcp_rcvbuf = 32768
## Define the MPI interconnects
btl = sm,openib,tcp,self
btl = sm,openib,self
## Setup OpenIB
btl_openib_want_fork_support = 0

Просмотреть файл

@ -28,4 +28,6 @@ with_io_romio_flags=--with-file-system=ufs+nfs+panfs
with_threads=posix
with_valgrind=no
LDFLAGS=-L/opt/PBS/lib64
CFLAGS=-I/opt/panfs/include
CFLAGS="-I/opt/panfs/include"
with_openib_control_hdr_padding=yes

Просмотреть файл

@ -71,18 +71,30 @@ orte_tmpdir_base = /tmp
## from inadvertent job executions
orte_allocation_required = 1
## MPI behavior
mpi_leave_pinned = 0
pml = csum
pml_csum_enable_csum = 1
btl_openib_flags = 50
## Protect looped collectives
coll_sync_priority = 100
coll_sync_barrier_before = 1000
## Activate hierarchical collectives
coll_hierarch_priority = 90
## Add the interface for out-of-band communication
## and set it up
oob_tcp_if_include=ib0,eth0
oob_tcp_if_include=ib0
oob_tcp_peer_retries = 10
oob_tcp_disable_family = IPv6
oob_tcp_listen_mode = listen_thread
oob_tcp_sndbuf = 32768
oob_tcp_rcvbuf = 32768
## Define the MPI interconnects
btl = sm,openib,tcp,self
btl = sm,openib,self
## Setup OpenIB
btl_openib_want_fork_support = 0

Просмотреть файл

@ -1,99 +0,0 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the default system-wide MCA parameters defaults file.
# Specifically, the MCA parameter "mca_param_files" defaults to a
# value of
# "$HOME/.openmpi/mca-params.conf:$sysconf/openmpi-mca-params.conf"
# (this file is the latter of the two). So if the default value of
# mca_param_files is not changed, this file is used to set system-wide
# MCA parameters. This file can therefore be used to set system-wide
# default MCA parameters for all users. Of course, users can override
# these values if they want, but this file is an excellent location
# for setting system-specific MCA parameters for those users who don't
# know / care enough to investigate the proper values for them.
# Note that this file is only applicable where it is visible (in a
# filesystem sense). Specifically, MPI processes each read this file
# during their startup to determine what default values for MCA
# parameters should be used. mpirun does not bundle up the values in
# this file from the node where it was run and send them to all nodes;
# the default value decisions are effectively distributed. Hence,
# these values are only applicable on nodes that "see" this file. If
# $sysconf is a directory on a local disk, it is likely that changes
# to this file will need to be propagated to other nodes. If $sysconf
# is a directory that is shared via a networked filesystem, changes to
# this file will be visible to all nodes that share this $sysconf.
# The format is straightforward: one per line, mca_param_name =
# rvalue. Quoting is ignored (so if you use quotes or escape
# characters, they'll be included as part of the value). For example:
# Disable run-time MPI parameter checking
# mpi_param_check = 0
# Note that the value "~/" will be expanded to the current user's home
# directory. For example:
# Change component loading path
# component_path = /usr/local/lib/openmpi:~/my_openmpi_components
# See "ompi_info --param all all" for a full listing of Open MPI MCA
# parameters available and their default values.
#
# Basic behavior to smooth startup
mca_component_show_load_errors = 0
orte_abort_timeout = 10
opal_set_max_sys_limits = 1
## Protect the shared file systems
orte_no_session_dirs = /panfs,/scratch,/users,/usr/projects
orte_tmpdir_base = /tmp
## Require an allocation to run - protects the frontend
## from inadvertent job executions
orte_allocation_required = 1
## Add the interface for out-of-band communication
## and set it up
oob_tcp_if_include=ib0,eth0
oob_tcp_peer_retries = 10
oob_tcp_disable_family = IPv6
oob_tcp_listen_mode = listen_thread
oob_tcp_sndbuf = 32768
oob_tcp_rcvbuf = 32768
## Define the MPI interconnects
btl = sm,openib,tcp,self
## Setup OpenIB
btl_openib_want_fork_support = 0
btl_openib_cpc_include = oob
#btl_openib_receive_queues = P,128,256,64,32,32:S,2048,1024,128,32:S,12288,1024,128,32:S,65536,1024,128,32
## Enable cpu affinity
mpi_paffinity_alone = 1
## Setup MPI options
mpi_show_handle_leaks = 1
mpi_warn_on_fork = 1
mpi_abort_print_stack = 1

Просмотреть файл

@ -28,3 +28,5 @@ with_io_romio_flags=--with-file-system=ufs+nfs
with_threads=posix
with_valgrind=no
LDFLAGS=-L/opt/PBS/lib64
with_openib_control_hdr_padding=yes

Просмотреть файл

@ -71,9 +71,22 @@ orte_tmpdir_base = /tmp
## from inadvertent job executions
orte_allocation_required = 1
## MPI behavior
mpi_leave_pinned = 0
pml = csum
pml_csum_enable_csum = 1
btl_openib_flags = 50
## Protect looped collectives
coll_sync_priority = 100
coll_sync_barrier_before = 1000
## Activate hierarchical collectives
coll_hierarch_priority = 90
## Add the interface for out-of-band communication
## and set it up
oob_tcp_if_include=ib0,eth0
oob_tcp_if_include=ib0
oob_tcp_peer_retries = 10
oob_tcp_disable_family = IPv6
oob_tcp_listen_mode = listen_thread
@ -81,7 +94,7 @@ oob_tcp_sndbuf = 32768
oob_tcp_rcvbuf = 32768
## Define the MPI interconnects
btl = sm,openib,tcp,self
btl = sm,openib,self
## Setup OpenIB
btl_openib_want_fork_support = 0

Просмотреть файл

@ -28,4 +28,6 @@ with_io_romio_flags=--with-file-system=ufs+nfs+panfs
with_threads=posix
with_valgrind=no
LDFLAGS=-L/opt/PBS/lib64
CFLAGS=-I/opt/panfs/include
CFLAGS="-I/opt/panfs/include"
with_openib_control_hdr_padding=yes

Просмотреть файл

@ -71,9 +71,22 @@ orte_tmpdir_base = /tmp
## from inadvertent job executions
orte_allocation_required = 1
## MPI behavior
mpi_leave_pinned = 0
pml = csum
pml_csum_enable_csum = 1
btl_openib_flags = 50
## Protect looped collectives
coll_sync_priority = 100
coll_sync_barrier_before = 1000
## Activate hierarchical collectives
coll_hierarch_priority = 90
## Add the interface for out-of-band communication
## and set it up
oob_tcp_if_include=ib0,eth0
oob_tcp_if_include=ib0
oob_tcp_peer_retries = 10
oob_tcp_disable_family = IPv6
oob_tcp_listen_mode = listen_thread
@ -81,7 +94,7 @@ oob_tcp_sndbuf = 32768
oob_tcp_rcvbuf = 32768
## Define the MPI interconnects
btl = sm,openib,tcp,self
btl = sm,openib,self
## Setup OpenIB
btl_openib_want_fork_support = 0

Просмотреть файл

@ -1,97 +0,0 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the default system-wide MCA parameters defaults file.
# Specifically, the MCA parameter "mca_param_files" defaults to a
# value of
# "$HOME/.openmpi/mca-params.conf:$sysconf/openmpi-mca-params.conf"
# (this file is the latter of the two). So if the default value of
# mca_param_files is not changed, this file is used to set system-wide
# MCA parameters. This file can therefore be used to set system-wide
# default MCA parameters for all users. Of course, users can override
# these values if they want, but this file is an excellent location
# for setting system-specific MCA parameters for those users who don't
# know / care enough to investigate the proper values for them.
# Note that this file is only applicable where it is visible (in a
# filesystem sense). Specifically, MPI processes each read this file
# during their startup to determine what default values for MCA
# parameters should be used. mpirun does not bundle up the values in
# this file from the node where it was run and send them to all nodes;
# the default value decisions are effectively distributed. Hence,
# these values are only applicable on nodes that "see" this file. If
# $sysconf is a directory on a local disk, it is likely that changes
# to this file will need to be propagated to other nodes. If $sysconf
# is a directory that is shared via a networked filesystem, changes to
# this file will be visible to all nodes that share this $sysconf.
# The format is straightforward: one per line, mca_param_name =
# rvalue. Quoting is ignored (so if you use quotes or escape
# characters, they'll be included as part of the value). For example:
# Disable run-time MPI parameter checking
# mpi_param_check = 0
# Note that the value "~/" will be expanded to the current user's home
# directory. For example:
# Change component loading path
# component_path = /usr/local/lib/openmpi:~/my_openmpi_components
# See "ompi_info --param all all" for a full listing of Open MPI MCA
# parameters available and their default values.
#
# Basic behavior to smooth startup
mca_component_show_load_errors = 0
orte_abort_timeout = 10
opal_set_max_sys_limits = 1
## Protect the shared file systems
orte_no_session_dirs = /panfs,/scratch,/users,/usr/projects
orte_tmpdir_base = /tmp
## Require an allocation to run - protects the frontend
## from inadvertent job executions
orte_allocation_required = 1
## Add the interface for out-of-band communication
## and set it up
oob_tcp_if_include=ib0,eth0
oob_tcp_peer_retries = 10
oob_tcp_disable_family = IPv6
oob_tcp_listen_mode = listen_thread
oob_tcp_sndbuf = 32768
oob_tcp_rcvbuf = 32768
## Define the MPI interconnects
btl = sm,openib,tcp,self
## Setup OpenIB
btl_openib_want_fork_support = 0
btl_openib_cpc_include = oob
#btl_openib_receive_queues = P,128,256,64,32,32:S,2048,1024,128,32:S,12288,1024,128,32:S,65536,1024,128,32
## Enable cpu affinity
mpi_paffinity_alone = 1
## Setup MPI options
mpi_show_handle_leaks = 0
mpi_warn_on_fork = 1

Просмотреть файл

@ -1,99 +0,0 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the default system-wide MCA parameters defaults file.
# Specifically, the MCA parameter "mca_param_files" defaults to a
# value of
# "$HOME/.openmpi/mca-params.conf:$sysconf/openmpi-mca-params.conf"
# (this file is the latter of the two). So if the default value of
# mca_param_files is not changed, this file is used to set system-wide
# MCA parameters. This file can therefore be used to set system-wide
# default MCA parameters for all users. Of course, users can override
# these values if they want, but this file is an excellent location
# for setting system-specific MCA parameters for those users who don't
# know / care enough to investigate the proper values for them.
# Note that this file is only applicable where it is visible (in a
# filesystem sense). Specifically, MPI processes each read this file
# during their startup to determine what default values for MCA
# parameters should be used. mpirun does not bundle up the values in
# this file from the node where it was run and send them to all nodes;
# the default value decisions are effectively distributed. Hence,
# these values are only applicable on nodes that "see" this file. If
# $sysconf is a directory on a local disk, it is likely that changes
# to this file will need to be propagated to other nodes. If $sysconf
# is a directory that is shared via a networked filesystem, changes to
# this file will be visible to all nodes that share this $sysconf.
# The format is straightforward: one per line, mca_param_name =
# rvalue. Quoting is ignored (so if you use quotes or escape
# characters, they'll be included as part of the value). For example:
# Disable run-time MPI parameter checking
# mpi_param_check = 0
# Note that the value "~/" will be expanded to the current user's home
# directory. For example:
# Change component loading path
# component_path = /usr/local/lib/openmpi:~/my_openmpi_components
# See "ompi_info --param all all" for a full listing of Open MPI MCA
# parameters available and their default values.
#
# Basic behavior to smooth startup
mca_component_show_load_errors = 0
orte_abort_timeout = 10
opal_set_max_sys_limits = 1
## Protect the shared file systems
orte_no_session_dirs = /panfs,/scratch,/users,/usr/projects
orte_tmpdir_base = /tmp
## Require an allocation to run - protects the frontend
## from inadvertent job executions
orte_allocation_required = 1
## Add the interface for out-of-band communication
## and set it up
oob_tcp_if_include=ib0
oob_tcp_peer_retries = 10
oob_tcp_disable_family = IPv6
oob_tcp_listen_mode = listen_thread
oob_tcp_sndbuf = 32768
oob_tcp_rcvbuf = 32768
## Define the MPI interconnects
btl = sm,openib,self
## Setup OpenIB
btl_openib_want_fork_support = 0
btl_openib_cpc_include = oob
#btl_openib_receive_queues = P,128,256,64,32,32:S,2048,1024,128,32:S,12288,1024,128,32:S,65536,1024,128,32
## Enable cpu affinity
mpi_paffinity_alone = 1
## Setup MPI options
mpi_show_handle_leaks = 1
mpi_warn_on_fork = 1
mpi_abort_print_stack = 1

Просмотреть файл

@ -1,97 +0,0 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the default system-wide MCA parameters defaults file.
# Specifically, the MCA parameter "mca_param_files" defaults to a
# value of
# "$HOME/.openmpi/mca-params.conf:$sysconf/openmpi-mca-params.conf"
# (this file is the latter of the two). So if the default value of
# mca_param_files is not changed, this file is used to set system-wide
# MCA parameters. This file can therefore be used to set system-wide
# default MCA parameters for all users. Of course, users can override
# these values if they want, but this file is an excellent location
# for setting system-specific MCA parameters for those users who don't
# know / care enough to investigate the proper values for them.
# Note that this file is only applicable where it is visible (in a
# filesystem sense). Specifically, MPI processes each read this file
# during their startup to determine what default values for MCA
# parameters should be used. mpirun does not bundle up the values in
# this file from the node where it was run and send them to all nodes;
# the default value decisions are effectively distributed. Hence,
# these values are only applicable on nodes that "see" this file. If
# $sysconf is a directory on a local disk, it is likely that changes
# to this file will need to be propagated to other nodes. If $sysconf
# is a directory that is shared via a networked filesystem, changes to
# this file will be visible to all nodes that share this $sysconf.
# The format is straightforward: one per line, mca_param_name =
# rvalue. Quoting is ignored (so if you use quotes or escape
# characters, they'll be included as part of the value). For example:
# Disable run-time MPI parameter checking
# mpi_param_check = 0
# Note that the value "~/" will be expanded to the current user's home
# directory. For example:
# Change component loading path
# component_path = /usr/local/lib/openmpi:~/my_openmpi_components
# See "ompi_info --param all all" for a full listing of Open MPI MCA
# parameters available and their default values.
#
# Basic behavior to smooth startup
mca_component_show_load_errors = 0
orte_abort_timeout = 10
opal_set_max_sys_limits = 1
## Protect the shared file systems
orte_no_session_dirs = /panfs,/scratch,/users,/usr/projects
orte_tmpdir_base = /tmp
## Require an allocation to run - protects the frontend
## from inadvertent job executions
orte_allocation_required = 1
## Add the interface for out-of-band communication
## and set it up
oob_tcp_if_include=ib0
oob_tcp_peer_retries = 10
oob_tcp_disable_family = IPv6
oob_tcp_listen_mode = listen_thread
oob_tcp_sndbuf = 32768
oob_tcp_rcvbuf = 32768
## Define the MPI interconnects
btl = sm,openib,self
## Setup OpenIB
btl_openib_want_fork_support = 0
btl_openib_cpc_include = oob
#btl_openib_receive_queues = P,128,256,64,32,32:S,2048,1024,128,32:S,12288,1024,128,32:S,65536,1024,128,32
## Enable cpu affinity
mpi_paffinity_alone = 1
## Setup MPI options
mpi_show_handle_leaks = 0
mpi_warn_on_fork = 1

Просмотреть файл

@ -5,6 +5,9 @@
* reserved.
* Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2009 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -21,10 +24,24 @@
#if defined(CHECKSUM)
#if defined (OMPI_CSUM_DST)
#define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \
do { \
volatile uint32_t __csum; \
__csum = (CONVERTOR)->checksum; \
(CONVERTOR)->checksum += OPAL_CSUM_BCOPY_PARTIAL( (SRC), (DST), (BLENGTH), (BLENGTH), &(CONVERTOR)->csum_ui1, &(CONVERTOR)->csum_ui2 ); \
__csum += OPAL_CSUM_PARTIAL( (DST), (BLENGTH), &(CONVERTOR)->csum_ui1, &(CONVERTOR)->csum_ui2); \
if (__csum != (CONVERTOR)->checksum) { \
opal_output(0, "%s:%d:csum2: Invalid \'MEMCPY_CSUM check\' - dst csum:0x%04x != src csum:0x%04x\n", __FILE__, __LINE__, __csum, (CONVERTOR)->checksum); \
} \
} while (0)
#else
#define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \
do { \
(CONVERTOR)->checksum += OPAL_CSUM_BCOPY_PARTIAL( (SRC), (DST), (BLENGTH), (BLENGTH), &(CONVERTOR)->csum_ui1, &(CONVERTOR)->csum_ui2 ); \
} while (0)
#endif
#define COMPUTE_CSUM( SRC, BLENGTH, CONVERTOR ) \
do { \

Просмотреть файл

@ -9,7 +9,8 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2006-2009 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
* $COPYRIGHT$
@ -119,7 +120,10 @@ typedef struct mca_btl_openib_footer_t mca_btl_openib_footer_t;
#define MCA_BTL_OPENIB_CONTROL_CTS 3
struct mca_btl_openib_control_header_t {
uint8_t type;
uint8_t type;
#if OMPI_OPENIB_PAD_HDR
uint8_t padding[15];
#endif
};
typedef struct mca_btl_openib_control_header_t mca_btl_openib_control_header_t;

63
ompi/mca/pml/csum/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,63 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
dist_pkgdata_DATA = help-pml-csum.txt
EXTRA_DIST = post_configure.sh pml_csum_endpoint.c pml_csum_endpoint.h
csum_sources = \
pml_csum.c \
pml_csum.h \
pml_csum_comm.c \
pml_csum_comm.h \
pml_csum_component.c \
pml_csum_component.h \
pml_csum_hdr.h \
pml_csum_iprobe.c \
pml_csum_irecv.c \
pml_csum_isend.c \
pml_csum_progress.c \
pml_csum_rdma.c \
pml_csum_rdma.h \
pml_csum_rdmafrag.c \
pml_csum_rdmafrag.h \
pml_csum_recvfrag.c \
pml_csum_recvfrag.h \
pml_csum_recvreq.c \
pml_csum_recvreq.h \
pml_csum_sendreq.c \
pml_csum_sendreq.h \
pml_csum_start.c
if OMPI_BUILD_pml_csum_DSO
component_noinst =
component_install = mca_pml_csum.la
else
component_noinst = libmca_pml_csum.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_pml_csum_la_SOURCES = $(csum_sources)
mca_pml_csum_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_pml_csum_la_SOURCES = $(csum_sources)
libmca_pml_csum_la_LDFLAGS = -module -avoid-version

24
ompi/mca/pml/csum/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,24 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
PARAM_CONFIG_FILES="Makefile"

31
ompi/mca/pml/csum/help-pml-csum.txt Обычный файл
Просмотреть файл

@ -0,0 +1,31 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2009 IBM Corporation. All rights reserved.
# Copyright (c) 2009 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for Open MPI.
#
[pml:checksum-not-enabled]
Warning: This build of Open MPI was specifically configured
with support for the alternate checksum algorithm, but the
support was not enabled by the proper MCA parameter. You should
set pml_csum_enable_csum to enable checksum operation.
While your application will be allowed to proceed, please be
advised that you will not be protected from data errors.

801
ompi/mca/pml/csum/pml_csum.c Обычный файл
Просмотреть файл

@ -0,0 +1,801 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2006-2008 University of Houston. All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2009 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <stdlib.h>
#include <string.h>
#include "opal/class/opal_bitmap.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/pml/base/base.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/pml/base/base.h"
#include "ompi/mca/btl/base/base.h"
#include "pml_csum.h"
#include "pml_csum_component.h"
#include "pml_csum_comm.h"
#include "pml_csum_hdr.h"
#include "pml_csum_recvfrag.h"
#include "pml_csum_sendreq.h"
#include "pml_csum_recvreq.h"
#include "pml_csum_rdmafrag.h"
#include "ompi/mca/bml/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "opal/util/crc.h"
#include "ompi/runtime/ompi_cr.h"
#include "ompi/runtime/ompi_module_exchange.h"
mca_pml_csum_t mca_pml_csum = {
{
mca_pml_csum_add_procs,
mca_pml_csum_del_procs,
mca_pml_csum_enable,
mca_pml_csum_progress,
mca_pml_csum_add_comm,
mca_pml_csum_del_comm,
mca_pml_csum_irecv_init,
mca_pml_csum_irecv,
mca_pml_csum_recv,
mca_pml_csum_isend_init,
mca_pml_csum_isend,
mca_pml_csum_send,
mca_pml_csum_iprobe,
mca_pml_csum_probe,
mca_pml_csum_start,
mca_pml_csum_dump,
mca_pml_csum_ft_event,
32768,
INT_MAX
}
};
void mca_pml_csum_error_handler( struct mca_btl_base_module_t* btl,
int32_t flags );
int mca_pml_csum_enable(bool enable)
{
if( false == enable ) {
return OMPI_SUCCESS;
}
OBJ_CONSTRUCT(&mca_pml_csum.lock, opal_mutex_t);
/* fragments */
OBJ_CONSTRUCT(&mca_pml_csum.rdma_frags, ompi_free_list_t);
ompi_free_list_init_new( &mca_pml_csum.rdma_frags,
sizeof(mca_pml_csum_rdma_frag_t),
CACHE_LINE_SIZE,
OBJ_CLASS(mca_pml_csum_rdma_frag_t),
0,CACHE_LINE_SIZE,
mca_pml_csum.free_list_num,
mca_pml_csum.free_list_max,
mca_pml_csum.free_list_inc,
NULL );
OBJ_CONSTRUCT(&mca_pml_csum.recv_frags, ompi_free_list_t);
ompi_free_list_init_new( &mca_pml_csum.recv_frags,
sizeof(mca_pml_csum_recv_frag_t) + mca_pml_csum.unexpected_limit,
CACHE_LINE_SIZE,
OBJ_CLASS(mca_pml_csum_recv_frag_t),
0,CACHE_LINE_SIZE,
mca_pml_csum.free_list_num,
mca_pml_csum.free_list_max,
mca_pml_csum.free_list_inc,
NULL );
OBJ_CONSTRUCT(&mca_pml_csum.pending_pckts, ompi_free_list_t);
ompi_free_list_init_new( &mca_pml_csum.pending_pckts,
sizeof(mca_pml_csum_pckt_pending_t),
CACHE_LINE_SIZE,
OBJ_CLASS(mca_pml_csum_pckt_pending_t),
0,CACHE_LINE_SIZE,
mca_pml_csum.free_list_num,
mca_pml_csum.free_list_max,
mca_pml_csum.free_list_inc,
NULL );
OBJ_CONSTRUCT(&mca_pml_csum.buffers, ompi_free_list_t);
OBJ_CONSTRUCT(&mca_pml_csum.send_ranges, ompi_free_list_t);
ompi_free_list_init_new( &mca_pml_csum.send_ranges,
sizeof(mca_pml_csum_send_range_t) +
(mca_pml_csum.max_send_per_range - 1) * sizeof(mca_pml_csum_com_btl_t),
CACHE_LINE_SIZE,
OBJ_CLASS(mca_pml_csum_send_range_t),
0,CACHE_LINE_SIZE,
mca_pml_csum.free_list_num,
mca_pml_csum.free_list_max,
mca_pml_csum.free_list_inc,
NULL );
/* pending operations */
OBJ_CONSTRUCT(&mca_pml_csum.send_pending, opal_list_t);
OBJ_CONSTRUCT(&mca_pml_csum.recv_pending, opal_list_t);
OBJ_CONSTRUCT(&mca_pml_csum.pckt_pending, opal_list_t);
OBJ_CONSTRUCT(&mca_pml_csum.rdma_pending, opal_list_t);
/* missing communicator pending list */
OBJ_CONSTRUCT(&mca_pml_csum.non_existing_communicator_pending, opal_list_t);
/**
* If we get here this is the PML who get selected for the run. We
* should get ownership for the send and receive requests list, and
* initialize them with the size of our own requests.
*/
ompi_free_list_init_new( &mca_pml_base_send_requests,
sizeof(mca_pml_csum_send_request_t) +
(mca_pml_csum.max_rdma_per_request - 1) *
sizeof(mca_pml_csum_com_btl_t),
CACHE_LINE_SIZE,
OBJ_CLASS(mca_pml_csum_send_request_t),
0,CACHE_LINE_SIZE,
mca_pml_csum.free_list_num,
mca_pml_csum.free_list_max,
mca_pml_csum.free_list_inc,
NULL );
ompi_free_list_init_new( &mca_pml_base_recv_requests,
sizeof(mca_pml_csum_recv_request_t) +
(mca_pml_csum.max_rdma_per_request - 1) *
sizeof(mca_pml_csum_com_btl_t),
CACHE_LINE_SIZE,
OBJ_CLASS(mca_pml_csum_recv_request_t),
0,CACHE_LINE_SIZE,
mca_pml_csum.free_list_num,
mca_pml_csum.free_list_max,
mca_pml_csum.free_list_inc,
NULL );
mca_pml_csum.enabled = true;
return OMPI_SUCCESS;
}
/**
 * Create the PML-private state for a newly created communicator:
 * allocate per-peer matching queues, then drain any fragments that
 * arrived for this context id before the communicator existed locally.
 *
 * @param comm  newly created communicator
 * @return OMPI_SUCCESS or OMPI_ERR_OUT_OF_RESOURCE
 */
int mca_pml_csum_add_comm(ompi_communicator_t* comm)
{
    /* allocate pml specific comm data */
    mca_pml_csum_comm_t* pml_comm = OBJ_NEW(mca_pml_csum_comm_t);
    opal_list_item_t *item, *next_item;
    mca_pml_csum_recv_frag_t* frag;
    mca_pml_csum_comm_proc_t* pml_proc;
    mca_pml_csum_match_hdr_t* hdr;
    int i;

    if (NULL == pml_comm) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* NOTE(review): the return code of init_size is ignored here; a failed
     * allocation would leave pml_comm->procs NULL — TODO confirm that is
     * acceptable to the callers of add_comm. */
    mca_pml_csum_comm_init_size(pml_comm, comm->c_remote_group->grp_proc_count);
    comm->c_pml_comm = pml_comm;

    for( i = 0; i < comm->c_remote_group->grp_proc_count; i++ ) {
        pml_comm->procs[i].ompi_proc = ompi_group_peer_lookup(comm->c_remote_group,i);
    }

    /* Grab all related messages from the non_existing_communicator pending queue */
    for( item = opal_list_get_first(&mca_pml_csum.non_existing_communicator_pending);
         item != opal_list_get_end(&mca_pml_csum.non_existing_communicator_pending);
         item = next_item ) {
        frag = (mca_pml_csum_recv_frag_t*)item;
        /* capture the successor now — the current item may be removed below */
        next_item = opal_list_get_next(item);
        hdr = &frag->hdr.hdr_match;

        /* Is this fragment for the current communicator ? */
        if( frag->hdr.hdr_match.hdr_ctx != comm->c_contextid )
            continue;

        /* As we now know we work on a fragment for this communicator
         * we should remove it from the
         * non_existing_communicator_pending list. */
        opal_list_remove_item( &mca_pml_csum.non_existing_communicator_pending,
                               item );

      add_fragment_to_unexpected:

        /* We generate the MSG_ARRIVED event as soon as the PML is aware
         * of a matching fragment arrival. Independing if it is received
         * on the correct order or not. This will allow the tools to
         * figure out if the messages are not received in the correct
         * order (if multiple network interfaces).
         */
        PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm,
                               hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);

        /* There is no matching to be done, and no lock to be held on the communicator as
         * we know at this point that the communicator has not yet been returned to the user.
         * The only required protection is around the non_existing_communicator_pending queue.
         * We just have to push the fragment into the unexpected list of the corresponding
         * proc, or into the out-of-order (cant_match) list.
         */
        pml_proc = &(pml_comm->procs[hdr->hdr_src]);

        if( ((uint16_t)hdr->hdr_seq) == ((uint16_t)pml_proc->expected_sequence) ) {
            /* We're now expecting the next sequence number. */
            pml_proc->expected_sequence++;
            opal_list_append( &pml_proc->unexpected_frags, (opal_list_item_t*)frag );
            PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_INSERT_IN_UNEX_Q, comm,
                                   hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
            /* And now the ugly part. As some fragments can be inserted in the cant_match list,
             * every time we succesfully add a fragment in the unexpected list we have to make
             * sure the next one is not in the cant_match. Otherwise, we will endup in a deadlock
             * situation as the cant_match is only checked when a new fragment is received from
             * the network.
             */
            for(frag = (mca_pml_csum_recv_frag_t *)opal_list_get_first(&pml_proc->frags_cant_match);
                frag != (mca_pml_csum_recv_frag_t *)opal_list_get_end(&pml_proc->frags_cant_match);
                frag = (mca_pml_csum_recv_frag_t *)opal_list_get_next(frag)) {
                hdr = &frag->hdr.hdr_match;
                /* If the message has the next expected seq from that proc...  */
                if(hdr->hdr_seq != pml_proc->expected_sequence)
                    continue;

                opal_list_remove_item(&pml_proc->frags_cant_match, (opal_list_item_t*)frag);
                /* jump back and treat the promoted fragment exactly like a
                 * freshly matched one (PERUSE event, unexpected list, ...) */
                goto add_fragment_to_unexpected;
            }
        } else {
            opal_list_append( &pml_proc->frags_cant_match, (opal_list_item_t*)frag );
        }
    }
    return OMPI_SUCCESS;
}
/**
 * Release the PML-private state attached to a communicator when it is
 * destroyed; the object destructor tears down the matching queues.
 */
int mca_pml_csum_del_comm(ompi_communicator_t* comm)
{
    OBJ_RELEASE(comm->c_pml_comm);
    comm->c_pml_comm = NULL;
    return OMPI_SUCCESS;
}
/*
* For each proc setup a datastructure that indicates the BTLs
* that can be used to reach the destination.
*
*/
int mca_pml_csum_add_procs(ompi_proc_t** procs, size_t nprocs)
{
    opal_bitmap_t reachable;
    int rc;
    size_t i;

    if(nprocs == 0)
        return OMPI_SUCCESS;

    /* we don't have any endpoint data we need to cache on the
       ompi_proc_t, so set proc_pml to NULL */
    for (i = 0 ; i < nprocs ; ++i) {
        procs[i]->proc_pml = NULL;
    }

    OBJ_CONSTRUCT(&reachable, opal_bitmap_t);
    rc = opal_bitmap_init(&reachable, (int)nprocs);
    if(OMPI_SUCCESS != rc)
        return rc;

    /*
     * JJH: Disable this in FT enabled builds since
     * we use a wrapper PML. It will cause this check to
     * return failure as all processes will return the wrapper PML
     * component in use instead of the wrapped PML component underneath.
     */
#if OPAL_ENABLE_FT == 0
    /* make sure remote procs are using the same PML as us */
    if (OMPI_SUCCESS != (rc = mca_pml_base_pml_check_selected("csum",
                                                              procs,
                                                              nprocs))) {
        /* NOTE(review): `reachable` is not destructed on this early
         * return — TODO confirm whether that cleanup gap matters. */
        return rc;
    }
#endif

    rc = mca_bml.bml_add_procs( nprocs,
                                procs,
                                &reachable );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    /* register one BML receive callback per csum header type */
    rc = mca_bml.bml_register( MCA_PML_CSUM_HDR_TYPE_MATCH,
                               mca_pml_csum_recv_frag_callback_match,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_CSUM_HDR_TYPE_RNDV,
                               mca_pml_csum_recv_frag_callback_rndv,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_CSUM_HDR_TYPE_RGET,
                               mca_pml_csum_recv_frag_callback_rget,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_CSUM_HDR_TYPE_ACK,
                               mca_pml_csum_recv_frag_callback_ack,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_CSUM_HDR_TYPE_FRAG,
                               mca_pml_csum_recv_frag_callback_frag,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_CSUM_HDR_TYPE_PUT,
                               mca_pml_csum_recv_frag_callback_put,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_CSUM_HDR_TYPE_FIN,
                               mca_pml_csum_recv_frag_callback_fin,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    /* register error handlers */
    rc = mca_bml.bml_register_error(mca_pml_csum_error_handler);
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    /* success also falls through here: the bitmap is destructed and the
     * last rc (OMPI_SUCCESS on the happy path) is returned */
  cleanup_and_return:
    OBJ_DESTRUCT(&reachable);

    return rc;
}
/*
* iterate through each proc and notify any PTLs associated
* with the proc that it is/has gone away
*/
int mca_pml_csum_del_procs(ompi_proc_t** procs, size_t nprocs)
{
    /* the BML fans the notification out to every BTL holding an
     * endpoint; note the (count, array) argument order of the BML call
     * is the reverse of this function's signature */
    return mca_bml.bml_del_procs(nprocs, procs);
}
/*
* diagnostics
*/
/**
 * Diagnostic dump: print every rank on the communicator and ask each
 * eager-path BTL on that peer's endpoint to dump its own state.
 */
int mca_pml_csum_dump(struct ompi_communicator_t* comm, int verbose)
{
    struct mca_pml_comm_t* pml_comm = comm->c_pml_comm;
    int rank;

    /* iterate through all procs on communicator */
    for (rank = 0; rank < (int)pml_comm->num_procs; rank++) {
        mca_pml_csum_comm_proc_t* peer = &pml_comm->procs[rank];
        mca_bml_base_endpoint_t* endpoint =
            (mca_bml_base_endpoint_t*)peer->ompi_proc->proc_bml;
        size_t idx;

        opal_output(0, "[Rank %d]\n", rank);

        /* dump all receive queues */

        /* dump all btls on the eager array */
        for (idx = 0; idx < endpoint->btl_eager.arr_size; idx++) {
            mca_bml_base_btl_t* eager_btl = &endpoint->btl_eager.bml_btls[idx];
            eager_btl->btl->btl_dump(eager_btl->btl,
                                     eager_btl->btl_endpoint,
                                     verbose);
        }
    }
    return OMPI_SUCCESS;
}
/* Completion callback for an outgoing FIN control message: once the
 * BTL releases the descriptor, resources may have been freed, so retry
 * any work queued while this BTL was out of resources.  The btl/ep/
 * status arguments are required by the callback signature but unused. */
static void mca_pml_csum_fin_completion( mca_btl_base_module_t* btl,
                                         struct mca_btl_base_endpoint_t* ep,
                                         struct mca_btl_base_descriptor_t* des,
                                         int status )
{
    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;

    /* check for pending requests */
    MCA_PML_CSUM_PROGRESS_PENDING(bml_btl);
}
/**
 * Send a FIN control message to @a proc acknowledging completion of an
 * RDMA operation described by @a hdr_des.  If no descriptor can be
 * allocated or the send fails, the FIN is queued on the global pending
 * list (MCA_PML_CSUM_ADD_FIN_TO_PENDING) and retried later.
 *
 * @return OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE when queued.
 */
int mca_pml_csum_send_fin( ompi_proc_t* proc,
                           mca_bml_base_btl_t* bml_btl,
                           void *hdr_des,
                           uint8_t order,
                           uint32_t status )
{
    mca_btl_base_descriptor_t* fin;
    mca_pml_csum_fin_hdr_t* hdr;
    int rc;
    /* checksum only when globally enabled AND this BTL asked for it */
    bool do_csum = mca_pml_csum.enable_csum &&
        (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);

    mca_bml_base_alloc(bml_btl, &fin, order, sizeof(mca_pml_csum_fin_hdr_t),
                       MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);

    if(NULL == fin) {
        MCA_PML_CSUM_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    fin->des_cbfunc = mca_pml_csum_fin_completion;
    fin->des_cbdata = NULL;

    /* fill in header */
    hdr = (mca_pml_csum_fin_hdr_t*)fin->des_src->seg_addr.pval;
    hdr->hdr_common.hdr_flags = 0;
    hdr->hdr_common.hdr_type = MCA_PML_CSUM_HDR_TYPE_FIN;
    /* csum field must be zero while the checksum itself is computed */
    hdr->hdr_common.hdr_csum = 0;
    hdr->hdr_des.pval = hdr_des;
    hdr->hdr_fail = status;
    hdr->hdr_common.hdr_csum = (do_csum ?
        opal_csum16(hdr, sizeof(mca_pml_csum_fin_hdr_t)) : OPAL_CSUM_ZERO);

    if(do_csum) {
        OMPI_CSUM_CSUM_DEBUG((0, "%s: Sending \'FIN\' with header csum:0x%04x\n",
                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hdr->hdr_common.hdr_csum));
    }
    /* byte-swap to network order only after the checksum is in place */
    csum_hdr_hton(hdr, MCA_PML_CSUM_HDR_TYPE_FIN, proc);

    /* queue request */
    rc = mca_bml_base_send( bml_btl,
                            fin,
                            MCA_PML_CSUM_HDR_TYPE_FIN );
    if( OPAL_LIKELY( rc >= 0 ) ) {
        /* rc == 1: the send completed inline, so BTL resources may have
         * been freed — retry pending operations immediately */
        if( OPAL_LIKELY( 1 == rc ) ) {
            MCA_PML_CSUM_PROGRESS_PENDING(bml_btl);
        }
        return OMPI_SUCCESS;
    }
    mca_bml_base_free(bml_btl, fin);
    MCA_PML_CSUM_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
    return OMPI_ERR_OUT_OF_RESOURCE;
}
/**
 * Retry FIN/ACK control packets queued on pckt_pending.  @a bml_btl is
 * the BTL on which resources were just freed (NOT the destination); a
 * packet is only resent if its peer is reachable through that BTL,
 * otherwise it is re-queued.  Processes at most the number of entries
 * present on entry; stops early if resources run out again.
 */
void mca_pml_csum_process_pending_packets(mca_bml_base_btl_t* bml_btl)
{
    mca_pml_csum_pckt_pending_t *pckt;
    int32_t i, rc, s = (int32_t)opal_list_get_size(&mca_pml_csum.pckt_pending);

    for(i = 0; i < s; i++) {
        mca_bml_base_btl_t *send_dst = NULL;
        /* pop under the lock; other threads may append concurrently */
        OPAL_THREAD_LOCK(&mca_pml_csum.lock);
        pckt = (mca_pml_csum_pckt_pending_t*)
            opal_list_remove_first(&mca_pml_csum.pckt_pending);
        OPAL_THREAD_UNLOCK(&mca_pml_csum.lock);
        if(NULL == pckt)
            break;
        /* prefer the packet's own BTL if it matches the freed one,
         * otherwise look the freed BTL up in the peer's eager array */
        if(pckt->bml_btl != NULL &&
                pckt->bml_btl->btl == bml_btl->btl) {
            send_dst = pckt->bml_btl;
        } else {
            send_dst = mca_bml_base_btl_array_find(
                    &pckt->proc->proc_bml->btl_eager, bml_btl->btl);
        }
        if(NULL == send_dst) {
            /* peer not reachable via this BTL — put the packet back */
            OPAL_THREAD_LOCK(&mca_pml_csum.lock);
            opal_list_append(&mca_pml_csum.pckt_pending,
                             (opal_list_item_t*)pckt);
            OPAL_THREAD_UNLOCK(&mca_pml_csum.lock);
            continue;
        }

        switch(pckt->hdr.hdr_common.hdr_type) {
            case MCA_PML_CSUM_HDR_TYPE_ACK:
                rc = mca_pml_csum_recv_request_ack_send_btl(pckt->proc,
                        send_dst,
                        pckt->hdr.hdr_ack.hdr_src_req.lval,
                        pckt->hdr.hdr_ack.hdr_dst_req.pval,
                        pckt->hdr.hdr_ack.hdr_send_offset,
                        pckt->hdr.hdr_common.hdr_flags & MCA_PML_CSUM_HDR_FLAGS_NORDMA);
                /* return the element first: the ADD_ACK macro below
                 * allocates a fresh one on failure */
                MCA_PML_CSUM_PCKT_PENDING_RETURN(pckt);
                if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
                    MCA_PML_CSUM_ADD_ACK_TO_PENDING(pckt->proc,
                            pckt->hdr.hdr_ack.hdr_src_req.lval,
                            pckt->hdr.hdr_ack.hdr_dst_req.pval,
                            pckt->hdr.hdr_ack.hdr_send_offset);
                    return;
                }
                break;
            case MCA_PML_CSUM_HDR_TYPE_FIN:
                /* send_fin re-queues internally on failure */
                rc = mca_pml_csum_send_fin(pckt->proc, send_dst,
                                           pckt->hdr.hdr_fin.hdr_des.pval,
                                           pckt->order,
                                           pckt->hdr.hdr_fin.hdr_fail);
                MCA_PML_CSUM_PCKT_PENDING_RETURN(pckt);
                if(OMPI_ERR_OUT_OF_RESOURCE == rc)
                    return;
                break;
            default:
                opal_output(0, "[%s:%d] wrong header type\n",
                            __FILE__, __LINE__);
                break;
        }
    }
}
/**
 * Retry queued RDMA fragments that previously failed for lack of BTL
 * resources.  PUT fragments bump their retry counter and go through the
 * send path; anything else is retried as a GET.  Processes at most the
 * number of entries present on entry and stops early if resources run
 * out again.
 */
void mca_pml_csum_process_pending_rdma(void)
{
    int32_t remaining =
        (int32_t)opal_list_get_size(&mca_pml_csum.rdma_pending);

    while (remaining-- > 0) {
        mca_pml_csum_rdma_frag_t* pending;
        int32_t rc;

        /* pop under the lock; other threads may queue concurrently */
        OPAL_THREAD_LOCK(&mca_pml_csum.lock);
        pending = (mca_pml_csum_rdma_frag_t*)
            opal_list_remove_first(&mca_pml_csum.rdma_pending);
        OPAL_THREAD_UNLOCK(&mca_pml_csum.lock);
        if (NULL == pending) {
            break;
        }

        if (MCA_PML_CSUM_RDMA_PUT == pending->rdma_state) {
            pending->retries++;
            rc = mca_pml_csum_send_request_put_frag(pending);
        } else {
            rc = mca_pml_csum_recv_request_get_frag(pending);
        }
        if (OMPI_ERR_OUT_OF_RESOURCE == rc) {
            break;
        }
    }
}
/* BTL error callback registered with the BML: the csum PML has no
 * recovery path, so any transport-level failure aborts the job.  The
 * btl/flags arguments are required by the callback signature only. */
void mca_pml_csum_error_handler(
        struct mca_btl_base_module_t* btl,
        int32_t flags) {
    orte_errmgr.abort(-1, NULL);
}
#if OPAL_ENABLE_FT == 0
/* Fault tolerance compiled out: checkpoint/restart notifications are
 * no-ops for this PML. */
int mca_pml_csum_ft_event( int state ) {
    return OMPI_SUCCESS;
}
#else
/**
 * Checkpoint/restart event handler.  Called by the C/R framework at
 * each state transition (CHECKPOINT, CONTINUE, RESTART_PRE, RESTART,
 * TERM).  Before the BML pass it refreshes the proc structures; after
 * the BML pass it re-runs the modex and re-adds the peers when doing a
 * real restart (or a "continue like restart").
 */
int mca_pml_csum_ft_event( int state )
{
    /* CONTINUE is delivered in two passes; this flag toggles between
     * them so the restart-like work only runs on the second pass */
    static bool first_continue_pass = false;
    ompi_proc_t** procs = NULL;
    size_t num_procs;
    int ret, p;

    if(OPAL_CRS_CHECKPOINT == state) {
        if( opal_cr_timing_barrier_enabled ) {
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1);
            orte_grpcomm.barrier();
        }

        OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0);
    }
    else if(OPAL_CRS_CONTINUE == state) {
        first_continue_pass = !first_continue_pass;

        if( !first_continue_pass ) {
            if( opal_cr_timing_barrier_enabled ) {
                OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0);
                orte_grpcomm.barrier();
            }
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
        }

        if( ompi_cr_continue_like_restart && !first_continue_pass ) {
            /*
             * Get a list of processes
             */
            procs = ompi_proc_all(&num_procs);
            if(NULL == procs) {
                return OMPI_ERR_OUT_OF_RESOURCE;
            }

            /*
             * Refresh the proc structure, and publish our proc info in the modex.
             * NOTE: Do *not* call ompi_proc_finalize as there are many places in
             *       the code that point to indv. procs in this strucutre. For our
             *       needs here we only need to fix up the modex, bml and pml
             *       references.
             */
            if (OMPI_SUCCESS != (ret = ompi_proc_refresh())) {
                opal_output(0,
                            "pml:csum: ft_event(Restart): proc_refresh Failed %d",
                            ret);
                for(p = 0; p < (int)num_procs; ++p) {
                    OBJ_RELEASE(procs[p]);
                }
                free (procs);
                return ret;
            }
        }
    }
    else if(OPAL_CRS_RESTART_PRE == state ) {
        /* Nothing here */
    }
    else if(OPAL_CRS_RESTART == state ) {
        /*
         * Get a list of processes
         */
        procs = ompi_proc_all(&num_procs);
        if(NULL == procs) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        /*
         * Clean out the modex information since it is invalid now.
         *    orte_grpcomm.purge_proc_attrs();
         * This happens at the ORTE level, so doing it again here will cause
         * some issues with socket caching.
         */

        /*
         * Refresh the proc structure, and publish our proc info in the modex.
         * NOTE: Do *not* call ompi_proc_finalize as there are many places in
         *       the code that point to indv. procs in this strucutre. For our
         *       needs here we only need to fix up the modex, bml and pml
         *       references.
         */
        if (OMPI_SUCCESS != (ret = ompi_proc_refresh())) {
            opal_output(0,
                        "pml:csum: ft_event(Restart): proc_refresh Failed %d",
                        ret);
            for(p = 0; p < (int)num_procs; ++p) {
                OBJ_RELEASE(procs[p]);
            }
            free (procs);
            return ret;
        }
    }
    else if(OPAL_CRS_TERM == state ) {
        ;
    }
    else {
        ;
    }

    /* Call the BML
     * BML is expected to call ft_event in
     * - BTL(s)
     * - MPool(s)
     */
    if( OMPI_SUCCESS != (ret = mca_bml.bml_ft_event(state))) {
        opal_output(0, "pml:base: ft_event: BML ft_event function failed: %d\n",
                    ret);
    }

    if(OPAL_CRS_CHECKPOINT == state) {
        OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P1);

        if( opal_cr_timing_barrier_enabled ) {
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR0);
            /* JJH Cannot barrier here due to progress engine -- orte_grpcomm.barrier();*/
        }
    }
    else if(OPAL_CRS_CONTINUE == state) {
        if( !first_continue_pass ) {
            if( opal_cr_timing_barrier_enabled ) {
                OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
                orte_grpcomm.barrier();
            }
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
        }

        if( ompi_cr_continue_like_restart && !first_continue_pass ) {
            /*
             * Exchange the modex information once again.
             * BTLs will have republished their modex information.
             */
            if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(NULL))) {
                opal_output(0,
                            "pml:csum: ft_event(Restart): Failed orte_grpcomm.modex() = %d",
                            ret);
                return ret;
            }

            /*
             * Startup the PML stack now that the modex is running again
             * Add the new procs (BTLs redo modex recv's)
             */
            if( OMPI_SUCCESS != (ret = mca_pml_csum_add_procs(procs, num_procs) ) ) {
                opal_output(0, "pml:csum: ft_event(Restart): Failed in add_procs (%d)", ret);
                return ret;
            }

            /* Is this barrier necessary ? JJH */
            if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) {
                opal_output(0, "pml:csum: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret);
                return ret;
            }

            if( NULL != procs ) {
                for(p = 0; p < (int)num_procs; ++p) {
                    OBJ_RELEASE(procs[p]);
                }
                free(procs);
                procs = NULL;
            }
        }
        if( !first_continue_pass ) {
            if( opal_cr_timing_barrier_enabled ) {
                OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
                orte_grpcomm.barrier();
            }
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
        }
    }
    else if(OPAL_CRS_RESTART_PRE == state ) {
        /* Nothing here */
    }
    else if(OPAL_CRS_RESTART == state ) {
        /*
         * Exchange the modex information once again.
         * BTLs will have republished their modex information.
         */
        if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(NULL))) {
            opal_output(0,
                        "pml:csum: ft_event(Restart): Failed orte_grpcomm.modex() = %d",
                        ret);
            return ret;
        }

        /*
         * Startup the PML stack now that the modex is running again
         * Add the new procs (BTLs redo modex recv's)
         */
        if( OMPI_SUCCESS != (ret = mca_pml_csum_add_procs(procs, num_procs) ) ) {
            opal_output(0, "pml:csum: ft_event(Restart): Failed in add_procs (%d)", ret);
            return ret;
        }

        /* Is this barrier necessary ? JJH */
        if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) {
            opal_output(0, "pml:csum: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret);
            return ret;
        }

        if( NULL != procs ) {
            for(p = 0; p < (int)num_procs; ++p) {
                OBJ_RELEASE(procs[p]);
            }
            free(procs);
            procs = NULL;
        }
    }
    else if(OPAL_CRS_TERM == state ) {
        ;
    }
    else {
        ;
    }

    return OMPI_SUCCESS;
}
#endif /* OPAL_ENABLE_FT */
/* qsort comparator for mca_pml_csum_com_btl_t: sorts by DESCENDING
 * btl_weight, so the heaviest (fastest) BTL ends up first. */
int mca_pml_csum_com_btl_comp(const void *v1, const void *v2)
{
    const mca_pml_csum_com_btl_t *lhs = (const mca_pml_csum_com_btl_t *) v1;
    const mca_pml_csum_com_btl_t *rhs = (const mca_pml_csum_com_btl_t *) v2;

    if (lhs->bml_btl->btl_weight < rhs->bml_btl->btl_weight) {
        return 1;
    }
    if (lhs->bml_btl->btl_weight > rhs->bml_btl->btl_weight) {
        return -1;
    }
    return 0;
}

328
ompi/mca/pml/csum/pml_csum.h Обычный файл
Просмотреть файл

@ -0,0 +1,328 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2009 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_PML_CSUM_H
#define MCA_PML_CSUM_H
#include "ompi_config.h"
#include "opal/threads/threads.h"
#include "ompi/class/ompi_free_list.h"
#include "ompi/request/request.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/pml/base/pml_base_request.h"
#include "ompi/mca/pml/base/pml_base_bsend.h"
#include "ompi/mca/pml/base/pml_base_sendreq.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/datatype/datatype.h"
#include "pml_csum_hdr.h"
#include "ompi/mca/bml/base/base.h"
#include "ompi/proc/proc.h"
#include "ompi/mca/allocator/base/base.h"
#if OMPI_CSUM_DEBUG
#define OMPI_CSUM_CSUM_DEBUG(x) opal_output x
#else
#define OMPI_CSUM_CSUM_DEBUG(x)
#endif
BEGIN_C_DECLS
/**
 * CSUM PML module
 *
 * Global state for the csum PML: MCA-tunable limits, free lists for
 * fragments/packets, queues of deferred work, and the runtime switch
 * that turns checksumming on/off.
 */
struct mca_pml_csum_t {
    mca_pml_base_module_t super;

    int free_list_num;      /* initial size of free list */
    int free_list_max;      /* maximum size of free list */
    int free_list_inc;      /* number of elements to grow free list */
    size_t send_pipeline_depth;     /* MCA param "send_pipeline_depth" */
    size_t recv_pipeline_depth;     /* MCA param "recv_pipeline_depth" */
    size_t rdma_put_retries_limit;  /* MCA param "rdma_put_retries_limit" */
    int max_rdma_per_request;       /* MCA param "max_rdma_per_request" */
    int max_send_per_range;         /* MCA param "max_send_per_range" */
    bool leave_pinned;
    int leave_pinned_pipeline;

    /* lock queue access */
    opal_mutex_t lock;

    /* free lists */
    ompi_free_list_t rdma_frags;
    ompi_free_list_t recv_frags;
    ompi_free_list_t pending_pckts;
    ompi_free_list_t buffers;
    ompi_free_list_t send_ranges;

    /* list of pending operations */
    opal_list_t pckt_pending;
    opal_list_t send_pending;
    opal_list_t recv_pending;
    opal_list_t rdma_pending;
    /* List of pending fragments without a matching communicator */
    opal_list_t non_existing_communicator_pending;

    bool enabled;   /* set true once the PML has been enabled for the run */
    char* allocator_name;
    mca_allocator_base_module_t* allocator;
    uint32_t unexpected_limit;
    /*Enable or Disable checksum*/
    bool enable_csum;
};
typedef struct mca_pml_csum_t mca_pml_csum_t;

extern mca_pml_csum_t mca_pml_csum;
/*
* PML interface functions.
*/
extern int mca_pml_csum_add_comm(
struct ompi_communicator_t* comm
);
extern int mca_pml_csum_del_comm(
struct ompi_communicator_t* comm
);
extern int mca_pml_csum_add_procs(
struct ompi_proc_t **procs,
size_t nprocs
);
extern int mca_pml_csum_del_procs(
struct ompi_proc_t **procs,
size_t nprocs
);
extern int mca_pml_csum_enable( bool enable );
extern int mca_pml_csum_progress(void);
extern int mca_pml_csum_iprobe( int dst,
int tag,
struct ompi_communicator_t* comm,
int *matched,
ompi_status_public_t* status );
extern int mca_pml_csum_probe( int dst,
int tag,
struct ompi_communicator_t* comm,
ompi_status_public_t* status );
extern int mca_pml_csum_isend_init( void *buf,
size_t count,
ompi_datatype_t *datatype,
int dst,
int tag,
mca_pml_base_send_mode_t mode,
struct ompi_communicator_t* comm,
struct ompi_request_t **request );
extern int mca_pml_csum_isend( void *buf,
size_t count,
ompi_datatype_t *datatype,
int dst,
int tag,
mca_pml_base_send_mode_t mode,
struct ompi_communicator_t* comm,
struct ompi_request_t **request );
extern int mca_pml_csum_send( void *buf,
size_t count,
ompi_datatype_t *datatype,
int dst,
int tag,
mca_pml_base_send_mode_t mode,
struct ompi_communicator_t* comm );
extern int mca_pml_csum_irecv_init( void *buf,
size_t count,
ompi_datatype_t *datatype,
int src,
int tag,
struct ompi_communicator_t* comm,
struct ompi_request_t **request );
extern int mca_pml_csum_irecv( void *buf,
size_t count,
ompi_datatype_t *datatype,
int src,
int tag,
struct ompi_communicator_t* comm,
struct ompi_request_t **request );
extern int mca_pml_csum_recv( void *buf,
size_t count,
ompi_datatype_t *datatype,
int src,
int tag,
struct ompi_communicator_t* comm,
ompi_status_public_t* status );
extern int mca_pml_csum_dump( struct ompi_communicator_t* comm,
int verbose );
extern int mca_pml_csum_start( size_t count,
ompi_request_t** requests );
extern int mca_pml_csum_ft_event( int state );
END_C_DECLS
/* A FIN/ACK control packet that could not be sent for lack of BTL
 * resources; queued on mca_pml_csum.pckt_pending for later retry by
 * mca_pml_csum_process_pending_packets(). */
struct mca_pml_csum_pckt_pending_t {
    ompi_free_list_item_t super;
    ompi_proc_t* proc;                   /* destination process */
    mca_pml_csum_hdr_t hdr;              /* saved header to (re)send */
    struct mca_bml_base_btl_t *bml_btl;  /* preferred BTL; may be NULL */
    uint8_t order;                       /* BTL ordering channel */
};
typedef struct mca_pml_csum_pckt_pending_t mca_pml_csum_pckt_pending_t;
OBJ_CLASS_DECLARATION(mca_pml_csum_pckt_pending_t);
/* Allocate a pending-packet element from the pending_pckts free list.
 * OMPI_FREE_LIST_WAIT may block until an element is available; `rc`
 * receives the free-list status. */
#define MCA_PML_CSUM_PCKT_PENDING_ALLOC(pckt,rc)                    \
do {                                                                \
    ompi_free_list_item_t* item;                                    \
    OMPI_FREE_LIST_WAIT(&mca_pml_csum.pending_pckts, item, rc);     \
    pckt = (mca_pml_csum_pckt_pending_t*)item;                      \
} while (0)

/* Return a pending-packet element to its free list. */
#define MCA_PML_CSUM_PCKT_PENDING_RETURN(pckt)                      \
do {                                                                \
    /* return packet */                                             \
    OMPI_FREE_LIST_RETURN(&mca_pml_csum.pending_pckts,              \
        (ompi_free_list_item_t*)pckt);                              \
} while(0)

/* Queue a FIN that could not be sent (no BTL resources) on the global
 * pckt_pending list for later retry.
 *   P = destination ompi_proc_t*, D = descriptor pointer being FINed,
 *   B = preferred bml_btl, O = BTL ordering channel, S = fail status */
#define MCA_PML_CSUM_ADD_FIN_TO_PENDING(P, D, B, O, S)              \
    do {                                                            \
        mca_pml_csum_pckt_pending_t *_pckt;                         \
        int _rc;                                                    \
                                                                    \
        MCA_PML_CSUM_PCKT_PENDING_ALLOC(_pckt,_rc);                 \
        _pckt->hdr.hdr_common.hdr_type = MCA_PML_CSUM_HDR_TYPE_FIN; \
        _pckt->hdr.hdr_fin.hdr_des.pval = (D);                      \
        _pckt->hdr.hdr_fin.hdr_fail = (S);                          \
        _pckt->proc = (P);                                          \
        _pckt->bml_btl = (B);                                       \
        _pckt->order = (O);                                         \
        OPAL_THREAD_LOCK(&mca_pml_csum.lock);                       \
        opal_list_append(&mca_pml_csum.pckt_pending,                \
                (opal_list_item_t*)_pckt);                          \
        OPAL_THREAD_UNLOCK(&mca_pml_csum.lock);                     \
    } while(0)
int mca_pml_csum_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
void *hdr_des, uint8_t order, uint32_t status);
/* This function tries to resend FIN/ACK packets from pckt_pending queue.
* Packets are added to the queue when sending of FIN or ACK is failed due to
* resource unavailability. bml_btl passed to the function doesn't represents
* packet's destination, it represents BTL on which resource was freed, so only
* this BTL should be considered for resending packets */
void mca_pml_csum_process_pending_packets(mca_bml_base_btl_t* bml_btl);
/* This function retries failed PUT/GET operations on frag. When RDMA operation
* cannot be accomplished for some reason, frag is put on the rdma_pending list.
* Later the operation is retried. The destination of RDMA operation is stored
* inside the frag structure */
void mca_pml_csum_process_pending_rdma(void);
/* Drain every queue of work deferred while `bml_btl` was out of
 * resources: control packets, receive requests, send requests and RDMA
 * fragments, in that order. */
#define MCA_PML_CSUM_PROGRESS_PENDING(bml_btl)                      \
    do {                                                            \
        if(opal_list_get_size(&mca_pml_csum.pckt_pending))          \
            mca_pml_csum_process_pending_packets(bml_btl);          \
        if(opal_list_get_size(&mca_pml_csum.recv_pending))          \
            mca_pml_csum_recv_request_process_pending();            \
        if(opal_list_get_size(&mca_pml_csum.send_pending))          \
            mca_pml_csum_send_request_process_pending(bml_btl);     \
        if(opal_list_get_size(&mca_pml_csum.rdma_pending))          \
            mca_pml_csum_process_pending_rdma();                    \
    } while (0)
/*
 * Compute the total number of bytes on supplied descriptor
 *
 * NOTE: `length` is accumulated with += and must be initialized by the
 * caller before invoking this macro; `hdrlen` is subtracted once at the
 * end to exclude the header from the payload count.
 */
#define MCA_PML_CSUM_COMPUTE_SEGMENT_LENGTH(segments, count, hdrlen, length) \
do {                                                                         \
    size_t i;                                                                \
                                                                             \
    for( i = 0; i < count; i++ ) {                                           \
        length += segments[i].seg_len;                                       \
    }                                                                        \
    length -= hdrlen;                                                        \
} while(0)

/* represent BTL chosen for sending request */
struct mca_pml_csum_com_btl_t {
    mca_bml_base_btl_t *bml_btl;                    /* transport to use */
    struct mca_mpool_base_registration_t* btl_reg;  /* memory registration, if any */
    size_t length;                                  /* bytes assigned to this BTL */
};
typedef struct mca_pml_csum_com_btl_t mca_pml_csum_com_btl_t;

/* qsort comparator: orders entries by descending btl_weight */
int mca_pml_csum_com_btl_comp(const void *v1, const void *v2);
/* Calculate what percentage of a message to send through each BTL according to
 * relative weight */
static inline void
mca_pml_csum_calc_weighted_length( mca_pml_csum_com_btl_t *btls, int num_btls, size_t size,
                                   double weight_total )
{
    size_t remaining = size;
    int idx;

    /* shortcut for common case for only one BTL */
    if( OPAL_LIKELY(1 == num_btls) ) {
        btls[0].length = size;
        return;
    }

    /* sort BTLs according of their weights so BTLs with smaller weight will
     * not hijack all of the traffic */
    qsort( btls, num_btls, sizeof(mca_pml_csum_com_btl_t),
           mca_pml_csum_com_btl_comp );

    for( idx = 0; idx < num_btls; idx++ ) {
        mca_bml_base_btl_t* candidate = btls[idx].bml_btl;
        size_t share = 0;

        if( OPAL_UNLIKELY(0 != remaining) ) {
            /* small remainders go wholesale to this BTL; larger ones get
             * a weight-proportional slice of the original message size */
            if( remaining > candidate->btl->btl_eager_limit ) {
                share = (size_t)(size * (candidate->btl_weight / weight_total));
            } else {
                share = remaining;
            }
            if( share > remaining ) {
                share = remaining;
            }
            remaining -= share;
        }
        btls[idx].length = share;
    }
    /* account for rounding errors */
    btls[0].length += remaining;
}
#endif

98
ompi/mca/pml/csum/pml_csum_comm.c Обычный файл
Просмотреть файл

@ -0,0 +1,98 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <string.h>
#include "pml_csum.h"
#include "pml_csum_comm.h"
/* Constructor: initialize per-peer matching state.  The first expected
 * incoming sequence number is 1 while the local send counter starts at
 * 0 (presumably incremented before first use on the send path — see the
 * send request code). */
static void mca_pml_csum_comm_proc_construct(mca_pml_csum_comm_proc_t* proc)
{
    proc->expected_sequence = 1;
    proc->ompi_proc = NULL;
    proc->send_sequence = 0;
    OBJ_CONSTRUCT(&proc->frags_cant_match, opal_list_t);
    OBJ_CONSTRUCT(&proc->specific_receives, opal_list_t);
    OBJ_CONSTRUCT(&proc->unexpected_frags, opal_list_t);
}
/* Destructor: tear down the three per-peer queues built above. */
static void mca_pml_csum_comm_proc_destruct(mca_pml_csum_comm_proc_t* proc)
{
    OBJ_DESTRUCT(&proc->frags_cant_match);
    OBJ_DESTRUCT(&proc->specific_receives);
    OBJ_DESTRUCT(&proc->unexpected_frags);
}
static OBJ_CLASS_INSTANCE(
mca_pml_csum_comm_proc_t,
opal_object_t,
mca_pml_csum_comm_proc_construct,
mca_pml_csum_comm_proc_destruct);
/* Constructor: initialize communicator-wide matching state; the
 * per-peer array is allocated later by mca_pml_csum_comm_init_size(). */
static void mca_pml_csum_comm_construct(mca_pml_csum_comm_t* comm)
{
    OBJ_CONSTRUCT(&comm->wild_receives, opal_list_t);
    OBJ_CONSTRUCT(&comm->matching_lock, opal_mutex_t);
    comm->recv_sequence = 0;
    comm->procs = NULL;
    comm->num_procs = 0;
}
/* Destructor: destruct every per-peer entry, release the peer array,
 * then tear down the wild-receive queue and the matching lock. */
static void mca_pml_csum_comm_destruct(mca_pml_csum_comm_t* comm)
{
    size_t peer;

    for (peer = 0; peer < comm->num_procs; peer++) {
        OBJ_DESTRUCT(&comm->procs[peer]);
    }
    if (NULL != comm->procs) {
        free(comm->procs);
    }
    OBJ_DESTRUCT(&comm->wild_receives);
    OBJ_DESTRUCT(&comm->matching_lock);
}
OBJ_CLASS_INSTANCE(
mca_pml_csum_comm_t,
opal_object_t,
mca_pml_csum_comm_construct,
mca_pml_csum_comm_destruct);
/**
 * Allocate and construct the per-peer array for a communicator.
 *
 * @param comm  PML communicator state to populate
 * @param size  number of peers (remote group size)
 * @return OMPI_SUCCESS or OMPI_ERR_OUT_OF_RESOURCE
 */
int mca_pml_csum_comm_init_size(mca_pml_csum_comm_t* comm, size_t size)
{
    size_t i;

    /* An empty remote group needs no per-peer state.  Handle it
     * explicitly: malloc(0) may legally return NULL, which the check
     * below would misreport as an out-of-resource error. */
    if (0 == size) {
        comm->procs = NULL;
        comm->num_procs = 0;
        return OMPI_SUCCESS;
    }

    /* send message sequence-number support - sender side */
    comm->procs = (mca_pml_csum_comm_proc_t*)malloc(sizeof(mca_pml_csum_comm_proc_t)*size);
    if(NULL == comm->procs) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    for(i=0; i<size; i++) {
        OBJ_CONSTRUCT(comm->procs+i, mca_pml_csum_comm_proc_t);
    }
    comm->num_procs = size;

    return OMPI_SUCCESS;
}

83
ompi/mca/pml/csum/pml_csum_comm.h Обычный файл
Просмотреть файл

@ -0,0 +1,83 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
/* Guard name fixed: this header was cloned from ob1 and still carried
 * MCA_PML_OB1_COMM_H, which would collide with pml_ob1_comm.h if both
 * headers were ever pulled into the same translation unit. */
#ifndef MCA_PML_CSUM_COMM_H
#define MCA_PML_CSUM_COMM_H
#include "opal/threads/mutex.h"
#include "opal/class/opal_list.h"
#include "ompi/proc/proc.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/* Per-peer matching state cached on the communicator. */
struct mca_pml_csum_comm_proc_t {
    opal_object_t super;
    uint16_t expected_sequence; /**< send message sequence number - receiver side */
    struct ompi_proc_t* ompi_proc;
#if OMPI_HAVE_THREAD_SUPPORT
    volatile int32_t send_sequence; /**< send side sequence number */
#else
    int32_t send_sequence; /**< send side sequence number */
#endif
    opal_list_t frags_cant_match;  /**< out-of-order fragment queues */
    opal_list_t specific_receives; /**< queues of unmatched specific receives */
    opal_list_t unexpected_frags;  /**< unexpected fragment queues */
};
typedef struct mca_pml_csum_comm_proc_t mca_pml_csum_comm_proc_t;
/**
 * Cached on ompi_communicator_t to hold queues/state
 * used by the PML<->PTL interface for matching logic.
 *
 * NOTE: the struct tag is the generic mca_pml_comm_t; the csum-specific
 * name exists only through the typedef below.
 */
struct mca_pml_comm_t {
    opal_object_t super;
#if OMPI_HAVE_THREAD_SUPPORT
    volatile uint32_t recv_sequence; /**< recv request sequence number - receiver side */
#else
    uint32_t recv_sequence; /**< recv request sequence number - receiver side */
#endif
    opal_mutex_t matching_lock; /**< matching lock */
    opal_list_t wild_receives; /**< queue of unmatched wild (source process not specified) receives */
    mca_pml_csum_comm_proc_t* procs;  /**< per-peer state, num_procs entries */
    size_t num_procs;
};
typedef struct mca_pml_comm_t mca_pml_csum_comm_t;
OBJ_CLASS_DECLARATION(mca_pml_csum_comm_t);
/**
* Initialize an instance of mca_pml_csum_comm_t based on the communicator size.
*
* @param comm Instance of mca_pml_csum_comm_t
* @param size Size of communicator
* @return OMPI_SUCCESS or error status on failure.
*/
extern int mca_pml_csum_comm_init_size(mca_pml_csum_comm_t* comm, size_t size);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

263
ompi/mca/pml/csum/pml_csum_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,263 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2009 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "opal/sys/cache.h"
#include "opal/event/event.h"
#include "mpi.h"
#include "ompi/runtime/params.h"
#include "ompi/datatype/convertor.h"
#include "ompi/mca/pml/pml.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/show_help.h"
#include "ompi/mca/pml/base/pml_base_bsend.h"
#include "pml_csum.h"
#include "pml_csum_hdr.h"
#include "pml_csum_sendreq.h"
#include "pml_csum_recvreq.h"
#include "pml_csum_rdmafrag.h"
#include "pml_csum_recvfrag.h"
#include "ompi/mca/bml/base/base.h"
#include "pml_csum_component.h"
#include "ompi/mca/allocator/base/base.h"
/* Free-list class for control packets that could not be sent
 * immediately and are queued for later (re)transmission. */
OBJ_CLASS_INSTANCE( mca_pml_csum_pckt_pending_t,
                    ompi_free_list_item_t,
                    NULL,
                    NULL );

/* Component lifecycle callbacks (defined below in this file). */
static int mca_pml_csum_component_open(void);
static int mca_pml_csum_component_close(void);
static mca_pml_base_module_t*
mca_pml_csum_component_init( int* priority, bool enable_progress_threads,
                             bool enable_mpi_threads );
static int mca_pml_csum_component_fini(void);

/* Descriptor for the "csum" PML component registered with the MCA
 * framework; selection priority is decided in component_init(). */
mca_pml_base_component_2_0_0_t mca_pml_csum_component = {

    /* First, the mca_base_component_t struct containing meta
       information about the component itself */

    {
        MCA_PML_BASE_VERSION_2_0_0,

        "csum",                       /* MCA component name */
        OMPI_MAJOR_VERSION,           /* MCA component major version */
        OMPI_MINOR_VERSION,           /* MCA component minor version */
        OMPI_RELEASE_VERSION,         /* MCA component release version */
        mca_pml_csum_component_open,  /* component open */
        mca_pml_csum_component_close  /* component close */
    },
    {
        /* The component is checkpoint ready */
        MCA_BASE_METADATA_PARAM_CHECKPOINT
    },

    mca_pml_csum_component_init,      /* component init */
    mca_pml_csum_component_fini       /* component finalize */
};

/* Allocator upcalls for unexpected-message buffers; plain malloc/free
 * wrappers implemented at the bottom of this file. */
void *mca_pml_csum_seg_alloc( struct mca_mpool_base_module_t* mpool,
                              size_t* size,
                              mca_mpool_base_registration_t** registration);

void mca_pml_csum_seg_free( struct mca_mpool_base_module_t* mpool,
                            void* segment );
/**
 * Register an integer MCA parameter under "pml_csum_<param_name>" and
 * return its effective value (the user-supplied value if one was set,
 * otherwise @a default_value).
 */
static inline int mca_pml_csum_param_register_int(
    const char* param_name,
    int default_value)
{
    int value = default_value;
    int index = mca_base_param_register_int("pml", "csum", param_name,
                                            NULL, default_value);

    mca_base_param_lookup_int(index, &value);
    return value;
}
/**
 * Component open: register MCA tuning parameters, locate and
 * initialize the allocator used for buffering unexpected messages,
 * and open the BML framework.
 *
 * @return OMPI_SUCCESS, or OMPI_ERROR if the named allocator component
 *         cannot be found or initialized.
 */
static int mca_pml_csum_component_open(void)
{
    mca_allocator_base_component_t* allocator_component;

    /* Free-list and pipeline depth knobs (same set as ob1). */
    mca_pml_csum.free_list_num =
        mca_pml_csum_param_register_int("free_list_num", 4);
    mca_pml_csum.free_list_max =
        mca_pml_csum_param_register_int("free_list_max", -1);
    mca_pml_csum.free_list_inc =
        mca_pml_csum_param_register_int("free_list_inc", 64);
    mca_pml_csum.send_pipeline_depth =
        mca_pml_csum_param_register_int("send_pipeline_depth", 3);
    mca_pml_csum.recv_pipeline_depth =
        mca_pml_csum_param_register_int("recv_pipeline_depth", 4);
    mca_pml_csum.rdma_put_retries_limit =
        mca_pml_csum_param_register_int("rdma_put_retries_limit", 5);
    mca_pml_csum.max_rdma_per_request =
        mca_pml_csum_param_register_int("max_rdma_per_request", 4);
    mca_pml_csum.max_send_per_range =
        mca_pml_csum_param_register_int("max_send_per_range", 4);
    mca_pml_csum.unexpected_limit =
        mca_pml_csum_param_register_int("unexpected_limit", 128);

    /* Which allocator component backs the unexpected-message buffers. */
    mca_base_param_reg_string(&mca_pml_csum_component.pmlm_version,
                              "allocator",
                              "Name of allocator component for unexpected messages",
                              false, false,
                              "bucket",
                              &mca_pml_csum.allocator_name);

    allocator_component = mca_allocator_component_lookup( mca_pml_csum.allocator_name );
    if(NULL == allocator_component) {
        opal_output(0, "mca_pml_csum_component_open: can't find allocator: %s\n", mca_pml_csum.allocator_name);
        return OMPI_ERROR;
    }

    mca_pml_csum.allocator = allocator_component->allocator_init(true,
                                                                 mca_pml_csum_seg_alloc,
                                                                 mca_pml_csum_seg_free, NULL);
    if(NULL == mca_pml_csum.allocator) {
        opal_output(0, "mca_pml_csum_component_open: unable to initialize allocator\n");
        return OMPI_ERROR;
    }

    /* default is not to checksum all data; the dr-like checksum is
       compiled in via configure and switched on at runtime with
       -mca pml_csum_enable_csum 1 */
    mca_pml_csum.enable_csum =
        mca_pml_csum_param_register_int("enable_csum", 0);
    ompi_convertor_checksum_enable(mca_pml_csum.enable_csum);

    mca_pml_csum.enabled = false;
    return mca_bml_base_open();
}
/**
 * Component close: shut down the BML framework and release the
 * allocator-name string registered in component_open().
 *
 * @return OMPI_SUCCESS, or the error code from mca_bml_base_close().
 */
static int mca_pml_csum_component_close(void)
{
    int rc;

    if (OMPI_SUCCESS != (rc = mca_bml_base_close())) {
        return rc;
    }

    if (NULL != mca_pml_csum.allocator_name) {
        free(mca_pml_csum.allocator_name);
        /* clear the pointer so a repeated close cannot double-free */
        mca_pml_csum.allocator_name = NULL;
    }

    return OMPI_SUCCESS;
}
/**
 * Component init/selection.
 *
 * The component is only selectable when the dr-like checksum was
 * compiled in (configure defined OMPI_CSUM_DST) *and* the user enabled
 * it at runtime (-mca pml_csum_enable_csum 1); in that case it claims
 * priority 100 so it wins over ob1.  Otherwise priority is 0 and NULL
 * is returned so another PML is chosen.
 *
 * Restructured without the former SELECT_ME goto/label: in the
 * non-OMPI_CSUM_DST build the label was unused (compiler warning) and
 * the code after it unreachable.
 *
 * @param priority  [out] selection priority (100 on success, else 0)
 * @return the PML module on success, NULL if not selectable.
 */
static mca_pml_base_module_t*
mca_pml_csum_component_init( int* priority,
                             bool enable_progress_threads,
                             bool enable_mpi_threads )
{
#if defined (OMPI_CSUM_DST)
    if (!mca_pml_csum.enable_csum) {
        *priority = 0;
        orte_show_help("help-pml-csum.txt", "pml:checksum-not-enabled", true);
        return NULL;
    }

    *priority = 100;
    if(OMPI_SUCCESS != mca_bml_base_init( enable_progress_threads,
                                          enable_mpi_threads)) {
        *priority = 0;
        return NULL;
    }

    /* Set this here (vs in component_open()) because
       ompi_mpi_leave_pinned* may have been set after MCA params were
       read (e.g., by the openib btl) */
    mca_pml_csum.leave_pinned = (1 == ompi_mpi_leave_pinned);
    mca_pml_csum.leave_pinned_pipeline = (int) ompi_mpi_leave_pinned_pipeline;

    return &mca_pml_csum.super;
#else
    /* Built without the alternate checksum: never selectable. */
    (void)enable_progress_threads;
    (void)enable_mpi_threads;
    *priority = 0;
    return NULL;
#endif
}
/**
 * Component finalize: tear down the BML, then (only if the component
 * was actually selected) destruct all pending queues, free lists and
 * the lock, and finalize the unexpected-message allocator.
 *
 * NOTE(review): declared `static` in the forward declaration above but
 * defined without it -- legal C (linkage follows the first declaration)
 * but worth making consistent during the planned cleanup.
 *
 * @return OMPI_SUCCESS, or an error from BML/allocator finalization.
 */
int mca_pml_csum_component_fini(void)
{
    int rc;

    /* Shutdown BML */
    if(OMPI_SUCCESS != (rc = mca_bml.bml_finalize()))
        return rc;

    if(!mca_pml_csum.enabled)
        return OMPI_SUCCESS; /* never selected.. return success.. */
    mca_pml_csum.enabled = false;  /* not anymore */

    /* Destruct order: pending lists first, then buffers/frag free
       lists, the lock last. */
    OBJ_DESTRUCT(&mca_pml_csum.rdma_pending);
    OBJ_DESTRUCT(&mca_pml_csum.pckt_pending);
    OBJ_DESTRUCT(&mca_pml_csum.recv_pending);
    OBJ_DESTRUCT(&mca_pml_csum.send_pending);
    OBJ_DESTRUCT(&mca_pml_csum.non_existing_communicator_pending);
    OBJ_DESTRUCT(&mca_pml_csum.buffers);
    OBJ_DESTRUCT(&mca_pml_csum.pending_pckts);
    OBJ_DESTRUCT(&mca_pml_csum.recv_frags);
    OBJ_DESTRUCT(&mca_pml_csum.rdma_frags);
    OBJ_DESTRUCT(&mca_pml_csum.lock);

    if(OMPI_SUCCESS != (rc = mca_pml_csum.allocator->alc_finalize(mca_pml_csum.allocator))) {
        return rc;
    }

#if 0
    /* Disabled leak diagnostics: report requests that were allocated
       but never returned to the base free lists. */
    if (mca_pml_base_send_requests.fl_num_allocated !=
        mca_pml_base_send_requests.super.opal_list_length) {
        opal_output(0, "csum send requests: %d allocated %d returned\n",
                    mca_pml_base_send_requests.fl_num_allocated,
                    mca_pml_base_send_requests.super.opal_list_length);
    }
    if (mca_pml_base_recv_requests.fl_num_allocated !=
        mca_pml_base_recv_requests.super.opal_list_length) {
        opal_output(0, "csum recv requests: %d allocated %d returned\n",
                    mca_pml_base_recv_requests.fl_num_allocated,
                    mca_pml_base_recv_requests.super.opal_list_length);
    }
#endif

    return OMPI_SUCCESS;
}
/**
 * Allocator upcall: hand out heap memory for unexpected-message
 * buffers.  Plain malloc -- no mpool registration is performed and
 * @a registration is left untouched.  Ownership passes to the
 * allocator; the buffer is released via mca_pml_csum_seg_free().
 */
void *mca_pml_csum_seg_alloc( struct mca_mpool_base_module_t* mpool,
                              size_t* size,
                              mca_mpool_base_registration_t** registration) {
    (void)mpool;         /* unused: no registration needed */
    (void)registration;  /* unused: caller's value preserved */
    return malloc(*size);
}
/**
 * Allocator upcall: release a buffer obtained from
 * mca_pml_csum_seg_alloc().  Counterpart of the plain-malloc policy,
 * so a plain free() suffices.
 */
void mca_pml_csum_seg_free( struct mca_mpool_base_module_t* mpool,
                            void* segment ) {
    (void)mpool;  /* unused: nothing was registered */
    free(segment);
}

32
ompi/mca/pml/csum/pml_csum_component.h Обычный файл
Просмотреть файл

@ -0,0 +1,32 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_PML_CSUM_COMPONENT_H
#define MCA_PML_CSUM_COMPONENT_H
#include "ompi_config.h"
BEGIN_C_DECLS
/*
* PML module functions.
*/
OMPI_MODULE_DECLSPEC extern mca_pml_base_component_2_0_0_t mca_pml_csum_component;
END_C_DECLS
#endif

26
ompi/mca/pml/csum/pml_csum_endpoint.c Обычный файл
Просмотреть файл

@ -0,0 +1,26 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <string.h>
#include "ompi/mca/pml/pml.h"
#include "pml_csum_endpoint.h"

30
ompi/mca/pml/csum/pml_csum_endpoint.h Обычный файл
Просмотреть файл

@ -0,0 +1,30 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_PML_CSUM_ENDPOINT_H
#define MCA_PML_CSUM_ENDPOINT_H
#include "ompi/mca/btl/btl.h"
BEGIN_C_DECLS
END_C_DECLS
#endif

455
ompi/mca/pml/csum/pml_csum_hdr.h Обычный файл
Просмотреть файл

@ -0,0 +1,455 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2009 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_PML_CSUM_HEADER_H
#define MCA_PML_CSUM_HEADER_H
#include "ompi_config.h"
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#include "opal/types.h"
#include "opal/util/arch.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/proc/proc.h"
/* BTL tag assigned to each csum PML header type (offsets into the
   PML-reserved tag space). */
#define MCA_PML_CSUM_HDR_TYPE_MATCH     (MCA_BTL_TAG_PML + 1)
#define MCA_PML_CSUM_HDR_TYPE_RNDV      (MCA_BTL_TAG_PML + 2)
#define MCA_PML_CSUM_HDR_TYPE_RGET      (MCA_BTL_TAG_PML + 3)
#define MCA_PML_CSUM_HDR_TYPE_ACK       (MCA_BTL_TAG_PML + 4)
#define MCA_PML_CSUM_HDR_TYPE_NACK      (MCA_BTL_TAG_PML + 5)
#define MCA_PML_CSUM_HDR_TYPE_FRAG      (MCA_BTL_TAG_PML + 6)
#define MCA_PML_CSUM_HDR_TYPE_GET       (MCA_BTL_TAG_PML + 7)
#define MCA_PML_CSUM_HDR_TYPE_PUT       (MCA_BTL_TAG_PML + 8)
#define MCA_PML_CSUM_HDR_TYPE_FIN       (MCA_BTL_TAG_PML + 9)

/* hdr_flags bit values */
#define MCA_PML_CSUM_HDR_FLAGS_ACK     1  /* is an ack required */
#define MCA_PML_CSUM_HDR_FLAGS_NBO     2  /* is the hdr in network byte order */
#define MCA_PML_CSUM_HDR_FLAGS_PIN     4  /* is user buffer pinned */
#define MCA_PML_CSUM_HDR_FLAGS_CONTIG  8  /* is user buffer contiguous */
#define MCA_PML_CSUM_HDR_FLAGS_NORDMA 16  /* rest will be send by copy-in-out */

/**
 * Common hdr attributes - must be first element in each hdr type.
 * Unlike ob1, the csum common header carries a 16-bit checksum over
 * the header itself in place of ob1's padding bytes.
 */
struct mca_pml_csum_common_hdr_t {
    uint8_t hdr_type;  /**< type of envelope */
    uint8_t hdr_flags; /**< flags indicating how fragment should be processed */
    uint16_t hdr_csum; /**< checksum over header */
};
typedef struct mca_pml_csum_common_hdr_t mca_pml_csum_common_hdr_t;

/* Only the 16-bit header checksum needs swapping; type/flags are
   single bytes. */
#define MCA_PML_CSUM_COMMON_HDR_NTOH(h) (h).hdr_csum = ntohs((h).hdr_csum);
#define MCA_PML_CSUM_COMMON_HDR_HTON(h) (h).hdr_csum = htons((h).hdr_csum);
/**
 * Header definition for the first fragment, contains the
 * attributes required to match the corresponding posted receive.
 *
 * Unlike ob1's match header, the heterogeneous padding bytes have been
 * replaced by the 32-bit data checksum, so the struct is naturally
 * packed to OMPI_PML_CSUM_MATCH_HDR_LEN (20) bytes.
 */
struct mca_pml_csum_match_hdr_t {
    mca_pml_csum_common_hdr_t hdr_common;   /**< common attributes */
    uint16_t hdr_ctx;                       /**< communicator index */
    uint16_t hdr_seq;                       /**< message sequence number */
    int32_t  hdr_src;                       /**< source rank */
    int32_t  hdr_tag;                       /**< user tag */
    uint32_t hdr_csum;                      /**< checksum over data */
};
#define OMPI_PML_CSUM_MATCH_HDR_LEN  20

typedef struct mca_pml_csum_match_hdr_t mca_pml_csum_match_hdr_t;

/* BUG FIX: this struct has no hdr_padding member (the data checksum
 * occupies that space), so there is nothing to zero before sending.
 * The previous definition wrote to the non-existent (h).hdr_padding[]
 * and failed to compile when both OMPI_ENABLE_HETEROGENEOUS_SUPPORT
 * and OMPI_ENABLE_DEBUG were set. */
#define MCA_PML_CSUM_MATCH_HDR_FILL(h)

#define MCA_PML_CSUM_MATCH_HDR_NTOH(h) \
    do { \
        MCA_PML_CSUM_COMMON_HDR_NTOH((h).hdr_common); \
        (h).hdr_ctx = ntohs((h).hdr_ctx); \
        (h).hdr_src = ntohl((h).hdr_src); \
        (h).hdr_tag = ntohl((h).hdr_tag); \
        (h).hdr_seq = ntohs((h).hdr_seq); \
        (h).hdr_csum = ntohl((h).hdr_csum); \
    } while (0)

#define MCA_PML_CSUM_MATCH_HDR_HTON(h) \
    do { \
        MCA_PML_CSUM_COMMON_HDR_HTON((h).hdr_common); \
        MCA_PML_CSUM_MATCH_HDR_FILL(h); \
        (h).hdr_ctx = htons((h).hdr_ctx); \
        (h).hdr_src = htonl((h).hdr_src); \
        (h).hdr_tag = htonl((h).hdr_tag); \
        (h).hdr_seq = htons((h).hdr_seq); \
        (h).hdr_csum = htonl((h).hdr_csum); \
    } while (0)
/**
 * Header definition for the first fragment when an acknowledgment
 * is required. This could be the first fragment of a large message
 * or a short message that requires an ack (synchronous).
 */
struct mca_pml_csum_rendezvous_hdr_t {
    mca_pml_csum_match_hdr_t hdr_match;  /**< match attributes of first fragment */
    uint64_t hdr_msg_length;             /**< message length */
    ompi_ptr_t hdr_src_req;              /**< pointer to source request - returned in ack */
};
typedef struct mca_pml_csum_rendezvous_hdr_t mca_pml_csum_rendezvous_hdr_t;

/* Delegates to the match-header fill; the rndv header itself adds no
   padding bytes of its own. */
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT && OMPI_ENABLE_DEBUG
#define MCA_PML_CSUM_RNDV_HDR_FILL(h) \
    MCA_PML_CSUM_MATCH_HDR_FILL((h).hdr_match)
#else
#define MCA_PML_CSUM_RNDV_HDR_FILL(h)
#endif  /* OMPI_ENABLE_HETEROGENEOUS_SUPPORT && OMPI_ENABLE_DEBUG */

/* Note that hdr_src_req is not put in network byte order because it
   is never processed by the receiver, other than being copied into
   the ack header */
#define MCA_PML_CSUM_RNDV_HDR_NTOH(h) \
    do { \
        MCA_PML_CSUM_MATCH_HDR_NTOH((h).hdr_match); \
        (h).hdr_msg_length = ntoh64((h).hdr_msg_length); \
    } while (0)

#define MCA_PML_CSUM_RNDV_HDR_HTON(h) \
    do { \
        MCA_PML_CSUM_MATCH_HDR_HTON((h).hdr_match); \
        MCA_PML_CSUM_RNDV_HDR_FILL(h); \
        (h).hdr_msg_length = hton64((h).hdr_msg_length); \
    } while (0)
/**
 * Header definition for a combined rdma rendezvous/get
 */
struct mca_pml_csum_rget_hdr_t {
    mca_pml_csum_rendezvous_hdr_t hdr_rndv;  /**< rendezvous attributes */
    uint32_t hdr_seg_cnt;                    /**< number of segments for rdma */
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
    uint8_t hdr_padding[4];                  /* align hdr_des on an 8-byte boundary */
#endif
    ompi_ptr_t hdr_des;                      /**< source descriptor */
    mca_btl_base_segment_t hdr_segs[1];      /**< list of segments for rdma */
};
typedef struct mca_pml_csum_rget_hdr_t mca_pml_csum_rget_hdr_t;

/* Zero the padding (debug builds only) so heterogeneous peers never
   see uninitialized bytes on the wire. */
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT && OMPI_ENABLE_DEBUG
#define MCA_PML_CSUM_RGET_HDR_FILL(h) \
    do { \
        MCA_PML_CSUM_RNDV_HDR_FILL((h).hdr_rndv); \
        (h).hdr_padding[0] = 0; \
        (h).hdr_padding[1] = 0; \
        (h).hdr_padding[2] = 0; \
        (h).hdr_padding[3] = 0; \
    } while(0)
#else
#define MCA_PML_CSUM_RGET_HDR_FILL(h)
#endif  /* OMPI_ENABLE_HETEROGENEOUS_SUPPORT && OMPI_ENABLE_DEBUG */

/* hdr_des and hdr_segs stay in sender byte order; only the receiver-
   interpreted fields are swapped. */
#define MCA_PML_CSUM_RGET_HDR_NTOH(h) \
    do { \
        MCA_PML_CSUM_RNDV_HDR_NTOH((h).hdr_rndv); \
        (h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
    } while (0)

#define MCA_PML_CSUM_RGET_HDR_HTON(h) \
    do { \
        MCA_PML_CSUM_RNDV_HDR_HTON((h).hdr_rndv); \
        MCA_PML_CSUM_RGET_HDR_FILL(h); \
        (h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
    } while (0)
/**
 * Header for subsequent fragments.
 *
 * Naturally aligned: common(4) + csum(4) + offset(8) + two pointers,
 * so no heterogeneous padding member is needed.
 */
struct mca_pml_csum_frag_hdr_t {
    mca_pml_csum_common_hdr_t hdr_common;  /**< common attributes */
    uint32_t hdr_csum;                     /**< checksum over data */
    uint64_t hdr_frag_offset;              /**< offset into message */
    ompi_ptr_t hdr_src_req;                /**< pointer to source request */
    ompi_ptr_t hdr_dst_req;                /**< pointer to matched receive */
};
typedef struct mca_pml_csum_frag_hdr_t mca_pml_csum_frag_hdr_t;

/* BUG FIX: the csum frag header has no hdr_padding member, so the
 * fill is a no-op.  The previous definition wrote to the non-existent
 * (h).hdr_padding[] and failed to compile when both
 * OMPI_ENABLE_HETEROGENEOUS_SUPPORT and OMPI_ENABLE_DEBUG were set. */
#define MCA_PML_CSUM_FRAG_HDR_FILL(h)

/* Request pointers are not swapped: each side only dereferences the
   pointer it created itself. */
#define MCA_PML_CSUM_FRAG_HDR_NTOH(h) \
    do { \
        MCA_PML_CSUM_COMMON_HDR_NTOH((h).hdr_common); \
        (h).hdr_csum = ntohl((h).hdr_csum); \
        (h).hdr_frag_offset = ntoh64((h).hdr_frag_offset); \
    } while (0)

#define MCA_PML_CSUM_FRAG_HDR_HTON(h) \
    do { \
        MCA_PML_CSUM_COMMON_HDR_HTON((h).hdr_common); \
        (h).hdr_csum = htonl((h).hdr_csum); \
        MCA_PML_CSUM_FRAG_HDR_FILL(h); \
        (h).hdr_frag_offset = hton64((h).hdr_frag_offset); \
    } while (0)
/**
 * Header used to acknowledgment outstanding fragment(s).
 */
struct mca_pml_csum_ack_hdr_t {
    mca_pml_csum_common_hdr_t hdr_common;  /**< common attributes */
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
    uint8_t hdr_padding[6];                /* align hdr_src_req on an 8-byte boundary */
#endif
    ompi_ptr_t hdr_src_req;                /**< source request */
    ompi_ptr_t hdr_dst_req;                /**< matched receive request */
    uint64_t hdr_send_offset;              /**< starting point of copy in/out */
};
typedef struct mca_pml_csum_ack_hdr_t mca_pml_csum_ack_hdr_t;

/* Zero the padding (debug builds only) so heterogeneous peers never
   see uninitialized bytes on the wire. */
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT && OMPI_ENABLE_DEBUG
#define MCA_PML_CSUM_ACK_HDR_FILL(h) \
    do { \
        (h).hdr_padding[0] = 0; \
        (h).hdr_padding[1] = 0; \
        (h).hdr_padding[2] = 0; \
        (h).hdr_padding[3] = 0; \
        (h).hdr_padding[4] = 0; \
        (h).hdr_padding[5] = 0; \
    } while (0)
#else
#define MCA_PML_CSUM_ACK_HDR_FILL(h)
#endif  /* OMPI_ENABLE_HETEROGENEOUS_SUPPORT && OMPI_ENABLE_DEBUG */

/* Note that the request headers are not put in NBO because the
   src_req is already in receiver's byte order and the dst_req is not
   used by the receiver for anything other than backpointers in return
   headers */
#define MCA_PML_CSUM_ACK_HDR_NTOH(h) \
    do { \
        MCA_PML_CSUM_COMMON_HDR_NTOH((h).hdr_common); \
        (h).hdr_send_offset = ntoh64((h).hdr_send_offset); \
    } while (0)

#define MCA_PML_CSUM_ACK_HDR_HTON(h) \
    do { \
        MCA_PML_CSUM_COMMON_HDR_HTON((h).hdr_common); \
        MCA_PML_CSUM_ACK_HDR_FILL(h); \
        (h).hdr_send_offset = hton64((h).hdr_send_offset); \
    } while (0)
/**
 * Header used to initiate an RDMA operation.
 *
 * Naturally aligned: common(4) + seg_cnt(4) precede the 8-byte
 * members, so no heterogeneous padding member is needed.
 */
struct mca_pml_csum_rdma_hdr_t {
    mca_pml_csum_common_hdr_t hdr_common;  /**< common attributes */
    uint32_t hdr_seg_cnt;                  /**< number of segments for rdma */
    ompi_ptr_t hdr_req;                    /**< destination request */
    ompi_ptr_t hdr_des;                    /**< source descriptor */
    uint64_t hdr_rdma_offset;              /**< current offset into user buffer */
    mca_btl_base_segment_t hdr_segs[1];    /**< list of segments for rdma */
};
typedef struct mca_pml_csum_rdma_hdr_t mca_pml_csum_rdma_hdr_t;

/* BUG FIX: the csum rdma header has no hdr_padding member, so the
 * fill is a no-op.  The previous definition wrote to the non-existent
 * (h).hdr_padding[] and failed to compile when both
 * OMPI_ENABLE_HETEROGENEOUS_SUPPORT and OMPI_ENABLE_DEBUG were set. */
#define MCA_PML_CSUM_RDMA_HDR_FILL(h)

/* hdr_req/hdr_des/hdr_segs stay in sender byte order. */
#define MCA_PML_CSUM_RDMA_HDR_NTOH(h) \
    do { \
        MCA_PML_CSUM_COMMON_HDR_NTOH((h).hdr_common); \
        (h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
        (h).hdr_rdma_offset = ntoh64((h).hdr_rdma_offset); \
    } while (0)

#define MCA_PML_CSUM_RDMA_HDR_HTON(h) \
    do { \
        MCA_PML_CSUM_COMMON_HDR_HTON((h).hdr_common); \
        MCA_PML_CSUM_RDMA_HDR_FILL(h); \
        (h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
        (h).hdr_rdma_offset = hton64((h).hdr_rdma_offset); \
    } while (0)
/**
 * Header used to complete an RDMA operation.
 */
struct mca_pml_csum_fin_hdr_t {
    mca_pml_csum_common_hdr_t hdr_common;  /**< common attributes */
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
    uint8_t hdr_padding[6];                /* align hdr_des on an 8-byte boundary */
#endif
    ompi_ptr_t hdr_des;                    /**< completed descriptor */
    uint32_t hdr_fail;                     /**< RDMA operation failed */
};
typedef struct mca_pml_csum_fin_hdr_t mca_pml_csum_fin_hdr_t;

/* Zero the padding (debug builds only) so heterogeneous peers never
   see uninitialized bytes on the wire. */
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT && OMPI_ENABLE_DEBUG
#define MCA_PML_CSUM_FIN_HDR_FILL(h) \
    do { \
        (h).hdr_padding[0] = 0; \
        (h).hdr_padding[1] = 0; \
        (h).hdr_padding[2] = 0; \
        (h).hdr_padding[3] = 0; \
        (h).hdr_padding[4] = 0; \
        (h).hdr_padding[5] = 0; \
    } while (0)
#else
#define MCA_PML_CSUM_FIN_HDR_FILL(h)
#endif  /* OMPI_ENABLE_HETEROGENEOUS_SUPPORT && OMPI_ENABLE_DEBUG */

/* hdr_des stays in sender byte order; only the common header is
   swapped.  hdr_fail is interpreted as a boolean flag. */
#define MCA_PML_CSUM_FIN_HDR_NTOH(h) \
    do { \
        MCA_PML_CSUM_COMMON_HDR_NTOH((h).hdr_common); \
    } while (0)

#define MCA_PML_CSUM_FIN_HDR_HTON(h) \
    do { \
        MCA_PML_CSUM_COMMON_HDR_HTON((h).hdr_common); \
        MCA_PML_CSUM_FIN_HDR_FILL(h); \
    } while (0)
/**
 * Union of defined hdr types.  hdr_common is the first member of every
 * variant, so hdr_common.hdr_type can always be read to discriminate.
 */
union mca_pml_csum_hdr_t {
    mca_pml_csum_common_hdr_t hdr_common;
    mca_pml_csum_match_hdr_t hdr_match;
    mca_pml_csum_rendezvous_hdr_t hdr_rndv;
    mca_pml_csum_rget_hdr_t hdr_rget;
    mca_pml_csum_frag_hdr_t hdr_frag;
    mca_pml_csum_ack_hdr_t hdr_ack;
    mca_pml_csum_rdma_hdr_t hdr_rdma;
    mca_pml_csum_fin_hdr_t hdr_fin;
};
typedef union mca_pml_csum_hdr_t mca_pml_csum_hdr_t;
#if !defined(WORDS_BIGENDIAN) && OMPI_ENABLE_HETEROGENEOUS_SUPPORT
/**
 * Convert a received header to host byte order on little-endian hosts.
 * A no-op unless the sender set MCA_PML_CSUM_HDR_FLAGS_NBO (i.e. was
 * big-endian).  On big-endian or homogeneous builds the whole function
 * compiles away to an empty macro below.
 */
static inline __opal_attribute_always_inline__ void
csum_hdr_ntoh(mca_pml_csum_hdr_t *hdr, const uint8_t hdr_type)
{
    if(!(hdr->hdr_common.hdr_flags & MCA_PML_CSUM_HDR_FLAGS_NBO))
        return;

    switch(hdr_type) {
    case MCA_PML_CSUM_HDR_TYPE_MATCH:
        MCA_PML_CSUM_MATCH_HDR_NTOH(hdr->hdr_match);
        break;
    case MCA_PML_CSUM_HDR_TYPE_RNDV:
        MCA_PML_CSUM_RNDV_HDR_NTOH(hdr->hdr_rndv);
        break;
    case MCA_PML_CSUM_HDR_TYPE_RGET:
        MCA_PML_CSUM_RGET_HDR_NTOH(hdr->hdr_rget);
        break;
    case MCA_PML_CSUM_HDR_TYPE_ACK:
        MCA_PML_CSUM_ACK_HDR_NTOH(hdr->hdr_ack);
        break;
    case MCA_PML_CSUM_HDR_TYPE_FRAG:
        MCA_PML_CSUM_FRAG_HDR_NTOH(hdr->hdr_frag);
        break;
    case MCA_PML_CSUM_HDR_TYPE_PUT:
        /* PUT (and GET) use the rdma header layout */
        MCA_PML_CSUM_RDMA_HDR_NTOH(hdr->hdr_rdma);
        break;
    case MCA_PML_CSUM_HDR_TYPE_FIN:
        MCA_PML_CSUM_FIN_HDR_NTOH(hdr->hdr_fin);
        break;
    default:
        /* unknown header type: corrupted message or protocol bug */
        assert(0);
        break;
    }
}
#else
#define csum_hdr_ntoh(h, t) do{}while(0)
#endif
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
#define csum_hdr_hton(h, t, p) \
    csum_hdr_hton_intr((mca_pml_csum_hdr_t*)h, t, p)
/**
 * Convert a header to network byte order before sending to @a proc.
 * On big-endian hosts the header is already NBO, so only the flag is
 * set.  On little-endian hosts the swap happens only when the peer is
 * big-endian; otherwise the header is sent as-is (flag clear).
 */
static inline __opal_attribute_always_inline__ void
csum_hdr_hton_intr(mca_pml_csum_hdr_t *hdr, const uint8_t hdr_type,
                   const ompi_proc_t *proc)
{
#ifdef WORDS_BIGENDIAN
    hdr->hdr_common.hdr_flags |= MCA_PML_CSUM_HDR_FLAGS_NBO;
#else
    /* little-endian sender: swap only for a big-endian peer */
    if(!(proc->proc_arch & OPAL_ARCH_ISBIGENDIAN))
        return;

    hdr->hdr_common.hdr_flags |= MCA_PML_CSUM_HDR_FLAGS_NBO;
    switch(hdr_type) {
    case MCA_PML_CSUM_HDR_TYPE_MATCH:
        MCA_PML_CSUM_MATCH_HDR_HTON(hdr->hdr_match);
        break;
    case MCA_PML_CSUM_HDR_TYPE_RNDV:
        MCA_PML_CSUM_RNDV_HDR_HTON(hdr->hdr_rndv);
        break;
    case MCA_PML_CSUM_HDR_TYPE_RGET:
        MCA_PML_CSUM_RGET_HDR_HTON(hdr->hdr_rget);
        break;
    case MCA_PML_CSUM_HDR_TYPE_ACK:
        MCA_PML_CSUM_ACK_HDR_HTON(hdr->hdr_ack);
        break;
    case MCA_PML_CSUM_HDR_TYPE_FRAG:
        MCA_PML_CSUM_FRAG_HDR_HTON(hdr->hdr_frag);
        break;
    case MCA_PML_CSUM_HDR_TYPE_PUT:
        /* PUT (and GET) use the rdma header layout */
        MCA_PML_CSUM_RDMA_HDR_HTON(hdr->hdr_rdma);
        break;
    case MCA_PML_CSUM_HDR_TYPE_FIN:
        MCA_PML_CSUM_FIN_HDR_HTON(hdr->hdr_fin);
        break;
    default:
        /* unknown header type: protocol bug */
        assert(0);
        break;
    }
#endif
}
#else
#define csum_hdr_hton(h, t, p) do{}while(0)
#endif
#endif

75
ompi/mca/pml/csum/pml_csum_iprobe.c Обычный файл
Просмотреть файл

@ -0,0 +1,75 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "ompi/request/request.h"
#include "pml_csum_recvreq.h"
/**
 * Non-blocking probe: post a zero-byte IPROBE "receive" on the stack,
 * check once whether it matched an already-arrived message, and return.
 *
 * @param src     source rank (may be MPI_ANY_SOURCE)
 * @param tag     user tag (may be MPI_ANY_TAG)
 * @param comm    communicator to probe
 * @param matched [out] 1 if a matching message is pending, else 0
 * @param status  [out] status of the matched message (may be NULL)
 * @return OMPI_SUCCESS always
 */
int mca_pml_csum_iprobe(int src,
                        int tag,
                        struct ompi_communicator_t *comm,
                        int *matched, ompi_status_public_t * status)
{
    int rc = OMPI_SUCCESS;
    mca_pml_csum_recv_request_t recvreq;

    /* stack-allocated request: constructed/finalized in place, never
       handed to the free list */
    OBJ_CONSTRUCT( &recvreq, mca_pml_csum_recv_request_t );
    recvreq.req_recv.req_base.req_ompi.req_type = OMPI_REQUEST_PML;
    recvreq.req_recv.req_base.req_type = MCA_PML_REQUEST_IPROBE;

    MCA_PML_CSUM_RECV_REQUEST_INIT(&recvreq, NULL, 0, &ompi_mpi_char.dt, src, tag, comm, true);
    MCA_PML_CSUM_RECV_REQUEST_START(&recvreq);

    if( recvreq.req_recv.req_base.req_ompi.req_complete == true ) {
        /* matched an already-queued fragment */
        if( NULL != status ) {
            *status = recvreq.req_recv.req_base.req_ompi.req_status;
        }
        *matched = 1;
    } else {
        *matched = 0;
        /* drive progress so a subsequent iprobe has a chance to match */
        opal_progress();
    }
    MCA_PML_BASE_RECV_REQUEST_FINI( &recvreq.req_recv );
    return rc;
}
/**
 * Blocking probe: post a zero-byte PROBE "receive" on the stack and
 * wait until a matching message arrives.
 *
 * @param src    source rank (may be MPI_ANY_SOURCE)
 * @param tag    user tag (may be MPI_ANY_TAG)
 * @param comm   communicator to probe
 * @param status [out] status of the matched message (may be NULL)
 * @return OMPI_SUCCESS always
 */
int mca_pml_csum_probe(int src,
                       int tag,
                       struct ompi_communicator_t *comm,
                       ompi_status_public_t * status)
{
    mca_pml_csum_recv_request_t recvreq;

    /* stack-allocated request: constructed/finalized in place */
    OBJ_CONSTRUCT( &recvreq, mca_pml_csum_recv_request_t );
    recvreq.req_recv.req_base.req_ompi.req_type = OMPI_REQUEST_PML;
    recvreq.req_recv.req_base.req_type = MCA_PML_REQUEST_PROBE;

    MCA_PML_CSUM_RECV_REQUEST_INIT(&recvreq, NULL, 0, &ompi_mpi_char.dt, src, tag, comm, true);
    MCA_PML_CSUM_RECV_REQUEST_START(&recvreq);

    /* block until the probe request completes */
    ompi_request_wait_completion(&recvreq.req_recv.req_base.req_ompi);

    if (NULL != status) {
        *status = recvreq.req_recv.req_base.req_ompi.req_status;
    }
    MCA_PML_BASE_RECV_REQUEST_FINI( &recvreq.req_recv );
    return OMPI_SUCCESS;
}

112
ompi/mca/pml/csum/pml_csum_irecv.c Обычный файл
Просмотреть файл

@ -0,0 +1,112 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "ompi/request/request.h"
#include "pml_csum_recvreq.h"
#include "ompi/peruse/peruse-internal.h"
/**
 * MPI_Recv_init: allocate and initialize a persistent receive request
 * without starting it (started later via MPI_Start).
 *
 * @param request [out] the inactive persistent request
 * @return OMPI_SUCCESS, or the allocation error code
 */
int mca_pml_csum_irecv_init(void *addr,
                            size_t count,
                            ompi_datatype_t * datatype,
                            int src,
                            int tag,
                            struct ompi_communicator_t *comm,
                            struct ompi_request_t **request)
{
    int rc;
    mca_pml_csum_recv_request_t *recvreq;
    MCA_PML_CSUM_RECV_REQUEST_ALLOC(recvreq, rc);
    if (NULL == recvreq)
        return rc;

    /* persistent == true: request survives completion for restart */
    MCA_PML_CSUM_RECV_REQUEST_INIT(recvreq,
                                   addr,
                                   count, datatype, src, tag, comm, true);

    PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
                             &((recvreq)->req_recv.req_base),
                             PERUSE_RECV);

    *request = (ompi_request_t *) recvreq;
    return OMPI_SUCCESS;
}
/**
 * MPI_Irecv: allocate, initialize and immediately start a non-blocking
 * receive request.
 *
 * @param request [out] the active request; caller completes/frees it
 * @return OMPI_SUCCESS, or the allocation error code
 */
int mca_pml_csum_irecv(void *addr,
                       size_t count,
                       ompi_datatype_t * datatype,
                       int src,
                       int tag,
                       struct ompi_communicator_t *comm,
                       struct ompi_request_t **request)
{
    int rc;
    mca_pml_csum_recv_request_t *recvreq;
    MCA_PML_CSUM_RECV_REQUEST_ALLOC(recvreq, rc);
    if (NULL == recvreq)
        return rc;

    /* persistent == false: one-shot request */
    MCA_PML_CSUM_RECV_REQUEST_INIT(recvreq,
                                   addr,
                                   count, datatype, src, tag, comm, false);

    PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
                             &((recvreq)->req_recv.req_base),
                             PERUSE_RECV);

    MCA_PML_CSUM_RECV_REQUEST_START(recvreq);
    *request = (ompi_request_t *) recvreq;
    return OMPI_SUCCESS;
}
/**
 * MPI_Recv: start a receive and block until it completes, then return
 * the status and the request's MPI error code.
 *
 * @param status [out] completion status (may be NULL)
 * @return the request's MPI_ERROR field, or the allocation error code
 */
int mca_pml_csum_recv(void *addr,
                      size_t count,
                      ompi_datatype_t * datatype,
                      int src,
                      int tag,
                      struct ompi_communicator_t *comm,
                      ompi_status_public_t * status)
{
    int rc;
    mca_pml_csum_recv_request_t *recvreq;
    MCA_PML_CSUM_RECV_REQUEST_ALLOC(recvreq, rc);
    if (NULL == recvreq)
        return rc;

    MCA_PML_CSUM_RECV_REQUEST_INIT(recvreq,
                                   addr,
                                   count, datatype, src, tag, comm, false);

    PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
                             &((recvreq)->req_recv.req_base),
                             PERUSE_RECV);

    MCA_PML_CSUM_RECV_REQUEST_START(recvreq);

    /* block until completion, then harvest status before freeing */
    ompi_request_wait_completion(&recvreq->req_recv.req_base.req_ompi);

    if (NULL != status) {  /* return status */
        *status = recvreq->req_recv.req_base.req_ompi.req_status;
    }
    rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR;
    ompi_request_free( (ompi_request_t**)&recvreq );
    return rc;
}

130
ompi/mca/pml/csum/pml_csum_isend.c Обычный файл
Просмотреть файл

@ -0,0 +1,130 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "pml_csum.h"
#include "pml_csum_sendreq.h"
#include "pml_csum_recvreq.h"
#include "ompi/peruse/peruse-internal.h"
/**
 * MPI_Send_init: allocate and initialize a persistent send request
 * without starting it (started later via MPI_Start).
 *
 * @param sendmode  buffered/synchronous/ready/standard send semantics
 * @param request   [out] the inactive persistent request
 * @return OMPI_SUCCESS, or the allocation error code
 */
int mca_pml_csum_isend_init(void *buf,
                            size_t count,
                            ompi_datatype_t * datatype,
                            int dst,
                            int tag,
                            mca_pml_base_send_mode_t sendmode,
                            ompi_communicator_t * comm,
                            ompi_request_t ** request)
{
    int rc;

    mca_pml_csum_send_request_t *sendreq = NULL;
    MCA_PML_CSUM_SEND_REQUEST_ALLOC(comm, dst, sendreq, rc);
    if (rc != OMPI_SUCCESS)
        return rc;

    /* persistent == true: request survives completion for restart */
    MCA_PML_CSUM_SEND_REQUEST_INIT(sendreq,
                                   buf,
                                   count,
                                   datatype,
                                   dst, tag,
                                   comm, sendmode, true);

    PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
                             &(sendreq)->req_send.req_base,
                             PERUSE_SEND);

    *request = (ompi_request_t *) sendreq;
    return OMPI_SUCCESS;
}
/**
 * MPI_Isend: allocate, initialize and immediately start a non-blocking
 * send request.
 *
 * @param request [out] the active request (set even if start failed,
 *                so the caller can inspect/free it)
 * @return OMPI_SUCCESS, or an allocation/start error code
 */
int mca_pml_csum_isend(void *buf,
                       size_t count,
                       ompi_datatype_t * datatype,
                       int dst,
                       int tag,
                       mca_pml_base_send_mode_t sendmode,
                       ompi_communicator_t * comm,
                       ompi_request_t ** request)
{
    int rc;
    mca_pml_csum_send_request_t *sendreq = NULL;
    MCA_PML_CSUM_SEND_REQUEST_ALLOC(comm, dst, sendreq, rc);
    if (rc != OMPI_SUCCESS)
        return rc;

    /* persistent == false: one-shot request */
    MCA_PML_CSUM_SEND_REQUEST_INIT(sendreq,
                                   buf,
                                   count,
                                   datatype,
                                   dst, tag,
                                   comm, sendmode, false);

    PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
                             &(sendreq)->req_send.req_base,
                             PERUSE_SEND);

    MCA_PML_CSUM_SEND_REQUEST_START(sendreq, rc);
    *request = (ompi_request_t *) sendreq;
    return rc;
}
/**
 * MPI_Send: start a send and block until it completes, then return the
 * request's MPI error code.  On a failed start the request is returned
 * to the free list before the error is propagated.
 *
 * @param sendmode  buffered/synchronous/ready/standard send semantics
 * @return the request's MPI_ERROR field, or an allocation/start error
 */
int mca_pml_csum_send(void *buf,
                      size_t count,
                      ompi_datatype_t * datatype,
                      int dst,
                      int tag,
                      mca_pml_base_send_mode_t sendmode,
                      ompi_communicator_t * comm)
{
    int rc;
    mca_pml_csum_send_request_t *sendreq;
    MCA_PML_CSUM_SEND_REQUEST_ALLOC(comm, dst, sendreq, rc);
    if (rc != OMPI_SUCCESS)
        return rc;

    MCA_PML_CSUM_SEND_REQUEST_INIT(sendreq,
                                   buf,
                                   count,
                                   datatype,
                                   dst, tag,
                                   comm, sendmode, false);

    PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
                             &(sendreq)->req_send.req_base,
                             PERUSE_SEND);

    MCA_PML_CSUM_SEND_REQUEST_START(sendreq, rc);
    if (rc != OMPI_SUCCESS) {
        /* start failed: reclaim the request before returning the error */
        MCA_PML_CSUM_SEND_REQUEST_RETURN( sendreq );
        return rc;
    }

    /* block until completion, then harvest the error code */
    ompi_request_wait_completion(&sendreq->req_send.req_base.req_ompi);

    rc = sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR;
    ompi_request_free( (ompi_request_t**)&sendreq );
    return rc;
}

77
ompi/mca/pml/csum/pml_csum_progress.c Обычный файл
Просмотреть файл

@ -0,0 +1,77 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "pml_csum.h"
#include "pml_csum_sendreq.h"
#include "ompi/mca/bml/base/base.h"
/**
 * PML progress function: drain the send-pending queue, retrying at
 * most the number of requests that were queued on entry (requests
 * re-queued during this pass are not retried again immediately).
 *
 * Fixes over the initial import: the switch now has an explicit
 * `break` on its last case and a `default` arm (was relying on
 * falling off the end), and the `send_succedded` local typo is gone.
 *
 * @return number of requests successfully progressed this pass.
 */
int mca_pml_csum_progress(void)
{
    int i, queue_length = opal_list_get_size(&mca_pml_csum.send_pending);
    int j, completed_requests = 0;
    bool send_succeeded;

    /* fast path: nothing pending */
    if( OPAL_LIKELY(0 == queue_length) )
        return 0;

    for( i = 0; i < queue_length; i++ ) {
        mca_pml_csum_send_pending_t pending_type = MCA_PML_CSUM_SEND_PENDING_NONE;
        mca_pml_csum_send_request_t* sendreq;
        mca_bml_base_endpoint_t* endpoint;

        sendreq = get_request_from_send_pending(&pending_type);
        if(OPAL_UNLIKELY(NULL == sendreq))
            break;

        switch(pending_type) {
        case MCA_PML_CSUM_SEND_PENDING_SCHEDULE:
            /* continue scheduling fragments of an in-flight request;
               stop the whole pass if the BTL is out of resources */
            if( mca_pml_csum_send_request_schedule_exclusive(sendreq) ==
                OMPI_ERR_OUT_OF_RESOURCE ) {
                return 0;
            }
            completed_requests++;
            break;
        case MCA_PML_CSUM_SEND_PENDING_START:
            /* retry starting the request on each eager BTL in turn */
            endpoint = sendreq->req_endpoint;
            send_succeeded = false;
            for(j = 0; j < (int)mca_bml_base_btl_array_get_size(&endpoint->btl_eager); j++) {
                mca_bml_base_btl_t* bml_btl;
                int rc;

                /* select a btl */
                bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
                rc = mca_pml_csum_send_request_start_btl(sendreq, bml_btl);
                if( OPAL_LIKELY(OMPI_SUCCESS == rc) ) {
                    send_succeeded = true;
                    completed_requests++;
                    break;
                }
            }
            if( false == send_succeeded ) {
                /* no BTL accepted it: put it back for a later pass */
                add_request_to_send_pending(sendreq, MCA_PML_CSUM_SEND_PENDING_START, true);
            }
            break;
        case MCA_PML_CSUM_SEND_PENDING_NONE:
        default:
            /* a dequeued request must carry a valid pending type */
            assert(0);
            return 0;
        }
    }
    return completed_requests;
}

124
ompi/mca/pml/csum/pml_csum_rdma.c Обычный файл
Просмотреть файл

@ -0,0 +1,124 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/bml/bml.h"
#include "orte/types.h"
#include "ompi/mca/mpool/mpool.h"
#include "pml_csum.h"
#include "pml_csum_rdma.h"
/* Use this registration if no registration needed for a BTL instead of NULL.
 * This will help other code to distinguish case when memory is not registered
 * from case when registration is not needed.
 * Only its address is ever used (as a sentinel); its contents are never read. */
static mca_mpool_base_registration_t pml_csum_dummy_reg;
/*
* Check to see if memory is registered or can be registered. Build a
* set of registrations on the request.
*/
/**
 * Select RDMA-capable BTLs for the memory region [base, base+size) and
 * record each selection (BTL + registration) in rdma_btls.
 *
 * @param bml_endpoint  peer endpoint whose btl_rdma array is consulted
 * @param base          start of the user buffer
 * @param size          length of the user buffer in bytes
 * @param rdma_btls     output array (at least max_rdma_per_request entries)
 * @return number of BTLs selected; 0 means the caller should fall back to
 *         the pipeline protocol.
 */
size_t mca_pml_csum_rdma_btls(
    mca_bml_base_endpoint_t* bml_endpoint,
    unsigned char* base,
    size_t size,
    mca_pml_csum_com_btl_t* rdma_btls)
{
    int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
    double weight_total = 0;
    int num_btls_used = 0, n;

    /* shortcut when there are no rdma capable btls */
    if(num_btls == 0) {
        return 0;
    }

    /* check to see if memory is registered */
    for(n = 0; n < num_btls && num_btls_used < mca_pml_csum.max_rdma_per_request;
        n++) {
        /* start from the endpoint's rotating index so the load is spread
         * across the available RDMA BTLs over successive requests */
        mca_bml_base_btl_t* bml_btl =
            mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma,
                    (bml_endpoint->btl_rdma_index + n) % num_btls);
        mca_mpool_base_registration_t* reg = NULL;
        mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;

        if(NULL != btl_mpool) {
            if(!mca_pml_csum.leave_pinned) {
                /* look through existing registrations */
                btl_mpool->mpool_find(btl_mpool, base, size, &reg);
            } else {
                /* register the memory */
                btl_mpool->mpool_register(btl_mpool, base, size, 0, &reg);
            }

            if(NULL == reg)
                bml_btl = NULL; /* skip it */
        } else {
            /* if registration is not required use dummy registration */
            reg = &pml_csum_dummy_reg;
        }

        if(bml_btl != NULL) {
            rdma_btls[num_btls_used].bml_btl = bml_btl;
            rdma_btls[num_btls_used].btl_reg = reg;
            weight_total += bml_btl->btl_weight;
            num_btls_used++;
        }
    }

    /* if we don't use leave_pinned and all BTLs that already have this memory
     * registered amount to less then half of available bandwidth - fall back to
     * pipeline protocol */
    if(0 == num_btls_used || (!mca_pml_csum.leave_pinned && weight_total < 0.5))
        return 0;

    /* apportion `size` across the chosen BTLs proportionally to weight */
    mca_pml_csum_calc_weighted_length(rdma_btls, num_btls_used, size,
                                      weight_total);

    /* advance the rotation point for the next request */
    bml_endpoint->btl_rdma_index = (bml_endpoint->btl_rdma_index + 1) % num_btls;
    return num_btls_used;
}
/**
 * Choose the RDMA BTLs to drive the pipeline protocol for a request of
 * `size` bytes and split the length across them by relative bandwidth.
 * Returns the number of entries written into rdma_btls.
 */
size_t mca_pml_csum_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint,
                                        size_t size,
                                        mca_pml_csum_com_btl_t* rdma_btls )
{
    int used, rdma_count = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
    double bandwidth_sum = 0;

    for(used = 0; used < rdma_count && used < mca_pml_csum.max_rdma_per_request; used++) {
        mca_bml_base_btl_t* candidate =
            mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);

        rdma_btls[used].bml_btl = candidate;
        /* a BTL with its own mpool registers lazily (NULL reg); otherwise
         * mark it with the dummy registration sentinel */
        rdma_btls[used].btl_reg = (NULL != candidate->btl->btl_mpool)
                                      ? NULL : &pml_csum_dummy_reg;
        bandwidth_sum += candidate->btl_weight;
    }

    mca_pml_csum_calc_weighted_length(rdma_btls, used, size, bandwidth_sum);

    return used;
}

41
ompi/mca/pml/csum/pml_csum_rdma.h Обычный файл
Просмотреть файл

@ -0,0 +1,41 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_PML_CSUM_RDMA_H
#define MCA_PML_CSUM_RDMA_H

/* forward declaration - the full definition lives in the BML headers */
struct mca_bml_base_endpoint_t;

/*
 * Of the set of available btls that support RDMA,
 * find those that already have registrations - or
 * register if required (for leave_pinned option)
 */
size_t mca_pml_csum_rdma_btls(struct mca_bml_base_endpoint_t* endpoint,
unsigned char* base, size_t size, struct mca_pml_csum_com_btl_t* btls);

/* Choose RDMA BTLs to use for sending of a request by pipeline protocol.
 * Calculate number of bytes to send through each BTL according to available
 * bandwidth */
/* NOTE(review): unlike the declaration above, this one uses the bare
 * mca_pml_csum_com_btl_t typedef rather than the struct tag - it relies on
 * the includer seeing pml_csum.h first; confirm all callers do */
size_t mca_pml_csum_rdma_pipeline_btls(struct mca_bml_base_endpoint_t* endpoint,
size_t size, mca_pml_csum_com_btl_t* rdma_btls);

#endif

29
ompi/mca/pml/csum/pml_csum_rdmafrag.c Обычный файл
Просмотреть файл

@ -0,0 +1,29 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "pml_csum.h"
#include "pml_csum_rdmafrag.h"
/* Class instance for RDMA fragments; no constructor/destructor (NULL, NULL) -
 * fragment fields are filled in by the code that allocates them. */
OBJ_CLASS_INSTANCE(
    mca_pml_csum_rdma_frag_t,
    ompi_free_list_item_t,
    NULL,
    NULL);

71
ompi/mca/pml/csum/pml_csum_rdmafrag.h Обычный файл
Просмотреть файл

@ -0,0 +1,71 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_PML_CSUM_RDMAFRAG_H
#define MCA_PML_CSUM_RDMAFRAG_H

#include "ompi/mca/btl/btl.h"
#include "pml_csum_hdr.h"

BEGIN_C_DECLS

/* Direction of the RDMA operation this fragment describes. */
typedef enum {
    MCA_PML_CSUM_RDMA_PUT,
    MCA_PML_CSUM_RDMA_GET
} mca_pml_csum_rdma_state_t;

/**
 * Descriptor for an outstanding RDMA operation, allocated from the
 * mca_pml_csum.rdma_frags free list.
 */
struct mca_pml_csum_rdma_frag_t {
    ompi_free_list_item_t super;          /* free-list linkage - must be first */
    mca_bml_base_btl_t* rdma_bml;         /* BTL performing the transfer */
    mca_pml_csum_hdr_t rdma_hdr;          /* PML header for this operation */
    mca_pml_csum_rdma_state_t rdma_state; /* PUT or GET */
    size_t rdma_length;                   /* bytes to transfer */
    mca_btl_base_segment_t rdma_segs[MCA_BTL_DES_MAX_SEGMENTS]; /* remote segments */
    void *rdma_req;                       /* owning send/recv request */
    struct mca_bml_base_endpoint_t* rdma_ep; /* peer endpoint */
    ompi_convertor_t convertor;           /* datatype convertor for this frag */
    mca_mpool_base_registration_t* reg;   /* memory registration, if any */
    uint32_t retries;                     /* retry count on failure */
};
typedef struct mca_pml_csum_rdma_frag_t mca_pml_csum_rdma_frag_t;

OBJ_CLASS_DECLARATION(mca_pml_csum_rdma_frag_t);

/* Allocate a fragment from the free list (may block in WAIT). */
#define MCA_PML_CSUM_RDMA_FRAG_ALLOC(frag,rc) \
do { \
ompi_free_list_item_t* item; \
OMPI_FREE_LIST_WAIT(&mca_pml_csum.rdma_frags, item, rc); \
frag = (mca_pml_csum_rdma_frag_t*)item; \
} while(0)

/* Return a fragment to the free list. */
#define MCA_PML_CSUM_RDMA_FRAG_RETURN(frag) \
do { \
/* return fragment */ \
OMPI_FREE_LIST_RETURN(&mca_pml_csum.rdma_frags, \
(ompi_free_list_item_t*)frag); \
} while(0)

END_C_DECLS

#endif

799
ompi/mca/pml/csum/pml_csum_recvfrag.c Обычный файл
Просмотреть файл

@ -0,0 +1,799 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2006-2008 University of Houston. All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2009 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#include "ompi_config.h"
#include "ompi/constants.h"
#include "opal/class/opal_list.h"
#include "opal/util/crc.h"
#include "opal/threads/mutex.h"
#include "opal/prefetch.h"
#include "opal/util/output.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/pml/base/base.h"
#include "ompi/peruse/peruse-internal.h"
#include "ompi/memchecker.h"
#include "orte/mca/errmgr/errmgr.h"
#include "pml_csum.h"
#include "pml_csum_comm.h"
#include "pml_csum_recvfrag.h"
#include "pml_csum_recvreq.h"
#include "pml_csum_sendreq.h"
#include "pml_csum_hdr.h"
/* Class instances for buffered data and received fragments; no
 * constructor/destructor (NULL, NULL) - fragments are initialized
 * explicitly via MCA_PML_CSUM_RECV_FRAG_INIT at allocation time.
 * NOTE(review): the recv_frag parent is registered here as opal_list_item_t
 * while the struct's super field is an ompi_free_list_item_t - presumably
 * the latter derives from the former; confirm. */
OBJ_CLASS_INSTANCE( mca_pml_csum_buffer_t,
                    ompi_free_list_item_t,
                    NULL,
                    NULL );

OBJ_CLASS_INSTANCE( mca_pml_csum_recv_frag_t,
                    opal_list_item_t,
                    NULL,
                    NULL );
/**
* Static functions.
*/
/**
* Append a unexpected descriptor to a queue. This function will allocate and
* initialize the fragment (if necessary) and the will added to the specified
* queue. The frag will be updated to the allocated fragment if necessary.
*/
/**
 * Append an unexpected fragment descriptor to a queue.  When frag is NULL,
 * a fragment is allocated from the free list and initialized from the
 * header/segments before being appended.
 */
static void
append_frag_to_list(opal_list_t *queue, mca_btl_base_module_t *btl,
                    mca_pml_csum_match_hdr_t *hdr, mca_btl_base_segment_t* segments,
                    size_t num_segments, mca_pml_csum_recv_frag_t* frag)
{
    int rc;

    if(NULL == frag) {
        /* NOTE(review): rc from the ALLOC macro is never checked - TODO
         * confirm the free-list WAIT path cannot return failure here */
        MCA_PML_CSUM_RECV_FRAG_ALLOC(frag, rc);
        MCA_PML_CSUM_RECV_FRAG_INIT(frag, hdr, segments, num_segments, btl);
    }
    opal_list_append(queue, (opal_list_item_t*)frag);
}
/**
* Match incoming recv_frags against posted receives.
* Supports out of order delivery.
*
* @param frag_header (IN) Header of received recv_frag.
* @param frag_desc (IN) Received recv_frag descriptor.
 * @param match_made (OUT) Flag indicating whether a match was made.
* @param additional_matches (OUT) List of additional matches
* @return OMPI_SUCCESS or error status on failure.
*/
/* Generic (slow-path) matching for MATCH/RNDV/RGET fragments; supports
 * out-of-order delivery.  Defined below. */
static int mca_pml_csum_recv_frag_match( mca_btl_base_module_t *btl,
                                         mca_pml_csum_match_hdr_t *hdr,
                                         mca_btl_base_segment_t* segments,
                                         size_t num_segments,
                                         int type);

/* Match one fragment against the posted receive queues; matched probe
 * requests are completed in place and matching continues.  Defined below. */
static mca_pml_csum_recv_request_t *match_one(mca_btl_base_module_t *btl,
                                              mca_pml_csum_match_hdr_t *hdr, mca_btl_base_segment_t* segments,
                                              size_t num_segments, ompi_communicator_t *comm_ptr,
                                              mca_pml_csum_comm_proc_t *proc,
                                              mca_pml_csum_recv_frag_t* frag);
/**
 * BTL callback for fragments carrying a MATCH header (eager messages fully
 * described by a single match header).
 *
 * When checksumming is active, the header checksum (16-bit CRC over the
 * match header with the csum field zeroed) and the data checksum
 * (accumulated by the convertor during unpack) are both verified; a
 * mismatch aborts the job.  The common case - in-sequence arrival with no
 * out-of-order fragments pending - is matched inline; everything else
 * drops to the generic matching path.
 */
void mca_pml_csum_recv_frag_callback_match(mca_btl_base_module_t* btl,
                                           mca_btl_base_tag_t tag,
                                           mca_btl_base_descriptor_t* des,
                                           void* cbdata ) {
    mca_btl_base_segment_t* segments = des->des_dst;
    mca_pml_csum_match_hdr_t* hdr = (mca_pml_csum_match_hdr_t*)segments->seg_addr.pval;
    ompi_communicator_t *comm_ptr;
    mca_pml_csum_recv_request_t *match = NULL;
    mca_pml_csum_comm_t *comm;
    mca_pml_csum_comm_proc_t *proc;
    mca_pml_csum_recv_frag_t* frag = NULL;
    size_t num_segments = des->des_dst_cnt;
    size_t bytes_received = 0;
    uint16_t csum_received, csum;
    uint32_t csum_data;
    /* Fix: honor the runtime enable flag in addition to the BTL flag,
     * consistent with the FRAG/PUT/FIN callbacks, so checksumming is only
     * performed when pml_csum_enable_checksum is set. */
    bool do_csum = mca_pml_csum.enable_csum &&
        (btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);

    if( OPAL_UNLIKELY(segments->seg_len < OMPI_PML_CSUM_MATCH_HDR_LEN) ) {
        return;
    }

    csum_hdr_ntoh(((mca_pml_csum_hdr_t*) hdr), MCA_PML_CSUM_HDR_TYPE_MATCH);

    if (do_csum) {
        /* verify the header checksum: recompute with the csum field
         * zeroed, then restore the received (wire) value */
        csum_received = hdr->hdr_common.hdr_csum;
        hdr->hdr_common.hdr_csum = 0;
        csum = opal_csum16(hdr, sizeof(mca_pml_csum_match_hdr_t));
        hdr->hdr_common.hdr_csum = csum_received;
        if (csum_received != csum) {
            opal_output(0, "%s:%s:%d: Invalid \'match header\' - received csum:0x%04x != computed csum:0x%04x\n",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, csum_received, csum);
            orte_errmgr.abort(-1,NULL);
        }
    }

    /* communicator pointer */
    comm_ptr = ompi_comm_lookup(hdr->hdr_ctx);
    if(OPAL_UNLIKELY(NULL == comm_ptr)) {
        /* This is a special case. A message for a not yet existing
         * communicator can happen. Instead of doing a matching we
         * will temporarily add it to a pending queue in the PML.
         * Later on, when the communicator is completely instantiated,
         * this pending queue will be searched and all matching fragments
         * moved to the right communicator.
         */
        append_frag_to_list( &mca_pml_csum.non_existing_communicator_pending,
                             btl, hdr, segments, num_segments, frag );
        return;
    }
    comm = (mca_pml_csum_comm_t *)comm_ptr->c_pml_comm;

    /* source sequence number */
    proc = &comm->procs[hdr->hdr_src];

    /* We generate the MSG_ARRIVED event as soon as the PML is aware
     * of a matching fragment arrival, whether or not it is received
     * in the correct order. This allows the tools to figure out if
     * messages are not received in the correct order (if multiple
     * network interfaces).
     */
    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm_ptr,
                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);

    /* get next expected message sequence number - if threaded
     * run, lock to make sure that if another thread is processing
     * a frag from the same message a match is made only once.
     * Also, this prevents other posted receives (for a pair of
     * end points) from being processed, and potentially "losing"
     * the fragment.
     */
    OPAL_THREAD_LOCK(&comm->matching_lock);

    /* get sequence number of next message that can be processed;
     * out-of-sequence arrival or pending out-of-order fragments force
     * the generic matching path */
    if(OPAL_UNLIKELY((((uint16_t) hdr->hdr_seq) != ((uint16_t) proc->expected_sequence)) ||
                     (opal_list_get_size(&proc->frags_cant_match) > 0 ))) {
        goto slow_path;
    }

    /* This is the sequence number we were expecting, so we can try
     * matching it to already posted receives.
     */

    /* We're now expecting the next sequence number. */
    proc->expected_sequence++;

    /* We generate the SEARCH_POSTED_QUEUE only when the message is
     * received in the correct sequence. Otherwise, we delay the event
     * generation until we reach the correct sequence number.
     */
    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_BEGIN, comm_ptr,
                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);

    match = match_one(btl, hdr, segments, num_segments, comm_ptr, proc, frag);

    /* The match is over. We generate the SEARCH_POSTED_Q_END here,
     * before going into the mca_pml_csum_check_cantmatch_for_match so
     * we can make a difference for the searching time for all
     * messages.
     */
    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_END, comm_ptr,
                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);

    /* release matching lock before processing fragment */
    OPAL_THREAD_UNLOCK(&comm->matching_lock);

    if(OPAL_LIKELY(match)) {
        bytes_received = segments->seg_len - OMPI_PML_CSUM_MATCH_HDR_LEN;
        match->req_recv.req_bytes_packed = bytes_received;

        MCA_PML_CSUM_RECV_REQUEST_MATCHED(match, hdr);
        if(match->req_bytes_delivered > 0) {
            struct iovec iov[2];
            uint32_t iov_count = 1;

            /*
             * Make user buffer accessible (defined) before unpacking.
             */
            MEMCHECKER(
                memchecker_call(&opal_memchecker_base_mem_defined,
                                match->req_recv.req_base.req_addr,
                                match->req_recv.req_base.req_count,
                                match->req_recv.req_base.req_datatype);
            );

            /* first segment's payload starts just past the match header */
            iov[0].iov_len = bytes_received;
            iov[0].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments->seg_addr.pval +
                                              OMPI_PML_CSUM_MATCH_HDR_LEN);
            while (iov_count < num_segments) {
                bytes_received += segments[iov_count].seg_len;
                iov[iov_count].iov_len = segments[iov_count].seg_len;
                iov[iov_count].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments[iov_count].seg_addr.pval);
                iov_count++;
            }

            ompi_convertor_unpack( &match->req_recv.req_base.req_convertor,
                                   iov,
                                   &iov_count,
                                   &bytes_received );
            match->req_bytes_received = bytes_received;

            /*
             * Unpacking finished, make the user buffer inaccessible again.
             */
            MEMCHECKER(
                memchecker_call(&opal_memchecker_base_mem_noaccess,
                                match->req_recv.req_base.req_addr,
                                match->req_recv.req_base.req_count,
                                match->req_recv.req_base.req_datatype);
            );
        }

        if (do_csum) {
            /* the convertor accumulated the data checksum during unpack;
             * a zero-byte message has nothing to checksum */
            csum_data = (bytes_received > 0) ? match->req_recv.req_base.req_convertor.checksum : 0;
            OPAL_OUTPUT_VERBOSE((1, mca_pml_base_output,
                                 "%s Received \'match\' with data csum:0x%x, header csum:0x%04x, size:%lu\n",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hdr->hdr_csum, csum, (unsigned long)bytes_received));
            if (csum_data != hdr->hdr_csum) {
                opal_output(0, "%s:%s:%d: Invalid \'match data\' - received csum:0x%x != computed csum:0x%x\n",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, hdr->hdr_csum, csum_data);
                orte_errmgr.abort(-1,NULL);
            }
        }

        /* no need to check if complete we know we are.. */
        /* don't need a rmb as that is for checking */
        recv_request_pml_complete(match);
    }
    return;

 slow_path:
    OPAL_THREAD_UNLOCK(&comm->matching_lock);
    mca_pml_csum_recv_frag_match(btl, hdr, segments,
                                 num_segments, MCA_PML_CSUM_HDR_TYPE_MATCH);
}
/**
 * BTL callback for fragments carrying a RNDV (rendezvous) header.
 * Verifies the header checksum when checksumming is active, then hands
 * the fragment to the generic matching logic.
 */
void mca_pml_csum_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
                                          mca_btl_base_tag_t tag,
                                          mca_btl_base_descriptor_t* des,
                                          void* cbdata ) {
    mca_btl_base_segment_t* segments = des->des_dst;
    mca_pml_csum_hdr_t* hdr = (mca_pml_csum_hdr_t*)segments->seg_addr.pval;
    uint16_t csum_received, csum;
    /* Fix: honor the runtime enable flag in addition to the BTL flag,
     * consistent with the FRAG/PUT/FIN callbacks. */
    bool do_csum = mca_pml_csum.enable_csum &&
        (btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);

    /* ignore runt fragments that cannot hold a common header */
    if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_csum_common_hdr_t)) ) {
        return;
    }

    csum_hdr_ntoh(hdr, MCA_PML_CSUM_HDR_TYPE_RNDV);
    if (do_csum) {
        /* recompute the CRC with the csum field zeroed, then restore
         * the wire value */
        csum_received = hdr->hdr_common.hdr_csum;
        hdr->hdr_common.hdr_csum = 0;
        csum = opal_csum16(hdr, sizeof(mca_pml_csum_rendezvous_hdr_t));
        hdr->hdr_common.hdr_csum = csum_received;
        if (csum_received != csum) {
            opal_output(0, "%s:%s:%d: Invalid \'rndv header\' - received csum:0x%04x != computed csum:0x%04x\n",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, csum_received, csum);
            orte_errmgr.abort(-1,NULL);
        }
    }

    mca_pml_csum_recv_frag_match(btl, &hdr->hdr_match, segments,
                                 des->des_dst_cnt, MCA_PML_CSUM_HDR_TYPE_RNDV);
    return;
}
/**
 * BTL callback for fragments carrying an RGET (rendezvous-get) header.
 * Converts the header to host byte order and hands the fragment to the
 * generic matching logic.
 *
 * NOTE(review): unlike the MATCH/RNDV/ACK/FRAG callbacks, no header
 * checksum is verified here - confirm whether RGET headers are
 * intentionally excluded from checksumming.
 */
void mca_pml_csum_recv_frag_callback_rget(mca_btl_base_module_t* btl,
                                          mca_btl_base_tag_t tag,
                                          mca_btl_base_descriptor_t* des,
                                          void* cbdata ) {
    mca_btl_base_segment_t* segments = des->des_dst;
    mca_pml_csum_hdr_t* hdr = (mca_pml_csum_hdr_t*)segments->seg_addr.pval;

    /* ignore runt fragments that cannot hold a common header */
    if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_csum_common_hdr_t)) ) {
        return;
    }

    csum_hdr_ntoh(hdr, MCA_PML_CSUM_HDR_TYPE_RGET);
    mca_pml_csum_recv_frag_match(btl, &hdr->hdr_match, segments,
                                 des->des_dst_cnt, MCA_PML_CSUM_HDR_TYPE_RGET);
    return;
}
/**
 * BTL callback for ACK control headers (receiver's response to a
 * rendezvous).  Verifies the header checksum when checksumming is active,
 * records the receiver's request handle, and schedules the remainder of
 * the send.
 */
void mca_pml_csum_recv_frag_callback_ack(mca_btl_base_module_t* btl,
                                         mca_btl_base_tag_t tag,
                                         mca_btl_base_descriptor_t* des,
                                         void* cbdata ) {
    mca_btl_base_segment_t* segments = des->des_dst;
    mca_pml_csum_hdr_t* hdr = (mca_pml_csum_hdr_t*)segments->seg_addr.pval;
    mca_pml_csum_send_request_t* sendreq;
    uint16_t csum_received, csum;
    /* Fix: honor the runtime enable flag in addition to the BTL flag,
     * consistent with the FRAG/PUT/FIN callbacks. */
    bool do_csum = mca_pml_csum.enable_csum &&
        (btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);

    /* ignore runt fragments that cannot hold a common header */
    if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_csum_common_hdr_t)) ) {
        return;
    }

    csum_hdr_ntoh(hdr, MCA_PML_CSUM_HDR_TYPE_ACK);
    if (do_csum) {
        csum_received = hdr->hdr_common.hdr_csum;
        hdr->hdr_common.hdr_csum = 0;
        csum = opal_csum16(hdr, sizeof(mca_pml_csum_ack_hdr_t));
        hdr->hdr_common.hdr_csum = csum_received;
        /* Fix: was verbosity level 0 (always printed); demoted to level 1
         * debug output like the other csum traces. */
        OPAL_OUTPUT_VERBOSE((1, mca_pml_base_output,
                             "%s Received \'ACK\' with header csum:0x%04x\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), csum));
        if (csum_received != csum) {
            opal_output(0, "%s:%s:%d: Invalid \'ACK header\' - received csum:0x%04x != computed csum:0x%04x\n",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, csum_received, csum);
            orte_errmgr.abort(-1,NULL);
        }
    }

    sendreq = (mca_pml_csum_send_request_t*)hdr->hdr_ack.hdr_src_req.pval;
    sendreq->req_recv = hdr->hdr_ack.hdr_dst_req;

    /* if the request should be delivered entirely by copy in/out
     * then throttle sends */
    if(hdr->hdr_common.hdr_flags & MCA_PML_CSUM_HDR_FLAGS_NORDMA)
        sendreq->req_throttle_sends = true;

    mca_pml_csum_send_request_copy_in_out(sendreq,
                                          hdr->hdr_ack.hdr_send_offset,
                                          sendreq->req_send.req_bytes_packed -
                                          hdr->hdr_ack.hdr_send_offset);

    OPAL_THREAD_ADD32(&sendreq->req_state, -1);

    if(send_request_pml_complete_check(sendreq) == false)
        mca_pml_csum_send_request_schedule(sendreq);

    return;
}
/**
 * BTL callback for FRAG headers (payload pieces of a pipelined message).
 * Optionally verifies the header checksum, then routes the data to the
 * owning receive request for progress.
 */
void mca_pml_csum_recv_frag_callback_frag(mca_btl_base_module_t* btl,
                                          mca_btl_base_tag_t tag,
                                          mca_btl_base_descriptor_t* des,
                                          void* cbdata ) {
    mca_btl_base_segment_t* segments = des->des_dst;
    mca_pml_csum_hdr_t* hdr = (mca_pml_csum_hdr_t*)segments->seg_addr.pval;
    mca_pml_csum_recv_request_t* recvreq;
    bool verify_hdr = mca_pml_csum.enable_csum &&
        (btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);

    /* drop anything too small to carry even a common header */
    if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_csum_common_hdr_t)) ) {
        return;
    }

    csum_hdr_ntoh(hdr, MCA_PML_CSUM_HDR_TYPE_FRAG);

    if (verify_hdr) {
        /* recompute the CRC with the csum field zeroed, then restore
         * the wire value */
        uint16_t wire_csum = hdr->hdr_common.hdr_csum;
        uint16_t local_csum;

        hdr->hdr_common.hdr_csum = 0;
        local_csum = opal_csum16(hdr, sizeof(mca_pml_csum_frag_hdr_t));
        hdr->hdr_common.hdr_csum = wire_csum;

        if (local_csum != wire_csum) {
            opal_output(0, "%s:%s:%d: Invalid \'frag header\' - received csum:0x%04x != computed csum:0x%04x\n",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, wire_csum, local_csum);
            orte_errmgr.abort(-1,NULL);
        }
    }

    recvreq = (mca_pml_csum_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
    mca_pml_csum_recv_request_progress_frag(recvreq, btl, segments, des->des_dst_cnt);
}
/**
 * BTL callback for PUT control headers (receiver tells the sender where
 * to RDMA-write).  Optionally verifies the header checksum, then starts
 * the RDMA put on the owning send request.
 */
void mca_pml_csum_recv_frag_callback_put(mca_btl_base_module_t* btl,
                                         mca_btl_base_tag_t tag,
                                         mca_btl_base_descriptor_t* des,
                                         void* cbdata ) {
    mca_btl_base_segment_t* segments = des->des_dst;
    mca_pml_csum_hdr_t* hdr = (mca_pml_csum_hdr_t*)segments->seg_addr.pval;
    mca_pml_csum_send_request_t* sendreq;
    uint16_t csum_received, csum;
    bool do_csum = mca_pml_csum.enable_csum &&
        (btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);

    /* ignore runt fragments that cannot hold a common header */
    if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_csum_common_hdr_t)) ) {
        return;
    }

    csum_hdr_ntoh(hdr, MCA_PML_CSUM_HDR_TYPE_PUT);
    if(do_csum) {
        csum_received = hdr->hdr_common.hdr_csum;
        hdr->hdr_common.hdr_csum = 0;
        csum = opal_csum16(hdr, sizeof(mca_pml_csum_rdma_hdr_t));
        hdr->hdr_common.hdr_csum = csum_received;
        /* Fix: was verbosity level 0 (always printed); demoted to level 1
         * debug output like the match-data trace. */
        OPAL_OUTPUT_VERBOSE((1, mca_pml_base_output,
                             "%s Received \'PUT\' with header csum:0x%04x\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), csum));
        if(csum_received != csum) {
            opal_output(0, "%s:%s:%d: Invalid \'PUT header\' - received csum:0x%04x != computed csum:0x%04x\n",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, csum_received, csum);
            orte_errmgr.abort(-1,NULL);
        }
    }

    sendreq = (mca_pml_csum_send_request_t*)hdr->hdr_rdma.hdr_req.pval;
    mca_pml_csum_send_request_put(sendreq,btl,&hdr->hdr_rdma);

    return;
}
/**
 * BTL callback for FIN control headers (completion notification for an
 * RDMA operation).  Optionally verifies the header checksum, then fires
 * the completion callback of the descriptor referenced in the header.
 */
void mca_pml_csum_recv_frag_callback_fin(mca_btl_base_module_t* btl,
                                         mca_btl_base_tag_t tag,
                                         mca_btl_base_descriptor_t* des,
                                         void* cbdata ) {
    mca_btl_base_segment_t* segments = des->des_dst;
    mca_pml_csum_hdr_t* hdr = (mca_pml_csum_hdr_t*)segments->seg_addr.pval;
    mca_btl_base_descriptor_t* rdma;
    uint16_t csum_received, csum;
    bool do_csum = mca_pml_csum.enable_csum &&
        (btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);

    /* ignore runt fragments that cannot hold a common header */
    if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_csum_common_hdr_t)) ) {
        return;
    }

    csum_hdr_ntoh(hdr, MCA_PML_CSUM_HDR_TYPE_FIN);
    if(do_csum) {
        csum_received = hdr->hdr_common.hdr_csum;
        hdr->hdr_common.hdr_csum = 0;
        csum = opal_csum16(hdr, sizeof(mca_pml_csum_fin_hdr_t));
        hdr->hdr_common.hdr_csum = csum_received;
        /* Fix: was verbosity level 0 (always printed); demoted to level 1
         * debug output like the match-data trace. */
        OPAL_OUTPUT_VERBOSE((1, mca_pml_base_output,
                             "%s Received \'FIN\' with header csum:0x%04x\n",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),csum));
        if(csum_received != csum) {
            opal_output(0, "%s:%s:%d: Invalid \'FIN header\' - received csum:0x%04x != computed csum:0x%04x\n",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, csum_received, csum);
            orte_errmgr.abort(-1,NULL);
        }
    }

    rdma = (mca_btl_base_descriptor_t*)hdr->hdr_fin.hdr_des.pval;
    /* propagate success/failure to the original RDMA descriptor's callback */
    rdma->des_cbfunc(btl, NULL, rdma,
                     hdr->hdr_fin.hdr_fail ? OMPI_ERROR : OMPI_SUCCESS);

    return;
}
/* Sentinel "queue exhausted" sequence number: the maximum representable
 * value.  Fix: the original macro carried a stray trailing semicolon and
 * lacked outer parentheses, which only compiled by accident because every
 * use sat in statement-final position; parenthesized and semicolon removed
 * so it is safe in any expression context. */
#define PML_MAX_SEQ (~((mca_pml_sequence_t)0))

/**
 * Return the oldest posted receive on @queue without removing it, or
 * NULL when the queue is empty.
 */
static inline mca_pml_csum_recv_request_t* get_posted_recv(opal_list_t *queue)
{
    if(opal_list_get_size(queue) == 0)
        return NULL;

    return (mca_pml_csum_recv_request_t*)opal_list_get_first(queue);
}
/**
 * Return the posted receive that follows @req on @queue, or NULL when
 * @req is the last element.
 */
static inline mca_pml_csum_recv_request_t* get_next_posted_recv(
    opal_list_t *queue,
    mca_pml_csum_recv_request_t* req)
{
    opal_list_item_t *next = opal_list_get_next((opal_list_item_t*)req);

    return (opal_list_get_end(queue) == next)
               ? NULL
               : (mca_pml_csum_recv_request_t*)next;
}
/**
 * Walk the wild (ANY_SOURCE) and per-peer specific receive queues in
 * posting order and return the earliest-posted receive whose tag matches
 * the incoming header, removing it from its queue.  Returns NULL when no
 * posted receive matches.
 */
static mca_pml_csum_recv_request_t *match_incomming(
    mca_pml_csum_match_hdr_t *hdr, mca_pml_csum_comm_t *comm,
    mca_pml_csum_comm_proc_t *proc)
{
    mca_pml_csum_recv_request_t *specific_recv, *wild_recv;
    mca_pml_sequence_t wild_recv_seq, specific_recv_seq;
    int tag = hdr->hdr_tag;

    specific_recv = get_posted_recv(&proc->specific_receives);
    wild_recv = get_posted_recv(&comm->wild_receives);

    /* PML_MAX_SEQ serves as the sentinel for an exhausted queue */
    wild_recv_seq = wild_recv ?
        wild_recv->req_recv.req_base.req_sequence : PML_MAX_SEQ;
    specific_recv_seq = specific_recv ?
        specific_recv->req_recv.req_base.req_sequence : PML_MAX_SEQ;

    /* they are equal only if both are PML_MAX_SEQ, i.e. both queues are
     * exhausted */
    while(wild_recv_seq != specific_recv_seq) {
        mca_pml_csum_recv_request_t **match;
        opal_list_t *queue;
        int req_tag;
        mca_pml_sequence_t *seq;

        /* always consider the earlier-posted of the two candidates so
         * that matching honors posting order across both queues */
        if (OPAL_UNLIKELY(wild_recv_seq < specific_recv_seq)) {
            match = &wild_recv;
            queue = &comm->wild_receives;
            seq = &wild_recv_seq;
        } else {
            match = &specific_recv;
            queue = &proc->specific_receives;
            seq = &specific_recv_seq;
        }

        req_tag = (*match)->req_recv.req_base.req_tag;
        /* exact tag match, or ANY_TAG matching a non-negative tag */
        if(req_tag == tag || (req_tag == OMPI_ANY_TAG && tag >= 0)) {
            opal_list_remove_item(queue, (opal_list_item_t*)(*match));
            PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_REMOVE_FROM_POSTED_Q,
                                    &((*match)->req_recv.req_base), PERUSE_RECV);
            return *match;
        }

        /* advance within the queue just examined */
        *match = get_next_posted_recv(queue, *match);
        *seq = (*match) ? (*match)->req_recv.req_base.req_sequence : PML_MAX_SEQ;
    }
    return NULL;
}
/**
 * Try to match an incoming fragment against the posted receives, handling
 * probe requests: a matched PROBE is completed in place and matching
 * continues until a non-probe receive matches or the fragment is queued
 * as unexpected.
 *
 * @return the matched receive request, or NULL if the fragment was placed
 *         on the unexpected queue.
 */
static mca_pml_csum_recv_request_t *match_one(mca_btl_base_module_t *btl,
                                              mca_pml_csum_match_hdr_t *hdr, mca_btl_base_segment_t* segments,
                                              size_t num_segments, ompi_communicator_t *comm_ptr,
                                              mca_pml_csum_comm_proc_t *proc,
                                              mca_pml_csum_recv_frag_t* frag)
{
    mca_pml_csum_recv_request_t *match;
    mca_pml_csum_comm_t *comm = (mca_pml_csum_comm_t *)comm_ptr->c_pml_comm;

    do {
        match = match_incomming(hdr, comm, proc);

        /* if match found, process data */
        if(OPAL_UNLIKELY(NULL == match)) {
            /* if no match found, place on unexpected queue */
            append_frag_to_list(&proc->unexpected_frags, btl, hdr, segments,
                                num_segments, frag);
            PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_INSERT_IN_UNEX_Q, comm_ptr,
                                   hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
            return NULL;
        }

        match->req_recv.req_base.req_proc = proc->ompi_proc;

        if(MCA_PML_REQUEST_PROBE == match->req_recv.req_base.req_type) {
            /* complete the probe */
            mca_pml_csum_recv_request_matched_probe(match, btl, segments,
                                                    num_segments);
            /* attempt to match actual request */
            continue;
        }

        PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_MSG_MATCH_POSTED_REQ,
                                &(match->req_recv.req_base), PERUSE_RECV);
        break;
    } while(true);

    return match;
}
/**
 * Scan the peer's out-of-order list for the fragment carrying the
 * sequence number now expected.  When found, the fragment is removed
 * from the list and returned; otherwise NULL.
 */
static mca_pml_csum_recv_frag_t *check_cantmatch_for_match(
    mca_pml_csum_comm_proc_t *proc)
{
    opal_list_item_t *item;

    /* look for a saved fragment whose sequence number is the one we
     * are currently expecting from this peer */
    for (item = opal_list_get_first(&proc->frags_cant_match);
         item != opal_list_get_end(&proc->frags_cant_match);
         item = opal_list_get_next(item)) {
        mca_pml_csum_recv_frag_t *candidate = (mca_pml_csum_recv_frag_t*)item;

        if (candidate->hdr.hdr_match.hdr_seq == proc->expected_sequence) {
            opal_list_remove_item(&proc->frags_cant_match, item);
            return candidate;
        }
    }

    return NULL;
}
/**
 * RTS/CTS receive side matching
 *
 * @param hdr list of parameters needed for matching
 *             This list is also embedded in frag,
* but this allows to save a memory copy when
* a match is made in this routine. (IN)
* @param frag pointer to receive fragment which we want
* to match (IN/OUT). If a match is not made,
* hdr is copied to frag.
* @param match_made parameter indicating if we matched frag/
* hdr (OUT)
* @param additional_matches if a match is made with frag, we
* may be able to match fragments that previously
* have arrived out-of-order. If this is the
* case, the associated fragment descriptors are
* put on this list for further processing. (OUT)
*
* @return OMPI error code
*
* This routine is used to try and match a newly arrived message fragment
* to pre-posted receives. The following assumptions are made
* - fragments are received out of order
* - for long messages, e.g. more than one fragment, a RTS/CTS algorithm
* is used.
* - 2nd and greater fragments include a receive descriptor pointer
* - fragments may be dropped
* - fragments may be corrupt
* - this routine may be called simultaneously by more than one thread
*/
/**
 * Generic (slow-path) matching for MATCH/RNDV/RGET fragments, supporting
 * out-of-order delivery.  In-sequence fragments are matched immediately;
 * ahead-of-sequence fragments are buffered on frags_cant_match until the
 * gap is filled, at which point they are replayed via out_of_order_match.
 */
static int mca_pml_csum_recv_frag_match( mca_btl_base_module_t *btl,
                                         mca_pml_csum_match_hdr_t *hdr,
                                         mca_btl_base_segment_t* segments,
                                         size_t num_segments,
                                         int type)
{
    /* local variables */
    uint16_t next_msg_seq_expected, frag_msg_seq;
    ompi_communicator_t *comm_ptr;
    mca_pml_csum_recv_request_t *match = NULL;
    mca_pml_csum_comm_t *comm;
    mca_pml_csum_comm_proc_t *proc;
    mca_pml_csum_recv_frag_t* frag = NULL;

    /* communicator pointer */
    comm_ptr = ompi_comm_lookup(hdr->hdr_ctx);
    if(OPAL_UNLIKELY(NULL == comm_ptr)) {
        /* This is a special case. A message for a not yet existing
         * communicator can happen. Instead of doing a matching we
         * will temporarily add it to a pending queue in the PML.
         * Later on, when the communicator is completely instantiated,
         * this pending queue will be searched and all matching fragments
         * moved to the right communicator.
         */
        append_frag_to_list( &mca_pml_csum.non_existing_communicator_pending,
                             btl, hdr, segments, num_segments, frag );
        return OMPI_SUCCESS;
    }
    comm = (mca_pml_csum_comm_t *)comm_ptr->c_pml_comm;

    /* source sequence number */
    frag_msg_seq = hdr->hdr_seq;
    proc = &comm->procs[hdr->hdr_src];

    /**
     * We generate the MSG_ARRIVED event as soon as the PML is aware of a
     * matching fragment arrival, whether or not it is received in the
     * correct order. This allows the tools to figure out if messages are
     * not received in the correct order (if multiple network interfaces).
     */
    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm_ptr,
                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);

    /* get next expected message sequence number - if threaded
     * run, lock to make sure that if another thread is processing
     * a frag from the same message a match is made only once.
     * Also, this prevents other posted receives (for a pair of
     * end points) from being processed, and potentially "losing"
     * the fragment.
     */
    OPAL_THREAD_LOCK(&comm->matching_lock);

    /* get sequence number of next message that can be processed */
    next_msg_seq_expected = (uint16_t)proc->expected_sequence;
    if(OPAL_UNLIKELY(frag_msg_seq != next_msg_seq_expected))
        goto wrong_seq;

    /*
     * This is the sequence number we were expecting,
     * so we can try matching it to already posted
     * receives.
     */

 out_of_order_match:
    /* Entry point for replaying a buffered out-of-order fragment (jumped
     * to from the frags_cant_match drain below, with the lock held). */

    /* We're now expecting the next sequence number. */
    proc->expected_sequence++;

    /**
     * We generate the SEARCH_POSTED_QUEUE only when the message is received
     * in the correct sequence. Otherwise, we delay the event generation until
     * we reach the correct sequence number.
     */
    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_BEGIN, comm_ptr,
                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);

    match = match_one(btl, hdr, segments, num_segments, comm_ptr, proc, frag);

    /**
     * The match is over. We generate the SEARCH_POSTED_Q_END here, before going
     * into the mca_pml_csum_check_cantmatch_for_match so we can make a difference
     * for the searching time for all messages.
     */
    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_END, comm_ptr,
                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);

    /* release matching lock before processing fragment */
    OPAL_THREAD_UNLOCK(&comm->matching_lock);

    if(OPAL_LIKELY(match)) {
        /* dispatch on the header type the fragment arrived with */
        switch(type) {
        case MCA_PML_CSUM_HDR_TYPE_MATCH:
            mca_pml_csum_recv_request_progress_match(match, btl, segments, num_segments);
            break;
        case MCA_PML_CSUM_HDR_TYPE_RNDV:
            mca_pml_csum_recv_request_progress_rndv(match, btl, segments, num_segments);
            break;
        case MCA_PML_CSUM_HDR_TYPE_RGET:
            mca_pml_csum_recv_request_progress_rget(match, btl, segments, num_segments);
            break;
        }

        /* a replayed out-of-order fragment owns its descriptor - return it */
        if(OPAL_UNLIKELY(frag))
            MCA_PML_CSUM_RECV_FRAG_RETURN(frag);
    }

    /*
     * Now that new message has arrived, check to see if
     * any fragments on the c_c_frags_cant_match list
     * may now be used to form new matches
     */
    if(OPAL_UNLIKELY(opal_list_get_size(&proc->frags_cant_match) > 0)) {
        OPAL_THREAD_LOCK(&comm->matching_lock);
        if((frag = check_cantmatch_for_match(proc))) {
            /* replay the buffered fragment as if it had just arrived;
             * the matching lock is re-held across the jump */
            hdr = &frag->hdr.hdr_match;
            segments = frag->segments;
            num_segments = frag->num_segments;
            btl = frag->btl;
            type = hdr->hdr_common.hdr_type;
            goto out_of_order_match;
        }
        OPAL_THREAD_UNLOCK(&comm->matching_lock);
    }

    return OMPI_SUCCESS;

 wrong_seq:
    /*
     * This message comes after the next expected, so it
     * is ahead of sequence. Save it for later.
     */
    append_frag_to_list(&proc->frags_cant_match, btl, hdr, segments,
                        num_segments, NULL);
    OPAL_THREAD_UNLOCK(&comm->matching_lock);
    return OMPI_SUCCESS;
}

176
ompi/mca/pml/csum/pml_csum_recvfrag.h Обычный файл
Просмотреть файл

@ -0,0 +1,176 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2009 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_PML_CSUM_RECVFRAG_H
#define MCA_PML_CSUM_RECVFRAG_H
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/bml/bml.h"
#include "pml_csum_hdr.h"
BEGIN_C_DECLS
/**
 * Staging buffer descriptor: used when an incoming fragment's payload is
 * too large for the receive fragment's inline storage and must be held in
 * memory obtained from mca_pml_csum.allocator (see RECV_FRAG_INIT below).
 */
struct mca_pml_csum_buffer_t {
    size_t len;    /**< number of bytes staged at addr */
    void * addr;   /**< allocator-provided buffer holding the payload */
};
typedef struct mca_pml_csum_buffer_t mca_pml_csum_buffer_t;

/**
 * Receive fragment: caches an incoming fragment (PML header plus a
 * contiguous copy of its payload) until it can be matched and delivered
 * to a posted receive.
 */
struct mca_pml_csum_recv_frag_t {
    ompi_free_list_item_t super;    /**< free-list linkage */
    mca_pml_csum_hdr_t hdr;         /**< copy of the fragment's PML header */
    size_t num_segments;            /**< valid entries in segments[] */
    mca_btl_base_module_t* btl;     /**< BTL the fragment arrived on */
    mca_btl_base_segment_t segments[MCA_BTL_DES_MAX_SEGMENTS];
    mca_pml_csum_buffer_t buffers[MCA_BTL_DES_MAX_SEGMENTS]; /**< overflow staging buffers */
    /* inline storage for payloads up to mca_pml_csum.unexpected_limit;
     * presumably over-allocated past the end of the struct by the free
     * list - verify against the free-list construction in pml_csum.c */
    unsigned char addr[1];
};
typedef struct mca_pml_csum_recv_frag_t mca_pml_csum_recv_frag_t;

OBJ_CLASS_DECLARATION(mca_pml_csum_recv_frag_t);
#define MCA_PML_CSUM_RECV_FRAG_ALLOC(frag,rc) \
do { \
ompi_free_list_item_t* item; \
OMPI_FREE_LIST_WAIT(&mca_pml_csum.recv_frags, item, rc); \
frag = (mca_pml_csum_recv_frag_t*)item; \
} while(0)
#define MCA_PML_CSUM_RECV_FRAG_INIT(frag, hdr, segs, cnt, btl ) \
do { \
size_t i, _size; \
mca_btl_base_segment_t* macro_segments = frag->segments; \
mca_pml_csum_buffer_t* buffers = frag->buffers; \
unsigned char* _ptr = (unsigned char*)frag->addr; \
/* init recv_frag */ \
frag->btl = btl; \
frag->hdr = *(mca_pml_csum_hdr_t*)hdr; \
frag->num_segments = 1; \
_size = segs[0].seg_len; \
for( i = 1; i < cnt; i++ ) { \
_size += segs[i].seg_len; \
} \
/* copy over data */ \
if(_size <= mca_pml_csum.unexpected_limit ) { \
macro_segments[0].seg_addr.pval = frag->addr; \
} else { \
buffers[0].len = _size; \
buffers[0].addr = (char*) \
mca_pml_csum.allocator->alc_alloc( mca_pml_csum.allocator, \
buffers[0].len, \
0, NULL); \
_ptr = (unsigned char*)(buffers[0].addr); \
macro_segments[0].seg_addr.pval = buffers[0].addr; \
} \
macro_segments[0].seg_len = _size; \
for( i = 0; i < cnt; i++ ) { \
memcpy( _ptr, segs[i].seg_addr.pval, segs[i].seg_len); \
_ptr += segs[i].seg_len; \
} \
} while(0)
#define MCA_PML_CSUM_RECV_FRAG_RETURN(frag) \
do { \
if( frag->segments[0].seg_len > mca_pml_csum.unexpected_limit ) { \
/* return buffers */ \
mca_pml_csum.allocator->alc_free( mca_pml_csum.allocator, \
frag->buffers[0].addr ); \
} \
frag->num_segments = 0; \
\
/* return recv_frag */ \
OMPI_FREE_LIST_RETURN(&mca_pml_csum.recv_frags, \
(ompi_free_list_item_t*)frag); \
} while(0)
/**
* Callback from BTL on receipt of a recv_frag (match).
*/
extern void mca_pml_csum_recv_frag_callback_match( mca_btl_base_module_t *btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* descriptor,
void* cbdata );
/**
* Callback from BTL on receipt of a recv_frag (rndv).
*/
extern void mca_pml_csum_recv_frag_callback_rndv( mca_btl_base_module_t *btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* descriptor,
void* cbdata );
/**
* Callback from BTL on receipt of a recv_frag (rget).
*/
extern void mca_pml_csum_recv_frag_callback_rget( mca_btl_base_module_t *btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* descriptor,
void* cbdata );
/**
* Callback from BTL on receipt of a recv_frag (ack).
*/
extern void mca_pml_csum_recv_frag_callback_ack( mca_btl_base_module_t *btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* descriptor,
void* cbdata );
/**
* Callback from BTL on receipt of a recv_frag (frag).
*/
extern void mca_pml_csum_recv_frag_callback_frag( mca_btl_base_module_t *btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* descriptor,
void* cbdata );
/**
* Callback from BTL on receipt of a recv_frag (put).
*/
extern void mca_pml_csum_recv_frag_callback_put( mca_btl_base_module_t *btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* descriptor,
void* cbdata );
/**
* Callback from BTL on receipt of a recv_frag (fin).
*/
extern void mca_pml_csum_recv_frag_callback_fin( mca_btl_base_module_t *btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* descriptor,
void* cbdata );
END_C_DECLS
#endif

1110
ompi/mca/pml/csum/pml_csum_recvreq.c Обычный файл

Разница между файлами не показана из-за своего большого размера Загрузить разницу

427
ompi/mca/pml/csum/pml_csum_recvreq.h Обычный файл
Просмотреть файл

@ -0,0 +1,427 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef OMPI_PML_CSUM_RECV_REQUEST_H
#define OMPI_PML_CSUM_RECV_REQUEST_H
#include "pml_csum.h"
#include "pml_csum_rdma.h"
#include "pml_csum_rdmafrag.h"
#include "ompi/proc/proc.h"
#include "ompi/mca/pml/csum/pml_csum_comm.h"
#include "ompi/mca/mpool/base/base.h"
#include "ompi/mca/pml/base/pml_base_recvreq.h"
#include "ompi/datatype/datatype.h"
BEGIN_C_DECLS
/**
 * State of a posted receive: the base request plus progress, ACK and
 * RDMA bookkeeping used by the csum PML.
 */
struct mca_pml_csum_recv_request_t {
    mca_pml_base_recv_request_t req_recv;  /**< base receive request */
    ompi_ptr_t remote_req_send;  /* handle of the peer's send request,
                                    echoed back in ACKs (inferred from
                                    ack_send usage - verify) */
    int32_t  req_lock;           /**< scheduling lock / reentrancy counter (see lock_recv_request) */
    size_t   req_pipeline_depth;
    size_t   req_bytes_received;  /**< amount of data transferred into the user buffer */
    size_t   req_bytes_delivered; /**< local size of the data as suggested by the user */
    size_t   req_rdma_offset;
    size_t   req_send_offset;
    uint32_t req_rdma_cnt;       /**< number of valid entries in req_rdma[] */
    uint32_t req_rdma_idx;
    bool req_pending;
    bool req_ack_sent;           /**< whether ack was sent to the sender */
    bool req_match_received;     /**< Prevent request to be completed prematurely */
    opal_mutex_t lock;           /**< serializes convertor access during UNPACK */
    mca_pml_csum_com_btl_t req_rdma[1];  /**< per-BTL RDMA state (trailing array) */
};
typedef struct mca_pml_csum_recv_request_t mca_pml_csum_recv_request_t;
OBJ_CLASS_DECLARATION(mca_pml_csum_recv_request_t);
/* Atomically bump the request's scheduling counter; the caller that
 * takes it from 0 -> 1 owns the scheduling loop. */
static inline bool lock_recv_request(mca_pml_csum_recv_request_t *recvreq)
{
    int32_t new_count = OPAL_THREAD_ADD32(&recvreq->req_lock, 1);
    return (1 == new_count);
}
/* Drop one reference on the scheduling counter; returns true when this
 * caller released the last one (counter back to 0). */
static inline bool unlock_recv_request(mca_pml_csum_recv_request_t *recvreq)
{
    int32_t new_count = OPAL_THREAD_ADD32(&recvreq->req_lock, -1);
    return (0 == new_count);
}
/**
 * Allocate a recv request from the modules free list.
 *
 * @param recvreq (OUT) Receive request.
 * @param rc (OUT) OMPI_SUCCESS or error status on failure.
 */
#define MCA_PML_CSUM_RECV_REQUEST_ALLOC(recvreq, rc)                \
do {                                                                \
    ompi_free_list_item_t* item;                                    \
    rc = OMPI_SUCCESS;                                              \
    OMPI_FREE_LIST_GET(&mca_pml_base_recv_requests, item, rc);      \
    recvreq = (mca_pml_csum_recv_request_t*)item;                   \
} while(0)

/**
 * Initialize a receive request with call parameters.  Thin wrapper
 * around the base PML request initializer.
 *
 * @param request (IN) Receive request.
 * @param addr (IN) User buffer.
 * @param count (IN) Number of elements of indicated datatype.
 * @param datatype (IN) User defined datatype.
 * @param src (IN) Source rank w/in the communicator.
 * @param tag (IN) User defined tag.
 * @param comm (IN) Communicator.
 * @param persistent (IN) Is this a persistent request.
 */
#define MCA_PML_CSUM_RECV_REQUEST_INIT( request,                    \
                                        addr,                       \
                                        count,                      \
                                        datatype,                   \
                                        src,                        \
                                        tag,                        \
                                        comm,                       \
                                        persistent)                 \
do {                                                                \
    MCA_PML_BASE_RECV_REQUEST_INIT( &(request)->req_recv,           \
                                    addr,                           \
                                    count,                          \
                                    datatype,                       \
                                    src,                            \
                                    tag,                            \
                                    comm,                           \
                                    persistent);                    \
} while(0)

/**
 * Mark the request as completed at MPI level for internal purposes.
 * Emits the PERUSE completion event and signals any waiter.
 *
 * @param recvreq (IN) Receive request.
 */
#define MCA_PML_CSUM_RECV_REQUEST_MPI_COMPLETE( recvreq )                         \
do {                                                                              \
    PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE,                            \
                             &(recvreq->req_recv.req_base), PERUSE_RECV );        \
    ompi_request_complete(  &(recvreq->req_recv.req_base.req_ompi), true );       \
} while (0)

/*
 * Free the PML receive request: release base-request resources and put
 * the request back on the global free list.
 */
#define MCA_PML_CSUM_RECV_REQUEST_RETURN(recvreq)                   \
{                                                                   \
    MCA_PML_BASE_RECV_REQUEST_FINI(&(recvreq)->req_recv);           \
    OMPI_FREE_LIST_RETURN( &mca_pml_base_recv_requests,             \
                           (ompi_free_list_item_t*)(recvreq));      \
}
/**
 * Complete receive request. Request structure cannot be accessed after calling
 * this function any more.
 *
 * Deregisters any RDMA resources pinned for the request, then either
 * returns the request (if the user already freed it) or marks it
 * MPI-complete, flagging MPI_ERR_TRUNCATE when the sender's message was
 * larger than the posted buffer.
 *
 * @param recvreq (IN)  Receive request.
 */
static inline void
recv_request_pml_complete(mca_pml_csum_recv_request_t *recvreq)
{
    size_t i;

    assert(false == recvreq->req_recv.req_base.req_pml_complete);

    if(recvreq->req_recv.req_bytes_packed > 0) {
        PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END,
                                 &recvreq->req_recv.req_base, PERUSE_RECV );
    }

    /* release any memory registrations pinned for RDMA */
    for(i = 0; i < recvreq->req_rdma_cnt; i++) {
        mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[i].btl_reg;
        if( NULL != btl_reg && btl_reg->mpool != NULL) {
            btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg );
        }
    }
    recvreq->req_rdma_cnt = 0;

    OPAL_THREAD_LOCK(&ompi_request_lock);
    if(true == recvreq->req_recv.req_base.req_free_called) {
        /* the user already freed the request - recycle it now */
        MCA_PML_CSUM_RECV_REQUEST_RETURN(recvreq);
    } else {
        /* initialize request status */
        recvreq->req_recv.req_base.req_pml_complete = true;
        recvreq->req_recv.req_base.req_ompi.req_status._count =
            (int)recvreq->req_bytes_received;
        /* sender sent more than the receive buffer can hold: truncation */
        if (recvreq->req_recv.req_bytes_packed > recvreq->req_bytes_delivered) {
            recvreq->req_recv.req_base.req_ompi.req_status._count =
                (int)recvreq->req_recv.req_bytes_packed;
            recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR =
                MPI_ERR_TRUNCATE;
        }
        MCA_PML_CSUM_RECV_REQUEST_MPI_COMPLETE(recvreq);
    }
    OPAL_THREAD_UNLOCK(&ompi_request_lock);
}
/* Complete the request if it is finished: the match must have arrived,
 * every expected byte must have landed, and this caller must win the
 * scheduling lock.  Returns true when completion was performed. */
static inline bool
recv_request_pml_complete_check(mca_pml_csum_recv_request_t *recvreq)
{
    opal_atomic_rmb();

    if(!recvreq->req_match_received)
        return false;
    if(recvreq->req_bytes_received < recvreq->req_recv.req_bytes_packed)
        return false;
    /* only the thread that takes the lock from 0 may finalize */
    if(!lock_recv_request(recvreq))
        return false;

    recv_request_pml_complete(recvreq);
    return true;
}
/* entry point for starting a posted receive (defined in pml_csum_recvreq.c) */
extern void mca_pml_csum_recv_req_start(mca_pml_csum_recv_request_t *req);
#define MCA_PML_CSUM_RECV_REQUEST_START(r) mca_pml_csum_recv_req_start(r)

/**
 * Prepare the request's convertor for the (now known) peer, turning on
 * convertor-side checksumming only when the csum feature is enabled AND
 * at least one BTL to this peer advertises MCA_BTL_FLAGS_NEED_CSUM.
 * Also computes req_bytes_delivered from the unpacked size.
 */
static inline void prepare_recv_req_converter(mca_pml_csum_recv_request_t *req)
{
    mca_bml_base_endpoint_t* endpoint =
        req->req_recv.req_base.req_proc->proc_bml;
    bool do_csum = mca_pml_csum.enable_csum &&
        (endpoint->btl_flags_or & MCA_BTL_FLAGS_NEED_CSUM);

    /* bitwise OR is intentional: nonzero iff size != 0 or count != 0 */
    if( req->req_recv.req_base.req_datatype->size | req->req_recv.req_base.req_count ) {
        ompi_convertor_copy_and_prepare_for_recv(
                req->req_recv.req_base.req_proc->proc_convertor,
                req->req_recv.req_base.req_datatype,
                req->req_recv.req_base.req_count,
                req->req_recv.req_base.req_addr,
                (do_csum ? CONVERTOR_WITH_CHECKSUM: 0),
                &req->req_recv.req_base.req_convertor);
        ompi_convertor_get_unpacked_size(&req->req_recv.req_base.req_convertor,
                                         &req->req_bytes_delivered);
    }
}
#define MCA_PML_CSUM_RECV_REQUEST_MATCHED(request, hdr) \
    recv_req_matched(request, hdr)

/**
 * Record that a receive was matched to an incoming message: fill in the
 * actual source/tag in the status and publish req_match_received (with a
 * write barrier) so completion checks in other threads see it.
 */
static inline void recv_req_matched(mca_pml_csum_recv_request_t *req,
                                    mca_pml_csum_match_hdr_t *hdr)
{
    req->req_recv.req_base.req_ompi.req_status.MPI_SOURCE = hdr->hdr_src;
    req->req_recv.req_base.req_ompi.req_status.MPI_TAG = hdr->hdr_tag;
    req->req_match_received = true;
    /* ensure the status stores above are visible before the flag */
    opal_atomic_wmb();

    if(req->req_recv.req_bytes_packed > 0) {
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
        if(MPI_ANY_SOURCE == req->req_recv.req_base.req_peer) {
            /* non wildcard prepared during post recv */
            prepare_recv_req_converter(req);
        }
#endif  /* OMPI_ENABLE_HETEROGENEOUS_SUPPORT */
        PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_XFER_BEGIN,
                                &req->req_recv.req_base, PERUSE_RECV);
    }
}
/**
 * Unpack received segment data into the request's user buffer.
 *
 * Builds an iovec over [num_segments] BTL segments, skipping the first
 * [seg_offset] payload bytes, positions the convertor at [data_offset]
 * within the message, and unpacks at most [bytes_received] bytes.
 * [bytes_delivered] is set to the number of bytes actually unpacked
 * (0 when the request expects no data).
 *
 * Fixes vs. the original:
 *  - the PERUSE trace event referenced the caller's variable 'recvreq'
 *    instead of the macro parameter 'request' (compiled only when
 *    PERUSE was off or the caller used that exact name);
 *  - 'offset' is now reset to 0 after the first partially-consumed
 *    segment, so it is no longer re-applied to every later segment.
 */
#define MCA_PML_CSUM_RECV_REQUEST_UNPACK( request,                                \
                                          segments,                               \
                                          num_segments,                           \
                                          seg_offset,                             \
                                          data_offset,                            \
                                          bytes_received,                         \
                                          bytes_delivered)                        \
do {                                                                              \
    bytes_delivered = 0;                                                          \
    if(request->req_recv.req_bytes_packed > 0) {                                  \
        struct iovec iov[MCA_BTL_DES_MAX_SEGMENTS];                               \
        uint32_t iov_count = 0;                                                   \
        size_t max_data = bytes_received;                                         \
        size_t n, offset = seg_offset;                                            \
        mca_btl_base_segment_t* segment = segments;                               \
                                                                                  \
        OPAL_THREAD_LOCK(&request->lock);                                         \
        for( n = 0; n < num_segments; n++, segment++ ) {                          \
            if(offset >= segment->seg_len) {                                      \
                /* this whole segment is header/already-consumed data */          \
                offset -= segment->seg_len;                                       \
            } else {                                                              \
                iov[iov_count].iov_len = segment->seg_len - offset;               \
                iov[iov_count].iov_base = (IOVBASE_TYPE*)                         \
                    ((unsigned char*)segment->seg_addr.pval + offset);            \
                iov_count++;                                                      \
                offset = 0;  /* remaining segments are all payload */             \
            }                                                                     \
        }                                                                         \
        PERUSE_TRACE_COMM_OMPI_EVENT (PERUSE_COMM_REQ_XFER_CONTINUE,              \
                                      &(request->req_recv.req_base), max_data,    \
                                      PERUSE_RECV);                               \
        ompi_convertor_set_position( &(request->req_recv.req_base.req_convertor), \
                                     &data_offset );                              \
        ompi_convertor_unpack( &(request)->req_recv.req_base.req_convertor,       \
                               iov,                                               \
                               &iov_count,                                        \
                               &max_data );                                       \
        bytes_delivered = max_data;                                               \
        OPAL_THREAD_UNLOCK(&request->lock);                                       \
    }                                                                             \
} while (0)
/**
 * Progress a request matched to an eager (MATCH) fragment: unpack the
 * inline payload and complete.
 */
void mca_pml_csum_recv_request_progress_match(
    mca_pml_csum_recv_request_t* req,
    struct mca_btl_base_module_t* btl,
    mca_btl_base_segment_t* segments,
    size_t num_segments);

/**
 * Progress a request with an additional data (FRAG) fragment.
 */
void mca_pml_csum_recv_request_progress_frag(
    mca_pml_csum_recv_request_t* req,
    struct mca_btl_base_module_t* btl,
    mca_btl_base_segment_t* segments,
    size_t num_segments);

/**
 * Progress a request matched to a rendezvous (RNDV) fragment.
 */
void mca_pml_csum_recv_request_progress_rndv(
    mca_pml_csum_recv_request_t* req,
    struct mca_btl_base_module_t* btl,
    mca_btl_base_segment_t* segments,
    size_t num_segments);

/**
 * Progress a request matched to an RDMA-get (RGET) fragment.
 */
void mca_pml_csum_recv_request_progress_rget(
    mca_pml_csum_recv_request_t* req,
    struct mca_btl_base_module_t* btl,
    mca_btl_base_segment_t* segments,
    size_t num_segments);

/**
 * Fill in status for a probe that matched this fragment (no data is
 * consumed).
 */
void mca_pml_csum_recv_request_matched_probe(
    mca_pml_csum_recv_request_t* req,
    struct mca_btl_base_module_t* btl,
    mca_btl_base_segment_t* segments,
    size_t num_segments);

/**
 * Schedule RDMA-put traffic for the request; one pass of the scheduling
 * loop (see mca_pml_csum_recv_request_schedule_exclusive).
 */
int mca_pml_csum_recv_request_schedule_once(
    mca_pml_csum_recv_request_t* req, mca_bml_base_btl_t* start_bml_btl);
/* Run the scheduling loop until no other thread has re-entered (each
 * concurrent caller bumps req_lock) or resources are exhausted; on
 * success, check whether the request can now be completed. */
static inline int mca_pml_csum_recv_request_schedule_exclusive(
    mca_pml_csum_recv_request_t* req,
    mca_bml_base_btl_t* start_bml_btl)
{
    int rc;

    for(;;) {
        rc = mca_pml_csum_recv_request_schedule_once(req, start_bml_btl);
        if(OMPI_ERR_OUT_OF_RESOURCE == rc)
            break;
        if(unlock_recv_request(req))
            break;
    }

    if(OMPI_SUCCESS == rc)
        recv_request_pml_complete_check(req);

    return rc;
}
/* Public scheduling entry point: only the caller that wins the
 * scheduling lock runs the loop; concurrent callers merely record an
 * extra pass via the counter. */
static inline void mca_pml_csum_recv_request_schedule(
    mca_pml_csum_recv_request_t* req,
    mca_bml_base_btl_t* start_bml_btl)
{
    if(lock_recv_request(req)) {
        (void)mca_pml_csum_recv_request_schedule_exclusive(req, start_bml_btl);
    }
}
/**
 * Queue an ACK for later delivery when no eager BTL currently has
 * resources.  P = proc, S = sender's request handle, D = local request,
 * O = send offset to resume from.
 * NOTE(review): _rc from PCKT_PENDING_ALLOC is not checked here.
 */
#define MCA_PML_CSUM_ADD_ACK_TO_PENDING(P, S, D, O)                     \
    do {                                                                \
        mca_pml_csum_pckt_pending_t *_pckt;                             \
        int _rc;                                                        \
                                                                        \
        MCA_PML_CSUM_PCKT_PENDING_ALLOC(_pckt,_rc);                     \
        _pckt->hdr.hdr_common.hdr_type = MCA_PML_CSUM_HDR_TYPE_ACK;     \
        _pckt->hdr.hdr_ack.hdr_src_req.lval = (S);                      \
        _pckt->hdr.hdr_ack.hdr_dst_req.pval = (D);                      \
        _pckt->hdr.hdr_ack.hdr_send_offset = (O);                       \
        _pckt->proc = (P);                                              \
        _pckt->bml_btl = NULL;                                          \
        OPAL_THREAD_LOCK(&mca_pml_csum.lock);                           \
        opal_list_append(&mca_pml_csum.pckt_pending,                    \
                         (opal_list_item_t*)_pckt);                     \
        OPAL_THREAD_UNLOCK(&mca_pml_csum.lock);                         \
    } while(0)
int mca_pml_csum_recv_request_ack_send_btl(ompi_proc_t* proc,
mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req,
uint64_t hdr_rdma_offset, bool nordma);
/* Send an ACK to the peer: try each eager BTL in round-robin order; if
 * none accepts it, park the ACK on the pending list and report the
 * resource shortage. */
static inline int mca_pml_csum_recv_request_ack_send(ompi_proc_t* proc,
        uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset,
        bool nordma)
{
    size_t attempt;
    mca_bml_base_endpoint_t* endpoint =
        (mca_bml_base_endpoint_t*)proc->proc_bml;
    size_t num_eager = mca_bml_base_btl_array_get_size(&endpoint->btl_eager);

    for(attempt = 0; attempt < num_eager; attempt++) {
        mca_bml_base_btl_t* bml_btl =
            mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
        int rc = mca_pml_csum_recv_request_ack_send_btl(proc, bml_btl,
                hdr_src_req, hdr_dst_req, hdr_send_offset, nordma);
        if(OMPI_SUCCESS == rc)
            return OMPI_SUCCESS;
    }

    MCA_PML_CSUM_ADD_ACK_TO_PENDING(proc, hdr_src_req, hdr_dst_req,
                                    hdr_send_offset);

    return OMPI_ERR_OUT_OF_RESOURCE;
}
int mca_pml_csum_recv_request_get_frag(mca_pml_csum_rdma_frag_t* frag);
/* This function tries to continue recvreq that stuck due to resource
* unavailability. Recvreq is added to recv_pending list if scheduling of put
* operation cannot be accomplished for some reason. */
void mca_pml_csum_recv_request_process_pending(void);
END_C_DECLS
#endif

1354
ompi/mca/pml/csum/pml_csum_sendreq.c Обычный файл

Разница между файлами не показана из-за своего большого размера Загрузить разницу

515
ompi/mca/pml/csum/pml_csum_sendreq.h Обычный файл
Просмотреть файл

@ -0,0 +1,515 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2009 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OMPI_PML_CSUM_SEND_REQUEST_H
#define OMPI_PML_CSUM_SEND_REQUEST_H
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/pml/base/pml_base_sendreq.h"
#include "ompi/mca/mpool/base/base.h"
#include "pml_csum_comm.h"
#include "pml_csum_hdr.h"
#include "pml_csum_rdma.h"
#include "pml_csum_rdmafrag.h"
#include "ompi/datatype/convertor.h"
#include "ompi/mca/bml/bml.h"
BEGIN_C_DECLS
/**
 * Why a send request is sitting on the send_pending list:
 * START = the initial fragment could not be sent; SCHEDULE = later
 * fragments could not be scheduled; NONE = not pending.
 */
typedef enum {
    MCA_PML_CSUM_SEND_PENDING_NONE,
    MCA_PML_CSUM_SEND_PENDING_SCHEDULE,
    MCA_PML_CSUM_SEND_PENDING_START
} mca_pml_csum_send_pending_t;

/**
 * State of an in-flight send: base request plus scheduling, pipelining
 * and RDMA bookkeeping.
 */
struct mca_pml_csum_send_request_t {
    mca_pml_base_send_request_t req_send;  /**< base send request */
    mca_bml_base_endpoint_t* req_endpoint; /**< cached BML endpoint of the peer */
    ompi_ptr_t req_recv;                   /**< peer's recv request handle (from the ACK) */
    int32_t req_state;
    int32_t req_lock;                      /**< scheduling lock / reentrancy counter */
    bool req_throttle_sends;
    size_t req_pipeline_depth;
    size_t req_bytes_delivered;            /**< bytes confirmed delivered so far */
    uint32_t req_rdma_cnt;                 /**< valid entries in req_rdma[] */
    mca_pml_csum_send_pending_t req_pending; /**< reason the request is on the pending list */
    opal_mutex_t req_send_range_lock;
    opal_list_t req_send_ranges;
    mca_pml_csum_com_btl_t req_rdma[1];    /**< per-BTL RDMA state (trailing array) */
};
typedef struct mca_pml_csum_send_request_t mca_pml_csum_send_request_t;

OBJ_CLASS_DECLARATION(mca_pml_csum_send_request_t);

/**
 * A contiguous range of the message scheduled for transmission, with
 * the set of BTLs it may be striped across.
 */
struct mca_pml_csum_send_range_t {
    ompi_free_list_item_t base;           /**< free-list linkage */
    uint64_t range_send_offset;           /**< start offset of the range */
    uint64_t range_send_length;           /**< bytes remaining in the range */
    int range_btl_idx;                    /**< next BTL index to use */
    int range_btl_cnt;                    /**< number of BTLs in range_btls[] */
    mca_pml_csum_com_btl_t range_btls[1]; /**< per-BTL weights (trailing array) */
};
typedef struct mca_pml_csum_send_range_t mca_pml_csum_send_range_t;
OBJ_CLASS_DECLARATION(mca_pml_csum_send_range_t);
/* Atomically bump the request's scheduling counter; the caller that
 * takes it from 0 -> 1 owns the scheduling loop. */
static inline bool lock_send_request(mca_pml_csum_send_request_t *sendreq)
{
    int32_t new_count = OPAL_THREAD_ADD32(&sendreq->req_lock, 1);
    return (1 == new_count);
}
/* Drop one reference on the scheduling counter; returns true when this
 * caller released the last one (counter back to 0). */
static inline bool unlock_send_request(mca_pml_csum_send_request_t *sendreq)
{
    int32_t new_count = OPAL_THREAD_ADD32(&sendreq->req_lock, -1);
    return (0 == new_count);
}
/* Record why the request is stalled and queue it on the PML's pending
 * list - at the tail when 'append' is set, otherwise at the head. */
static inline void
add_request_to_send_pending(mca_pml_csum_send_request_t* sendreq,
                            const mca_pml_csum_send_pending_t type,
                            const bool append)
{
    opal_list_item_t *list_item = (opal_list_item_t*)sendreq;

    OPAL_THREAD_LOCK(&mca_pml_csum.lock);
    sendreq->req_pending = type;
    if(append) {
        opal_list_append(&mca_pml_csum.send_pending, list_item);
    } else {
        opal_list_prepend(&mca_pml_csum.send_pending, list_item);
    }
    OPAL_THREAD_UNLOCK(&mca_pml_csum.lock);
}
/* Pop the next stalled send request off the pending list (NULL when
 * empty); the reason it was pending is handed back through *type and
 * cleared on the request. */
static inline mca_pml_csum_send_request_t*
get_request_from_send_pending(mca_pml_csum_send_pending_t *type)
{
    mca_pml_csum_send_request_t *req;

    OPAL_THREAD_LOCK(&mca_pml_csum.lock);
    req = (mca_pml_csum_send_request_t*)
        opal_list_remove_first(&mca_pml_csum.send_pending);
    if(NULL != req) {
        *type = req->req_pending;
        req->req_pending = MCA_PML_CSUM_SEND_PENDING_NONE;
    }
    OPAL_THREAD_UNLOCK(&mca_pml_csum.lock);

    return req;
}
/**
 * Allocate a send request from the global free list, looking up and
 * caching the destination proc.  rc is OMPI_ERR_OUT_OF_RESOURCE when the
 * peer cannot be resolved, otherwise the free-list result.
 */
#define MCA_PML_CSUM_SEND_REQUEST_ALLOC( comm,                                \
                                         dst,                                 \
                                         sendreq,                             \
                                         rc)                                  \
    {                                                                         \
        ompi_proc_t *proc = ompi_comm_peer_lookup( comm, dst );               \
        ompi_free_list_item_t* item;                                          \
                                                                              \
        rc = OMPI_ERR_OUT_OF_RESOURCE;                                        \
        if( OPAL_LIKELY(NULL != proc) ) {                                     \
            rc = OMPI_SUCCESS;                                                \
            OMPI_FREE_LIST_WAIT(&mca_pml_base_send_requests, item, rc);       \
            sendreq = (mca_pml_csum_send_request_t*)item;                     \
            sendreq->req_send.req_base.req_proc = proc;                       \
        }                                                                     \
    }

/**
 * Initialize a freshly-allocated send request with the caller's
 * parameters; delegates to MCA_PML_CSUM_BASE_SEND_REQUEST_INIT and
 * clears the cached peer receive-request handle.
 */
#define MCA_PML_CSUM_SEND_REQUEST_INIT( sendreq,                              \
                                        buf,                                  \
                                        count,                                \
                                        datatype,                             \
                                        dst,                                  \
                                        tag,                                  \
                                        comm,                                 \
                                        sendmode,                             \
                                        persistent)                           \
    {                                                                         \
        MCA_PML_CSUM_BASE_SEND_REQUEST_INIT(&sendreq->req_send,               \
                                            buf,                              \
                                            count,                            \
                                            datatype,                         \
                                            dst,                              \
                                            tag,                              \
                                            comm,                             \
                                            sendmode,                         \
                                            persistent);                      \
        (sendreq)->req_recv.pval = NULL;                                      \
    }
/**
 * Initialize the base send request and prepare the send convertor,
 * enabling convertor-side checksumming only when the csum feature is on
 * and some BTL to the peer requires it.
 *
 * Fix: the endpoint lookup previously went through the caller's local
 * variable 'sendreq' instead of the macro's own 'request' parameter, so
 * the macro only compiled when invoked from a scope that happened to
 * define 'sendreq'.  It now uses (request)->req_base.req_proc, which is
 * the same proc (req_proc is set at request allocation time).
 */
#define MCA_PML_CSUM_BASE_SEND_REQUEST_INIT( request,                             \
                                             addr,                                \
                                             count,                               \
                                             datatype,                            \
                                             peer,                                \
                                             tag,                                 \
                                             comm,                                \
                                             mode,                                \
                                             persistent)                          \
    {                                                                             \
        mca_bml_base_endpoint_t* endpoint =                                       \
            (request)->req_base.req_proc->proc_bml;                               \
        bool do_csum = mca_pml_csum.enable_csum &&                                \
            (endpoint->btl_flags_or & MCA_BTL_FLAGS_NEED_CSUM);                   \
        /* increment reference counts */                                          \
        OBJ_RETAIN(comm);                                                         \
                                                                                  \
        OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, persistent);             \
        (request)->req_base.req_ompi.req_mpi_object.comm = comm;                  \
        (request)->req_addr = addr;                                               \
        (request)->req_send_mode = mode;                                          \
        (request)->req_base.req_addr = addr;                                      \
        (request)->req_base.req_count = count;                                    \
        (request)->req_base.req_datatype = datatype;                              \
        (request)->req_base.req_peer = (int32_t)peer;                             \
        (request)->req_base.req_tag = (int32_t)tag;                               \
        (request)->req_base.req_comm = comm;                                      \
        /* (request)->req_base.req_proc is set on request allocation */           \
        (request)->req_base.req_pml_complete = OPAL_INT_TO_BOOL(persistent);      \
        (request)->req_base.req_free_called = false;                              \
        (request)->req_base.req_ompi.req_status._cancelled = 0;                   \
        (request)->req_bytes_packed = 0;                                          \
                                                                                  \
        /* initialize datatype convertor for this request */                      \
        if( count > 0 ) {                                                         \
            OBJ_RETAIN(datatype);                                                 \
            /* We will create a convertor specialized for the */                  \
            /* remote architecture and prepared with the datatype. */             \
            ompi_convertor_copy_and_prepare_for_send(                             \
                (request)->req_base.req_proc->proc_convertor,                     \
                (request)->req_base.req_datatype,                                 \
                (request)->req_base.req_count,                                    \
                (request)->req_base.req_addr,                                     \
                (do_csum ? CONVERTOR_WITH_CHECKSUM: 0),                           \
                &(request)->req_base.req_convertor );                             \
            ompi_convertor_get_packed_size( &(request)->req_base.req_convertor,   \
                                            &((request)->req_bytes_packed) );     \
        }                                                                         \
    }
/* Deregister every memory registration this request pinned for RDMA and
 * reset the RDMA descriptor count. */
static inline void mca_pml_csum_free_rdma_resources(mca_pml_csum_send_request_t* sendreq)
{
    size_t idx;

    /* return mpool resources */
    for(idx = 0; idx < sendreq->req_rdma_cnt; idx++) {
        mca_mpool_base_registration_t* reg = sendreq->req_rdma[idx].btl_reg;
        if((NULL != reg) && (NULL != reg->mpool)) {
            reg->mpool->mpool_deregister(reg->mpool, reg);
        }
    }

    sendreq->req_rdma_cnt = 0;
}
/**
 * Start a send request (thin wrapper around the inline starter below).
 */
#define MCA_PML_CSUM_SEND_REQUEST_START(sendreq, rc)       \
    do {                                                   \
        rc = mca_pml_csum_send_request_start(sendreq);     \
    } while (0)

/*
 * Mark a send request as completed at the MPI level: fill in the status
 * (source = self, original tag, full packed size) and signal waiters
 * when with_signal is set.
 */
#define MCA_PML_CSUM_SEND_REQUEST_MPI_COMPLETE(sendreq, with_signal)                  \
do {                                                                                  \
   (sendreq)->req_send.req_base.req_ompi.req_status.MPI_SOURCE =                      \
       (sendreq)->req_send.req_base.req_comm->c_my_rank;                              \
   (sendreq)->req_send.req_base.req_ompi.req_status.MPI_TAG =                         \
        (sendreq)->req_send.req_base.req_tag;                                         \
   (sendreq)->req_send.req_base.req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS;         \
   (sendreq)->req_send.req_base.req_ompi.req_status._count =                          \
        (int)(sendreq)->req_send.req_bytes_packed;                                    \
   ompi_request_complete( &((sendreq)->req_send.req_base.req_ompi), (with_signal) );  \
                                                                                      \
   PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE,                                 \
                            &(sendreq->req_send.req_base), PERUSE_SEND);              \
} while(0)

/*
 * Release resources associated with a request: let the base layer drop
 * its reference counts, then return the request to the global free list.
 */
#define MCA_PML_CSUM_SEND_REQUEST_RETURN(sendreq)                       \
    do {                                                                \
        /*  Let the base handle the reference counts */                 \
        MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send));         \
        OMPI_FREE_LIST_RETURN( &mca_pml_base_send_requests,             \
                               (ompi_free_list_item_t*)sendreq);        \
    } while(0)
/*
 * The PML has completed a send request. Note that this request
 * may have been orphaned by the user or have already completed
 * at the MPI level.
 * This function will never be called directly from the upper level, as it
 * should only be an internal call to the PML.
 *
 * Releases RDMA registrations and (for buffered sends that actually
 * staged a copy) the bsend buffer, completes the request at MPI level
 * if that has not happened yet, and recycles it if the user already
 * freed it.
 */
static inline void
send_request_pml_complete(mca_pml_csum_send_request_t *sendreq)
{
    assert(false == sendreq->req_send.req_base.req_pml_complete);

    if(sendreq->req_send.req_bytes_packed > 0) {
        PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END,
                                 &(sendreq->req_send.req_base), PERUSE_SEND);
    }

    /* return mpool resources */
    mca_pml_csum_free_rdma_resources(sendreq);

    /* buffered send that staged data into a bsend buffer: release it */
    if (sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED &&
        sendreq->req_send.req_addr != sendreq->req_send.req_base.req_addr) {
        mca_pml_base_bsend_request_fini((ompi_request_t*)sendreq);
    }

    OPAL_THREAD_LOCK(&ompi_request_lock);
    if(false == sendreq->req_send.req_base.req_ompi.req_complete) {
        /* Should only be called for long messages (maybe synchronous) */
        MCA_PML_CSUM_SEND_REQUEST_MPI_COMPLETE(sendreq, true);
    }
    sendreq->req_send.req_base.req_pml_complete = true;

    if(sendreq->req_send.req_base.req_free_called) {
        /* user already freed the request - recycle it now */
        MCA_PML_CSUM_SEND_REQUEST_RETURN(sendreq);
    }
    OPAL_THREAD_UNLOCK(&ompi_request_lock);
}
/* returns true if request was completed on PML level */
/* Complete the request if it is finished: no events may still be
 * expected (req_state == 0), every byte must be delivered, and this
 * caller must win the scheduling lock.  Once completed on the PML
 * level the structure may be reused/restarted by the upper layer. */
static inline bool
send_request_pml_complete_check(mca_pml_csum_send_request_t *sendreq)
{
    opal_atomic_rmb();

    if(0 != sendreq->req_state)
        return false;
    if(sendreq->req_bytes_delivered < sendreq->req_send.req_bytes_packed)
        return false;
    /* only the thread that takes the lock from 0 may finalize */
    if(!lock_send_request(sendreq))
        return false;

    send_request_pml_complete(sendreq);
    return true;
}
/**
* Schedule additional fragments
*/
int
mca_pml_csum_send_request_schedule_once(mca_pml_csum_send_request_t*);
/* Run the scheduling loop until no other thread has re-entered (each
 * concurrent caller bumps req_lock) or resources are exhausted; on
 * success, check whether the request can now be completed. */
static inline int
mca_pml_csum_send_request_schedule_exclusive(mca_pml_csum_send_request_t* sendreq)
{
    int rc;

    for(;;) {
        rc = mca_pml_csum_send_request_schedule_once(sendreq);
        if(OMPI_ERR_OUT_OF_RESOURCE == rc)
            break;
        if(unlock_send_request(sendreq))
            break;
    }

    if(OMPI_SUCCESS == rc)
        send_request_pml_complete_check(sendreq);

    return rc;
}
static inline void
mca_pml_csum_send_request_schedule(mca_pml_csum_send_request_t* sendreq)
{
    /*
     * Only allow one thread in this routine for a given request.
     * However, we cannot block callers on a mutex, so simply keep track
     * of the number of times the routine has been called and run through
     * the scheduling logic once for every call.
     */
    if(lock_send_request(sendreq)) {
        mca_pml_csum_send_request_schedule_exclusive(sendreq);
    }
}
/**
* Start the specified request
*/
int mca_pml_csum_send_request_start_buffered(
mca_pml_csum_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl,
size_t size);
int mca_pml_csum_send_request_start_copy(
mca_pml_csum_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl,
size_t size);
int mca_pml_csum_send_request_start_prepare(
mca_pml_csum_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl,
size_t size);
int mca_pml_csum_send_request_start_rdma(
mca_pml_csum_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl,
size_t size);
int mca_pml_csum_send_request_start_rndv(
mca_pml_csum_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl,
size_t size,
int flags);
/**
 * Start a send over a specific BTL, picking the wire protocol:
 *  - message fits in the eager limit: rndv for synchronous sends,
 *    copy for buffered, prepare for "complete" mode, otherwise
 *    prepare/copy depending on MCA_BTL_FLAGS_SEND_INPLACE;
 *  - larger messages: buffered copy, RDMA (when the data is contiguous
 *    and some BTL can register it), or rendezvous.
 */
static inline int
mca_pml_csum_send_request_start_btl( mca_pml_csum_send_request_t* sendreq,
                                     mca_bml_base_btl_t* bml_btl )
{
    size_t size = sendreq->req_send.req_bytes_packed;
    mca_btl_base_module_t* btl = bml_btl->btl;
    /* payload room left in an eager fragment after the PML header */
    size_t eager_limit = btl->btl_eager_limit - sizeof(mca_pml_csum_hdr_t);
    int rc;

    if( OPAL_LIKELY(size <= eager_limit) ) {
        switch(sendreq->req_send.req_send_mode) {
        case MCA_PML_BASE_SEND_SYNCHRONOUS:
            rc = mca_pml_csum_send_request_start_rndv(sendreq, bml_btl, size, 0);
            break;
        case MCA_PML_BASE_SEND_BUFFERED:
            rc = mca_pml_csum_send_request_start_copy(sendreq, bml_btl, size);
            break;
        case MCA_PML_BASE_SEND_COMPLETE:
            rc = mca_pml_csum_send_request_start_prepare(sendreq, bml_btl, size);
            break;
        default:
            if (size != 0 && bml_btl->btl_flags & MCA_BTL_FLAGS_SEND_INPLACE) {
                rc = mca_pml_csum_send_request_start_prepare(sendreq, bml_btl, size);
            } else {
                rc = mca_pml_csum_send_request_start_copy(sendreq, bml_btl, size);
            }
            break;
        }
    } else {
        /* first fragment of a multi-fragment message */
        size = eager_limit;
        if(OPAL_UNLIKELY(btl->btl_rndv_eager_limit < eager_limit))
            size = btl->btl_rndv_eager_limit;
        if(sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED) {
            rc = mca_pml_csum_send_request_start_buffered(sendreq, bml_btl, size);
        } else if
                (ompi_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {
            unsigned char *base;
            ompi_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base );

            /* contiguous data: try to register it for RDMA */
            if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_csum_rdma_btls(
                                                                               sendreq->req_endpoint,
                                                                               base,
                                                                               sendreq->req_send.req_bytes_packed,
                                                                               sendreq->req_rdma))) {
                rc = mca_pml_csum_send_request_start_rdma(sendreq, bml_btl,
                                                          sendreq->req_send.req_bytes_packed);
                if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
                    mca_pml_csum_free_rdma_resources(sendreq);
                }
            } else {
                rc = mca_pml_csum_send_request_start_rndv(sendreq, bml_btl, size,
                                                          MCA_PML_CSUM_HDR_FLAGS_CONTIG);
            }
        } else {
            rc = mca_pml_csum_send_request_start_rndv(sendreq, bml_btl, size, 0);
        }
    }

    return rc;
}
/**
 * Start a send: reset per-start bookkeeping, assign the next sequence
 * number for the destination, then try each eager BTL in turn.  If every
 * BTL is out of resources the request is queued on the pending list and
 * OMPI_SUCCESS is still returned (the send will be retried later).
 */
static inline int
mca_pml_csum_send_request_start( mca_pml_csum_send_request_t* sendreq )
{
    mca_pml_csum_comm_t* comm = sendreq->req_send.req_base.req_comm->c_pml_comm;
    mca_bml_base_endpoint_t* endpoint = (mca_bml_base_endpoint_t*)
                                        sendreq->req_send.req_base.req_proc->proc_bml;
    size_t i;

    if( OPAL_UNLIKELY(endpoint == NULL) ) {
        return OMPI_ERR_UNREACH;
    }

    sendreq->req_endpoint = endpoint;
    sendreq->req_state = 0;
    sendreq->req_lock = 0;
    sendreq->req_pipeline_depth = 0;
    sendreq->req_bytes_delivered = 0;
    sendreq->req_pending = MCA_PML_CSUM_SEND_PENDING_NONE;
    /* atomically claim the next sequence number towards this peer */
    sendreq->req_send.req_base.req_sequence = OPAL_THREAD_ADD32(
        &comm->procs[sendreq->req_send.req_base.req_peer].send_sequence,1);

    MCA_PML_BASE_SEND_START( &sendreq->req_send.req_base );

    for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
        mca_bml_base_btl_t* bml_btl;
        int rc;

        /* select a btl */
        bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
        rc = mca_pml_csum_send_request_start_btl(sendreq, bml_btl);
        if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc) )
            return rc;
    }
    /* every eager BTL was out of resources - retry from the pending list */
    add_request_to_send_pending(sendreq, MCA_PML_CSUM_SEND_PENDING_START, true);

    return OMPI_SUCCESS;
}
/**
* Initiate a put scheduled by the receiver.
*/
void mca_pml_csum_send_request_put( mca_pml_csum_send_request_t* sendreq,
mca_btl_base_module_t* btl,
mca_pml_csum_rdma_hdr_t* hdr );
int mca_pml_csum_send_request_put_frag(mca_pml_csum_rdma_frag_t* frag);
/* This function tries to continue sendreq that was stuck because of resource
* unavailability. A sendreq may be added to send_pending list if there is no
* resource to send initial packet or there is not resource to schedule data
* for sending. The reason the sendreq was added to the list is stored inside
* sendreq struct and appropriate operation is retried when resource became
* available. bml_btl passed to the function doesn't represents sendreq
* destination, it represents BTL on which resource was freed, so only this BTL
* should be considered for sending packets */
void mca_pml_csum_send_request_process_pending(mca_bml_base_btl_t *bml_btl);
void mca_pml_csum_send_request_copy_in_out(mca_pml_csum_send_request_t *sendreq,
uint64_t send_offset, uint64_t send_length);
END_C_DECLS
#endif /* OMPI_PML_CSUM_SEND_REQUEST_H */

142
ompi/mca/pml/csum/pml_csum_start.c Обычный файл
Просмотреть файл

@ -0,0 +1,142 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "pml_csum.h"
#include "pml_csum_recvreq.h"
#include "pml_csum_sendreq.h"
#include "ompi/memchecker.h"
/**
 * Start (or restart) a set of persistent requests — the PML backend for
 * MPI_Start/MPI_Startall in the csum component.
 *
 * @param count     Number of entries in @a requests.
 * @param requests  Array of persistent request handles; entries may be NULL
 *                  or non-PML requests, which are skipped.  An entry may be
 *                  replaced in place with a freshly allocated request when
 *                  the old one is still busy inside the PML.
 * @return OMPI_SUCCESS, or the first error encountered
 *         (OMPI_ERR_REQUEST for an unstartable state/type).
 */
int mca_pml_csum_start(size_t count, ompi_request_t** requests)
{
    int rc;
    size_t i;
    bool reuse_old_request = true;

    for(i=0; i<count; i++) {
        mca_pml_base_request_t *pml_request = (mca_pml_base_request_t*)requests[i];
        if(NULL == pml_request) {
            continue;
        }
        /* Only PML-owned requests can be started here; leave others alone. */
        if (OMPI_REQUEST_PML != requests[i]->req_type) {
            continue;
        }

        /* If the persistent request is currently active - obtain the
         * request lock and verify the status is incomplete. if the
         * pml layer has not completed the request - mark the request
         * as free called - so that it will be freed when the request
         * completes - and create a new request.
         */
        reuse_old_request = true;
        switch(pml_request->req_ompi.req_state) {
            case OMPI_REQUEST_INACTIVE:
                if(pml_request->req_pml_complete == true)
                    break;
                /* otherwise fall through */
            case OMPI_REQUEST_ACTIVE: {
                /* The request is still in flight inside the PML: defer its
                 * destruction and allocate a replacement under the lock so
                 * that completion cannot race with this check. */
                ompi_request_t *request;
                OPAL_THREAD_LOCK(&ompi_request_lock);
                if (pml_request->req_pml_complete == false) {
                    /* free request after it completes */
                    pml_request->req_free_called = true;
                } else {
                    /* can reuse the existing request */
                    OPAL_THREAD_UNLOCK(&ompi_request_lock);
                    break;
                }

                reuse_old_request = false;
                /* allocate a new request, mirroring the old one's
                 * parameters (addr/count/type/peer/tag/comm) */
                switch(pml_request->req_type) {
                    case MCA_PML_REQUEST_SEND: {
                        mca_pml_base_send_mode_t sendmode =
                            ((mca_pml_base_send_request_t*)pml_request)->req_send_mode;
                        rc = mca_pml_csum_isend_init(
                            pml_request->req_addr,
                            pml_request->req_count,
                            pml_request->req_datatype,
                            pml_request->req_peer,
                            pml_request->req_tag,
                            sendmode,
                            pml_request->req_comm,
                            &request);
                        break;
                    }
                    case MCA_PML_REQUEST_RECV:
                        rc = mca_pml_csum_irecv_init(
                            pml_request->req_addr,
                            pml_request->req_count,
                            pml_request->req_datatype,
                            pml_request->req_peer,
                            pml_request->req_tag,
                            pml_request->req_comm,
                            &request);
                        break;
                    default:
                        rc = OMPI_ERR_REQUEST;
                        break;
                }
                OPAL_THREAD_UNLOCK(&ompi_request_lock);
                if(OMPI_SUCCESS != rc)
                    return rc;
                /* hand the caller the replacement request */
                pml_request = (mca_pml_base_request_t*)request;
                requests[i] = request;
                break;
            }
            default:
                return OMPI_ERR_REQUEST;
        }

        /* start the request */
        switch(pml_request->req_type) {
            case MCA_PML_REQUEST_SEND:
            {
                mca_pml_csum_send_request_t* sendreq = (mca_pml_csum_send_request_t*)pml_request;
                if( reuse_old_request && (sendreq->req_send.req_bytes_packed != 0) ) {
                    size_t offset = 0;
                    /**
                     * Reset the convertor in case we're dealing with the original
                     * request, which when completed do not reset the convertor.
                     */
                    ompi_convertor_set_position( &sendreq->req_send.req_base.req_convertor,
                                                 &offset );
                }
                MCA_PML_CSUM_SEND_REQUEST_START(sendreq, rc);
                if(rc != OMPI_SUCCESS)
                    return rc;
                break;
            }
            case MCA_PML_REQUEST_RECV:
            {
                mca_pml_csum_recv_request_t* recvreq = (mca_pml_csum_recv_request_t*)pml_request;
                MCA_PML_CSUM_RECV_REQUEST_START(recvreq);
                break;
            }
            default:
                return OMPI_ERR_REQUEST;
        }
    }
    return OMPI_SUCCESS;
}

1
ompi/mca/pml/csum/post_configure.sh Обычный файл
Просмотреть файл

@ -0,0 +1 @@
DIRECT_CALL_HEADER="ompi/mca/pml/csum/pml_csum.h"

Просмотреть файл

@ -9,6 +9,9 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2009 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -102,6 +105,19 @@ opal_csum(const void * source, size_t csumlen)
return opal_csum_partial(source, csumlen, &lastPartialLong, &lastPartialLength);
}
/**
 * Compute a simple 16-bit additive checksum: the sum of all bytes in
 * the buffer, truncated modulo 2^16 by the uint16_t accumulator.
 *
 * @param source   Buffer to checksum (read-only; may be NULL only if
 *                 csumlen is 0).
 * @param csumlen  Number of bytes to sum.
 * @return         Byte sum of the buffer, mod 65536.
 */
static inline uint16_t
opal_csum16 (const void * source, size_t csumlen)
{
    /* Keep const-correctness: the parameter is const, so the walking
     * pointer must not cast the qualifier away (CERT EXP05-C). */
    const unsigned char *src = (const unsigned char *) source;
    uint16_t csum = 0;
    size_t i;

    for(i = 0; i < csumlen; i++) {
        csum += *src++;
    }
    return csum;
}
OPAL_DECLSPEC unsigned int
opal_uicsum_partial (
const void * source,