New btl that extends sm btl to support GPU transfers within a node.
Uses new CUDA IPC support. Also, a few minor changes in PML to take advantage of it. This code has no effect unless user asks for it explicitly via configure arguments. Otherwise, it is either #ifdef'ed out or not compiled. This commit was SVN r26039.
Этот коммит содержится в:
родитель
aff49e98b4
Коммит
b0a84b0a7d
245
contrib/check-btl-sm-diffs.pl
Исполняемый файл
245
contrib/check-btl-sm-diffs.pl
Исполняемый файл
@ -0,0 +1,245 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This script is run to see the differences between some other BTL and
|
||||
# the sm BTL. By default, it looks at the sm BTL but it can also be
|
||||
# used with other BTLs. Prior to running the diff, it does some
|
||||
# preprocessing on the files. First, it removes all the copyright
|
||||
# headers as differences in them are of no concern. Secondly, it
|
||||
# converts all the BTL specific strings in the BTL to be compared to
|
||||
# "sm". For example, with the smcuda, all "smcuda" strings are converted
|
||||
# to "sm" and all "SMCUDA" strings are converted to "SM". In this way,
|
||||
# we avoid any spurious differences just related to the difference in
|
||||
# the names of the functions and variables.
|
||||
#
|
||||
# Lastly, in the case of smcuda only, it can strip out all code within
|
||||
# the BTL that is contained within specific #ifdef strings. See the
|
||||
# code and comments below to see how it works.
|
||||
#
|
||||
# This script must be run from this directory as it makes assumptions
|
||||
# about where the BTL directories are located.  Here are some
|
||||
# examples.
|
||||
#
|
||||
# Run using all defaults.
|
||||
# > check-btl-sm-diffs.pl
|
||||
#
|
||||
# Do not remove the SMCUDA specific code
|
||||
# > check-btl-sm-diffs.pl -s
|
||||
#
|
||||
# Do not remove the SMCUDA specific code and save results in DIFFS
|
||||
# > check-btl-sm-diffs.pl -s -o DIFFS
|
||||
#
|
||||
#
|
||||
use strict;
|
||||
|
||||
use File::Copy;
|
||||
use File::Path;
|
||||
use Getopt::Long;
|
||||
|
||||
my $diffdir = "diffdir";
|
||||
my $btlsdir = "../ompi/mca/btl";
|
||||
my $cmd;
|
||||
my $cmd_output;
|
||||
my $contents;
|
||||
my $smcudafile;
|
||||
my @smcudafiles;
|
||||
my $smfile;
|
||||
my @smfiles;
|
||||
my $alloutput;
|
||||
|
||||
|
||||
# Command line parsing
|
||||
my $verbose_arg = 0;
|
||||
my $show_arg = 1;
|
||||
my $showall_arg = 0;
|
||||
my $help_arg = 0;
|
||||
my $btl_arg = "smcuda";
|
||||
my $output_arg = "";
|
||||
|
||||
# Parse the command line.  With "bundling" enabled, single-letter
# switches may be combined (and are case sensitive, so -s and -S are
# distinct).  Each option spec is paired with the variable it fills in.
Getopt::Long::Configure("bundling");

my @option_spec = (
    "verbose|v!" => \$verbose_arg,
    "showall|s!" => \$showall_arg,
    "show|S!"    => \$show_arg,
    "btl|p=s"    => \$btl_arg,
    "output|o=s" => \$output_arg,
    "help|h!"    => \$help_arg,
);

# $ok is false when an unknown or malformed option was seen.
my $ok = Getopt::Long::GetOptions(@option_spec);
|
||||
|
||||
# Print the usage text when help was requested or when option parsing
# failed.  The exit status is 0 for an explicit --help and 1 when the
# command line could not be parsed, so callers can detect bad usage.
if (!$ok || $help_arg) {
    print <<"END_USAGE";

Usage: $0 [--show|-S] [--showall|-s] [--btl|-p=BTL] [--output|-o=OUTPUTFILE]
       [--verbose|-v] [--help|-h]

Runs a diff between the files in the sm and the smcuda directory
and prints the output to stdout.  Prior to checking the differences,
the script removes all copyright header code.  It also first removes
all CUDA specific code in the smcuda files.  Specifically, the script
removes all code that is within the following preprocessor guards.
    #if OMPI_CUDA_SUPPORT
    ...
    #endif /* OMPI_CUDA_SUPPORT */
To view the smcuda specific code in the diff, run with the -s switch.

-s   Show all the differences between the files.
-S   Show all the differences between the files that are not
     within "#if OMPI_CUDA_SUPPORT" statements. (default: -S)
-p   BTL - which BTL to compare to sm (default: smcuda)
-o   File name where to write the output to (instead of stdout).
-v   Verbose - show more details of script activities.
-h   This help

END_USAGE
    exit($ok ? 0 : 1);
}
|
||||
|
||||
my $btl = $btl_arg;
|
||||
my $BTL = $btl;
|
||||
$BTL =~ tr/a-z/A-Z/;
|
||||
|
||||
# Change into BTL directory that is being compared to sm.
|
||||
# In the default case, we just end up where we started in
|
||||
# the smcuda directory.
|
||||
# Every later step uses paths relative to the BTL directory, so stop
# right away if we cannot get there instead of working (and later
# deleting files) in the wrong place.
chdir "$btlsdir/$btl"
    or die "Cannot change into $btlsdir/$btl: $!\n";

print "\nStarting script to check differences between $btl and sm...\n";

# Create the scratch directory that holds the preprocessed copies.
if (! -d $diffdir) {
    mkdir($diffdir, 0777)
        or die "Cannot create temp directory $diffdir: $!\n";
}
|
||||
|
||||
# Copy smcuda files into temp directory.
|
||||
# Collect the C sources and headers of the BTL being compared.  The
# character class is [hc]; a '|' inside brackets is a literal
# character for glob, not an alternation.
@smcudafiles = <*.[hc]>;
foreach my $file (@smcudafiles) {
    copy($file, $diffdir)
        or die "Cannot copy $file to $diffdir: $!\n";
}
if ($verbose_arg) {
    print "Copied all $btl files to temp directory\n";
}
|
||||
|
||||
chdir $diffdir
    or die "Cannot change into $diffdir: $!\n";

# Using a crude preprocessor, strip out all SMCUDA specific code from
# the copied files.  If the -s switch is provided, the SMCUDA specific
# code is left in place.
foreach my $file (@smcudafiles) {
    $contents = Read($file);
    die("Couldn't Read $file!\n") if (!defined $contents);

    if (!$showall_arg) {
        # The first substitution removes everything from
        #     #if OMPI_CUDA_SUPPORT
        # up to and including the closest of
        #     #else /* OMPI_CUDA_SUPPORT */
        #     #endif /* OMPI_CUDA_SUPPORT */
        # For an #if/#else/#endif group this leaves the #else body and
        # its trailing #endif behind; the second substitution removes
        # that leftover #endif line.
        # Some notes about the regular expressions:
        # 1. The .*? makes the terminator match the closest #if.
        # 2. The OMPI_CUDA_SUPPORT comment on the #else/#endif is
        #    required so that unrelated #endif lines are not matched.
        # 3. The trailing \n is consumed to avoid leaving extra newlines.
        $contents =~ s/#if OMPI_CUDA_SUPPORT(.*?)((#else \/\* OMPI_CUDA_SUPPORT \*\/\n)|(#endif \/\* OMPI_CUDA_SUPPORT \*\/\n))//gis;
        $contents =~ s/#endif \/\* OMPI_CUDA_SUPPORT \*\/\n//gis;
    }

    # Strip off the copyright header also.
    $contents =~ s/\/\*(.*?)\$HEADER\$\n \*\/\n//is;

    # Now replace the BTL name with sm (and its upper-case form with
    # SM) so the diff does not report differences that are only due to
    # naming.  \Q...\E keeps a user supplied BTL name from being
    # interpreted as a regular expression.
    $contents =~ s/\Q$btl\E/sm/g;
    $contents =~ s/\Q$BTL\E/SM/g;

    Write($file, $contents);
}
if ($verbose_arg) {
    print "All $btl specific code and copyrights has been removed from $btl files\n";
    print "All $btl/$BTL strings converted to sm/SM strings in $btl files\n";
}
|
||||
|
||||
# Copy sm files into temp directory
|
||||
# Move over to the reference sm BTL and copy its sources next to the
# already preprocessed files of the BTL being compared.
chdir "../../sm"
    or die "Cannot change into the sm BTL directory: $!\n";

# [hc] rather than [h|c]: inside brackets '|' is a literal character.
@smfiles = <*.[hc]>;
foreach my $file (@smfiles) {
    copy($file, "../$btl/$diffdir")
        or die "Cannot copy $file to ../$btl/$diffdir: $!\n";
}
if ($verbose_arg) {
    print "Copied all sm files to temp directory\n";
}
|
||||
|
||||
chdir "../$btl/$diffdir"
    or die "Cannot change into ../$btl/$diffdir: $!\n";

# Preprocess the copied sm files so that they are comparable with the
# stripped files of the other BTL.
foreach my $file (@smfiles) {
    $contents = Read($file);
    die("Couldn't Read $file!\n") if (!defined $contents);

    # Strip off the copyright header.
    $contents =~ s/\/\*(.*?)\$HEADER\$\n \*\/\n//is;

    # Strip away KNEM as that is not in smcuda.  The first pattern
    # removes from "#if OMPI_BTL_SM_HAVE_KNEM" through the closest
    # #else or #endif line; the second removes an #endif that is left
    # over from an #if/#else/#endif group.
    $contents =~ s/#if OMPI_BTL_SM_HAVE_KNEM(.*?)((#else\n)|(#endif\n)|(#endif \/\* OMPI_BTL_SM_HAVE_KNEM \*\/\n))//gis;
    $contents =~ s/#endif \/\* OMPI_BTL_SM_HAVE_KNEM \*\/\n//gis;

    Write($file, $contents);
}
if ($verbose_arg) {
    print "Removed copyright strings from all sm files\n";
}
|
||||
|
||||
|
||||
# Now do a diff on the files.
|
||||
if ($verbose_arg) {
    print "Now running diffs on all the files...\n\n";
}

# Compare every sm file with its counterpart from the other BTL.  The
# counterpart name is obtained by replacing the first "sm" in the file
# name with the BTL name.
foreach my $sm_name (@smfiles) {
    my $btl_name = $sm_name;
    $btl_name =~ s/sm/$btl/;

    # List-form pipe open: diff is executed directly, so file names
    # are never interpreted by a shell.
    open(my $diff_fh, '-|', 'diff', '-c', $sm_name, $btl_name)
        or die "Cannot run diff on $sm_name and $btl_name: $!\n";
    my $diff_text = do { local $/; <$diff_fh> };
    $diff_text = "" if (!defined $diff_text);
    # diff exits non-zero when the files differ, which makes close()
    # return false; that is an expected outcome, not an error.
    close($diff_fh);

    if ($output_arg eq "") {
        print "Files Compared: $sm_name and $btl_name\n";
        print $diff_text;
    }
    elsif ($diff_text ne "No differences encountered\n") {
        # Some diff implementations print this message for identical
        # files; keep it out of the collected output.
        $alloutput .= $diff_text;
    }
}
|
||||
|
||||
# Leave the temp directory so that it can be removed below.
chdir ".."
    or die "Cannot change back to the parent directory: $!\n";

if ($output_arg ne "") {
    # NOTE: a relative output name is resolved against the BTL
    # directory we are in now, not the directory the script was
    # started from.
    # Only a stale plain file is removed.  A recursive delete here
    # would wipe out a whole directory if -o named one by mistake.
    if (-f $output_arg) {
        unlink($output_arg)
            or die "Cannot remove old output file $output_arg: $!\n";
    }
    Write($output_arg, defined $alloutput ? $alloutput : "");
}

# Remove the scratch directory and everything in it.
rmtree("$diffdir");
|
||||
|
||||
# Read a whole file into a string.
#
# Parameters:
#   $file - name of the file to read
# Returns:
#   the complete contents of the file (the empty string for an empty
#   file), or undef when the file cannot be opened.  A warning is
#   issued when the file cannot be opened or closed.
sub Read {
    my ($file) = @_;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as an open mode or a command.
    my $input;
    if (!open($input, '<', $file)) {
        warn "Can't open $file: $!";
        return;
    }

    # Undefine the record separator (for this sub only) so that a
    # single read returns the entire file.
    local $/;
    my $contents = <$input>;

    close($input) or warn "Can't close $file: $!";
    return $contents;
}
|
||||
|
||||
# Write a string to a file, replacing any previous contents.
#
# Parameters:
#   $filename - name of the file to create or overwrite
#   $body     - text to write; undef is written as an empty file
# Dies when the file cannot be opened, written or closed.
sub Write {
    my ($filename, $body) = @_;

    # Three-argument open: the name is used literally, so surrounding
    # whitespace or mode characters in it cannot change what is opened.
    open(my $output, '>', $filename)
        or die("Failed to write to file $filename: $!");

    my $text = defined $body ? $body : "";
    print {$output} $text
        or die("Failed to write to file $filename: $!");

    # Buffered write errors only show up at close time, so check it.
    close($output)
        or die("Failed to write to file $filename: $!");
}
|
@ -13,6 +13,7 @@
|
||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -195,6 +196,10 @@ typedef uint8_t mca_btl_base_tag_t;
|
||||
/* btl can support failover if enabled */
|
||||
#define MCA_BTL_FLAGS_FAILOVER_SUPPORT 0x0200
|
||||
|
||||
#define MCA_BTL_FLAGS_CUDA_PUT 0x0400
|
||||
#define MCA_BTL_FLAGS_CUDA_GET 0x0800
|
||||
#define MCA_BTL_FLAGS_CUDA_RDMA (MCA_BTL_FLAGS_CUDA_GET|MCA_BTL_FLAGS_CUDA_PUT)
|
||||
|
||||
/* Default exclusivity levels */
|
||||
#define MCA_BTL_EXCLUSIVITY_HIGH (64*1024) /* internal loopback */
|
||||
#define MCA_BTL_EXCLUSIVITY_DEFAULT 1024 /* GM/IB/etc. */
|
||||
@ -241,7 +246,16 @@ struct mca_btl_base_segment_t {
|
||||
uint32_t key32[4];
|
||||
uint64_t key64[2];
|
||||
uint8_t key8[16];
|
||||
#if OMPI_CUDA_SUPPORT
|
||||
uint8_t cudakey[128]; /* 64 bytes for CUDA mem handle, 64 bytes for CUDA event handle */
|
||||
#endif /* OMPI_CUDA_SUPPORT */
|
||||
} seg_key;
|
||||
#if OMPI_CUDA_SUPPORT
|
||||
/** Address of the entire memory handle */
|
||||
ompi_ptr_t memh_seg_addr;
|
||||
/** Length in bytes of entire memory handle */
|
||||
uint32_t memh_seg_len;
|
||||
#endif /* OMPI_CUDA_SUPPORT */
|
||||
};
|
||||
typedef struct mca_btl_base_segment_t mca_btl_base_segment_t;
|
||||
|
||||
|
62
ompi/mca/btl/smcuda/Makefile.am
Обычный файл
62
ompi/mca/btl/smcuda/Makefile.am
Обычный файл
@ -0,0 +1,62 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_pkgdata_DATA = help-mpi-btl-smcuda.txt
|
||||
|
||||
libmca_btl_smcuda_la_sources = \
|
||||
btl_smcuda.c \
|
||||
btl_smcuda.h \
|
||||
btl_smcuda_component.c \
|
||||
btl_smcuda_endpoint.h \
|
||||
btl_smcuda_fifo.h \
|
||||
btl_smcuda_frag.c \
|
||||
btl_smcuda_frag.h
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_ompi_btl_smcuda_DSO
|
||||
component_noinst =
|
||||
component_install = mca_btl_smcuda.la
|
||||
else
|
||||
component_noinst = libmca_btl_smcuda.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
# See ompi/mca/common/sm/Makefile.am for an explanation of
|
||||
# libmca_common_sm.la.
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_btl_smcuda_la_SOURCES = $(libmca_btl_smcuda_la_sources)
|
||||
mca_btl_smcuda_la_LDFLAGS = -module -avoid-version
|
||||
mca_btl_smcuda_la_LIBADD = \
|
||||
$(top_ompi_builddir)/ompi/mca/common/sm/libmca_common_sm.la
|
||||
mca_btl_smcuda_la_CPPFLAGS = $(btl_smcuda_CPPFLAGS)
|
||||
if MCA_ompi_cuda_support
|
||||
mca_btl_smcuda_la_LIBADD += \
|
||||
$(top_ompi_builddir)/ompi/mca/common/cuda/libmca_common_cuda.la
|
||||
endif
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_btl_smcuda_la_SOURCES = $(libmca_btl_smcuda_la_sources)
|
||||
libmca_btl_smcuda_la_LDFLAGS = -module -avoid-version
|
||||
libmca_btl_smcuda_la_CPPFLAGS = $(btl_smcuda_CPPFLAGS)
|
1121
ompi/mca/btl/smcuda/btl_smcuda.c
Обычный файл
1121
ompi/mca/btl/smcuda/btl_smcuda.c
Обычный файл
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
513
ompi/mca/btl/smcuda/btl_smcuda.h
Обычный файл
513
ompi/mca/btl/smcuda/btl_smcuda.h
Обычный файл
@ -0,0 +1,513 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
|
||||
* Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*/
|
||||
#ifndef MCA_BTL_SMCUDA_H
|
||||
#define MCA_BTL_SMCUDA_H
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#ifdef HAVE_STDINT_H
|
||||
#include <stdint.h>
|
||||
#endif /* HAVE_STDINT_H */
|
||||
#ifdef HAVE_SCHED_H
|
||||
#include <sched.h>
|
||||
#endif /* HAVE_SCHED_H */
|
||||
|
||||
#include "opal/util/bit_ops.h"
|
||||
#include "opal/class/opal_free_list.h"
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
#include "ompi/mca/common/sm/common_sm.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Shared Memory FIFOs
|
||||
*
|
||||
* The FIFO is implemented as a circular queue with head and tail pointers
|
||||
* (integer indices). For efficient wraparound indexing, the size of the
|
||||
* queue is constrained to be a power of two and we "&" indices with a "mask".
|
||||
*
|
||||
* More than one process can write to the FIFO head. Therefore, there is a head
|
||||
* lock. One cannot write until the head slot is empty, indicated by the special
|
||||
* queue entry SM_FIFO_FREE.
|
||||
*
|
||||
* Only the receiver can read the FIFO tail. Therefore, the tail lock is
|
||||
* required only in multithreaded applications. If a tail read returns the
|
||||
* SM_FIFO_FREE value, that means the FIFO is empty. Once a non-FREE value
|
||||
* has been read, the queue slot is *not* automatically reset to SM_FIFO_FREE.
|
||||
* Rather, read tail slots are reset "lazily" (see "lazy_free" and "num_to_clear")
|
||||
* to reduce the number of memory barriers and improve performance.
|
||||
*
|
||||
* Since the FIFO lives in shared memory that is mapped differently into
|
||||
* each address space, the "queue" pointer is relative (each process must
|
||||
* add its own offset) and the queue_recv pointer is meaningful only in the
|
||||
* receiver's address space.
|
||||
*
|
||||
* Since multiple processes access different parts of the FIFO structure in
|
||||
* different ways, we introduce padding to keep different parts on different
|
||||
* cachelines.
|
||||
*/
|
||||
|
||||
#define SM_FIFO_FREE (void *) (-2)
|
||||
/* We can't use opal_cache_line_size here because we need a
|
||||
compile-time constant for padding the struct. We can't really have
|
||||
a compile-time constant that is portable, either (e.g., compile on
|
||||
one machine and run on another). So just use a big enough cache
|
||||
line that should hopefully be good in most places. */
|
||||
#define SM_CACHE_LINE_PAD 128
|
||||
|
||||
struct sm_fifo_t {
|
||||
/* This queue pointer is used only by the heads. */
|
||||
volatile void **queue;
|
||||
char pad0[SM_CACHE_LINE_PAD - sizeof(void **)];
|
||||
/* This lock is used by the heads. */
|
||||
opal_atomic_lock_t head_lock;
|
||||
char pad1[SM_CACHE_LINE_PAD - sizeof(opal_atomic_lock_t)];
|
||||
/* This index is used by the head holding the head lock. */
|
||||
volatile int head;
|
||||
char pad2[SM_CACHE_LINE_PAD - sizeof(int)];
|
||||
/* This mask is used "read only" by all processes. */
|
||||
unsigned int mask;
|
||||
char pad3[SM_CACHE_LINE_PAD - sizeof(int)];
|
||||
/* The following are used only by the tail. */
|
||||
volatile void **queue_recv;
|
||||
opal_atomic_lock_t tail_lock;
|
||||
volatile int tail;
|
||||
int num_to_clear;
|
||||
int lazy_free;
|
||||
char pad4[SM_CACHE_LINE_PAD - sizeof(void **) -
|
||||
sizeof(opal_atomic_lock_t) -
|
||||
sizeof(int) * 3];
|
||||
};
|
||||
typedef struct sm_fifo_t sm_fifo_t;
|
||||
|
||||
/*
|
||||
* Shared Memory resource management
|
||||
*/
|
||||
|
||||
#if OMPI_ENABLE_PROGRESS_THREADS == 1
|
||||
#define DATA (char)0
|
||||
#define DONE (char)1
|
||||
#endif
|
||||
|
||||
typedef struct mca_btl_smcuda_mem_node_t {
|
||||
mca_mpool_base_module_t* sm_mpool; /**< shared memory pool */
|
||||
} mca_btl_smcuda_mem_node_t;
|
||||
|
||||
/**
|
||||
* Shared Memory (SM) BTL module.
|
||||
*/
|
||||
struct mca_btl_smcuda_component_t {
|
||||
mca_btl_base_component_2_0_0_t super; /**< base BTL component */
|
||||
int sm_free_list_num; /**< initial size of free lists */
|
||||
int sm_free_list_max; /**< maximum size of free lists */
|
||||
int sm_free_list_inc; /**< number of elements to alloc when growing free lists */
|
||||
int32_t sm_max_procs; /**< upper limit on the number of processes using the shared memory pool */
|
||||
int sm_extra_procs; /**< number of extra procs to allow */
|
||||
char* sm_mpool_name; /**< name of shared memory pool module */
|
||||
mca_mpool_base_module_t **sm_mpools; /**< shared memory pools (one for each memory node) */
|
||||
mca_mpool_base_module_t *sm_mpool; /**< mpool on local node */
|
||||
void* sm_mpool_base; /**< base address of shared memory pool */
|
||||
size_t eager_limit; /**< first fragment size */
|
||||
size_t max_frag_size; /**< maximum (second and beyone) fragment size */
|
||||
opal_mutex_t sm_lock;
|
||||
mca_common_sm_module_t *sm_seg; /**< description of shared memory segment */
|
||||
volatile sm_fifo_t **shm_fifo; /**< pointer to fifo 2D array in shared memory */
|
||||
char **shm_bases; /**< pointer to base pointers in shared memory */
|
||||
uint16_t *shm_mem_nodes; /**< pointer to mem noded in shared memory */
|
||||
sm_fifo_t **fifo; /**< cached copy of the pointer to the 2D
|
||||
fifo array. The address in the shared
|
||||
memory segment sm_ctl_header is a relative,
|
||||
but this one, in process private memory, is
|
||||
a real virtual address */
|
||||
uint16_t *mem_nodes; /**< cached copy of mem nodes of each local rank */
|
||||
size_t fifo_size; /**< number of FIFO queue entries */
|
||||
size_t fifo_lazy_free; /**< number of reads before lazy fifo free is triggered */
|
||||
int nfifos; /**< number of FIFOs per receiver */
|
||||
int32_t num_smp_procs; /**< current number of smp procs on this host */
|
||||
int32_t my_smp_rank; /**< My SMP process rank. Used for accessing
|
||||
* SMP specific data structures. */
|
||||
ompi_free_list_t sm_frags_eager; /**< free list of sm first */
|
||||
ompi_free_list_t sm_frags_max; /**< free list of sm second */
|
||||
ompi_free_list_t sm_frags_user;
|
||||
ompi_free_list_t sm_first_frags_to_progress; /**< list of first
|
||||
fragments that are
|
||||
awaiting resources */
|
||||
struct mca_btl_base_endpoint_t **sm_peers;
|
||||
|
||||
opal_free_list_t pending_send_fl;
|
||||
int num_outstanding_frags; /**< number of fragments sent but not yet returned to free list */
|
||||
int num_pending_sends; /**< total number on all of my pending-send queues */
|
||||
int mem_node;
|
||||
int num_mem_nodes;
|
||||
|
||||
#if OMPI_ENABLE_PROGRESS_THREADS == 1
|
||||
char sm_fifo_path[PATH_MAX]; /**< path to fifo used to signal this process */
|
||||
int sm_fifo_fd; /**< file descriptor corresponding to opened fifo */
|
||||
opal_thread_t sm_fifo_thread;
|
||||
#endif
|
||||
struct mca_btl_smcuda_t **sm_btls;
|
||||
struct mca_btl_smcuda_frag_t **table;
|
||||
size_t sm_num_btls;
|
||||
size_t sm_max_btls;
|
||||
|
||||
|
||||
/** MCA: should we be using knem or not? neg=try but continue if
|
||||
not available, 0=don't try, 1=try and fail if not available */
|
||||
int use_knem;
|
||||
|
||||
/** MCA: minimal message size (bytes) to offload on DMA engine
|
||||
when using knem */
|
||||
uint32_t knem_dma_min;
|
||||
|
||||
/** MCA: how many simultaneous ongoing knem operations to
|
||||
support */
|
||||
int knem_max_simultaneous;
|
||||
|
||||
/** If we want DMA and DMA is supported, this will be loaded with
|
||||
KNEM_FLAG_DMA. Otherwise, it'll be 0. */
|
||||
int knem_dma_flag;
|
||||
};
|
||||
typedef struct mca_btl_smcuda_component_t mca_btl_smcuda_component_t;
|
||||
OMPI_MODULE_DECLSPEC extern mca_btl_smcuda_component_t mca_btl_smcuda_component;
|
||||
|
||||
/**
|
||||
* SM BTL Interface
|
||||
*/
|
||||
struct mca_btl_smcuda_t {
|
||||
mca_btl_base_module_t super; /**< base BTL interface */
|
||||
bool btl_inited; /**< flag indicating if btl has been inited */
|
||||
mca_btl_base_module_error_cb_fn_t error_cb;
|
||||
|
||||
};
|
||||
typedef struct mca_btl_smcuda_t mca_btl_smcuda_t;
|
||||
OMPI_MODULE_DECLSPEC extern mca_btl_smcuda_t mca_btl_smcuda;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
struct btl_smcuda_pending_send_item_t
|
||||
{
|
||||
opal_free_list_item_t super;
|
||||
void *data;
|
||||
};
|
||||
typedef struct btl_smcuda_pending_send_item_t btl_smcuda_pending_send_item_t;
|
||||
|
||||
/***
|
||||
* FIFO support for sm BTL.
|
||||
*/
|
||||
|
||||
/***
|
||||
* One or more FIFO components may be a pointer that must be
|
||||
* accessed by multiple processes. Since the shared region may
|
||||
* be mmapped differently into each process's address space,
|
||||
* these pointers will be relative to some base address. Here,
|
||||
* we define macros to translate between relative addresses and
|
||||
* virtual addresses.
|
||||
*/
|
||||
#define VIRTUAL2RELATIVE(VADDR ) ((long)(VADDR) - (long)mca_btl_smcuda_component.shm_bases[mca_btl_smcuda_component.my_smp_rank])
|
||||
#define RELATIVE2VIRTUAL(OFFSET) ((long)(OFFSET) + (long)mca_btl_smcuda_component.shm_bases[mca_btl_smcuda_component.my_smp_rank])
|
||||
|
||||
static inline int sm_fifo_init(int fifo_size, mca_mpool_base_module_t *mpool,
|
||||
sm_fifo_t *fifo, int lazy_free)
|
||||
{
|
||||
int i, qsize;
|
||||
|
||||
/* figure out the queue size (a power of two that is at least 1) */
|
||||
qsize = opal_next_poweroftwo_inclusive (fifo_size);
|
||||
|
||||
/* allocate the queue in the receiver's address space */
|
||||
fifo->queue_recv = (volatile void **)mpool->mpool_alloc(
|
||||
mpool, sizeof(void *) * qsize, opal_cache_line_size, 0, NULL);
|
||||
if(NULL == fifo->queue_recv) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* initialize the queue */
|
||||
for ( i = 0; i < qsize; i++ )
|
||||
fifo->queue_recv[i] = SM_FIFO_FREE;
|
||||
|
||||
/* shift queue address to be relative */
|
||||
fifo->queue = (volatile void **) VIRTUAL2RELATIVE(fifo->queue_recv);
|
||||
|
||||
/* initialize the locks */
|
||||
opal_atomic_init(&(fifo->head_lock), OPAL_ATOMIC_UNLOCKED);
|
||||
opal_atomic_init(&(fifo->tail_lock), OPAL_ATOMIC_UNLOCKED);
|
||||
opal_atomic_unlock(&(fifo->head_lock)); /* should be unnecessary */
|
||||
opal_atomic_unlock(&(fifo->tail_lock)); /* should be unnecessary */
|
||||
|
||||
/* other initializations */
|
||||
fifo->head = 0;
|
||||
fifo->mask = qsize - 1;
|
||||
fifo->tail = 0;
|
||||
fifo->num_to_clear = 0;
|
||||
fifo->lazy_free = lazy_free;
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Try to store one entry in the slot at the FIFO head.
 *
 * @param value  entry to store
 * @param fifo   FIFO to write to
 * @return OMPI_SUCCESS when the entry was stored, or
 *         OMPI_ERR_OUT_OF_RESOURCE when the head slot is not free
 */
static inline int sm_fifo_write(void *value, sm_fifo_t *fifo)
{
    /* the stored queue address is relative; translate it first */
    volatile void **slots = (volatile void **) RELATIVE2VIRTUAL(fifo->queue);

    opal_atomic_rmb();
    if ( SM_FIFO_FREE == slots[fifo->head] ) {
        /* fill the free slot, then advance the head index; the write
           barrier sits between the two stores */
        slots[fifo->head] = value;
        opal_atomic_wmb();
        fifo->head = (fifo->head + 1) & fifo->mask;
        return OMPI_SUCCESS;
    }

    /* no free slot to write: report exhausted resource */
    return OMPI_ERR_OUT_OF_RESOURCE;
}
|
||||
|
||||
|
||||
/**
 * Read the entry at the FIFO tail.
 *
 * @param fifo  FIFO to read from
 * @return the entry found in the tail slot; SM_FIFO_FREE when that
 *         slot holds no entry, in which case the FIFO is left untouched
 */
static inline void *sm_fifo_read(sm_fifo_t *fifo)
{
    /* fetch the next queue entry */
    void *entry = (void *) fifo->queue_recv[fifo->tail];

    opal_atomic_rmb();

    /* nothing to consume: the tail stays where it is */
    if ( SM_FIFO_FREE == entry ) {
        return entry;
    }

    /* a real entry was read: move the tail forward and remember that
       one more slot is waiting to be cleared */
    fifo->tail = ( fifo->tail + 1 ) & fifo->mask;
    fifo->num_to_clear += 1;

    /* slots are freed lazily, in a batch, once enough have piled up */
    if ( fifo->num_to_clear >= fifo->lazy_free ) {
        int slot = (fifo->tail - fifo->num_to_clear) & fifo->mask;

        for ( ; fifo->num_to_clear > 0; fifo->num_to_clear -= 1 ) {
            fifo->queue_recv[slot] = SM_FIFO_FREE;
            slot = (slot + 1) & fifo->mask;
        }
        opal_atomic_wmb();
    }

    return entry;
}
|
||||
|
||||
/**
|
||||
* shared memory component progress.
|
||||
*/
|
||||
extern int mca_btl_smcuda_component_progress(void);
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Register a callback function that is called on error.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @return Status indicating if cleanup was successful
|
||||
*/
|
||||
|
||||
int mca_btl_smcuda_register_error_cb(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_module_error_cb_fn_t cbfunc
|
||||
);
|
||||
|
||||
/**
|
||||
* Cleanup any resources held by the BTL.
|
||||
*
|
||||
* @param btl BTL instance.
|
||||
* @return OMPI_SUCCESS or error status on failure.
|
||||
*/
|
||||
|
||||
extern int mca_btl_smcuda_finalize(
|
||||
struct mca_btl_base_module_t* btl
|
||||
);
|
||||
|
||||
|
||||
/**
|
||||
* PML->BTL notification of change in the process list.
|
||||
* PML->BTL Notification that a receive fragment has been matched.
|
||||
* Called for message that is send from process with the virtual
|
||||
* address of the shared memory segment being different than that of
|
||||
* the receiver.
|
||||
*
|
||||
* @param btl (IN)
|
||||
* @param proc (IN)
|
||||
* @param peer (OUT)
|
||||
* @return OMPI_SUCCESS or error status on failure.
|
||||
*
|
||||
*/
|
||||
|
||||
extern int mca_btl_smcuda_add_procs(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
size_t nprocs,
|
||||
struct ompi_proc_t **procs,
|
||||
struct mca_btl_base_endpoint_t** peers,
|
||||
struct opal_bitmap_t* reachability
|
||||
);
|
||||
|
||||
|
||||
/**
|
||||
* PML->BTL notification of change in the process list.
|
||||
*
|
||||
* @param btl (IN) BTL instance
|
||||
* @param proc (IN) Peer process
|
||||
* @param peer (IN) Peer addressing information.
|
||||
* @return Status indicating if cleanup was successful
|
||||
*
|
||||
*/
|
||||
extern int mca_btl_smcuda_del_procs(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
size_t nprocs,
|
||||
struct ompi_proc_t **procs,
|
||||
struct mca_btl_base_endpoint_t **peers
|
||||
);
|
||||
|
||||
|
||||
/**
|
||||
* Allocate a segment.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param size (IN) Request segment size.
|
||||
*/
|
||||
extern mca_btl_base_descriptor_t* mca_btl_smcuda_alloc(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
uint8_t order,
|
||||
size_t size,
|
||||
uint32_t flags
|
||||
);
|
||||
|
||||
/**
|
||||
* Return a segment allocated by this BTL.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param segment (IN) Allocated segment.
|
||||
*/
|
||||
extern int mca_btl_smcuda_free(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_descriptor_t* segment
|
||||
);
|
||||
|
||||
|
||||
/**
|
||||
* Pack data
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param peer (IN) BTL peer addressing
|
||||
*/
|
||||
struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags
|
||||
);
|
||||
|
||||
|
||||
/**
|
||||
* Initiate an inlined send to the peer or return a descriptor.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param peer (IN) BTL peer addressing
|
||||
*/
|
||||
extern int mca_btl_smcuda_sendi( struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct opal_convertor_t* convertor,
|
||||
void* header,
|
||||
size_t header_size,
|
||||
size_t payload_size,
|
||||
uint8_t order,
|
||||
uint32_t flags,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t** descriptor );
|
||||
|
||||
/**
|
||||
* Initiate a send to the peer.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param peer (IN) BTL peer addressing
|
||||
*/
|
||||
extern int mca_btl_smcuda_send(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* descriptor,
|
||||
mca_btl_base_tag_t tag
|
||||
);
|
||||
|
||||
#if OMPI_CUDA_SUPPORT
|
||||
/**
|
||||
* Remote get using device memory.
|
||||
*/
|
||||
extern int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* ep,
|
||||
struct mca_btl_base_descriptor_t* descriptor);
|
||||
|
||||
extern struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_dst(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags);
|
||||
#endif /* OMPI_CUDA_SUPPORT */
|
||||
|
||||
/**
|
||||
* Fault Tolerance Event Notification Function
|
||||
* @param state Checkpoint State
|
||||
* @return OMPI_SUCCESS or failure status
|
||||
*/
|
||||
int mca_btl_smcuda_ft_event(int state);
|
||||
|
||||
#if OMPI_ENABLE_PROGRESS_THREADS == 1
|
||||
void mca_btl_smcuda_component_event_thread(opal_object_t*);
|
||||
#endif
|
||||
|
||||
#if OMPI_ENABLE_PROGRESS_THREADS == 1
|
||||
#define MCA_BTL_SMCUDA_SIGNAL_PEER(peer) \
|
||||
{ \
|
||||
unsigned char cmd = DATA; \
|
||||
if(write(peer->fifo_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) { \
|
||||
opal_output(0, "mca_btl_smcuda_send: write fifo failed: errno=%d\n", errno); \
|
||||
} \
|
||||
}
|
||||
#else
|
||||
#define MCA_BTL_SMCUDA_SIGNAL_PEER(peer)
|
||||
#endif
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
||||
|
565
ompi/mca/btl/smcuda/btl_smcuda_component.c
Обычный файл
565
ompi/mca/btl/smcuda/btl_smcuda_component.c
Обычный файл
@ -0,0 +1,565 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2011 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
|
||||
* Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011-2012 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
#include "ompi_config.h"
|
||||
#include <errno.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
#ifdef HAVE_FCNTL_H
|
||||
#include <fcntl.h>
|
||||
#endif /* HAVE_FCNTL_H */
|
||||
#ifdef HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif /* HAVE_SYS_TYPES_H */
|
||||
#ifdef HAVE_SYS_MMAN_H
|
||||
#include <sys/mman.h>
|
||||
#endif /* HAVE_SYS_MMAN_H */
|
||||
#ifdef HAVE_SYS_STAT_H
|
||||
#include <sys/stat.h> /* for mkfifo */
|
||||
#endif /* HAVE_SYS_STAT_H */
|
||||
|
||||
#include "ompi/constants.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
#include "opal/util/bit_ops.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "ompi/mca/mpool/base/base.h"
|
||||
#if OMPI_CUDA_SUPPORT
|
||||
#include "ompi/runtime/params.h"
|
||||
#include "ompi/mca/common/cuda/common_cuda.h"
|
||||
#endif /* OMPI_CUDA_SUPPORT */
|
||||
#include "ompi/mca/common/sm/common_sm.h"
|
||||
#include "ompi/mca/btl/base/btl_base_error.h"
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
#include "opal/runtime/opal_cr.h"
|
||||
#endif
|
||||
|
||||
#include "btl_smcuda.h"
|
||||
#include "btl_smcuda_frag.h"
|
||||
#include "btl_smcuda_fifo.h"
|
||||
|
||||
static int mca_btl_smcuda_component_open(void);
|
||||
static int mca_btl_smcuda_component_close(void);
|
||||
static int smcuda_register(void);
|
||||
static mca_btl_base_module_t** mca_btl_smcuda_component_init(
|
||||
int *num_btls,
|
||||
bool enable_progress_threads,
|
||||
bool enable_mpi_threads
|
||||
);
|
||||
|
||||
|
||||
/*
|
||||
* Shared Memory (SM) component instance.
|
||||
*/
|
||||
/* Global smcuda BTL component instance. Only the embedded "super" part is
 * initialized here: it names the component "smcuda" and installs the
 * open, close, register, init and progress functions defined in this file. */
mca_btl_smcuda_component_t mca_btl_smcuda_component = {
|
||||
{ /* super is being filled in */
|
||||
/* First, the mca_base_component_t struct containing meta information
about the component itself */
|
||||
{
|
||||
MCA_BTL_BASE_VERSION_2_0_0,
|
||||
|
||||
"smcuda", /* MCA component name */
|
||||
OMPI_MAJOR_VERSION, /* MCA component major version */
|
||||
OMPI_MINOR_VERSION, /* MCA component minor version */
|
||||
OMPI_RELEASE_VERSION, /* MCA component release version */
|
||||
mca_btl_smcuda_component_open, /* component open */
|
||||
mca_btl_smcuda_component_close, /* component close */
|
||||
NULL, /* NOTE(review): presumably the unused component query hook -- confirm */
|
||||
smcuda_register, /* registers the component's MCA parameters */
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
|
||||
mca_btl_smcuda_component_init, /* component init: returns the BTL module list */
|
||||
mca_btl_smcuda_component_progress, /* component progress: polls FIFOs, returns event count */
|
||||
} /* end super */
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* utility routines for parameter registration
|
||||
*/
|
||||
|
||||
/**
 * Register a string-valued MCA parameter and return its current value.
 *
 * @param param_name     name of the parameter to register
 * @param default_value  default passed to the registration call
 * @return the string stored by mca_base_param_lookup_string(), or NULL if
 *         the lookup stored nothing
 *
 * NOTE(review): the parameter is registered under component name "sm", not
 * "smcuda" -- confirm that sharing the sm BTL's parameter names is intended.
 */
static inline char* mca_btl_smcuda_param_register_string(
    const char* param_name,
    const char* default_value)
{
    /* Start from NULL so that a failed lookup can never hand back an
     * uninitialized pointer; the mpool name obtained through this helper is
     * later passed to free() in component close. */
    char *param_value = NULL;
    int id = mca_base_param_register_string("btl", "sm", param_name, NULL, default_value);

    mca_base_param_lookup_string(id, &param_value);
    return param_value;
}
|
||||
|
||||
/**
 * Register an integer-valued MCA parameter and return its current value.
 *
 * @param param_name     name of the parameter to register
 * @param default_value  default passed to the registration call; also the
 *                       value returned when the lookup stores nothing
 * @return the value stored by mca_base_param_lookup_int()
 *
 * NOTE(review): the parameter is registered under component name "sm", not
 * "smcuda" -- confirm that sharing the sm BTL's parameter names is intended.
 */
static inline int mca_btl_smcuda_param_register_int(
    const char* param_name,
    int default_value)
{
    int param_value = default_value;
    int id = mca_base_param_register_int("btl", "sm", param_name, NULL, default_value);

    mca_base_param_lookup_int(id, &param_value);
    return param_value;
}
|
||||
|
||||
|
||||
static int smcuda_register(void)
|
||||
{
|
||||
/* register SM component parameters */
|
||||
mca_btl_smcuda_component.sm_free_list_num =
|
||||
mca_btl_smcuda_param_register_int("free_list_num", 8);
|
||||
mca_btl_smcuda_component.sm_free_list_max =
|
||||
mca_btl_smcuda_param_register_int("free_list_max", -1);
|
||||
mca_btl_smcuda_component.sm_free_list_inc =
|
||||
mca_btl_smcuda_param_register_int("free_list_inc", 64);
|
||||
mca_btl_smcuda_component.sm_max_procs =
|
||||
mca_btl_smcuda_param_register_int("max_procs", -1);
|
||||
mca_btl_smcuda_component.sm_mpool_name =
|
||||
mca_btl_smcuda_param_register_string("mpool", "sm");
|
||||
mca_btl_smcuda_component.fifo_size =
|
||||
mca_btl_smcuda_param_register_int("fifo_size", 4096);
|
||||
mca_btl_smcuda_component.nfifos =
|
||||
mca_btl_smcuda_param_register_int("num_fifos", 1);
|
||||
|
||||
mca_btl_smcuda_component.fifo_lazy_free =
|
||||
mca_btl_smcuda_param_register_int("fifo_lazy_free", 120);
|
||||
|
||||
/* default number of extra procs to allow for future growth */
|
||||
mca_btl_smcuda_component.sm_extra_procs =
|
||||
mca_btl_smcuda_param_register_int("sm_extra_procs", 0);
|
||||
|
||||
#if OMPI_CUDA_SUPPORT
|
||||
mca_btl_smcuda.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH;
|
||||
#else /* OMPI_CUDA_SUPPORT */
|
||||
mca_btl_smcuda.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH-1;
|
||||
#endif /* OMPI_CUDA_SUPPORT */
|
||||
mca_btl_smcuda.super.btl_eager_limit = 4*1024;
|
||||
mca_btl_smcuda.super.btl_rndv_eager_limit = 4*1024;
|
||||
mca_btl_smcuda.super.btl_max_send_size = 32*1024;
|
||||
mca_btl_smcuda.super.btl_rdma_pipeline_send_length = 64*1024;
|
||||
mca_btl_smcuda.super.btl_rdma_pipeline_frag_size = 64*1024;
|
||||
mca_btl_smcuda.super.btl_min_rdma_pipeline_size = 64*1024;
|
||||
mca_btl_smcuda.super.btl_flags = MCA_BTL_FLAGS_SEND;
|
||||
#if OMPI_CUDA_SUPPORT
|
||||
mca_btl_smcuda.super.btl_flags |= MCA_BTL_FLAGS_CUDA_GET;
|
||||
#endif /* OMPI_CUDA_SUPPORT */
|
||||
mca_btl_smcuda.super.btl_bandwidth = 9000; /* Mbs */
|
||||
mca_btl_smcuda.super.btl_latency = 1; /* Microsecs */
|
||||
|
||||
/* Call the BTL based to register its MCA params */
|
||||
mca_btl_base_param_register(&mca_btl_smcuda_component.super.btl_version,
|
||||
&mca_btl_smcuda.super);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Called by MCA framework to open the component, registers
|
||||
* component parameters.
|
||||
*/
|
||||
|
||||
static int mca_btl_smcuda_component_open(void)
|
||||
{
|
||||
mca_btl_smcuda_component.sm_max_btls = 1;
|
||||
|
||||
/* make sure the number of fifos is a power of 2 */
|
||||
mca_btl_smcuda_component.nfifos = opal_next_poweroftwo_inclusive (mca_btl_smcuda_component.nfifos);
|
||||
|
||||
/* make sure that queue size and lazy free parameter are compatible */
|
||||
if (mca_btl_smcuda_component.fifo_lazy_free >= (mca_btl_smcuda_component.fifo_size >> 1) )
|
||||
mca_btl_smcuda_component.fifo_lazy_free = (mca_btl_smcuda_component.fifo_size >> 1);
|
||||
if (mca_btl_smcuda_component.fifo_lazy_free <= 0)
|
||||
mca_btl_smcuda_component.fifo_lazy_free = 1;
|
||||
|
||||
mca_btl_smcuda_component.max_frag_size = mca_btl_smcuda.super.btl_max_send_size;
|
||||
mca_btl_smcuda_component.eager_limit = mca_btl_smcuda.super.btl_eager_limit;
|
||||
|
||||
/* initialize objects */
|
||||
OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_eager, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_max, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_user, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&mca_btl_smcuda_component.pending_send_fl, opal_free_list_t);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* component cleanup - sanity checking of queue lengths
|
||||
*/
|
||||
|
||||
static int mca_btl_smcuda_component_close(void)
|
||||
{
|
||||
int return_value = OMPI_SUCCESS;
|
||||
|
||||
|
||||
OBJ_DESTRUCT(&mca_btl_smcuda_component.sm_lock);
|
||||
/**
|
||||
* We don't have to destroy the fragment lists. They are allocated
|
||||
* directly into the mmapped file, they will auto-magically disappear
|
||||
* when the file get unmapped.
|
||||
*/
|
||||
/*OBJ_DESTRUCT(&mca_btl_smcuda_component.sm_frags_eager);*/
|
||||
/*OBJ_DESTRUCT(&mca_btl_smcuda_component.sm_frags_max);*/
|
||||
|
||||
/* unmap the shared memory control structure */
|
||||
if(mca_btl_smcuda_component.sm_seg != NULL) {
|
||||
return_value = mca_common_sm_fini( mca_btl_smcuda_component.sm_seg );
|
||||
if( OMPI_SUCCESS != return_value ) {
|
||||
return_value=OMPI_ERROR;
|
||||
opal_output(0," mca_common_sm_fini failed\n");
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/* unlink file, so that it will be deleted when all references
|
||||
* to it are gone - no error checking, since we want all procs
|
||||
* to call this, so that in an abnormal termination scenario,
|
||||
* this file will still get cleaned up */
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
/* Only unlink the file if we are *not* restarting
|
||||
* If we are restarting the file will be unlinked at a later time.
|
||||
*/
|
||||
if(OPAL_CR_STATUS_RESTART_PRE != opal_cr_checkpointing_state &&
|
||||
OPAL_CR_STATUS_RESTART_POST != opal_cr_checkpointing_state ) {
|
||||
unlink(mca_btl_smcuda_component.sm_seg->shmem_ds.seg_name);
|
||||
}
|
||||
#else
|
||||
unlink(mca_btl_smcuda_component.sm_seg->shmem_ds.seg_name);
|
||||
#endif
|
||||
OBJ_RELEASE(mca_btl_smcuda_component.sm_seg);
|
||||
}
|
||||
|
||||
#if OMPI_ENABLE_PROGRESS_THREADS == 1
|
||||
/* close/cleanup fifo create for event notification */
|
||||
if(mca_btl_smcuda_component.sm_fifo_fd > 0) {
|
||||
/* write a done message down the pipe */
|
||||
unsigned char cmd = DONE;
|
||||
if( write(mca_btl_smcuda_component.sm_fifo_fd,&cmd,sizeof(cmd)) !=
|
||||
sizeof(cmd)){
|
||||
opal_output(0, "mca_btl_smcuda_component_close: write fifo failed: errno=%d\n",
|
||||
errno);
|
||||
}
|
||||
opal_thread_join(&mca_btl_smcuda_component.sm_fifo_thread, NULL);
|
||||
close(mca_btl_smcuda_component.sm_fifo_fd);
|
||||
unlink(mca_btl_smcuda_component.sm_fifo_path);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (NULL != mca_btl_smcuda_component.sm_mpool_name) {
|
||||
free(mca_btl_smcuda_component.sm_mpool_name);
|
||||
}
|
||||
|
||||
CLEANUP:
|
||||
|
||||
/* return */
|
||||
return return_value;
|
||||
}
|
||||
|
||||
/*
|
||||
* SM component initialization
|
||||
*/
|
||||
static mca_btl_base_module_t** mca_btl_smcuda_component_init(
|
||||
int *num_btls,
|
||||
bool enable_progress_threads,
|
||||
bool enable_mpi_threads)
|
||||
{
|
||||
mca_btl_base_module_t **btls = NULL;
|
||||
|
||||
*num_btls = 0;
|
||||
|
||||
/* if no session directory was created, then we cannot be used */
|
||||
if (!orte_create_session_dirs) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* lookup/create shared memory pool only when used */
|
||||
mca_btl_smcuda_component.sm_mpool = NULL;
|
||||
mca_btl_smcuda_component.sm_mpool_base = NULL;
|
||||
|
||||
#if OMPI_ENABLE_PROGRESS_THREADS == 1
|
||||
/* create a named pipe to receive events */
|
||||
sprintf( mca_btl_smcuda_component.sm_fifo_path,
|
||||
"%s"OPAL_PATH_SEP"sm_fifo.%lu", orte_process_info.job_session_dir,
|
||||
(unsigned long)ORTE_PROC_MY_NAME->vpid );
|
||||
if(mkfifo(mca_btl_smcuda_component.sm_fifo_path, 0660) < 0) {
|
||||
opal_output(0, "mca_btl_smcuda_component_init: mkfifo failed with errno=%d\n",errno);
|
||||
return NULL;
|
||||
}
|
||||
mca_btl_smcuda_component.sm_fifo_fd = open(mca_btl_smcuda_component.sm_fifo_path, O_RDWR);
|
||||
if(mca_btl_smcuda_component.sm_fifo_fd < 0) {
|
||||
opal_output(0, "mca_btl_smcuda_component_init: open(%s) failed with errno=%d\n",
|
||||
mca_btl_smcuda_component.sm_fifo_path, errno);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_fifo_thread, opal_thread_t);
|
||||
mca_btl_smcuda_component.sm_fifo_thread.t_run = (opal_thread_fn_t) mca_btl_smcuda_component_event_thread;
|
||||
opal_thread_start(&mca_btl_smcuda_component.sm_fifo_thread);
|
||||
#endif
|
||||
|
||||
mca_btl_smcuda_component.sm_btls = (mca_btl_smcuda_t **) malloc( mca_btl_smcuda_component.sm_max_btls * sizeof (mca_btl_smcuda_t *));
|
||||
if (NULL == mca_btl_smcuda_component.sm_btls) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* allocate the Shared Memory BTL */
|
||||
*num_btls = 1;
|
||||
btls = (mca_btl_base_module_t**)malloc(sizeof(mca_btl_base_module_t*));
|
||||
if (NULL == btls) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* get pointer to the btls */
|
||||
btls[0] = (mca_btl_base_module_t*)(&(mca_btl_smcuda));
|
||||
mca_btl_smcuda_component.sm_btls[0] = (mca_btl_smcuda_t*)(&(mca_btl_smcuda));
|
||||
|
||||
/* initialize some BTL data */
|
||||
/* start with no SM procs */
|
||||
mca_btl_smcuda_component.num_smp_procs = 0;
|
||||
mca_btl_smcuda_component.my_smp_rank = -1; /* not defined */
|
||||
mca_btl_smcuda_component.sm_num_btls = 1;
|
||||
/* set flag indicating btl not inited */
|
||||
mca_btl_smcuda.btl_inited = false;
|
||||
|
||||
#if OMPI_CUDA_SUPPORT
|
||||
/* Assume CUDA GET works. */
|
||||
mca_btl_smcuda.super.btl_get = mca_btl_smcuda_get_cuda;
|
||||
#endif /* OMPI_CUDA_SUPPORT */
|
||||
|
||||
return btls;
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* SM component progress.
|
||||
*/
|
||||
|
||||
#if OMPI_ENABLE_PROGRESS_THREADS == 1
/*
 * Event thread body: block on the component's FIFO and run the progress
 * function once for every command byte received. The thread exits when a
 * read does not deliver exactly one byte or when the DONE command arrives.
 * The thread argument is not used.
 */
void mca_btl_smcuda_component_event_thread(opal_object_t* thread)
{
    for (;;) {
        unsigned char cmd;

        /* a short or failed read is treated as an error condition */
        if(read(mca_btl_smcuda_component.sm_fifo_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) {
            break;
        }
        /* the DONE message asks the thread to stop */
        if( DONE == cmd ) {
            break;
        }
        mca_btl_smcuda_component_progress();
    }
}
#endif
|
||||
|
||||
void btl_smcuda_process_pending_sends(struct mca_btl_base_endpoint_t *ep)
|
||||
{
|
||||
btl_smcuda_pending_send_item_t *si;
|
||||
int rc;
|
||||
|
||||
while ( 0 < opal_list_get_size(&ep->pending_sends) ) {
|
||||
/* Note that we access the size of ep->pending_sends unlocked
|
||||
as it doesn't really matter if the result is wrong as
|
||||
opal_list_remove_first is called with a lock and we handle it
|
||||
not finding an item to process */
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
si = (btl_smcuda_pending_send_item_t*)opal_list_remove_first(&ep->pending_sends);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
|
||||
if(NULL == si) return; /* Another thread got in before us. Thats ok. */
|
||||
|
||||
OPAL_THREAD_ADD32(&mca_btl_smcuda_component.num_pending_sends, -1);
|
||||
|
||||
MCA_BTL_SMCUDA_FIFO_WRITE(ep, ep->my_smp_rank, ep->peer_smp_rank, si->data,
|
||||
true, false, rc);
|
||||
|
||||
OPAL_FREE_LIST_RETURN(&mca_btl_smcuda_component.pending_send_fl, (opal_list_item_t*)si);
|
||||
|
||||
if ( OMPI_SUCCESS != rc )
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Component progress function.
 *
 * 1. If num_pending_sends is non-zero, retry the queued sends of every peer
 *    endpoint whose pending_sends list is not empty.
 * 2. Poll each of this process's receive FIFOs once. A SEND fragment is
 *    handed to the callback registered for hdr->tag and then written back
 *    to the sender's FIFO. An ACK fragment completes a local send (optional
 *    completion callback, optional return to its free list), after which
 *    the same FIFO is read again.
 * 3. With OMPI_CUDA_SUPPORT, finish fragments whose CUDA events completed.
 *
 * Returns the number of events handled in this call.
 */
int mca_btl_smcuda_component_progress(void)
|
||||
{
|
||||
/* local variables */
|
||||
mca_btl_smcuda_frag_t *frag;
|
||||
mca_btl_smcuda_frag_t Frag;
|
||||
sm_fifo_t *fifo = NULL;
|
||||
mca_btl_smcuda_hdr_t *hdr;
|
||||
int my_smp_rank = mca_btl_smcuda_component.my_smp_rank;
|
||||
int peer_smp_rank, j, rc = 0, nevents = 0;
|
||||
|
||||
/* first, deal with any pending sends */
|
||||
/* This check should be fast since we only need to check one variable. */
|
||||
if ( 0 < mca_btl_smcuda_component.num_pending_sends ) {
|
||||
|
||||
/* perform a loop to find the endpoints that have pending sends */
|
||||
/* This can take a while longer if there are many endpoints to check. */
|
||||
for ( peer_smp_rank = 0; peer_smp_rank < mca_btl_smcuda_component.num_smp_procs; peer_smp_rank++) {
|
||||
struct mca_btl_base_endpoint_t* endpoint;
|
||||
if ( peer_smp_rank == my_smp_rank )
|
||||
continue;
|
||||
endpoint = mca_btl_smcuda_component.sm_peers[peer_smp_rank];
|
||||
if ( 0 < opal_list_get_size(&endpoint->pending_sends) )
|
||||
btl_smcuda_process_pending_sends(endpoint);
|
||||
}
|
||||
}
|
||||
|
||||
/* poll each fifo */
|
||||
for(j = 0; j < FIFO_MAP_NUM(mca_btl_smcuda_component.num_smp_procs); j++) {
|
||||
fifo = &(mca_btl_smcuda_component.fifo[my_smp_rank][j]);
|
||||
recheck_peer:
|
||||
/* acquire thread lock */
|
||||
if(opal_using_threads()) {
|
||||
opal_atomic_lock(&(fifo->tail_lock));
|
||||
}
|
||||
|
||||
hdr = (mca_btl_smcuda_hdr_t *)sm_fifo_read(fifo);
|
||||
|
||||
/* release thread lock */
|
||||
if(opal_using_threads()) {
|
||||
opal_atomic_unlock(&(fifo->tail_lock));
|
||||
}
|
||||
|
||||
/* nothing queued on this FIFO: move on to the next one */
if(SM_FIFO_FREE == hdr) {
|
||||
continue;
|
||||
}
|
||||
|
||||
nevents++;
|
||||
/* dispatch fragment by type (the type is kept in the low bits of the value read) */
|
||||
switch(((uintptr_t)hdr) & MCA_BTL_SMCUDA_FRAG_TYPE_MASK) {
|
||||
case MCA_BTL_SMCUDA_FRAG_SEND:
|
||||
{
|
||||
mca_btl_active_message_callback_t* reg;
|
||||
/* change the address from address relative to the shared
* memory address, to a true virtual address */
|
||||
hdr = (mca_btl_smcuda_hdr_t *) RELATIVE2VIRTUAL(hdr);
|
||||
peer_smp_rank = hdr->my_smp_rank;
|
||||
#if OPAL_ENABLE_DEBUG
|
||||
if ( FIFO_MAP(peer_smp_rank) != j ) {
|
||||
opal_output(0, "mca_btl_smcuda_component_progress: "
|
||||
"rank %d got %d on FIFO %d, but this sender should send to FIFO %d\n",
|
||||
my_smp_rank, peer_smp_rank, j, FIFO_MAP(peer_smp_rank));
|
||||
}
|
||||
#endif
|
||||
/* recv upcall: describe the payload that follows the header with the
* stack descriptor Frag and invoke the callback registered for the tag */
|
||||
reg = mca_btl_base_active_message_trigger + hdr->tag;
|
||||
Frag.segment.seg_addr.pval = ((char*)hdr) +
|
||||
sizeof(mca_btl_smcuda_hdr_t);
|
||||
Frag.segment.seg_len = hdr->len;
|
||||
Frag.base.des_dst_cnt = 1;
|
||||
Frag.base.des_dst = &(Frag.segment);
|
||||
reg->cbfunc(&mca_btl_smcuda.super, hdr->tag, &(Frag.base),
|
||||
reg->cbdata);
|
||||
/* return the fragment */
|
||||
MCA_BTL_SMCUDA_FIFO_WRITE(
|
||||
mca_btl_smcuda_component.sm_peers[peer_smp_rank],
|
||||
my_smp_rank, peer_smp_rank, hdr->frag, false, true, rc);
|
||||
break;
|
||||
}
|
||||
case MCA_BTL_SMCUDA_FRAG_ACK:
|
||||
{
|
||||
int status = (uintptr_t)hdr & MCA_BTL_SMCUDA_FRAG_STATUS_MASK;
|
||||
int btl_ownership;
|
||||
struct mca_btl_base_endpoint_t* endpoint;
|
||||
|
||||
/* strip the type and status bits to recover the fragment pointer */
frag = (mca_btl_smcuda_frag_t *)((char*)((uintptr_t)hdr &
|
||||
(~(MCA_BTL_SMCUDA_FRAG_TYPE_MASK |
|
||||
MCA_BTL_SMCUDA_FRAG_STATUS_MASK))));
|
||||
|
||||
/* endpoint and ownership are read before the completion callback runs */
endpoint = frag->endpoint;
|
||||
btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||
if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ) {
|
||||
/* completion callback */
|
||||
frag->base.des_cbfunc(&mca_btl_smcuda.super, frag->endpoint,
|
||||
&frag->base, status?OMPI_ERROR:OMPI_SUCCESS);
|
||||
}
|
||||
if( btl_ownership ) {
|
||||
MCA_BTL_SMCUDA_FRAG_RETURN(frag);
|
||||
}
|
||||
OPAL_THREAD_ADD32(&mca_btl_smcuda_component.num_outstanding_frags, -1);
|
||||
if ( 0 < opal_list_get_size(&endpoint->pending_sends) ) {
|
||||
btl_smcuda_process_pending_sends(endpoint);
|
||||
}
|
||||
/* read the same FIFO again before moving to the next one */
goto recheck_peer;
|
||||
}
|
||||
default:
|
||||
/* unknown */
|
||||
/*
* This code path should presumably never be called.
* It's unclear if it should exist or, if so, how it should be written.
* If we want to return it to the sending process,
* we have to figure out who the sender is.
* It seems we need to subtract the mask bits.
* Then, hopefully this is an sm header that has an smp_rank field.
* Presumably that means the received header was relative.
* Or, maybe this code should just be removed.
*/
|
||||
opal_output(0, "mca_btl_smcuda_component_progress read an unknown type of header");
|
||||
hdr = (mca_btl_smcuda_hdr_t *) RELATIVE2VIRTUAL(hdr);
|
||||
peer_smp_rank = hdr->my_smp_rank;
|
||||
/* send the fragment back with the status bit set */
hdr = (mca_btl_smcuda_hdr_t*)((uintptr_t)hdr->frag |
|
||||
MCA_BTL_SMCUDA_FRAG_STATUS_MASK);
|
||||
MCA_BTL_SMCUDA_FIFO_WRITE(
|
||||
mca_btl_smcuda_component.sm_peers[peer_smp_rank],
|
||||
my_smp_rank, peer_smp_rank, hdr, false, true, rc);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
#if OMPI_CUDA_SUPPORT
|
||||
/* Check to see if there are any outstanding CUDA events that have
* completed. If so, issue the PML callbacks on the fragments.
*/
|
||||
while (1 == progress_one_cuda_event((mca_btl_base_descriptor_t **)&frag)) {
|
||||
int btl_ownership;
|
||||
btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||
if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) {
|
||||
frag->base.des_cbfunc(&mca_btl_smcuda.super,
|
||||
frag->endpoint, &frag->base,
|
||||
OMPI_SUCCESS);
|
||||
}
|
||||
|
||||
if (btl_ownership) {
|
||||
/* drop the fragment's registration, if any, before returning it */
if(frag->registration != NULL) {
|
||||
frag->endpoint->mpool->mpool_deregister(frag->endpoint->mpool,
|
||||
(mca_mpool_base_registration_t*)frag->registration);
|
||||
frag->registration = NULL;
|
||||
}
|
||||
MCA_BTL_SMCUDA_FRAG_RETURN(frag);
|
||||
}
|
||||
nevents++;
|
||||
}
|
||||
#endif /* OMPI_CUDA_SUPPORT */
|
||||
return nevents;
|
||||
}
|
51
ompi/mca/btl/smcuda/btl_smcuda_endpoint.h
Обычный файл
51
ompi/mca/btl/smcuda/btl_smcuda_endpoint.h
Обычный файл
@ -0,0 +1,51 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
|
||||
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*/
|
||||
#ifndef MCA_BTL_SMCUDA_ENDPOINT_H
|
||||
#define MCA_BTL_SMCUDA_ENDPOINT_H
|
||||
|
||||
/**
|
||||
* An abstraction that represents a connection to an endpoint process.
|
||||
* An instance of mca_btl_base_endpoint_t is associated w/ each process
|
||||
* and BTL pair at startup.
|
||||
*/
|
||||
|
||||
/* Per-peer endpoint state of the smcuda BTL: the two SMP ranks used to
 * select FIFOs, the queue of sends that could not be written to the peer's
 * FIFO yet, and the lock that protects that queue. */
struct mca_btl_base_endpoint_t {
|
||||
int my_smp_rank; /**< My SMP process rank. Used for accessing
* SMP specific data structures. */
|
||||
int peer_smp_rank; /**< My peer's SMP process rank. Used for accessing
* SMP specific data structures. */
|
||||
#if OMPI_CUDA_SUPPORT
|
||||
mca_mpool_base_module_t *mpool; /**< mpool for remotely registered memory */
|
||||
#endif /* OMPI_CUDA_SUPPORT */
|
||||
#if OMPI_ENABLE_PROGRESS_THREADS == 1
|
||||
int fifo_fd; /**< pipe/fifo used to signal endpoint that data is queued */
|
||||
#endif
|
||||
opal_list_t pending_sends; /**< pending data to send */
|
||||
|
||||
/** lock for concurrent access to endpoint state */
|
||||
opal_mutex_t endpoint_lock;
|
||||
|
||||
};
|
||||
|
||||
void btl_smcuda_process_pending_sends(struct mca_btl_base_endpoint_t *ep);
|
||||
#endif
|
87
ompi/mca/btl/smcuda/btl_smcuda_fifo.h
Обычный файл
87
ompi/mca/btl/smcuda/btl_smcuda_fifo.h
Обычный файл
@ -0,0 +1,87 @@
|
||||
#ifndef MCA_BTL_SMCUDA_FIFO_H
|
||||
#define MCA_BTL_SMCUDA_FIFO_H
|
||||
|
||||
#include "btl_smcuda.h"
|
||||
#include "btl_smcuda_endpoint.h"
|
||||
|
||||
static void
|
||||
add_pending(struct mca_btl_base_endpoint_t *ep, void *data, bool resend)
|
||||
{
|
||||
int rc;
|
||||
btl_smcuda_pending_send_item_t *si;
|
||||
opal_free_list_item_t *i;
|
||||
OPAL_FREE_LIST_GET(&mca_btl_smcuda_component.pending_send_fl, i, rc);
|
||||
|
||||
/* don't handle error for now */
|
||||
assert(i != NULL && rc == OMPI_SUCCESS);
|
||||
|
||||
si = (btl_smcuda_pending_send_item_t*)i;
|
||||
si->data = data;
|
||||
|
||||
OPAL_THREAD_ADD32(&mca_btl_smcuda_component.num_pending_sends, +1);
|
||||
|
||||
/* if data was on pending send list then prepend it to the list to
|
||||
* minimize reordering */
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
if (resend)
|
||||
opal_list_prepend(&ep->pending_sends, (opal_list_item_t*)si);
|
||||
else
|
||||
opal_list_append(&ep->pending_sends, (opal_list_item_t*)si);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* FIFO_MAP(x) defines which FIFO on the receiver should be used
|
||||
* by sender rank x. The map is some many-to-one hash.
|
||||
*
|
||||
* FIFO_MAP_NUM(n) defines how many FIFOs the receiver has for
|
||||
* n senders.
|
||||
*
|
||||
* That is,
|
||||
*
|
||||
* for all 0 <= x < n:
|
||||
*
|
||||
* 0 <= FIFO_MAP(x) < FIFO_MAP_NUM(n)
|
||||
*
|
||||
* For example, using some power-of-two nfifos, we could have
|
||||
*
|
||||
* FIFO_MAP(x) = x & (nfifos-1)
|
||||
* FIFO_MAP_NUM(n) = min(nfifos,n)
|
||||
*
|
||||
* Interesting limits include:
|
||||
*
|
||||
* nfifos very large: In this case, each sender has its
|
||||
* own dedicated FIFO on each receiver and the receiver
|
||||
* has one FIFO per sender.
|
||||
*
|
||||
* nfifos == 1: In this case, all senders use the same
|
||||
* FIFO and each receiver has just one FIFO for all senders.
|
||||
*/
|
||||
#define FIFO_MAP(x) ((x) & (mca_btl_smcuda_component.nfifos - 1))
|
||||
#define FIFO_MAP_NUM(n) ( (mca_btl_smcuda_component.nfifos) < (n) ? (mca_btl_smcuda_component.nfifos) : (n) )
|
||||
|
||||
|
||||
/*
 * Write hdr to the FIFO that peer_smp_rank uses for sender my_smp_rank.
 *
 * If retry_pending_sends is true, sends already queued on endpoint_peer are
 * retried first. The write itself is done under the FIFO's head_lock. When
 * the FIFO does not accept the entry, it is queued with add_pending() and rc
 * is set to OMPI_ERR_RESOURCE_BUSY; otherwise the peer is signalled and rc
 * is set to OMPI_SUCCESS.
 *
 * All arguments are parenthesized, and the block-local FIFO pointer has a
 * name that cannot shadow a caller's own "fifo" variable.
 */
#define MCA_BTL_SMCUDA_FIFO_WRITE(endpoint_peer, my_smp_rank,                   \
                                  peer_smp_rank, hdr, resend, retry_pending_sends, rc) \
do {                                                                            \
    sm_fifo_t* fifo_write_target_ =                                             \
        &(mca_btl_smcuda_component.fifo[(peer_smp_rank)][FIFO_MAP(my_smp_rank)]); \
                                                                                \
    if ( (retry_pending_sends) ) {                                              \
        if ( 0 < opal_list_get_size(&(endpoint_peer)->pending_sends) ) {        \
            btl_smcuda_process_pending_sends((endpoint_peer));                  \
        }                                                                       \
    }                                                                           \
                                                                                \
    opal_atomic_lock(&(fifo_write_target_->head_lock));                         \
    /* post fragment */                                                         \
    if(sm_fifo_write((hdr), fifo_write_target_) != OMPI_SUCCESS) {              \
        add_pending((endpoint_peer), (hdr), (resend));                          \
        (rc) = OMPI_ERR_RESOURCE_BUSY;                                          \
    } else {                                                                    \
        MCA_BTL_SMCUDA_SIGNAL_PEER((endpoint_peer));                            \
        (rc) = OMPI_SUCCESS;                                                    \
    }                                                                           \
    opal_atomic_unlock(&(fifo_write_target_->head_lock));                       \
} while(0)
|
||||
|
||||
#endif
|
82
ompi/mca/btl/smcuda/btl_smcuda_frag.c
Обычный файл
82
ompi/mca/btl/smcuda/btl_smcuda_frag.c
Обычный файл
@ -0,0 +1,82 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
#include "ompi_config.h"
|
||||
#include "btl_smcuda_frag.h"
|
||||
|
||||
|
||||
/*
 * Shared part of the fragment constructors.
 *
 * Binds the fragment to its header (base.super.ptr). If a header exists, it
 * is stamped with an ACK-tagged back pointer to the fragment and with this
 * process's SMP rank, and the segment is pointed just past the header. The
 * descriptor is then set up with the single segment as both source and
 * destination. frag->size must already be set by the caller.
 */
static inline void mca_btl_smcuda_frag_common_constructor(mca_btl_smcuda_frag_t* frag)
{
    mca_btl_smcuda_hdr_t *header = (mca_btl_smcuda_hdr_t*)frag->base.super.ptr;

    frag->hdr = header;
    if (NULL != header) {
        /* the low bits of the back pointer carry the ACK type */
        header->frag = (mca_btl_smcuda_frag_t*)((uintptr_t)frag |
                                                MCA_BTL_SMCUDA_FRAG_ACK);
        /* payload starts immediately after the header */
        frag->segment.seg_addr.pval = (char*)(header + 1);
        header->my_smp_rank = mca_btl_smcuda_component.my_smp_rank;
    }

    frag->segment.seg_len = frag->size;

    frag->base.des_src     = &frag->segment;
    frag->base.des_src_cnt = 1;
    frag->base.des_dst     = &frag->segment;
    frag->base.des_dst_cnt = 1;
    frag->base.des_flags   = 0;
#if OMPI_CUDA_SUPPORT
    frag->registration = NULL;
#endif /* OMPI_CUDA_SUPPORT */
}
|
||||
|
||||
/* Constructor for eager fragments: sized by eager_limit and owned by the
 * sm_frags_eager free list. */
static void mca_btl_smcuda_frag1_constructor(mca_btl_smcuda_frag_t* frag)
{
    frag->my_list = &mca_btl_smcuda_component.sm_frags_eager;
    frag->size = mca_btl_smcuda_component.eager_limit;

    /* must run last: it reads frag->size */
    mca_btl_smcuda_frag_common_constructor(frag);
}
|
||||
|
||||
/* Constructor for max-size fragments: sized by max_frag_size and owned by
 * the sm_frags_max free list. */
static void mca_btl_smcuda_frag2_constructor(mca_btl_smcuda_frag_t* frag)
{
    frag->my_list = &mca_btl_smcuda_component.sm_frags_max;
    frag->size = mca_btl_smcuda_component.max_frag_size;

    /* must run last: it reads frag->size */
    mca_btl_smcuda_frag_common_constructor(frag);
}
|
||||
|
||||
/* Constructor for user fragments: zero size and owned by the sm_frags_user
 * free list. */
static void mca_btl_smcuda_user_constructor(mca_btl_smcuda_frag_t* frag)
{
    frag->my_list = &mca_btl_smcuda_component.sm_frags_user;
    frag->size = 0;

    /* must run last: it reads frag->size */
    mca_btl_smcuda_frag_common_constructor(frag);
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(
|
||||
mca_btl_smcuda_frag1_t,
|
||||
mca_btl_base_descriptor_t,
|
||||
mca_btl_smcuda_frag1_constructor,
|
||||
NULL);
|
||||
|
||||
OBJ_CLASS_INSTANCE(
|
||||
mca_btl_smcuda_frag2_t,
|
||||
mca_btl_base_descriptor_t,
|
||||
mca_btl_smcuda_frag2_constructor,
|
||||
NULL);
|
||||
|
||||
OBJ_CLASS_INSTANCE(
|
||||
mca_btl_smcuda_user_t,
|
||||
mca_btl_base_descriptor_t,
|
||||
mca_btl_smcuda_user_constructor,
|
||||
NULL);
|
101
ompi/mca/btl/smcuda/btl_smcuda_frag.h
Обычный файл
101
ompi/mca/btl/smcuda/btl_smcuda_frag.h
Обычный файл
@ -0,0 +1,101 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*/
|
||||
#ifndef MCA_BTL_SMCUDA_SEND_FRAG_H
|
||||
#define MCA_BTL_SMCUDA_SEND_FRAG_H
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "btl_smcuda.h"
|
||||
|
||||
|
||||
#define MCA_BTL_SMCUDA_FRAG_TYPE_MASK ((uintptr_t)0x3)
|
||||
#define MCA_BTL_SMCUDA_FRAG_SEND ((uintptr_t)0x0)
|
||||
#define MCA_BTL_SMCUDA_FRAG_ACK ((uintptr_t)0x1)
|
||||
#define MCA_BTL_SMCUDA_FRAG_PUT ((uintptr_t)0x2)
|
||||
#define MCA_BTL_SMCUDA_FRAG_GET ((uintptr_t)0x3)
|
||||
|
||||
#define MCA_BTL_SMCUDA_FRAG_STATUS_MASK ((uintptr_t)0x4)
|
||||
|
||||
struct mca_btl_smcuda_frag_t;
|
||||
|
||||
/* Header placed in front of a fragment's payload; the receiver reads it to
 * dispatch the fragment and to hand it back to its sender. */
struct mca_btl_smcuda_hdr_t {
|
||||
struct mca_btl_smcuda_frag_t *frag; /* owning fragment, tagged with the ACK type in the low bits */
|
||||
size_t len; /* payload length; used as the segment length on receive */
|
||||
int my_smp_rank; /* SMP rank of the process that constructed the fragment */
|
||||
mca_btl_base_tag_t tag; /* selects the receive callback in mca_btl_base_active_message_trigger */
|
||||
};
|
||||
typedef struct mca_btl_smcuda_hdr_t mca_btl_smcuda_hdr_t;
|
||||
|
||||
/**
|
||||
* shared memory send fragment derived type.
|
||||
*/
|
||||
struct mca_btl_smcuda_frag_t {
|
||||
mca_btl_base_descriptor_t base;
|
||||
mca_btl_base_segment_t segment;
|
||||
struct mca_btl_base_endpoint_t *endpoint;
|
||||
#if OMPI_CUDA_SUPPORT
|
||||
struct mca_mpool_base_registration_t *registration;
|
||||
#endif /* OMPI_CUDA_SUPPORT */
|
||||
size_t size;
|
||||
/* pointer written to the FIFO, this is the base of the shared memory region */
|
||||
mca_btl_smcuda_hdr_t *hdr;
|
||||
ompi_free_list_t* my_list;
|
||||
};
|
||||
typedef struct mca_btl_smcuda_frag_t mca_btl_smcuda_frag_t;
|
||||
typedef struct mca_btl_smcuda_frag_t mca_btl_smcuda_frag1_t;
|
||||
typedef struct mca_btl_smcuda_frag_t mca_btl_smcuda_frag2_t;
|
||||
typedef struct mca_btl_smcuda_frag_t mca_btl_smcuda_user_t;
|
||||
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_btl_smcuda_frag_t);
|
||||
OBJ_CLASS_DECLARATION(mca_btl_smcuda_frag1_t);
|
||||
OBJ_CLASS_DECLARATION(mca_btl_smcuda_frag2_t);
|
||||
OBJ_CLASS_DECLARATION(mca_btl_smcuda_user_t);
|
||||
|
||||
/*
 * Take one item from mca_btl_smcuda_component.sm_frags_eager and store it
 * in `frag`, cast to mca_btl_smcuda_frag_t*.  `rc` is passed straight
 * through to OMPI_FREE_LIST_GET.
 */
#define MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag, rc)                             \
{                                                                             \
    ompi_free_list_item_t* item;                                              \
    OMPI_FREE_LIST_GET(&mca_btl_smcuda_component.sm_frags_eager, item, rc);   \
    frag = (mca_btl_smcuda_frag_t*)item;                                      \
}
|
||||
|
||||
/*
 * Take one item from mca_btl_smcuda_component.sm_frags_max and store it
 * in `frag`, cast to mca_btl_smcuda_frag_t*.  `rc` is passed straight
 * through to OMPI_FREE_LIST_GET.
 */
#define MCA_BTL_SMCUDA_FRAG_ALLOC_MAX(frag, rc)                               \
{                                                                             \
    ompi_free_list_item_t* item;                                              \
    OMPI_FREE_LIST_GET(&mca_btl_smcuda_component.sm_frags_max, item, rc);     \
    frag = (mca_btl_smcuda_frag_t*)item;                                      \
}
|
||||
|
||||
/*
 * Take one item from mca_btl_smcuda_component.sm_frags_user and store it
 * in `frag`, cast to mca_btl_smcuda_frag_t*.  `rc` is passed straight
 * through to OMPI_FREE_LIST_GET.
 */
#define MCA_BTL_SMCUDA_FRAG_ALLOC_USER(frag, rc)                              \
{                                                                             \
    ompi_free_list_item_t* item;                                              \
    OMPI_FREE_LIST_GET(&mca_btl_smcuda_component.sm_frags_user, item, rc);    \
    frag = (mca_btl_smcuda_frag_t*)item;                                      \
}
|
||||
|
||||
|
||||
/*
 * Hand `frag` back to the free list recorded in its my_list member.
 *
 * The argument is parenthesized at every use so that expressions such as
 * a cast or `&array[i]` expand correctly; `->` binds tighter than a cast
 * or unary `&`, so the bare form would apply it to the wrong operand.
 */
#define MCA_BTL_SMCUDA_FRAG_RETURN(frag)                                      \
{                                                                             \
    OMPI_FREE_LIST_RETURN((frag)->my_list, (ompi_free_list_item_t*)(frag));   \
}
|
||||
#endif
|
26
ompi/mca/btl/smcuda/configure.m4
Обычный файл
26
ompi/mca/btl/smcuda/configure.m4
Обычный файл
@ -0,0 +1,26 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2009 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_ompi_btl_smcuda_CONFIG([action-if-can-compile],
#                            [action-if-cant-compile])
# ------------------------------------------------
# Registers the component Makefile, then runs the first argument when
# $CUDA_SUPPORT_41 is "1" and the second argument otherwise.
AC_DEFUN([MCA_ompi_btl_smcuda_CONFIG],[
    AC_CONFIG_FILES([ompi/mca/btl/smcuda/Makefile])

    # Only build if CUDA 4.1 support is available
    AS_IF([test "x$CUDA_SUPPORT_41" = "x1"],
          [$1],
          [$2])

])dnl
|
20
ompi/mca/btl/smcuda/help-mpi-btl-smcuda.txt
Обычный файл
20
ompi/mca/btl/smcuda/help-mpi-btl-smcuda.txt
Обычный файл
@ -0,0 +1,20 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English help file for Open MPI's shared memory support.
|
||||
#
|
||||
[CUDA RDMA requested but not supported]
|
||||
WARNING: CUDA RDMA support was requested for the shared memory
|
||||
(smcuda) BTL, but it is not supported. Continuing without it.
|
||||
|
||||
Local host: %s
|
@ -11,6 +11,7 @@
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -47,6 +48,12 @@ ob1_sources = \
|
||||
pml_ob1_sendreq.h \
|
||||
pml_ob1_start.c
|
||||
|
||||
# If we have CUDA support requested, build the CUDA file also
|
||||
if MCA_ompi_cuda_support
|
||||
ob1_sources += \
|
||||
pml_ob1_cuda.c
|
||||
endif
|
||||
|
||||
if MCA_BUILD_ompi_pml_ob1_DSO
|
||||
component_noinst =
|
||||
component_install = mca_pml_ob1.la
|
||||
|
163
ompi/mca/pml/ob1/pml_ob1_cuda.c
Обычный файл
163
ompi/mca/pml/ob1/pml_ob1_cuda.c
Обычный файл
@ -0,0 +1,163 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "opal/prefetch.h"
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "ompi/mca/mpool/mpool.h"
|
||||
#include "pml_ob1.h"
|
||||
#include "pml_ob1_hdr.h"
|
||||
#include "pml_ob1_rdmafrag.h"
|
||||
#include "pml_ob1_recvreq.h"
|
||||
#include "pml_ob1_sendreq.h"
|
||||
#include "ompi/mca/bml/base/base.h"
|
||||
#include "ompi/memchecker.h"
|
||||
|
||||
size_t mca_pml_ob1_rdma_cuda_btls(
|
||||
mca_bml_base_endpoint_t* bml_endpoint,
|
||||
unsigned char* base,
|
||||
size_t size,
|
||||
mca_pml_ob1_com_btl_t* rdma_btls);
|
||||
|
||||
int mca_pml_ob1_cuda_need_buffers(void * rreq,
|
||||
mca_btl_base_module_t* btl);
|
||||
|
||||
/**
|
||||
* Handle the CUDA buffer.
|
||||
*/
|
||||
int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
|
||||
mca_bml_base_btl_t* bml_btl,
|
||||
size_t size) {
|
||||
int rc;
|
||||
#if OMPI_CUDA_SUPPORT_41
|
||||
sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
|
||||
if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {
|
||||
unsigned char *base;
|
||||
opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base );
|
||||
/* Set flag back */
|
||||
sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA;
|
||||
if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls(
|
||||
sendreq->req_endpoint,
|
||||
base,
|
||||
sendreq->req_send.req_bytes_packed,
|
||||
sendreq->req_rdma))) {
|
||||
rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl,
|
||||
sendreq->req_send.req_bytes_packed);
|
||||
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
|
||||
mca_pml_ob1_free_rdma_resources(sendreq);
|
||||
}
|
||||
} else {
|
||||
if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_PUT) {
|
||||
rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size,
|
||||
MCA_PML_OB1_HDR_FLAGS_CONTIG);
|
||||
} else {
|
||||
rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size, 0);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* Do not send anything with first rendezvous message as copying GPU
|
||||
* memory into RNDV message is expensive. */
|
||||
sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA;
|
||||
rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
|
||||
}
|
||||
#else
|
||||
/* Just do the rendezvous but set initial data to be sent to zero */
|
||||
rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
|
||||
#endif /* OMPI_CUDA_SUPPORT_41 */
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
|
||||
size_t mca_pml_ob1_rdma_cuda_btls(
|
||||
mca_bml_base_endpoint_t* bml_endpoint,
|
||||
unsigned char* base,
|
||||
size_t size,
|
||||
mca_pml_ob1_com_btl_t* rdma_btls)
|
||||
{
|
||||
int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
|
||||
double weight_total = 0;
|
||||
int num_btls_used = 0, n;
|
||||
|
||||
/* shortcut when there are no rdma capable btls */
|
||||
if(num_btls == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* check to see if memory is registered */
|
||||
for(n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request;
|
||||
n++) {
|
||||
mca_bml_base_btl_t* bml_btl =
|
||||
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n);
|
||||
|
||||
if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) {
|
||||
mca_mpool_base_registration_t* reg = NULL;
|
||||
mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;
|
||||
|
||||
if( NULL != btl_mpool ) {
|
||||
/* register the memory */
|
||||
btl_mpool->mpool_register(btl_mpool, base, size, 0, ®);
|
||||
}
|
||||
|
||||
if(NULL == reg)
|
||||
continue;
|
||||
|
||||
rdma_btls[num_btls_used].bml_btl = bml_btl;
|
||||
rdma_btls[num_btls_used].btl_reg = reg;
|
||||
weight_total += bml_btl->btl_weight;
|
||||
num_btls_used++;
|
||||
}
|
||||
}
|
||||
|
||||
/* if we don't use leave_pinned and all BTLs that already have this memory
|
||||
* registered amount to less then half of available bandwidth - fall back to
|
||||
* pipeline protocol */
|
||||
if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5))
|
||||
return 0;
|
||||
|
||||
mca_pml_ob1_calc_weighted_length(rdma_btls, num_btls_used, size,
|
||||
weight_total);
|
||||
|
||||
return num_btls_used;
|
||||
}
|
||||
|
||||
int mca_pml_ob1_cuda_need_buffers(void * rreq,
|
||||
mca_btl_base_module_t* btl)
|
||||
{
|
||||
mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)rreq;
|
||||
if ((recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA) &&
|
||||
(btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET)) {
|
||||
recvreq->req_recv.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
|
||||
if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) {
|
||||
recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA;
|
||||
return true;
|
||||
} else {
|
||||
recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
@ -11,6 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
|
||||
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -34,6 +35,11 @@
|
||||
#include "opal/util/arch.h"
|
||||
#include "ompi/memchecker.h"
|
||||
|
||||
#if OMPI_CUDA_SUPPORT
/* Implemented in pml_ob1_cuda.c.  The first parameter is declared as
 * void * so that this prototype is compatible with the definition there;
 * a mca_pml_ob1_recv_request_t* argument converts to it implicitly. */
int mca_pml_ob1_cuda_need_buffers(void * rreq,
                                  mca_btl_base_module_t* btl);
#endif /* OMPI_CUDA_SUPPORT */
|
||||
|
||||
void mca_pml_ob1_recv_request_process_pending(void)
|
||||
{
|
||||
mca_pml_ob1_recv_request_t* recvreq;
|
||||
@ -485,8 +491,15 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
|
||||
* sender side is already registered. We need to be smarter here, perhaps
|
||||
* do couple of RDMA reads */
|
||||
if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) {
|
||||
#if OMPI_CUDA_SUPPORT
|
||||
if (mca_pml_ob1_cuda_need_buffers(recvreq, btl)) {
|
||||
mca_pml_ob1_recv_request_ack(recvreq, &hdr->hdr_rndv, 0);
|
||||
return;
|
||||
}
|
||||
#else /* OMPI_CUDA_SUPPORT */
|
||||
mca_pml_ob1_recv_request_ack(recvreq, &hdr->hdr_rndv, 0);
|
||||
return;
|
||||
#endif /* OMPI_CUDA_SUPPORT */
|
||||
}
|
||||
|
||||
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag,rc);
|
||||
@ -513,10 +526,29 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
|
||||
}
|
||||
}
|
||||
frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
|
||||
#if OMPI_CUDA_SUPPORT
|
||||
if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) {
|
||||
if (recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA) {
|
||||
/* Check to see if this is a CUDA get */
|
||||
if (btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) {
|
||||
frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_send, btl);
|
||||
}
|
||||
if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) {
|
||||
opal_output(0, "[%s:%d] invalid bml for rdma get", __FILE__, __LINE__);
|
||||
orte_errmgr.abort(-1, NULL);
|
||||
}
|
||||
} else {
|
||||
/* Just default back to send and receive. Must be mix of GPU and HOST memory. */
|
||||
mca_pml_ob1_recv_request_ack(recvreq, &hdr->hdr_rndv, 0);
|
||||
return;
|
||||
}
|
||||
}
|
||||
#else /* OMPI_CUDA_SUPPORT */
|
||||
if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) {
|
||||
opal_output(0, "[%s:%d] invalid bml for rdma get", __FILE__, __LINE__);
|
||||
orte_errmgr.abort(-1, NULL);
|
||||
}
|
||||
#endif /* OMPI_CUDA_SUPPORT */
|
||||
frag->rdma_hdr.hdr_rget = *hdr;
|
||||
frag->rdma_req = recvreq;
|
||||
frag->rdma_ep = bml_endpoint;
|
||||
|
@ -11,6 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -637,7 +638,7 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,
|
||||
int rc;
|
||||
|
||||
bml_btl = sendreq->req_rdma[0].bml_btl;
|
||||
if((sendreq->req_rdma_cnt == 1) && (bml_btl->btl_flags & MCA_BTL_FLAGS_GET)) {
|
||||
if((sendreq->req_rdma_cnt == 1) && (bml_btl->btl_flags & (MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_CUDA_GET))) {
|
||||
mca_mpool_base_registration_t* reg = sendreq->req_rdma[0].btl_reg;
|
||||
mca_btl_base_descriptor_t* src;
|
||||
size_t i;
|
||||
@ -706,8 +707,15 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,
|
||||
for( i = 0; i < src->des_src_cnt; i++ ) {
|
||||
hdr->hdr_rget.hdr_segs[i].seg_addr.lval = ompi_ptr_ptol(src->des_src[i].seg_addr.pval);
|
||||
hdr->hdr_rget.hdr_segs[i].seg_len = src->des_src[i].seg_len;
|
||||
#if OMPI_CUDA_SUPPORT_41
|
||||
memcpy(hdr->hdr_rget.hdr_segs[i].seg_key.cudakey, src->des_src[i].seg_key.cudakey,
|
||||
sizeof(src->des_src[i].seg_key.cudakey));
|
||||
hdr->hdr_rget.hdr_segs[i].memh_seg_addr.lval = ompi_ptr_ptol(src->des_src[i].memh_seg_addr.pval);
|
||||
hdr->hdr_rget.hdr_segs[i].memh_seg_len = src->des_src[i].memh_seg_len;
|
||||
#else /* OMPI_CUDA_SUPPORT_41 */
|
||||
hdr->hdr_rget.hdr_segs[i].seg_key.key64[0] = src->des_src[i].seg_key.key64[0];
|
||||
hdr->hdr_rget.hdr_segs[i].seg_key.key64[1] = src->des_src[i].seg_key.key64[1];
|
||||
#endif /* OMPI_CUDA_SUPPORT_41 */
|
||||
}
|
||||
|
||||
des->des_cbfunc = mca_pml_ob1_send_ctl_completion;
|
||||
|
@ -10,7 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2011-2012 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -311,6 +311,13 @@ mca_pml_ob1_send_request_schedule(mca_pml_ob1_send_request_t* sendreq)
|
||||
mca_pml_ob1_send_request_schedule_exclusive(sendreq);
|
||||
}
|
||||
|
||||
#if OMPI_CUDA_SUPPORT
|
||||
int mca_pml_ob1_send_request_start_cuda(
|
||||
mca_pml_ob1_send_request_t* sendreq,
|
||||
mca_bml_base_btl_t* bml_btl,
|
||||
size_t size);
|
||||
#endif /* OMPI_CUDA_SUPPORT */
|
||||
|
||||
/**
|
||||
* Start the specified request
|
||||
*/
|
||||
@ -395,13 +402,11 @@ mca_pml_ob1_send_request_start_btl( mca_pml_ob1_send_request_t* sendreq,
|
||||
MCA_PML_OB1_HDR_FLAGS_CONTIG);
|
||||
}
|
||||
} else {
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
/* Do not send anything with first rendezvous message as copying GPU
|
||||
* memory into RNDV message is expensive. */
|
||||
#if OMPI_CUDA_SUPPORT
|
||||
if (sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) {
|
||||
size = 0;
|
||||
return mca_pml_ob1_send_request_start_cuda(sendreq, bml_btl, size);
|
||||
}
|
||||
#endif
|
||||
#endif /* OMPI_CUDA_SUPPORT */
|
||||
rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size, 0);
|
||||
}
|
||||
}
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user