Add collective module to handle CUDA aware buffers and reductions. Per RFC sent last week.
Reviewed by bosilca. This commit was SVN r31894.
Этот коммит содержится в:
родитель
2614dfc4bf
Коммит
570e313c9b
37
ompi/mca/coll/cuda/Makefile.am
Обычный файл
37
ompi/mca/coll/cuda/Makefile.am
Обычный файл
@ -0,0 +1,37 @@
|
||||
#
|
||||
# Copyright (c) 2014 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
sources = coll_cuda_module.c coll_cuda_reduce.c coll_cuda_allreduce.c \
|
||||
coll_cuda_reduce_scatter_block.c coll_cuda_component.c \
|
||||
coll_cuda_scan.c coll_cuda_exscan.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_ompi_coll_cuda_DSO
|
||||
component_noinst =
|
||||
component_install = mca_coll_cuda.la
|
||||
else
|
||||
component_noinst = libmca_coll_cuda.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(ompilibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_coll_cuda_la_SOURCES = $(sources)
|
||||
mca_coll_cuda_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_coll_cuda_la_SOURCES =$(sources)
|
||||
libmca_coll_cuda_la_LDFLAGS = -module -avoid-version
|
||||
|
101
ompi/mca/coll/cuda/coll_cuda.h
Обычный файл
101
ompi/mca/coll/cuda/coll_cuda.h
Обычный файл
@ -0,0 +1,101 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef MCA_COLL_CUDA_EXPORT_H
|
||||
#define MCA_COLL_CUDA_EXPORT_H
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "mpi.h"
|
||||
|
||||
#include "opal/class/opal_object.h"
|
||||
#include "opal/mca/mca.h"
|
||||
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/base.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/* API functions */
|
||||
|
||||
int mca_coll_cuda_init_query(bool enable_progress_threads,
|
||||
bool enable_mpi_threads);
|
||||
mca_coll_base_module_t
|
||||
*mca_coll_cuda_comm_query(struct ompi_communicator_t *comm,
|
||||
int *priority);
|
||||
|
||||
int mca_coll_cuda_module_enable(mca_coll_base_module_t *module,
|
||||
struct ompi_communicator_t *comm);
|
||||
|
||||
int
|
||||
mca_coll_cuda_allreduce(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
|
||||
int mca_coll_cuda_reduce(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
|
||||
int mca_coll_cuda_exscan(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
|
||||
int mca_coll_cuda_scan(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
|
||||
int
|
||||
mca_coll_cuda_reduce_scatter_block(void *sbuf, void *rbuf, int rcount,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
|
||||
/* Types */
|
||||
/* Module */
|
||||
|
||||
typedef struct mca_coll_cuda_module_t {
|
||||
mca_coll_base_module_t super;
|
||||
|
||||
/* Pointers to all the "real" collective functions */
|
||||
mca_coll_base_comm_coll_t c_coll;
|
||||
} mca_coll_cuda_module_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_coll_cuda_module_t);
|
||||
|
||||
/* Component */
|
||||
|
||||
typedef struct mca_coll_cuda_component_t {
|
||||
mca_coll_base_component_2_0_0_t super;
|
||||
|
||||
int priority; /* Priority of this component */
|
||||
int disable_cuda_coll; /* Force disable of the CUDA collective component */
|
||||
} mca_coll_cuda_component_t;
|
||||
|
||||
/* Globally exported variables */
|
||||
|
||||
OMPI_MODULE_DECLSPEC extern mca_coll_cuda_component_t mca_coll_cuda_component;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_COLL_CUDA_EXPORT_H */
|
77
ompi/mca/coll/cuda/coll_cuda_allreduce.c
Обычный файл
77
ompi/mca/coll/cuda/coll_cuda_allreduce.c
Обычный файл
@ -0,0 +1,77 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "coll_cuda.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "ompi/op/op.h"
|
||||
#include "opal/datatype/opal_convertor.h"
|
||||
#include "opal/datatype/opal_datatype_cuda.h"
|
||||
|
||||
/*
|
||||
* allreduce_intra
|
||||
*
|
||||
* Function: - allreduce using other MPI collectives
|
||||
* Accepts: - same as MPI_Allreduce()
|
||||
* Returns: - MPI_SUCCESS or error code
|
||||
*/
|
||||
int
|
||||
mca_coll_cuda_allreduce(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_cuda_module_t *s = (mca_coll_cuda_module_t*) module;
|
||||
ptrdiff_t true_lb, true_extent, lb, extent;
|
||||
char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2;
|
||||
const char *sbuf2;
|
||||
size_t bufsize;
|
||||
int rc;
|
||||
|
||||
ompi_datatype_get_extent(dtype, &lb, &extent);
|
||||
ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);
|
||||
bufsize = true_extent + (ptrdiff_t)(count - 1) * extent;
|
||||
if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) {
|
||||
sbuf1 = (char*)malloc(bufsize);
|
||||
if (NULL == sbuf1) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
opal_cuda_memcpy_sync(sbuf1, sbuf, bufsize);
|
||||
sbuf2 = sbuf; /* save away original buffer */
|
||||
sbuf = sbuf1 - lb;
|
||||
}
|
||||
|
||||
if (opal_cuda_check_bufs(rbuf, NULL)) {
|
||||
rbuf1 = (char*)malloc(bufsize);
|
||||
if (NULL == rbuf1) {
|
||||
if (NULL != sbuf1) free(sbuf1);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
opal_cuda_memcpy_sync(rbuf1, rbuf, bufsize);
|
||||
rbuf2 = rbuf; /* save away original buffer */
|
||||
rbuf = rbuf1 - lb;
|
||||
}
|
||||
rc = s->c_coll.coll_allreduce(sbuf, rbuf, count, dtype, op, comm, s->c_coll.coll_allreduce_module);
|
||||
if (NULL != sbuf1) {
|
||||
free(sbuf1);
|
||||
}
|
||||
if (NULL != rbuf1) {
|
||||
rbuf = rbuf2;
|
||||
opal_cuda_memcpy_sync(rbuf, rbuf1, bufsize);
|
||||
free(rbuf1);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
92
ompi/mca/coll/cuda/coll_cuda_component.c
Обычный файл
92
ompi/mca/coll/cuda/coll_cuda_component.c
Обычный файл
@ -0,0 +1,92 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "mpi.h"
|
||||
#include "ompi/constants.h"
|
||||
#include "coll_cuda.h"
|
||||
|
||||
/*
|
||||
* Public string showing the coll ompi_cuda component version number
|
||||
*/
|
||||
const char *mca_coll_cuda_component_version_string =
|
||||
"Open MPI cuda collective MCA component version " OMPI_VERSION;
|
||||
|
||||
/*
|
||||
* Local function
|
||||
*/
|
||||
static int cuda_register(void);
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointers to our public functions in it
|
||||
*/
|
||||
|
||||
mca_coll_cuda_component_t mca_coll_cuda_component = {
|
||||
{
|
||||
/* First, the mca_component_t struct containing meta information
|
||||
* about the component itself */
|
||||
|
||||
{
|
||||
MCA_COLL_BASE_VERSION_2_0_0,
|
||||
|
||||
/* Component name and version */
|
||||
"cuda",
|
||||
OMPI_MAJOR_VERSION,
|
||||
OMPI_MINOR_VERSION,
|
||||
OMPI_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
cuda_register
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
|
||||
/* Initialization / querying functions */
|
||||
|
||||
mca_coll_cuda_init_query,
|
||||
mca_coll_cuda_comm_query
|
||||
},
|
||||
|
||||
/* cuda-specific component information */
|
||||
|
||||
/* Priority: use a low priority, but allow others to be lower */
|
||||
50,
|
||||
};
|
||||
|
||||
|
||||
static int cuda_register(void)
|
||||
{
|
||||
(void) mca_base_component_var_register(&mca_coll_cuda_component.super.collm_version,
|
||||
"priority", "Priority of the cuda coll component; only relevant if barrier_before or barrier_after is > 0",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_6,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_coll_cuda_component.priority);
|
||||
|
||||
(void) mca_base_component_var_register(&mca_coll_cuda_component.super.collm_version,
|
||||
"disable_cuda_coll", "Automatically handle the CUDA buffers for the MPI collective.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_2,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_coll_cuda_component.disable_cuda_coll);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
70
ompi/mca/coll/cuda/coll_cuda_exscan.c
Обычный файл
70
ompi/mca/coll/cuda/coll_cuda_exscan.c
Обычный файл
@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "coll_cuda.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "ompi/op/op.h"
|
||||
#include "opal/datatype/opal_convertor.h"
|
||||
#include "opal/datatype/opal_datatype_cuda.h"
|
||||
|
||||
int mca_coll_cuda_exscan(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_cuda_module_t *s = (mca_coll_cuda_module_t*) module;
|
||||
ptrdiff_t true_lb, true_extent, lb, extent;
|
||||
char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2;
|
||||
const char *sbuf2;
|
||||
size_t bufsize;
|
||||
int rc;
|
||||
|
||||
ompi_datatype_get_extent(dtype, &lb, &extent);
|
||||
ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);
|
||||
bufsize = true_extent + (ptrdiff_t)(count - 1) * extent;
|
||||
if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) {
|
||||
sbuf1 = (char*)malloc(bufsize);
|
||||
if (NULL == sbuf1) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
opal_cuda_memcpy_sync(sbuf1, sbuf, bufsize);
|
||||
sbuf2 = sbuf; /* save away original buffer */
|
||||
sbuf = sbuf1 - lb;
|
||||
}
|
||||
|
||||
if (opal_cuda_check_bufs(rbuf, NULL)) {
|
||||
rbuf1 = (char*)malloc(bufsize);
|
||||
if (NULL == rbuf1) {
|
||||
if (NULL != sbuf1) free(sbuf1);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
opal_cuda_memcpy_sync(rbuf1, rbuf, bufsize);
|
||||
rbuf2 = rbuf; /* save away original buffer */
|
||||
rbuf = rbuf1 - lb;
|
||||
}
|
||||
|
||||
rc = s->c_coll.coll_exscan(sbuf, rbuf, count, dtype, op, comm,
|
||||
s->c_coll.coll_exscan_module);
|
||||
if (NULL != sbuf1) {
|
||||
free(sbuf1);
|
||||
}
|
||||
if (NULL != rbuf1) {
|
||||
rbuf = rbuf2;
|
||||
opal_cuda_memcpy_sync(rbuf, rbuf1, bufsize);
|
||||
free(rbuf1);
|
||||
}
|
||||
return rc;
|
||||
}
|
158
ompi/mca/coll/cuda/coll_cuda_module.c
Обычный файл
158
ompi/mca/coll/cuda/coll_cuda_module.c
Обычный файл
@ -0,0 +1,158 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
|
||||
#include "coll_cuda.h"
|
||||
|
||||
#include "mpi.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/base.h"
|
||||
#include "coll_cuda.h"
|
||||
|
||||
|
||||
static void mca_coll_cuda_module_construct(mca_coll_cuda_module_t *module)
|
||||
{
|
||||
memset(&(module->c_coll), 0, sizeof(module->c_coll));
|
||||
}
|
||||
|
||||
static void mca_coll_cuda_module_destruct(mca_coll_cuda_module_t *module)
|
||||
{
|
||||
OBJ_RELEASE(module->c_coll.coll_allreduce_module);
|
||||
OBJ_RELEASE(module->c_coll.coll_reduce_module);
|
||||
OBJ_RELEASE(module->c_coll.coll_reduce_scatter_block_module);
|
||||
OBJ_RELEASE(module->c_coll.coll_scatter_module);
|
||||
/* If the exscan module is not NULL, then this was an
|
||||
intracommunicator, and therefore scan will have a module as
|
||||
well. */
|
||||
if (NULL != module->c_coll.coll_exscan_module) {
|
||||
OBJ_RELEASE(module->c_coll.coll_exscan_module);
|
||||
OBJ_RELEASE(module->c_coll.coll_scan_module);
|
||||
}
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_coll_cuda_module_t, mca_coll_base_module_t,
|
||||
mca_coll_cuda_module_construct,
|
||||
mca_coll_cuda_module_destruct);
|
||||
|
||||
|
||||
/*
|
||||
* Initial query function that is invoked during MPI_INIT, allowing
|
||||
* this component to disqualify itself if it doesn't support the
|
||||
* required level of thread support.
|
||||
*/
|
||||
int mca_coll_cuda_init_query(bool enable_progress_threads,
|
||||
bool enable_mpi_threads)
|
||||
{
|
||||
/* Nothing to do */
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Invoked when there's a new communicator that has been created.
|
||||
* Look at the communicator and decide which set of functions and
|
||||
* priority we want to return.
|
||||
*/
|
||||
mca_coll_base_module_t *
|
||||
mca_coll_cuda_comm_query(struct ompi_communicator_t *comm,
|
||||
int *priority)
|
||||
{
|
||||
mca_coll_cuda_module_t *cuda_module;
|
||||
|
||||
cuda_module = OBJ_NEW(mca_coll_cuda_module_t);
|
||||
if (NULL == cuda_module) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
*priority = mca_coll_cuda_component.priority;
|
||||
strcpy(cuda_module->super.coll_name, "cuda");
|
||||
|
||||
/* Choose whether to use [intra|inter] */
|
||||
cuda_module->super.coll_module_enable = mca_coll_cuda_module_enable;
|
||||
cuda_module->super.ft_event = NULL;
|
||||
|
||||
cuda_module->super.coll_allgather = NULL;
|
||||
cuda_module->super.coll_allgatherv = NULL;
|
||||
cuda_module->super.coll_allreduce = mca_coll_cuda_allreduce;
|
||||
cuda_module->super.coll_alltoall = NULL;
|
||||
cuda_module->super.coll_alltoallv = NULL;
|
||||
cuda_module->super.coll_alltoallw = NULL;
|
||||
cuda_module->super.coll_barrier = NULL;
|
||||
cuda_module->super.coll_bcast = NULL;
|
||||
cuda_module->super.coll_exscan = mca_coll_cuda_exscan;
|
||||
cuda_module->super.coll_gather = NULL;
|
||||
cuda_module->super.coll_gatherv = NULL;
|
||||
cuda_module->super.coll_reduce = mca_coll_cuda_reduce;
|
||||
cuda_module->super.coll_reduce_scatter = NULL;
|
||||
cuda_module->super.coll_reduce_scatter_block = mca_coll_cuda_reduce_scatter_block;
|
||||
cuda_module->super.coll_scan = mca_coll_cuda_scan;
|
||||
cuda_module->super.coll_scatter = NULL;
|
||||
cuda_module->super.coll_scatterv = NULL;
|
||||
|
||||
return &(cuda_module->super);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Init module on the communicator
|
||||
*/
|
||||
int mca_coll_cuda_module_enable(mca_coll_base_module_t *module,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
bool good = true;
|
||||
char *msg = NULL;
|
||||
mca_coll_cuda_module_t *s = (mca_coll_cuda_module_t*) module;
|
||||
|
||||
#define CHECK_AND_RETAIN(src, dst, name) \
|
||||
if (NULL == (src)->c_coll.coll_ ## name ## _module) { \
|
||||
good = false; \
|
||||
msg = #name; \
|
||||
} else if (good) { \
|
||||
(dst)->c_coll.coll_ ## name ## _module = (src)->c_coll.coll_ ## name ## _module;\
|
||||
(dst)->c_coll.coll_ ## name = (src)->c_coll.coll_ ## name ; \
|
||||
OBJ_RETAIN((src)->c_coll.coll_ ## name ## _module); \
|
||||
}
|
||||
|
||||
CHECK_AND_RETAIN(comm, s, allreduce);
|
||||
CHECK_AND_RETAIN(comm, s, reduce);
|
||||
CHECK_AND_RETAIN(comm, s, reduce_scatter_block);
|
||||
CHECK_AND_RETAIN(comm, s, scatter);
|
||||
if (!OMPI_COMM_IS_INTER(comm)) {
|
||||
/* MPI does not define scan/exscan on intercommunicators */
|
||||
CHECK_AND_RETAIN(comm, s, exscan);
|
||||
CHECK_AND_RETAIN(comm, s, scan);
|
||||
}
|
||||
|
||||
/* All done */
|
||||
if (good) {
|
||||
return OMPI_SUCCESS;
|
||||
} else {
|
||||
orte_show_help("help-coll-cuda.txt", "missing collective", true,
|
||||
orte_process_info.nodename,
|
||||
mca_coll_cuda_component.priority, msg);
|
||||
return OMPI_ERR_NOT_FOUND;
|
||||
}
|
||||
}
|
||||
|
79
ompi/mca/coll/cuda/coll_cuda_reduce.c
Обычный файл
79
ompi/mca/coll/cuda/coll_cuda_reduce.c
Обычный файл
@ -0,0 +1,79 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2014 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "coll_cuda.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "ompi/op/op.h"
|
||||
#include "opal/datatype/opal_convertor.h"
|
||||
#include "opal/datatype/opal_datatype_cuda.h"
|
||||
|
||||
/*
|
||||
* reduce_log_inter
|
||||
*
|
||||
* Function: - reduction using O(N) algorithm
|
||||
* Accepts: - same as MPI_Reduce()
|
||||
* Returns: - MPI_SUCCESS or error code
|
||||
*/
|
||||
int
|
||||
mca_coll_cuda_reduce(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
int root, struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_cuda_module_t *s = (mca_coll_cuda_module_t*) module;
|
||||
ptrdiff_t true_lb, true_extent, lb, extent;
|
||||
char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2;
|
||||
const char *sbuf2;
|
||||
size_t bufsize;
|
||||
int rc;
|
||||
|
||||
ompi_datatype_get_extent(dtype, &lb, &extent);
|
||||
ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);
|
||||
bufsize = true_extent + (ptrdiff_t)(count - 1) * extent;
|
||||
if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) {
|
||||
sbuf1 = (char*)malloc(bufsize);
|
||||
if (NULL == sbuf1) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
opal_cuda_memcpy_sync(sbuf1, sbuf, bufsize);
|
||||
sbuf2 = sbuf; /* save away original buffer */
|
||||
sbuf = sbuf1 - lb;
|
||||
}
|
||||
|
||||
if (opal_cuda_check_bufs(rbuf, NULL)) {
|
||||
rbuf1 = (char*)malloc(bufsize);
|
||||
if (NULL == rbuf1) {
|
||||
if (NULL != sbuf1) free(sbuf1);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
opal_cuda_memcpy_sync(rbuf1, rbuf, bufsize);
|
||||
rbuf2 = rbuf; /* save away original buffer */
|
||||
rbuf = rbuf1 - lb;
|
||||
}
|
||||
rc = s->c_coll.coll_reduce((void *) sbuf, rbuf, count,
|
||||
dtype, op, root, comm,
|
||||
s->c_coll.coll_reduce_module);
|
||||
|
||||
if (NULL != sbuf1) {
|
||||
free(sbuf1);
|
||||
}
|
||||
if (NULL != rbuf1) {
|
||||
rbuf = rbuf2;
|
||||
opal_cuda_memcpy_sync(rbuf, rbuf1, bufsize);
|
||||
free(rbuf1);
|
||||
}
|
||||
return rc;
|
||||
}
|
83
ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c
Обычный файл
83
ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c
Обычный файл
@ -0,0 +1,83 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "coll_cuda.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "ompi/op/op.h"
|
||||
#include "opal/datatype/opal_convertor.h"
|
||||
#include "opal/datatype/opal_datatype_cuda.h"
|
||||
|
||||
/*
|
||||
* reduce_scatter_block
|
||||
*
|
||||
* Function: - reduce then scatter
|
||||
* Accepts: - same as MPI_Reduce_scatter_block()
|
||||
* Returns: - MPI_SUCCESS or error code
|
||||
*
|
||||
* Algorithm:
|
||||
* reduce and scatter (needs to be cleaned
|
||||
* up at some point)
|
||||
*/
|
||||
int
|
||||
mca_coll_cuda_reduce_scatter_block(void *sbuf, void *rbuf, int rcount,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_cuda_module_t *s = (mca_coll_cuda_module_t*) module;
|
||||
ptrdiff_t true_lb, true_extent, lb, extent;
|
||||
char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2;
|
||||
const char *sbuf2;
|
||||
size_t sbufsize, rbufsize;
|
||||
int rc;
|
||||
|
||||
ompi_datatype_get_extent(dtype, &lb, &extent);
|
||||
ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);
|
||||
sbufsize = (true_extent + (ptrdiff_t)(rcount - 1) * extent) * ompi_comm_size(comm);
|
||||
rbufsize = true_extent + (ptrdiff_t)(rcount - 1) * extent;
|
||||
if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) {
|
||||
sbuf1 = (char*)malloc(sbufsize);
|
||||
if (NULL == sbuf1) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
opal_cuda_memcpy_sync(sbuf1, sbuf, sbufsize);
|
||||
sbuf2 = sbuf; /* save away original buffer */
|
||||
sbuf = sbuf1 - lb;
|
||||
}
|
||||
|
||||
if (opal_cuda_check_bufs(rbuf, NULL)) {
|
||||
rbuf1 = (char*)malloc(rbufsize);
|
||||
if (NULL == rbuf1) {
|
||||
if (NULL != sbuf1) free(sbuf1);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
opal_cuda_memcpy_sync(rbuf1, rbuf, rbufsize);
|
||||
rbuf2 = rbuf; /* save away original buffer */
|
||||
rbuf = rbuf1 - lb;
|
||||
}
|
||||
rc = s->c_coll.coll_reduce_scatter_block(sbuf, rbuf, rcount, dtype, op, comm,
|
||||
s->c_coll.coll_reduce_scatter_block_module);
|
||||
if (NULL != sbuf1) {
|
||||
free(sbuf1);
|
||||
}
|
||||
if (NULL != rbuf1) {
|
||||
rbuf = rbuf2;
|
||||
opal_cuda_memcpy_sync(rbuf, rbuf1, rbufsize);
|
||||
free(rbuf1);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
76
ompi/mca/coll/cuda/coll_cuda_scan.c
Обычный файл
76
ompi/mca/coll/cuda/coll_cuda_scan.c
Обычный файл
@ -0,0 +1,76 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "coll_cuda.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "ompi/op/op.h"
|
||||
#include "opal/datatype/opal_convertor.h"
|
||||
#include "opal/datatype/opal_datatype_cuda.h"
|
||||
|
||||
/*
|
||||
* scan
|
||||
*
|
||||
* Function: - scan
|
||||
* Accepts: - same arguments as MPI_Scan()
|
||||
* Returns: - MPI_SUCCESS or error code
|
||||
*/
|
||||
int mca_coll_cuda_scan(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_cuda_module_t *s = (mca_coll_cuda_module_t*) module;
|
||||
ptrdiff_t true_lb, true_extent, lb, extent;
|
||||
char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2;
|
||||
const char *sbuf2;
|
||||
size_t bufsize;
|
||||
int rc;
|
||||
|
||||
ompi_datatype_get_extent(dtype, &lb, &extent);
|
||||
ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);
|
||||
bufsize = true_extent + (ptrdiff_t)(count - 1) * extent;
|
||||
if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) {
|
||||
sbuf1 = (char*)malloc(bufsize);
|
||||
if (NULL == sbuf1) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
opal_cuda_memcpy_sync(sbuf1, sbuf, bufsize);
|
||||
sbuf2 = sbuf; /* save away original buffer */
|
||||
sbuf = sbuf1 - lb;
|
||||
}
|
||||
|
||||
if (opal_cuda_check_bufs(rbuf, NULL)) {
|
||||
rbuf1 = (char*)malloc(bufsize);
|
||||
if (NULL == rbuf1) {
|
||||
if (NULL != sbuf1) free(sbuf1);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
opal_cuda_memcpy_sync(rbuf1, rbuf, bufsize);
|
||||
rbuf2 = rbuf; /* save away original buffer */
|
||||
rbuf = rbuf1 - lb;
|
||||
}
|
||||
rc = s->c_coll.coll_scan(sbuf, rbuf, count, dtype, op, comm,
|
||||
s->c_coll.coll_scan_module);
|
||||
if (NULL != sbuf1) {
|
||||
free(sbuf1);
|
||||
}
|
||||
if (NULL != rbuf1) {
|
||||
rbuf = rbuf2;
|
||||
opal_cuda_memcpy_sync(rbuf, rbuf1, bufsize);
|
||||
free(rbuf1);
|
||||
}
|
||||
return rc;
|
||||
}
|
29
ompi/mca/coll/cuda/configure.m4
Обычный файл
29
ompi/mca/coll/cuda/configure.m4
Обычный файл
@ -0,0 +1,29 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2014 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_coll_cuda_CONFIG([action-if-can-compile],
|
||||
# [action-if-cant-compile])
|
||||
# ------------------------------------------------
|
||||
AC_DEFUN([MCA_ompi_coll_cuda_CONFIG],[
|
||||
AC_CONFIG_FILES([ompi/mca/coll/cuda/Makefile])
|
||||
|
||||
# make sure that CUDA-aware checks have been done
|
||||
AC_REQUIRE([OPAL_CHECK_CUDA])
|
||||
|
||||
# Only build if CUDA support is available
|
||||
AS_IF([test "x$CUDA_SUPPORT" = "x1"],
|
||||
[$1],
|
||||
[$2])
|
||||
|
||||
])dnl
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user