From 8db1f89dd101ad55606720ba78a4a41e1f21f5de Mon Sep 17 00:00:00 2001
From: Rolf vandeVaart
Date: Wed, 17 Sep 2014 16:55:01 +0000
Subject: [PATCH] Small change to allow CUDA-aware to work with non-reduction
 nonblocking collectives. Only used when CUDA-aware feature compiled in.

This commit was SVN r32750.
---
Review notes (text in this section is ignored when the patch is applied):

 * The CUDA-enabled condition in the nbc_ialltoall.c hunk previously tested
   srctype / tgt / src.  Those names are used by the NBC_Copy and NBC_Unpack
   hunks in nbc_internal.h; the visible context of
   ompi_coll_libnbc_ialltoall uses sendtype, sendbuf and handle->tmpbuf
   instead.  The condition now tests sendtype and checks the destination
   and source of the memcpy that follows (handle->tmpbuf and sendbuf), so
   the plain memcpy path is skipped when opal_cuda_check_bufs() reports a
   CUDA buffer.
 * NOTE(review): leading whitespace and blank context lines were
   reconstructed from a whitespace-collapsed copy of this patch; regenerate
   the patch from the tree before applying it with git am.

 ompi/mca/coll/libnbc/nbc_ialltoall.c |  5 +++++
 ompi/mca/coll/libnbc/nbc_internal.h  | 13 +++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/ompi/mca/coll/libnbc/nbc_ialltoall.c b/ompi/mca/coll/libnbc/nbc_ialltoall.c
index b1a4943d12..2938ec0d26 100644
--- a/ompi/mca/coll/libnbc/nbc_ialltoall.c
+++ b/ompi/mca/coll/libnbc/nbc_ialltoall.c
@@ -6,6 +6,7 @@
  * rights reserved.
  * Copyright (c) 2013 Los Alamos National Security, LLC. All rights
  * reserved.
+ * Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
  *
  * Author(s): Torsten Hoefler
  *
@@ -108,7 +109,11 @@ int ompi_coll_libnbc_ialltoall(void* sendbuf, int sendcount, MPI_Datatype sendty
       }
 
       /* phase 1 - rotate n data blocks upwards into the tmpbuffer */
+#if OPAL_CUDA_SUPPORT
+      if(NBC_Type_intrinsic(sendtype) && !(opal_cuda_check_bufs((char *)handle->tmpbuf, (char *)sendbuf))) {
+#else
       if(NBC_Type_intrinsic(sendtype)) {
+#endif /* OPAL_CUDA_SUPPORT */
         /* contiguous - just copy (1st copy) */
         memcpy(handle->tmpbuf, (char*)sendbuf+datasize*rank, datasize*(p-rank));
         if(rank != 0) memcpy((char*)handle->tmpbuf+datasize*(p-rank), sendbuf, datasize*(rank));
diff --git a/ompi/mca/coll/libnbc/nbc_internal.h b/ompi/mca/coll/libnbc/nbc_internal.h
index 87f16e1067..b89d3310de 100644
--- a/ompi/mca/coll/libnbc/nbc_internal.h
+++ b/ompi/mca/coll/libnbc/nbc_internal.h
@@ -8,6 +8,7 @@
  * Author(s): Torsten Hoefler
  *
  * Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
  *
  */
 #ifndef __NBC_INTERNAL_H__
@@ -20,6 +21,10 @@
 
 #include "mpi.h"
 #include "coll_libnbc.h"
+#if OPAL_CUDA_SUPPORT
+#include "opal/datatype/opal_convertor.h"
+#include "opal/datatype/opal_datatype_cuda.h"
+#endif /* OPAL_CUDA_SUPPORT */
 #include "ompi/include/ompi/constants.h"
 #include "ompi/request/request.h"
 #include "ompi/datatype/ompi_datatype.h"
@@ -483,7 +488,11 @@ static inline int NBC_Copy(void *src, int srccount, MPI_Datatype srctype, void *
   OPAL_PTRDIFF_TYPE ext, lb;
   void *packbuf;
 
+#if OPAL_CUDA_SUPPORT
+  if((srctype == tgttype) && NBC_Type_intrinsic(srctype) && !(opal_cuda_check_bufs((char *)tgt, (char *)src))) {
+#else
   if((srctype == tgttype) && NBC_Type_intrinsic(srctype)) {
+#endif /* OPAL_CUDA_SUPPORT */
     /* if we have the same types and they are contiguous (intrinsic
      * types are contiguous), we can just use a single memcpy */
     res = ompi_datatype_get_extent(srctype, &lb, &ext);
@@ -511,7 +520,11 @@ static inline int NBC_Unpack(void *src, int srccount, MPI_Datatype srctype, void
   int size, pos, res;
   OPAL_PTRDIFF_TYPE ext, lb;
 
+#if OPAL_CUDA_SUPPORT
+  if(NBC_Type_intrinsic(srctype) && !(opal_cuda_check_bufs((char *)tgt, (char *)src))) {
+#else
   if(NBC_Type_intrinsic(srctype)) {
+#endif /* OPAL_CUDA_SUPPORT */
     /* if we have the same types and they are contiguous (intrinsic
      * types are contiguous), we can just use a single memcpy */
     res = ompi_datatype_get_extent (srctype, &lb, &ext);