/* * Copyright (c) 2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2006 The Technical University of Chemnitz. All * rights reserved. * * Author(s): Torsten Hoefler * * Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2014 NVIDIA Corporation. All rights reserved. * */ #ifndef __NBC_INTERNAL_H__ #define __NBC_INTERNAL_H__ #include "ompi_config.h" /* correct fortran bindings */ #define NBC_F77_FUNC_ F77_FUNC_ #include "mpi.h" #include "coll_libnbc.h" #if OPAL_CUDA_SUPPORT #include "opal/datatype/opal_convertor.h" #include "opal/datatype/opal_datatype_cuda.h" #endif /* OPAL_CUDA_SUPPORT */ #include "ompi/include/ompi/constants.h" #include "ompi/request/request.h" #include "ompi/datatype/ompi_datatype.h" #include #include #include #include #include #include #include "libdict/dict.h" #ifdef __cplusplus extern "C" { #endif /* log(2) */ #define LOG2 0.69314718055994530941 /* true/false */ #define true 1 #define false 0 /* all collectives */ #define NBC_ALLGATHER 0 #define NBC_ALLGATHERV 1 #define NBC_ALLREDUCE 2 #define NBC_ALLTOALL 3 #define NBC_ALLTOALLV 4 #define NBC_ALLTOALLW 5 #define NBC_BARRIER 6 #define NBC_BCAST 7 #define NBC_EXSCAN 8 #define NBC_GATHER 9 #define NBC_GATHERV 10 #define NBC_REDUCE 11 #define NBC_REDUCESCAT 12 #define NBC_SCAN 13 #define NBC_SCATTER 14 #define NBC_SCATTERV 15 /* set the number of collectives in nbc.h !!!! */ /* several typedefs for NBC */ /* the function type enum */ typedef enum { SEND, RECV, OP, COPY, UNPACK } NBC_Fn_type; /* the send argument struct */ typedef struct { void *buf; char tmpbuf; int count; MPI_Datatype datatype; int dest; } NBC_Args_send; /* the receive argument struct */ typedef struct { void *buf; char tmpbuf; int count; MPI_Datatype datatype; int source; } NBC_Args_recv; /* the operation argument struct */ typedef struct { void *buf1; char tmpbuf1; void *buf2; char tmpbuf2; void *buf3; char tmpbuf3; int count; MPI_Op op; MPI_Datatype datatype; } NBC_Args_op; /* the copy argument struct */ typedef struct { void *src; char tmpsrc; int srccount; MPI_Datatype srctype; void *tgt; char tmptgt; int tgtcount; MPI_Datatype tgttype; } NBC_Args_copy; /* unpack operation arguments */ typedef struct { void *inbuf; char tmpinbuf; int count; MPI_Datatype datatype; void *outbuf; char tmpoutbuf; } NBC_Args_unpack; /* internal function prototypes */ int NBC_Sched_create(NBC_Schedule* schedule); int NBC_Sched_send(void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule); int NBC_Sched_recv(void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule); int NBC_Sched_op(void* buf3, char tmpbuf3, void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype, MPI_Op op, NBC_Schedule *schedule); int NBC_Sched_copy(void *src, char tmpsrc, int srccount, MPI_Datatype srctype, void *tgt, char tmptgt, int tgtcount, MPI_Datatype tgttype, NBC_Schedule *schedule); int NBC_Sched_unpack(void *inbuf, char tmpinbuf, int count, MPI_Datatype datatype, void *outbuf, char tmpoutbuf, NBC_Schedule *schedule); int NBC_Sched_barrier(NBC_Schedule *schedule); int NBC_Sched_commit(NBC_Schedule *schedule); #ifdef NBC_CACHE_SCHEDULE /* this is a dummy structure which is used to get the schedule out of * the collop specific structure. The schedule pointer HAS to be at the * first position and should NOT BE REORDERED by the compiler (C * guarantees that */ struct NBC_dummyarg { NBC_Schedule *schedule; }; typedef struct { NBC_Schedule *schedule; void *sendbuf; int sendcount; MPI_Datatype sendtype; void* recvbuf; int recvcount; MPI_Datatype recvtype; } NBC_Alltoall_args; int NBC_Alltoall_args_compare(NBC_Alltoall_args *a, NBC_Alltoall_args *b, void *param); typedef struct { NBC_Schedule *schedule; void *sendbuf; int sendcount; MPI_Datatype sendtype; void* recvbuf; int recvcount; MPI_Datatype recvtype; } NBC_Allgather_args; int NBC_Allgather_args_compare(NBC_Allgather_args *a, NBC_Allgather_args *b, void *param); typedef struct { NBC_Schedule *schedule; void *sendbuf; void* recvbuf; int count; MPI_Datatype datatype; MPI_Op op; } NBC_Allreduce_args; int NBC_Allreduce_args_compare(NBC_Allreduce_args *a, NBC_Allreduce_args *b, void *param); typedef struct { NBC_Schedule *schedule; void *buffer; int count; MPI_Datatype datatype; int root; } NBC_Bcast_args; int NBC_Bcast_args_compare(NBC_Bcast_args *a, NBC_Bcast_args *b, void *param); typedef struct { NBC_Schedule *schedule; void *sendbuf; int sendcount; MPI_Datatype sendtype; void* recvbuf; int recvcount; MPI_Datatype recvtype; int root; } NBC_Gather_args; int NBC_Gather_args_compare(NBC_Gather_args *a, NBC_Gather_args *b, void *param); typedef struct { NBC_Schedule *schedule; void *sendbuf; void* recvbuf; int count; MPI_Datatype datatype; MPI_Op op; int root; } NBC_Reduce_args; int NBC_Reduce_args_compare(NBC_Reduce_args *a, NBC_Reduce_args *b, void *param); typedef struct { NBC_Schedule *schedule; void *sendbuf; void* recvbuf; int count; MPI_Datatype datatype; MPI_Op op; } NBC_Scan_args; int NBC_Scan_args_compare(NBC_Scan_args *a, NBC_Scan_args *b, void *param); typedef struct { NBC_Schedule *schedule; void *sendbuf; int sendcount; MPI_Datatype sendtype; void* recvbuf; int recvcount; MPI_Datatype recvtype; int root; } NBC_Scatter_args; int NBC_Scatter_args_compare(NBC_Scatter_args *a, NBC_Scatter_args *b, void *param); /* Schedule cache structures/functions */ void NBC_SchedCache_args_delete(void *entry); void NBC_SchedCache_args_delete_key_dummy(void *k); #endif int NBC_Start(NBC_Handle *handle, NBC_Schedule *schedule); int NBC_Init_handle(struct ompi_communicator_t *comm, ompi_coll_libnbc_request_t **request, ompi_coll_libnbc_module_t *module); static inline int NBC_Type_intrinsic(MPI_Datatype type); static inline int NBC_Copy(void *src, int srccount, MPI_Datatype srctype, void *tgt, int tgtcount, MPI_Datatype tgttype, MPI_Comm comm); int NBC_Create_fortran_handle(int *fhandle, NBC_Handle **handle); /* some macros */ /* a schedule has the following format: * [schedule] ::= [size][round-schedule][delimiter][round-schedule][delimiter]...[end] * [size] ::= size of the schedule (int) * [round-schedule] ::= [num][type][type-args][type][type-args]... * [num] ::= number of elements in round (int) * [type] ::= function type (NBC_Fn_type) * [type-args] ::= type specific arguments (NBC_Args_send, NBC_Args_recv or, NBC_Args_op) * [delimiter] ::= 1 (char) - indicates that a round follows * [end] ::= 0 (char) - indicates that this is the last round */ /* * The addresses of components of a round-schedule may be poorly aligned. * E.g., single-char delimiters can push addresses to odd-byte boundaries. * Or even ints can push 8-byte pointers to 4-byte boundaries. * So, for greater portability, we access components of a round-schedule with memcpy. */ #define NBC_GET_BYTES(ptr,x) {memcpy(&x,ptr,sizeof(x)); ptr += sizeof(x);} #define NBC_PUT_BYTES(ptr,x) {memcpy(ptr,&x,sizeof(x)); ptr += sizeof(x);} /* NBC_GET_ROUND_SIZE returns the size in bytes of a round of a NBC_Schedule * schedule. A round has the format: * [num]{[type][type-args]} * e.g. [(int)2][(NBC_Fn_type)SEND][(NBC_Args_send)SEND-ARGS][(NBC_Fn_type)RECV][(NBC_Args_recv)RECV-ARGS] */ #define NBC_GET_ROUND_SIZE(schedule, size) \ { \ int num; \ char *p = (char*) schedule; \ NBC_Fn_type type; \ int i; \ \ NBC_GET_BYTES(p,num); \ /*NBC_DEBUG(10, "GET_ROUND_SIZE got %i elements\n", num); */\ for (i=0; i 0 va_list ap; int rank; if(NBC_DLEVEL >= level) { MPI_Comm_rank(MPI_COMM_WORLD, &rank); printf("[LibNBC - %i] ", rank); va_start(ap, fmt); vprintf(fmt, ap); va_end (ap); } #endif } /* returns true (1) or false (0) if type is intrinsic or not */ static inline int NBC_Type_intrinsic(MPI_Datatype type) { if( ( type == MPI_INT ) || ( type == MPI_LONG ) || ( type == MPI_SHORT ) || ( type == MPI_UNSIGNED ) || ( type == MPI_UNSIGNED_SHORT ) || ( type == MPI_UNSIGNED_LONG ) || ( type == MPI_FLOAT ) || ( type == MPI_DOUBLE ) || ( type == MPI_LONG_DOUBLE ) || ( type == MPI_BYTE ) || ( type == MPI_FLOAT_INT) || ( type == MPI_DOUBLE_INT) || ( type == MPI_LONG_INT) || ( type == MPI_2INT) || ( type == MPI_SHORT_INT) || ( type == MPI_LONG_DOUBLE_INT)) return 1; else return 0; } /* let's give a try to inline functions */ static inline int NBC_Copy(void *src, int srccount, MPI_Datatype srctype, void *tgt, int tgtcount, MPI_Datatype tgttype, MPI_Comm comm) { int size, pos, res; OPAL_PTRDIFF_TYPE ext, lb; void *packbuf; #if OPAL_CUDA_SUPPORT if((srctype == tgttype) && NBC_Type_intrinsic(srctype) && !(opal_cuda_check_bufs((char *)tgt, (char *)src))) { #else if((srctype == tgttype) && NBC_Type_intrinsic(srctype)) { #endif /* OPAL_CUDA_SUPPORT */ /* if we have the same types and they are contiguous (intrinsic * types are contiguous), we can just use a single memcpy */ res = ompi_datatype_get_extent(srctype, &lb, &ext); if (OMPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } memcpy(tgt, src, srccount*ext); } else { /* we have to pack and unpack */ res = MPI_Pack_size(srccount, srctype, comm, &size); if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Pack_size() (%i)\n", res); return res; } packbuf = malloc(size); if (NULL == packbuf) { printf("Error in malloc()\n"); return res; } pos=0; res = MPI_Pack(src, srccount, srctype, packbuf, size, &pos, comm); if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Pack() (%i)\n", res); return res; } pos=0; res = MPI_Unpack(packbuf, size, &pos, tgt, tgtcount, tgttype, comm); if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Unpack() (%i)\n", res); return res; } free(packbuf); } return NBC_OK; } static inline int NBC_Unpack(void *src, int srccount, MPI_Datatype srctype, void *tgt, MPI_Comm comm) { int size, pos, res; OPAL_PTRDIFF_TYPE ext, lb; #if OPAL_CUDA_SUPPORT if(NBC_Type_intrinsic(srctype) && !(opal_cuda_check_bufs((char *)tgt, (char *)src))) { #else if(NBC_Type_intrinsic(srctype)) { #endif /* OPAL_CUDA_SUPPORT */ /* if we have the same types and they are contiguous (intrinsic * types are contiguous), we can just use a single memcpy */ res = ompi_datatype_get_extent (srctype, &lb, &ext); if (OMPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } memcpy(tgt, src, srccount * ext); } else { /* we have to unpack */ res = MPI_Pack_size(srccount, srctype, comm, &size); if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Pack_size() (%i)\n", res); return res; } pos=0; res = MPI_Unpack(src, size, &pos, tgt, srccount, srctype, comm); if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Unpack() (%i)\n", res); return res; } } return NBC_OK; } /* deletes elements from dict until low watermark is reached */ static inline void NBC_SchedCache_dictwipe(hb_tree *dict, int *size) { hb_itor *itor; itor = hb_itor_new(dict); for (; hb_itor_valid(itor) && (*size>NBC_SCHED_DICT_LOWER); hb_itor_next(itor)) { hb_tree_remove(dict, hb_itor_key(itor), 0); *size = *size-1; } hb_itor_destroy(itor); } #define NBC_IN_PLACE(sendbuf, recvbuf, inplace) \ { \ inplace = 0; \ if(recvbuf == sendbuf) { \ inplace = 1; \ } else \ if(sendbuf == MPI_IN_PLACE) { \ sendbuf = recvbuf; \ inplace = 1; \ } else \ if(recvbuf == MPI_IN_PLACE) { \ recvbuf = sendbuf; \ inplace = 1; \ } \ } int NBC_Comm_neighbors_count(MPI_Comm comm, int *indegree, int *outdegree, int *weighted); int NBC_Comm_neighbors(MPI_Comm comm, int maxindegree, int sources[], int sourceweights[], int maxoutdegree, int destinations[], int destweights[]); #ifdef __cplusplus } #endif #endif