From 7bdeafb772276a9b1268cee137f71db2f212335c Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Mon, 25 Jun 2012 22:38:06 +0000 Subject: [PATCH] Start bringing in libnbc. .ompi_ignored, as there's still a long way to go This commit was SVN r26658. --- ompi/mca/coll/libnbc/.ompi_ignore | 0 ompi/mca/coll/libnbc/.ompi_unignore | 1 + ompi/mca/coll/libnbc/Makefile.am | 46 + ompi/mca/coll/libnbc/coll_libnbc.h | 97 ++ ompi/mca/coll/libnbc/coll_libnbc_component.c | 163 ++++ ompi/mca/coll/libnbc/libdict/dict.c | 106 +++ ompi/mca/coll/libnbc/libdict/dict.h | 140 +++ ompi/mca/coll/libnbc/libdict/dict_private.h | 84 ++ ompi/mca/coll/libnbc/libdict/hb_tree.c | 906 +++++++++++++++++++ ompi/mca/coll/libnbc/libdict/hb_tree.h | 64 ++ ompi/mca/coll/libnbc/nbc.c | 720 +++++++++++++++ ompi/mca/coll/libnbc/nbc.h | 59 ++ ompi/mca/coll/libnbc/nbc_iallgather.c | 257 ++++++ ompi/mca/coll/libnbc/nbc_iallgatherv.c | 124 +++ ompi/mca/coll/libnbc/nbc_iallreduce.c | 655 ++++++++++++++ ompi/mca/coll/libnbc/nbc_ialltoall.c | 358 ++++++++ ompi/mca/coll/libnbc/nbc_ialltoallv.c | 129 +++ ompi/mca/coll/libnbc/nbc_ibarrier.c | 89 ++ ompi/mca/coll/libnbc/nbc_ibcast.c | 287 ++++++ ompi/mca/coll/libnbc/nbc_ibcast_inter.c | 68 ++ ompi/mca/coll/libnbc/nbc_igather.c | 169 ++++ ompi/mca/coll/libnbc/nbc_igatherv.c | 118 +++ ompi/mca/coll/libnbc/nbc_internal.h | 625 +++++++++++++ ompi/mca/coll/libnbc/nbc_ireduce.c | 333 +++++++ ompi/mca/coll/libnbc/nbc_ireduce_scatter.c | 185 ++++ ompi/mca/coll/libnbc/nbc_iscan.c | 174 ++++ ompi/mca/coll/libnbc/nbc_iscatter.c | 170 ++++ ompi/mca/coll/libnbc/nbc_iscatterv.c | 115 +++ ompi/mca/coll/libnbc/nbc_ompi_include.h | 32 + ompi/mca/coll/libnbc/nbc_op.c | 570 ++++++++++++ 30 files changed, 6844 insertions(+) create mode 100644 ompi/mca/coll/libnbc/.ompi_ignore create mode 100644 ompi/mca/coll/libnbc/.ompi_unignore create mode 100644 ompi/mca/coll/libnbc/Makefile.am create mode 100644 ompi/mca/coll/libnbc/coll_libnbc.h create mode 100644 ompi/mca/coll/libnbc/coll_libnbc_component.c create mode 100644 ompi/mca/coll/libnbc/libdict/dict.c create mode 100644 ompi/mca/coll/libnbc/libdict/dict.h create mode 100644 ompi/mca/coll/libnbc/libdict/dict_private.h create mode 100644 ompi/mca/coll/libnbc/libdict/hb_tree.c create mode 100644 ompi/mca/coll/libnbc/libdict/hb_tree.h create mode 100644 ompi/mca/coll/libnbc/nbc.c create mode 100644 ompi/mca/coll/libnbc/nbc.h create mode 100644 ompi/mca/coll/libnbc/nbc_iallgather.c create mode 100644 ompi/mca/coll/libnbc/nbc_iallgatherv.c create mode 100644 ompi/mca/coll/libnbc/nbc_iallreduce.c create mode 100644 ompi/mca/coll/libnbc/nbc_ialltoall.c create mode 100644 ompi/mca/coll/libnbc/nbc_ialltoallv.c create mode 100644 ompi/mca/coll/libnbc/nbc_ibarrier.c create mode 100644 ompi/mca/coll/libnbc/nbc_ibcast.c create mode 100644 ompi/mca/coll/libnbc/nbc_ibcast_inter.c create mode 100644 ompi/mca/coll/libnbc/nbc_igather.c create mode 100644 ompi/mca/coll/libnbc/nbc_igatherv.c create mode 100644 ompi/mca/coll/libnbc/nbc_internal.h create mode 100644 ompi/mca/coll/libnbc/nbc_ireduce.c create mode 100644 ompi/mca/coll/libnbc/nbc_ireduce_scatter.c create mode 100644 ompi/mca/coll/libnbc/nbc_iscan.c create mode 100644 ompi/mca/coll/libnbc/nbc_iscatter.c create mode 100644 ompi/mca/coll/libnbc/nbc_iscatterv.c create mode 100644 ompi/mca/coll/libnbc/nbc_ompi_include.h create mode 100644 ompi/mca/coll/libnbc/nbc_op.c diff --git a/ompi/mca/coll/libnbc/.ompi_ignore b/ompi/mca/coll/libnbc/.ompi_ignore new file mode 100644 index 0000000000..e69de29bb2 diff --git a/ompi/mca/coll/libnbc/.ompi_unignore b/ompi/mca/coll/libnbc/.ompi_unignore new file mode 100644 index 0000000000..269b13243b --- /dev/null +++ b/ompi/mca/coll/libnbc/.ompi_unignore @@ -0,0 +1 @@ +bwbarre diff --git a/ompi/mca/coll/libnbc/Makefile.am b/ompi/mca/coll/libnbc/Makefile.am new file mode 100644 index 0000000000..993f4498e8 --- /dev/null +++ b/ompi/mca/coll/libnbc/Makefile.am @@ -0,0 +1,46 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +EXTRA_DIST = + +sources = \ + coll_libnbc_component.c \ + nbc.c \ + nbc_ibarrier.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +if MCA_BUILD_ompi_coll_libnbc_DSO +component_noinst = +component_install = mca_coll_libnbc.la +else +component_noinst = libmca_coll_libnbc.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_coll_libnbc_la_SOURCES = $(sources) +mca_coll_libnbc_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_coll_libnbc_la_SOURCES =$(sources) +libmca_coll_libnbc_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/coll/libnbc/coll_libnbc.h b/ompi/mca/coll/libnbc/coll_libnbc.h new file mode 100644 index 0000000000..a1ff6c3f23 --- /dev/null +++ b/ompi/mca/coll/libnbc/coll_libnbc.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_COLL_LIBNBC_EXPORT_H +#define MCA_COLL_LIBNBC_EXPORT_H + +#include "ompi/mca/coll/coll.h" +#include "ompi/request/request.h" + +BEGIN_C_DECLS + +/* Globally exported variables */ +OMPI_MODULE_DECLSPEC extern const mca_coll_base_component_2_0_0_t mca_coll_libnbc_component; + +int ompi_coll_libnbc_ibarrier(struct ompi_communicator_t *comm, ompi_request_t ** request, + struct mca_coll_base_module_2_0_0_t *module); + + +struct ompi_coll_libnbc_module_t { + mca_coll_base_module_t super; +}; +typedef struct ompi_coll_libnbc_module_t ompi_coll_libnbc_module_t; +OBJ_CLASS_DECLARATION(ompi_coll_libnbc_module_t); + + +/* Function return codes */ +#define NBC_OK 0 /* everything went fine */ +#define NBC_SUCCESS 0 /* everything went fine (MPI compliant :) */ +#define NBC_OOR 1 /* out of resources */ +#define NBC_BAD_SCHED 2 /* bad schedule */ +#define NBC_CONTINUE 3 /* progress not done */ +#define NBC_DATATYPE_NOT_SUPPORTED 4 /* datatype not supported or not valid */ +#define NBC_OP_NOT_SUPPORTED 5 /* operation not supported or not valid */ +#define NBC_NOT_IMPLEMENTED 6 +#define NBC_INVALID_PARAM 7 /* invalid parameters */ +#define NBC_INVALID_TOPOLOGY_COMM 8 /* invalid topology attached to communicator */ + +/* number of implemented collective functions */ +#define NBC_NUM_COLL 19 + +/* a schedule is basically a pointer to some memory location where the + * schedule array resides */ +typedef void* NBC_Schedule; + +/* used to hang off a communicator */ +typedef struct { + MPI_Comm mycomm; /* save the shadow communicator here */ + int tag; +#ifdef NBC_CACHE_SCHEDULE + void *NBC_Dict[NBC_NUM_COLL]; /* this should point to a struct + hb_tree, but since this is a + public header-file, this would be + an include mess :-(. So let's void + it ...*/ + int NBC_Dict_size[NBC_NUM_COLL]; +#endif +} NBC_Comminfo; + +struct ompi_coll_libnbc_request_t { + ompi_request_t super; + MPI_Comm comm; + MPI_Comm mycomm; + long row_offset; + int tag; + volatile int req_count; + /*ompi_request_t **req_array;*/ + MPI_Request *req_array; + NBC_Comminfo *comminfo; + volatile NBC_Schedule *schedule; + void *tmpbuf; /* temporary buffer e.g. used for Reduce */ + /* TODO: we should make a handle pointer to a state later (that the user + * can move request handles) */ +}; +typedef struct ompi_coll_libnbc_request_t ompi_coll_libnbc_request_t; +OBJ_CLASS_DECLARATION(ompi_coll_libnbc_request_t); + +typedef ompi_coll_libnbc_request_t NBC_Handle; + +END_C_DECLS + +#endif /* MCA_COLL_LIBNBC_EXPORT_H */ diff --git a/ompi/mca/coll/libnbc/coll_libnbc_component.c b/ompi/mca/coll/libnbc/coll_libnbc_component.c new file mode 100644 index 0000000000..9e3f6d1fd0 --- /dev/null +++ b/ompi/mca/coll/libnbc/coll_libnbc_component.c @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "coll_libnbc.h" + +#include "mpi.h" +#include "ompi/mca/coll/coll.h" + + +/* + * Public string showing the coll ompi_libnbc component version number + */ +const char *mca_coll_libnbc_component_version_string = + "Open MPI libnbc collective MCA component version " OMPI_VERSION; + + +static int libnbc_priority = 10; + + +static int libnbc_register(void); +static int libnbc_init_query(bool, bool); +static mca_coll_base_module_t *libnbc_comm_query(struct ompi_communicator_t *, int *); +static int libnbc_module_enable(mca_coll_base_module_t *, struct ompi_communicator_t *); + + +/* + * Instantiate the public struct with all of our public information + * and pointers to our public functions in it + */ + +const mca_coll_base_component_2_0_0_t mca_coll_libnbc_component = { + + /* First, the mca_component_t struct containing meta information + * about the component itself */ + + { + MCA_COLL_BASE_VERSION_2_0_0, + + /* Component name and version */ + "libnbc", + OMPI_MAJOR_VERSION, + OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION, + + /* Component open and close functions */ + NULL, + NULL, + NULL, + libnbc_register + }, + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + /* Initialization / querying functions */ + + libnbc_init_query, + libnbc_comm_query +}; + + +static int +libnbc_register(void) +{ + /* Use a low priority, but allow other components to be lower */ + + mca_base_param_reg_int(&mca_coll_libnbc_component.collm_version, + "priority", + "Priority of the libnbc coll component", + false, false, libnbc_priority, + &libnbc_priority); + + return OMPI_SUCCESS; +} + + + +/* + * Initial query function that is invoked during MPI_INIT, allowing + * this component to disqualify itself if it doesn't support the + * required level of thread support. + */ +static int +libnbc_init_query(bool enable_progress_threads, + bool enable_mpi_threads) +{ + /* Nothing to do */ + return OMPI_SUCCESS; +} + + +/* + * Invoked when there's a new communicator that has been created. + * Look at the communicator and decide which set of functions and + * priority we want to return. + */ +mca_coll_base_module_t * +libnbc_comm_query(struct ompi_communicator_t *comm, + int *priority) +{ + ompi_coll_libnbc_module_t *module; + + module = OBJ_NEW(ompi_coll_libnbc_module_t); + if (NULL == module) return NULL; + + *priority = libnbc_priority; + + module->super.coll_module_enable = libnbc_module_enable; + + module->super.coll_ibarrier = ompi_coll_libnbc_ibarrier; + + module->super.ft_event = NULL; + + return &(module->super); +} + + +/* + * Init module on the communicator + */ +static int +libnbc_module_enable(mca_coll_base_module_t *module, + struct ompi_communicator_t *comm) +{ + /* All done */ + return OMPI_SUCCESS; +} + + +static void +libnbc_module_construct(ompi_coll_libnbc_module_t *module) +{ +} + +static void +libnbc_module_destruct(ompi_coll_libnbc_module_t *module) +{ +} + + +OBJ_CLASS_INSTANCE(ompi_coll_libnbc_module_t, + mca_coll_base_module_t, + libnbc_module_construct, + libnbc_module_destruct); diff --git a/ompi/mca/coll/libnbc/libdict/dict.c b/ompi/mca/coll/libnbc/libdict/dict.c new file mode 100644 index 0000000000..64caf9c9ad --- /dev/null +++ b/ompi/mca/coll/libnbc/libdict/dict.c @@ -0,0 +1,106 @@ +/* + * dict.c + * + * Implementation of generic dictionary routines. + * Copyright (C) 2001-2004 Farooq Mela. + * + * $Id: dict.c,v 1.7 2001/11/25 06:00:49 farooq Exp farooq $ + */ + +#include + +#include "dict.h" +#include "dict_private.h" + +dict_malloc_func _dict_malloc = malloc; +dict_free_func _dict_free = free; + +dict_malloc_func +dict_set_malloc(dict_malloc_func func) +{ + dict_malloc_func old = _dict_malloc; + _dict_malloc = func ? func : malloc; + return old; +} + +dict_free_func +dict_set_free(dict_free_func func) +{ + dict_free_func old = _dict_free; + _dict_free = func ? func : free; + return old; +} + +/* + * In comparing, we cannot simply subtract because that might result in signed + * overflow. + */ +int +dict_int_cmp(const void *k1, const void *k2) +{ + const int *a = (int*)k1, *b = (int*)k2; + + return (*a < *b) ? -1 : (*a > *b) ? +1 : 0; +} + +int +dict_uint_cmp(const void *k1, const void *k2) +{ + const unsigned int *a = (unsigned int*)k1, *b = (unsigned int*)k2; + + return (*a < *b) ? -1 : (*a > *b) ? +1 : 0; +} + +int +dict_long_cmp(const void *k1, const void *k2) +{ + const long *a = (long*)k1, *b = (long*)k2; + + return (*a < *b) ? -1 : (*a > *b) ? +1 : 0; +} + +int +dict_ulong_cmp(const void *k1, const void *k2) +{ + const unsigned long *a = (unsigned long*)k1, *b = (unsigned long*)k2; + + return (*a < *b) ? -1 : (*a > *b) ? +1 : 0; +} + +int +dict_ptr_cmp(const void *k1, const void *k2) +{ + return (k1 > k2) - (k1 < k2); +} + +int +dict_str_cmp(const void *k1, const void *k2) +{ + const char *a = (char*)k1, *b = (char*)k2; + char p, q; + + for (;;) { + p = *a++; q = *b++; + if (p == 0 || p != q) + break; + } + return (p > q) - (p < q); +} + +void +dict_destroy(dict *dct, int del) +{ + ASSERT(dct != NULL); + + dct->_destroy(dct->_object, del); + FREE(dct); +} + +void +dict_itor_destroy(dict_itor *itor) +{ + ASSERT(itor != NULL); + + itor->_destroy(itor->_itor); + FREE(itor); +} diff --git a/ompi/mca/coll/libnbc/libdict/dict.h b/ompi/mca/coll/libnbc/libdict/dict.h new file mode 100644 index 0000000000..4680b2bc2c --- /dev/null +++ b/ompi/mca/coll/libnbc/libdict/dict.h @@ -0,0 +1,140 @@ +/* + * dict.h + * + * Interface for generic access to dictionary library. + * Copyright (C) 2001-2004 Farooq Mela. + * + * $Id: dict.h,v 1.6 2001/11/14 05:21:10 farooq Exp farooq $ + */ + +#ifndef _DICT_H_ +#define _DICT_H_ + +#include + +#define DICT_VERSION_MAJOR 0 +#define DICT_VERSION_MINOR 2 +#define DICT_VERSION_PATCH 1 + +#ifndef __P +# if defined(__STDC__) || defined(__cplusplus) || defined(c_plusplus) || \ + defined(_MSC_VER) +# define __P(x) x +# else /* !__STDC__ && !__cplusplus && !c_plusplus && !_MSC_VER */ +# define __P(x) +# endif +#endif /* !__P */ + +#ifndef FALSE +#define FALSE 0 +#endif + +#ifndef TRUE +#define TRUE (!FALSE) +#endif + +#if defined(__cplusplus) || defined(c_plusplus) +# define BEGIN_DECL extern "C" { +# define END_DECL } +#else +# define BEGIN_DECL +# define END_DECL +#endif + +BEGIN_DECL + +typedef void *(*dict_malloc_func)(size_t); +typedef void (*dict_free_func)(void *); + +dict_malloc_func dict_set_malloc __P((dict_malloc_func func)); +dict_free_func dict_set_free __P((dict_free_func func)); + +typedef int (*dict_cmp_func) __P((const void *, const void *)); +typedef void (*dict_del_func) __P((void *)); +typedef int (*dict_vis_func) __P((const void *, void *)); +typedef unsigned (*dict_hsh_func) __P((const void *)); + +typedef struct dict dict; +typedef struct dict_itor dict_itor; + +struct dict { + void *_object; + int (*_insert) __P((void *obj, void *k, void *d, int ow)); + int (*_probe) __P((void *obj, void *key, void **dat)); + void *(*_search) __P((void *obj, const void *k)); + const void *(*_csearch) __P((const void *obj, const void *k)); + int (*_remove) __P((void *obj, const void *key, int del)); + void (*_walk) __P((void *obj, dict_vis_func func)); + unsigned (*_count) __P((const void *obj)); + void (*_empty) __P((void *obj, int del)); + void (*_destroy) __P((void *obj, int del)); + dict_itor *(*_inew) __P((void *obj)); +}; + +#define dict_private(dct) (dct)->_object +#define dict_insert(dct,k,d,o) (dct)->_insert((dct)->_object, (k), (d), (o)) +#define dict_probe(dct,k,d) (dct)->_probe((dct)->_object, (k), (d)) +#define dict_search(dct,k) (dct)->_search((dct)->_object, (k)) +#define dict_csearch(dct,k) (dct)->_csearch((dct)->_object, (k)) +#define dict_remove(dct,k,del) (dct)->_remove((dct)->_object, (k), (del)) +#define dict_walk(dct,f) (dct)->_walk((dct)->_object, (f)) +#define dict_count(dct) (dct)->_count((dct)->_object) +#define dict_empty(dct,d) (dct)->_empty((dct)->_object, (d)) +void dict_destroy __P((dict *dct, int del)); +#define dict_itor_new(dct) (dct)->_inew((dct)->_object) + +struct dict_itor { + void *_itor; + int (*_valid) __P((const void *itor)); + void (*_invalid) __P((void *itor)); + int (*_next) __P((void *itor)); + int (*_prev) __P((void *itor)); + int (*_nextn) __P((void *itor, unsigned count)); + int (*_prevn) __P((void *itor, unsigned count)); + int (*_first) __P((void *itor)); + int (*_last) __P((void *itor)); + int (*_search) __P((void *itor, const void *key)); + const void *(*_key) __P((void *itor)); + void *(*_data) __P((void *itor)); + const void *(*_cdata) __P((const void *itor)); + int (*_setdata) __P((void *itor, void *dat, int del)); + int (*_remove) __P((void *itor, int del)); + int (*_compare) __P((void *itor1, void *itor2)); + void (*_destroy) __P((void *itor)); +}; + +#define dict_itor_private(i) (i)->_itor +#define dict_itor_valid(i) (i)->_valid((i)->_itor) +#define dict_itor_invalidate(i) (i)->_invalid((i)->_itor) +#define dict_itor_next(i) (i)->_next((i)->_itor) +#define dict_itor_prev(i) (i)->_prev((i)->_itor) +#define dict_itor_nextn(i,n) (i)->_nextn((i)->_itor, (n)) +#define dict_itor_prevn(i,n) (i)->_prevn((i)->_itor, (n)) +#define dict_itor_first(i) (i)->_first((i)->_itor) +#define dict_itor_last(i) (i)->_last((i)->_itor) +#define dict_itor_search(i,k) (i)->_search((i)->_itor, (k)) +#define dict_itor_key(i) (i)->_key((i)->_itor) +#define dict_itor_data(i) (i)->_data((i)->_itor) +#define dict_itor_cdata(i) (i)->_cdata((i)->_itor) +#define dict_itor_set_data(i,dat,d) (i)->_setdata((i)->_itor, (dat), (d)) +#define dict_itor_remove(i) (i)->_remove((i)->_itor) +void dict_itor_destroy __P((dict_itor *itor)); + +int dict_int_cmp __P((const void *k1, const void *k2)); +int dict_uint_cmp __P((const void *k1, const void *k2)); +int dict_long_cmp __P((const void *k1, const void *k2)); +int dict_ulong_cmp __P((const void *k1, const void *k2)); +int dict_ptr_cmp __P((const void *k1, const void *k2)); +int dict_str_cmp __P((const void *k1, const void *k2)); + +END_DECL + +/*#include "hashtable.h"*/ +#include "hb_tree.h" +/*#include "pr_tree.h" +#include "rb_tree.h" +#include "sp_tree.h" +#include "tr_tree.h" +#include "wb_tree.h"*/ + +#endif /* !_DICT_H_ */ diff --git a/ompi/mca/coll/libnbc/libdict/dict_private.h b/ompi/mca/coll/libnbc/libdict/dict_private.h new file mode 100644 index 0000000000..5939182e19 --- /dev/null +++ b/ompi/mca/coll/libnbc/libdict/dict_private.h @@ -0,0 +1,84 @@ +/* + * dict_private.h + * + * Private definitions for libdict. + * Copyright (C) 2001 Farooq Mela. + * + * $Id: dict_private.h,v 1.8 2002/01/02 09:14:11 farooq Exp $ + */ + +#ifndef _DICT_PRIVATE_H_ +#define _DICT_PRIVATE_H_ + +#include "dict.h" + +typedef int (*insert_func) __P((void *, void *k, void *d, int o)); +typedef int (*probe_func) __P((void *, void *k, void **d)); +typedef void *(*search_func) __P((void *, const void *k)); +typedef const void *(*csearch_func) __P((const void *, const void *k)); +typedef int (*remove_func) __P((void *, const void *k, int d)); +typedef void (*walk_func) __P((void *, dict_vis_func visit)); +typedef unsigned (*count_func) __P((const void *)); +typedef void (*empty_func) __P((void *, int del)); +typedef void (*destroy_func) __P((void *, int del)); +typedef dict_itor *(*inew_func) __P((void *)); + +typedef void (*idestroy_func) __P((void *)); +typedef int (*valid_func) __P((const void *)); +typedef void (*invalidate_func) __P((void *)); +typedef int (*next_func) __P((void *)); +typedef int (*prev_func) __P((void *)); +typedef int (*nextn_func) __P((void *, unsigned count)); +typedef int (*prevn_func) __P((void *, unsigned count)); +typedef int (*first_func) __P((void *)); +typedef int (*last_func) __P((void *)); +typedef int (*isearch_func) __P((void *, const void *k)); +typedef const void *(*key_func) __P((void *)); +typedef void *(*data_func) __P((void *)); +typedef const void *(*cdata_func) __P((const void *)); +typedef int (*dataset_func) __P((void *, void *d, int del)); +typedef int (*iremove_func) __P((void *, int del)); +typedef int (*icompare_func) __P((void *, void *itor2)); + +#ifndef NDEBUG +# include +# undef ASSERT +# if defined(__GNUC__) +# define ASSERT(expr) \ + if (!(expr)) \ + fprintf(stderr, "\n%s:%d (%s) assertion failed: `%s'\n", \ + __FILE__, __LINE__, __PRETTY_FUNCTION__, #expr), \ + abort() +# else +# define ASSERT(expr) \ + if (!(expr)) \ + fprintf(stderr, "\n%s:%d assertion failed: `%s'\n", \ + __FILE__, __LINE__, #expr), \ + abort() +# endif +#else +# define ASSERT(expr) +#endif + +extern dict_malloc_func _dict_malloc; +extern dict_free_func _dict_free; +#define MALLOC(n) (*_dict_malloc)(n) +#define FREE(p) (*_dict_free)(p) + +#define ABS(a) ((a) < 0 ? -(a) : +(a)) +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) +#define SWAP(a,b,v) v = (a), (a) = (b), (b) = v +#define UNUSED(p) (void)&p + +#if defined(__GNUC__) +# define GCC_INLINE inline +# define GCC_UNUSED __attribute__((__unused__)) +# define GCC_CONST __attribute__((__const__)) +#else +# define GCC_INLINE +# define GCC_UNUSED +# define GCC_CONST +#endif + +#endif /* !_DICT_PRIVATE_H_ */ diff --git a/ompi/mca/coll/libnbc/libdict/hb_tree.c b/ompi/mca/coll/libnbc/libdict/hb_tree.c new file mode 100644 index 0000000000..c3837ed3cb --- /dev/null +++ b/ompi/mca/coll/libnbc/libdict/hb_tree.c @@ -0,0 +1,906 @@ +/* + * hb_tree.c + * + * Implementation of height balanced tree. + * Copyright (C) 2001-2004 Farooq Mela. + * + * $Id: hb_tree.c,v 1.10 2001/11/25 08:30:21 farooq Exp farooq $ + * + * cf. [Gonnet 1984], [Knuth 1998] + */ + +#include + +#include "hb_tree.h" +#include "dict_private.h" + +typedef signed char balance_t; + +typedef struct hb_node hb_node; + +struct hb_node { + void *key; + void *dat; + hb_node *parent; + hb_node *llink; + hb_node *rlink; + balance_t bal; +}; + +struct hb_tree { + hb_node *root; + unsigned count; + dict_cmp_func key_cmp; + dict_del_func key_del; + dict_del_func dat_del; +}; + +struct hb_itor { + hb_tree *tree; + hb_node *node; +}; + +static int rot_left __P((hb_tree *tree, hb_node *node)); +static int rot_right __P((hb_tree *tree, hb_node *node)); +static unsigned node_height __P((const hb_node *node)); +static unsigned node_mheight __P((const hb_node *node)); +static unsigned node_pathlen __P((const hb_node *node, unsigned level)); +static hb_node *node_new __P((void *key, void *dat)); +static hb_node *node_min __P((hb_node *node)); +static hb_node *node_max __P((hb_node *node)); +static hb_node *node_next __P((hb_node *node)); +static hb_node *node_prev __P((hb_node *node)); + +hb_tree * +hb_tree_new(dict_cmp_func key_cmp, dict_del_func key_del, + dict_del_func dat_del) +{ + hb_tree *tree; + + if ((tree = (hb_tree*)MALLOC(sizeof(*tree))) == NULL) + return NULL; + + tree->root = NULL; + tree->count = 0; + tree->key_cmp = key_cmp ? key_cmp : dict_ptr_cmp; + tree->key_del = key_del; + tree->dat_del = dat_del; + + return tree; +} + +dict * +hb_dict_new(dict_cmp_func key_cmp, dict_del_func key_del, + dict_del_func dat_del) +{ + dict *dct; + hb_tree *tree; + + if ((dct = (dict*)MALLOC(sizeof(*dct))) == NULL) + return NULL; + + if ((tree = hb_tree_new(key_cmp, key_del, dat_del)) == NULL) { + FREE(dct); + return NULL; + } + + dct->_object = tree; + dct->_inew = (inew_func)hb_dict_itor_new; + dct->_destroy = (destroy_func)hb_tree_destroy; + dct->_insert = (insert_func)hb_tree_insert; + dct->_probe = (probe_func)hb_tree_probe; + dct->_search = (search_func)hb_tree_search; + dct->_csearch = (csearch_func)hb_tree_csearch; + dct->_remove = (remove_func)hb_tree_remove; + dct->_empty = (empty_func)hb_tree_empty; + dct->_walk = (walk_func)hb_tree_walk; + dct->_count = (count_func)hb_tree_count; + + return dct; +} + +void +hb_tree_destroy(hb_tree *tree, int del) +{ + ASSERT(tree != NULL); + + if (tree->root) + hb_tree_empty(tree, del); + + FREE(tree); +} + +void +hb_tree_empty(hb_tree *tree, int del) +{ + hb_node *node, *parent; + + ASSERT(tree != NULL); + + node = tree->root; + + while (node) { + if (node->llink || node->rlink) { + node = node->llink ? node->llink : node->rlink; + continue; + } + + if (del) { + if (tree->key_del) + tree->key_del(node->key); + if (tree->dat_del) + tree->dat_del(node->dat); + } + + parent = node->parent; + FREE(node); + + if (parent) { + if (parent->llink == node) + parent->llink = NULL; + else + parent->rlink = NULL; + } + node = parent; + } + + tree->root = NULL; + tree->count = 0; +} + +void * +hb_tree_search(hb_tree *tree, const void *key) +{ + int rv; + hb_node *node; + + ASSERT(tree != NULL); + + node = tree->root; + while (node) { + rv = tree->key_cmp(key, node->key); + if (rv < 0) + node = node->llink; + else if (rv > 0) + node = node->rlink; + else + return node->dat; + } + + return NULL; +} + +const void * +hb_tree_csearch(const hb_tree *tree, const void *key) +{ + return hb_tree_csearch((hb_tree *)tree, key); +} + +int +hb_tree_insert(hb_tree *tree, void *key, void *dat, int overwrite) +{ + int rv = 0; + hb_node *node, *parent = NULL, *q = NULL; + + ASSERT(tree != NULL); + + node = tree->root; + while (node) { + rv = tree->key_cmp(key, node->key); + if (rv < 0) + parent = node, node = node->llink; + else if (rv > 0) + parent = node, node = node->rlink; + else { + if (overwrite == 0) + return 1; + if (tree->key_del) + tree->key_del(node->key); + if (tree->dat_del) + tree->dat_del(node->dat); + node->key = key; + node->dat = dat; + return 0; + } + if (parent->bal) + q = parent; + } + + if ((node = node_new(key, dat)) == NULL) + return -1; + if ((node->parent = parent) == NULL) { + tree->root = node; + ASSERT(tree->count == 0); + tree->count = 1; + return 0; + } + if (rv < 0) + parent->llink = node; + else + parent->rlink = node; + + while (parent != q) { + parent->bal = (parent->rlink == node) * 2 - 1; + node = parent; + parent = node->parent; + } + if (q) { + if (q->llink == node) { + if (--q->bal == -2) { + if (q->llink->bal > 0) + rot_left(tree, q->llink); + rot_right(tree, q); + } + } else { + if (++q->bal == +2) { + if (q->rlink->bal < 0) + rot_right(tree, q->rlink); + rot_left(tree, q); + } + } + } + tree->count++; + return 0; +} + +int +hb_tree_probe(hb_tree *tree, void *key, void **dat) +{ + int rv = 0; + hb_node *node, *parent = NULL, *q = NULL; + + ASSERT(tree != NULL); + + node = tree->root; + while (node) { + rv = tree->key_cmp(key, node->key); + if (rv < 0) + parent = node, node = node->llink; + else if (rv > 0) + parent = node, node = node->rlink; + else { + *dat = node->dat; + return 0; + } + if (parent->bal) + q = parent; + } + + if ((node = node_new(key, *dat)) == NULL) + return -1; + if ((node->parent = parent) == NULL) { + tree->root = node; + ASSERT(tree->count == 0); + tree->count = 1; + return 1; + } + if (rv < 0) + parent->llink = node; + else + parent->rlink = node; + + while (parent != q) { + parent->bal = (parent->rlink == node) * 2 - 1; + node = parent; + parent = parent->parent; + } + if (q) { + if (q->llink == node) { + if (--q->bal == -2) { + if (q->llink->bal > 0) + rot_left(tree, q->llink); + rot_right(tree, q); + } + } else { + if (++q->bal == +2) { + if (q->rlink->bal < 0) + rot_right(tree, q->rlink); + rot_left(tree, q); + } + } + } + tree->count++; + return 1; +} + +#define FREE_NODE(n) \ + if (del) { \ + if (tree->key_del) \ + tree->key_del((n)->key); \ + if (tree->dat_del) \ + tree->dat_del((n)->dat); \ + } \ + FREE(n) + +int +hb_tree_remove(hb_tree *tree, const void *key, int del) +{ + int rv, left; + hb_node *node, *out, *parent = NULL; + void *tmp; + + ASSERT(tree != NULL); + + node = tree->root; + while (node) { + rv = tree->key_cmp(key, node->key); + if (rv == 0) + break; + parent = node; + node = rv < 0 ? node->llink : node->rlink; + } + if (node == NULL) + return -1; + + if (node->llink && node->rlink) { + for (out = node->rlink; out->llink; out = out->llink) + /* void */; + SWAP(node->key, out->key, tmp); + SWAP(node->dat, out->dat, tmp); + node = out; + parent = out->parent; + } + + out = node->llink ? node->llink : node->rlink; + FREE_NODE(node); + if (out) + out->parent = parent; + if (parent == NULL) { + tree->root = out; + tree->count--; + return 0; + } + + left = parent->llink == node; + if (left) + parent->llink = out; + else + parent->rlink = out; + + for (;;) { + if (left) { + if (++parent->bal == 0) { + node = parent; + goto higher; + } + if (parent->bal == +2) { + ASSERT(parent->rlink != NULL); + if (parent->rlink->bal < 0) { + rot_right(tree, parent->rlink); + rot_left(tree, parent); + } else { + ASSERT(parent->rlink->rlink != NULL); + if (rot_left(tree, parent) == 0) + break; + } + } else { + break; + } + } else { + if (--parent->bal == 0) { + node = parent; + goto higher; + } + if (parent->bal == -2) { + ASSERT(parent->llink != NULL); + if (parent->llink->bal > 0) { + rot_left(tree, parent->llink); + rot_right(tree, parent); + } else { + ASSERT(parent->llink->llink != NULL); + if (rot_right(tree, parent) == 0) + break; + } + } else { + break; + } + } + + /* Only get here on double rotations or single rotations that changed + * subtree height - in either event, `parent->parent' is positioned + * where `parent' was positioned before any rotations. */ + node = parent->parent; +higher: + if ((parent = node->parent) == NULL) + break; + left = parent->llink == node; + } + tree->count--; + return 0; +} + +const void * +hb_tree_min(const hb_tree *tree) +{ + const hb_node *node; + + ASSERT(tree != NULL); + + if (tree->root == NULL) + return NULL; + + for (node = tree->root; node->llink; node = node->llink) + /* void */; + return node->key; +} + +const void * +hb_tree_max(const hb_tree *tree) +{ + const hb_node *node; + + ASSERT(tree != NULL); + + if ((node = tree->root) == NULL) + return NULL; + + for (; node->rlink; node = node->rlink) + /* void */; + return node->key; +} + +void +hb_tree_walk(hb_tree *tree, dict_vis_func visit) +{ + hb_node *node; + + ASSERT(tree != NULL); + + if (tree->root == NULL) + return; + for (node = node_min(tree->root); node; node = node_next(node)) + if (visit(node->key, node->dat) == 0) + break; +} + +unsigned +hb_tree_count(const hb_tree *tree) +{ + ASSERT(tree != NULL); + + return tree->count; +} + +unsigned +hb_tree_height(const hb_tree *tree) +{ + ASSERT(tree != NULL); + + return tree->root ? node_height(tree->root) : 0; +} + +unsigned +hb_tree_mheight(const hb_tree *tree) +{ + ASSERT(tree != NULL); + + return tree->root ? node_mheight(tree->root) : 0; +} + +unsigned +hb_tree_pathlen(const hb_tree *tree) +{ + ASSERT(tree != NULL); + + return tree->root ? node_pathlen(tree->root, 1) : 0; +} + +static hb_node * +node_new(void *key, void *dat) +{ + hb_node *node; + + if ((node = (hb_node*)MALLOC(sizeof(*node))) == NULL) + return NULL; + + node->key = key; + node->dat = dat; + node->parent = NULL; + node->llink = NULL; + node->rlink = NULL; + node->bal = 0; + + return node; +} + +static hb_node * +node_min(hb_node *node) +{ + ASSERT(node != NULL); + + while (node->llink) + node = node->llink; + return node; +} + +static hb_node * +node_max(hb_node *node) +{ + ASSERT(node != NULL); + + while (node->rlink) + node = node->rlink; + return node; +} + +static hb_node * +node_next(hb_node *node) +{ + hb_node *temp; + + ASSERT(node != NULL); + + if (node->rlink) { + for (node = node->rlink; node->llink; node = node->llink) + /* void */; + return node; + } + temp = node->parent; + while (temp && temp->rlink == node) { + node = temp; + temp = temp->parent; + } + return temp; +} + +static hb_node * +node_prev(hb_node *node) +{ + hb_node *temp; + + ASSERT(node != NULL); + + if (node->llink) { + for (node = node->llink; node->rlink; node = node->rlink) + /* void */; + return node; + } + temp = node->parent; + while (temp && temp->llink == node) { + node = temp; + temp = temp->parent; + } + return temp; +} + +static unsigned +node_height(const hb_node *node) +{ + unsigned l, r; + + ASSERT(node != NULL); + + l = node->llink ? node_height(node->llink) + 1 : 0; + r = node->rlink ? node_height(node->rlink) + 1 : 0; + return MAX(l, r); +} + +static unsigned +node_mheight(const hb_node *node) +{ + unsigned l, r; + + ASSERT(node != NULL); + + l = node->llink ? node_mheight(node->llink) + 1 : 0; + r = node->rlink ? node_mheight(node->rlink) + 1 : 0; + return MIN(l, r); +} + +static unsigned +node_pathlen(const hb_node *node, unsigned level) +{ + unsigned n = 0; + + ASSERT(node != NULL); + + if (node->llink) + n += level + node_pathlen(node->llink, level + 1); + if (node->rlink) + n += level + node_pathlen(node->rlink, level + 1); + return n; +} + +/* + * rot_left(T, B): + * + * / / + * B D + * / \ / \ + * A D ==> B E + * / \ / \ + * C E A C + * + */ +static int +rot_left(hb_tree *tree, hb_node *node) +{ + int hc; + hb_node *rlink, *parent; + + ASSERT(tree != NULL); + ASSERT(node != NULL); + ASSERT(node->rlink != NULL); + + rlink = node->rlink; + node->rlink = rlink->llink; + if (rlink->llink) + rlink->llink->parent = node; + parent = node->parent; + rlink->parent = parent; + if (parent) { + if (parent->llink == node) + parent->llink = rlink; + else + parent->rlink = rlink; + } else { + tree->root = rlink; + } + rlink->llink = node; + node->parent = rlink; + + hc = rlink->bal != 0; + node->bal -= 1 + MAX(rlink->bal, 0); + rlink->bal -= 1 - MIN(node->bal, 0); + return hc; +} + +/* + * rot_right(T, D): + * + * / / + * D B + * / \ / \ + * B E ==> A D + * / \ / \ + * A C C E + * + */ +static int +rot_right(hb_tree *tree, hb_node *node) +{ + int hc; + hb_node *llink, *parent; + + ASSERT(tree != NULL); + ASSERT(node != NULL); + ASSERT(node->llink != NULL); + + llink = node->llink; + node->llink = llink->rlink; + if (llink->rlink) + llink->rlink->parent = node; + parent = node->parent; + llink->parent = parent; + if (parent) { + if (parent->llink == node) + parent->llink = llink; + else + parent->rlink = llink; + } else { + tree->root = llink; + } + llink->rlink = node; + node->parent = llink; + + hc = llink->bal != 0; + node->bal += 1 - MIN(llink->bal, 0); + llink->bal += 1 + MAX(node->bal, 0); + return hc; +} + +hb_itor * +hb_itor_new(hb_tree *tree) +{ + hb_itor *itor; + + ASSERT(tree != NULL); + + if ((itor = (hb_itor*)MALLOC(sizeof(*itor))) == NULL) + return NULL; + + itor->tree = tree; + hb_itor_first(itor); + return itor; +} + +dict_itor * +hb_dict_itor_new(hb_tree *tree) +{ + dict_itor *itor; + + ASSERT(tree != NULL); + + if ((itor = (dict_itor*)MALLOC(sizeof(*itor))) == NULL) + return NULL; + + if ((itor->_itor = hb_itor_new(tree)) == NULL) { + FREE(itor); + return NULL; + } + + itor->_destroy = (idestroy_func)hb_itor_destroy; + itor->_valid = (valid_func)hb_itor_valid; + itor->_invalid = (invalidate_func)hb_itor_invalidate; + itor->_next = (next_func)hb_itor_next; + itor->_prev = (prev_func)hb_itor_prev; + itor->_nextn = (nextn_func)hb_itor_nextn; + itor->_prevn = (prevn_func)hb_itor_prevn; + itor->_first = (first_func)hb_itor_first; + itor->_last = (last_func)hb_itor_last; + itor->_search = (isearch_func)hb_itor_search; + itor->_key = (key_func)hb_itor_key; + itor->_data = (data_func)hb_itor_data; + itor->_cdata = (cdata_func)hb_itor_cdata; + itor->_setdata = (dataset_func)hb_itor_set_data; + + return itor; +} + +void +hb_itor_destroy(hb_itor *itor) +{ + ASSERT(itor != NULL); + + FREE(itor); +} + +#define RETVALID(itor) return itor->node != NULL + +int +hb_itor_valid(const hb_itor *itor) +{ + ASSERT(itor != NULL); + + RETVALID(itor); +} + +void +hb_itor_invalidate(hb_itor *itor) +{ + ASSERT(itor != NULL); + + itor->node = NULL; +} + +int +hb_itor_next(hb_itor *itor) +{ + ASSERT(itor != NULL); + + if (itor->node == NULL) + hb_itor_first(itor); + else + itor->node = node_next(itor->node); + RETVALID(itor); +} + +int +hb_itor_prev(hb_itor *itor) +{ + ASSERT(itor != NULL); + + if (itor->node == NULL) + hb_itor_last(itor); + else + itor->node = node_prev(itor->node); + RETVALID(itor); +} + +int +hb_itor_nextn(hb_itor *itor, unsigned count) +{ + ASSERT(itor != NULL); + + if (count) { + if (itor->node == NULL) { + hb_itor_first(itor); + count--; + } + + while (count-- && itor->node) + itor->node = node_next(itor->node); + } + + RETVALID(itor); +} + +int +hb_itor_prevn(hb_itor *itor, unsigned count) +{ + ASSERT(itor != NULL); + + if (count) { + if (itor->node == NULL) { + hb_itor_last(itor); + count--; + } + + while (count-- && itor->node) + itor->node = node_prev(itor->node); + } + + RETVALID(itor); +} + +int +hb_itor_first(hb_itor *itor) +{ + hb_tree *t; + + ASSERT(itor != NULL); + + t = itor->tree; + itor->node = t->root ? node_min(t->root) : NULL; + RETVALID(itor); +} + +int +hb_itor_last(hb_itor *itor) +{ + hb_tree *t; + + ASSERT(itor != NULL); + + t = itor->tree; + itor->node = t->root ? node_max(t->root) : NULL; + RETVALID(itor); +} + +int +hb_itor_search(hb_itor *itor, const void *key) +{ + int rv; + hb_node *node; + dict_cmp_func cmp; + + ASSERT(itor != NULL); + + cmp = itor->tree->key_cmp; + for (node = itor->tree->root; node;) { + rv = cmp(key, node->key); + if (rv == 0) + break; + node = rv < 0 ? node->llink : node->rlink; + } + itor->node = node; + RETVALID(itor); +} + +const void * +hb_itor_key(const hb_itor *itor) +{ + ASSERT(itor != NULL); + + return itor->node ? itor->node->key : NULL; +} + +void * +hb_itor_data(hb_itor *itor) +{ + ASSERT(itor != NULL); + + return itor->node ? itor->node->dat : NULL; +} + +const void * +hb_itor_cdata(const hb_itor *itor) +{ + ASSERT(itor != NULL); + + return itor->node ? itor->node->dat : NULL; +} + +int +hb_itor_set_data(hb_itor *itor, void *dat, int del) +{ + ASSERT(itor != NULL); + + if (itor->node == NULL) + return -1; + + if (del && itor->tree->dat_del) + itor->tree->dat_del(itor->node->dat); + itor->node->dat = dat; + return 0; +} diff --git a/ompi/mca/coll/libnbc/libdict/hb_tree.h b/ompi/mca/coll/libnbc/libdict/hb_tree.h new file mode 100644 index 0000000000..2de8af6d19 --- /dev/null +++ b/ompi/mca/coll/libnbc/libdict/hb_tree.h @@ -0,0 +1,64 @@ +/* + * hb_tree.h + * + * Interface for height balanced tree. + * Copyright (C) 2001 Farooq Mela. + * + * $Id: hb_tree.h,v 1.2 2001/09/10 06:46:41 farooq Exp $ + */ + +#ifndef _HB_TREE_H_ +#define _HB_TREE_H_ + +#include "dict.h" + +BEGIN_DECL + +struct hb_tree; +typedef struct hb_tree hb_tree; + +hb_tree *hb_tree_new __P((dict_cmp_func key_cmp, dict_del_func key_del, + dict_del_func dat_del)); +dict *hb_dict_new __P((dict_cmp_func key_cmp, dict_del_func key_del, + dict_del_func dat_del)); +void hb_tree_destroy __P((hb_tree *tree, int del)); + +int hb_tree_insert __P((hb_tree *tree, void *key, void *dat, int overwrite)); +int hb_tree_probe __P((hb_tree *tree, void *key, void **dat)); +void *hb_tree_search __P((hb_tree *tree, const void *key)); +const void *hb_tree_csearch __P((const hb_tree *tree, const void *key)); +int hb_tree_remove __P((hb_tree *tree, const void *key, int del)); +void hb_tree_empty __P((hb_tree *tree, int del)); +void hb_tree_walk __P((hb_tree *tree, dict_vis_func visit)); +unsigned hb_tree_count __P((const hb_tree *tree)); +unsigned hb_tree_height __P((const hb_tree *tree)); +unsigned hb_tree_mheight __P((const hb_tree *tree)); +unsigned hb_tree_pathlen __P((const hb_tree *tree)); +const void *hb_tree_min __P((const hb_tree *tree)); +const void *hb_tree_max __P((const hb_tree *tree)); + +struct hb_itor; +typedef struct hb_itor hb_itor; + +hb_itor *hb_itor_new __P((hb_tree *tree)); +dict_itor *hb_dict_itor_new __P((hb_tree *tree)); +void hb_itor_destroy __P((hb_itor *tree)); + +int hb_itor_valid __P((const hb_itor *itor)); +void hb_itor_invalidate __P((hb_itor *itor)); +int hb_itor_next __P((hb_itor *itor)); +int hb_itor_prev __P((hb_itor *itor)); +int hb_itor_nextn __P((hb_itor *itor, unsigned count)); +int hb_itor_prevn __P((hb_itor *itor, unsigned count)); +int hb_itor_first __P((hb_itor *itor)); +int hb_itor_last __P((hb_itor *itor)); +int hb_itor_search __P((hb_itor *itor, const void *key)); +const void *hb_itor_key __P((const hb_itor *itor)); +void *hb_itor_data __P((hb_itor *itor)); +const void *hb_itor_cdata __P((const hb_itor *itor)); +int hb_itor_set_data __P((hb_itor *itor, void *dat, int del)); +int hb_itor_remove __P((hb_itor *itor, int del)); + +END_DECL + +#endif /* !_HB_TREE_H_ */ diff --git a/ompi/mca/coll/libnbc/nbc.c b/ompi/mca/coll/libnbc/nbc.c new file mode 100644 index 0000000000..fd42bc43c6 --- /dev/null +++ b/ompi/mca/coll/libnbc/nbc.c @@ -0,0 +1,720 @@ +/* + * Copyright (c) 2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2006 The Technical University of Chemnitz. All + * rights reserved. + * + * Author(s): Torsten Hoefler + * + */ +#include "nbc_internal.h" + +/* only used in this file */ +static inline int NBC_Start_round(NBC_Handle *handle); + +/* #define NBC_TIMING */ + +#ifdef NBC_TIMING +static double Isend_time=0, Irecv_time=0, Wait_time=0, Test_time=0; +void NBC_Reset_times() { + Isend_time=Irecv_time=Wait_time=Test_time=0; +} +void NBC_Print_times(double div) { + printf("*** NBC_TIMES: Isend: %lf, Irecv: %lf, Wait: %lf, Test: %lf\n", Isend_time*1e6/div, Irecv_time*1e6/div, Wait_time*1e6/div, Test_time*1e6/div); +} +#endif + +/* is NBC globally initialized */ +static char GNBC_Initialized=0; + +/* the keyval (global) */ +static int gkeyval=MPI_KEYVAL_INVALID; + +static int NBC_Key_copy(MPI_Comm oldcomm, int keyval, void *extra_state, void *attribute_val_in, void *attribute_val_out, int *flag) { + /* delete the attribute in the new comm - it will be created at the + * first usage */ + *flag = 0; + + return MPI_SUCCESS; +} + +static int NBC_Key_delete(MPI_Comm comm, int keyval, void *attribute_val, void *extra_state) { + NBC_Comminfo *comminfo; + + if(keyval == gkeyval) { + comminfo=(NBC_Comminfo*)attribute_val; + free((void*)comminfo); + } else { + printf("Got wrong keyval!(%i)\n", keyval); + } + + return MPI_SUCCESS; +} + +/* allocates a new schedule array */ +int NBC_Sched_create(NBC_Schedule* schedule) { + + *schedule=malloc(2*sizeof(int)); + if(*schedule == NULL) { return NBC_OOR; } + *(int*)*schedule=2*sizeof(int); + *(((int*)*schedule)+1)=0; + + return NBC_OK; +} + +/* this function puts a send into the schedule */ +int NBC_Sched_send(void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule) { + int size; + NBC_Args_send* send_args; + + /* get size of actual schedule */ + NBC_GET_SIZE(*schedule, size); + /*printf("schedule is %i bytes\n", size);*/ + *schedule = (NBC_Schedule)realloc(*schedule, size+sizeof(NBC_Args_send)+sizeof(NBC_Fn_type)); + if(*schedule == NULL) { printf("Error in realloc()\n"); return NBC_OOR; } + + /* adjust the function type */ + *(NBC_Fn_type*)((char*)*schedule+size)=SEND; + + /* store the passed arguments */ + send_args = (NBC_Args_send*)((char*)*schedule+size+sizeof(NBC_Fn_type)); + send_args->buf=buf; + send_args->tmpbuf=tmpbuf; + send_args->count=count; + send_args->datatype=datatype; + send_args->dest=dest; + + /* increase number of elements in schedule */ + NBC_INC_NUM_ROUND(*schedule); + NBC_DEBUG(10, "adding send - ends at byte %i\n", (int)(size+sizeof(NBC_Args_send)+sizeof(NBC_Fn_type))); + + /* increase size of schedule */ + NBC_INC_SIZE(*schedule, sizeof(NBC_Args_send)+sizeof(NBC_Fn_type)); + + return NBC_OK; +} + +/* this function puts a receive into the schedule */ +int NBC_Sched_recv(void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule) { + int size; + NBC_Args_recv* recv_args; + + /* get size of actual schedule */ + NBC_GET_SIZE(*schedule, size); + /*printf("schedule is %i bytes\n", size);*/ + *schedule = (NBC_Schedule)realloc(*schedule, size+sizeof(NBC_Args_recv)+sizeof(NBC_Fn_type)); + if(*schedule == NULL) { printf("Error in realloc()\n"); return NBC_OOR; } + + /* adjust the function type */ + *(NBC_Fn_type*)((char*)*schedule+size)=RECV; + + /* store the passed arguments */ + recv_args=(NBC_Args_recv*)((char*)*schedule+size+sizeof(NBC_Fn_type)); + recv_args->buf=buf; + recv_args->tmpbuf=tmpbuf; + recv_args->count=count; + recv_args->datatype=datatype; + recv_args->source=source; + + /* increase number of elements in schedule */ + NBC_INC_NUM_ROUND(*schedule); + NBC_DEBUG(10, "adding receive - ends at byte %i\n", (int)(size+sizeof(NBC_Args_recv)+sizeof(NBC_Fn_type))); + + /* increase size of schedule */ + NBC_INC_SIZE(*schedule, sizeof(NBC_Args_recv)+sizeof(NBC_Fn_type)); + + return NBC_OK; +} + +/* this function puts an operation into the schedule */ +int NBC_Sched_op(void *buf3, char tmpbuf3, void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype, MPI_Op op, NBC_Schedule *schedule) { + int size; + NBC_Args_op* op_args; + + /* get size of actual schedule */ + NBC_GET_SIZE(*schedule, size); + /*printf("schedule is %i bytes\n", size);*/ + *schedule = (NBC_Schedule)realloc(*schedule, size+sizeof(NBC_Args_op)+sizeof(NBC_Fn_type)); + if(*schedule == NULL) { printf("Error in realloc()\n"); return NBC_OOR; } + + /* adjust the function type */ + *(NBC_Fn_type*)((char*)*schedule+size)=OP; + + /* store the passed arguments */ + op_args=(NBC_Args_op*)((char*)*schedule+size+sizeof(NBC_Fn_type)); + op_args->buf1=buf1; + op_args->buf2=buf2; + op_args->buf3=buf3; + op_args->tmpbuf1=tmpbuf1; + op_args->tmpbuf2=tmpbuf2; + op_args->tmpbuf3=tmpbuf3; + op_args->count=count; + op_args->op=op; + op_args->datatype=datatype; + + /* increase number of elements in schedule */ + NBC_INC_NUM_ROUND(*schedule); + NBC_DEBUG(10, "adding op - ends at byte %i\n", (int)(size+sizeof(NBC_Args_op)+sizeof(NBC_Fn_type))); + + /* increase size of schedule */ + NBC_INC_SIZE(*schedule, sizeof(NBC_Args_op)+sizeof(NBC_Fn_type)); + + return NBC_OK; +} + +/* this function puts a copy into the schedule */ +int NBC_Sched_copy(void *src, char tmpsrc, int srccount, MPI_Datatype srctype, void *tgt, char tmptgt, int tgtcount, MPI_Datatype tgttype, NBC_Schedule *schedule) { + int size; + NBC_Args_copy* copy_args; + + /* get size of actual schedule */ + NBC_GET_SIZE(*schedule, size); + /*printf("schedule is %i bytes\n", size);*/ + *schedule = (NBC_Schedule)realloc(*schedule, size+sizeof(NBC_Args_copy)+sizeof(NBC_Fn_type)); + if(*schedule == NULL) { printf("Error in realloc()\n"); return NBC_OOR; } + + /* adjust the function type */ + *(NBC_Fn_type*)((char*)*schedule+size)=COPY; + + /* store the passed arguments */ + copy_args = (NBC_Args_copy*)((char*)*schedule+size+sizeof(NBC_Fn_type)); + copy_args->src=src; + copy_args->tmpsrc=tmpsrc; + copy_args->srccount=srccount; + copy_args->srctype=srctype; + copy_args->tgt=tgt; + copy_args->tmptgt=tmptgt; + copy_args->tgtcount=tgtcount; + copy_args->tgttype=tgttype; + + /* increase number of elements in schedule */ + NBC_INC_NUM_ROUND(*schedule); + NBC_DEBUG(10, "adding copy - ends at byte %i\n", (int)(size+sizeof(NBC_Args_copy)+sizeof(NBC_Fn_type))); + + /* increase size of schedule */ + NBC_INC_SIZE(*schedule, sizeof(NBC_Args_copy)+sizeof(NBC_Fn_type)); + + return NBC_OK; +} + +/* this function puts a unpack into the schedule */ +int NBC_Sched_unpack(void *inbuf, char tmpinbuf, int count, MPI_Datatype datatype, void *outbuf, char tmpoutbuf, NBC_Schedule *schedule) { + int size; + NBC_Args_unpack* unpack_args; + + /* get size of actual schedule */ + NBC_GET_SIZE(*schedule, size); + /*printf("schedule is %i bytes\n", size);*/ + *schedule = (NBC_Schedule)realloc(*schedule, size+sizeof(NBC_Args_unpack)+sizeof(NBC_Fn_type)); + if(*schedule == NULL) { printf("Error in realloc()\n"); return NBC_OOR; } + + /* adjust the function type */ + *(NBC_Fn_type*)((char*)*schedule+size)=UNPACK; + + /* store the passed arguments */ + unpack_args = (NBC_Args_unpack*)((char*)*schedule+size+sizeof(NBC_Fn_type)); + unpack_args->inbuf=inbuf; + unpack_args->tmpinbuf=tmpinbuf; + unpack_args->count=count; + unpack_args->datatype=datatype; + unpack_args->outbuf=outbuf; + unpack_args->tmpoutbuf=tmpoutbuf; + + /* increase number of elements in schedule */ + NBC_INC_NUM_ROUND(*schedule); + NBC_DEBUG(10, "adding unpack - ends at byte %i\n", (int)(size+sizeof(NBC_Args_unpack)+sizeof(NBC_Fn_type))); + + /* increase size of schedule */ + NBC_INC_SIZE(*schedule, sizeof(NBC_Args_unpack)+sizeof(NBC_Fn_type)); + + return NBC_OK; +} + +/* this function ends a round of a schedule */ +int NBC_Sched_barrier(NBC_Schedule *schedule) { + int size; + + /* get size of actual schedule */ + NBC_GET_SIZE(*schedule, size); + /*printf("round terminated at %i bytes\n", size);*/ + *schedule = (NBC_Schedule)realloc(*schedule, size+sizeof(char)+sizeof(int)); + if(*schedule == NULL) { printf("Error in realloc()\n"); return NBC_OOR; } + + /* add the barrier char (1) because another round follows */ + *(char*)((char*)*schedule+size)=1; + + /* set round count elements = 0 for new round */ + *(int*)((char*)*schedule+size+sizeof(char))=0; + NBC_DEBUG(10, "ending round at byte %i\n", (int)(size+sizeof(char)+sizeof(int))); + + /* increase size of schedule */ + NBC_INC_SIZE(*schedule, sizeof(char)+sizeof(int)); + + return NBC_OK; +} + +/* this function ends a schedule */ +int NBC_Sched_commit(NBC_Schedule *schedule) { + int size; + + /* get size of actual schedule */ + NBC_GET_SIZE(*schedule, size); + /*printf("schedule terminated at %i bytes\n", size);*/ + *schedule = (NBC_Schedule)realloc(*schedule, size+sizeof(char)); + if(*schedule == NULL) { printf("Error in realloc()\n"); return NBC_OOR; } + + /* add the barrier char (0) because this is the last round */ + *(char*)((char*)*schedule+size)=0; + NBC_DEBUG(10, "closing schedule %p at byte %i\n", *schedule, (int)(size+sizeof(char))); + + /* increase size of schedule */ + NBC_INC_SIZE(*schedule, sizeof(char)); + + return NBC_OK; +} + +/* finishes a request + * + * to be called *only* from the progress thread !!! */ +static inline int NBC_Free(NBC_Handle* handle) { + +#ifdef NBC_CACHE_SCHEDULE + /* do not free schedule because it is in the cache */ + handle->schedule = NULL; +#else + if(handle->schedule != NULL) { + /* free schedule */ + free((void*)*(handle->schedule)); + free((void*)handle->schedule); + handle->schedule = NULL; + } +#endif + + /* if the nbc_I attached some data */ + /* problems with schedule cache here, see comment (TODO) in + * nbc_internal.h */ + if(NULL != handle->tmpbuf) { + free((void*)handle->tmpbuf); + handle->tmpbuf = NULL; + } + + return NBC_OK; +} + +/* progresses a request + * + * to be called *only* from the progress thread !!! */ +int NBC_Progress(NBC_Handle *handle) { + int flag, res, ret=NBC_CONTINUE; + long size; + char *delim; + + /* the handle is done if there is no schedule attached */ + if(handle->schedule != NULL) { + + if((handle->req_count > 0) && (handle->req_array != NULL)) { + NBC_DEBUG(50, "NBC_Progress: testing for %i requests\n", handle->req_count); +#ifdef NBC_TIMING + Test_time -= MPI_Wtime(); +#endif +#ifdef HAVE_OMPI + /*res = ompi_request_test_all(handle->req_count, handle->req_array, &flag, MPI_STATUSES_IGNORE);*/ + res = MPI_Testall(handle->req_count, handle->req_array, &flag, MPI_STATUSES_IGNORE); + if(res != OMPI_SUCCESS) { printf("MPI Error in MPI_Testall() (%i)\n", res); ret=res; goto error; } +#endif +#ifdef NBC_TIMING + Test_time += MPI_Wtime(); +#endif + } else { + flag = 1; /* we had no open requests -> proceed to next round */ + } + + /* a round is finished */ + if(flag) { + /* adjust delim to start of current round */ + NBC_DEBUG(5, "NBC_Progress: going in schedule %p to row-offset: %li\n", *handle->schedule, handle->row_offset); + delim = (char*)*handle->schedule + handle->row_offset; + NBC_DEBUG(10, "delim: %p\n", delim); + NBC_GET_ROUND_SIZE(delim, size); + NBC_DEBUG(10, "size: %li\n", size); + /* adjust delim to end of current round -> delimiter */ + delim = delim + size; + + if(handle->req_array != NULL) { + /* free request array */ + free((void*)handle->req_array); + handle->req_array = NULL; + } + handle->req_count = 0; + + if(*delim == 0) { + /* this was the last round - we're done */ + NBC_DEBUG(5, "NBC_Progress last round finished - we're done\n"); + + res = NBC_Free(handle); + if((NBC_OK != res)) { printf("Error in NBC_Free() (%i)\n", res); ret=res; goto error; } + + return NBC_OK; + } else { + NBC_DEBUG(5, "NBC_Progress round finished - goto next round\n"); + /* move delim to start of next round */ + delim = delim+1; + /* initializing handle for new virgin round */ + handle->row_offset = (long)delim - (long)*handle->schedule; + /* kick it off */ + res = NBC_Start_round(handle); + if(NBC_OK != res) { printf("Error in NBC_Start_round() (%i)\n", res); ret=res; goto error; } + } + } + } else { + ret= NBC_OK; + } + +error: + return ret; +} + +static inline int NBC_Start_round(NBC_Handle *handle) { + int *numptr; /* number of operations */ + int i, res, ret=NBC_OK; + NBC_Fn_type *typeptr; + NBC_Args_send *sendargs; + NBC_Args_recv *recvargs; + NBC_Args_op *opargs; + NBC_Args_copy *copyargs; + NBC_Args_unpack *unpackargs; + NBC_Schedule myschedule; + void *buf1, *buf2, *buf3; + + /* get schedule address */ + myschedule = (NBC_Schedule*)((char*)*handle->schedule + handle->row_offset); + + numptr = (int*)myschedule; + NBC_DEBUG(10, "start_round round at address %p : posting %i operations\n", myschedule, *numptr); + + /* typeptr is increased by sizeof(int) bytes to point to type */ + typeptr = (NBC_Fn_type*)(numptr+1); + for (i=0; i<*numptr; i++) { + /* go sizeof op-data forward */ + switch(*typeptr) { + case SEND: + NBC_DEBUG(5," SEND (offset %li) ", (long)typeptr-(long)myschedule); + sendargs = (NBC_Args_send*)(typeptr+1); + NBC_DEBUG(5,"*buf: %p, count: %i, type: %lu, dest: %i, tag: %i)\n", sendargs->buf, sendargs->count, (unsigned long)sendargs->datatype, sendargs->dest, handle->tag); + typeptr = (NBC_Fn_type*)(((NBC_Args_send*)typeptr)+1); + /* get an additional request */ + handle->req_count++; + /* get buffer */ + if(sendargs->tmpbuf) + buf1=(char*)handle->tmpbuf+(long)sendargs->buf; + else + buf1=sendargs->buf; +#ifdef NBC_TIMING + Isend_time -= MPI_Wtime(); +#endif +#ifdef HAVE_OMPI + handle->req_array = (MPI_Request*)realloc((void*)handle->req_array, (handle->req_count)*sizeof(MPI_Request)); + NBC_CHECK_NULL(handle->req_array); + /*res = MCA_PML_CALL(isend_init(buf1, sendargs->count, sendargs->datatype, sendargs->dest, handle->tag, MCA_PML_BASE_SEND_STANDARD, handle->mycomm, handle->req_array+handle->req_count-1)); + printf("MPI_Isend(%lu, %i, %lu, %i, %i, %lu) (%i)\n", (unsigned long)buf1, sendargs->count, (unsigned long)sendargs->datatype, sendargs->dest, handle->tag, (unsigned long)handle->mycomm, res);*/ + res = MPI_Isend(buf1, sendargs->count, sendargs->datatype, sendargs->dest, handle->tag, handle->mycomm, handle->req_array+handle->req_count-1); + if(OMPI_SUCCESS != res) { printf("Error in MPI_Isend(%lu, %i, %lu, %i, %i, %lu) (%i)\n", (unsigned long)buf1, sendargs->count, (unsigned long)sendargs->datatype, sendargs->dest, handle->tag, (unsigned long)handle->mycomm, res); ret=res; goto error; } +#endif +#ifdef NBC_TIMING + Isend_time += MPI_Wtime(); +#endif + break; + case RECV: + NBC_DEBUG(5, " RECV (offset %li) ", (long)typeptr-(long)myschedule); + recvargs = (NBC_Args_recv*)(typeptr+1); + NBC_DEBUG(5, "*buf: %p, count: %i, type: %lu, source: %i, tag: %i)\n", recvargs->buf, recvargs->count, (unsigned long)recvargs->datatype, recvargs->source, handle->tag); + typeptr = (NBC_Fn_type*)(((NBC_Args_recv*)typeptr)+1); + /* get an additional request - TODO: req_count NOT thread safe */ + handle->req_count++; + /* get buffer */ + if(recvargs->tmpbuf) { + buf1=(char*)handle->tmpbuf+(long)recvargs->buf; + } else { + buf1=recvargs->buf; + } +#ifdef NBC_TIMING + Irecv_time -= MPI_Wtime(); +#endif +#ifdef HAVE_OMPI + handle->req_array = (MPI_Request*)realloc((void*)handle->req_array, (handle->req_count)*sizeof(MPI_Request)); + NBC_CHECK_NULL(handle->req_array); + /*res = MCA_PML_CALL(irecv(buf1, recvargs->count, recvargs->datatype, recvargs->source, handle->tag, handle->mycomm, handle->req_array+handle->req_count-1)); + printf("MPI_Irecv(%lu, %i, %lu, %i, %i, %lu) (%i)\n", (unsigned long)buf1, recvargs->count, (unsigned long)recvargs->datatype, recvargs->source, handle->tag, (unsigned long)handle->mycomm, res); */ + res = MPI_Irecv(buf1, recvargs->count, recvargs->datatype, recvargs->source, handle->tag, handle->mycomm, handle->req_array+handle->req_count-1); + if(OMPI_SUCCESS != res) { printf("Error in MPI_Irecv(%lu, %i, %lu, %i, %i, %lu) (%i)\n", (unsigned long)buf1, recvargs->count, (unsigned long)recvargs->datatype, recvargs->source, handle->tag, (unsigned long)handle->mycomm, res); ret=res; goto error; } +#endif +#ifdef NBC_TIMING + Irecv_time += MPI_Wtime(); +#endif + break; + case OP: + NBC_DEBUG(5, " OP (offset %li) ", (long)typeptr-(long)myschedule); + opargs = (NBC_Args_op*)(typeptr+1); + NBC_DEBUG(5, "*buf1: %p, buf2: %p, count: %i, type: %lu)\n", opargs->buf1, opargs->buf2, opargs->count, (unsigned long)opargs->datatype); + typeptr = (NBC_Fn_type*)((NBC_Args_op*)typeptr+1); + /* get buffers */ + if(opargs->tmpbuf1) + buf1=(char*)handle->tmpbuf+(long)opargs->buf1; + else + buf1=opargs->buf1; + if(opargs->tmpbuf2) + buf2=(char*)handle->tmpbuf+(long)opargs->buf2; + else + buf2=opargs->buf2; + if(opargs->tmpbuf3) + buf3=(char*)handle->tmpbuf+(long)opargs->buf3; + else + buf3=opargs->buf3; + res = NBC_Operation(buf3, buf1, buf2, opargs->op, opargs->datatype, opargs->count); + if(res != NBC_OK) { printf("NBC_Operation() failed (code: %i)\n", res); ret=res; goto error; } + break; + case COPY: + NBC_DEBUG(5, " COPY (offset %li) ", (long)typeptr-(long)myschedule); + copyargs = (NBC_Args_copy*)(typeptr+1); + NBC_DEBUG(5, "*src: %lu, srccount: %i, srctype: %lu, *tgt: %lu, tgtcount: %i, tgttype: %lu)\n", (unsigned long)copyargs->src, copyargs->srccount, (unsigned long)copyargs->srctype, (unsigned long)copyargs->tgt, copyargs->tgtcount, (unsigned long)copyargs->tgttype); + typeptr = (NBC_Fn_type*)((NBC_Args_copy*)typeptr+1); + /* get buffers */ + if(copyargs->tmpsrc) + buf1=(char*)handle->tmpbuf+(long)copyargs->src; + else + buf1=copyargs->src; + if(copyargs->tmptgt) + buf2=(char*)handle->tmpbuf+(long)copyargs->tgt; + else + buf2=copyargs->tgt; + res = NBC_Copy(buf1, copyargs->srccount, copyargs->srctype, buf2, copyargs->tgtcount, copyargs->tgttype, handle->mycomm); + if(res != NBC_OK) { printf("NBC_Copy() failed (code: %i)\n", res); ret=res; goto error; } + break; + case UNPACK: + NBC_DEBUG(5, " UNPACK (offset %li) ", (long)typeptr-(long)myschedule); + unpackargs = (NBC_Args_unpack*)(typeptr+1); + NBC_DEBUG(5, "*src: %lu, srccount: %i, srctype: %lu, *tgt: %lu\n", (unsigned long)unpackargs->inbuf, unpackargs->count, (unsigned long)unpackargs->datatype, (unsigned long)unpackargs->outbuf); + typeptr = (NBC_Fn_type*)((NBC_Args_unpack*)typeptr+1); + /* get buffers */ + if(unpackargs->tmpinbuf) + buf1=(char*)handle->tmpbuf+(long)unpackargs->inbuf; + else + buf1=unpackargs->outbuf; + if(unpackargs->tmpoutbuf) + buf2=(char*)handle->tmpbuf+(long)unpackargs->outbuf; + else + buf2=unpackargs->outbuf; + res = NBC_Unpack(buf1, unpackargs->count, unpackargs->datatype, buf2, handle->mycomm); + if(res != NBC_OK) { printf("NBC_Unpack() failed (code: %i)\n", res); ret=res; goto error; } + break; + default: + printf("NBC_Start_round: bad type %li at offset %li\n", (long)*typeptr, (long)typeptr-(long)myschedule); + ret=NBC_BAD_SCHED; + goto error; + } + /* increase ptr by size of fn_type enum */ + typeptr = (NBC_Fn_type*)((NBC_Fn_type*)typeptr+1); + } + + /* check if we can make progress - not in the first round, this allows us to leave the + * initialization faster and to reach more overlap + * + * threaded case: calling progress in the first round can lead to a + * deadlock if NBC_Free is called in this round :-( */ + if(handle->row_offset != sizeof(int)) { + res = NBC_Progress(handle); + if((NBC_OK != res) && (NBC_CONTINUE != res)) { printf("Error in NBC_Progress() (%i)\n", res); ret=res; goto error; } + } + +error: + return ret; +} + +static inline int NBC_Initialize(void) { + GNBC_Initialized = 1; + + return NBC_OK; +} + +int NBC_Init_handle(struct ompi_communicator_t *comm, ompi_coll_libnbc_request_t **request, ompi_coll_libnbc_module_t *module) +{ + int res, flag; + NBC_Comminfo *comminfo; + + NBC_Handle *handle = NULL; + + handle->tmpbuf = NULL; + handle->req_count = 0; + handle->req_array = NULL; + handle->comm = comm; + handle->schedule = NULL; + /* first int is the schedule size */ + handle->row_offset = sizeof(int); + + /******************** Do the tag and shadow comm administration ... ***************/ + + /* otherwise we have to do the normal attribute stuff :-( */ + /* keyval is not initialized yet, we have to init it */ + if(MPI_KEYVAL_INVALID == gkeyval) { + res = MPI_Keyval_create(NBC_Key_copy, NBC_Key_delete, &(gkeyval), NULL); + if((MPI_SUCCESS != res)) { printf("Error in MPI_Keyval_create() (%i)\n", res); return res; } + } + + res = MPI_Attr_get(comm, gkeyval, &comminfo, &flag); + if((MPI_SUCCESS != res)) { printf("Error in MPI_Attr_get() (%i)\n", res); return res; } + + if (flag) { + /* we found it */ + comminfo->tag++; + } else { + /* we have to create a new one */ + comminfo = NBC_Init_comm(comm); + if(comminfo == NULL) { printf("Error in NBC_Init_comm() %i\n", res); return NBC_OOR; } + } + handle->tag=comminfo->tag; + handle->mycomm=comminfo->mycomm; + /*printf("got comminfo: %lu tag: %i\n", comminfo, comminfo->tag);*/ + + /* reset counter ... */ + if(handle->tag == 32767) { + handle->tag=1; + comminfo->tag=1; + NBC_DEBUG(2,"resetting tags ...\n"); + } + + /******************** end of tag and shadow comm administration ... ***************/ + handle->comminfo = comminfo; + + NBC_DEBUG(3, "got tag %i\n", handle->tag); + + return NBC_OK; +} + +NBC_Comminfo* NBC_Init_comm(MPI_Comm comm) { + int res; + NBC_Comminfo *comminfo; + + comminfo = (NBC_Comminfo*)malloc(sizeof(NBC_Comminfo)); + if(comminfo == NULL) { printf("Error in malloc()\n"); return NULL; } + + /* set tag to 1 */ + comminfo->tag=1; + /* dup and save shadow communicator */ + res = MPI_Comm_dup(comm, &(comminfo->mycomm)); + if((MPI_SUCCESS != res)) { printf("Error in MPI_Comm_dup() (%i)\n", res); return NULL; } + NBC_DEBUG(1, "created a shadow communicator for %lu ... %lu\n", (unsigned long)comm, (unsigned long)comminfo->mycomm); + +#ifdef NBC_CACHE_SCHEDULE +#if 0 + /* initialize the NBC_ALLTOALL SchedCache tree */ + comminfo->NBC_Dict[NBC_ALLTOALL] = hb_tree_new((dict_cmp_func)NBC_Alltoall_args_compare, NBC_SchedCache_args_delete_key_dummy, NBC_SchedCache_args_delete); + if(comminfo->NBC_Dict[NBC_ALLTOALL] == NULL) { printf("Error in hb_tree_new()\n"); return NULL; } + NBC_DEBUG(1, "added tree at address %lu\n", (unsigned long)comminfo->NBC_Dict[NBC_ALLTOALL]); + comminfo->NBC_Dict_size[NBC_ALLTOALL] = 0; + /* initialize the NBC_ALLGATHER SchedCache tree */ + comminfo->NBC_Dict[NBC_ALLGATHER] = hb_tree_new((dict_cmp_func)NBC_Allgather_args_compare, NBC_SchedCache_args_delete_key_dummy, NBC_SchedCache_args_delete); + if(comminfo->NBC_Dict[NBC_ALLGATHER] == NULL) { printf("Error in hb_tree_new()\n"); return NULL; } + NBC_DEBUG(1, "added tree at address %lu\n", (unsigned long)comminfo->NBC_Dict[NBC_ALLGATHER]); + comminfo->NBC_Dict_size[NBC_ALLGATHER] = 0; + /* initialize the NBC_ALLREDUCE SchedCache tree */ + comminfo->NBC_Dict[NBC_ALLREDUCE] = hb_tree_new((dict_cmp_func)NBC_Allreduce_args_compare, NBC_SchedCache_args_delete_key_dummy, NBC_SchedCache_args_delete); + if(comminfo->NBC_Dict[NBC_ALLREDUCE] == NULL) { printf("Error in hb_tree_new()\n"); return NULL; } + NBC_DEBUG(1, "added tree at address %lu\n", (unsigned long)comminfo->NBC_Dict[NBC_ALLREDUCE]); + comminfo->NBC_Dict_size[NBC_ALLREDUCE] = 0; +#endif + /* initialize the NBC_BARRIER SchedCache tree - is not needed - + * schedule is hung off directly */ + comminfo->NBC_Dict_size[NBC_BARRIER] = 0; +#if 0 + /* initialize the NBC_BCAST SchedCache tree */ + comminfo->NBC_Dict[NBC_BCAST] = hb_tree_new((dict_cmp_func)NBC_Bcast_args_compare, NBC_SchedCache_args_delete_key_dummy, NBC_SchedCache_args_delete); + if(comminfo->NBC_Dict[NBC_BCAST] == NULL) { printf("Error in hb_tree_new()\n"); return NULL; } + NBC_DEBUG(1, "added tree at address %lu\n", (unsigned long)comminfo->NBC_Dict[NBC_BCAST]); + comminfo->NBC_Dict_size[NBC_BCAST] = 0; + /* initialize the NBC_GATHER SchedCache tree */ + comminfo->NBC_Dict[NBC_GATHER] = hb_tree_new((dict_cmp_func)NBC_Gather_args_compare, NBC_SchedCache_args_delete_key_dummy, NBC_SchedCache_args_delete); + if(comminfo->NBC_Dict[NBC_GATHER] == NULL) { printf("Error in hb_tree_new()\n"); return NULL; } + NBC_DEBUG(1, "added tree at address %lu\n", (unsigned long)comminfo->NBC_Dict[NBC_GATHER]); + comminfo->NBC_Dict_size[NBC_GATHER] = 0; + /* initialize the NBC_REDUCE SchedCache tree */ + comminfo->NBC_Dict[NBC_REDUCE] = hb_tree_new((dict_cmp_func)NBC_Reduce_args_compare, NBC_SchedCache_args_delete_key_dummy, NBC_SchedCache_args_delete); + if(comminfo->NBC_Dict[NBC_REDUCE] == NULL) { printf("Error in hb_tree_new()\n"); return NULL; } + NBC_DEBUG(1, "added tree at address %lu\n", (unsigned long)comminfo->NBC_Dict[NBC_REDUCE]); + comminfo->NBC_Dict_size[NBC_REDUCE] = 0; + /* initialize the NBC_SCAN SchedCache tree */ + comminfo->NBC_Dict[NBC_SCAN] = hb_tree_new((dict_cmp_func)NBC_Scan_args_compare, NBC_SchedCache_args_delete_key_dummy, NBC_SchedCache_args_delete); + if(comminfo->NBC_Dict[NBC_SCAN] == NULL) { printf("Error in hb_tree_new()\n"); return NULL; } + NBC_DEBUG(1, "added tree at address %lu\n", (unsigned long)comminfo->NBC_Dict[NBC_SCAN]); + comminfo->NBC_Dict_size[NBC_SCAN] = 0; + /* initialize the NBC_SCATTER SchedCache tree */ + comminfo->NBC_Dict[NBC_SCATTER] = hb_tree_new((dict_cmp_func)NBC_Scatter_args_compare, NBC_SchedCache_args_delete_key_dummy, NBC_SchedCache_args_delete); + if(comminfo->NBC_Dict[NBC_SCATTER] == NULL) { printf("Error in hb_tree_new()\n"); return NULL; } + NBC_DEBUG(1, "added tree at address %lu\n", (unsigned long)comminfo->NBC_Dict[NBC_SCATTER]); + comminfo->NBC_Dict_size[NBC_SCATTER] = 0; + /* initialize the NBC_ICART_SHIFT_XCHG SchedCache tree */ + comminfo->NBC_Dict[NBC_CART_SHIFT_XCHG] = hb_tree_new((dict_cmp_func)NBC_Icart_shift_xchg_args_compare, NBC_SchedCache_args_delete_key_dummy, NBC_SchedCache_args_delete); + if(comminfo->NBC_Dict[NBC_CART_SHIFT_XCHG] == NULL) { printf("Error in hb_tree_new()\n"); return NULL; } + NBC_DEBUG(1, "added tree at address %lu\n", (unsigned long)comminfo->NBC_Dict[NBC_CART_SHIFT_XCHG]); + comminfo->NBC_Dict_size[NBC_CART_SHIFT_XCHG] = 0; + /* initialize the NBC_INEIGHBOR_XCHG SchedCache tree */ + comminfo->NBC_Dict[NBC_NEIGHBOR_XCHG] = hb_tree_new((dict_cmp_func)NBC_Ineighbor_xchg_args_compare, NBC_SchedCache_args_delete_key_dummy, NBC_SchedCache_args_delete); + if(comminfo->NBC_Dict[NBC_NEIGHBOR_XCHG] == NULL) { printf("Error in hb_tree_new()\n"); return NULL; } + NBC_DEBUG(1, "added tree at address %lu\n", (unsigned long)comminfo->NBC_Dict[NBC_NEIGHBOR_XCHG]); + comminfo->NBC_Dict_size[NBC_NEIGHBOR_XCHG] = 0; +#endif +#endif + + /* put the new attribute to the comm */ + res = MPI_Attr_put(comm, gkeyval, comminfo); + if((MPI_SUCCESS != res)) { printf("Error in MPI_Attr_put() (%i)\n", res); return NULL; } + + return comminfo; +} + +int NBC_Start(NBC_Handle *handle, NBC_Schedule *schedule) { + int res; + + handle->schedule = schedule; + + /* kick off first round */ + res = NBC_Start_round(handle); + if((NBC_OK != res)) { printf("Error in NBC_Start_round() (%i)\n", res); return res; } + + return NBC_OK; +} + +int NBC_Wait(NBC_Handle *handle) { + /* poll */ + while(NBC_OK != NBC_Progress(handle)); + + NBC_DEBUG(3, "finished request with tag %i\n", handle->tag); + + return NBC_OK; +} + +int NBC_Test(NBC_Handle *handle, int *flag, MPI_Status *status) { + int ret = NBC_Progress(handle); + *flag = ret; + + return NBC_OK; +} + + +#ifdef NBC_CACHE_SCHEDULE +void NBC_SchedCache_args_delete_key_dummy(void *k) { + /* do nothing because the key and the data element are identical :-) + * both (the single one :) is freed in NBC__args_delete() */ +} + +void NBC_SchedCache_args_delete(void *entry) { + struct NBC_dummyarg *tmp; + + tmp = (struct NBC_dummyarg*)entry; + /* free taglistentry */ + free((void*)*(tmp->schedule)); + /* the schedule pointer itself is also malloc'd */ + free((void*)tmp->schedule); + free((void*)tmp); +} +#endif diff --git a/ompi/mca/coll/libnbc/nbc.h b/ompi/mca/coll/libnbc/nbc.h new file mode 100644 index 0000000000..d53ac6ddc2 --- /dev/null +++ b/ompi/mca/coll/libnbc/nbc.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2006 The Technical University of Chemnitz. All + * rights reserved. + * + * Author(s): Torsten Hoefler + * + */ +#ifndef __NBC_H__ +#define __NBC_H__ + +#ifdef __cplusplus +extern "C" { +#endif + + +/******************************************************* + ****** external NBC functions are defined here ******* + *******************************************************/ + +/* external function prototypes */ +int NBC_Ibarrier(MPI_Comm comm, NBC_Handle* handle); +int NBC_Ibcast(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm, NBC_Handle* handle); +int NBC_Ibcast_inter(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm, NBC_Handle* handle); +int NBC_Ialltoallv(void* sendbuf, int *sendcounts, int *sdispls, MPI_Datatype sendtype, void* recvbuf, int *recvcounts, int *rdispls, MPI_Datatype recvtype, MPI_Comm comm, NBC_Handle* handle); +int NBC_Igather(void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm, NBC_Handle* handle); +int NBC_Igatherv(void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int *recvcounts, int *displs, MPI_Datatype recvtype, int root, MPI_Comm comm, NBC_Handle* handle); +int NBC_Iscatter(void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm, NBC_Handle* handle); +int NBC_Iscatterv(void* sendbuf, int *sendcounts, int *displs, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm, NBC_Handle* handle); +int NBC_Iallgather(void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm, NBC_Handle *handle); +int NBC_Iallgatherv(void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int *recvcounts, int *displs, MPI_Datatype recvtype, MPI_Comm comm, NBC_Handle *handle); +int NBC_Ialltoall(void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm, NBC_Handle *handle); +int NBC_Ireduce(void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm, NBC_Handle* handle); +int NBC_Iallreduce(void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, NBC_Handle* handle); +int NBC_Iscan(void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, NBC_Handle* handle); +int NBC_Ireduce_scatter(void* sendbuf, void* recvbuf, int *recvcounts, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, NBC_Handle* handle); +int NBC_Icart_shift_xchg(void *sbuf, int scount, MPI_Datatype stype, void *rbuf, int rcount, MPI_Datatype rtype, int direction, int disp, MPI_Comm comm, NBC_Handle* handle); +int NBC_Ineighbor_xchg(void *sbuf, int scount, MPI_Datatype stype, void *rbuf, int rcount, MPI_Datatype rtype, MPI_Comm comm, NBC_Handle* handle); +int NBC_Ineighbor_xchgv(void *sbuf, int *scounts, int *sdispls, MPI_Datatype stype, void *rbuf, int *rcounts, int *rdispls, MPI_Datatype rtype, MPI_Comm comm, NBC_Handle* handle); + +int NBC_Comm_neighbors(MPI_Comm comm, int maxindegree, int sources[], int sourceweights[], int maxoutdegree, int destinations[], int destweights[]); +int NBC_Comm_neighbors_count(MPI_Comm comm, int *indegree, int *outdegree, int *weighted); +int NBC_Wait(NBC_Handle *handle); +int NBC_Test(NBC_Handle *handle, int *flag, MPI_Status *status); + +/* TODO: some hacks */ +int NBC_Operation(void *buf3, void *buf1, void *buf2, MPI_Op op, MPI_Datatype type, int count); + +void NBC_Reset_times(void); +void NBC_Print_times(double div); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/ompi/mca/coll/libnbc/nbc_iallgather.c b/ompi/mca/coll/libnbc/nbc_iallgather.c new file mode 100644 index 0000000000..51f22f4b05 --- /dev/null +++ b/ompi/mca/coll/libnbc/nbc_iallgather.c @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2006 The Technical University of Chemnitz. All + * rights reserved. + * + * Author(s): Torsten Hoefler + * + */ +#include "nbc_internal.h" + +#ifdef NBC_CACHE_SCHEDULE +/* tree comparison function for schedule cache */ +int NBC_Allgather_args_compare(NBC_Allgather_args *a, NBC_Allgather_args *b, void *param) { + + if( (a->sendbuf == b->sendbuf) && + (a->sendcount == b->sendcount) && + (a->sendtype == b->sendtype) && + (a->recvbuf == b->recvbuf) && + (a->recvcount == b->recvcount) && + (a->recvtype == b->recvtype) ) { + return 0; + } + if( a->sendbuf < b->sendbuf ) { + return -1; + } + return +1; +} +#endif + +#ifdef HAVE_SYS_WEAK_ALIAS_PRAGMA +#pragma weak NBC_Iallgather=PNBC_Iallgather +#define NBC_Iallgather PNBC_Iallgather +#endif + +/* simple linear MPI_Iallgather + * the algorithm uses p-1 rounds + * each node sends the packet it received last round (or has in round 0) to it's right neighbor (modulo p) + * each node receives from it's left (modulo p) neighbor */ +int NBC_Iallgather(void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm, NBC_Handle *handle) { + int rank, p, res, r; + MPI_Aint rcvext, sndext; + NBC_Schedule *schedule; + char *rbuf, *sbuf, inplace; +#ifdef NBC_CACHE_SCHEDULE + NBC_Allgather_args *args, *found, search; +#endif + + NBC_IN_PLACE(sendbuf, recvbuf, inplace); + + res = NBC_Init_handle(handle, comm); + if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } + res = MPI_Comm_rank(comm, &rank); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } + res = MPI_Comm_size(comm, &p); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_size() (%i)\n", res); return res; } + res = MPI_Type_extent(sendtype, &sndext); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } + res = MPI_Type_extent(recvtype, &rcvext); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } + + handle->tmpbuf = NULL; + + if(!((rank == 0) && inplace)) { + /* copy my data to receive buffer */ + rbuf = ((char *)recvbuf) + (rank*recvcount*rcvext); + res = NBC_Copy(sendbuf, sendcount, sendtype, rbuf, recvcount, recvtype, comm); + if (NBC_OK != res) { printf("Error in NBC_Copy() (%i)\n", res); return res; } + } + +#ifdef NBC_CACHE_SCHEDULE + /* search schedule in communicator specific tree */ + search.sendbuf=sendbuf; + search.sendcount=sendcount; + search.sendtype=sendtype; + search.recvbuf=recvbuf; + search.recvcount=recvcount; + search.recvtype=recvtype; + found = (NBC_Allgather_args *)hb_tree_search((hb_tree*)handle->comminfo->NBC_Dict[NBC_ALLGATHER], &search); + if(found == NULL) { +#endif + schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule)); + if (NULL == schedule) { printf("Error in malloc()\n"); return res; } + + res = NBC_Sched_create(schedule); + if(NBC_OK != res) { printf("Error in NBC_Sched_create, (%i)\n", res); return res; } + + sbuf = ((char *)recvbuf) + (rank*recvcount*rcvext); + /* do p-1 rounds */ + for(r=0;rsendbuf=sendbuf; + args->sendcount=sendcount; + args->sendtype=sendtype; + args->recvbuf=recvbuf; + args->recvcount=recvcount; + args->recvtype=recvtype; + args->schedule=schedule; + res = hb_tree_insert ((hb_tree*)handle->comminfo->NBC_Dict[NBC_ALLGATHER], args, args, 0); + if(res != 0) printf("error in dict_insert() (%i)\n", res); + /* increase number of elements for A2A */ + if(++handle->comminfo->NBC_Dict_size[NBC_ALLGATHER] > NBC_SCHED_DICT_UPPER) { + NBC_SchedCache_dictwipe((hb_tree*)handle->comminfo->NBC_Dict[NBC_ALLGATHER], &handle->comminfo->NBC_Dict_size[NBC_ALLGATHER]); + } + } else { + /* found schedule */ + schedule=found->schedule; + } +#endif + + /*NBC_PRINT_SCHED(*schedule);*/ + + res = NBC_Start(handle, schedule); + if (NBC_OK != res) { printf("Error in NBC_Start() (%i)\n", res); return res; } + + return NBC_OK; +} + + +/* this is a new possible dissemination based allgather algorithm - we should + * try it some time (big comm, small data) */ +#if 0 + +static inline void diss_unpack(int rank, int vrank, int round, int p, int *pos, void *tmpbuf, int datasize, int slotsize, void *recvbuf, int sendcount, MPI_Datatype sendtype, int recvcount, MPI_Datatype recvtype, MPI_Comm comm, NBC_Schedule *schedule) { + int r, res; + char *sbuf, *rbuf; + + sbuf = (char *)tmpbuf + (*pos*datasize); + rbuf = (char *)recvbuf + (vrank*slotsize); + printf("[%i] unpacking tmpbuf pos: %i (%lu) to rbuf elem: %i (%lu) - %i elems, datasize %i\n", rank, *pos, (unsigned long)sbuf, vrank, (unsigned long)rbuf, recvcount, datasize); + res = NBC_Sched_unpack(sbuf, recvcount, recvtype, rbuf, schedule); + if (NBC_OK != res) { printf("Error in NBC_Unpack() (%i)\n", res); } + *pos=*pos+1; + + for(r=0; r<=round; r++) { + if(r != 0) { + diss_unpack(rank, (vrank-(1<<(r-1))+p)%p, r-1, p, pos, tmpbuf, datasize, slotsize, recvbuf, sendcount, sendtype, recvcount, recvtype, comm, schedule); + } + } +} + +static inline int a2a_sched_diss(int rank, int p, MPI_Aint sndext, MPI_Aint rcvext, NBC_Schedule* schedule, void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm, NBC_Handle *handle) { + int res, r, maxround, size, speer, rpeer, pos, datasize; + char *sbuf, *rbuf; + + res = NBC_OK; + if(p < 2) return res; + + maxround = (int)ceil((log(p)/LOG2)); + + if(NBC_Type_intrinsic(sendtype)) { + datasize = sndext*sendcount; + } else { + res = MPI_Pack_size(sendcount, sendtype, comm, &datasize); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Pack_size() (%i)\n", res); return res; } + } + + /* tmpbuf is probably bigger than p -> next power of 2 */ + handle->tmpbuf=malloc(datasize*(1<tmpbuf, sbuf, datasize); + } else { + pos = 0; + res = MPI_Pack(sbuf, sendcount, sendtype, handle->tmpbuf, datasize, &pos, comm); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Pack() (%i)\n", res); return res; } + } + + printf("[%i] receive buffer is at %lu of size %i, maxround: %i\n", rank, (unsigned long)handle->tmpbuf, (int)sndext*sendcount*(1<tmpbuf+size; + sbuf = (char*)handle->tmpbuf; + + speer = (rank + (1<tmpbuf, datasize, recvcount*rcvext, recvbuf, sendcount, sendtype, recvcount, recvtype, comm, schedule); + + return NBC_OK; +} +#endif + + +#ifdef __cplusplus +extern "C" { +#endif +/* Fortran bindings */ +#ifdef HAVE_SYS_WEAK_ALIAS_PRAGMA +NBC_F77_ALLFUNC_(nbc_iallgather,NBC_IALLGATHER,(void *sendbuf, int *sendcount, int *sendtype, void *recvbuf, int *recvcount, int *recvtype, int *fcomm, int *fhandle, int *ierr)); +#pragma weak NBC_IALLGATHER = nbc_iallgather_f +#pragma weak nbc_iallgather = nbc_iallgather_f +#pragma weak nbc_iallgather_ = nbc_iallgather_f +#pragma weak nbc_iallgather__ = nbc_iallgather_f +#pragma weak PNBC_IALLGATHER = nbc_iallgather_f +#pragma weak pnbc_iallgather = nbc_iallgather_f +#pragma weak pnbc_iallgather_ = nbc_iallgather_f +#pragma weak pnbc_iallgather__ = nbc_iallgather_f +void nbc_iallgather_f(void *sendbuf, int *sendcount, int *sendtype, void *recvbuf, int *recvcount, int *recvtype, int *fcomm, int *fhandle, int *ierr) { +#else +void NBC_F77_FUNC_(nbc_iallgather,NBC_IALLGATHER)(void *sendbuf, int *sendcount, int *sendtype, void *recvbuf, int *recvcount, int *recvtype, int *fcomm, int *fhandle, int *ierr); +void NBC_F77_FUNC_(nbc_iallgather,NBC_IALLGATHER)(void *sendbuf, int *sendcount, int *sendtype, void *recvbuf, int *recvcount, int *recvtype, int *fcomm, int *fhandle, int *ierr) { +#endif + MPI_Datatype rtype, stype; + MPI_Comm comm; + NBC_Handle *handle; + + /* this is the only MPI-2 we need :-( */ + rtype = MPI_Type_f2c(*recvtype); + stype = MPI_Type_f2c(*sendtype); + comm = MPI_Comm_f2c(*fcomm); + + /* create a new handle in handle table */ + NBC_Create_fortran_handle(fhandle, &handle); + + /* call NBC function */ + *ierr = NBC_Iallgather(sendbuf, *sendcount, stype, recvbuf, *recvcount, rtype, comm, handle); +} + +#ifdef __cplusplus +} +#endif diff --git a/ompi/mca/coll/libnbc/nbc_iallgatherv.c b/ompi/mca/coll/libnbc/nbc_iallgatherv.c new file mode 100644 index 0000000000..e022f0ff9d --- /dev/null +++ b/ompi/mca/coll/libnbc/nbc_iallgatherv.c @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2006 The Technical University of Chemnitz. All + * rights reserved. + * + * Author(s): Torsten Hoefler + * + */ +#include "nbc_internal.h" + +/* an allgatherv schedule can not be cached easily because the contents + * ot the recvcounts array may change, so a comparison of the address + * would not be sufficient ... we simply do not cache it */ + +#ifdef HAVE_SYS_WEAK_ALIAS_PRAGMA +#pragma weak NBC_Iallgatherv=PNBC_Iallgatherv +#define NBC_Iallgatherv PNBC_Iallgatherv +#endif + +/* simple linear MPI_Iallgatherv + * the algorithm uses p-1 rounds + * first round: + * each node sends to it's left node (rank+1)%p sendcount elements + * each node begins with it's right node (rank-11)%p and receives from it recvcounts[(rank+1)%p] elements + * second round: + * each node sends to node (rank+2)%p sendcount elements + * each node receives from node (rank-2)%p recvcounts[(rank+2)%p] elements */ +int NBC_Iallgatherv(void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int *recvcounts, int *displs, MPI_Datatype recvtype, MPI_Comm comm, NBC_Handle *handle) { + int rank, p, res, r, speer, rpeer; + MPI_Aint rcvext, sndext; + NBC_Schedule *schedule; + char *rbuf, inplace; + + NBC_IN_PLACE(sendbuf, recvbuf, inplace); + + res = NBC_Init_handle(handle, comm); + if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } + res = MPI_Comm_rank(comm, &rank); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } + res = MPI_Comm_size(comm, &p); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_size() (%i)\n", res); return res; } + res = MPI_Type_extent(sendtype, &sndext); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } + res = MPI_Type_extent(recvtype, &rcvext); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } + + schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule)); + if (NULL == schedule) { printf("Error in malloc() (%i)\n", res); return res; } + + handle->tmpbuf=NULL; + + res = NBC_Sched_create(schedule); + if(res != NBC_OK) { printf("Error in NBC_Sched_create, (%i)\n", res); return res; } + + if(!inplace) { + /* copy my data to receive buffer */ + rbuf = ((char *)recvbuf) + (displs[rank]*rcvext); + NBC_Copy(sendbuf, sendcount, sendtype, rbuf, recvcounts[rank], recvtype, comm); + if (NBC_OK != res) { printf("Error in NBC_Copy() (%i)\n", res); return res; } + } + + /* do p-1 rounds */ + for(r=1;r + * + */ +#include "nbc_internal.h" +#include + +static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, void *sendbuf, void *recvbuf, MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle); +static inline int allred_sched_chain(int rank, int p, int count, MPI_Datatype datatype, void *sendbuf, void *recvbuf, MPI_Op op, int size, int ext, NBC_Schedule *schedule, NBC_Handle *handle, int fragsize); +static inline int allred_sched_ring(int rank, int p, int count, MPI_Datatype datatype, void *sendbuf, void *recvbuf, MPI_Op op, int size, int ext, NBC_Schedule *schedule, NBC_Handle *handle); + +// DCMF allreduce is actually not non-blocking!!! +#ifdef USE_DCMF +#undef USE_DCMF +#endif + +#ifdef USE_DCMF +#error "DCMF allreduce is not nonblocking and thus disabled" +#include +static int initialized=0; +static void cbfunc(void *clientdata, DCMF_Error_t *error) { + //printf("in allreduce callback!\n"); + *(unsigned*)clientdata=2; +} + +DCMF_Protocol_t allred_reg; +DCMF_Request_t allred_req; + +static void init_dcmf_allred() { + DCMF_GlobalAllreduce_Configuration_t allred_config; + allred_config.protocol = DCMF_DEFAULT_GLOBALALLREDUCE_PROTOCOL; + DCMF_GlobalAllreduce_register(&allred_reg, &allred_config); + initialized=1; +} +#endif + +#ifdef NBC_CACHE_SCHEDULE +/* tree comparison function for schedule cache */ +int NBC_Allreduce_args_compare(NBC_Allreduce_args *a, NBC_Allreduce_args *b, void *param) { + + if( (a->sendbuf == b->sendbuf) && + (a->recvbuf == b->recvbuf) && + (a->count == b->count) && + (a->datatype == b->datatype) && + (a->op == b->op) ) { + return 0; + } + if( a->sendbuf < b->sendbuf ) { + return -1; + } + return +1; +} +#endif + +#ifdef HAVE_SYS_WEAK_ALIAS_PRAGMA +#pragma weak NBC_Iallreduce=PNBC_Iallreduce +#define NBC_Iallreduce PNBC_Iallreduce +#endif + +int NBC_Iallreduce(void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, NBC_Handle* handle) { + int rank, p, res, size; + MPI_Aint ext; + NBC_Schedule *schedule; +#ifdef NBC_CACHE_SCHEDULE + NBC_Allreduce_args *args, *found, search; +#endif + enum { NBC_ARED_BINOMIAL, NBC_ARED_RING } alg; + char inplace; + +#ifdef USE_DCMF + int ws, s; + MPI_Comm_size(comm, &s); + MPI_Comm_size(MPI_COMM_WORLD, &ws); + if(s != ws) { + printf("DCMF only works on MPI_COMM_WORLD (or dups of it) for now -- fallback needs to be implemented :-)\n"); + return NBC_NOT_IMPLEMENTED; + } + if(!initialized) init_dcmf_allred(); + handle->dcmf_hndl = (NBC_DCMF_Handle*)malloc(sizeof(NBC_DCMF_Handle)); + handle->dcmf_hndl->done=0; + handle->dcmf_hndl->type=DCMF_TYPE_ALLREDUCE; + DCMF_Callback_t callback={ cbfunc, &handle->dcmf_hndl->done }; + + + DCMF_Dt dt; + switch(datatype) { + case MPI_UNSIGNED_LONG_LONG: + dt = DCMF_UNSIGNED_LONG_LONG; + break; + case MPI_LONG_LONG: + dt = DCMF_SIGNED_LONG_LONG; + break; + case MPI_UNSIGNED: + dt = DCMF_UNSIGNED_INT; + break; + case MPI_INT: + dt = DCMF_SIGNED_INT; + break; + case MPI_UNSIGNED_LONG: // we assume it's the same as integer!!! + dt = DCMF_UNSIGNED_INT; + assert(sizeof(unsigned int) == sizeof(unsigned long)); + break; + case MPI_LONG: // we assume it's the same as integer !!! + dt = DCMF_SIGNED_INT; + assert(sizeof(int) == sizeof(long)); + break; + case MPI_DOUBLE: + dt = DCMF_DOUBLE; + break; + default: + printf("Datatype not supported\n"); + return NBC_NOT_IMPLEMENTED; + break; + } + + DCMF_Op dop; + switch(op) { + case MPI_SUM: + dop = DCMF_SUM; + break; + default: + printf("Operations not supported\n"); + return NBC_NOT_IMPLEMENTED; + break; + } + +int r; +MPI_Comm_rank(comm, &r); +printf("[%i] LibNBC starting allreduce\n", r); + DCMF_GlobalAllreduce(&allred_reg, &allred_req, callback, DCMF_MATCH_CONSISTENCY, -1 /* root?? */, (char*)sendbuf, (char*)recvbuf, count, dt, dop); +printf("[%i] LibNBC after allreduce\n", r); + +#else + NBC_IN_PLACE(sendbuf, recvbuf, inplace); + + res = NBC_Init_handle(handle, comm); + if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } + res = MPI_Comm_rank(comm, &rank); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } + res = MPI_Comm_size(comm, &p); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_size() (%i)\n", res); return res; } + res = MPI_Type_extent(datatype, &ext); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } + res = MPI_Type_size(datatype, &size); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_size() (%i)\n", res); return res; } + + handle->tmpbuf = malloc(ext*count); + if(handle->tmpbuf == NULL) { printf("Error in malloc() (%i)\n", res); return NBC_OOR; } + + if((p == 1) && !inplace) { + /* for a single node - copy data to receivebuf */ + res = NBC_Copy(sendbuf, count, datatype, recvbuf, count, datatype, comm); + if (NBC_OK != res) { printf("Error in NBC_Copy() (%i)\n", res); return res; } + } + + /* algorithm selection */ + if(p < 4 || size*count < 65536) { + alg = NBC_ARED_BINOMIAL; + } else { + alg = NBC_ARED_RING; + } + +#ifdef NBC_CACHE_SCHEDULE + /* search schedule in communicator specific tree */ + search.sendbuf=sendbuf; + search.recvbuf=recvbuf; + search.count=count; + search.datatype=datatype; + search.op=op; + found = (NBC_Allreduce_args*)hb_tree_search((hb_tree*)handle->comminfo->NBC_Dict[NBC_ALLREDUCE], &search); + if(found == NULL) { +#endif + schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule)); + if (NULL == schedule) { printf("Error in malloc()\n"); return res; } + + res = NBC_Sched_create(schedule); + if(res != NBC_OK) { printf("Error in NBC_Sched_create (%i)\n", res); return res; } + + switch(alg) { + case NBC_ARED_BINOMIAL: + res = allred_sched_diss(rank, p, count, datatype, sendbuf, recvbuf, op, schedule, handle); + break; + case NBC_ARED_RING: + res = allred_sched_ring(rank, p, count, datatype, sendbuf, recvbuf, op, size, ext, schedule, handle); + break; + } + if (NBC_OK != res) { printf("Error in Schedule creation() (%i)\n", res); return res; } + + res = NBC_Sched_commit(schedule); + if(res != NBC_OK) { free(handle->tmpbuf); printf("Error in NBC_Sched_commit() (%i)\n", res); return res; } + +#ifdef NBC_CACHE_SCHEDULE + /* save schedule to tree */ + args = (NBC_Allreduce_args*)malloc(sizeof(NBC_Allreduce_args)); + args->sendbuf=sendbuf; + args->recvbuf=recvbuf; + args->count=count; + args->datatype=datatype; + args->op=op; + args->schedule=schedule; + res = hb_tree_insert ((hb_tree*)handle->comminfo->NBC_Dict[NBC_ALLREDUCE], args, args, 0); + if(res != 0) printf("error in dict_insert() (%i)\n", res); + /* increase number of elements for A2A */ + if(++handle->comminfo->NBC_Dict_size[NBC_ALLREDUCE] > NBC_SCHED_DICT_UPPER) { + NBC_SchedCache_dictwipe((hb_tree*)handle->comminfo->NBC_Dict[NBC_ALLREDUCE], &handle->comminfo->NBC_Dict_size[NBC_ALLREDUCE]); + } + } else { + /* found schedule */ + schedule=found->schedule; + } +#endif + + res = NBC_Start(handle, schedule); + if(res != NBC_OK) { free(handle->tmpbuf); printf("Error in NBC_Start() (%i)\n", res); return res; } +#endif + + /* tmpbuf is freed with the handle */ + return NBC_OK; +} + + +/* binomial allreduce (binomial tree up and binomial bcast down) + * working principle: + * - each node gets a virtual rank vrank + * - the 'root' node get vrank 0 + * - node 0 gets the vrank of the 'root' + * - all other ranks stay identical (they do not matter) + * + * Algorithm: + * pairwise exchange + * round r: + * grp = rank % 2^r + * if grp == 0: receive from rank + 2^(r-1) if it exists and reduce value + * if grp == 1: send to rank - 2^(r-1) and exit function + * + * do this for R=log_2(p) rounds + * followed by a Bcast: + * Algorithm: + * - each node with vrank > 2^r and vrank < 2^r+1 receives from node + * vrank - 2^r (vrank=1 receives from 0, vrank 0 receives never) + * - each node sends each round r to node vrank + 2^r + * - a node stops to send if 2^r > commsize + * + */ +#define RANK2VRANK(rank, vrank, root) \ +{ \ + vrank = rank; \ + if (rank == 0) vrank = root; \ + if (rank == root) vrank = 0; \ +} +#define VRANK2RANK(rank, vrank, root) \ +{ \ + rank = vrank; \ + if (vrank == 0) rank = root; \ + if (vrank == root) rank = 0; \ +} +static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, void *sendbuf, void *recvbuf, MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle) { + int root, vrank, r, maxr, firstred, vpeer, peer, res; + + root = 0; /* this makes the code for ireduce and iallreduce nearly identical - could be changed to improve performance */ + RANK2VRANK(rank, vrank, root); + maxr = (int)ceil((log(p)/LOG2)); + + firstred = 1; + for(r=1; r<=maxr; r++) { + if((vrank % (1<tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } + /* we have to wait until we have the data */ + res = NBC_Sched_barrier(schedule); + if(res != NBC_OK) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + if(firstred) { + /* perform the reduce with the senbuf */ + res = NBC_Sched_op(recvbuf, false, sendbuf, false, 0, true, count, datatype, op, schedule); + firstred = 0; + } else { + /* perform the reduce in my local buffer */ + res = NBC_Sched_op(recvbuf, false, recvbuf, false, 0, true, count, datatype, op, schedule); + } + if(res != NBC_OK) { free(handle->tmpbuf); printf("Error in NBC_Sched_op() (%i)\n", res); return res; } + /* this cannot be done until handle->tmpbuf is unused :-( */ + res = NBC_Sched_barrier(schedule); + if(res != NBC_OK) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + } + } else { + /* we have to send this round */ + vpeer = vrank - (1<<(r-1)); + VRANK2RANK(peer, vpeer, root) + if(firstred) { + /* we have to use the sendbuf in the first round .. */ + res = NBC_Sched_send(sendbuf, false, count, datatype, peer, schedule); + } else { + /* and the recvbuf in all remeining rounds */ + res = NBC_Sched_send(recvbuf, false, count, datatype, peer, schedule); + } + if(res != NBC_OK) { free(handle->tmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); return res; } + /* leave the game */ + break; + } + } + + /* this is the Bcast part - copied with minor changes from nbc_ibcast.c + * changed: buffer -> recvbuf */ + RANK2VRANK(rank, vrank, root); + + /* receive from the right hosts */ + if(vrank != 0) { + for(r=0; r= (1<tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } + } + } + res = NBC_Sched_barrier(schedule); + if(NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + } + + /* now send to the right hosts */ + for(r=0; rtmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); return res; } + } + } + /* end of the bcast */ + + return NBC_OK; +} + +static inline int allred_sched_chain(int rank, int p, int count, MPI_Datatype datatype, void *sendbuf, void *recvbuf, MPI_Op op, int size, int ext, NBC_Schedule *schedule, NBC_Handle *handle, int fragsize) { + int res, rrpeer, rbpeer, srpeer, sbpeer, numfrag, fragnum, fragcount, thiscount, bstart, bend; + long roffset, boffset; + + /* reduce peers */ + rrpeer = rank+1; + srpeer = rank-1; + /* bcast peers */ + rbpeer = rank-1; + sbpeer = rank+1; + + if(count == 0) return NBC_OK; + + numfrag = count*size/fragsize; + if((count*size)%fragsize != 0) numfrag++; + fragcount = count/numfrag; + + /* determine the starting round of bcast ... the first reduced packet + * is after p-1 rounds at rank 0 and will be sent back ... */ + bstart = p-1+rank; + /* determine the ending round of bcast ... after arrival of the first + * packet, each rank has to forward numfrag packets */ + bend = bstart+numfrag; + /*printf("[%i] numfrag: %i, count: %i, size: %i, fragcount: %i, bstart: %i, bend: %i\n", rank, numfrag, count, size, fragcount, bstart, bend);*/ + + /* this are two loops in one - this is a little nasty :-( */ + for(fragnum = 0; fragnum < bend; fragnum++) { + roffset = fragnum*fragcount*ext; + boffset = (fragnum-bstart)*fragcount*ext; + thiscount = fragcount; + + /* first numfrag rounds ... REDUCE to rank 0 */ + if(fragnum < numfrag) { + if(fragnum == numfrag-1) { + /* last fragment may not be full */ + thiscount = count-fragcount*fragnum; + } + /*printf("[%i] reduce %i elements from %lu\n", rank, thiscount, roffset); */ + + /* REDUCE - PART last node does not recv */ + if(rank != p-1) { + res = NBC_Sched_recv((char*)roffset, true, thiscount, datatype, rrpeer, schedule); + if (NBC_OK != res) { printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } + res = NBC_Sched_barrier(schedule); + /* root reduces into receivebuf */ + if(rank == 0) { + res = NBC_Sched_op((char*)recvbuf+roffset, false, (char*)sendbuf+roffset, false, (char*)roffset, true, thiscount, datatype, op, schedule); + } else { + res = NBC_Sched_op((char*)roffset, true, (char*)sendbuf+roffset, false, (char*)roffset, true, thiscount, datatype, op, schedule); + } + res = NBC_Sched_barrier(schedule); + } + + /* REDUCE PART root does not send */ + if(rank != 0) { + /* rank p-1 has to send out of sendbuffer :) */ + if(rank == p-1) { + res = NBC_Sched_send((char*)sendbuf+roffset, false, thiscount, datatype, srpeer, schedule); + } else { + res = NBC_Sched_send((char*)roffset, true, thiscount, datatype, srpeer, schedule); + } + if (NBC_OK != res) { printf("Error in NBC_Sched_send() (%i)\n", res); return res; } + /* this barrier here seems awkward but isn't!!!! */ + /*res = NBC_Sched_barrier(schedule);*/ + } + } + + /* BCAST from rank 0 */ + if(fragnum >= bstart) { + /*printf("[%i] bcast %i elements from %lu\n", rank, thiscount, boffset); */ + if(fragnum == bend-1) { + /* last fragment may not be full */ + thiscount = count-fragcount*(fragnum-bstart); + } + + /* BCAST PART root does not receive */ + if(rank != 0) { + res = NBC_Sched_recv((char*)recvbuf+boffset, false, thiscount, datatype, rbpeer, schedule); + if (NBC_OK != res) { printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } + res = NBC_Sched_barrier(schedule); + } + + /* BCAST PART last rank does not send */ + if(rank != p-1) { + res = NBC_Sched_send((char*)recvbuf+boffset, false, thiscount, datatype, sbpeer, schedule); + if (NBC_OK != res) { printf("Error in NBC_Sched_send() (%i)\n", res); return res; } + res = NBC_Sched_barrier(schedule); + } + } + } + + /*NBC_PRINT_SCHED(*schedule);*/ + + return NBC_OK; +} + +static inline int allred_sched_ring(int r, int p, int count, MPI_Datatype datatype, void *sendbuf, void *recvbuf, MPI_Op op, int size, int ext, NBC_Schedule *schedule, NBC_Handle *handle) { + int i; /* runner */ + int segsize, *segsizes, *segoffsets; /* segment sizes and offsets per segment (number of segments == number of nodes */ + int speer, rpeer; /* send and recvpeer */ + int res; + + if(count == 0) return NBC_OK; + + { + int mycount; /* temporary */ + segsizes = (int*)malloc(sizeof(int)*p); + segoffsets = (int*)malloc(sizeof(int)*p); + segsize = count/p; /* size of the segments */ + if(count%p != 0) segsize++; + mycount = count; + segoffsets[0] = 0; + for(i = 0; i reduced this round + * / -> sum (reduced in a previous step) + * + * *** round 0 *** + * 0 1 2 + * + * 00 10 20 0: [1] -> 1 + * 01 11 21 1: [2] -> 2 + * 02 12 22 2: [0] -> 0 --> send element (r+1)%p to node (r+1)%p + * + * *** round 1 *** + * 0 1 2 + * + * 00+20 10 20 0: red(0), [0] -> 1 + * 01 11+01 21 1: red(1), [1] -> 2 + * 02 12 22+12 2: red(2), [2] -> 0 --> reduce and send element (r+0)%p to node (r+1)%p + * + * *** round 2 *** + * 0 1 2 + * + * 00/20 all 20 0: red(2), [2] -> 1 + * 01 11/01 all 1: red(0), [0] -> 2 + * all 12 22/12 2: red(1), [1] -> 0 --> reduce and send (r-1)%p to node (r+1)%p + * + * *** round 3 *** + * 0 1 2 + * + * 00/20 all all 0: [1] -> 1 + * all 11/01 all 1: [2] -> 2 + * all all 22/12 2: [0] -> 0 --> send element (r-2)%p to node (r+1)%p + * + * *** round 4 *** + * 0 1 2 + * + * all all all 0: done + * all all all 1: done + * all all all 2: done + * + * -> 4 + * *** round 0 *** + * 0 1 2 3 + * + * 00 10 20 30 0: [1] -> 1 + * 01 11 21 31 1: [2] -> 2 + * 02 12 22 32 2: [3] -> 3 + * 03 13 23 33 3: [0] -> 0 --> send element (r+1)%p to node (r+1)%p + * + * *** round 1 *** + * 0 1 2 3 + * + * 00+30 10 20 30 0: red(0), [0] -> 1 + * 01 11+01 21 31 1: red(1), [1] -> 2 + * 02 12 22+12 32 2: red(2), [2] -> 3 + * 03 13 23 33+23 3: red(3), [3] -> 0 --> reduce and send element (r+0)%p to node (r+1)%p + * + * *** round 2 *** + * 0 1 2 3 + * + * 00/30 10+00/30 20 30 0: red(3), [3] -> 1 + * 01 11/01 21+11/01 31 1: red(0), [0] -> 2 + * 02 12 22/12 32+22/12 2: red(1), [1] -> 3 + * 03+33/23 13 23 33/23 3: red(2), [2] -> 0 --> reduce and send (r-1)%p to node (r+1)%p + * + * *** round 3 *** + * 0 1 2 3 + * + * 00/30 10/00/30 all 30 0: red(2), [2] -> 1 + * 01 11/01 21/11/01 all 1: red(3), [3] -> 2 + * all 12 22/12 32/22/12 2: red(0), [0] -> 3 + * 03/33/23 all 23 33/23 3: red(1), [1] -> 0 --> reduce and send (r-2)%p to node (r+1)%p + * + * *** round 4 *** + * 0 1 2 3 + * + * 00/30 10/00/30 all all 0: [1] -> 1 + * all 11/01 21/11/01 all 1: [2] -> 2 + * all all 22/12 32/22/12 2: [3] -> 3 + * 03/33/23 all all 33/23 3: [0] -> 0 --> receive and send element (r+1)%p to node (r+1)%p + * + * *** round 5 *** + * 0 1 2 3 + * + * all 10/00/30 all all 0: [0] -> 1 + * all all 21/11/01 all 1: [1] -> 2 + * all all all 32/22/12 2: [3] -> 3 + * 03/33/23 all all all 3: [4] -> 4 --> receive and send element (r-0)%p to node (r+1)%p + * + * *** round 6 *** + * 0 1 2 3 + * + * all all all all + * all all all all + * all all all all + * all all all all receive element (r-1)%p + * + * 2p-2 rounds ... every node does p-1 reductions and p-1 sends + * + */ + { + int round = 0; + /* first p-1 rounds are reductions */ + do { + int selement = (r+1-round + 2*p /*2*p avoids negative mod*/)%p; /* the element I am sending */ + int soffset = segoffsets[selement]*ext; + int relement = (r-round + 2*p /*2*p avoids negative mod*/)%p; /* the element that I receive from my neighbor */ + int roffset = segoffsets[relement]*ext; + + /* first message come out of sendbuf */ + if(round == 0) { + NBC_Sched_send((char*)sendbuf+soffset, false, segsizes[selement], datatype, speer, schedule); + //printf("[%i] round %i - sending %i\n", r, round, selement); + } else { + NBC_Sched_send((char*)recvbuf+soffset, false, segsizes[selement], datatype, speer, schedule); + //printf("[%i] round %i - sending %i\n", r, round, selement); + } + NBC_Sched_recv((char*)recvbuf+roffset, false, segsizes[relement], datatype, rpeer, schedule); + //printf("[%i] round %i - receiving %i\n", r, round, relement); + + NBC_Sched_barrier(schedule); + //printf("[%i] round %i - reducing %i\n", r, round, relement); + NBC_Sched_op((char*)recvbuf+roffset, false, (char*)sendbuf+roffset, false, (char*)recvbuf+roffset, false, segsizes[relement], datatype, op, schedule); + NBC_Sched_barrier(schedule); + + round++; + } while(round < p-1); + + do { + int selement = (r+1-round + 2*p /*2*p avoids negative mod*/)%p; /* the element I am sending */ + int soffset = segoffsets[selement]*ext; + int relement = (r-round + 2*p /*2*p avoids negative mod*/)%p; /* the element that I receive from my neighbor */ + int roffset = segoffsets[relement]*ext; + + //printf("[%i] round %i receiving %i sending %i\n", r, round, relement, selement); + NBC_Sched_send((char*)recvbuf+soffset, false, segsizes[selement], datatype, speer, schedule); + NBC_Sched_recv((char*)recvbuf+roffset, false, segsizes[relement], datatype, rpeer, schedule); + NBC_Sched_barrier(schedule); + round++; + } while (round < 2*p-2); + } + + //NBC_PRINT_SCHED(*schedule); + + return NBC_OK; +} + + + +#ifdef __cplusplus +extern "C" { +#endif +/* Fortran bindings */ +#ifdef HAVE_SYS_WEAK_ALIAS_PRAGMA +NBC_F77_ALLFUNC_(nbc_iallreduce,NBC_IALLREDUCE,(void *sendbuf, void *recvbuf, int *count, int *datatype, int *fop, int *fcomm, int *fhandle, int *ierr)); +#pragma weak NBC_IALLREDUCE = nbc_iallreduce_f +#pragma weak nbc_iallreduce = nbc_iallreduce_f +#pragma weak nbc_iallreduce_ = nbc_iallreduce_f +#pragma weak nbc_iallreduce__ = nbc_iallreduce_f +#pragma weak PNBC_IALLREDUCE = nbc_iallreduce_f +#pragma weak pnbc_iallreduce = nbc_iallreduce_f +#pragma weak pnbc_iallreduce_ = nbc_iallreduce_f +#pragma weak pnbc_iallreduce__ = nbc_iallreduce_f +void nbc_iallreduce_f(void *sendbuf, void *recvbuf, int *count, int *datatype, int *fop, int *fcomm, int *fhandle, int *ierr) { +#else +void NBC_F77_FUNC_(nbc_iallreduce,NBC_IALLREDUCE)(void *sendbuf, void *recvbuf, int *count, int *datatype, int *fop, int *fcomm, int *fhandle, int *ierr); +void NBC_F77_FUNC_(nbc_iallreduce,NBC_IALLREDUCE)(void *sendbuf, void *recvbuf, int *count, int *datatype, int *fop, int *fcomm, int *fhandle, int *ierr) { +#endif + MPI_Datatype dtype; + MPI_Comm comm; + MPI_Op op; + NBC_Handle *handle; + + /* this is the only MPI-2 we need :-( */ + dtype = MPI_Type_f2c(*datatype); + comm = MPI_Comm_f2c(*fcomm); + op = MPI_Op_f2c(*fop); + + /* create a new handle in handle table */ + NBC_Create_fortran_handle(fhandle, &handle); + + /* call NBC function */ + *ierr = NBC_Iallreduce(sendbuf, recvbuf, *count, dtype, op, comm, handle); +} + +#ifdef __cplusplus +} +#endif diff --git a/ompi/mca/coll/libnbc/nbc_ialltoall.c b/ompi/mca/coll/libnbc/nbc_ialltoall.c new file mode 100644 index 0000000000..72255a25bf --- /dev/null +++ b/ompi/mca/coll/libnbc/nbc_ialltoall.c @@ -0,0 +1,358 @@ +/* + * Copyright (c) 2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2006 The Technical University of Chemnitz. All + * rights reserved. + * + * Author(s): Torsten Hoefler + * + */ +#include "nbc_internal.h" + +static inline int a2a_sched_linear(int rank, int p, MPI_Aint sndext, MPI_Aint rcvext, NBC_Schedule *schedule, void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm); +static inline int a2a_sched_pairwise(int rank, int p, MPI_Aint sndext, MPI_Aint rcvext, NBC_Schedule *schedule, void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm); +static inline int a2a_sched_diss(int rank, int p, MPI_Aint sndext, MPI_Aint rcvext, NBC_Schedule* schedule, void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm, NBC_Handle *handle); + +#ifdef NBC_CACHE_SCHEDULE +/* tree comparison function for schedule cache */ +int NBC_Alltoall_args_compare(NBC_Alltoall_args *a, NBC_Alltoall_args *b, void *param) { + + if( (a->sendbuf == b->sendbuf) && + (a->sendcount == b->sendcount) && + (a->sendtype == b->sendtype) && + (a->recvbuf == b->recvbuf) && + (a->recvcount == b->recvcount) && + (a->recvtype == b->recvtype) ) { + return 0; + } + if( a->sendbuf < b->sendbuf ) { + return -1; + } + return +1; +} +#endif + +#ifdef HAVE_SYS_WEAK_ALIAS_PRAGMA +#pragma weak NBC_Ialltoall=PNBC_Ialltoall +#define NBC_Ialltoall PNBC_Ialltoall +#endif + +/* simple linear MPI_Ialltoall the (simple) algorithm just sends to all nodes */ +int NBC_Ialltoall(void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm, NBC_Handle *handle) { + int rank, p, res, a2asize, sndsize, datasize; + NBC_Schedule *schedule; + MPI_Aint rcvext, sndext; +#ifdef NBC_CACHE_SCHEDULE + NBC_Alltoall_args *args, *found, search; +#endif + char *rbuf, *sbuf, inplace; + enum {NBC_A2A_LINEAR, NBC_A2A_PAIRWISE, NBC_A2A_DISS} alg; + + NBC_IN_PLACE(sendbuf, recvbuf, inplace); + + res = NBC_Init_handle(handle, comm); + if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } + res = MPI_Comm_rank(comm, &rank); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } + res = MPI_Comm_size(comm, &p); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_size() (%i)\n", res); return res; } + res = MPI_Type_extent(sendtype, &sndext); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } + res = MPI_Type_extent(recvtype, &rcvext); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } + res = MPI_Type_size(sendtype, &sndsize); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_size() (%i)\n", res); return res; } + + /* algorithm selection */ + a2asize = sndsize*sendcount*p; + /* this number is optimized for TCP on odin.cs.indiana.edu */ + if((p <= 8) && ((a2asize < 1<<17) || (sndsize*sendcount < 1<<12))) { + /* just send as fast as we can if we have less than 8 peers, if the + * total communicated size is smaller than 1<<17 *and* if we don't + * have eager messages (msgsize < 1<<13) */ + alg = NBC_A2A_LINEAR; + } else if(a2asize < (1<<12)*p) { + /*alg = NBC_A2A_DISS;*/ + alg = NBC_A2A_LINEAR; + } else + alg = NBC_A2A_LINEAR; /*NBC_A2A_PAIRWISE;*/ + + if(!inplace) { + /* copy my data to receive buffer */ + rbuf = ((char *)recvbuf) + (rank*recvcount*rcvext); + sbuf = ((char *)sendbuf) + (rank*sendcount*sndext); + res = NBC_Copy(sbuf, sendcount, sendtype, rbuf, recvcount, recvtype, comm); + if (NBC_OK != res) { printf("Error in NBC_Copy() (%i)\n", res); return res; } + } + + /* allocate temp buffer if we need one */ + if(alg == NBC_A2A_DISS) { + /* only A2A_DISS needs buffers */ + if(NBC_Type_intrinsic(sendtype)) { + datasize = sndext*sendcount; + } else { + res = MPI_Pack_size(sendcount, sendtype, comm, &datasize); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Pack_size() (%i)\n", res); return res; } + } + /* allocate temporary buffers */ + if(p % 2 == 0) { + handle->tmpbuf=malloc(datasize*p*2); + } else { + /* we cannot divide p by two, so alloc more to be safe ... */ + handle->tmpbuf=malloc(datasize*(p/2+1)*2*2); + } + + /* phase 1 - rotate n data blocks upwards into the tmpbuffer */ + if(NBC_Type_intrinsic(sendtype)) { + /* contiguous - just copy (1st copy) */ + memcpy(handle->tmpbuf, (char*)sendbuf+datasize*rank, datasize*(p-rank)); + if(rank != 0) memcpy((char*)handle->tmpbuf+datasize*(p-rank), sendbuf, datasize*(rank)); + } else { + int pos=0; + + /* non-contiguous - pack */ + res = MPI_Pack((char*)sendbuf+rank*sendcount*sndext, (p-rank)*sendcount, sendtype, handle->tmpbuf, (p-rank)*datasize, &pos, comm); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Pack() (%i)\n", res); return res; } + if(rank != 0) { + pos = 0; + MPI_Pack(sendbuf, rank*sendcount, sendtype, (char*)handle->tmpbuf+datasize*(p-rank), rank*datasize, &pos, comm); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Pack() (%i)\n", res); return res; } + } + } + } else { + handle->tmpbuf=NULL; + } + +#ifdef NBC_CACHE_SCHEDULE + /* search schedule in communicator specific tree */ + search.sendbuf=sendbuf; + search.sendcount=sendcount; + search.sendtype=sendtype; + search.recvbuf=recvbuf; + search.recvcount=recvcount; + search.recvtype=recvtype; + found = (NBC_Alltoall_args*)hb_tree_search((hb_tree*)handle->comminfo->NBC_Dict[NBC_ALLTOALL], &search); + if(found == NULL) { +#endif + /* not found - generate new schedule */ + schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule)); + if (NULL == schedule) { printf("Error in malloc()\n"); return res; } + + res = NBC_Sched_create(schedule); + if(res != NBC_OK) { printf("Error in NBC_Sched_create (%i)\n", res); return res; } + + switch(alg) { + case NBC_A2A_LINEAR: + res = a2a_sched_linear(rank, p, sndext, rcvext, schedule, sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm); + break; + case NBC_A2A_DISS: + res = a2a_sched_diss(rank, p, sndext, rcvext, schedule, sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm, handle); + break; + case NBC_A2A_PAIRWISE: + res = a2a_sched_pairwise(rank, p, sndext, rcvext, schedule, sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm); + break; + } + + if (NBC_OK != res) { return res; } + + res = NBC_Sched_commit(schedule); + if (NBC_OK != res) { printf("Error in NBC_Sched_commit() (%i)\n", res); return res; } + +#ifdef NBC_CACHE_SCHEDULE + /* save schedule to tree */ + args = (NBC_Alltoall_args*)malloc(sizeof(NBC_Alltoall_args)); + args->sendbuf=sendbuf; + args->sendcount=sendcount; + args->sendtype=sendtype; + args->recvbuf=recvbuf; + args->recvcount=recvcount; + args->recvtype=recvtype; + args->schedule=schedule; + res = hb_tree_insert ((hb_tree*)handle->comminfo->NBC_Dict[NBC_ALLTOALL], args, args, 0); + if(res != 0) printf("error in dict_insert() (%i)\n", res); + /* increase number of elements for A2A */ + if(++handle->comminfo->NBC_Dict_size[NBC_ALLTOALL] > NBC_SCHED_DICT_UPPER) { + NBC_SchedCache_dictwipe((hb_tree*)handle->comminfo->NBC_Dict[NBC_ALLTOALL], &handle->comminfo->NBC_Dict_size[NBC_ALLTOALL]); + /*if(!rank) printf("[%i] removing %i elements - new size: %i \n", rank, SCHED_DICT_UPPER-SCHED_DICT_LOWER, handle->comminfo->NBC_Alltoall_size);*/ + } + /*if(!rank) printf("[%i] added new schedule to tree - number %i\n", rank, handle->comminfo->NBC_Dict_size[NBC_ALLTOALL]);*/ + } else { + /* found schedule */ + schedule=found->schedule; + } +#endif + + res = NBC_Start(handle, schedule); + if (NBC_OK != res) { printf("Error in NBC_Start() (%i)\n", res); return res; } + + return NBC_OK; +} + +static inline int a2a_sched_pairwise(int rank, int p, MPI_Aint sndext, MPI_Aint rcvext, NBC_Schedule* schedule, void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm) { + int res, r, sndpeer, rcvpeer; + char *rbuf, *sbuf; + + res = NBC_OK; + if(p < 2) return res; + + for(r=1;rtmpbuf+datasize*p; + stmpbuf = (char*)handle->tmpbuf+datasize*(p+p/2); + } else { + /* we cannot divide p by two, so alloc more to be safe ... */ + virtp = (p/2+1)*2; + rtmpbuf = (char*)handle->tmpbuf+datasize*p; + stmpbuf = (char*)handle->tmpbuf+datasize*(p+virtp/2); + } + + /* phase 2 - communicate */ + /*printf("[%i] temp buffer is at %lu of size %i, maxround: %i\n", rank, (unsigned long)handle->tmpbuf, (int)datasize*p*(1<tmpbuf, true, datasize, MPI_BYTE, schedule); + offset += datasize; + } + } + + speer = ( rank + r) % p; + /* add p because modulo does not work with negative values */ + rpeer = ((rank - r)+p) % p; + + /*printf("[%i] receiving %i bytes from host %i into rbuf %lu\n", rank, offset, rpeer, (unsigned long)rtmpbuf);*/ + res = NBC_Sched_recv(rtmpbuf-(unsigned long)handle->tmpbuf, true, offset, MPI_BYTE, rpeer, schedule); + if (NBC_OK != res) { printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } + + /*printf("[%i] sending %i bytes to host %i from sbuf %lu\n", rank, offset, speer, (unsigned long)stmpbuf);*/ + res = NBC_Sched_send(stmpbuf-(unsigned long)handle->tmpbuf, true, offset, MPI_BYTE, speer, schedule); + if (NBC_OK != res) { printf("Error in NBC_Sched_send() (%i)\n", res); return res; } + + res = NBC_Sched_barrier(schedule); + if (NBC_OK != res) { printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + + /* unpack from buffer */ + offset = 0; + for(i=1; itmpbuf, true, datasize, MPI_BYTE, (void*)(long)(i*datasize), true, datasize, MPI_BYTE, schedule); + offset += datasize; + } + } + } + + /* phase 3 - reorder - data is now in wrong order in handle->tmpbuf - + * reorder it into recvbuf */ + for(i=0; i + * + */ +#include "nbc_internal.h" + +/* an alltoallv schedule can not be cached easily because the contents + * ot the recvcounts array may change, so a comparison of the address + * would not be sufficient ... we simply do not cache it */ + +#ifdef HAVE_SYS_WEAK_ALIAS_PRAGMA +#pragma weak NBC_Ialltoallv=PNBC_Ialltoallv +#define NBC_Ialltoallv PNBC_Ialltoallv +#endif + +/* simple linear Alltoallv */ +int NBC_Ialltoallv(void* sendbuf, int *sendcounts, int *sdispls, + MPI_Datatype sendtype, void* recvbuf, int *recvcounts, int *rdispls, + MPI_Datatype recvtype, MPI_Comm comm, NBC_Handle* handle) { + + int rank, p, res, i; + MPI_Aint sndext, rcvext; + NBC_Schedule *schedule; + char *rbuf, *sbuf, inplace; + + NBC_IN_PLACE(sendbuf, recvbuf, inplace); + + res = NBC_Init_handle(handle, comm); + if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } + res = MPI_Comm_rank(comm, &rank); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } + res= MPI_Comm_size(comm, &p); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_size() (%i)\n", res); return res; } + res = MPI_Type_extent(sendtype, &sndext); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } + res = MPI_Type_extent(recvtype, &rcvext); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } + + schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule)); + if (NULL == schedule) { printf("Error in malloc() (%i)\n", res); return res; } + + handle->tmpbuf=NULL; + + res = NBC_Sched_create(schedule); + if(res != NBC_OK) { printf("Error in NBC_Sched_create (%i)\n", res); return res; } + + /* copy data to receivbuffer */ + if((sendcounts[rank] != 0) && !inplace) { + rbuf = ((char *) recvbuf) + (rdispls[rank] * rcvext); + sbuf = ((char *) sendbuf) + (sdispls[rank] * sndext); + res = NBC_Copy(sbuf, sendcounts[rank], sendtype, rbuf, recvcounts[rank], recvtype, comm); + if (NBC_OK != res) { printf("Error in NBC_Copy() (%i)\n", res); return res; } + } + + for (i = 0; i < p; i++) { + if (i == rank) { continue; } + /* post all sends */ + if(sendcounts[i] != 0) { + sbuf = ((char *) sendbuf) + (sdispls[i] * sndext); + res = NBC_Sched_send(sbuf, false, sendcounts[i], sendtype, i, schedule); + if (NBC_OK != res) { printf("Error in NBC_Sched_send() (%i)\n", res); return res; } + } + /* post all receives */ + if(recvcounts[i] != 0) { + rbuf = ((char *) recvbuf) + (rdispls[i] * rcvext); + res = NBC_Sched_recv(rbuf, false, recvcounts[i], recvtype, i, schedule); + if (NBC_OK != res) { printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } + } + } + + /*NBC_PRINT_SCHED(*schedule);*/ + + res = NBC_Sched_commit(schedule); + if (NBC_OK != res) { printf("Error in NBC_Sched_commit() (%i)\n", res); return res; } + + res = NBC_Start(handle, schedule); + if (NBC_OK != res) { printf("Error in NBC_Start() (%i)\n", res); return res; } + + return NBC_OK; +} + + +#ifdef __cplusplus +extern "C" { +#endif +/* Fortran bindings */ +#ifdef HAVE_SYS_WEAK_ALIAS_PRAGMA +NBC_F77_ALLFUNC_(nbc_ialltoallv,NBC_IALLTOALLV,(void *sendbuf, int *sendcounts, int *sdispls, int *sendtype, + void *recvbuf, int *recvcounts, int *rdispls, int *recvtype, int *fcomm, int *fhandle, int *ierr)); +#pragma weak NBC_IALLTOALLV = nbc_ialltoallv_f +#pragma weak nbc_ialltoallv = nbc_ialltoallv_f +#pragma weak nbc_ialltoallv_ = nbc_ialltoallv_f +#pragma weak nbc_ialltoallv__ = nbc_ialltoallv_f +#pragma weak PNBC_IALLTOALLV = nbc_ialltoallv_f +#pragma weak pnbc_ialltoallv = nbc_ialltoallv_f +#pragma weak pnbc_ialltoallv_ = nbc_ialltoallv_f +#pragma weak pnbc_ialltoallv__ = nbc_ialltoallv_f +void nbc_ialltoallv_f(void *sendbuf, int *sendcounts, int *sdispls, int *sendtype, + void *recvbuf, int *recvcounts, int *rdispls, int *recvtype, int *fcomm, int *fhandle, int *ierr) { +#else +void NBC_F77_FUNC_(nbc_ialltoallv,NBC_IALLTOALLV)(void *sendbuf, int *sendcounts, int *sdispls, int *sendtype, + void *recvbuf, int *recvcounts, int *rdispls, int *recvtype, int *fcomm, int *fhandle, int *ierr); +void NBC_F77_FUNC_(nbc_ialltoallv,NBC_IALLTOALLV)(void *sendbuf, int *sendcounts, int *sdispls, int *sendtype, + void *recvbuf, int *recvcounts, int *rdispls, int *recvtype, int *fcomm, int *fhandle, int *ierr) { +#endif + MPI_Datatype rtype, stype; + MPI_Comm comm; + NBC_Handle *handle; + + /* this is the only MPI-2 we need :-( */ + rtype = MPI_Type_f2c(*recvtype); + stype = MPI_Type_f2c(*sendtype); + comm = MPI_Comm_f2c(*fcomm); + + /* create a new handle in handle table */ + NBC_Create_fortran_handle(fhandle, &handle); + + /* call NBC function */ + *ierr = NBC_Ialltoallv(sendbuf, sendcounts, sdispls, stype, recvbuf, recvcounts, rdispls, rtype, comm, handle); +} +#ifdef __cplusplus +} +#endif diff --git a/ompi/mca/coll/libnbc/nbc_ibarrier.c b/ompi/mca/coll/libnbc/nbc_ibarrier.c new file mode 100644 index 0000000000..b5324a74d9 --- /dev/null +++ b/ompi/mca/coll/libnbc/nbc_ibarrier.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2006 The Technical University of Chemnitz. All + * rights reserved. + * + * Author(s): Torsten Hoefler + * + */ +#include "nbc_internal.h" + +/* Dissemination implementation of MPI_Ibarrier */ +int ompi_coll_libnbc_ibarrier(struct ompi_communicator_t *comm, ompi_request_t ** request, + struct mca_coll_base_module_2_0_0_t *module) +{ + int round, rank, p, maxround, res, recvpeer, sendpeer; + NBC_Schedule *schedule; + NBC_Handle *handle; + ompi_coll_libnbc_request_t **coll_req = (ompi_coll_libnbc_request_t**) request; + ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; + + res = NBC_Init_handle(comm, coll_req, libnbc_module); + if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } + handle = (*coll_req); + + res = MPI_Comm_rank(comm, &rank); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } + res = MPI_Comm_size(comm, &p); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_size() (%i)\n", res); return res; } + + handle->tmpbuf=(void*)malloc(2*sizeof(char)); + +#ifdef NBC_CACHE_SCHEDULE + /* there only one argument set per communicator -> hang it directly at + * the tree-position, NBC_Dict_size[...] is 0 for not initialized and + * 1 for initialized. NBC_Dict[...] is a pointer to the schedule in + * this case */ + if(handle->comminfo->NBC_Dict_size[NBC_BARRIER] == 0) { + /* we did not init it yet */ +#endif + schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule)); + if (NULL == schedule) { printf("Error in malloc()\n"); return res; } + + round = -1; + res = NBC_Sched_create(schedule); + if(res != NBC_OK) { printf("Error in NBC_Sched_create (%i)\n", res); return res; } + + maxround = (int)ceil((log(p)/LOG2)-1); + + do { + round++; + sendpeer = (rank + (1<comminfo->NBC_Dict[NBC_BARRIER] = (hb_tree*)schedule; + handle->comminfo->NBC_Dict_size[NBC_BARRIER] = 1; + } else { + /* we found it */ + schedule = (NBC_Schedule*)handle->comminfo->NBC_Dict[NBC_BARRIER]; + } +#endif + + res = NBC_Start(handle, schedule); + if (NBC_OK != res) { printf("Error in NBC_Start() (%i)\n", res); return res; } + + return NBC_OK; +} diff --git a/ompi/mca/coll/libnbc/nbc_ibcast.c b/ompi/mca/coll/libnbc/nbc_ibcast.c new file mode 100644 index 0000000000..9114020aad --- /dev/null +++ b/ompi/mca/coll/libnbc/nbc_ibcast.c @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2006 The Technical University of Chemnitz. All + * rights reserved. + * + * Author(s): Torsten Hoefler + * + */ +#include "nbc_internal.h" + +static inline int bcast_sched_binomial(int rank, int p, int root, NBC_Schedule *schedule, void *buffer, int count, MPI_Datatype datatype); +static inline int bcast_sched_linear(int rank, int p, int root, NBC_Schedule *schedule, void *buffer, int count, MPI_Datatype datatype); +static inline int bcast_sched_chain(int rank, int p, int root, NBC_Schedule *schedule, void *buffer, int count, MPI_Datatype datatype, int fragsize, int size); + +#ifdef NBC_CACHE_SCHEDULE +/* tree comparison function for schedule cache */ +int NBC_Bcast_args_compare(NBC_Bcast_args *a, NBC_Bcast_args *b, void *param) { + + if( (a->buffer == b->buffer) && + (a->count == b->count) && + (a->datatype == b->datatype) && + (a->root == b->root) ) { + return 0; + } + if( a->buffer < b->buffer ) { + return -1; + } + return +1; +} +#endif + +#ifdef HAVE_SYS_WEAK_ALIAS_PRAGMA +#pragma weak NBC_Ibcast=PNBC_Ibcast +#define NBC_Ibcast PNBC_Ibcast +#endif + +int NBC_Ibcast(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm, NBC_Handle* handle) { + int rank, p, res, size, segsize; + NBC_Schedule *schedule; +#ifdef NBC_CACHE_SCHEDULE + NBC_Bcast_args *args, *found, search; +#endif + enum { NBC_BCAST_LINEAR, NBC_BCAST_BINOMIAL, NBC_BCAST_CHAIN } alg; + + res = NBC_Init_handle(handle, comm); + if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } + res = MPI_Comm_rank(comm, &rank); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } + res = MPI_Comm_size(comm, &p); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } + res = MPI_Type_size(datatype, &size); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_size() (%i)\n", res); return res; } + + segsize = 16384; + /* algorithm selection */ + if(p <= 4) { + alg = NBC_BCAST_LINEAR; + } else if(size*count < 65536) { + alg = NBC_BCAST_BINOMIAL; + } else if(size*count < 524288) { + alg = NBC_BCAST_CHAIN; + segsize = 16384/2; + } else { + alg = NBC_BCAST_CHAIN; + segsize = 65536/2; + } + + handle->tmpbuf=NULL; + +#ifdef NBC_CACHE_SCHEDULE + /* search schedule in communicator specific tree */ + search.buffer=buffer; + search.count=count; + search.datatype=datatype; + search.root=root; + found = (NBC_Bcast_args*)hb_tree_search((hb_tree*)handle->comminfo->NBC_Dict[NBC_BCAST], &search); + if(found == NULL) { +#endif + schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule)); + + res = NBC_Sched_create(schedule); + if(res != NBC_OK) { printf("Error in NBC_Sched_create, res = %i\n", res); return res; } + + switch(alg) { + case NBC_BCAST_LINEAR: + res = bcast_sched_linear(rank, p, root, schedule, buffer, count, datatype); + break; + case NBC_BCAST_BINOMIAL: + res = bcast_sched_binomial(rank, p, root, schedule, buffer, count, datatype); + break; + case NBC_BCAST_CHAIN: + res = bcast_sched_chain(rank, p, root, schedule, buffer, count, datatype, segsize, size); + break; + } + if (NBC_OK != res) { printf("Error in Schedule creation() (%i)\n", res); return res; } + + res = NBC_Sched_commit(schedule); + if (NBC_OK != res) { printf("Error in NBC_Sched_commit() (%i)\n", res); return res; } +#ifdef NBC_CACHE_SCHEDULE + /* save schedule to tree */ + args = (NBC_Bcast_args*)malloc(sizeof(NBC_Bcast_args)); + args->buffer=buffer; + args->count=count; + args->datatype=datatype; + args->root=root; + args->schedule=schedule; + res = hb_tree_insert ((hb_tree*)handle->comminfo->NBC_Dict[NBC_BCAST], args, args, 0); + if(res != 0) printf("error in dict_insert() (%i)\n", res); + /* increase number of elements for A2A */ + if(++handle->comminfo->NBC_Dict_size[NBC_BCAST] > NBC_SCHED_DICT_UPPER) { + NBC_SchedCache_dictwipe((hb_tree*)handle->comminfo->NBC_Dict[NBC_BCAST], &handle->comminfo->NBC_Dict_size[NBC_BCAST]); + } + } else { + /* found schedule */ + schedule=found->schedule; + } +#endif + + res = NBC_Start(handle, schedule); + if (NBC_OK != res) { printf("Error in NBC_Start() (%i)\n", res); return res; } + + return NBC_OK; +} + +/* better binomial bcast + * working principle: + * - each node gets a virtual rank vrank + * - the 'root' node get vrank 0 + * - node 0 gets the vrank of the 'root' + * - all other ranks stay identical (they do not matter) + * + * Algorithm: + * - each node with vrank > 2^r and vrank < 2^r+1 receives from node + * vrank - 2^r (vrank=1 receives from 0, vrank 0 receives never) + * - each node sends each round r to node vrank + 2^r + * - a node stops to send if 2^r > commsize + */ +#define RANK2VRANK(rank, vrank, root) \ +{ \ + vrank = rank; \ + if (rank == 0) vrank = root; \ + if (rank == root) vrank = 0; \ +} +#define VRANK2RANK(rank, vrank, root) \ +{ \ + rank = vrank; \ + if (vrank == 0) rank = root; \ + if (vrank == root) rank = 0; \ +} +static inline int bcast_sched_binomial(int rank, int p, int root, NBC_Schedule *schedule, void *buffer, int count, MPI_Datatype datatype) { + int maxr, vrank, peer, r, res; + + maxr = (int)ceil((log(p)/LOG2)); + + RANK2VRANK(rank, vrank, root); + + /* receive from the right hosts */ + if(vrank != 0) { + for(r=0; r= (1< + * + */ +#include "nbc_internal.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int NBC_Ibcast_inter(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm, NBC_Handle* handle) { + int rank, p, res, size, segsize, peer; + NBC_Schedule *schedule; + + res = NBC_Init_handle(handle, comm); + if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } + res = MPI_Comm_rank(comm, &rank); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } + res = MPI_Comm_size(comm, &p); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } + res = MPI_Type_size(datatype, &size); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_size() (%i)\n", res); return res; } + + handle->tmpbuf=NULL; + + schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule)); + + res = NBC_Sched_create(schedule); + if(res != NBC_OK) { printf("Error in NBC_Sched_create, res = %i\n", res); return res; } + + if(root != MPI_PROC_NULL) { + /* send to all others */ + if(root == MPI_ROOT) { + int remsize; + + res = MPI_Comm_remote_size(comm, &remsize); + if(MPI_SUCCESS != res) { printf("MPI_Comm_remote_size() failed\n", res); return res; } + + for (peer=0;peer + * + */ +#include "nbc_internal.h" + +#ifdef NBC_CACHE_SCHEDULE +/* tree comparison function for schedule cache */ +int NBC_Gather_args_compare(NBC_Gather_args *a, NBC_Gather_args *b, void *param) { + + if( (a->sendbuf == b->sendbuf) && + (a->sendcount == b->sendcount) && + (a->sendtype == b->sendtype) && + (a->recvbuf == b->recvbuf) && + (a->recvcount == b->recvcount) && + (a->recvtype == b->recvtype) && + (a->root == b->root) ) { + return 0; + } + if( a->sendbuf < b->sendbuf ) { + return -1; + } + return +1; +} +#endif + +#ifdef HAVE_SYS_WEAK_ALIAS_PRAGMA +#pragma weak NBC_Igather=PNBC_Igather +#define NBC_Igather PNBC_Igather +#endif + +int NBC_Igather(void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm, NBC_Handle* handle) { + int rank, p, res, i; + MPI_Aint rcvext; + NBC_Schedule *schedule; + char *rbuf, inplace; +#ifdef NBC_CACHE_SCHEDULE + NBC_Gather_args *args, *found, search; +#endif + + NBC_IN_PLACE(sendbuf, recvbuf, inplace); + + res = NBC_Init_handle(handle, comm); + if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } + res = MPI_Comm_rank(comm, &rank); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } + res = MPI_Comm_size(comm, &p); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } + res = MPI_Type_extent(recvtype, &rcvext); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } + + handle->tmpbuf = NULL; + + if((rank == root) && (!inplace)) { + rbuf = ((char *)recvbuf) + (rank*recvcount*rcvext); + /* if I am the root - just copy the message (only without MPI_IN_PLACE) */ + res = NBC_Copy(sendbuf, sendcount, sendtype, rbuf, recvcount, recvtype, comm); + if (NBC_OK != res) { printf("Error in NBC_Copy() (%i)\n", res); return res; } + } + +#ifdef NBC_CACHE_SCHEDULE + /* search schedule in communicator specific tree */ + search.sendbuf=sendbuf; + search.sendcount=sendcount; + search.sendtype=sendtype; + search.recvbuf=recvbuf; + search.recvcount=recvcount; + search.recvtype=recvtype; + search.root=root; + found = (NBC_Gather_args*)hb_tree_search((hb_tree*)handle->comminfo->NBC_Dict[NBC_GATHER], &search); + if(found == NULL) { +#endif + schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule)); + if (NULL == schedule) { printf("Error in malloc() (%i)\n", res); return res; } + + res = NBC_Sched_create(schedule); + if(res != NBC_OK) { printf("Error in NBC_Sched_create (%i)\n", res); return res; } + + /* send to root */ + if(rank != root) { + /* send msg to root */ + res = NBC_Sched_send(sendbuf, false, sendcount, sendtype, root, schedule); + if (NBC_OK != res) { printf("Error in NBC_Sched_send() (%i)\n", res); return res; } + } else { + for(i=0;isendbuf=sendbuf; + args->sendcount=sendcount; + args->sendtype=sendtype; + args->recvbuf=recvbuf; + args->recvcount=recvcount; + args->recvtype=recvtype; + args->root=root; + args->schedule=schedule; + res = hb_tree_insert ((hb_tree*)handle->comminfo->NBC_Dict[NBC_GATHER], args, args, 0); + if(res != 0) printf("error in dict_insert() (%i)\n", res); + /* increase number of elements for A2A */ + if(++handle->comminfo->NBC_Dict_size[NBC_GATHER] > NBC_SCHED_DICT_UPPER) { + NBC_SchedCache_dictwipe((hb_tree*)handle->comminfo->NBC_Dict[NBC_GATHER], &handle->comminfo->NBC_Dict_size[NBC_GATHER]); + } + } else { + /* found schedule */ + schedule=found->schedule; + } +#endif + + res = NBC_Start(handle, schedule); + if (NBC_OK != res) { printf("Error in NBC_Start() (%i)\n", res); return res; } + + return NBC_OK; +} + +#ifdef __cplusplus +extern "C" { +#endif +#ifdef HAVE_SYS_WEAK_ALIAS_PRAGMA +NBC_F77_ALLFUNC_(nbc_igather,NBC_IGATHER,(void *sendbuf, int *sendcount, int *sendtype, void *recvbuf, int *recvcount, int *recvtype, int *root, int *fcomm, int *fhandle, int *ierr)); +#pragma weak NBC_IGATHER = nbc_igather_f +#pragma weak nbc_igather = nbc_igather_f +#pragma weak nbc_igather_ = nbc_igather_f +#pragma weak nbc_igather__ = nbc_igather_f +#pragma weak PNBC_IGATHER = nbc_igather_f +#pragma weak pnbc_igather = nbc_igather_f +#pragma weak pnbc_igather_ = nbc_igather_f +#pragma weak pnbc_igather__ = nbc_igather_f +void nbc_igather_f(void *sendbuf, int *sendcount, int *sendtype, void *recvbuf, int *recvcount, int *recvtype, int *root, int *fcomm, int *fhandle, int *ierr) { +#else +/* Fortran bindings */ +void NBC_F77_FUNC_(nbc_igather,NBC_IGATHER)(void *sendbuf, int *sendcount, int *sendtype, void *recvbuf, int *recvcount, int *recvtype, int *root, int *fcomm, int *fhandle, int *ierr); +void NBC_F77_FUNC_(nbc_igather,NBC_IGATHER)(void *sendbuf, int *sendcount, int *sendtype, void *recvbuf, int *recvcount, int *recvtype, int *root, int *fcomm, int *fhandle, int *ierr) { +#endif + MPI_Datatype rtype, stype; + MPI_Comm comm; + NBC_Handle *handle; + + /* this is the only MPI-2 we need :-( */ + rtype = MPI_Type_f2c(*recvtype); + stype = MPI_Type_f2c(*sendtype); + comm = MPI_Comm_f2c(*fcomm); + + /* create a new handle in handle table */ + NBC_Create_fortran_handle(fhandle, &handle); + + /* call NBC function */ + *ierr = NBC_Igather(sendbuf, *sendcount, stype, recvbuf, *recvcount, rtype, *root, comm, handle); +} +#ifdef __cplusplus +} +#endif diff --git a/ompi/mca/coll/libnbc/nbc_igatherv.c b/ompi/mca/coll/libnbc/nbc_igatherv.c new file mode 100644 index 0000000000..96d6d6ea74 --- /dev/null +++ b/ompi/mca/coll/libnbc/nbc_igatherv.c @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2006 The Technical University of Chemnitz. All + * rights reserved. + * + * Author(s): Torsten Hoefler + * + */ +#include "nbc_internal.h" + +/* an gatherv schedule can not be cached easily because the contents + * ot the recvcounts array may change, so a comparison of the address + * would not be sufficient ... we simply do not cache it */ + +#ifdef HAVE_SYS_WEAK_ALIAS_PRAGMA +#pragma weak NBC_Igatherv=PNBC_Igatherv +#define NBC_Igatherv PNBC_Igatherv +#endif +int NBC_Igatherv(void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int *recvcounts, int *displs, MPI_Datatype recvtype, int root, MPI_Comm comm, NBC_Handle* handle) { + int rank, p, res, i; + MPI_Aint rcvext; + NBC_Schedule *schedule; + char *rbuf, inplace; + + NBC_IN_PLACE(sendbuf, recvbuf, inplace); + + res = NBC_Init_handle(handle, comm); + if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } + res = MPI_Comm_rank(comm, &rank); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } + res = MPI_Comm_size(comm, &p); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } + res = MPI_Type_extent(recvtype, &rcvext); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } + + schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule)); + if (NULL == schedule) { printf("Error in malloc() (%i)\n", res); return res; } + + handle->tmpbuf=NULL; + + res = NBC_Sched_create(schedule); + if(res != NBC_OK) { printf("Error in NBC_Sched_create (%i)\n", res); return res; } + + /* send to root */ + if(rank != root) { + /* send msg to root */ + res = NBC_Sched_send(sendbuf, false, sendcount, sendtype, root, schedule); + if (NBC_OK != res) { printf("Error in NBC_Sched_send() (%i)\n", res); return res; } + } else { + for(i=0;i + * + */ +#ifndef __NBC_INTERNAL_H__ +#define __NBC_INTERNAL_H__ +#include "ompi_config.h" + +/*********************** LibNBC tuning parameters ************************/ + +/* the debug level */ +#define NBC_DLEVEL 0 + +/* use PMPI calls to MPI backend - this is needed because otherwise th + * output will be screwed up in the profiler ... but this can be disabled + * if the profiler does not profile NBC_ calls :) */ +#define USE_PMPI 1 + +/* enable schedule caching - undef NBC_CACHE_SCHEDULE to deactivate it */ +/* TODO: this whole schedule cache stuff does not work with the tmbuf + * :-( - first, the tmpbuf must not be freed if a schedule using it is + * still in the cache and second, the tmpbuf used by the schedule must + * be attached to the handle that uses this schedule !!!! + * I.E., THIS IS EXPERIMENTAL AND MIGHT NOT WORK */ +#define NBC_CACHE_SCHEDULE +#define NBC_SCHED_DICT_UPPER 1024 /* max. number of dict entries */ +#define NBC_SCHED_DICT_LOWER 512 /* nuber of dict entries after wipe, if SCHED_DICT_UPPER is reached */ + +/********************* end of LibNBC tuning parameters ************************/ + +/* correct fortran bindings */ +#define NBC_F77_FUNC_ F77_FUNC_ + +#include "mpi.h" + +#include "coll_libnbc.h" +#include "ompi/include/ompi/constants.h" +#include "ompi/request/request.h" + +#include "nbc.h" + +#include +#include +#include +#include +#include +#include +#include "libdict/dict.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* log(2) */ +#define LOG2 0.69314718055994530941 + +/* true/false */ +#define true 1 +#define false 0 + +/* all collectives */ +#define NBC_ALLGATHER 0 +#define NBC_ALLGATHERV 1 +#define NBC_ALLREDUCE 2 +#define NBC_ALLTOALL 3 +#define NBC_ALLTOALLV 4 +#define NBC_ALLTOALLW 5 +#define NBC_BARRIER 6 +#define NBC_BCAST 7 +#define NBC_EXSCAN 8 +#define NBC_GATHER 9 +#define NBC_GATHERV 10 +#define NBC_REDUCE 11 +#define NBC_REDUCESCAT 12 +#define NBC_SCAN 13 +#define NBC_SCATTER 14 +#define NBC_SCATTERV 15 +#define NBC_CART_SHIFT_XCHG 16 +#define NBC_NEIGHBOR_XCHG 17 +/* set the number of collectives in nbc.h !!!! */ + +/* several typedefs for NBC */ + +/* the function type enum */ +typedef enum { + SEND, + RECV, + OP, + COPY, + UNPACK +} NBC_Fn_type; + +/* the send argument struct */ +typedef struct { + void *buf; + char tmpbuf; + int count; + MPI_Datatype datatype; + int dest; +} NBC_Args_send; + +/* the receive argument struct */ +typedef struct { + void *buf; + char tmpbuf; + int count; + MPI_Datatype datatype; + int source; +} NBC_Args_recv; + +/* the operation argument struct */ +typedef struct { + void *buf1; + char tmpbuf1; + void *buf2; + char tmpbuf2; + void *buf3; + char tmpbuf3; + int count; + MPI_Op op; + MPI_Datatype datatype; +} NBC_Args_op; + +/* the copy argument struct */ +typedef struct { + void *src; + char tmpsrc; + int srccount; + MPI_Datatype srctype; + void *tgt; + char tmptgt; + int tgtcount; + MPI_Datatype tgttype; +} NBC_Args_copy; + +/* unpack operation arguments */ +typedef struct { + void *inbuf; + char tmpinbuf; + int count; + MPI_Datatype datatype; + void *outbuf; + char tmpoutbuf; +} NBC_Args_unpack; + +/* internal function prototypes */ +int NBC_Sched_create(NBC_Schedule* schedule); +int NBC_Sched_send(void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule); +int NBC_Sched_recv(void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule); +int NBC_Sched_op(void* buf3, char tmpbuf3, void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype, MPI_Op op, NBC_Schedule *schedule); +int NBC_Sched_copy(void *src, char tmpsrc, int srccount, MPI_Datatype srctype, void *tgt, char tmptgt, int tgtcount, MPI_Datatype tgttype, NBC_Schedule *schedule); +int NBC_Sched_unpack(void *inbuf, char tmpinbuf, int count, MPI_Datatype datatype, void *outbuf, char tmpoutbuf, NBC_Schedule *schedule); +int NBC_Sched_barrier(NBC_Schedule *schedule); +int NBC_Sched_commit(NBC_Schedule *schedule); + +#ifdef NBC_CACHE_SCHEDULE +/* this is a dummy structure which is used to get the schedule out of + * the collop sepcific structure. The schedule pointer HAS to be at the + * first position and should NOT BE REORDERED by the compiler (C + * guarantees that */ +struct NBC_dummyarg { + NBC_Schedule *schedule; +}; + +typedef struct { + NBC_Schedule *schedule; + void *sendbuf; + int sendcount; + MPI_Datatype sendtype; + void* recvbuf; + int recvcount; + MPI_Datatype recvtype; +} NBC_Alltoall_args; +int NBC_Alltoall_args_compare(NBC_Alltoall_args *a, NBC_Alltoall_args *b, void *param); + +typedef struct { + NBC_Schedule *schedule; + void *sendbuf; + int sendcount; + MPI_Datatype sendtype; + void* recvbuf; + int recvcount; + MPI_Datatype recvtype; +} NBC_Allgather_args; +int NBC_Allgather_args_compare(NBC_Allgather_args *a, NBC_Allgather_args *b, void *param); + +typedef struct { + NBC_Schedule *schedule; + void *sendbuf; + void* recvbuf; + int count; + MPI_Datatype datatype; + MPI_Op op; +} NBC_Allreduce_args; +int NBC_Allreduce_args_compare(NBC_Allreduce_args *a, NBC_Allreduce_args *b, void *param); + +typedef struct { + NBC_Schedule *schedule; + void *buffer; + int count; + MPI_Datatype datatype; + int root; +} NBC_Bcast_args; +int NBC_Bcast_args_compare(NBC_Bcast_args *a, NBC_Bcast_args *b, void *param); + +typedef struct { + NBC_Schedule *schedule; + void *sendbuf; + int sendcount; + MPI_Datatype sendtype; + void* recvbuf; + int recvcount; + MPI_Datatype recvtype; + int root; +} NBC_Gather_args; +int NBC_Gather_args_compare(NBC_Gather_args *a, NBC_Gather_args *b, void *param); + +typedef struct { + NBC_Schedule *schedule; + void *sendbuf; + void* recvbuf; + int count; + MPI_Datatype datatype; + MPI_Op op; + int root; +} NBC_Reduce_args; +int NBC_Reduce_args_compare(NBC_Reduce_args *a, NBC_Reduce_args *b, void *param); + +typedef struct { + NBC_Schedule *schedule; + void *sendbuf; + void* recvbuf; + int count; + MPI_Datatype datatype; + MPI_Op op; +} NBC_Scan_args; +int NBC_Scan_args_compare(NBC_Scan_args *a, NBC_Scan_args *b, void *param); + +typedef struct { + NBC_Schedule *schedule; + void *sendbuf; + int sendcount; + MPI_Datatype sendtype; + void* recvbuf; + int recvcount; + MPI_Datatype recvtype; + int root; +} NBC_Scatter_args; +int NBC_Scatter_args_compare(NBC_Scatter_args *a, NBC_Scatter_args *b, void *param); + +typedef struct { + NBC_Schedule *schedule; + void *sbuf; + int scount; + MPI_Datatype stype; + void *rbuf; + int rcount; + MPI_Datatype rtype; + int direction; + int disp; + MPI_Comm comm; +} NBC_Icart_shift_xchg_args; +int NBC_Icart_shift_xchg_args_compare(NBC_Icart_shift_xchg_args *a, NBC_Icart_shift_xchg_args *b, void *param); + +typedef struct { + NBC_Schedule *schedule; + void *sbuf; + int scount; + MPI_Datatype stype; + void *rbuf; + int rcount; + MPI_Datatype rtype; + MPI_Comm comm; +} NBC_Ineighbor_xchg_args; +int NBC_Ineighbor_xchg_args_compare(NBC_Ineighbor_xchg_args *a, NBC_Ineighbor_xchg_args *b, void *param); + +/* Schedule cache structures/functions */ +u_int32_t adler32(u_int32_t adler, int8_t *buf, int len); +void NBC_SchedCache_args_delete(void *entry); +void NBC_SchedCache_args_delete_key_dummy(void *k); + +#endif + + +int NBC_Progress(NBC_Handle *handle); +int NBC_Start(NBC_Handle *handle, NBC_Schedule *schedule); +int NBC_Init_handle(struct ompi_communicator_t *comm, ompi_coll_libnbc_request_t **request, ompi_coll_libnbc_module_t *module); +static inline int NBC_Type_intrinsic(MPI_Datatype type); +static inline int NBC_Copy(void *src, int srccount, MPI_Datatype srctype, void *tgt, int tgtcount, MPI_Datatype tgttype, MPI_Comm comm); +NBC_Comminfo* NBC_Init_comm(MPI_Comm comm); +int NBC_Create_fortran_handle(int *fhandle, NBC_Handle **handle); + +/* some macros */ + +/* a schedule has the following format: + * [schedule] ::= [size][round-schedule][delimiter][round-schedule][delimiter]...[end] + * [size] ::= size of the schedule (int) + * [round-schedule] ::= [num][type][type-args][type][type-args]... + * [num] ::= number of elements in round (int) + * [type] ::= function type (NBC_Fn_type) + * [type-args] ::= type specific arguments (NBC_Args_send, NBC_Args_recv or, NBC_Args_op) + * [delimiter] ::= 1 (char) - indicates that a round follows + * [end] ::= 0 (char) - indicates that this is the last round + */ + +/* NBC_GET_ROUND_SIZE returns the size in bytes of a round of a NBC_Schedule + * schedule. A round has the format: + * [num]{[type][type-args]} + * e.g. [(int)2][(NBC_Fn_type)SEND][(NBC_Args_send)SEND-ARGS][(NBC_Fn_type)RECV][(NBC_Args_recv)RECV-ARGS] */ +#define NBC_GET_ROUND_SIZE(schedule, size) \ + { \ + int *numptr; \ + NBC_Fn_type *typeptr; \ + int i; \ + \ + numptr = (int*)schedule; \ + /*NBC_DEBUG(10, "GET_ROUND_SIZE got %i elements\n", *numptr); */\ + /* end is increased by sizeof(int) bytes to point to type */ \ + typeptr = (NBC_Fn_type*)((int*)(schedule)+1); \ + for (i=0; i<*numptr; i++) { \ + /* go sizeof op-data forward */ \ + switch(*typeptr) { \ + case SEND: \ + /*printf("found a SEND at offset %i\n", (int)typeptr-(int)schedule); */\ + typeptr = (NBC_Fn_type*)((NBC_Args_send*)typeptr+1); \ + break; \ + case RECV: \ + /*printf("found a RECV at offset %i\n", (int)typeptr-(int)schedule); */\ + typeptr = (NBC_Fn_type*)((NBC_Args_recv*)typeptr+1); \ + break; \ + case OP: \ + /*printf("found a OP at offset %i\n", (int)typeptr-(int)schedule); */\ + typeptr = (NBC_Fn_type*)((NBC_Args_op*)typeptr+1); \ + break; \ + case COPY: \ + /*printf("found a COPY at offset %i\n", (int)typeptr-(int)schedule); */\ + typeptr = (NBC_Fn_type*)((NBC_Args_copy*)typeptr+1); \ + break; \ + case UNPACK: \ + /*printf("found a UNPACK at offset %i\n", (int)typeptr-(int)schedule); */\ + typeptr = (NBC_Fn_type*)((NBC_Args_unpack*)typeptr+1); \ + break; \ + default: \ + printf("NBC_GET_ROUND_SIZE: bad type %li at offset %li\n", (long)*typeptr, (long)typeptr-(long)schedule); \ + return NBC_BAD_SCHED; \ + } \ + /* increase ptr by size of fn_type enum */ \ + typeptr = (NBC_Fn_type*)((NBC_Fn_type*)typeptr+1); \ + } \ + /* this could be optimized if typeptr would be used directly */ \ + size = (long)typeptr-(long)schedule; \ + } + +/* returns the size of a schedule in bytes */ +#define NBC_GET_SIZE(schedule, size) \ +{ \ + size=*(int*)schedule; \ +} + +/* increase the size of a schedule by size bytes */ +#define NBC_INC_SIZE(schedule, size) \ +{ \ + *(int*)schedule+=size; \ +} + +/* increments the number of operations in the last round */ +#define NBC_INC_NUM_ROUND(schedule) \ +{ \ + int total_size; \ + long round_size; \ + char *ptr, *lastround; \ + \ + NBC_GET_SIZE(schedule, total_size); \ + \ + /* ptr begins at first round (first int is overall size) */ \ + ptr = (char*)((char*)schedule+sizeof(int)); \ + lastround = ptr; \ + while ((long)ptr-(long)schedule < total_size) { \ + NBC_GET_ROUND_SIZE(ptr, round_size); \ + /*printf("got round size %i\n", round_size);*/ \ + lastround = ptr; \ + /* add round size */ \ + ptr=ptr+round_size; \ + /* add sizeof(char) as barrier delimiter */ \ + ptr=ptr+sizeof(char); \ + /*printf("(int)ptr-(int)schedule=%i, size=%i\n", (int)ptr-(int)schedule, size); */\ + } \ + /*printf("lastround count is at offset: %i\n", (int)lastround-(int)schedule);*/ \ + /* this is the count in the last round of the schedule */ \ + (*(int*)lastround)++; \ +} + +/* NBC_PRINT_ROUND prints a round in a schedule. A round has the format: + * [num]{[op][op-data]} types: [int]{[enum][op-type]} + * e.g. [2][SEND][SEND-ARGS][RECV][RECV-ARGS] */ +#define NBC_PRINT_ROUND(schedule) \ + { \ + int myrank, *numptr; \ + NBC_Fn_type *typeptr; \ + NBC_Args_send *sendargs; \ + NBC_Args_recv *recvargs; \ + NBC_Args_op *opargs; \ + NBC_Args_copy *copyargs; \ + NBC_Args_unpack *unpackargs; \ + int i; \ + \ + numptr = (int*)schedule; \ + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); \ + printf("has %i actions: \n", *numptr); \ + /* end is increased by sizeof(int) bytes to point to type */ \ + typeptr = (NBC_Fn_type*)((int*)(schedule)+1); \ + for (i=0; i<*numptr; i++) { \ + /* go sizeof op-data forward */ \ + switch(*typeptr) { \ + case SEND: \ + printf("[%i] SEND (offset %li) ", myrank, (long)typeptr-(long)schedule); \ + sendargs = (NBC_Args_send*)(typeptr+1); \ + printf("*buf: %lu, count: %i, type: %lu, dest: %i)\n", (unsigned long)sendargs->buf, sendargs->count, (unsigned long)sendargs->datatype, sendargs->dest); \ + typeptr = (NBC_Fn_type*)((NBC_Args_send*)typeptr+1); \ + break; \ + case RECV: \ + printf("[%i] RECV (offset %li) ", myrank, (long)typeptr-(long)schedule); \ + recvargs = (NBC_Args_recv*)(typeptr+1); \ + printf("*buf: %lu, count: %i, type: %lu, source: %i)\n", (unsigned long)recvargs->buf, recvargs->count, (unsigned long)recvargs->datatype, recvargs->source); \ + typeptr = (NBC_Fn_type*)((NBC_Args_recv*)typeptr+1); \ + break; \ + case OP: \ + printf("[%i] OP (offset %li) ", myrank, (long)typeptr-(long)schedule); \ + opargs = (NBC_Args_op*)(typeptr+1); \ + printf("*buf1: %lu, buf2: %lu, count: %i, type: %lu)\n", (unsigned long)opargs->buf1, (unsigned long)opargs->buf2, opargs->count, (unsigned long)opargs->datatype); \ + typeptr = (NBC_Fn_type*)((NBC_Args_op*)typeptr+1); \ + break; \ + case COPY: \ + printf("[%i] COPY (offset %li) ", myrank, (long)typeptr-(long)schedule); \ + copyargs = (NBC_Args_copy*)(typeptr+1); \ + printf("*src: %lu, srccount: %i, srctype: %lu, *tgt: %lu, tgtcount: %i, tgttype: %lu)\n", (unsigned long)copyargs->src, copyargs->srccount, (unsigned long)copyargs->srctype, (unsigned long)copyargs->tgt, copyargs->tgtcount, (unsigned long)copyargs->tgttype); \ + typeptr = (NBC_Fn_type*)((NBC_Args_copy*)typeptr+1); \ + break; \ + case UNPACK: \ + printf("[%i] UNPACK (offset %li) ", myrank, (long)typeptr-(long)schedule); \ + unpackargs = (NBC_Args_unpack*)(typeptr+1); \ + printf("*src: %lu, srccount: %i, srctype: %lu, *tgt: %lu\n",(unsigned long)unpackargs->inbuf, unpackargs->count, (unsigned long)unpackargs->datatype, (unsigned long)unpackargs->outbuf); \ + typeptr = (NBC_Fn_type*)((NBC_Args_unpack*)typeptr+1); \ + break; \ + default: \ + printf("[%i] NBC_PRINT_ROUND: bad type %li at offset %li\n", myrank, (long)*typeptr, (long)typeptr-(long)schedule); \ + return NBC_BAD_SCHED; \ + } \ + /* increase ptr by size of fn_type enum */ \ + typeptr = (NBC_Fn_type*)((NBC_Fn_type*)typeptr+1); \ + } \ + printf("\n"); \ + } + +#define NBC_PRINT_SCHED(schedule) \ +{ \ + int size, myrank; \ + long round_size; \ + char *ptr; \ + \ + NBC_GET_SIZE(schedule, size); \ + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); \ + printf("[%i] printing schedule of size %i\n", myrank, size); \ + \ + /* ptr begins at first round (first int is overall size) */ \ + ptr = (char*)((char*)schedule+sizeof(int)); \ + while ((long)ptr-(long)schedule < size) { \ + NBC_GET_ROUND_SIZE(ptr, round_size); \ + printf("[%i] Round at byte %li (size %li) ", myrank, (long)ptr-(long)schedule, round_size); \ + NBC_PRINT_ROUND(ptr); \ + /* add round size */ \ + ptr=ptr+round_size; \ + /* add sizeof(char) as barrier delimiter */ \ + ptr=ptr+sizeof(char); \ + } \ +} + +#define NBC_CHECK_NULL(ptr) \ +{ \ + if(ptr == NULL) { \ + printf("realloc error :-(\n"); \ + } \ +} + + + +/* +#define NBC_DEBUG(level, ...) {} +*/ + +static inline void NBC_DEBUG(int level, const char *fmt, ...) +{ +#if NBC_DLEVEL > 0 + va_list ap; + int rank; + + if(NBC_DLEVEL >= level) { + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + printf("[LibNBC - %i] ", rank); + va_start(ap, fmt); + vprintf(fmt, ap); + va_end (ap); + } +#endif +} + +/* returns true (1) or false (0) if type is intrinsic or not */ +static inline int NBC_Type_intrinsic(MPI_Datatype type) { + + if( ( type == MPI_INT ) || + ( type == MPI_LONG ) || + ( type == MPI_SHORT ) || + ( type == MPI_UNSIGNED ) || + ( type == MPI_UNSIGNED_SHORT ) || + ( type == MPI_UNSIGNED_LONG ) || + ( type == MPI_FLOAT ) || + ( type == MPI_DOUBLE ) || + ( type == MPI_LONG_DOUBLE ) || + ( type == MPI_BYTE ) || + ( type == MPI_FLOAT_INT) || + ( type == MPI_DOUBLE_INT) || + ( type == MPI_LONG_INT) || + ( type == MPI_2INT) || + ( type == MPI_SHORT_INT) || + ( type == MPI_LONG_DOUBLE_INT)) + return 1; + else + return 0; +} + +/* let's give a try to inline functions */ +static inline int NBC_Copy(void *src, int srccount, MPI_Datatype srctype, void *tgt, int tgtcount, MPI_Datatype tgttype, MPI_Comm comm) { + int size, pos, res; + MPI_Aint ext; + void *packbuf; + + if((srctype == tgttype) && NBC_Type_intrinsic(srctype)) { + /* if we have the same types and they are contiguous (intrinsic + * types are contiguous), we can just use a single memcpy */ + res = MPI_Type_extent(srctype, &ext); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } + memcpy(tgt, src, srccount*ext); + } else { + /* we have to pack and unpack */ + res = MPI_Pack_size(srccount, srctype, comm, &size); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Pack_size() (%i)\n", res); return res; } + packbuf = malloc(size); + if (NULL == packbuf) { printf("Error in malloc()\n"); return res; } + pos=0; + res = MPI_Pack(src, srccount, srctype, packbuf, size, &pos, comm); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Pack() (%i)\n", res); return res; } + pos=0; + res = MPI_Unpack(packbuf, size, &pos, tgt, tgtcount, tgttype, comm); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Unpack() (%i)\n", res); return res; } + free(packbuf); + } + + return NBC_OK; +} + +static inline int NBC_Unpack(void *src, int srccount, MPI_Datatype srctype, void *tgt, MPI_Comm comm) { + int size, pos, res; + MPI_Aint ext; + + if(NBC_Type_intrinsic(srctype)) { + /* if we have the same types and they are contiguous (intrinsic + * types are contiguous), we can just use a single memcpy */ + res = MPI_Type_extent(srctype, &ext); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } + memcpy(tgt, src, srccount*ext); + + } else { + /* we have to unpack */ + res = MPI_Pack_size(srccount, srctype, comm, &size); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Pack_size() (%i)\n", res); return res; } + pos=0; + res = MPI_Unpack(src, size, &pos, tgt, srccount, srctype, comm); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Unpack() (%i)\n", res); return res; } + } + + return NBC_OK; +} + +/* deletes elements from dict until low watermark is reached */ +static inline void NBC_SchedCache_dictwipe(hb_tree *dict, int *size) { + hb_itor *itor; + + itor = hb_itor_new(dict); + for (; hb_itor_valid(itor) && (*size>NBC_SCHED_DICT_LOWER); hb_itor_next(itor)) { + hb_tree_remove(dict, hb_itor_key(itor), 0); + *size = *size-1; + } + hb_itor_destroy(itor); +} + +#define NBC_IN_PLACE(sendbuf, recvbuf, inplace) \ +{ \ + inplace = 0; \ + if(recvbuf == sendbuf) { \ + inplace = 1; \ + } else \ + if(sendbuf == MPI_IN_PLACE) { \ + sendbuf = recvbuf; \ + inplace = 1; \ + } else \ + if(recvbuf == MPI_IN_PLACE) { \ + recvbuf = sendbuf; \ + inplace = 1; \ + } \ +} + +#ifdef __cplusplus +} +#endif + +#endif + + diff --git a/ompi/mca/coll/libnbc/nbc_ireduce.c b/ompi/mca/coll/libnbc/nbc_ireduce.c new file mode 100644 index 0000000000..50379e6ee0 --- /dev/null +++ b/ompi/mca/coll/libnbc/nbc_ireduce.c @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2006 The Technical University of Chemnitz. All + * rights reserved. + * + * Author(s): Torsten Hoefler + * + */ +#include "nbc_internal.h" + +static inline int red_sched_binomial(int rank, int p, int root, void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, void *redbuf, NBC_Schedule *schedule, NBC_Handle *handle); +static inline int red_sched_chain(int rank, int p, int root, void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int ext, int size, NBC_Schedule *schedule, NBC_Handle *handle, int fragsize); + +#ifdef NBC_CACHE_SCHEDULE +/* tree comparison function for schedule cache */ +int NBC_Reduce_args_compare(NBC_Reduce_args *a, NBC_Reduce_args *b, void *param) { + + if( (a->sendbuf == b->sendbuf) && + (a->recvbuf == b->recvbuf) && + (a->count == b->count) && + (a->datatype == b->datatype) && + (a->op == b->op) && + (a->root == b->root) ) { + return 0; + } + if( a->sendbuf < b->sendbuf ) { + return -1; + } + return +1; +} +#endif + +#ifdef HAVE_SYS_WEAK_ALIAS_PRAGMA +#pragma weak NBC_Ireduce=PNBC_Ireduce +#define NBC_Ireduce PNBC_Ireduce +#endif + +/* the non-blocking reduce */ +int NBC_Ireduce(void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm, NBC_Handle* handle) { + int rank, p, res, segsize, size; + MPI_Aint ext; + NBC_Schedule *schedule; + char *redbuf=NULL, inplace; +#ifdef NBC_CACHE_SCHEDULE + NBC_Reduce_args *args, *found, search; +#endif + enum { NBC_RED_BINOMIAL, NBC_RED_CHAIN } alg; + + NBC_IN_PLACE(sendbuf, recvbuf, inplace); + + res = NBC_Init_handle(handle, comm); + if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } + res = MPI_Comm_rank(comm, &rank); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } + res = MPI_Comm_size(comm, &p); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_size() (%i)\n", res); return res; } + res = MPI_Type_extent(datatype, &ext); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } + res = MPI_Type_size(datatype, &size); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_size() (%i)\n", res); return res; } + + /* only one node -> copy data */ + if((p == 1) && !inplace) { + res = NBC_Copy(sendbuf, count, datatype, recvbuf, count, datatype, comm); + if (NBC_OK != res) { printf("Error in NBC_Copy() (%i)\n", res); return res; } + } + + /* algorithm selection */ + if(p > 4 || size*count < 65536) { + alg = NBC_RED_BINOMIAL; + if(rank == root) { + /* root reduces in receivebuffer */ + handle->tmpbuf = malloc(ext*count); + } else { + /* recvbuf may not be valid on non-root nodes */ + handle->tmpbuf = malloc(ext*count*2); + redbuf = ((char*)handle->tmpbuf)+(ext*count); + } + } else { + handle->tmpbuf = malloc(ext*count); + alg = NBC_RED_CHAIN; + segsize = 16384/2; + } + if (NULL == handle->tmpbuf) { printf("Error in malloc() (%i)\n", res); return res; } + +#ifdef NBC_CACHE_SCHEDULE + /* search schedule in communicator specific tree */ + search.sendbuf=sendbuf; + search.recvbuf=recvbuf; + search.count=count; + search.datatype=datatype; + search.op=op; + search.root=root; + found = (NBC_Reduce_args*)hb_tree_search((hb_tree*)handle->comminfo->NBC_Dict[NBC_REDUCE], &search); + if(found == NULL) { +#endif + schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule)); + if (NULL == schedule) { printf("Error in malloc() (%i)\n", res); return res; } + + res = NBC_Sched_create(schedule); + if(res != NBC_OK) { printf("Error in NBC_Sched_create (%i)\n", res); return res; } + + switch(alg) { + case NBC_RED_BINOMIAL: + res = red_sched_binomial(rank, p, root, sendbuf, recvbuf, count, datatype, op, redbuf, schedule, handle); + break; + case NBC_RED_CHAIN: + res = red_sched_chain(rank, p, root, sendbuf, recvbuf, count, datatype, op, ext, size, schedule, handle, segsize); + break; + } + if (NBC_OK != res) { printf("Error in Schedule creation() (%i)\n", res); return res; } + + res = NBC_Sched_commit(schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_commit() (%i)\n", res); return res; } +#ifdef NBC_CACHE_SCHEDULE + /* save schedule to tree */ + args = (NBC_Reduce_args*)malloc(sizeof(NBC_Alltoall_args)); + args->sendbuf=sendbuf; + args->recvbuf=recvbuf; + args->count=count; + args->datatype=datatype; + args->op=op; + args->root=root; + args->schedule=schedule; + res = hb_tree_insert ((hb_tree*)handle->comminfo->NBC_Dict[NBC_REDUCE], args, args, 0); + if(res != 0) printf("error in dict_insert() (%i)\n", res); + /* increase number of elements for Reduce */ + if(++handle->comminfo->NBC_Dict_size[NBC_REDUCE] > NBC_SCHED_DICT_UPPER) { + NBC_SchedCache_dictwipe((hb_tree*)handle->comminfo->NBC_Dict[NBC_REDUCE], &handle->comminfo->NBC_Dict_size[NBC_REDUCE]); + } + } else { + /* found schedule */ + schedule=found->schedule; + } +#endif + + res = NBC_Start(handle, schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Start() (%i)\n", res); return res; } + + /* tmpbuf is freed with the handle */ + return NBC_OK; +} + + +/* binomial reduce + * working principle: + * - each node gets a virtual rank vrank + * - the 'root' node get vrank 0 + * - node 0 gets the vrank of the 'root' + * - all other ranks stay identical (they do not matter) + * + * Algorithm: + * pairwise exchange + * round r: + * grp = rank % 2^r + * if grp == 0: receive from rank + 2^(r-1) if it exists and reduce value + * if grp == 1: send to rank - 2^(r-1) and exit function + * + * do this for R=log_2(p) rounds + * + */ +#define RANK2VRANK(rank, vrank, root) \ +{ \ + vrank = rank; \ + if (rank == 0) vrank = root; \ + if (rank == root) vrank = 0; \ +} +#define VRANK2RANK(rank, vrank, root) \ +{ \ + rank = vrank; \ + if (vrank == 0) rank = root; \ + if (vrank == root) rank = 0; \ +} +static inline int red_sched_binomial(int rank, int p, int root, void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, void *redbuf, NBC_Schedule *schedule, NBC_Handle *handle) { + int firstred, vrank, vpeer, peer, res, maxr, r; + + RANK2VRANK(rank, vrank, root); + maxr = (int)ceil((log(p)/LOG2)); + + firstred = 1; + for(r=1; r<=maxr; r++) { + if((vrank % (1<tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } + /* we have to wait until we have the data */ + res = NBC_Sched_barrier(schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + /* perform the reduce in my local buffer */ + if(firstred) { + if(rank == root) { + /* root is the only one who reduces in the receivebuffer + * take data from sendbuf in first round - save copy */ + res = NBC_Sched_op(recvbuf, false, sendbuf, false, 0, true, count, datatype, op, schedule); + } else { + /* all others may not have a receive buffer + * take data from sendbuf in first round - save copy */ + res = NBC_Sched_op((char *)redbuf-(unsigned long)handle->tmpbuf, true, sendbuf, false, 0, true, count, datatype, op, schedule); + } + firstred = 0; + } else { + if(rank == root) { + /* root is the only one who reduces in the receivebuffer */ + res = NBC_Sched_op(recvbuf, false, recvbuf, false, 0, true, count, datatype, op, schedule); + } else { + /* all others may not have a receive buffer */ + res = NBC_Sched_op((char *)redbuf-(unsigned long)handle->tmpbuf, true, (char *)redbuf-(unsigned long)handle->tmpbuf, true, 0, true, count, datatype, op, schedule); + } + } + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_op() (%i)\n", res); return res; } + /* this cannot be done until handle->tmpbuf is unused :-( */ + res = NBC_Sched_barrier(schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + } + } else { + /* we have to send this round */ + vpeer = vrank - (1<<(r-1)); + VRANK2RANK(peer, vpeer, root) + if(firstred) { + /* we did not reduce anything */ + res = NBC_Sched_send(sendbuf, false, count, datatype, peer, schedule); + } else { + /* we have to use the redbuf the root (which works in receivebuf) is never sending .. */ + res = NBC_Sched_send((char *)redbuf-(unsigned long)handle->tmpbuf, true, count, datatype, peer, schedule); + } + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); return res; } + /* leave the game */ + break; + } + } + + return NBC_OK; +} + +/* chain send ... */ +static inline int red_sched_chain(int rank, int p, int root, void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int ext, int size, NBC_Schedule *schedule, NBC_Handle *handle, int fragsize) { + int res, vrank, rpeer, speer, numfrag, fragnum, fragcount, thiscount; + long offset; + + RANK2VRANK(rank, vrank, root); + VRANK2RANK(rpeer, vrank+1, root); + VRANK2RANK(speer, vrank-1, root); + + if(count == 0) return NBC_OK; + + numfrag = count*size/fragsize; + if((count*size)%fragsize != 0) numfrag++; + fragcount = count/numfrag; + /*printf("numfrag: %i, count: %i, size: %i, fragcount: %i\n", numfrag, count, size, fragcount);*/ + + for(fragnum = 0; fragnum < numfrag; fragnum++) { + offset = fragnum*fragcount*ext; + thiscount = fragcount; + if(fragnum == numfrag-1) { + /* last fragment may not be full */ + thiscount = count-fragcount*fragnum; + } + + /* last node does not recv */ + if(vrank != p-1) { + res = NBC_Sched_recv((char*)offset, true, thiscount, datatype, rpeer, schedule); + if (NBC_OK != res) { printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } + res = NBC_Sched_barrier(schedule); + /* root reduces into receivebuf */ + if(vrank == 0) { + res = NBC_Sched_op((char*)recvbuf+offset, false, (char*)sendbuf+offset, false, (char*)offset, true, thiscount, datatype, op, schedule); + } else { + res = NBC_Sched_op((char*)offset, true, (char*)sendbuf+offset, false, (char*)offset, true, thiscount, datatype, op, schedule); + } + res = NBC_Sched_barrier(schedule); + } + + /* root does not send */ + if(vrank != 0) { + /* rank p-1 has to send out of sendbuffer :) */ + if(vrank == p-1) { + res = NBC_Sched_send((char*)sendbuf+offset, false, thiscount, datatype, speer, schedule); + } else { + res = NBC_Sched_send((char*)offset, true, thiscount, datatype, speer, schedule); + } + if (NBC_OK != res) { printf("Error in NBC_Sched_send() (%i)\n", res); return res; } + /* this barrier here seems awkward but isn't!!!! */ + res = NBC_Sched_barrier(schedule); + } + } + + return NBC_OK; +} + + +#ifdef __cplusplus +extern "C" { +#endif +/* Fortran bindings */ +#ifdef HAVE_SYS_WEAK_ALIAS_PRAGMA +NBC_F77_ALLFUNC_(nbc_ireduce,NBC_IREDUCE,(void *sendbuf, void *recvbuf, int *count, int *datatype, int *fop, int *root, int *fcomm, int *fhandle, int *ierr)); +#pragma weak NBC_IREDUCE = nbc_ireduce_f +#pragma weak nbc_ireduce = nbc_ireduce_f +#pragma weak nbc_ireduce_ = nbc_ireduce_f +#pragma weak nbc_ireduce__ = nbc_ireduce_f +#pragma weak PNBC_IREDUCE = nbc_ireduce_f +#pragma weak pnbc_ireduce = nbc_ireduce_f +#pragma weak pnbc_ireduce_ = nbc_ireduce_f +#pragma weak pnbc_ireduce__ = nbc_ireduce_f +void nbc_ireduce_f(void *sendbuf, void *recvbuf, int *count, int *datatype, int *fop, int *root, int *fcomm, int *fhandle, int *ierr) { +#else +void NBC_F77_FUNC_(nbc_ireduce,NBC_IREDUCE)(void *sendbuf, void *recvbuf, int *count, int *datatype, int *fop, int *root, int *fcomm, int *fhandle, int *ierr); +void NBC_F77_FUNC_(nbc_ireduce,NBC_IREDUCE)(void *sendbuf, void *recvbuf, int *count, int *datatype, int *fop, int *root, int *fcomm, int *fhandle, int *ierr) { +#endif + MPI_Datatype dtype; + MPI_Comm comm; + MPI_Op op; + NBC_Handle *handle; + + /* this is the only MPI-2 we need :-( */ + dtype = MPI_Type_f2c(*datatype); + comm = MPI_Comm_f2c(*fcomm); + op = MPI_Op_f2c(*fop); + + /* create a new handle in handle table */ + NBC_Create_fortran_handle(fhandle, &handle); + + /* call NBC function */ + *ierr = NBC_Ireduce(sendbuf, recvbuf, *count, dtype, op, *root, comm, handle); +} +#ifdef __cplusplus +} +#endif diff --git a/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c b/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c new file mode 100644 index 0000000000..4b5b453a8a --- /dev/null +++ b/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2006 The Technical University of Chemnitz. All + * rights reserved. + * + * Author(s): Torsten Hoefler + * + */ +#include "nbc_internal.h" + +/* an reduce_csttare schedule can not be cached easily because the contents + * ot the recvcounts array may change, so a comparison of the address + * would not be sufficient ... we simply do not cache it */ + +/* binomial reduce to rank 0 followed by a linear scatter ... + * + * Algorithm: + * pairwise exchange + * round r: + * grp = rank % 2^r + * if grp == 0: receive from rank + 2^(r-1) if it exists and reduce value + * if grp == 1: send to rank - 2^(r-1) and exit function + * + * do this for R=log_2(p) rounds + * + */ + +#ifdef HAVE_SYS_WEAK_ALIAS_PRAGMA +#pragma weak NBC_Ireduce_scatter=PNBC_Ireduce_scatter +#define NBC_Ireduce_scatter PNBC_Ireduce_scatter +#endif +int NBC_Ireduce_scatter(void* sendbuf, void* recvbuf, int *recvcounts, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, NBC_Handle* handle) { + int peer, rank, maxr, p, r, res, count, offset, firstred; + MPI_Aint ext; + char *redbuf, *sbuf, inplace; + NBC_Schedule *schedule; + + NBC_IN_PLACE(sendbuf, recvbuf, inplace); + + res = NBC_Init_handle(handle, comm); + if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } + res = MPI_Comm_rank(comm, &rank); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } + res = MPI_Comm_size(comm, &p); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_size() (%i)\n", res); return res; } + MPI_Type_extent(datatype, &ext); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } + + schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule)); + if (NULL == schedule) { printf("Error in malloc()\n"); return NBC_OOR; } + + res = NBC_Sched_create(schedule); + if(res != NBC_OK) { printf("Error in NBC_Sched_create (%i)\n", res); return res; } + + maxr = (int)ceil((log(p)/LOG2)); + + count = 0; + for(r=0;rtmpbuf = malloc(ext*count*2); + if(handle->tmpbuf == NULL) { printf("Error in malloc()\n"); return NBC_OOR; } + + redbuf = ((char*)handle->tmpbuf)+(ext*count); + + /* copy data to redbuf if we only have a single node */ + if((p==1) && !inplace) { + res = NBC_Copy(sendbuf, count, datatype, redbuf, count, datatype, comm); + if (NBC_OK != res) { printf("Error in NBC_Copy() (%i)\n", res); return res; } + } + + firstred = 1; + for(r=1; r<=maxr; r++) { + if((rank % (1<tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } + /* we have to wait until we have the data */ + res = NBC_Sched_barrier(schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + if(firstred) { + /* take reduce data from the sendbuf in the first round -> save copy */ + res = NBC_Sched_op(redbuf-(unsigned long)handle->tmpbuf, true, sendbuf, false, 0, true, count, datatype, op, schedule); + firstred = 0; + } else { + /* perform the reduce in my local buffer */ + res = NBC_Sched_op(redbuf-(unsigned long)handle->tmpbuf, true, redbuf-(unsigned long)handle->tmpbuf, true, 0, true, count, datatype, op, schedule); + } + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_op() (%i)\n", res); return res; } + /* this cannot be done until handle->tmpbuf is unused :-( */ + res = NBC_Sched_barrier(schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + } + } else { + /* we have to send this round */ + peer = rank - (1<<(r-1)); + if(firstred) { + /* we have to send the senbuf */ + res = NBC_Sched_send(sendbuf, false, count, datatype, peer, schedule); + } else { + /* we send an already reduced value from redbuf */ + res = NBC_Sched_send(redbuf-(unsigned long)handle->tmpbuf, true, count, datatype, peer, schedule); + } + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); return res; } + /* leave the game */ + break; + } + } + + res = NBC_Sched_barrier(schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + + /* rank 0 is root and sends - all others receive */ + if(rank != 0) { + res = NBC_Sched_recv(recvbuf, false, recvcounts[rank], datatype, 0, schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } + } + + if(rank == 0) { + offset = 0; + for(r=1;rtmpbuf, true, recvcounts[r], datatype, r, schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); return res; } + } + res = NBC_Sched_copy(redbuf-(unsigned long)handle->tmpbuf, true, recvcounts[0], datatype, recvbuf, false, recvcounts[0], datatype, schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_copy() (%i)\n", res); return res; } + } + + /*NBC_PRINT_SCHED(*schedule);*/ + + res = NBC_Sched_commit(schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_commit() (%i)\n", res); return res; } + + res = NBC_Start(handle, schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Start() (%i)\n", res); return res; } + + /* tmpbuf is freed with the handle */ + return NBC_OK; +} + + +#ifdef __cplusplus +extern "C" { +#endif +/* Fortran bindings */ +#ifdef HAVE_SYS_WEAK_ALIAS_PRAGMA +NBC_F77_ALLFUNC_(nbc_ireduce_scatter,NBC_IREDUCE_SCATTER,(void *sendbuf, void *recvbuf, int *recvcounts, int *datatype, int *fop, int *fcomm, int *fhandle, int *ierr)); +#pragma weak NBC_IREDUCE_SCATTER = nbc_ireduce_scatter_f +#pragma weak nbc_ireduce_scatter = nbc_ireduce_scatter_f +#pragma weak nbc_ireduce_scatter_ = nbc_ireduce_scatter_f +#pragma weak nbc_ireduce_scatter__ = nbc_ireduce_scatter_f +#pragma weak PNBC_IREDUCE_SCATTER = nbc_ireduce_scatter_f +#pragma weak pnbc_ireduce_scatter = nbc_ireduce_scatter_f +#pragma weak pnbc_ireduce_scatter_ = nbc_ireduce_scatter_f +#pragma weak pnbc_ireduce_scatter__ = nbc_ireduce_scatter_f +void nbc_ireduce_scatter_f(void *sendbuf, void *recvbuf, int *recvcounts, int *datatype, int *fop, int *fcomm, int *fhandle, int *ierr) { +#else +void NBC_F77_FUNC_(nbc_ireduce_scatter,NBC_IREDUCE_SCATTER)(void *sendbuf, void *recvbuf, int *recvcounts, int *datatype, int *fop, int *fcomm, int *fhandle, int *ierr); +void NBC_F77_FUNC_(nbc_ireduce_scatter,NBC_IREDUCE_SCATTER)(void *sendbuf, void *recvbuf, int *recvcounts, int *datatype, int *fop, int *fcomm, int *fhandle, int *ierr) { +#endif + MPI_Datatype dtype; + MPI_Comm comm; + MPI_Op op; + NBC_Handle *handle; + + /* this is the only MPI-2 we need :-( */ + dtype = MPI_Type_f2c(*datatype); + comm = MPI_Comm_f2c(*fcomm); + op = MPI_Op_f2c(*fop); + + /* create a new handle in handle table */ + NBC_Create_fortran_handle(fhandle, &handle); + + /* call NBC function */ + *ierr = NBC_Ireduce_scatter(sendbuf, recvbuf, recvcounts, dtype, op, comm, handle); +} +#ifdef __cplusplus +} +#endif diff --git a/ompi/mca/coll/libnbc/nbc_iscan.c b/ompi/mca/coll/libnbc/nbc_iscan.c new file mode 100644 index 0000000000..aea6bbde2f --- /dev/null +++ b/ompi/mca/coll/libnbc/nbc_iscan.c @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2006 The Technical University of Chemnitz. All + * rights reserved. + * + * Author(s): Torsten Hoefler + * + */ +#include "nbc_internal.h" + +#ifdef NBC_CACHE_SCHEDULE +/* tree comparison function for schedule cache */ +int NBC_Scan_args_compare(NBC_Scan_args *a, NBC_Scan_args *b, void *param) { + + if( (a->sendbuf == b->sendbuf) && + (a->recvbuf == b->recvbuf) && + (a->count == b->count) && + (a->datatype == b->datatype) && + (a->op == b->op) ) { + return 0; + } + if( a->sendbuf < b->sendbuf ) { + return -1; + } + return +1; +} +#endif + +/* linear iscan + * working principle: + * 1. each node (but node 0) receives from left neigbor + * 2. performs op + * 3. all but rank p-1 do sends to it's right neigbor and exits + * + */ +#ifdef HAVE_SYS_WEAK_ALIAS_PRAGMA +#pragma weak NBC_Iscan=PNBC_Iscan +#define NBC_Iscan PNBC_Iscan +#endif +int NBC_Iscan(void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, NBC_Handle* handle) { + int rank, p, res; + MPI_Aint ext; + NBC_Schedule *schedule; +#ifdef NBC_CACHE_SCHEDULE + NBC_Scan_args *args, *found, search; +#endif + char inplace; + + NBC_IN_PLACE(sendbuf, recvbuf, inplace); + + res = NBC_Init_handle(handle, comm); + if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } + res = MPI_Comm_rank(comm, &rank); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } + res = MPI_Comm_size(comm, &p); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_size() (%i)\n", res); return res; } + res = MPI_Type_extent(datatype, &ext); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } + + handle->tmpbuf = malloc(ext*count); + if(handle->tmpbuf == NULL) { printf("Error in malloc()\n"); return NBC_OOR; } + + if((rank == 0) && !inplace) { + /* copy data to receivebuf */ + res = NBC_Copy(sendbuf, count, datatype, recvbuf, count, datatype, comm); + if (NBC_OK != res) { printf("Error in NBC_Copy() (%i)\n", res); return res; } + } + +#ifdef NBC_CACHE_SCHEDULE + /* search schedule in communicator specific tree */ + search.sendbuf=sendbuf; + search.recvbuf=recvbuf; + search.count=count; + search.datatype=datatype; + search.op=op; + found = (NBC_Scan_args*)hb_tree_search((hb_tree*)handle->comminfo->NBC_Dict[NBC_SCAN], &search); + if(found == NULL) { +#endif + schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule)); + if (NULL == schedule) { printf("Error in malloc()\n"); return res; } + + res = NBC_Sched_create(schedule); + if(res != NBC_OK) { printf("Error in NBC_Sched_create (%i)\n", res); return res; } + + if(rank != 0) { + res = NBC_Sched_recv(0, true, count, datatype, rank-1, schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } + /* we have to wait until we have the data */ + res = NBC_Sched_barrier(schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + /* perform the reduce in my local buffer */ + res = NBC_Sched_op(recvbuf, false, sendbuf, false, 0, true, count, datatype, op, schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_op() (%i)\n", res); return res; } + /* this cannot be done until handle->tmpbuf is unused :-( */ + res = NBC_Sched_barrier(schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + } + if(rank != p-1) { + res = NBC_Sched_send(recvbuf, false, count, datatype, rank+1, schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); return res; } + } + + res = NBC_Sched_commit(schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_commit() (%i)\n", res); return res; } + +#ifdef NBC_CACHE_SCHEDULE + /* save schedule to tree */ + args = (NBC_Scan_args*)malloc(sizeof(NBC_Alltoall_args)); + args->sendbuf=sendbuf; + args->recvbuf=recvbuf; + args->count=count; + args->datatype=datatype; + args->op=op; + args->schedule=schedule; + res = hb_tree_insert ((hb_tree*)handle->comminfo->NBC_Dict[NBC_SCAN], args, args, 0); + if(res != 0) printf("error in dict_insert() (%i)\n", res); + /* increase number of elements for A2A */ + if(++handle->comminfo->NBC_Dict_size[NBC_SCAN] > NBC_SCHED_DICT_UPPER) { + NBC_SchedCache_dictwipe((hb_tree*)handle->comminfo->NBC_Dict[NBC_SCAN], &handle->comminfo->NBC_Dict_size[NBC_SCAN]); + } + } else { + /* found schedule */ + schedule=found->schedule; + } +#endif + + res = NBC_Start(handle, schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Start() (%i)\n", res); return res; } + + /* tmpbuf is freed with the handle */ + return NBC_OK; +} + + +#ifdef __cplusplus +extern "C" { +#endif +/* Fortran bindings */ +#ifdef HAVE_SYS_WEAK_ALIAS_PRAGMA +NBC_F77_ALLFUNC_(nbc_iscan,NBC_ISCAN,(void *sendbuf, void *recvbuf, int *count, int *datatype, int *fop, int *fcomm, int *fhandle, int *ierr)); +#pragma weak NBC_ISCAN = nbc_iscan_f +#pragma weak nbc_iscan = nbc_iscan_f +#pragma weak nbc_iscan_ = nbc_iscan_f +#pragma weak nbc_iscan__ = nbc_iscan_f +#pragma weak PNBC_ISCAN = nbc_iscan_f +#pragma weak pnbc_iscan = nbc_iscan_f +#pragma weak pnbc_iscan_ = nbc_iscan_f +#pragma weak pnbc_iscan__ = nbc_iscan_f +void nbc_iscan_f(void *sendbuf, void *recvbuf, int *count, int *datatype, int *fop, int *fcomm, int *fhandle, int *ierr) { +#else +void NBC_F77_FUNC_(nbc_iscan,NBC_ISCAN)(void *sendbuf, void *recvbuf, int *count, int *datatype, int *fop, int *fcomm, int *fhandle, int *ierr); +void NBC_F77_FUNC_(nbc_iscan,NBC_ISCAN)(void *sendbuf, void *recvbuf, int *count, int *datatype, int *fop, int *fcomm, int *fhandle, int *ierr) { +#endif + MPI_Datatype dtype; + MPI_Comm comm; + MPI_Op op; + NBC_Handle *handle; + + /* this is the only MPI-2 we need :-( */ + dtype = MPI_Type_f2c(*datatype); + comm = MPI_Comm_f2c(*fcomm); + op = MPI_Op_f2c(*fop); + + /* create a new handle in handle table */ + NBC_Create_fortran_handle(fhandle, &handle); + + /* call NBC function */ + *ierr = NBC_Iscan(sendbuf, recvbuf, *count, dtype, op, comm, handle); +} +#ifdef __cplusplus +} +#endif diff --git a/ompi/mca/coll/libnbc/nbc_iscatter.c b/ompi/mca/coll/libnbc/nbc_iscatter.c new file mode 100644 index 0000000000..77d7b85a28 --- /dev/null +++ b/ompi/mca/coll/libnbc/nbc_iscatter.c @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2006 The Technical University of Chemnitz. All + * rights reserved. + * + * Author(s): Torsten Hoefler + * + */ +#include "nbc_internal.h" + +#ifdef NBC_CACHE_SCHEDULE +/* tree comparison function for schedule cache */ +int NBC_Scatter_args_compare(NBC_Scatter_args *a, NBC_Scatter_args *b, void *param) { + + if( (a->sendbuf == b->sendbuf) && + (a->sendcount == b->sendcount) && + (a->sendtype == b->sendtype) && + (a->recvbuf == b->recvbuf) && + (a->recvcount == b->recvcount) && + (a->recvtype == b->recvtype) && + (a->root == b->root) ) { + return 0; + } + if( a->sendbuf < b->sendbuf ) { + return -1; + } + return +1; +} +#endif + +/* simple linear MPI_Iscatter */ +#ifdef HAVE_SYS_WEAK_ALIAS_PRAGMA +#pragma weak NBC_Iscatter=PNBC_Iscatter +#define NBC_Iscatter PNBC_Iscatter +#endif +int NBC_Iscatter(void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm, NBC_Handle* handle) { + int rank, p, res, i; + MPI_Aint sndext; + NBC_Schedule *schedule; + char *sbuf, inplace; +#ifdef NBC_CACHE_SCHEDULE + NBC_Scatter_args *args, *found, search; +#endif + + NBC_IN_PLACE(sendbuf, recvbuf, inplace); + + res = NBC_Init_handle(handle, comm); + if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } + res = MPI_Comm_rank(comm, &rank); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } + res = MPI_Comm_size(comm, &p); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_size() (%i)\n", res); return res; } + res = MPI_Type_extent(sendtype, &sndext); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } + + handle->tmpbuf=NULL; + + if((rank == root) && (!inplace)) { + sbuf = ((char *)sendbuf) + (rank*sendcount*sndext); + /* if I am the root - just copy the message (not for MPI_IN_PLACE) */ + res = NBC_Copy(sbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm); + if (NBC_OK != res) { printf("Error in NBC_Copy() (%i)\n", res); return res; } + } + +#ifdef NBC_CACHE_SCHEDULE + /* search schedule in communicator specific tree */ + search.sendbuf=sendbuf; + search.sendcount=sendcount; + search.sendtype=sendtype; + search.recvbuf=recvbuf; + search.recvcount=recvcount; + search.recvtype=recvtype; + search.root=root; + found = (NBC_Scatter_args*)hb_tree_search((hb_tree*)handle->comminfo->NBC_Dict[NBC_SCATTER], &search); + if(found == NULL) { +#endif + schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule)); + if (NULL == schedule) { printf("Error in malloc()\n"); return res; } + + res = NBC_Sched_create(schedule); + if(res != NBC_OK) { printf("Error in NBC_Sched_create (%i)\n", res); return res; } + + /* receive from root */ + if(rank != root) { + /* recv msg from root */ + res = NBC_Sched_recv(recvbuf, false, recvcount, recvtype, root, schedule); + if (NBC_OK != res) { printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } + } else { + for(i=0;isendbuf=sendbuf; + args->sendcount=sendcount; + args->sendtype=sendtype; + args->recvbuf=recvbuf; + args->recvcount=recvcount; + args->recvtype=recvtype; + args->root=root; + args->schedule=schedule; + res = hb_tree_insert ((hb_tree*)handle->comminfo->NBC_Dict[NBC_SCATTER], args, args, 0); + if(res != 0) printf("error in dict_insert() (%i)\n", res); + /* increase number of elements for A2A */ + if(++handle->comminfo->NBC_Dict_size[NBC_SCATTER] > NBC_SCHED_DICT_UPPER) { + NBC_SchedCache_dictwipe((hb_tree*)handle->comminfo->NBC_Dict[NBC_SCATTER], &handle->comminfo->NBC_Dict_size[NBC_SCATTER]); + } + } else { + /* found schedule */ + schedule=found->schedule; + } +#endif + + + res = NBC_Start(handle, schedule); + if (NBC_OK != res) { printf("Error in NBC_Start() (%i)\n", res); return res; } + + return NBC_OK; +} + + +#ifdef __cplusplus +extern "C" { +#endif +/* Fortran bindings */ +#ifdef HAVE_SYS_WEAK_ALIAS_PRAGMA +NBC_F77_ALLFUNC_(nbc_iscatter,NBC_ISCATTER,(void *sendbuf, int *sendcount, int *sendtype, void *recvbuf, int *recvcount, int *recvtype, int *root, int *fcomm, int *fhandle, int *ierr)); +#pragma weak NBC_ISCATTER = nbc_iscatter_f +#pragma weak nbc_iscatter = nbc_iscatter_f +#pragma weak nbc_iscatter_ = nbc_iscatter_f +#pragma weak nbc_iscatter__ = nbc_iscatter_f +#pragma weak PNBC_ISCATTER = nbc_iscatter_f +#pragma weak pnbc_iscatter = nbc_iscatter_f +#pragma weak pnbc_iscatter_ = nbc_iscatter_f +#pragma weak pnbc_iscatter__ = nbc_iscatter_f +void nbc_iscatter_f(void *sendbuf, int *sendcount, int *sendtype, void *recvbuf, int *recvcount, int *recvtype, int *root, int *fcomm, int *fhandle, int *ierr) { +#else +void NBC_F77_FUNC_(nbc_iscatter,NBC_ISCATTER)(void *sendbuf, int *sendcount, int *sendtype, void *recvbuf, int *recvcount, int *recvtype, int *root, int *fcomm, int *fhandle, int *ierr); +void NBC_F77_FUNC_(nbc_iscatter,NBC_ISCATTER)(void *sendbuf, int *sendcount, int *sendtype, void *recvbuf, int *recvcount, int *recvtype, int *root, int *fcomm, int *fhandle, int *ierr) { +#endif + MPI_Datatype stype, rtype; + MPI_Comm comm; + NBC_Handle *handle; + + /* this is the only MPI-2 we need :-( */ + rtype = MPI_Type_f2c(*recvtype); + stype = MPI_Type_f2c(*sendtype); + comm = MPI_Comm_f2c(*fcomm); + + /* create a new handle in handle table */ + NBC_Create_fortran_handle(fhandle, &handle); + + /* call NBC function */ + *ierr = NBC_Iscatter(sendbuf, *sendcount, stype, recvbuf, *recvcount, rtype, *root, comm, handle); +} +#ifdef __cplusplus +} +#endif diff --git a/ompi/mca/coll/libnbc/nbc_iscatterv.c b/ompi/mca/coll/libnbc/nbc_iscatterv.c new file mode 100644 index 0000000000..b7706281b9 --- /dev/null +++ b/ompi/mca/coll/libnbc/nbc_iscatterv.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2006 The Technical University of Chemnitz. All + * rights reserved. + * + * Author(s): Torsten Hoefler + * + */ +#include "nbc_internal.h" + +/* a scatterv schedule can not be cached easily because the contents + * ot the recvcounts array may change, so a comparison of the address + * would not be sufficient ... we simply do not cache it */ + +/* simple linear MPI_Iscatterv */ +#ifdef HAVE_SYS_WEAK_ALIAS_PRAGMA +#pragma weak NBC_Iscatterv=PNBC_Iscatterv +#define NBC_Iscatterv PNBC_Iscatterv +#endif +int NBC_Iscatterv(void* sendbuf, int *sendcounts, int *displs, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm, NBC_Handle* handle) { + int rank, p, res, i; + MPI_Aint sndext; + NBC_Schedule *schedule; + char *sbuf, inplace; + + NBC_IN_PLACE(sendbuf, recvbuf, inplace); + + res = NBC_Init_handle(handle, comm); + if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } + res = MPI_Comm_rank(comm, &rank); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } + res = MPI_Comm_size(comm, &p); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_size() (%i)\n", res); return res; } + res = MPI_Type_extent(sendtype, &sndext); + if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } + + schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule)); + if (NULL == schedule) { printf("Error in malloc()\n"); return res; } + + handle->tmpbuf=NULL; + + res = NBC_Sched_create(schedule); + if(res != NBC_OK) { printf("Error in NBC_Sched_create (%i)\n", res); return res; } + + /* receive from root */ + if(rank != root) { + /* recv msg from root */ + res = NBC_Sched_recv(recvbuf, false, recvcount, recvtype, root, schedule); + if (NBC_OK != res) { printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } + } else { + for(i=0;i + * + */ +/* this gets it's own file since it is used twice ... */ + +#include "ompi/include/mpi.h" +/* the autohell defines some macros ... that OMPI redefines ... but we + * need to undefine them before that *ARGH* */ +#undef PACKAGE_BUGREPORT +#undef PACKAGE_NAME +#undef PACKAGE_STRING +#undef PACKAGE_TARNAME +#undef PACKAGE_VERSION +#undef PACKAGE_BUGREPORT + +#include "ompi/include/ompi/constants.h" + +/* undefine the stuff set by ompi */ +#undef PACKAGE_BUGREPORT +#undef PACKAGE_NAME +#undef PACKAGE_STRING +#undef PACKAGE_TARNAME +#undef PACKAGE_VERSION +#undef PACKAGE_BUGREPORT + diff --git a/ompi/mca/coll/libnbc/nbc_op.c b/ompi/mca/coll/libnbc/nbc_op.c new file mode 100644 index 0000000000..5f03d617f6 --- /dev/null +++ b/ompi/mca/coll/libnbc/nbc_op.c @@ -0,0 +1,570 @@ +/* + * Copyright (c) 2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2006 The Technical University of Chemnitz. All + * rights reserved. + * + * Author(s): Torsten Hoefler + * + */ +#include "nbc.h" + +/****************** THIS FILE is automatically generated ********************* + * changes will be deleted at the next generation of this file - see nbc_op.c.m4 */ + +int NBC_Operation(void *buf3, void *buf1, void *buf2, MPI_Op op, MPI_Datatype type, int count) { + int i; + + if(type == MPI_INT) { + if(op == MPI_MIN) { + for(i=0; i *(((int*)buf2) + i)) *(((int*)buf3) + i) = *(((int*)buf2) + i); else *(((int*)buf3) + i) = *(((int*)buf1) + i); + } + } else if(op == MPI_MAX) { + for(i=0; i *(((long*)buf2) + i)) *(((long*)buf3) + i) = *(((long*)buf2) + i); else *(((long*)buf3) + i) = *(((long*)buf1) + i); + } + } else if(op == MPI_MAX) { + for(i=0; i *(((short*)buf2) + i)) *(((short*)buf3) + i) = *(((short*)buf2) + i); else *(((short*)buf3) + i) = *(((short*)buf1) + i); + } + } else if(op == MPI_MAX) { + for(i=0; i *(((unsigned int*)buf2) + i)) *(((unsigned int*)buf3) + i) = *(((unsigned int*)buf2) + i); else *(((unsigned int*)buf3) + i) = *(((unsigned int*)buf1) + i); + } + } else if(op == MPI_MAX) { + for(i=0; i *(((unsigned long*)buf2) + i)) *(((unsigned long*)buf3) + i) = *(((unsigned long*)buf2) + i); else *(((unsigned long*)buf3) + i) = *(((unsigned long*)buf1) + i); + } + } else if(op == MPI_MAX) { + for(i=0; i *(((unsigned short*)buf2) + i)) *(((unsigned short*)buf3) + i) = *(((unsigned short*)buf2) + i); else *(((unsigned short*)buf3) + i) = *(((unsigned short*)buf1) + i); + } + } else if(op == MPI_MAX) { + for(i=0; i *(((float*)buf2) + i)) *(((float*)buf3) + i) = *(((float*)buf2) + i); else *(((float*)buf3) + i) = *(((float*)buf1) + i); + } + } else if(op == MPI_MAX) { + for(i=0; i *(((double*)buf2) + i)) *(((double*)buf3) + i) = *(((double*)buf2) + i); else *(((double*)buf3) + i) = *(((double*)buf1) + i); + } + } else if(op == MPI_MAX) { + for(i=0; i *(((long double*)buf2) + i)) *(((long double*)buf3) + i) = *(((long double*)buf2) + i); else *(((long double*)buf3) + i) = *(((long double*)buf1) + i); + } + } else if(op == MPI_MAX) { + for(i=0; ival < ptr2->val) { + ptr3->val = ptr2->val; ptr3->rank = ptr2->rank; + } else { + ptr3->val = ptr1->val; ptr3->rank = ptr1->rank; + } + } + } else if(op == MPI_MINLOC) { + for(i=0; ival > ptr2->val) { + ptr3->val = ptr2->val; ptr3->rank = ptr2->rank; + } else { + ptr3->val = ptr1->val; ptr3->rank = ptr1->rank; + } + } + } else return NBC_OP_NOT_SUPPORTED; + } else if(type == MPI_DOUBLE_INT) { + if(op == MPI_MAXLOC) { + for(i=0; ival < ptr2->val) { + ptr3->val = ptr2->val; ptr3->rank = ptr2->rank; + } else { + ptr3->val = ptr1->val; ptr3->rank = ptr1->rank; + } + } + } else if(op == MPI_MINLOC) { + for(i=0; ival > ptr2->val) { + ptr3->val = ptr2->val; ptr3->rank = ptr2->rank; + } else { + ptr3->val = ptr1->val; ptr3->rank = ptr1->rank; + } + } + } else return NBC_OP_NOT_SUPPORTED; + } else if(type == MPI_LONG_INT) { + if(op == MPI_MAXLOC) { + for(i=0; ival < ptr2->val) { + ptr3->val = ptr2->val; ptr3->rank = ptr2->rank; + } else { + ptr3->val = ptr1->val; ptr3->rank = ptr1->rank; + } + } + } else if(op == MPI_MINLOC) { + for(i=0; ival > ptr2->val) { + ptr3->val = ptr2->val; ptr3->rank = ptr2->rank; + } else { + ptr3->val = ptr1->val; ptr3->rank = ptr1->rank; + } + } + } else return NBC_OP_NOT_SUPPORTED; + } else if(type == MPI_2INT) { + if(op == MPI_MAXLOC) { + for(i=0; ival < ptr2->val) { + ptr3->val = ptr2->val; ptr3->rank = ptr2->rank; + } else { + ptr3->val = ptr1->val; ptr3->rank = ptr1->rank; + } + } + } else if(op == MPI_MINLOC) { + for(i=0; ival > ptr2->val) { + ptr3->val = ptr2->val; ptr3->rank = ptr2->rank; + } else { + ptr3->val = ptr1->val; ptr3->rank = ptr1->rank; + } + } + } else return NBC_OP_NOT_SUPPORTED; + } else if(type == MPI_SHORT_INT) { + if(op == MPI_MAXLOC) { + for(i=0; ival < ptr2->val) { + ptr3->val = ptr2->val; ptr3->rank = ptr2->rank; + } else { + ptr3->val = ptr1->val; ptr3->rank = ptr1->rank; + } + } + } else if(op == MPI_MINLOC) { + for(i=0; ival > ptr2->val) { + ptr3->val = ptr2->val; ptr3->rank = ptr2->rank; + } else { + ptr3->val = ptr1->val; ptr3->rank = ptr1->rank; + } + } + } else return NBC_OP_NOT_SUPPORTED; + } else if(type == MPI_LONG_DOUBLE_INT) { + if(op == MPI_MAXLOC) { + for(i=0; ival < ptr2->val) { + ptr3->val = ptr2->val; ptr3->rank = ptr2->rank; + } else { + ptr3->val = ptr1->val; ptr3->rank = ptr1->rank; + } + } + } else if(op == MPI_MINLOC) { + for(i=0; ival > ptr2->val) { + ptr3->val = ptr2->val; ptr3->rank = ptr2->rank; + } else { + ptr3->val = ptr1->val; ptr3->rank = ptr1->rank; + } + } + } else return NBC_OP_NOT_SUPPORTED; + } else return NBC_DATATYPE_NOT_SUPPORTED; + + return NBC_OK; +}