openmpi/ompi/mca/op/example/op_example_module_max.c

/*
 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2007 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2008-2009 Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/** @file
 *
 * This is the max module source code.  It contains the "setup"
 * functions that will create a module for the MPI_MAX MPI_Op.
 */
#include "ompi_config.h"
#include "opal/class/opal_object.h"
#include "opal/util/output.h"
#include "ompi/constants.h"
#include "ompi/op/op.h"
#include "ompi/mca/op/op.h"
#include "ompi/mca/op/base/base.h"
#include "ompi/mca/op/example/op_example.h"

/**
 * Derive a struct from the base op module struct, allowing us to
 * cache some module-specific information for MAX.  Note that
 * information that should be shared across all modules should be put
 * on the example component.
 */
typedef struct {
    ompi_op_base_module_1_0_0_t super;

    /* Just like the ompi_op_example_component_t, this struct is meant
       to cache information on a per-module basis.  What follows are
       examples; replace them with whatever is relevant for your
       component/module.  Keep in mind that there will be one distinct
       module for each MPI_Op; you may want to have different data
       cached on the module, depending on the MPI_Op that it is
       supporting.

       In this example, we'll keep the fallback function pointers for
       several floating point types. */
    ompi_op_base_handler_fn_t fallback_float;
    ompi_op_base_module_t *fallback_float_module;
    ompi_op_base_handler_fn_t fallback_real;
    ompi_op_base_module_t *fallback_real_module;
    ompi_op_base_handler_fn_t fallback_double;
    ompi_op_base_module_t *fallback_double_module;
    ompi_op_base_handler_fn_t fallback_double_precision;
    ompi_op_base_module_t *fallback_double_precision_module;
} module_max_t;
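
/*
 * Note (illustrative addition, not in the original file): because
 * module_max_t embeds the base module struct as its first member, the
 * framework can pass modules of this type around as an
 * ompi_op_base_module_t* and the handler functions below can safely
 * downcast, e.g.:
 *
 *   module_max_t *m = (module_max_t*) module;
 */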

/**
 * "Constructor" for the max module class
 */
static void module_max_constructor(module_max_t *m)
{
    /* Use this function to initialize any data in the class that is
       specific to this class (i.e. do *not* initialize the parent
       data members!). */
    m->fallback_float = NULL;
    m->fallback_float_module = NULL;
    m->fallback_real = NULL;
    m->fallback_real_module = NULL;
    m->fallback_double = NULL;
    m->fallback_double_module = NULL;
    m->fallback_double_precision = NULL;
    m->fallback_double_precision_module = NULL;
}

/**
 * "Destructor" for the max module class
 */
static void module_max_destructor(module_max_t *m)
{
    /* Use this function to clean up any data members that may be
       necessary.  This may include freeing resources and/or setting
       members to sentinel values to know that the object has been
       destructed. */
    m->fallback_float = (ompi_op_base_handler_fn_t) 0xdeadbeef;
    m->fallback_float_module = (ompi_op_base_module_t*) 0xdeadbeef;
    m->fallback_real = (ompi_op_base_handler_fn_t) 0xdeadbeef;
    m->fallback_real_module = (ompi_op_base_module_t*) 0xdeadbeef;
    m->fallback_double = (ompi_op_base_handler_fn_t) 0xdeadbeef;
    m->fallback_double_module = (ompi_op_base_module_t*) 0xdeadbeef;
    m->fallback_double_precision = (ompi_op_base_handler_fn_t) 0xdeadbeef;
    m->fallback_double_precision_module = (ompi_op_base_module_t*) 0xdeadbeef;
}

/**
 * Setup the class for the max module, listing:
 * - the name of the class
 * - the "parent" of the class
 * - function pointer for the constructor (or NULL)
 * - function pointer for the destructor (or NULL)
 */
static OBJ_CLASS_INSTANCE(module_max_t,
                          ompi_op_base_module_t,
                          module_max_constructor,
                          module_max_destructor);
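
/*
 * For illustration (added note): instances of this class are managed
 * by the OPAL object system rather than raw malloc()/free().
 * OBJ_NEW() allocates an instance and runs the constructor chain
 * (parent class first, then module_max_constructor); OBJ_RELEASE()
 * runs the destructor chain and frees the instance once its reference
 * count drops to zero:
 *
 *   module_max_t *m = OBJ_NEW(module_max_t);
 *   ...
 *   OBJ_RELEASE(m);
 */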

/**
 * Max function for C float
 */
static void max_float(void *in, void *out, int *count,
                      ompi_datatype_t **type, ompi_op_base_module_t *module)
{
    module_max_t *m = (module_max_t*) module;

    /* Be chatty to the output, just so that we can see that this
       function was called */
    opal_output(0, "In example max float function");

    /* This is where you can decide at run-time whether to use the
       hardware or the fallback function.  For example, you could have
       logic something like this:

       extent = *count * sizeof(float);
       if (memory_accessible_on_hw(in, extent) &&
           memory_accessible_on_hw(out, extent)) {
           ...do the function on hardware...
       } else if (extent >= large_enough) {
           ...copy host memory -> hardware memory...
           ...do the function on hardware...
           ...copy hardware memory -> host memory...
       } else {
           m->fallback_float(in, out, count, type, m->fallback_float_module);
       }
    */

    /* But for this example, we'll just call the fallback function to
       actually do the work */
    m->fallback_float(in, out, count, type, m->fallback_float_module);
}
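
/*
 * A concrete (but hypothetical) sketch of the run-time dispatch
 * described in the comment above.  The helpers example_hw_accessible()
 * and example_hw_max_float() are assumptions standing in for whatever
 * accessibility test and kernel launch your hardware's API provides;
 * the block is compiled out so this example still builds as-is.
 */
#if 0
static void max_float_hw(void *in, void *out, int *count,
                         ompi_datatype_t **type,
                         ompi_op_base_module_t *module)
{
    module_max_t *m = (module_max_t*) module;
    size_t extent = (size_t) *count * sizeof(float);

    if (example_hw_accessible(in, extent) &&
        example_hw_accessible(out, extent)) {
        /* Both buffers are already visible to the device: compute
           out[i] = max(in[i], out[i]) directly on the hardware
           (hypothetical kernel launch). */
        example_hw_max_float((const float*) in, (float*) out, *count);
    } else {
        /* Otherwise, defer to the software fallback that was cached
           at setup time. */
        m->fallback_float(in, out, count, type, m->fallback_float_module);
    }
}
#endif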

/**
 * Max function for C double
 */
static void max_double(void *in, void *out, int *count,
                       ompi_datatype_t **type, ompi_op_base_module_t *module)
{
    module_max_t *m = (module_max_t*) module;
    opal_output(0, "In example max double function");

    /* Just another example function -- similar to max_float() */
    m->fallback_double(in, out, count, type, m->fallback_double_module);
}

/**
 * Max function for Fortran REAL
 */
static void max_real(void *in, void *out, int *count,
                     ompi_datatype_t **type, ompi_op_base_module_t *module)
{
    module_max_t *m = (module_max_t*) module;
    opal_output(0, "In example max real function");

    /* Just another example function -- similar to max_float() */
    m->fallback_real(in, out, count, type, m->fallback_real_module);
}

/**
 * Max function for Fortran DOUBLE PRECISION
 */
static void max_double_precision(void *in, void *out, int *count,
                                 ompi_datatype_t **type,
                                 ompi_op_base_module_t *module)
{
    module_max_t *m = (module_max_t*) module;
    opal_output(0, "In example max double precision function");

    /* Just another example function -- similar to max_float() */
    m->fallback_double_precision(in, out, count, type,
                                 m->fallback_double_precision_module);
}

/**
 * Setup function for MPI_MAX.  If we get here, we can assume that a)
 * the hardware is present, b) the MPI thread scenario is what we
 * want, and c) the MAX operation is supported.  So this function's
 * job is to create a module and fill in function pointers for the
 * functions that this hardware supports.
 */
ompi_op_base_module_t *ompi_op_example_setup_max(ompi_op_t *op)
{
    module_max_t *module = OBJ_NEW(module_max_t);

    /* We definitely support the single precision floating point types */

    /* Remember that we created an *example* module (vs. a *base*
       module), so we can cache extra information on there that is
       specific for the MAX operation.  Let's cache the original
       fallback function pointers, that were passed to us in this call
       (i.e., they're already assigned on the op). */

    /* C float */
    module->super.opm_fns[OMPI_OP_BASE_TYPE_FLOAT] = max_float;
    module->fallback_float = op->o_func.intrinsic.fns[OMPI_OP_BASE_TYPE_FLOAT];
    module->fallback_float_module =
        op->o_func.intrinsic.modules[OMPI_OP_BASE_TYPE_FLOAT];
    /* If you cache a fallback function, you *must* RETAIN (i.e.,
       increase the refcount) its module so that the module knows that
       it is being used and won't be freed/destructed. */
    OBJ_RETAIN(module->fallback_float_module);

    /* Fortran REAL */
    module->super.opm_fns[OMPI_OP_BASE_TYPE_REAL] = max_real;
    module->fallback_real =
        op->o_func.intrinsic.fns[OMPI_OP_BASE_TYPE_REAL];
    module->fallback_real_module =
        op->o_func.intrinsic.modules[OMPI_OP_BASE_TYPE_REAL];
    OBJ_RETAIN(module->fallback_real_module);

    /* Does our hardware support double precision? */
    if (mca_op_example_component.double_supported) {
        /* C double */
        module->super.opm_fns[OMPI_OP_BASE_TYPE_DOUBLE] = max_double;
        module->fallback_double =
            op->o_func.intrinsic.fns[OMPI_OP_BASE_TYPE_DOUBLE];
        module->fallback_double_module =
            op->o_func.intrinsic.modules[OMPI_OP_BASE_TYPE_DOUBLE];
        OBJ_RETAIN(module->fallback_double_module);

        /* Fortran DOUBLE PRECISION */
        module->super.opm_fns[OMPI_OP_BASE_TYPE_DOUBLE_PRECISION] =
            max_double_precision;
        module->fallback_double_precision =
            op->o_func.intrinsic.fns[OMPI_OP_BASE_TYPE_DOUBLE_PRECISION];
        module->fallback_double_precision_module =
            op->o_func.intrinsic.modules[OMPI_OP_BASE_TYPE_DOUBLE_PRECISION];
        OBJ_RETAIN(module->fallback_double_precision_module);
    }

    /* ...not listing the rest of the floating point-typed functions
       in this example... */

    return (ompi_op_base_module_t*) module;
}
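
/*
 * For context (an assumption about the surrounding framework, not
 * shown in this file): the component's query/selection code is what
 * invokes this setup routine, roughly:
 *
 *   ompi_op_base_module_t *module = ompi_op_example_setup_max(op);
 *
 * The op framework can then install the non-NULL entries from
 * module->opm_fns[] onto the MPI_Op, so only the datatypes this
 * module filled in override the previously-cached fallbacks.
 */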