1
1
openmpi/ompi/mca/coll/libnbc/nbc_op.c.m4
Torsten Hoefler 6b22641669 added LibNBC (http://www.unixer.de/NBC) as collv1 (blocking) component.
I know it does not make much sense but one can play around with the
performance. Numbers are available at http://www.unixer.de/research/nbcoll/perf/.
This is the first step towards collv2. Next step includes the addition
of non-blocking functions to the MPI-Layer and the collv1 interface.

It implements all MPI-1 collective algorithms in a non-blocking manner.
However, the collv1 interface does not allow non-blocking collectives so
that all collectives are used blocking by the ompi-glue layer.

I wanted to add LibNBC as a separate subdirectory, but I could not
convince the buildsystem (and did not have the time). So the component looks
pretty messy. It would be great if somebody could explain to me how to move
all nbc*{c,h}, and {hb,dict}*{c,h} to a separate subdirectory.

It's .ompi_ignored because I did not test it exhaustively yet.

This commit was SVN r11401.
2006-08-24 16:47:18 +00:00

245 строки
10 KiB
Plaintext

dnl/*
dnl * Copyright (c) 2006 The Trustees of Indiana University and Indiana
dnl * University Research and Technology
dnl * Corporation. All rights reserved.
dnl * Copyright (c) 2006 The Technical University of Chemnitz. All
dnl * rights reserved.
dnl */
dnl
dnl this m4 code generates all MPI intrinsic operations
dnl every macro is prefixed with m4_ to retain clarity (this means that
dnl everything prefixed with m4_ will be replaced by m4!)
dnl
dnl
dnl
dnl ########## define all MPI intrinsic Operations and appropriate C code #############
dnl Each m4_OP_<op>(suffix) expands to a single C statement that combines the
dnl operands m4_ARG1<suffix> and m4_ARG2<suffix> and stores the result in
dnl m4_ARG3<suffix>. The suffix selects the per-datatype operand macros.
dnl
dnl -- ordering: keep the smaller / larger of the two operands
define(`m4_OP_MPI_MIN', `m4_ARG3$1 = (m4_ARG1$1 > m4_ARG2$1) ? m4_ARG2$1 : m4_ARG1$1;')dnl
define(`m4_OP_MPI_MAX', `m4_ARG3$1 = (m4_ARG1$1 < m4_ARG2$1) ? m4_ARG2$1 : m4_ARG1$1;')dnl
dnl -- arithmetic
define(`m4_OP_MPI_SUM', `m4_ARG3$1 = (m4_ARG1$1) + (m4_ARG2$1);')dnl
define(`m4_OP_MPI_PROD', `m4_ARG3$1 = (m4_ARG1$1) * (m4_ARG2$1);')dnl
dnl -- logical and bitwise AND
define(`m4_OP_MPI_LAND', `m4_ARG3$1 = (m4_ARG1$1) && (m4_ARG2$1);')dnl
define(`m4_OP_MPI_BAND', `m4_ARG3$1 = (m4_ARG1$1) & (m4_ARG2$1);')dnl
dnl -- logical and bitwise OR
define(`m4_OP_MPI_LOR', `m4_ARG3$1 = (m4_ARG1$1) || (m4_ARG2$1);')dnl
define(`m4_OP_MPI_BOR', `m4_ARG3$1 = (m4_ARG1$1) | (m4_ARG2$1);')dnl
dnl -- logical XOR compares the truth values, bitwise XOR the raw bits
define(`m4_OP_MPI_LXOR', `m4_ARG3$1 = (!(m4_ARG1$1) != !(m4_ARG2$1));')dnl
define(`m4_OP_MPI_BXOR', `m4_ARG3$1 = (m4_ARG1$1) ^ (m4_ARG2$1);')dnl
dnl m4_OP_MPI_MINLOC expands to the C code that reduces two value/rank pairs:
dnl the pair holding the smaller value is stored in the result pair.
dnl When both values compare equal the MPI standard requires the result to
dnl carry the lower of the two ranks, so that case is handled explicitly.
dnl In the tie branch the rank is written before the value; both ranks are
dnl read before anything is stored, so the result pair may alias an operand.
define(`m4_OP_MPI_MINLOC', `if(m4_ARG1$1_VAL > m4_ARG2$1_VAL) {
  m4_ARG3$1_VAL = m4_ARG2$1_VAL; m4_ARG3$1_RANK = m4_ARG2$1_RANK;
} else if(m4_ARG1$1_VAL == m4_ARG2$1_VAL) {
  m4_ARG3$1_RANK = (m4_ARG2$1_RANK < m4_ARG1$1_RANK) ? m4_ARG2$1_RANK : m4_ARG1$1_RANK;
  m4_ARG3$1_VAL = m4_ARG1$1_VAL;
} else {
  m4_ARG3$1_VAL = m4_ARG1$1_VAL; m4_ARG3$1_RANK = m4_ARG1$1_RANK;
}')dnl
dnl m4_OP_MPI_MAXLOC expands to the C code that reduces two value/rank pairs:
dnl the pair holding the larger value is stored in the result pair.
dnl When both values compare equal the MPI standard requires the result to
dnl carry the lower of the two ranks, so that case is handled explicitly.
dnl In the tie branch the rank is written before the value; both ranks are
dnl read before anything is stored, so the result pair may alias an operand.
define(`m4_OP_MPI_MAXLOC', `if(m4_ARG1$1_VAL < m4_ARG2$1_VAL) {
  m4_ARG3$1_VAL = m4_ARG2$1_VAL; m4_ARG3$1_RANK = m4_ARG2$1_RANK;
} else if(m4_ARG1$1_VAL == m4_ARG2$1_VAL) {
  m4_ARG3$1_RANK = (m4_ARG2$1_RANK < m4_ARG1$1_RANK) ? m4_ARG2$1_RANK : m4_ARG1$1_RANK;
  m4_ARG3$1_VAL = m4_ARG1$1_VAL;
} else {
  m4_ARG3$1_VAL = m4_ARG1$1_VAL; m4_ARG3$1_RANK = m4_ARG1$1_RANK;
}')dnl
dnl
dnl ########## define helper macros #################
dnl ########## loop-unrolled version -> slows it down :-( ######
dnl define(m4_IF, `if(op == $1) {
dnl /* loop unrolling - 4 */
dnl for(i=0; i<count-3; i=i+4) {
dnl m4_CTYPE_$2 val11, val12, val21, val22, val31, val32, val41, val42;
dnl
dnl val11 = *(((m4_CTYPE_$2*)buf1) + i);
dnl val12 = *(((m4_CTYPE_$2*)buf2) + i);
dnl val21 = *(((m4_CTYPE_$2*)buf1) + i+1);
dnl val22 = *(((m4_CTYPE_$2*)buf2) + i+1);
dnl val31 = *(((m4_CTYPE_$2*)buf1) + i+2);
dnl val32 = *(((m4_CTYPE_$2*)buf2) + i+2);
dnl val41 = *(((m4_CTYPE_$2*)buf1) + i+3);
dnl val42 = *(((m4_CTYPE_$2*)buf2) + i+3);
dnl
dnl define(m4_ARG11_$1$2, val11)dnl
dnl define(m4_ARG21_$1$2, val12)dnl
dnl define(m4_ARG31_$1$2, val11)dnl
dnl m4_OP_$1(1_$1$2)
dnl define(m4_ARG12_$1$2, val21)dnl
dnl define(m4_ARG22_$1$2, val22)dnl
dnl define(m4_ARG32_$1$2, val21)dnl
dnl m4_OP_$1(2_$1$2)
dnl define(m4_ARG13_$1$2, val31)dnl
dnl define(m4_ARG23_$1$2, val32)dnl
dnl define(m4_ARG33_$1$2, val31)dnl
dnl m4_OP_$1(3_$1$2)
dnl define(m4_ARG14_$1$2, val41)dnl
dnl define(m4_ARG24_$1$2, val42)dnl
dnl define(m4_ARG34_$1$2, val41)dnl
dnl m4_OP_$1(4_$1$2)
dnl
dnl *(((m4_CTYPE_$2*)buf3) + i) = val11;
dnl *(((m4_CTYPE_$2*)buf3) + i+1) = val21;
dnl *(((m4_CTYPE_$2*)buf3) + i+2) = val31;
dnl *(((m4_CTYPE_$2*)buf3) + i+3) = val41;
dnl }
dnl for(i=i+4;i<count;i++) {
dnl m4_CTYPE_$2 val11, val12;
dnl
dnl val11 = *(((m4_CTYPE_$2*)buf1) + i);
dnl val12 = *(((m4_CTYPE_$2*)buf2) + i);
dnl
dnl define(m4_ARG15_$1$2, val11)dnl
dnl define(m4_ARG25_$1$2, val12)dnl
dnl define(m4_ARG35_$1$2, val11)dnl
dnl m4_OP_$1(5_$1$2)
dnl
dnl *(((m4_CTYPE_$2*)buf3) + i) = val11;
dnl }
dnl }')dnl
dnl ##########################################################
dnl ########### THIS is faster than the unrolled code :-(( #####
dnl m4_IF(op, type) emits the C branch for one operation on one datatype:
dnl   if(op == <op>) { loop over count elements applying m4_OP_<op> }
dnl The three operand macros m4_ARG1_<type>, m4_ARG2_<type> and m4_ARG3_<type>
dnl are (re)defined here to address element i of buf1, buf2 and buf3 using the
dnl C type registered as m4_CTYPE_<type>.
dnl The operand macro names are quoted on purpose: this macro is invoked
dnl several times per datatype, and an unquoted name that is already defined
dnl would be expanded before define sees it, creating a stray macro named
dnl after C text instead of redefining the operand macro.
define(`m4_IF', `if(op == $1) {
  for(i=0; i<count; i++) {
define(`m4_ARG1_$2', `((m4_CTYPE_$2*)buf1)[i]')dnl
define(`m4_ARG2_$2', `((m4_CTYPE_$2*)buf2)[i]')dnl
define(`m4_ARG3_$2', `((m4_CTYPE_$2*)buf3)[i]')dnl
    m4_OP_$1(_$2)
  }
}')dnl
dnl ###############################################
dnl m4_LOCIF(op, type) emits the C branch for a value/rank pair reduction
dnl (used with MPI_MINLOC and MPI_MAXLOC):
dnl   if(op == <op>) { loop over count pairs applying m4_OP_<op> }
dnl The pair layout is a struct of m4_CTYPE1_<type> val followed by
dnl m4_CTYPE2_<type> rank; the struct is named m4_CTYPE3_<type> and is local
dnl to the emitted branch, so the same name can be reused in other branches.
dnl The operand macro names are quoted on purpose: this macro is invoked many
dnl times, and an unquoted name that is already defined would be expanded
dnl before define sees it, creating a stray macro named after C text.
define(`m4_LOCIF', `if(op == $1) {
  typedef struct {
    m4_CTYPE1_$2 val;
    m4_CTYPE2_$2 rank;
  } m4_CTYPE3_$2;
  for(i=0; i<count; i++) {
    m4_CTYPE3_$2 *ptr1 = ((m4_CTYPE3_$2*)buf1) + i;
    m4_CTYPE3_$2 *ptr2 = ((m4_CTYPE3_$2*)buf2) + i;
    m4_CTYPE3_$2 *ptr3 = ((m4_CTYPE3_$2*)buf3) + i;
define(`m4_ARG1_VAL', `ptr1->val')dnl
define(`m4_ARG2_VAL', `ptr2->val')dnl
define(`m4_ARG3_VAL', `ptr3->val')dnl
define(`m4_ARG1_RANK', `ptr1->rank')dnl
define(`m4_ARG2_RANK', `ptr2->rank')dnl
define(`m4_ARG3_RANK', `ptr3->rank')dnl
    m4_OP_$1
  }
}')dnl
dnl ##########################################################
dnl m4_TYPE(type) emits the C branch that is taken when the runtime datatype
dnl equals the given MPI datatype. The branch body is whatever the matching
dnl m4_OPTYPE_<type> macro expands to, i.e. the chain of operations that are
dnl generated for that datatype.
define(m4_TYPE, `if(type == $1) {
m4_OPTYPE_$1($1)
}')dnl
dnl ########## define possible operations for each type
dnl
dnl
dnl ####### operations shared by every integer datatype ########
dnl m4_INT_OPS(type) emits the else-if chain over all operations generated for
dnl integer datatypes and finishes by rejecting any other operation.
define(`m4_INT_OPS', `m4_IF(MPI_MIN, $1) else dnl
m4_IF(MPI_MAX, $1) else dnl
m4_IF(MPI_SUM, $1) else dnl
m4_IF(MPI_PROD, $1) else dnl
m4_IF(MPI_LAND, $1) else dnl
m4_IF(MPI_BAND, $1) else dnl
m4_IF(MPI_LOR, $1) else dnl
m4_IF(MPI_BOR, $1) else dnl
m4_IF(MPI_LXOR, $1) else dnl
m4_IF(MPI_BXOR, $1) else dnl
return NBC_OP_NOT_SUPPORTED;')dnl
dnl
dnl Each m4_OPTYPE_<type> below registers the C element type for the MPI
dnl datatype and then emits the shared integer operation chain.
dnl
dnl ####### MPI_INT ########
define(`m4_OPTYPE_MPI_INT', `define(`m4_CTYPE_$1', `int')m4_INT_OPS($1)')dnl
dnl
dnl ####### MPI_LONG ########
define(`m4_OPTYPE_MPI_LONG', `define(`m4_CTYPE_$1', `long')m4_INT_OPS($1)')dnl
dnl
dnl ####### MPI_SHORT ########
define(`m4_OPTYPE_MPI_SHORT', `define(`m4_CTYPE_$1', `short')m4_INT_OPS($1)')dnl
dnl
dnl ####### MPI_UNSIGNED ########
define(`m4_OPTYPE_MPI_UNSIGNED', `define(`m4_CTYPE_$1', `unsigned int')m4_INT_OPS($1)')dnl
dnl
dnl ####### MPI_UNSIGNED_LONG ########
define(`m4_OPTYPE_MPI_UNSIGNED_LONG', `define(`m4_CTYPE_$1', `unsigned long')m4_INT_OPS($1)')dnl
dnl
dnl ####### MPI_UNSIGNED_SHORT ########
define(`m4_OPTYPE_MPI_UNSIGNED_SHORT', `define(`m4_CTYPE_$1', `unsigned short')m4_INT_OPS($1)')dnl
dnl
dnl ####### operations shared by every floating point datatype ########
dnl m4_FLOAT_OPS(type) emits the else-if chain over the operations generated
dnl for floating point datatypes (minimum, maximum, sum, product) and finishes
dnl by rejecting any other operation.
define(`m4_FLOAT_OPS', `m4_IF(MPI_MIN, $1) else dnl
m4_IF(MPI_MAX, $1) else dnl
m4_IF(MPI_SUM, $1) else dnl
m4_IF(MPI_PROD, $1) else dnl
return NBC_OP_NOT_SUPPORTED;')dnl
dnl
dnl ####### MPI_FLOAT ########
define(`m4_OPTYPE_MPI_FLOAT', `define(`m4_CTYPE_$1', `float')m4_FLOAT_OPS($1)')dnl
dnl
dnl ####### MPI_DOUBLE ########
define(`m4_OPTYPE_MPI_DOUBLE', `define(`m4_CTYPE_$1', `double')m4_FLOAT_OPS($1)')dnl
dnl
dnl ####### MPI_LONG_DOUBLE ########
define(`m4_OPTYPE_MPI_LONG_DOUBLE', `define(`m4_CTYPE_$1', `long double')m4_FLOAT_OPS($1)')dnl
dnl
dnl ####### MPI_BYTE ########
dnl Bytes are processed as C char elements; only the bitwise operations are
dnl generated for them, every other operation is rejected.
define(`m4_OPTYPE_MPI_BYTE', `define(`m4_CTYPE_$1', `char')dnl
m4_IF(MPI_BAND, $1) else dnl
m4_IF(MPI_BOR, $1) else dnl
m4_IF(MPI_BXOR, $1) else dnl
return NBC_OP_NOT_SUPPORTED;')dnl
dnl
dnl ####### operations shared by every value/rank pair datatype ########
dnl m4_LOC_OPS(type) emits the else-if chain over MPI_MAXLOC and MPI_MINLOC
dnl and finishes by rejecting any other operation.
define(`m4_LOC_OPS', `m4_LOCIF(MPI_MAXLOC, $1) else dnl
m4_LOCIF(MPI_MINLOC, $1) else dnl
return NBC_OP_NOT_SUPPORTED;')dnl
dnl
dnl Each m4_OPTYPE_<type> below registers three names for the pair datatype:
dnl   m4_CTYPE1_<type>  C type of the value member
dnl   m4_CTYPE2_<type>  C type of the rank member
dnl   m4_CTYPE3_<type>  name given to the generated pair struct
dnl
dnl ####### MPI_FLOAT_INT ########
define(`m4_OPTYPE_MPI_FLOAT_INT', `dnl
define(`m4_CTYPE1_$1', `float')dnl
define(`m4_CTYPE2_$1', `int')dnl
define(`m4_CTYPE3_$1', `float_int')dnl
m4_LOC_OPS($1)')dnl
dnl
dnl ####### MPI_DOUBLE_INT ########
define(`m4_OPTYPE_MPI_DOUBLE_INT', `dnl
define(`m4_CTYPE1_$1', `double')dnl
define(`m4_CTYPE2_$1', `int')dnl
define(`m4_CTYPE3_$1', `double_int')dnl
m4_LOC_OPS($1)')dnl
dnl
dnl ####### MPI_LONG_INT ########
define(`m4_OPTYPE_MPI_LONG_INT', `dnl
define(`m4_CTYPE1_$1', `long')dnl
define(`m4_CTYPE2_$1', `int')dnl
define(`m4_CTYPE3_$1', `long_int')dnl
m4_LOC_OPS($1)')dnl
dnl
dnl ####### MPI_2INT ########
define(`m4_OPTYPE_MPI_2INT', `dnl
define(`m4_CTYPE1_$1', `int')dnl
define(`m4_CTYPE2_$1', `int')dnl
define(`m4_CTYPE3_$1', `int_int')dnl
m4_LOC_OPS($1)')dnl
dnl
dnl ####### MPI_SHORT_INT ########
define(`m4_OPTYPE_MPI_SHORT_INT', `dnl
define(`m4_CTYPE1_$1', `short')dnl
define(`m4_CTYPE2_$1', `int')dnl
define(`m4_CTYPE3_$1', `short_int')dnl
m4_LOC_OPS($1)')dnl
dnl
dnl ####### MPI_LONG_DOUBLE_INT ########
define(`m4_OPTYPE_MPI_LONG_DOUBLE_INT', `dnl
define(`m4_CTYPE1_$1', `long double')dnl
define(`m4_CTYPE2_$1', `int')dnl
define(`m4_CTYPE3_$1', `long_double_int')dnl
m4_LOC_OPS($1)')dnl
dnl
dnl ####### begin the real program :-) #########
dnl
#include "nbc.h"
/****************** THIS FILE is automatically generated *********************
* changes will be deleted at the next generation of this file - see nbc_op.c.m4 */
/* Element-wise reduction of two buffers with a predefined MPI operation.
 *
 * For every i from 0 up to count - 1 the result of buf1[i] op buf2[i] is
 * stored in buf3[i]. The buffers are read as arrays of the C type that
 * corresponds to the given datatype; for the value/rank pair datatypes
 * they are read as arrays of structs holding a value followed by a rank.
 *
 * Returns NBC_OK after the elements have been combined,
 * NBC_OP_NOT_SUPPORTED when the operation is not generated for the given
 * datatype and NBC_DATATYPE_NOT_SUPPORTED when the datatype is not one of
 * those listed below. */
int NBC_Operation(void *buf3, void *buf1, void *buf2, MPI_Op op, MPI_Datatype type, int count) {
/* element counter used by every generated loop */
int i;
dnl One branch per supported datatype; each branch either runs a loop and
dnl falls through to the final return or returns an error code itself.
m4_TYPE(MPI_INT) else dnl
m4_TYPE(MPI_LONG) else dnl
m4_TYPE(MPI_SHORT) else dnl
m4_TYPE(MPI_UNSIGNED) else dnl
m4_TYPE(MPI_UNSIGNED_LONG) else dnl
m4_TYPE(MPI_UNSIGNED_SHORT) else dnl
m4_TYPE(MPI_FLOAT) else dnl
m4_TYPE(MPI_DOUBLE) else dnl
m4_TYPE(MPI_LONG_DOUBLE) else dnl
m4_TYPE(MPI_BYTE) else dnl
m4_TYPE(MPI_FLOAT_INT) else dnl
m4_TYPE(MPI_DOUBLE_INT) else dnl
m4_TYPE(MPI_LONG_INT) else dnl
m4_TYPE(MPI_2INT) else dnl
m4_TYPE(MPI_SHORT_INT) else dnl
m4_TYPE(MPI_LONG_DOUBLE_INT) else dnl
return NBC_DATATYPE_NOT_SUPPORTED;
return NBC_OK;
}