/* Declarations of the operational-intensity (OI) benchmark kernels and helper macros. */
#ifndef AIKERN_H
#define AIKERN_H

#include <stddef.h> /* size_t */
#include <stdio.h>  /* fprintf, used by the DEBUG macro */

/**
 * @brief Timing and bookkeeping data returned by kernel_dispatch().
 */
typedef struct {
    size_t  runs;      /* number of runs; also the number of start-/end times */
    double* starts;    /* start times, one per run */
    double* ends;      /* end times, one per run */
    int     flops;     /* flops per iteration */
    char*   kern_name; /* kernel name string */
    size_t  size;      /* size of the arrays handled */
} kern_result;

/**
 * @brief Selector passed to kernel_dispatch() to choose the kernel to run.
 *
 * NOTE(review): the enumerator names mirror the kernel_* declarations in
 * this header; the exact mapping is defined by kernel_dispatch()'s
 * implementation - confirm there.
 */
typedef enum {
    SIMPLE_1_16,
    FMA_1_16,
    SIMPLE_8_1,
    FMA_8_1,
    SIMPLE_8_1_FASTMATH,
    FMA_8_1_MANPACK
} kernel_t;

/**
|
||
|
* @brief main entry point. Dispatches the kernel calls
|
||
|
* @param kernel the kernel to run
|
||
|
* @param a An array with double values of size param size
|
||
|
* @param b An array with double values of size param size
|
||
|
* @param c An array with double values of size param size
|
||
|
* @param size The size of the arrays
|
||
|
* @param runs How often the kernel should be executed
|
||
|
* @return kern_result containing information about the kernel execution
|
||
|
*
|
||
|
*
|
||
|
*/
|
||
|
kern_result kernel_dispatch(kernel_t kernel,
|
||
|
double* a, double* b, double* c,
|
||
|
size_t size, size_t runs);
|
||
|
|
||
|
/**
 * @brief A simple 1/16 operational intensity kernel
 * @param a    An array with double values of size param size
 * @param size Size of the array a
 *
 * === Warning ===
 *   Don't use with -O0: Stores everything on stack
 *
 * === Description ===
 *   Uses a simple floating point operation: a[i] = a[i] * a[i];
 *
 *   Runs in a parallelized for loop.
 *
 * === Analysis ===
 *   COMM: 1 read (8 byte), 1 write = 16 bytes
 *   COMP: 1 FLOP
 *   ---------
 *   OI: 1/16
 *
 * === Optimization ===
 *   Nothing special
 */
void kernel_1_16_simple(double* a, size_t size);


/**
 * @brief A 1/16 operational intensity kernel utilizing FMA
 * @param a    An array with double values of size param size
 * @param b    An array with double values of size param size
 * @param c    An array with double values of size param size
 * @param size Size of the three param arrays
 *
 * === Warning ===
 *   This is dangerous if FMA is not used/can't be used. Then there
 *   are intermediary writes (and reads) to the stack.
 *
 * === Description ===
 *   Uses a triad function: a[i] = a[i] * b[i] + c[i]; in order
 *   to utilize the FMA unit.
 *
 *   Runs in a parallelized for loop.
 *
 * === Analysis ===
 *   With gcc -O2 -mavx -mfma FMA compiles to:
 *     vmovsd      xmm0,QWORD PTR [rdi+rax*8]        # 1 read (8 byte)
 *     vmovsd      xmm1,QWORD PTR [rdx+rax*8]        # 1 read
 *     vfmadd132sd xmm0,xmm1,QWORD PTR [rsi+rax*8]   # 2 FLOPs + 1 read
 *     vmovsd      QWORD PTR [rdi+rax*8],xmm0        # 1 write
 *     --------
 *     1/16 OI
 *
 * === Optimization ===
 *   For packed doubles compile with -Ofast
 */
void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size);



/**
 * @brief A simple 8/1 operational intensity kernel
 * @param a    An array with double values of size param size
 * @param size Size of the array a
 *
 * === Warning ===
 *   Don't use with -O0: Stores everything on stack
 *
 * === Description ===
 *   Uses a simple floating point operation: a[i] = a[i] * a[i] * ... * a[i];
 *
 *   Runs in a parallelized for loop.
 *
 * === Analysis ===
 *   Best results with AVX and -O2 (not necessarily FMA): the disassembly
 *   is easy to read and obviously correct.
 *
 *   With gcc -O2 -mavx compiles to:
 *     vmovsd xmm1,QWORD PTR [rdi]       # 1 read
 *     vmulsd xmm0,xmm1,xmm1             # 1 FLOP+register shuffling
 *     vmulsd xmm0,xmm0,xmm1             # 127x 1 FLOP+register shuffling
 *                                       # [...]
 *     vmovsd QWORD PTR [rdi-0x8],xmm0   # 1 write
 *     --------
 *     128/16 = 8/1 OI
 *
 * === Optimization ===
 *   Nothing special
 */
void kernel_8_1_simple(double* a, size_t size);

/**
 * @brief A 8/1 operational intensity kernel utilizing FMA
 * @param a    An array with double values of size param size
 * @param size Size of the array a
 *
 * === Warning ===
 *   This is dangerous if FMA is not used/can't be used. Then there
 *   are intermediary writes (and reads) to the stack.
 *
 * === Description ===
 *   Uses multiple triad functions: a[i] = a[i] * a[i] + a[i]; in order
 *   to utilize the FMA unit.
 *
 *   Runs in a parallelized for loop.
 *
 * === Analysis ===
 *   With gcc -O2 -mavx -mfma FMA compiles to:
 *     vmovsd      xmm0,QWORD PTR [rdi]       # 1 read
 *     vfmadd132sd xmm0,xmm0,xmm0             # 64 x 2 FLOPs+register shuffling
 *     vmovsd      QWORD PTR [rdi-0x8],xmm0   # 1 write
 *     --------
 *     128/16 = 8/1 OI
 *
 * === Optimization ===
 *   For packed doubles compile with -Ofast
 */
void kernel_8_1_fuseaware(double* a, size_t size);

/**
 * @brief A simple 8/1 operational intensity kernel which
 *        undermines evil fastmath optimization
 * @param a    An array with double values of size param size
 * @param size Size of the array a
 *
 * === Warning ===
 *   Don't use with anything other than -Ofast / -ffast-math
 *
 * === Description ===
 *   Uses a simple floating point operation that more closely resembles
 *   that of 8_1_fuseaware:
 *     a[i] = a[i]*a[i]; # 128x
 *
 *   Runs in a parallelized for loop.
 *
 * === Analysis ===
 *   -Ofast/-ffast-math does not preserve strict IEEE compliance. It
 *   therefore is allowed to ignore non-associativity of floating
 *   point operations.
 *
 *   x = x*x*x*x*x*x*x*x; is optimized to x *= x;x *= x;x *= x;
 *
 *   This clearly breaks the whole OI calculation of 8_1_simple.
 *
 *   This kernel does not introduce more byte write-outs than
 *   8_1_simple at a high optimization level since a[i] is held
 *   in a register and only written out once at the end of an
 *   iteration.
 *
 * === Optimization ===
 *   Nothing special
 */
void kernel_8_1_simple_fastmath(double* a, size_t size);


/********************************************
 * Kernels which potentially compile to     *
 * different operational intensities than   *
 * specified                                *
 ********************************************/

/**
 * @brief A 1/16 operational intensity which might compile to a flawed oi kernel
 * @param a    An array with double values of size param size
 * @param b    An array with double values of size param size
 * @param size Size of the arrays a and b
 *
 * === Problem ===
 *   As soon as volatile is used gcc uses the stack for tmp,
 *   even if "register" is in place, resulting in one additional write per loop.
 *   Omitting volatile results in optimizing away the whole loop
 *   (checked at -O2, which is necessary for FMA to eventually step in).
 *   Maybe the value stays in cache, maybe not. It does not live in a register.
 *
 *   Even with -O3:
 *     movsd xmm0,QWORD PTR [rdi+rax*8]   # 1 read
 *     mulsd xmm0,QWORD PTR [rsi+rax*8]   # 1 read (+ write to xmm0, not counted)
 *     # [...]                            # instructions for loop
 *     movsd QWORD PTR [rsp-0x8],xmm0     # malicious write
 *
 *   Without volatile (-O3):
 *     repz ret                           # that's it
 */
void kernel_1_16_simple_dangerous(double* a, double* b, size_t size);

/**
 * @brief A 8/1 operational intensity which might compile to a flawed oi kernel
 * @param a    An array with double values of size param size
 * @param size Size of the array a
 *
 * === Problem ===
 *   Same as for kernel_1_16_simple_dangerous
 */
void kernel_8_1_simple_dangerous(double* a, size_t size);

/**
 * @brief A 1/8 operational intensity which might compile to a flawed oi kernel
 * @param a    An array with double values of size param size
 * @param size Size of the array a
 *
 * === Problem ===
 *   Same as for kernel_1_16_simple_dangerous
 *
 *   Without volatile the loop is optimized away completely.
 *   With volatile tmp is written to the stack in every loop
 *   (-O3). tmp could be cached or not. This might depend on
 *   how large the array is and how the CPU works internally
 *   -> unpredictable.
 */
void kernel_1_8_vo_dangerous(double* a, size_t size);


#ifdef INTRINS
/**
 * @brief Only declared when INTRINS is defined.
 * @param a    Array of doubles (same parameter shape as kernel_8_1_fuseaware)
 * @param size Size of the array a
 *
 * NOTE(review): presumably a manually packed, intrinsics-based variant of
 * kernel_8_1_fuseaware - confirm against the implementation.
 */
void kernel_8_1_fuseaware_manpack(double* a, size_t size);
#endif



/****************************************
 * Helper macros for repeating things   *
 ****************************************/

/*
 * REPn(X) expands to the token sequence X repeated n times, e.g.
 * REP3(x *= x;) becomes "x *= x; x *= x; x *= x;".
 * X is pasted verbatim (not parenthesized), so it may be a full statement.
 * Counts are composed by juxtaposition: REP100(REP20(X) REP8(X)) repeats X
 * 128 times.
 */
#define REP0(X)
#define REP1(X) X
#define REP2(X) REP1(X) REP1(X)
#define REP3(X) REP2(X) REP1(X)
#define REP4(X) REP3(X) REP1(X)
#define REP5(X) REP4(X) REP1(X)
#define REP6(X) REP5(X) REP1(X)
#define REP7(X) REP6(X) REP1(X)
#define REP8(X) REP7(X) REP1(X)
#define REP9(X) REP8(X) REP1(X)

#define REP10(X) REP9(X) REP1(X)
#define REP20(X) REP10(X) REP10(X)
#define REP30(X) REP20(X) REP10(X)
#define REP40(X) REP30(X) REP10(X)
#define REP50(X) REP40(X) REP10(X)
#define REP60(X) REP50(X) REP10(X)

#define REP100(X) REP50(X) REP50(X)

/*
 * DEBUG(fmt, ...): when ENDEBUG is defined, prints a printf-style message
 * to stderr followed by a newline. Otherwise it expands to an empty
 * statement and its arguments are not evaluated.
 * Both forms are a single statement, so DEBUG(...); is safe inside
 * unbraced if/else bodies.
 */
#ifdef ENDEBUG
#define DEBUG(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while (0)
#else
#define DEBUG(...) do { } while (0)
#endif

#endif /* AIKERN_H */