i5-roofline/aikern.h

289 lines
8.6 KiB
C
Raw Permalink Normal View History

2020-09-03 16:49:50 +00:00
#ifndef AIKERN_H
#define AIKERN_H

#include <stddef.h> /* size_t */
/**
 * @brief Result record returned by kernel_dispatch() for one kernel.
 *
 * Holds the per-run start and end times plus the parameters needed to
 * interpret them (FLOPs per iteration, array size, kernel name).
 *
 * NOTE(review): the time unit of starts/ends and the ownership of the
 * starts, ends and kern_name pointers are not visible from this header --
 * confirm against the implementation.
 */
typedef struct {
size_t runs; // number of runs; also the number of start-/endtimes
double* starts; // start time of each run
double* ends; // end time of each run
int flops; // FLOPs per iteration
char* kern_name; // name of the kernel
size_t size; // size of the arrays handled
} kern_result;
/**
 * @brief Selector telling kernel_dispatch() which kernel to run.
 *
 * The enumerators carry their implicit values 0..5 in declaration order.
 * Which kernel function each enumerator maps to is decided by the
 * dispatcher's implementation, not by this header.
 */
typedef enum {
    SIMPLE_1_16,
    FMA_1_16,
    SIMPLE_8_1,
    FMA_8_1,
    SIMPLE_8_1_FASTMATH,
    FMA_8_1_MANPACK
} kernel_t;
/**
 * @brief Main entry point: dispatches the call to the selected kernel
 * @param kernel The kernel to run
 * @param a An array with double values of size param size
 * @param b An array with double values of size param size
 * @param c An array with double values of size param size
 * @param size The size of the arrays
 * @param runs How often the kernel should be executed
 * @return kern_result containing information about the kernel execution
 *
 * NOTE(review): most kernels declared in this header take fewer than three
 * arrays; presumably the dispatcher forwards only the arrays the selected
 * kernel needs -- confirm against the implementation.
 *
 * NOTE(review): who owns (and frees) the starts/ends buffers of the returned
 * kern_result is not visible from this header -- confirm before freeing.
 */
kern_result kernel_dispatch(kernel_t kernel,
double* a, double* b, double* c,
size_t size, size_t runs);
/**
 * @brief A simple kernel with an operational intensity (OI) of 1/16
 * @param a An array with double values of size param size
 * @param size Size of the array a
 *
 * === Warning ===
 * Don't use with -O0: everything is then stored on the stack.
 *
 * === Description ===
 * Uses a simple floating point operation: a[i] = a[i] * a[i];
 *
 * Runs in a parallelized for loop.
 *
 * === Analysis ===
 * COMM: 1 read (8 bytes) + 1 write (8 bytes) = 16 bytes
 * COMP: 1 FLOP
 * ---------
 * OI: 1/16
 *
 * === Optimization ===
 * Nothing special
 *
 */
void kernel_1_16_simple(double* a, size_t size);
/**
 * @brief A kernel with an operational intensity (OI) of 1/16 utilizing FMA
 * @param a An array with double values of size param size
 * @param b An array with double values of size param size
 * @param c An array with double values of size param size
 * @param size Size of the three param arrays
 *
 * === Warning ===
 * This is dangerous if FMA is not used/can't be used. Then there
 * are intermediary writes (and reads) to the stack.
 *
 * === Description ===
 * Uses a triad function: a[i] = a[i] * b[i] + c[i]; in order
 * to utilize the FMA unit.
 *
 * Runs in a parallelized for loop.
 *
 * === Analysis ===
 * With gcc -O2 -mavx -mfma the loop body compiles to:
 * vmovsd xmm0,QWORD PTR [rdi+rax*8] # 1 read (8 bytes)
 * vmovsd xmm1,QWORD PTR [rdx+rax*8] # 1 read
 * vfmadd132sd xmm0,xmm1,QWORD PTR [rsi+rax*8] # 2 FLOPs + 1 read
 * vmovsd QWORD PTR [rdi+rax*8],xmm0 # 1 write
 * --------
 * 3 reads + 1 write = 32 bytes, 2 FLOPs
 * 2/32 = 1/16 OI
 *
 * === Optimization ===
 * For packed doubles compile with -Ofast
 *
 */
void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size);
/**
 * @brief A simple kernel with an operational intensity (OI) of 8/1
 * @param a An array with double values of size param size
 * @param size Size of the array a
 *
 * === Warning ===
 * Don't use with -O0: everything is then stored on the stack.
 *
 * === Description ===
 * Uses a simple floating point operation: a[i] = a[i] * a[i] * ... * a[i];
 *
 * Runs in a parallelized for loop.
 *
 * === Analysis ===
 * Best results with AVX and -O2 (FMA is not required): the disassembly
 * is easy to read and obviously correct.
 *
 * With gcc -O2 -mavx compiles to:
 * vmovsd xmm1,QWORD PTR [rdi] # 1 read
 * vmulsd xmm0,xmm1,xmm1 # 1 FLOP+register shuffling
 * vmulsd xmm0,xmm0,xmm1 # 127x 1 FLOP+register shuffling
 * # [...]
 * vmovsd QWORD PTR [rdi-0x8],xmm0 # 1 write
 * --------
 * 128 FLOPs / 16 bytes = 8/1 OI
 *
 * === Optimization ===
 * Nothing special
 */
void kernel_8_1_simple(double* a, size_t size);
/**
 * @brief A kernel with an operational intensity (OI) of 8/1 utilizing FMA
 * @param a An array with double values of size param size
 * @param size Size of the array a
 *
 * === Warning ===
 * This is dangerous if FMA is not used/can't be used. Then there
 * are intermediary writes (and reads) to the stack.
 *
 * === Description ===
 * Uses the triad function a[i] = a[i] * a[i] + a[i]; multiple times
 * in order to utilize the FMA unit.
 *
 * Runs in a parallelized for loop.
 *
 * === Analysis ===
 * With gcc -O2 -mavx -mfma the loop body compiles to:
 * vmovsd xmm0,QWORD PTR [rdi] # 1 read
 * vfmadd132sd xmm0,xmm0,xmm0 # 64 x 2 FLOPs+register shuffling
 * vmovsd QWORD PTR [rdi-0x8],xmm0 # 1 write
 * --------
 * 128 FLOPs / 16 bytes = 8/1 OI
 *
 * === Optimization ===
 * For packed doubles compile with -Ofast
 *
 */
void kernel_8_1_fuseaware(double* a, size_t size);
/**
 * @brief A simple kernel with an operational intensity (OI) of 8/1 which
 * defeats the evil fast-math optimization
 * @param a An array with double values of size param size
 * @param size Size of the array a
 *
 * === Warning ===
 * Don't use with anything other than -Ofast / -ffast-math
 *
 * === Description ===
 * Uses a simple floating point operation that more closely resembles
 * that of 8_1_fuseaware:
 * a[i] = a[i]*a[i]; # 128x
 *
 * Runs in a parallelized for loop.
 *
 * === Analysis ===
 * -Ofast/-ffast-math does not preserve strict IEEE compliance. It
 * is therefore allowed to ignore the non-associativity of floating
 * point operations.
 *
 * x = x*x*x*x*x*x*x*x; is optimized to x *= x;x *= x;x *= x;
 *
 * This clearly breaks the whole OI calculation of 8_1_simple.
 *
 * This kernel does not introduce more byte write-outs than
 * 8_1_simple at a high optimization level since a[i] is held
 * in a register and only written out once at the end of an
 * iteration.
 *
 *
 * === Optimization ===
 * Nothing special
 */
void kernel_8_1_simple_fastmath(double* a, size_t size);
/********************************************
* Kernels which potentially compile to *
* different operational intensities than *
* specified *
********************************************/
/**
 * @brief A kernel intended to have an operational intensity (OI) of 1/16
 * which might compile to a kernel with a flawed OI
 * @param a An array with double values of size param size
 * @param b An array with double values of size param size
 * @param size Size of the two param arrays
 *
 * === Problem ===
 * As soon as volatile is used gcc uses the stack for tmp,
 * even if "register" is in place. This results in one additional write
 * per loop iteration.
 * Omitting volatile results in the whole loop being optimized away
 * (checked at -O2, which is necessary for FMA to eventually step in).
 * Maybe the value stays in cache, maybe not. It does not live in a register.
 *
 * Even with -O3:
 * movsd xmm0,QWORD PTR [rdi+rax*8] # 1 read
 * mulsd xmm0,QWORD PTR [rsi+rax*8] # 1 read (+ write to xmm0, not counted)
 * # [...] # instructions for loop
 * movsd QWORD PTR [rsp-0x8],xmm0 # malicious write
 *
 * Without volatile (-O3):
 * repz ret # that's it
 */
void kernel_1_16_simple_dangerous(double* a, double* b, size_t size);
/**
 * @brief A kernel intended to have an operational intensity (OI) of 8/1
 * which might compile to a kernel with a flawed OI
 * @param a An array with double values of size param size
 * @param size Size of the array a
 *
 * === Problem ===
 * Same as for kernel_1_16_simple_dangerous
 */
void kernel_8_1_simple_dangerous(double* a, size_t size);
/**
 * @brief A kernel intended to have an operational intensity (OI) of 1/8
 * which might compile to a kernel with a flawed OI
 * @param a An array with double values of size param size
 * @param size Size of the array a
 *
 * === Problem ===
 * Same as for kernel_1_16_simple_dangerous
 *
 * Without volatile the loop is optimized away completely.
 * With volatile tmp is written to the stack in every loop
 * iteration (-O3). tmp could be cached or not. This might depend on
 * how large the array is and how the CPU works internally
 * -> unpredictable.
 */
void kernel_1_8_vo_dangerous(double* a, size_t size);
/**
 * @brief Kernel that is only declared when INTRINS is defined
 * @param a Array of double values (presumably of size param size, as for
 * the other kernels in this header -- confirm)
 * @param size Presumably the size of the array a -- confirm
 *
 * NOTE(review): judging by its name this is presumably a manually packed,
 * intrinsics-based variant of kernel_8_1_fuseaware -- confirm against the
 * implementation, which is not part of this header.
 *
 * The kernel_t enumerator FMA_8_1_MANPACK is declared regardless of
 * whether INTRINS is defined.
 */
#ifdef INTRINS
void kernel_8_1_fuseaware_manpack(double* a, size_t size);
#endif
/****************************************
* Helper macros for repeating things *
****************************************/
/*
 * REPn(X) pastes the token sequence X n times in a row.
 *
 * Larger counts are composed from two roughly equal halves instead of a
 * linear "+1" / "+10" chain. Every copy of X is still emitted by its own
 * REP1(X) invocation, so the resulting token stream is unchanged.
 * Counts can be combined at the call site, e.g.
 * REP100(X) REP20(X) REP8(X) repeats X 128 times.
 */

/* base cases */
#define REP0(X)
#define REP1(X) X

/* 2 .. 10 */
#define REP2(X) REP1(X) REP1(X)
#define REP3(X) REP1(X) REP2(X)
#define REP4(X) REP2(X) REP2(X)
#define REP5(X) REP2(X) REP3(X)
#define REP6(X) REP3(X) REP3(X)
#define REP7(X) REP3(X) REP4(X)
#define REP8(X) REP4(X) REP4(X)
#define REP9(X) REP4(X) REP5(X)
#define REP10(X) REP5(X) REP5(X)

/* tens */
#define REP20(X) REP10(X) REP10(X)
#define REP30(X) REP10(X) REP20(X)
#define REP40(X) REP20(X) REP20(X)
#define REP50(X) REP20(X) REP30(X)
#define REP60(X) REP30(X) REP30(X)
#define REP100(X) REP50(X) REP50(X)
/*
 * DEBUG(...): printf-style diagnostic. When ENDEBUG is defined the message
 * is written to stderr followed by a newline; otherwise the macro expands
 * to nothing and its arguments are never evaluated.
 *
 * This header does not pull in <stdio.h> itself, so a translation unit that
 * defines ENDEBUG and uses DEBUG has to include it.
 */
#ifndef ENDEBUG
#define DEBUG(...)
#else
#define DEBUG(...)                    \
    do {                              \
        fprintf(stderr, __VA_ARGS__); \
        fprintf(stderr, "\n");        \
    } while(0)
#endif
#endif /* AIKERN_H */