#ifndef AIKERN_H
#define AIKERN_H

#include <stddef.h> /* size_t */
#include <stdio.h>  /* fprintf, stderr (used by DEBUG) */

/**
 * @brief Timing and bookkeeping information about one kernel execution.
 */
typedef struct {
    size_t  runs;      /* number of runs; also the number of start/end times */
    double* starts;    /* start times, one per run */
    double* ends;      /* end times, one per run */
    int     flops;     /* flops per iteration */
    char*   kern_name; /* name of the kernel */
    size_t  size;      /* size of the arrays handled */
} kern_result;

/**
 * @brief Selects the kernel that kernel_dispatch() should run.
 *
 * NOTE(review): the mapping from each enumerator to a kernel function is
 * defined by the kernel_dispatch() implementation, which is not part of this
 * header; the names suggest a one-to-one match with the kernels declared
 * below -- confirm against the implementation.
 */
typedef enum {
    SIMPLE_1_16,
    FMA_1_16,
    SIMPLE_8_1,
    FMA_8_1,
    SIMPLE_8_1_FASTMATH,
    FMA_8_1_MANPACK
} kernel_t;

/**
 * @brief main entry point. Dispatches the kernel calls
 * @param kernel the kernel to run
 * @param a An array with double values of size param size
 * @param b An array with double values of size param size
 * @param c An array with double values of size param size
 * @param size The size of the arrays
 * @param runs How often the kernel should be executed
 * @return kern_result containing information about the kernel execution
 */
kern_result kernel_dispatch(kernel_t kernel, double* a, double* b, double* c,
                            size_t size, size_t runs);

/**
 * @brief A simple 1/16 operational intensity kernel
 * @param a An array with double values of size param size
 * @param size Number of elements in a
 *
 * === Warning ===
 * Don't use with -O0: Stores everything on stack
 *
 * === Description ===
 * Uses a simple floating point operation: a[i] = a[i] * a[i];
 *
 * Runs in a parallelized for loop.
 *
 * === Analysis ===
 * COMM: 1 read (8 byte), 1 write = 16 bytes
 * COMP: 1 FLOP
 * ---------
 * OI: 1/16
 *
 * === Optimization ===
 * Nothing special
 */
void kernel_1_16_simple(double* a, size_t size);

/**
 * @brief A 1/16 operational intensity kernel utilizing FMA
 * @param a An array with double values of size param size
 * @param b An array with double values of size param size
 * @param c An array with double values of size param size
 * @param size Size of the three param arrays
 *
 * === Warning ===
 * This is dangerous if FMA is not used/can't be used. Then there
 * are intermediary writes (and reads) to the stack.
 *
 * === Description ===
 * Uses a triad function: a[i] = a[i] * b[i] + c[i]; in order
 * to utilize the FMA unit.
 *
 * Runs in a parallelized for loop.
 *
 * === Analysis ===
 * With gcc -O2 -mavx -mfma FMA compiles to:
 *   vmovsd xmm0,QWORD PTR [rdi+rax*8]             # 1 read (8 byte)
 *   vmovsd xmm1,QWORD PTR [rdx+rax*8]             # 1 read
 *   vfmadd132sd xmm0,xmm1,QWORD PTR [rsi+rax*8]   # 2 FLOPs + 1 read
 *   vmovsd QWORD PTR [rdi+rax*8],xmm0             # 1 write
 * --------
 * 1/16 OI
 *
 * === Optimization ===
 * For packed doubles compile with -Ofast
 */
void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size);

/**
 * @brief A simple 8/1 operational intensity kernel
 * @param a An array with double values of size param size
 * @param size Number of elements in a
 *
 * === Warning ===
 * Don't use with -O0: Stores everything on stack
 *
 * === Description ===
 * Uses a simple floating point operation: a[i] = a[i] * a[i] * ...* a[i];
 *
 * Runs in a parallelized for loop.
 *
 * === Analysis ===
 * With AVX and -O2 (not necessarily FMA) best results (obviously correct,
 * easy to read disassembly).
 *
 * With gcc -O2 -mavx compiles to:
 *   vmovsd xmm1,QWORD PTR [rdi]       # 1 read
 *   vmulsd xmm0,xmm1,xmm1             # 1 FLOP+register shuffling
 *   vmulsd xmm0,xmm0,xmm1             # 127x 1 FLOP+register shuffling
 *   # [...]
 *   vmovsd QWORD PTR [rdi-0x8],xmm0   # 1 write
 * --------
 * 128/16 = 8/1 OI
 *
 * === Optimization ===
 * Nothing special
 */
void kernel_8_1_simple(double* a, size_t size);

/**
 * @brief A 8/1 operational intensity kernel utilizing FMA
 * @param a An array with double values of size param size
 * @param size Number of elements in a
 *
 * === Warning ===
 * This is dangerous if FMA is not used/can't be used. Then there
 * are intermediary writes (and reads) to the stack.
 *
 * === Description ===
 * Uses multiple triad functions: a[i] = a[i] * a[i] + a[i]; in order
 * to utilize the FMA unit.
 *
 * Runs in a parallelized for loop.
 *
 * === Analysis ===
 * With gcc -O2 -mavx -mfma FMA compiles to:
 *   vmovsd xmm0,QWORD PTR [rdi]       # 1 read
 *   vfmadd132sd xmm0,xmm0,xmm0        # 64 x 2 FLOPs+register shuffling
 *   vmovsd QWORD PTR [rdi-0x8],xmm0   # 1 write
 * --------
 * 128/16 = 8/1 OI
 *
 * === Optimization ===
 * For packed doubles compile with -Ofast
 */
void kernel_8_1_fuseaware(double* a, size_t size);

/**
 * @brief A simple 8/1 operational intensity kernel which
 *        undermines evil fastmath optimization
 * @param a An array with double values of size param size
 * @param size Number of elements in a
 *
 * === Warning ===
 * Don't use with anything other than -Ofast / -ffast-math
 *
 * === Description ===
 * Uses a simple floating point operation that more closely resembles
 * that of 8_1_fuseaware:
 *   a[i] = a[i]*a[i]; # 128x
 *
 * Runs in a parallelized for loop.
 *
 * === Analysis ===
 * -Ofast/-ffast-math does not preserve strict IEEE compliance. It
 * therefore is allowed to ignore non-associativity of floating
 * point operations.
 *
 * x = x*x*x*x*x*x*x*x; is optimized to x *= x;x *= x;x *= x;
 *
 * This clearly breaks the whole OI calculation of 8_1_simple.
 *
 * This kernel does not introduce more byte write-outs than
 * 8_1_simple at a high optimization level since a[i] is held
 * in a register and only written out once at the end of an
 * iteration.
 *
 * === Optimization ===
 * Nothing special
 */
void kernel_8_1_simple_fastmath(double* a, size_t size);

/********************************************
 * Kernels which potentially compile to     *
 * different operational intensities than   *
 * specified                                *
 ********************************************/

/**
 * @brief A 1/16 operational intensity which might compile to a flawed oi kernel
 * @param a An array with double values of size param size
 * @param b An array with double values of size param size
 * @param size Size of the two param arrays
 *
 * === Problem ===
 * As soon as volatile is used gcc uses the stack for tmp.
 * Even if "register" is in place. Resulting in one additional write per loop.
 * Omitting volatile results in optimizing away the whole loop
 * (checked at -O2, which is necessary for FMA to eventually step in).
 * Maybe the value stays in cache, maybe not. It does not live in a register.
 *
 * Even with -O3:
 *   movsd xmm0,QWORD PTR [rdi+rax*8]  # 1 read
 *   mulsd xmm0,QWORD PTR [rsi+rax*8]  # 1 read (+ write to xmm0, not counted)
 *   # [...]                           # instructions for loop
 *   movsd QWORD PTR [rsp-0x8],xmm0    # malicious write
 *
 * Without volatile (-O3):
 *   repz ret                          # that's it
 */
void kernel_1_16_simple_dangerous(double* a, double* b, size_t size);

/**
 * @brief A 8/1 operational intensity which might compile to a flawed oi kernel
 * @param a An array with double values of size param size
 * @param size Number of elements in a
 *
 * === Problem ===
 * Same as for kernel_1_16_simple_dangerous
 */
void kernel_8_1_simple_dangerous(double* a, size_t size);

/**
 * @brief A 1/8 operational intensity which might compile to a flawed oi kernel
 * @param a An array with double values of size param size
 * @param size Number of elements in a
 *
 * === Problem ===
 * Same as for kernel_1_16_simple_dangerous
 *
 * Without volatile the loop is optimized away completely.
 * With volatile tmp is written to the stack in every loop
 * (-O3). tmp could be cached or not. This might depend on
 * how large the array is and how the cpu works internally
 * -> unpredictable.
 */
void kernel_1_8_vo_dangerous(double* a, size_t size);

#ifdef INTRINS
/**
 * @brief Variant of the 8/1 FMA kernel; only declared when INTRINS is defined.
 * @param a An array with double values of size param size
 * @param size Number of elements in a
 *
 * NOTE(review): presumably a manually packed (intrinsics based) version of
 * kernel_8_1_fuseaware -- confirm against the implementation.
 */
void kernel_8_1_fuseaware_manpack(double* a, size_t size);
#endif

/****************************************
 * Helper macros for repeating things   *
 ****************************************/

/* REPn(X) expands to n consecutive copies of X. */
#define REP0(X)
#define REP1(X) X
#define REP2(X) REP1(X) REP1(X)
#define REP3(X) REP2(X) REP1(X)
#define REP4(X) REP3(X) REP1(X)
#define REP5(X) REP4(X) REP1(X)
#define REP6(X) REP5(X) REP1(X)
#define REP7(X) REP6(X) REP1(X)
#define REP8(X) REP7(X) REP1(X)
#define REP9(X) REP8(X) REP1(X)
#define REP10(X) REP9(X) REP1(X)
#define REP20(X) REP10(X) REP10(X)
#define REP30(X) REP20(X) REP10(X)
#define REP40(X) REP30(X) REP10(X)
#define REP50(X) REP40(X) REP10(X)
#define REP60(X) REP50(X) REP10(X)
#define REP100(X) REP50(X) REP50(X)

/*
 * DEBUG(fmt, ...) prints a printf-style message followed by a newline to
 * stderr when ENDEBUG is defined. Otherwise it expands to an empty statement
 * and its arguments are not evaluated. Both forms are a single statement, so
 * the macro is safe in unbraced if/else bodies.
 */
#ifdef ENDEBUG
#define DEBUG(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0)
#else
#define DEBUG(...) do { } while(0)
#endif

#endif /* AIKERN_H */