i5-roofline/aikern.h

#ifndef AIKERN_H
#define AIKERN_H

#include <stddef.h>	/* size_t */
#include <stdio.h>	/* fprintf, stderr: used by DEBUG when ENDEBUG is defined */

/**
 * @brief Record describing the timed runs of one kernel.
 *
 * size_t comes from <stddef.h>, which this header includes itself so that
 * it can be used as the first include of a translation unit.
 *
 * NOTE(review): who allocates and frees starts, ends and kern_name is not
 * visible in this header -- confirm against the kernel_dispatch definition.
 */
typedef struct {
  size_t	runs;		/**< number of runs; also the number of start/end times */
  double*	starts;		/**< start times */
  double*	ends;		/**< end times */
  int		flops;		/**< FLOPs per iteration */
  char*		kern_name;	/**< name of the kernel */
  size_t	size;		/**< size of the arrays handled */
} kern_result;

/**
 * @brief Selector naming the kernel that kernel_dispatch() should run.
 *
 * The enumerators carry no explicit values, so they count up from 0 in
 * declaration order.
 *
 * NOTE(review): which kernel function each enumerator maps to is decided
 * in the kernel_dispatch definition, which is not visible in this header.
 */
typedef enum {
  SIMPLE_1_16,
  FMA_1_16,
  SIMPLE_8_1,
  FMA_8_1,
  SIMPLE_8_1_FASTMATH,
  FMA_8_1_MANPACK
} kernel_t;

/**
 * @brief Main entry point: dispatches the call to the selected kernel.
 *
 * @param kernel  the kernel to run
 * @param a       array of double values of size @p size
 * @param b       array of double values of size @p size
 * @param c       array of double values of size @p size
 * @param size    the size of the arrays
 * @param runs    how often the kernel should be executed
 * @return a kern_result containing information about the kernel execution
 */
kern_result kernel_dispatch(kernel_t kernel,
                            double *a, double *b, double *c,
                            size_t size, size_t runs);

/**
 * @brief Simple kernel with an operational intensity (OI) of 1/16.
 *
 * @param a     array of double values of size @p size
 * @param size  size of @p a
 *
 * === Warning ===
 * Do not use with -O0: everything is then stored on the stack.
 *
 * === Description ===
 * Performs one plain floating point operation: a[i] = a[i] * a[i];
 *
 * Runs in a parallelized for loop.
 *
 * === Analysis ===
 * COMM: 1 read (8 bytes) + 1 write (8 bytes) = 16 bytes
 * COMP: 1 FLOP
 *       ---------
 * OI:   1/16
 *
 * === Optimization ===
 * Nothing special
 */
void kernel_1_16_simple(double *a, size_t size);


/**
 * @brief Kernel with an operational intensity (OI) of 1/16 that uses FMA.
 *
 * @param a     array of double values of size @p size
 * @param b     array of double values of size @p size
 * @param c     array of double values of size @p size
 * @param size  size of the three arrays
 *
 * === Warning ===
 * Dangerous if FMA is not used or cannot be used: intermediate values are
 * then written to (and read back from) the stack.
 *
 * === Description ===
 * Uses a triad, a[i] = a[i] * b[i] + c[i];, in order to utilize the
 * FMA unit.
 *
 * Runs in a parallelized for loop.
 *
 * === Analysis ===
 * With gcc -O2 -mavx -mfma the loop body compiles to:
 *   vmovsd xmm0,QWORD PTR [rdi+rax*8]             # 1 read (8 bytes)
 *   vmovsd xmm1,QWORD PTR [rdx+rax*8]             # 1 read
 *   vfmadd132sd xmm0,xmm1,QWORD PTR [rsi+rax*8]   # 2 FLOPs + 1 read
 *   vmovsd QWORD PTR [rdi+rax*8],xmm0             # 1 write
 *                                                   --------
 *                                                   2/32 = 1/16 OI
 *
 * === Optimization ===
 * For packed doubles compile with -Ofast
 */
void kernel_1_16_fuseaware(double *a, double *b, double *c, size_t size);


/**
 * @brief Simple kernel with an operational intensity (OI) of 8/1.
 *
 * @param a     array of double values of size @p size
 * @param size  size of @p a
 *
 * === Warning ===
 * Do not use with -O0: everything is then stored on the stack.
 *
 * === Description ===
 * Performs a chain of plain multiplications:
 * a[i] = a[i] * a[i] * ... * a[i];
 *
 * Runs in a parallelized for loop.
 *
 * === Analysis ===
 * Best results were obtained with AVX and -O2 (FMA is not required): the
 * disassembly is easy to read and obviously correct.
 *
 * With gcc -O2 -mavx the loop body compiles to:
 *   vmovsd xmm1,QWORD PTR [rdi]          # 1 read
 *   vmulsd xmm0,xmm1,xmm1                # 1 FLOP + register shuffling
 *   vmulsd xmm0,xmm0,xmm1                # 127x 1 FLOP + register shuffling
 *   # [...]
 *   vmovsd QWORD PTR [rdi-0x8],xmm0      # 1 write
 *                                          --------
 *                                          128/16 = 8/1 OI
 *
 * === Optimization ===
 * Nothing special
 */
void kernel_8_1_simple(double *a, size_t size);

/**
 * @brief Kernel with an operational intensity (OI) of 8/1 that uses FMA.
 *
 * @param a     array of double values of size @p size
 * @param size  size of @p a
 *
 * === Warning ===
 * Dangerous if FMA is not used or cannot be used: intermediate values are
 * then written to (and read back from) the stack.
 *
 * === Description ===
 * Applies the triad a[i] = a[i] * a[i] + a[i]; several times in a row in
 * order to utilize the FMA unit.
 *
 * Runs in a parallelized for loop.
 *
 * === Analysis ===
 * With gcc -O2 -mavx -mfma the loop body compiles to:
 *   vmovsd xmm0,QWORD PTR [rdi]          # 1 read
 *   vfmadd132sd xmm0,xmm0,xmm0           # 64 x 2 FLOPs + register shuffling
 *   vmovsd QWORD PTR [rdi-0x8],xmm0      # 1 write
 *                                          --------
 *                                          128/16 = 8/1 OI
 *
 * === Optimization ===
 * For packed doubles compile with -Ofast
 */
void kernel_8_1_fuseaware(double *a, size_t size);

/**
 * @brief Simple 8/1 operational intensity (OI) kernel that defeats the
 *        fast-math rewrite which breaks kernel_8_1_simple.
 *
 * @param a     array of double values of size @p size
 * @param size  size of @p a
 *
 * === Warning ===
 * Do not use with anything other than -Ofast / -ffast-math.
 *
 * === Description ===
 * Uses a plain floating point operation shaped more like the one in
 * 8_1_fuseaware:
 *   a[i] = a[i]*a[i];        # 128x
 *
 * Runs in a parallelized for loop.
 *
 * === Analysis ===
 * -Ofast / -ffast-math does not preserve strict IEEE compliance, so the
 * compiler is allowed to ignore the non-associativity of floating point
 * operations.
 *
 *   x = x*x*x*x*x*x*x*x;   is optimized to   x *= x; x *= x; x *= x;
 *
 * This clearly breaks the whole OI calculation of 8_1_simple.
 *
 * At a high optimization level this kernel does not cause more bytes to
 * be written out than 8_1_simple, because a[i] is held in a register and
 * only written out once at the end of an iteration.
 *
 * === Optimization ===
 * Nothing special
 */
void kernel_8_1_simple_fastmath(double *a, size_t size);


/********************************************
 *  Kernels which potentially compile to	*
 *  different operational intensities than	*
 *  specified								*
 ********************************************/

/**
 * @brief 1/16 operational intensity (OI) kernel that may compile to a
 *        kernel with a flawed OI.
 *
 * @param a     array of double values of size @p size
 * @param b     array of double values of size @p size
 * @param size  size of the two arrays
 *
 * === Problem ===
 * As soon as volatile is used, gcc keeps tmp on the stack, even when
 * "register" is in place. This results in one additional write per loop
 * iteration.
 * Omitting volatile results in the whole loop being optimized away
 * (checked at -O2, which is necessary for FMA to eventually step in).
 * Maybe the value stays in cache, maybe not. It does not live in a
 * register.
 *
 * Even with -O3:
 *   movsd  xmm0,QWORD PTR [rdi+rax*8]  # 1 read
 *   mulsd  xmm0,QWORD PTR [rsi+rax*8]  # 1 read (+ write to xmm0, not counted)
 *   # [...]                            # instructions for loop
 *   movsd  QWORD PTR [rsp-0x8],xmm0    # malicious write
 *
 * Without volatile (-O3):
 *   repz ret                           # that's it
 */
void kernel_1_16_simple_dangerous(double *a, double *b, size_t size);

/**
 * @brief 8/1 operational intensity (OI) kernel that may compile to a
 *        kernel with a flawed OI.
 *
 * @param a     array of double values of size @p size
 * @param size  size of @p a
 *
 * === Problem ===
 * Same as for kernel_1_16_simple_dangerous.
 */
void kernel_8_1_simple_dangerous(double *a, size_t size);

/**
 * @brief 1/8 operational intensity (OI) kernel that may compile to a
 *        kernel with a flawed OI.
 *
 * @param a     array of double values of size @p size
 * @param size  size of @p a
 *
 * === Problem ===
 * Same as for kernel_1_16_simple_dangerous.
 *
 * Without volatile the loop is optimized away completely.
 * With volatile, tmp is written to the stack in every loop iteration
 * (-O3). tmp may or may not be cached; this might depend on how large the
 * array is and on how the CPU works internally
 * -> unpredictable.
 */
void kernel_1_8_vo_dangerous(double *a, size_t size);


/*
 * Only declared when the build defines INTRINS.
 *
 * NOTE(review): the name suggests a manually packed variant of
 * kernel_8_1_fuseaware -- confirm against the definition, which is not
 * visible in this header.
 * NOTE(review): kernel_t always contains FMA_8_1_MANPACK, even when INTRINS
 * is not defined -- confirm that kernel_dispatch handles that case.
 */
#if defined(INTRINS)
void kernel_8_1_fuseaware_manpack(double *a, size_t size);
#endif /* INTRINS */


/*
 * Repetition helpers: REPn(X) expands to n consecutive copies of X.
 * Only the counts defined below exist: 0-10, 20, 30, 40, 50, 60 and 100.
 */

/* 0 through 10 copies */
#define REP0(X)
#define REP1(X) X
#define REP2(X) X X
#define REP3(X) REP2(X) X
#define REP4(X) REP2(X) REP2(X)
#define REP5(X) REP4(X) X
#define REP6(X) REP4(X) REP2(X)
#define REP7(X) REP4(X) REP3(X)
#define REP8(X) REP4(X) REP4(X)
#define REP9(X) REP8(X) X
#define REP10(X) REP8(X) REP2(X)

/* multiples of ten */
#define REP20(X) REP10(X) REP10(X)
#define REP30(X) REP20(X) REP10(X)
#define REP40(X) REP20(X) REP20(X)
#define REP50(X) REP40(X) REP10(X)
#define REP60(X) REP40(X) REP20(X)

#define REP100(X) REP60(X) REP40(X)

/*
 * DEBUG(fmt, ...): printf-style diagnostic written to stderr, followed by
 * a newline. It is active only when ENDEBUG is defined; otherwise it
 * expands to an empty statement and its arguments are never evaluated.
 *
 * Both variants are a do { } while (0) statement, so `DEBUG(...);` is
 * exactly one statement in either build. This keeps `if (c) DEBUG(...);`
 * from having an empty body when debugging is off, and a forgotten
 * semicolon fails to compile in both builds instead of only with ENDEBUG.
 *
 * The active variant needs fprintf and stderr from <stdio.h>.
 */
#ifdef ENDEBUG
#define DEBUG(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0)
#else
#define DEBUG(...) do { } while (0)
#endif

#endif /* AIKERN_H */
Init 2020-09-03 16:49:50 +00:00			`#ifndef AIKERN_H`
			`#define AIKERN_H`

			`typedef struct {`
			`size_t runs; // also # of start-/endtimes`
			`double* starts; // starttimes`
			`double* ends; // endtimes`
			`int flops; // flops per iteration`
			`char* kern_name;`
			`size_t size; // size of arrays handled`
			`} kern_result;`

			`typedef enum {`
			`SIMPLE_1_16, FMA_1_16, SIMPLE_8_1, FMA_8_1, SIMPLE_8_1_FASTMATH, FMA_8_1_MANPACK`
			`} kernel_t;`

			`/**`
			`* @brief main entry point. Dispatches the kernel calls`
			`* @param kernel the kernel to run`
			`* @param a An array with double values of size param size`
			`* @param b An array with double values of size param size`
			`* @param c An array with double values of size param size`
			`* @param size The size of the arrays`
			`* @param runs How often the kernel should be executed`
			`* @return kern_result containing information about the kernel execution`
			`*`
			`*`
			`*/`
			`kern_result kernel_dispatch(kernel_t kernel,`
			`double* a, double* b, double* c,`
			`size_t size, size_t runs);`

			`/**`
			`* @brief A simple 1/16 operational intensity kernel`
			`* @param a An array with double values of size param size`
			`* @param size Size of the three param arrays`
			`* @param result Pointer to result storage`
			`*`
			`* === Warning ===`
			`* Don't use with -O0: Stores everything on stack`
			`*`
			`* === Description ===`
			`* Uses a simple floating point operation: a[i] = a[i] * a[i];`
			`*`
			`* Runs in a parallelized for loop.`
			`*`
			`* === Analysis ===`
			`* COMM: 1 read (8 byte), 1 write = 16 bytes`
			`* COMP: 1 FLOP`
			`* ---------`
			`* OI: 1/16`
			`*`
			`* === Optimization ===`
			`* Nothing special`
			`*`
			`*/`
			`void kernel_1_16_simple(double* a, size_t size);`


			`/**`
			`* @brief A 1/16 operational intensity kernel utilizing FMA`
			`* @param a An array with double values of size param size`
			`* @param b An array with double values of size param size`
			`* @param c An array with double values of size param size`
			`* @param size Size of the three param arrays`
			`* @param result Pointer to result storage`
			`*`
			`* === Warning ===`
			`* This is dangerous if FMA is not used/can't be used. Then there`
			`* are intermediary writes (and reads) to the stack.`
			`*`
			`* === Description ===`
			`* Uses a triad function: a[i] = a[i] * b[i] + c[i]; in order`
			`* to utilize the FMA unit.`
			`*`
			`* Runs in a parallelized for loop.`
			`*`
			`* === Analysis ===`
			`* With gcc -O2 -mavx -mfma FMA compiles to:`
			`* vmovsd xmm0,QWORD PTR [rdi+rax*8] # 1 read (8 byte)`
			`* vmovsd xmm1,QWORD PTR [rdx+rax*8] # 1 read`
			`* vfmadd132sd xmm0,xmm1,QWORD PTR [rsi+rax*8] # 2 FLOPs + 1 read`
			`* vmovsd QWORD PTR [rdi+rax*8],xmm0 # 1 write`
			`* --------`
			`* 1/16 OI`
			`*`
			`* === Optimization ===`
			`* For packed doubles compile with -Ofast`
			`*`
			`*/`
			`void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size);`



			`/**`
			`* @brief A simple 8/1 operational intensity kernel`
			`* @param a An array with double values of size param size`
			`* @param size Size of the three param arrays`
			`* @param result Pointer to result storage`
			`*`
			`* === Warning ===`
			`* Don't use with -O0: Stores everything on stack`
			`*`
			`* === Description ===`
			`* Uses a simple floating point operation: a[i] = a[i] * a[i] * ...* a[i];`
			`*`
			`* Runs in a parallelized for loop.`
			`*`
			`* === Analysis ===`
			`* With AVX and -O2 (not necessarily FMA) best results (obviously correct`
			`* easy to read disassembly).`
			`*`
			`* With gcc -O2 -mavx compiles to:`
			`* vmovsd xmm1,QWORD PTR [rdi] # 1 read`
			`* vmulsd xmm0,xmm1,xmm1 # 1 FLOP+register shuffling`
			`* vmulsd xmm0,xmm0,xmm1 # 127x 1 FLOP+register shuffling`
			`* # [...]`
			`* vmovsd QWORD PTR [rdi-0x8],xmm0 # 1 write`
			`* --------`
			`* 128/16 = 8/1 OI`
			`*`
			`* === Optimization ===`
			`* Nothing special`
			`*/`
			`void kernel_8_1_simple(double* a, size_t size);`

			`/**`
			`* @brief A 8/1 operational intensity kernel utilizing FMA`
			`* @param a An array with double values of size param size`
			`* @param size Size of the three param arrays`
			`* @param result Pointer to result storage`
			`*`
			`* === Warning ===`
			`* This is dangerous if FMA is not used/can't be used. Then there`
			`* are intermediary writes (and reads) to the stack.`
			`*`
			`* === Description ===`
			`* Uses multiple triad function: a[i] = a[i] * a[i] + a[i]; in order`
			`* to utilize the FMA unit.`
			`*`
			`* Runs in a parallelized for loop.`
			`*`
			`* === Analysis ===`
			`* With gcc -O2 -mavx -mfma FMA compiles to:`
			`* vmovsd xmm0,QWORD PTR [rdi] # 1 read`
			`* vfmadd132sd xmm0,xmm0,xmm0 # 64 x 2 FLOPs+register shuffling`
			`* vmovsd QWORD PTR [rdi-0x8],xmm0 # 1 write`
			`* --------`
			`* 128/16 = 8/1 OI`
			`*`
			`* === Optimization ===`
			`* For packed doubles compile with -Ofast`
			`*`
			`*/`
			`void kernel_8_1_fuseaware(double* a, size_t size);`

			`/**`
			`* @brief A simple 8/1 operational intensity kernel which`
			`* undermines evil fastmath optimization`
			`* @param a An array with double values of size param size`
			`* @param size Size of the three param arrays`
			`* @param result Pointer to result storage`
			`*`
			`* === Warning ===`
			`* Don't use with anything other than -Ofast / -ffast-math`
			`*`
			`* === Description ===`
			`* Uses a simple floating point operation that more closely resembles`
			`* that of 8_1_fuseaware:`
			`* a[i] = a[i]*a[i]; # 128x`
			`*`
			`* Runs in a parallelized for loop.`
			`*`
			`* === Analysis ===`
			`* -Ofast/-ffast-math does not preserve strict IEEE compliance. It`
			`* therefore is allowed to ignore non-associativity of floating`
			`* point operations.`
			`*`
			`* x = x*x*x*x*x*x*x*x; is optimized to x *= x;x *= x;x *= x;`
			`*`
			`* This clearly breaks the whole OI calculation of 8_1_simple.`
			`*`
			`* This kernel does not introduce more byte write-outs than`
			`* 8_1_simple at a high optimization level since a[i] is held`
			`* in a register and only written out once at the end of an`
			`* iteration.`
			`*`
			`*`
			`* === Optimization ===`
			`* Nothing special`
			`*/`
			`void kernel_8_1_simple_fastmath(double* a, size_t size);`


			`/********************************************`
			`* Kernels which potentially compile to *`
			`* different operational intensities than *`
			`* specified *`
			`********************************************/`

			`/**`
			`* @brief A 1/16 operational intensity which might compile to a flawed oi kernel`
			`* @param a An array with double values of size param size`
			`* @param b An array with double values of size param size`
			`* @param size Size of the three param arrays`
			`*`
			`* === Problem ===`
			`* As soon as volatile is used gcc uses the stack for tmp.`
			`* Even if "register" is in place. Resulting in one additional write per loop.`
			`* Omitting volatile results in optimizing away the whole loop`
			`* (checked at -O2, which is necessary for FMA to eventually step in).`
			`* Maybe the value stays in cache, maybe not. It does not live in a register.`
			`*`
			`* Even with -O3:`
			`* movsd xmm0,QWORD PTR [rdi+rax*8] # 1 read`
			`* mulsd xmm0,QWORD PTR [rsi+rax*8] # 1 read (+ write to xmm0, not counted)`
			`* # [...] # instructions for loop`
			`* movsd QWORD PTR [rsp-0x8],xmm0 # malicious write`
			`*`
			`* Without volatile (-O3):`
			`* repz ret # that's it`
			`*/`
			`void kernel_1_16_simple_dangerous(double* a, double* b, size_t size);`

			`/**`
			`* @brief A 8/1 operational intensity which might compile to a flawed oi kernel`
			`* @param a An array with double values of size param size`
			`* @param size Size of the three param arrays`
			`*`
			`* === Problem ==`
			`* Same as for kernel_1_16_simple_dangerous`
			`*/`
			`void kernel_8_1_simple_dangerous(double* a, size_t size);`

			`/**`
			`* @brief A 1/8 operational intensity which might compile to a flawed oi kernel`
			`* @param a An array with double values of size param size`
			`* @param size Size of the three param arrays`
			`*`
			`* === Problem ==`
			`* Same as for kernel_1_16_simple_dangerous`
			`*`
			`* Without volatile the loop is optimized away completely.`
			`* With volatile tmp is written to the stack in every loop`
			`* (-O3). tmp could be cached or not. This might depend on`
			`* how large the array is and how the cpu works internally`
			`* -> unpredictable.`
			`*/`
			`void kernel_1_8_vo_dangerous(double* a, size_t size);`


			`#ifdef INTRINS`
			`void kernel_8_1_fuseaware_manpack(double* a, size_t size);`
			`#endif`



			`/****************************************`
			`* Helper macros for repeating things *`
			`****************************************/`

			`#define REP0(X)`
			`#define REP1(X) X`
			`#define REP2(X) REP1(X) REP1(X)`
			`#define REP3(X) REP2(X) REP1(X)`
			`#define REP4(X) REP3(X) REP1(X)`
			`#define REP5(X) REP4(X) REP1(X)`
			`#define REP6(X) REP5(X) REP1(X)`
			`#define REP7(X) REP6(X) REP1(X)`
			`#define REP8(X) REP7(X) REP1(X)`
			`#define REP9(X) REP8(X) REP1(X)`

			`#define REP10(X) REP9(X) REP1(X)`
			`#define REP20(X) REP10(X) REP10(X)`
			`#define REP30(X) REP20(X) REP10(X)`
			`#define REP40(X) REP30(X) REP10(X)`
			`#define REP50(X) REP40(X) REP10(X)`
			`#define REP60(X) REP50(X) REP10(X)`

			`#define REP100(X) REP50(X) REP50(X)`

			`#ifdef ENDEBUG`
			`#define DEBUG(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0)`
			`#else`
			`#define DEBUG(...)`
			`#endif`

			`#endif /* AIKERN_H */`