#ifndef AIKERN_H
#define AIKERN_H

#include <stddef.h>	/* size_t */
#include <stdio.h>	/* fprintf(), stderr -- used by DEBUG() when ENDEBUG is defined */

/**
 * Result record of a kernel benchmark, returned by value from
 * kernel_dispatch().
 */
typedef struct {
  size_t  runs;       /* number of runs; also the number of start/end times */
  double *starts;     /* start times */
  double *ends;       /* end times */
  int     flops;      /* FLOPs per iteration */
  char   *kern_name;  /* name of the kernel */
  size_t  size;       /* size of the arrays handled */
} kern_result;

/**
 * Kernel selector passed to kernel_dispatch().
 *
 * NOTE(review): the mapping from each enumerator to a kernel_* function is
 * not visible in this header; the names suggest the pairing noted below --
 * confirm against the dispatcher.
 */
typedef enum {
  SIMPLE_1_16,          /* presumably kernel_1_16_simple */
  FMA_1_16,             /* presumably kernel_1_16_fuseaware */
  SIMPLE_8_1,           /* presumably kernel_8_1_simple */
  FMA_8_1,              /* presumably kernel_8_1_fuseaware */
  SIMPLE_8_1_FASTMATH,  /* presumably kernel_8_1_simple_fastmath */
  FMA_8_1_MANPACK       /* presumably kernel_8_1_fuseaware_manpack (INTRINS builds) */
} kernel_t;

/**
 * @brief Main entry point. Dispatches the kernel calls.
 * @param kernel	The kernel to run
 * @param a			An array of double values with `size` elements
 * @param b			An array of double values with `size` elements
 * @param c			An array of double values with `size` elements
 * @param size		The size of the arrays
 * @param runs		How often the kernel should be executed
 * @return kern_result containing information about the kernel execution
 *
 * Of the kernels declared in this header only kernel_1_16_fuseaware takes
 * all three arrays; the other kernels take fewer.
 *
 * NOTE(review): the returned kern_result carries pointers (starts, ends,
 * kern_name); who owns and frees them is not visible from this header --
 * confirm against the implementation.
 */
kern_result kernel_dispatch(kernel_t kernel,
							double* a, double* b, double* c,
							size_t size, size_t runs);

/**
 * @brief A simple 1/16 operational intensity kernel
 * @param a			An array of double values with `size` elements
 * @param size		Size of the array a
 *
 * === Warning ===
 * Don't use with -O0: Stores everything on stack
 *
 * === Description ===
 * Uses a simple floating point operation: a[i] = a[i] * a[i];
 *
 * Runs in a parallelized for loop.
 *
 * === Analysis ===
 * COMM: 1 read (8 bytes) + 1 write (8 bytes) = 16 bytes
 * COMP: 1 FLOP
 *      ---------
 * OI:   1/16
 *
 * === Optimization ===
 * Nothing special
 */
void kernel_1_16_simple(double* a, size_t size);


/**
 * @brief A 1/16 operational intensity kernel utilizing FMA
 * @param a			An array of double values with `size` elements
 * @param b			An array of double values with `size` elements
 * @param c			An array of double values with `size` elements
 * @param size		Size of the three param arrays
 *
 * === Warning ===
 * This is dangerous if FMA is not used/can't be used. Then there
 * are intermediary writes (and reads) to the stack.
 *
 * === Description ===
 * Uses a triad function: a[i] = a[i] * b[i] + c[i]; in order
 * to utilize the FMA unit.
 *
 * Runs in a parallelized for loop.
 *
 * === Analysis ===
 * With gcc -O2 -mavx -mfma FMA compiles to:
 *	 vmovsd xmm0,QWORD PTR [rdi+rax*8]				# 1 read (8 bytes)
 *	 vmovsd xmm1,QWORD PTR [rdx+rax*8]				# 1 read
 *	 vfmadd132sd xmm0,xmm1,QWORD PTR [rsi+rax*8]	# 2 FLOPs + 1 read
 *	 vmovsd QWORD PTR [rdi+rax*8],xmm0				# 1 write
 *													 --------
 *	 2 FLOPs / (3 reads + 1 write) * 8 bytes		= 2/32 = 1/16 OI
 *
 * === Optimization ===
 * For packed doubles compile with -Ofast
 */
void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size);


/**
 * @brief A simple 8/1 operational intensity kernel
 * @param a			An array of double values with `size` elements
 * @param size		Size of the array a
 *
 * === Warning ===
 * Don't use with -O0: Stores everything on stack
 *
 * === Description ===
 * Uses a simple floating point operation: a[i] = a[i] * a[i] * ... * a[i];
 *
 * Runs in a parallelized for loop.
 *
 * === Analysis ===
 * Best results with AVX and -O2 (not necessarily FMA): the disassembly is
 * easy to read and obviously correct.
 *
 * With gcc -O2 -mavx compiles to:
 *	 vmovsd xmm1,QWORD PTR [rdi]					# 1 read
 *	 vmulsd xmm0,xmm1,xmm1							# 1 FLOP+register shuffling
 *	 vmulsd xmm0,xmm0,xmm1							# 127x 1 FLOP+register shuffling
 *	 # [...]
 *	 vmovsd QWORD PTR [rdi-0x8],xmm0				# 1 write
 *													 --------
 *													  128/16 = 8/1 OI
 *
 * === Optimization ===
 * Nothing special
 */
void kernel_8_1_simple(double* a, size_t size);

/**
 * @brief An 8/1 operational intensity kernel utilizing FMA
 * @param a			An array of double values with `size` elements
 * @param size		Size of the array a
 *
 * === Warning ===
 * This is dangerous if FMA is not used/can't be used. Then there
 * are intermediary writes (and reads) to the stack.
 *
 * === Description ===
 * Uses repeated triad operations: a[i] = a[i] * a[i] + a[i]; in order
 * to utilize the FMA unit.
 *
 * Runs in a parallelized for loop.
 *
 * === Analysis ===
 * With gcc -O2 -mavx -mfma FMA compiles to:
 *	vmovsd xmm0,QWORD PTR [rdi]					# 1 read
 *	vfmadd132sd xmm0,xmm0,xmm0					# 64 x 2 FLOPs+register shuffling
 *	vmovsd QWORD PTR [rdi-0x8],xmm0				# 1 write
 *												  --------
 *												  128/16 = 8/1 OI
 *
 * === Optimization ===
 * For packed doubles compile with -Ofast
 */
void kernel_8_1_fuseaware(double* a, size_t size);

/**
 * @brief A simple 8/1 operational intensity kernel which
 *		  defeats the unwanted fast-math optimization
 * @param a			An array of double values with `size` elements
 * @param size		Size of the array a
 *
 * === Warning ===
 * Don't use with anything other than -Ofast / -ffast-math
 *
 * === Description ===
 * Uses a simple floating point operation that more closely resembles
 * that of 8_1_fuseaware:
 * a[i] = a[i]*a[i];		# 128x
 *
 * Runs in a parallelized for loop.
 *
 * === Analysis ===
 * -Ofast/-ffast-math does not preserve strict IEEE compliance. It
 * is therefore allowed to ignore the non-associativity of floating
 * point operations.
 *
 * x = x*x*x*x*x*x*x*x; is optimized to x *= x;x *= x;x *= x;
 *
 * This clearly breaks the whole OI calculation of 8_1_simple.
 *
 * This kernel does not introduce more byte write-outs than
 * 8_1_simple at a high optimization level since a[i] is held
 * in a register and only written out once at the end of an
 * iteration.
 *
 * === Optimization ===
 * Nothing special
 */
void kernel_8_1_simple_fastmath(double* a, size_t size);


/********************************************
 *  Kernels which potentially compile to	*
 *  different operational intensities than	*
 *  specified								*
 ********************************************/

/**
 * @brief A 1/16 operational intensity kernel which might compile to a flawed OI
 * @param a		An array of double values with `size` elements
 * @param b		An array of double values with `size` elements
 * @param size  Size of the two param arrays
 *
 *  === Problem ===
 *	As soon as volatile is used gcc uses the stack for tmp,
 *	even if "register" is in place. This results in one additional write
 *	per loop iteration.
 *	Omitting volatile results in the whole loop being optimized away
 *	(checked at -O2, which is necessary for FMA to eventually step in).
 *	Maybe the value stays in cache, maybe not. It does not live in a register.
 *
 *	Even with -O3:
 *	movsd  xmm0,QWORD PTR [rdi+rax*8]  # 1 read
 *	mulsd  xmm0,QWORD PTR [rsi+rax*8]  # 1 read (+ write to xmm0, not counted)
 *	# [...]							   # instructions for loop
 *	movsd  QWORD PTR [rsp-0x8],xmm0    # unwanted extra write to the stack
 *
 *	Without volatile (-O3):
 *	repz ret						   # that's it
 */
void kernel_1_16_simple_dangerous(double* a, double* b, size_t size);

/**
 * @brief An 8/1 operational intensity kernel which might compile to a flawed OI
 * @param a		An array of double values with `size` elements
 * @param size  Size of the array a
 *
 * === Problem ===
 * Same as for kernel_1_16_simple_dangerous
 */
void kernel_8_1_simple_dangerous(double* a, size_t size);

/**
 * @brief A 1/8 operational intensity kernel which might compile to a flawed OI
 * @param a		An array of double values with `size` elements
 * @param size  Size of the array a
 *
 * === Problem ===
 * Same as for kernel_1_16_simple_dangerous
 *
 * Without volatile the loop is optimized away completely.
 * With volatile, tmp is written to the stack in every loop iteration
 * (-O3). tmp could be cached or not. This might depend on
 * how large the array is and how the CPU works internally
 * -> unpredictable.
 */
void kernel_1_8_vo_dangerous(double* a, size_t size);


/*
 * Only declared when INTRINS is defined.
 * NOTE(review): judging by the name and the FMA_8_1_MANPACK selector this is
 * presumably a manually packed (intrinsics-based) variant of
 * kernel_8_1_fuseaware -- confirm against the implementation.
 */
#ifdef INTRINS
void kernel_8_1_fuseaware_manpack(double* a, size_t size);
#endif


/*
 * Helper macros for repeating things.
 *
 * REPn(X) expands to n consecutive copies of X. Only the counts
 * 0..10, 20, 30, 40, 50, 60 and 100 are provided; larger counts are built
 * by placing several invocations side by side.
 */

/* Single steps: each one prepends one X to the previous count. */
#define REP0(X)
#define REP1(X) X
#define REP2(X) X REP1(X)
#define REP3(X) X REP2(X)
#define REP4(X) X REP3(X)
#define REP5(X) X REP4(X)
#define REP6(X) X REP5(X)
#define REP7(X) X REP6(X)
#define REP8(X) X REP7(X)
#define REP9(X) X REP8(X)

/* Tens: built from REP5/REP10 blocks. */
#define REP10(X) REP5(X) REP5(X)
#define REP20(X) REP10(X) REP10(X)
#define REP30(X) REP10(X) REP20(X)
#define REP40(X) REP20(X) REP20(X)
#define REP50(X) REP10(X) REP40(X)
#define REP60(X) REP30(X) REP30(X)

#define REP100(X) REP40(X) REP60(X)

/*
 * DEBUG(fmt, ...): printf-style diagnostic output.
 *
 * With ENDEBUG defined, the message is written to stderr followed by a
 * newline (requires <stdio.h>). Without ENDEBUG the macro is a no-op and
 * its arguments are not evaluated.
 *
 * Both variants expand to a single `do { } while (0)` statement, so
 * `if (x) DEBUG(...); else ...` parses the same way in either build and the
 * disabled build does not leave an empty if/else body behind.
 * Always terminate the invocation with a semicolon.
 */
#ifdef ENDEBUG
#define DEBUG(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while (0)
#else
#define DEBUG(...) do { } while (0)
#endif

#endif /* AIKERN_H */