commit c61491e7138f3074d02abcc5984f4a4294c9fd67
Author: Armin Friedl <dev@friedl.net>
Date:   Thu Sep 3 18:49:50 2020 +0200

    Init

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..d906396
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,120 @@
+all: clean bin lib
+
+# Roofline Binary
+
+## This is the least demanding target, use it if nothing else works
+nofancy: roofline roofline_o3
+	mkdir bin
+	mv $^ bin
+
+## Your processor needs an FMA unit for this target to work
+fmacap: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3
+	mkdir bin
+	mv $^ bin
+
+## This will compile just everything
+bin: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3 roofline_full roofline_profile roofline_full_clang roofline_full_manpack
+	mkdir bin
+	mv $^ bin
+
+roofline: roofline.c aikern.a
+	gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
+
+roofline_o3: roofline.c aikern_o3.a
+	gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
+
+roofline_fma: roofline.c aikern_fma.a
+	gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
+
+roofline_fma_o3: roofline.c aikern_fma_o3.a
+	gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
+
+roofline_fma_fast_o3: roofline.c aikern_fma_fast_o3.a
+	gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
+
+roofline_fma_fast_o2: roofline.c aikern_fma_fast_o2.a
+	gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
+
+roofline_fma_fast_fastmath_o3: roofline.c aikern_fma_fast_fastmath_o3.a
+	gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
+
+roofline_full: roofline.c aikern_full.a
+	gcc -Wall -Wextra -std=c99 -fopenmp -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -malign-double -march=native $^ -o $@
+
+roofline_full_clang: roofline.c aikern_full_clang.a
+	clang -Wall -Wextra -std=c99 -fopenmp -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -march=native $^ -o $@
+
+roofline_full_manpack: roofline.c aikern_full_manpack.a
+	gcc -Wall -Wextra -std=c99 -fopenmp -DINTRINS -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -malign-double -march=native $^ -o $@
+
+roofline_profile: roofline.c aikern_profile.a
+	gcc -Wall -Wextra -std=c99 -fopenmp -O0 -g -pg $^ -o $@
+
+# Static Libraries
+lib: aikern.a aikern_o3.a aikern_fma.a aikern_fma_o3.a aikern_fma_fast_o2.a aikern_fma_fast_o3.a aikern_fma_fast_fastmath_o3.a aikern_full.a aikern_full_clang.a aikern_profile.a aikern_full_manpack.a
+	mkdir lib
+	mv $^ lib
+
+aikern.a: aikern.c aikern.h
+	gcc -Wall -Wextra -Wno-unused -fopenmp -c -o aikern.o $<
+	ar rcs aikern.a aikern.o
+	rm aikern.o
+
+aikern_o3.a: aikern.c aikern.h
+	gcc -Wall -Wextra -Wno-unused -O3 -fopenmp -c -o aikern_o3.o $<
+	ar rcs $@ aikern_o3.o
+	rm aikern_o3.o
+
+aikern_fma.a: aikern.c aikern.h
+	gcc -Wall -Wextra -Wno-unused -O2 -mavx -mfma -fopenmp -c -o aikern_fma.o $<
+	ar rcs $@ aikern_fma.o
+	rm aikern_fma.o
+
+aikern_fma_o3.a: aikern.c aikern.h
+	gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -c -o aikern_fma_o3.o $<
+	ar rcs $@ aikern_fma_o3.o
+	rm aikern_fma_o3.o
+
+aikern_fma_fast_o2.a: aikern.c aikern.h
+	gcc -Wall -Wextra -Wno-unused -O2 -mavx -mfma -fopenmp -Ofast -c -o aikern_fma_fast_o2.o $<
+	ar rcs $@ aikern_fma_fast_o2.o
+	rm aikern_fma_fast_o2.o
+
+aikern_fma_fast_o3.a: aikern.c aikern.h
+	gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -c -o aikern_fma_fast_o3.o $<
+	ar rcs $@ aikern_fma_fast_o3.o
+	rm aikern_fma_fast_o3.o
+
+aikern_fma_fast_fastmath_o3.a: aikern.c aikern.h
+	gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -c -o aikern_fma_fast_fastmath_o3.o $<
+	ar rcs $@ aikern_fma_fast_fastmath_o3.o
+	rm aikern_fma_fast_fastmath_o3.o
+
+aikern_full.a: aikern.c aikern.h
+	gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -malign-double -march=native -c -o aikern_full.o $<
+	ar rcs $@ aikern_full.o
+	rm aikern_full.o
+
+aikern_full_clang.a: aikern.c aikern.h
+	gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -march=native -c -o aikern_full_clang.o $<
+	ar rcs $@ aikern_full_clang.o
+	rm aikern_full_clang.o
+
+aikern_profile.a: aikern.c aikern.h
+	gcc -Wall -Wextra -Wno-unused -fopenmp -O0 -g -pg -c -o aikern_profile.o $<
+	ar rcs $@ aikern_profile.o
+	rm aikern_profile.o
+
+aikern_full_manpack.a: aikern.c aikern.h
+	gcc -Wall -Wextra -Wno-unused -DINTRINS -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -malign-double -march=native -c -o aikern_full_manpack.o $<
+	ar rcs $@ aikern_full_manpack.o
+	rm aikern_full_manpack.o
+
+# Cleanup
+clean:
+	rm -f *.a
+	rm -f *.o
+	rm -f roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3 roofline_fma_fast_fastmath_aligned_o3 roofline_profile roofline_full_clang
+	rm -fR bin
+	rm -fR lib
+
diff --git a/aikern.c b/aikern.c
new file mode 100644
index 0000000..5bd6b0e
--- /dev/null
+++ b/aikern.c
@@ -0,0 +1,288 @@
+# include <stdlib.h>
+# include <stdio.h>
+# include <unistd.h>
+# include <stdarg.h>
+# include <errno.h>
+# include <string.h>
+# include <sys/time.h>
+
+# include "aikern.h"
+
+/* === Macros === */
+
+#ifdef ENDEBUG
+#define DEBUG(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0)
+#else
+#define DEBUG(...)
+#endif
+
+/**
+ * @brief terminate program on program error
+ * @param msg additional message to print
+ * @param ret exit value
+ */
+static void bail_out(char* fmt, ...);
+
+/**
+ * @brief microseconds since epoch
+ */
+static double pin_time(void);
+
+kern_result kernel_dispatch(kernel_t kernel,
+							double* a, double* b, double* c,
+							size_t size, size_t runs)
+{
+  
+  kern_result result = {0};
+  result.runs = runs;
+  result.starts = malloc(sizeof(double)*(runs));
+  result.ends = malloc(sizeof(double)*(runs));
+  result.size = size;
+
+  if(result.starts==NULL || result.ends==NULL)
+	{
+	  bail_out("One of the mallocs failed\n. starts = %p, ends=%p", result.starts, result.ends);
+	}
+  
+  
+  switch(kernel)
+	{
+
+	case SIMPLE_1_16:
+	  result.flops = 1;
+	  result.kern_name = "Simple 1/16";
+	  for(size_t r=0; r<runs; r++)
+		{
+		  result.starts[r] = pin_time();
+		  kernel_1_16_simple(a, size);
+		  result.ends[r] = pin_time();
+		}
+	  break;
+	case FMA_1_16:
+	  result.flops = 2;
+	  result.kern_name = "FMA aware 1/16";
+	  for(size_t r=0; r<runs; r++)
+		{
+		  result.starts[r] = pin_time();
+		  kernel_1_16_fuseaware(a, b, c, size);
+		  result.ends[r] = pin_time();
+		}
+	  break;
+	case SIMPLE_8_1:
+	  result.flops = 128;
+	  result.kern_name = "Simple 8";
+	  for(size_t r=0; r<runs; r++)
+		{
+		  result.starts[r] = pin_time();
+		  kernel_8_1_simple(a, size);
+		  result.ends[r] = pin_time();
+		}
+	  break;
+	case FMA_8_1:
+	  result.flops = 128;
+	  result.kern_name = "FMA aware 8";
+	  for(size_t r=0; r<runs; r++)
+		{
+		  result.starts[r] = pin_time();
+		  kernel_8_1_fuseaware(a, size);
+		  result.ends[r] = pin_time();
+		}
+	  break;
+	case SIMPLE_8_1_FASTMATH:
+	  DEBUG("AIKERN MANPACK");
+	  result.flops = 128;
+	  result.kern_name = "Simple 8 undermining fastmath";
+	  for(size_t r=0; r<runs; r++)
+		{
+		  result.starts[r] = pin_time();
+		  kernel_8_1_simple_fastmath(a, size);
+		  result.ends[r] = pin_time();
+		}
+	  break;
+	case FMA_8_1_MANPACK:
+	  DEBUG("AIKERN MANPACK");
+#ifdef INTRINS
+	  if(size%4 != 0)
+		{
+		  bail_out("Must use multiple of 4 size for manpack");
+		}
+  
+	  result.flops = 128;
+	  result.kern_name = "FMA aware 8 with manual packing";
+	  for(size_t r=0; r<runs; r++)
+		{
+		  DEBUG("running manpack run %zu",r);
+		  result.starts[r] = pin_time();
+		  kernel_8_1_fuseaware_manpack(a, size);
+		  result.ends[r] = pin_time();
+		}
+#endif
+	  break;
+	default:
+	  bail_out("No such kernel %s", kernel);
+	  break;
+	}
+
+  return result;
+}
+
+inline void kernel_1_16_simple(double* a, size_t size)
+{
+  #pragma omp parallel for
+  for(size_t i=0; i<size; i++)
+	{
+	  a[i] = a[i] * a[i];
+	}
+}
+
+inline void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
+{
+  #pragma omp parallel for
+  for(size_t i=0; i<size; i++)
+	{
+	  a[i] = a[i] * b[i] + c[i];
+	}
+}
+
+inline void kernel_8_1_simple(double* a, size_t size)
+{
+  #pragma omp parallel for
+  for(size_t i=0; i<size; i++)
+	{
+	  a[i] = REP100(a[i]*)
+			 REP20(a[i]*)
+		     REP8(a[i]*)
+		     REP1(a[i]);
+  }
+}
+
+inline void kernel_8_1_simple_fastmath(double* a, size_t size)
+{
+  #pragma omp parallel for
+  for(size_t i=0; i<size; i++)
+	{
+	  REP100(a[i]=a[i]*a[i];);
+	  REP20(a[i]=a[i]*a[i];);
+	  REP8(a[i]=a[i]*a[i];);
+  }
+}
+
+inline void kernel_8_1_fuseaware(double* a, size_t size)
+{
+  #pragma omp parallel for
+  for(size_t i=0; i<size; i++)
+	{
+		REP60(a[i] = a[i] * a[i] + a[i];)
+		REP4(a[i] = a[i] * a[i] + a[i];)
+	}
+}
+
+#ifdef INTRINS
+
+#include <immintrin.h>
+
+inline void kernel_8_1_fuseaware_manpack(double* a, size_t size)
+{
+
+  #pragma omp parallel for
+  for(size_t i=0; i<(size-4); i+=4)
+	{
+	  // pack doubles
+	  __m256d packvec = _mm256_set_pd(a[i], a[i+1], a[i+2], a[i+3]);
+	  
+	  REP60(packvec = _mm256_fmadd_pd(packvec, packvec, packvec););
+	  REP4(packvec = _mm256_fmadd_pd(packvec, packvec, packvec););
+	  
+	  a[i] = packvec[0];
+	  a[i+1] = packvec[1];
+	  a[i+2] = packvec[2];
+	  a[i+3] = packvec[3];
+	}
+}
+
+#endif /* INTRINS */
+
+
+/********************************************
+ *  Kernels which potentially compile to	*
+ *  different operational intensities than	*
+ *  specified								*
+ ********************************************/
+
+void kernel_1_16_simple_dangerous(double* a, double* b, size_t size)
+{
+  register volatile double tmp = 0.1;
+
+  #pragma omp parallel for
+  for(size_t i=0; i<size; i++){
+	tmp = a[i] * b[i];
+  }
+}
+
+void kernel_8_1_simple_dangerous(double* a, size_t size)
+{
+  register volatile double tmp = 0.1;
+  
+  #pragma omp parallel for
+  for(size_t i=0; i<size; i++)
+	{
+	  tmp = a[i] * a[i] * a[i] * a[i] *
+		    a[i] * a[i] * a[i] * a[i];
+	}
+}
+
+void kernel_1_8_vo_dangerous(double* a, size_t size)
+{
+  register volatile double tmp=0.0;
+
+  #pragma omp parallel for
+  for(size_t i=0; i<size; i++)
+	{
+	  tmp = a[i] * a[i];
+	}
+}
+
+
+/* === Helper Functions === */
+
+static double pin_time(void)
+{
+  struct timeval tp;
+  int i;
+
+  i = gettimeofday(&tp,NULL);
+
+  if(i != 0)
+	{
+	  bail_out("Time measurement impossible. gettimeofday error");
+	}
+  
+  return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
+}
+
+static void bail_out(char* fmt, ...)
+{
+  char* prog_name = "aikern";
+  
+  if(fmt != NULL)
+	{
+	  char msgbuf[150];
+	
+	  va_list vl;
+	  va_start(vl, fmt);
+
+	  if(vsnprintf(msgbuf, sizeof(msgbuf), fmt, vl) < 0)
+		  msgbuf[0] = '\0';
+
+	  va_end( vl);
+	  
+	  if(strlen(msgbuf) > 0)
+		(void)fprintf(stderr, "%s: %s \n", prog_name, msgbuf);
+
+	}
+
+  if(errno != 0)
+	(void)fprintf(stderr, "%s: %s\n", prog_name, strerror(errno));
+	  
+  exit(EXIT_FAILURE);
+}
diff --git a/aikern.h b/aikern.h
new file mode 100644
index 0000000..c6ec2d2
--- /dev/null
+++ b/aikern.h
@@ -0,0 +1,288 @@
+#ifndef AIKERN_H
+#define AIKERN_H
+
+typedef struct {
+  size_t	runs;				// also # of start-/endtimes
+  double*	starts;				// starttimes
+  double*	ends;				// endtimes
+  int		flops;				// flops per iteration
+  char*		kern_name;
+  size_t	size;				// size of arrays handeld
+} kern_result;
+
+typedef enum {
+  SIMPLE_1_16, FMA_1_16, SIMPLE_8_1, FMA_8_1, SIMPLE_8_1_FASTMATH, FMA_8_1_MANPACK
+} kernel_t;
+
+/**
+ * @brief main entry point. Dispatches the kernel calls
+ * @param kernel	the kernel to run
+ * @param a			An array with double values of size param size
+ * @param b			An array with double values of size param size
+ * @param c			An array with double values of size param size
+ * @param size		The size of the arrays
+ * @param runs		How often the kernel should be executed
+ * @return kern_result containing information about the kernel execution 
+ *
+ * 
+ */
+kern_result kernel_dispatch(kernel_t kernel,
+							double* a, double* b, double* c,
+							size_t size, size_t runs);
+
+/**
+ * @brief A simple 1/16 operational intensity kernel
+ * @param a			An array with double values of size param size
+ * @param size		Size of the three param arrays
+ * @param result	Pointer to result storage
+ *
+ * === Warning ===
+ * Don't use with -O0: Stores everything on stack
+ *
+ * === Description ===
+ * Uses a simple floating point operation: a[i] = a[i] * a[i];
+ * 
+ * Runs in a parallelized for loop.
+ * 
+ * === Analysis ===
+ * COMM: 1 read (8 byte), 1 write = 16 bytes
+ * COMP: 1 FLOP
+ *      ---------
+ * OI:   1/16
+ * 
+ * === Optimization ===
+ * Nothing special 
+ *
+ */
+void kernel_1_16_simple(double* a, size_t size);
+
+
+/**
+ * @brief A 1/16 operational intensity kernel utilizing FMA
+ * @param a			An array with double values of size param size
+ * @param b			An array with double values of size param size
+ * @param c			An array with double values of size param size
+ * @param size		Size of the three param arrays
+ * @param result	Pointer to result storage
+ * 
+ * === Warning ===
+ * This is dangerous if FMA is not used/can't be used. Then there
+ * are intermediary writes (and reads) to the stack.
+ *
+ * === Description ===
+ * Uses a triad function: a[i] = a[i] * b[i] + c[i]; in order
+ * to utilize the FMA unit.
+ * 
+ * Runs in a parallelized for loop.
+ *
+ * === Analysis ===
+ * With gcc -O2 -mavx -mfma FMA compiles to:
+ *	 vmovsd xmm0,QWORD PTR [rdi+rax*8]				# 1 read (8 byte)
+ *	 vmovsd xmm1,QWORD PTR [rdx+rax*8]				# 1 read
+ *	 vfmadd132sd xmm0,xmm1,QWORD PTR [rsi+rax*8]	# 2 FLOPs + 1 read
+ *	 vmovsd QWORD PTR [rdi+rax*8],xmm0				# 1 write
+ *													 --------
+ *													  1/16 OI  
+ * 
+ * === Optimization ===
+ * For packed doubles compile with -Ofast
+ *
+ */
+void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size);
+
+
+
+/**
+ * @brief A simple 8/1 operational intensity kernel
+ * @param a			An array with double values of size param size
+ * @param size		Size of the three param arrays
+ * @param result	Pointer to result storage
+ *
+ * === Warning ===
+ * Don't use with -O0: Stores everything on stack
+ *
+ * === Description ===
+ * Uses a simple floating point operation: a[i] = a[i] * a[i] * ...* a[i];
+ * 
+ * Runs in a parallelized for loop.
+ *
+ * === Analysis ===
+ * With AVX and -O2 (not necessarily FMA) best results (obviously correct
+ * easy to read disassembly).
+ *
+ * With gcc -O2 -mavx compiles to:
+ *	 vmovsd xmm1,QWORD PTR [rdi]					# 1 read
+ *	 vmulsd xmm0,xmm1,xmm1							# 1 FLOP+register shuffling
+ *	 vmulsd xmm0,xmm0,xmm1							# 127x 1 FLOP+register shuffling
+ *	 # [...]
+ *	 vmovsd QWORD PTR [rdi-0x8],xmm0				# 1 write
+ *													 --------
+ *													  128/16 = 8/1 OI  
+ * 
+ * === Optimization ===
+ * Nothing special
+ */
+void kernel_8_1_simple(double* a, size_t size);
+
+/**
+ * @brief A 8/1 operational intensity kernel utilizing FMA
+ * @param a			An array with double values of size param size
+ * @param size		Size of the three param arrays
+ * @param result	Pointer to result storage
+ *
+ * === Warning ===
+ * This is dangerous if FMA is not used/can't be used. Then there
+ * are intermediary writes (and reads) to the stack.
+ *
+ * === Description ===
+ * Uses multiple triad function: a[i] = a[i] * a[i] + a[i]; in order
+ * to utilize the FMA unit.
+ * 
+ * Runs in a parallelized for loop.
+ *
+ * === Analysis ===
+ * With gcc -O2 -mavx -mfma FMA compiles to:
+ *	vmovsd xmm0,QWORD PTR [rdi]					# 1 read
+ *	vfmadd132sd xmm0,xmm0,xmm0					# 64 x 2 FLOPs+register shuffling
+ *	vmovsd QWORD PTR [rdi-0x8],xmm0				# 1 write
+ *												  --------
+ *												  128/16 = 8/1 OI  
+ * 
+ * === Optimization ===
+ * For packed doubles compile with -Ofast
+ *
+ */
+void kernel_8_1_fuseaware(double* a, size_t size);
+
+/**
+ * @brief A simple 8/1 operational intensity kernel which
+ *		  undermines evil fastmath optimization
+ * @param a			An array with double values of size param size
+ * @param size		Size of the three param arrays
+ * @param result	Pointer to result storage
+ *
+ * === Warning ===
+ * Don't use with anything other than -Ofast / -ffast-math
+ *
+ * === Description ===
+ * Uses a simple floating point operation that more closely resembles
+ * that of 8_1_fuseaware:
+ * a[i] = a[i]*a[i];		# 128x
+ * 
+ * Runs in a parallelized for loop.
+ *
+ * === Analysis ===
+ * -Ofast/-ffast-math does not preserve strict IEEE compliance. It
+ * therefore is allowed to ignore non-associativity of floating
+ * point operations.
+ *
+ * x = x*x*x*x*x*x*x*x; is optimized to x *= x;x *= x;x *= x;
+ * 
+ * This cleary breaks the whole OI calculation of 8_1_simple.
+ * 
+ * This kernel does not introduce more byte write-outs than
+ * 8_1_simple at a high optimization level since a[i] is held
+ * in a register and only written out once at the end of an
+ * iteration.
+ * 
+ * 
+ * === Optimization ===
+ * Nothing special
+ */
+void kernel_8_1_simple_fastmath(double* a, size_t size);
+
+
+/********************************************
+ *  Kernels which potentially compile to	*
+ *  different operational intensities than	*
+ *  specified								*
+ ********************************************/
+
+/**
+ * @brief A 1/16 operational intensity which might compile to a flawed oi kernel
+ * @param a		An array with double values of size param size
+ * @param b		An array with double values of size param size
+ * @param size  Size of the three param arrays
+ *
+ *  === Problem ===
+ *	As soon as volatile is used gcc uses the stack for tmp.
+ *	Even if "register" is in place. Resulting in one additional write per loop.
+ *	Omitting volatile results in optimizing away the whole loop 
+ *	(checked at -O2, which is necessary for FMA to eventually step in).
+ *	Maybe the value stays in cache, maybe not. It does not live a register.
+ *
+ *	Even with -O3:
+ *	movsd  xmm0,QWORD PTR [rdi+rax*8]  # 1 read
+ *	mulsd  xmm0,QWORD PTR [rsi+rax*8]  # 1 read (+ write to xmm0, not counted)
+ *	# [...]							   # instructions for loop
+ *	movsd  QWORD PTR [rsp-0x8],xmm0    # malicious write
+ *
+ *	Without volatile (-O3):
+ *	repz ret						   # that's it
+ */
+void kernel_1_16_simple_dangerous(double* a, double* b, size_t size);
+
+/**
+ * @brief A 8/1 operational intensity which might compile to a flawed oi kernel
+ * @param a		An array with double values of size param size
+ * @param size  Size of the three param arrays
+ *
+ * === Problem ==
+ * Same as for kernel_1_16_simple_dangerous
+ */
+void kernel_8_1_simple_dangerous(double* a, size_t size);
+
+/**
+ * @brief A 1/8 operational intensity which might compile to a flawed oi kernel
+ * @param a		An array with double values of size param size
+ * @param size  Size of the three param arrays
+ *
+ * === Problem ==
+ * Same as for kernel_1_16_simple_dangerous
+ * 
+ * Without volatile the loop is optimized away completely.
+ * With volatile tmp is written to the stack in every loop
+ * (-O3). tmp could be cached or not. This might depend on
+ * how large the array is and how the cpu work internally
+ * -> unpredictable.
+ */
+void kernel_1_8_vo_dangerous(double* a, size_t size);
+
+
+#ifdef INTRINS
+void kernel_8_1_fuseaware_manpack(double* a, size_t size);
+#endif
+
+
+
+/****************************************
+ * Helper macros for repeating things	*
+ ****************************************/
+
+#define REP0(X)
+#define REP1(X) X
+#define REP2(X) REP1(X) REP1(X)
+#define REP3(X) REP2(X) REP1(X)
+#define REP4(X) REP3(X) REP1(X)
+#define REP5(X) REP4(X) REP1(X)
+#define REP6(X) REP5(X) REP1(X)
+#define REP7(X) REP6(X) REP1(X)
+#define REP8(X) REP7(X) REP1(X)
+#define REP9(X) REP8(X) REP1(X)
+
+#define REP10(X)  REP9(X)  REP1(X)
+#define REP20(X) REP10(X) REP10(X)
+#define REP30(X) REP20(X) REP10(X)
+#define REP40(X) REP30(X) REP10(X)
+#define REP50(X) REP40(X) REP10(X)
+#define REP60(X) REP50(X) REP10(X)
+  
+#define REP100(X) REP50(X) REP50(X)
+
+#ifdef ENDEBUG
+#define DEBUG(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0)
+#else
+#define DEBUG(...)
+#endif
+
+#endif /* AIKERN_H */
diff --git a/prof b/prof
new file mode 100644
index 0000000..1cc892d
--- /dev/null
+++ b/prof
@@ -0,0 +1,199 @@
+Flat profile:
+
+Each sample counts as 0.01 seconds.
+  %   cumulative   self              self     total           
+ time   seconds   seconds    calls  Ts/call  Ts/call  name    
+100.01      0.74     0.74                             bail_out
+  0.00      0.74     0.00       50     0.00     0.00  pin_time
+  0.00      0.74     0.00        5     0.00     0.00  kernel_1_16_fuseaware
+  0.00      0.74     0.00        5     0.00     0.00  kernel_1_16_simple
+  0.00      0.74     0.00        5     0.00     0.00  kernel_8_1_fuseaware
+  0.00      0.74     0.00        5     0.00     0.00  kernel_8_1_simple
+  0.00      0.74     0.00        5     0.00     0.00  kernel_8_1_simple_fastmath
+  0.00      0.74     0.00        5     0.00     0.00  kernel_dispatch
+  0.00      0.74     0.00        5     0.00     0.00  print_kernresult
+  0.00      0.74     0.00        2     0.00     0.00  pin_time
+  0.00      0.74     0.00        1     0.00     0.00  get_int
+  0.00      0.74     0.00        1     0.00     0.00  get_size
+  0.00      0.74     0.00        1     0.00     0.00  testkern
+
+ %         the percentage of the total running time of the
+time       program used by this function.
+
+cumulative a running sum of the number of seconds accounted
+ seconds   for by this function and those listed above it.
+
+ self      the number of seconds accounted for by this
+seconds    function alone.  This is the major sort for this
+           listing.
+
+calls      the number of times this function was invoked, if
+           this function is profiled, else blank.
+ 
+ self      the average number of milliseconds spent in this
+ms/call    function per call, if this function is profiled,
+	   else blank.
+
+ total     the average number of milliseconds spent in this
+ms/call    function and its descendents per call, if this 
+	   function is profiled, else blank.
+
+name       the name of the function.  This is the minor sort
+           for this listing. The index shows the location of
+	   the function in the gprof listing. If the index is
+	   in parenthesis it shows where it would appear in
+	   the gprof listing if it were to be printed.
+
+Copyright (C) 2012-2014 Free Software Foundation, Inc.
+
+Copying and distribution of this file, with or without modification,
+are permitted in any medium without royalty provided the copyright
+notice and this notice are preserved.
+
+		     Call graph (explanation follows)
+
+
+granularity: each sample hit covers 2 byte(s) for 1.35% of 0.74 seconds
+
+index % time    self  children    called     name
+                                                 <spontaneous>
+[1]    100.0    0.74    0.00                 bail_out [1]
+-----------------------------------------------
+                0.00    0.00      50/50          kernel_dispatch [8]
+[2]      0.0    0.00    0.00      50         pin_time [2]
+-----------------------------------------------
+                0.00    0.00       5/5           kernel_dispatch [8]
+[3]      0.0    0.00    0.00       5         kernel_1_16_fuseaware [3]
+-----------------------------------------------
+                0.00    0.00       5/5           kernel_dispatch [8]
+[4]      0.0    0.00    0.00       5         kernel_1_16_simple [4]
+-----------------------------------------------
+                0.00    0.00       5/5           kernel_dispatch [8]
+[5]      0.0    0.00    0.00       5         kernel_8_1_fuseaware [5]
+-----------------------------------------------
+                0.00    0.00       5/5           kernel_dispatch [8]
+[6]      0.0    0.00    0.00       5         kernel_8_1_simple [6]
+-----------------------------------------------
+                0.00    0.00       5/5           kernel_dispatch [8]
+[7]      0.0    0.00    0.00       5         kernel_8_1_simple_fastmath [7]
+-----------------------------------------------
+                0.00    0.00       5/5           main [23]
+[8]      0.0    0.00    0.00       5         kernel_dispatch [8]
+                0.00    0.00      50/50          pin_time [2]
+                0.00    0.00       5/5           kernel_1_16_simple [4]
+                0.00    0.00       5/5           kernel_1_16_fuseaware [3]
+                0.00    0.00       5/5           kernel_8_1_simple [6]
+                0.00    0.00       5/5           kernel_8_1_fuseaware [5]
+                0.00    0.00       5/5           kernel_8_1_simple_fastmath [7]
+-----------------------------------------------
+                0.00    0.00       5/5           main [23]
+[9]      0.0    0.00    0.00       5         print_kernresult [9]
+-----------------------------------------------
+                0.00    0.00       2/2           main [23]
+[10]     0.0    0.00    0.00       2         pin_time [10]
+-----------------------------------------------
+                0.00    0.00       1/1           main [23]
+[11]     0.0    0.00    0.00       1         get_int [11]
+-----------------------------------------------
+                0.00    0.00       1/1           main [23]
+[12]     0.0    0.00    0.00       1         get_size [12]
+-----------------------------------------------
+                0.00    0.00       1/1           main [23]
+[13]     0.0    0.00    0.00       1         testkern [13]
+-----------------------------------------------
+
+ This table describes the call tree of the program, and was sorted by
+ the total amount of time spent in each function and its children.
+
+ Each entry in this table consists of several lines.  The line with the
+ index number at the left hand margin lists the current function.
+ The lines above it list the functions that called this function,
+ and the lines below it list the functions this one called.
+ This line lists:
+     index	A unique number given to each element of the table.
+		Index numbers are sorted numerically.
+		The index number is printed next to every function name so
+		it is easier to look up where the function is in the table.
+
+     % time	This is the percentage of the `total' time that was spent
+		in this function and its children.  Note that due to
+		different viewpoints, functions excluded by options, etc,
+		these numbers will NOT add up to 100%.
+
+     self	This is the total amount of time spent in this function.
+
+     children	This is the total amount of time propagated into this
+		function by its children.
+
+     called	This is the number of times the function was called.
+		If the function called itself recursively, the number
+		only includes non-recursive calls, and is followed by
+		a `+' and the number of recursive calls.
+
+     name	The name of the current function.  The index number is
+		printed after it.  If the function is a member of a
+		cycle, the cycle number is printed between the
+		function's name and the index number.
+
+
+ For the function's parents, the fields have the following meanings:
+
+     self	This is the amount of time that was propagated directly
+		from the function into this parent.
+
+     children	This is the amount of time that was propagated from
+		the function's children into this parent.
+
+     called	This is the number of times this parent called the
+		function `/' the total number of times the function
+		was called.  Recursive calls to the function are not
+		included in the number after the `/'.
+
+     name	This is the name of the parent.  The parent's index
+		number is printed after it.  If the parent is a
+		member of a cycle, the cycle number is printed between
+		the name and the index number.
+
+ If the parents of the function cannot be determined, the word
+ `<spontaneous>' is printed in the `name' field, and all the other
+ fields are blank.
+
+ For the function's children, the fields have the following meanings:
+
+     self	This is the amount of time that was propagated directly
+		from the child into the function.
+
+     children	This is the amount of time that was propagated from the
+		child's children to the function.
+
+     called	This is the number of times the function called
+		this child `/' the total number of times the child
+		was called.  Recursive calls by the child are not
+		listed in the number after the `/'.
+
+     name	This is the name of the child.  The child's index
+		number is printed after it.  If the child is a
+		member of a cycle, the cycle number is printed
+		between the name and the index number.
+
+ If there are any cycles (circles) in the call graph, there is an
+ entry for the cycle-as-a-whole.  This entry shows who called the
+ cycle (as parents) and the members of the cycle (as children.)
+ The `+' recursive calls entry shows the number of function calls that
+ were internal to the cycle, and the calls entry for each member shows,
+ for that member, how many times it was called from other members of
+ the cycle.
+
+Copyright (C) 2012-2014 Free Software Foundation, Inc.
+
+Copying and distribution of this file, with or without modification,
+are permitted in any medium without royalty provided the copyright
+notice and this notice are preserved.
+
+Index by function name
+
+   [1] bail_out (aikern.c)     [5] kernel_8_1_fuseaware    [2] pin_time (aikern.c)
+  [11] get_int (roofline.c)    [6] kernel_8_1_simple       [9] print_kernresult (roofline.c)
+  [12] get_size (roofline.c)   [7] kernel_8_1_simple_fastmath [13] testkern (roofline.c)
+   [3] kernel_1_16_fuseaware   [8] kernel_dispatch
+   [4] kernel_1_16_simple     [10] pin_time (roofline.c)
diff --git a/roofline.c b/roofline.c
new file mode 100644
index 0000000..84b6b30
--- /dev/null
+++ b/roofline.c
@@ -0,0 +1,342 @@
+# include <stdlib.h>
+# include <stdio.h>
+# include <unistd.h>
+# include <ctype.h>
+# include <sys/time.h>
+# include <sys/stat.h>
+# include <errno.h>
+# include <string.h>
+# include <stdint.h>
+# include <getopt.h>
+# include <stdarg.h>
+# include <limits.h>
+
+# include "aikern.h"
+
+
+/* === Macros === */
+
+#ifdef ENDEBUG
+#define DEBUG(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0)
+#else
+#define DEBUG(...)
+#endif
+
+/* === Constants === */
+
+/* === Global Variables === */
+char* prog_name;
+
+/* === Prototypes === */
+
+/**
+ * @brief print usage message
+ */
+static void usage(void);
+
+/**
+ * @brief terminate program on program error
+ * @param msg additional message to print
+ * @param ret exit value
+ */
+static void bail_out(char* fmt, ...);
+
+/**
+ * @brief converts the argument to size_t if possible.
+ *        bails out on error.
+ * @param oparg the argument to convert
+ */
+static size_t get_size(char* oparg);
+
+/**
+ * @brief converts the argument to int if possible.
+ *        bails out on error.
+ * @param oparg the argument to convert
+ */
+static int get_int(char* oparg);
+
+/**
+ * @brief microseconds since epoch
+ */
+static double pin_time(void);
+
+/**
+ * @brief a simple test kernel with ai of 1/16
+ */
+static void testkern(double* a, double* b, double* c, size_t size);
+
+/**
+ * @brief pretty prints a kern_result
+ */
+static void print_kernresult(kern_result* result, const char* logname);
+
+int main(int argc, char* argv[]) {
+  prog_name = argv[0];
+  
+  int opt;
+  char *size_arg = NULL;
+  char *runs_arg = NULL;
+  
+  while((opt = getopt(argc, argv, "s:r:")) != -1)
+	{
+	  switch(opt)
+		{
+		case 's':
+		  size_arg = optarg;
+		  break;
+		case 'r':
+		  runs_arg = optarg;
+		  break;
+		case '?':
+		  usage();
+		default:
+		  usage();
+		}
+	}
+
+  if(optind < argc)
+	{
+
+	  for (int index = optind; index < argc; index++)
+		  bail_out ("Non-option argument %s\n", argv[index]);
+	  
+	  usage();
+	}
+
+  if(size_arg == NULL || runs_arg == NULL)
+	  usage();
+
+  size_t size = get_size(size_arg);
+  int runs = get_int(runs_arg);
+
+  // Allocating arrays
+  printf("Will run with array sizes of %zu elements\n", size);
+  printf("Will calculate min, max, avg for %d runs\n", runs);
+  double* a = malloc(sizeof(double)*(size));
+  double* b = malloc(sizeof(double)*(size));
+  double* c = malloc(sizeof(double)*(size));
+
+  if(a==NULL || b==NULL || c == NULL)
+	  bail_out("One of the mallocs failed\n. a = %p, b=%p, c=%p", a, b, c);
+  
+  printf("Allocated 3 arrays (3*%.2f MB = %.2f GB)\n", (sizeof(double)*(size)/1024.0/1024.0), (sizeof(double)*(size)*3/1024.0/1024.0/1024));
+  printf("Filling arrays with dummy values. This will also warm the cache\n");
+
+  // Filling arrays with arbitrary numbers
+  #pragma omp parallel for
+  for (size_t j=0; j<size; j++)
+	{
+	  a[j] = 1.0;
+	  b[j] = 2.0;
+	  c[j] = 3.0;
+	}
+
+  double t;
+  printf("Heating up machine\n");
+  t = pin_time();
+  testkern(a,b,c, size);
+  t = pin_time() - t;
+  printf("Machine heating took %.4f microseconds = %.4f seconds (with test OI kernel)\n", (t*1.0E6), t);
+  printf("Starting tests...\n\n\n");
+
+  // Executing kernels
+  kern_result	simple16  = kernel_dispatch(SIMPLE_1_16, a, b, c, size, runs);
+  kern_result	fma16	  = kernel_dispatch(FMA_1_16, a, b, c, size, runs);
+  kern_result	simple8	  = kernel_dispatch(SIMPLE_8_1, a, b, c, size, runs);
+  kern_result	fma8	  = kernel_dispatch(FMA_8_1, a, b, c, size, runs);
+  kern_result	simple8fm = kernel_dispatch(SIMPLE_8_1_FASTMATH, a, b, c, size, runs);
+
+#ifdef INTRINS
+  DEBUG("Running manpack now");
+  kern_result	fma8manpack = kernel_dispatch(FMA_8_1_MANPACK, a, b, c, size, runs);
+  DEBUG("manpack run successful");
+#endif
+  
+  // Freeing arrays
+  free(a);
+  free(b);
+  free(c);
+
+  // Printing results
+  print_kernresult(&simple16, "simple16");
+  print_kernresult(&fma16, "fma16");
+  print_kernresult(&simple8, "simple8");
+  print_kernresult(&fma8, "fma8");
+  print_kernresult(&simple8fm, "simple8fastmath");
+
+#ifdef INTRINS
+  print_kernresult(&fma8manpack, "fma8manpack");
+#endif
+
+
+  printf("\n\n\n");
+  printf("Please refer to the log files in the log/ folder for details about the GFLOP/s of every kernel.");
+  printf("\n");
+  printf("Exiting...");
+  exit(EXIT_SUCCESS);
+}
+
+static void testkern(double* a, double* b, double* c, size_t size)
+{
+  #pragma omp parallel for
+  for (size_t j = 0; j < size; j++)
+	{
+	  /* 3*8 Bytes read + 3*8 Bytes write, 3 FLOPs -> AI = 3/(2*3*8) = 1/16 */
+	  a[j] = 2.0E0 * a[j];
+	  b[j] = 2.0E0 * b[j];
+	  c[j] = 2.0E0 * c[j];
+	}  
+}
+
+/* === Helper Functions === */
+
+static double pin_time(void)
+{
+  struct timeval tp;
+  int i;
+
+  i = gettimeofday(&tp,NULL);
+
+  if(i != 0)
+	{
+	  bail_out("Time measurement impossible. gettimeofday error");
+	}
+  
+  return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
+}
+
+static size_t get_size(char *oparg)
+{
+  long long int llsize = strtoll(oparg, NULL, 10);
+
+  if(llsize <= 0)
+	usage();
+	
+  unsigned long long int u_llsize = (unsigned long long int) llsize;
+
+  if(u_llsize > SIZE_MAX)
+	{
+	  bail_out("Only size between 1 to %zu allowed.", SIZE_MAX);
+	}
+	
+  return (size_t) llsize;
+}
+
+static int get_int(char *oparg)
+{
+  long long int llsize = strtoll(oparg, NULL, 10);
+
+  if(llsize <= 0)
+	usage();
+	
+  unsigned long long int u_llsize = (unsigned long long int) llsize;
+
+  if(u_llsize > INT_MAX)
+	{
+	  bail_out("Only size between 1 to %d allowed.", INT_MAX);
+	}
+	
+  return (int) llsize;
+}
+
+static void usage()
+{
+  fprintf(stderr, "USAGE: ./roofline -s <size> -r <runs> \n");
+  fprintf(stderr, "e.g.: ./roofline -s 100000 -r 5 \n");
+  bail_out("Invalid paramers");
+}
+
+static void bail_out(char* fmt, ...)
+{
+  if(fmt != NULL)
+	{
+	  char msgbuf[150];
+	
+	  va_list vl;
+	  va_start(vl, fmt);
+
+	  if(vsnprintf(msgbuf, sizeof(msgbuf), fmt, vl) < 0)
+		  msgbuf[0] = '\0';
+
+	  va_end( vl);
+	  
+	  if(strlen(msgbuf) > 0)
+		(void)fprintf(stderr, "%s: %s \n", prog_name, msgbuf);
+
+	}
+
+  if(errno != 0)
+	(void)fprintf(stderr, "%s: %s\n", prog_name, strerror(errno));
+	  
+  exit(EXIT_FAILURE);
+}
+
+static void print_kernresult(kern_result* result, const char* logname)
+{
+  struct stat st = {};
+
+  if (stat("log", &st) == -1)
+	{
+	  if(mkdir("log", 0700))
+		{
+		  bail_out("Couldn't create log directory for %s", result->kern_name);
+		}
+	}
+
+  char logpath[20];
+  snprintf(logpath, sizeof(logpath), "%s/%s", "log", logname);
+  FILE* log = fopen(logpath, "w");
+  if(log == NULL)
+	bail_out("Couldn't open log file for %s", result->kern_name);
+  
+  if(fputs("run,start,end,delta,GFLOP/s\n", log) == EOF)
+	{
+	  fclose(log);
+	  bail_out("Couldn't write header to log file");
+	}
+
+  printf("=== %s ===\n", result->kern_name);
+  
+  double min;
+  double max;
+  double sum = 0.0;
+  double deltas[result->runs];
+
+  deltas[0] = result->ends[0] - result->starts[0];
+  min=deltas[0];
+  max=deltas[0];
+  sum+=deltas[0];
+
+  for(size_t i=1; i<result->runs; i++)
+	{
+	  deltas[i] = result->ends[i] - result->starts[i];
+	  sum+=deltas[i];
+
+	  if(deltas[i] < min) min=deltas[i];
+	  if(deltas[i] > max) max=deltas[i];
+
+	  double gflops = ((result->flops * result->size) / deltas[i]) / 1.0E9;
+
+	  if(fprintf(log, "%zu,%.4f,%.4f,%.4f,%.4f\n",
+				 i, result->starts[i],
+				 result->ends[i], deltas[i],
+				 gflops) == EOF)
+		{
+		  fclose(log);
+		  bail_out("Couldn't write to log file");
+		}
+  	}
+
+
+  printf("%d flop(s) per run\t %zu run(s)\n\n", result->flops, result->runs);
+  printf("Min: %.4f \t Max: %.4f \t Avg: %.4f\n", min, max, (sum/result->runs));
+  
+  printf("\n\n\n");
+    
+
+
+  if(fclose(log))
+	{
+	  bail_out("Couldn't close log file for %s", result->kern_name);
+	}
+}