added logging, added fastmath undermining for simple8

2016-06-24 03:45:10 +02:00 · 2016-06-24 03:45:10 +02:00 · aaf77c78f6
commit aaf77c78f6
parent 31bfead054
16 changed files with 202 additions and 18 deletions
--- a/plot/plot.py
+++ b/plot/plot.py
@ -21,6 +21,7 @@ while i<=64:
    xlbl.append(repr(i))
    i *= 2

+print(xlbl)    
 # memory
 values = []
 bandwidth = 10.6
--- a/roofline/src/Makefile
+++ b/roofline/src/Makefile
@ -1,7 +1,7 @@
-all: bin lib
+all: clean bin lib

 # Roofline Binary
-bin: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_fastmath_o3
+bin: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3
 	mkdir bin
 	mv $^ bin

@ -20,12 +20,15 @@ roofline_fma_o3: roofline.c aikern_fma_o3.a
 roofline_fma_fast_o3: roofline.c aikern_fma_fast_o3.a
 	gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@

+roofline_fma_fast_o2: roofline.c aikern_fma_fast_o2.a
+	gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
+
 roofline_fma_fast_fastmath_o3: roofline.c aikern_fma_fast_fastmath_o3.a
 	gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@


 # Static Libraries
-lib: aikern.a aikern_o3.a aikern_fma.a aikern_fma_o3.a aikern_fma_fast_o3.a aikern_fma_fast_fastmath_o3.a
+lib: aikern.a aikern_o3.a aikern_fma.a aikern_fma_o3.a aikern_fma_fast_o2.a aikern_fma_fast_o3.a aikern_fma_fast_fastmath_o3.a
 	mkdir lib
 	mv $^ lib

@ -49,6 +52,11 @@ aikern_fma_o3.a: aikern.c aikern.h
 	ar rcs $@ aikern_fma_o3.o
 	rm aikern_fma_o3.o

+# O2 variant of the fast-math FMA library.
+# GCC only honours the LAST -O option on the command line, so combining
+# -O2 with -Ofast silently builds at -Ofast (-O3 + -ffast-math) and makes
+# this library identical to the o3 variant. -ffast-math keeps -O2 in effect.
+aikern_fma_fast_o2.a: aikern.c aikern.h
+	gcc -Wall -Wextra -Wno-unused -O2 -mavx -mfma -fopenmp -ffast-math -c -o aikern_fma_fast_o2.o $<
+	ar rcs $@ aikern_fma_fast_o2.o
+	rm aikern_fma_fast_o2.o
+
 aikern_fma_fast_o3.a: aikern.c aikern.h
 	gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -c -o aikern_fma_fast_o3.o $<
 	ar rcs $@ aikern_fma_fast_o3.o
@ -61,6 +69,9 @@ aikern_fma_fast_fastmath_o3.a: aikern.c aikern.h

 # Cleanup
 clean:
+	rm -f *.a
+	rm -f *.o
+	rm -f roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3
 	rm -fR bin
 	rm -fR lib

--- a/roofline/src/aikern.c
+++ b/roofline/src/aikern.c
@ -25,10 +25,11 @@ kern_result kernel_dispatch(kernel_t kernel,
 							size_t size, size_t runs)
 {
  
-  kern_result result;
+  kern_result result = {0};
  result.runs = runs;
  result.starts = malloc(sizeof(double)*(runs));
  result.ends = malloc(sizeof(double)*(runs));
+  result.size = size;

  if(result.starts==NULL || result.ends==NULL)
 	{
@ -41,6 +42,7 @@ kern_result kernel_dispatch(kernel_t kernel,

 	case SIMPLE_1_16:
 	  result.flops = 1;
+	  result.kern_name = "Simple 1/16";
 	  for(size_t r=0; r<runs; r++)
 		{
 		  result.starts[r] = pin_time();
@ -50,6 +52,7 @@ kern_result kernel_dispatch(kernel_t kernel,
 	  break;
 	case FMA_1_16:
 	  result.flops = 2;
+	  result.kern_name = "FMA aware 1/16";
 	  for(size_t r=0; r<runs; r++)
 		{
 		  result.starts[r] = pin_time();
@ -59,6 +62,7 @@ kern_result kernel_dispatch(kernel_t kernel,
 	  break;
 	case SIMPLE_8_1:
 	  result.flops = 128;
+	  result.kern_name = "Simple 8";
 	  for(size_t r=0; r<runs; r++)
 		{
 		  result.starts[r] = pin_time();
@ -68,6 +72,7 @@ kern_result kernel_dispatch(kernel_t kernel,
 	  break;
 	case FMA_8_1:
 	  result.flops = 128;
+	  result.kern_name = "FMA aware 8";
 	  for(size_t r=0; r<runs; r++)
 		{
 		  result.starts[r] = pin_time();
@ -75,6 +80,16 @@ kern_result kernel_dispatch(kernel_t kernel,
 		  result.ends[r] = pin_time();
 		}
 	  break;
+	case SIMPLE_8_1_FASTMATH:
+	  result.flops = 128;
+	  result.kern_name = "Simple 8 undermining fastmath";
+	  for(size_t r=0; r<runs; r++)
+		{
+		  result.starts[r] = pin_time();
+		  kernel_8_1_simple_fastmath(a, size);
+		  result.ends[r] = pin_time();
+		}
+	  break;
 	default:
 	  bail_out("No such kernel %s", kernel);
 	}
@ -82,7 +97,7 @@ kern_result kernel_dispatch(kernel_t kernel,
  return result;
 }

-void kernel_1_16_simple(double* a, size_t size)
+inline void kernel_1_16_simple(double* a, size_t size)
 {
  #pragma omp parallel for
  for(size_t i=0; i<size; i++)
@ -91,7 +106,7 @@ void kernel_1_16_simple(double* a, size_t size)
 	}
 }

-void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
+inline void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
 {
  #pragma omp parallel for
  for(size_t i=0; i<size; i++)
@ -100,7 +115,7 @@ void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
 	}
 }

-void kernel_8_1_simple(double* a, size_t size)
+inline void kernel_8_1_simple(double* a, size_t size)
 {
  #pragma omp parallel for
  for(size_t i=0; i<size; i++)
@ -112,7 +127,18 @@ void kernel_8_1_simple(double* a, size_t size)
  }
 }

-void kernel_8_1_fuseaware(double* a, size_t size)
+inline void kernel_8_1_simple_fastmath(double* a, size_t size)  // squares the first `size` elements of a in place; dispatched for SIMPLE_8_1_FASTMATH
+{
+  #pragma omp parallel for
+  for(size_t i=0; i<size; i++)
+	{
+	  REP100(a[i]=a[i]*a[i];);  // REP* macros are defined outside this view; presumably repeats the statement 100 times -- TODO confirm
+	  REP20(a[i]=a[i]*a[i];);   // presumably 20 more repetitions
+	  REP8(a[i]=a[i]*a[i];);    // presumably 8 more => 128 squarings per element, matching result.flops = 128 in kernel_dispatch
+  }  // NOTE(review): 128 successive squarings would drive any |a[i]| != 1 to 0 or inf; confirm this does not distort the timings
+}
+
+inline void kernel_8_1_fuseaware(double* a, size_t size)
 {
  #pragma omp parallel for
  for(size_t i=0; i<size; i++)
--- a/roofline/src/aikern.h
+++ b/roofline/src/aikern.h
@ -5,11 +5,13 @@ typedef struct {
  size_t	runs;				// also # of start-/endtimes
  double*	starts;				// starttimes
  double*	ends;				// endtimes
-  int flops; //flops per run
+  int		flops;				// flops per iteration
+  char*		kern_name;
+  size_t	size;				// size of arrays handled
 } kern_result;

 typedef enum {
-  SIMPLE_1_16, FMA_1_16, SIMPLE_8_1, FMA_8_1
+  SIMPLE_1_16, FMA_1_16, SIMPLE_8_1, FMA_8_1, SIMPLE_8_1_FASTMATH
 } kernel_t;

 /**
@ -152,6 +154,43 @@ void kernel_8_1_simple(double* a, size_t size);
 */
 void kernel_8_1_fuseaware(double* a, size_t size);

+/**
+ * @brief A simple 8/1 operational intensity kernel which
+ *		  undermines evil fastmath optimization
+ * @param a			An array with double values of size param size
+ * @param size		Number of elements in a
+ *
+ * === Warning ===
+ * Don't use with anything other than -Ofast / -ffast-math
+ *
+ * === Description ===
+ * Uses a simple floating point operation that more closely resembles
+ * that of 8_1_fuseaware:
+ * a[i] = a[i]*a[i];		# 128x
+ * 
+ * Runs in a parallelized for loop.
+ *
+ * === Analysis ===
+ * -Ofast/-ffast-math does not preserve strict IEEE compliance. It
+ * therefore is allowed to ignore non-associativity of floating
+ * point operations.
+ *
+ * x = x*x*x*x*x*x*x*x; is optimized to x *= x;x *= x;x *= x;
+ * 
+ * This clearly breaks the whole OI calculation of 8_1_simple.
+ * 
+ * This kernel does not introduce more byte write-outs than
+ * 8_1_simple at a high optimization level since a[i] is held
+ * in a register and only written out once at the end of an
+ * iteration.
+ * 
+ * 
+ * === Optimization ===
+ * Nothing special
+ */
+void kernel_8_1_simple_fastmath(double* a, size_t size);
+

 /********************************************
 *  Kernels which potentially compile to	*
--- a/roofline/src/log/fma16
+++ b/roofline/src/log/fma16
@ -0,0 +1,5 @@
+run,start,end,delta,GFLOP/s
+1,1466732365.5426,1466732366.1946,0.6520,0.9202
+2,1466732366.1946,1466732366.8410,0.6464,0.9282
+3,1466732366.8410,1466732367.4875,0.6465,0.9281
+4,1466732367.4875,1466732368.1370,0.6495,0.9238
--- a/roofline/src/log/fma8
+++ b/roofline/src/log/fma8
@ -0,0 +1,5 @@
+run,start,end,delta,GFLOP/s
+1,1466732371.5080,1466732373.2547,1.7468,21.9836
+2,1466732373.2547,1466732375.0033,1.7486,21.9603
+3,1466732375.0033,1466732376.7499,1.7465,21.9864
+4,1466732376.7499,1466732378.4990,1.7492,21.9534
--- a/roofline/src/log/simple16
+++ b/roofline/src/log/simple16
@ -0,0 +1,5 @@
+run,start,end,delta,GFLOP/s
+1,1466732363.5905,1466732363.9157,0.3252,0.9226
+2,1466732363.9157,1466732364.2410,0.3253,0.9223
+3,1466732364.2410,1466732364.5659,0.3249,0.9234
+4,1466732364.5659,1466732364.8925,0.3266,0.9186
--- a/roofline/src/log/simple8
+++ b/roofline/src/log/simple8
@ -0,0 +1,5 @@
+run,start,end,delta,GFLOP/s
+1,1466732368.4592,1466732368.7843,0.3251,118.1190
+2,1466732368.7843,1466732369.1090,0.3247,118.2713
+3,1466732369.1090,1466732369.4353,0.3263,117.6684
+4,1466732369.4353,1466732369.7596,0.3243,118.4045
--- a/roofline/src/log/simple8fastmath
+++ b/roofline/src/log/simple8fastmath
@ -0,0 +1,5 @@
+run,start,end,delta,GFLOP/s
+1,1466732382.7768,1466732387.0594,4.2827,8.9664
+2,1466732387.0594,1466732391.3489,4.2895,8.9521
+3,1466732391.3489,1466732395.6322,4.2834,8.9649
+4,1466732395.6322,1466732399.9114,4.2791,8.9738
--- a/roofline/src/roofline
+++ b/roofline/src/roofline
--- a/roofline/src/roofline.c
+++ b/roofline/src/roofline.c
@ -3,6 +3,7 @@
 # include <unistd.h>
 # include <ctype.h>
 # include <sys/time.h>
+# include <sys/stat.h>
 # include <errno.h>
 # include <string.h>
 # include <stdint.h>
@ -67,7 +68,7 @@ static void testkern(double* a, double* b, double* c, size_t size);
 /**
 * @brief pretty prints a kern_result
 */
-static void print_kernresult(kern_result* result);
+static void print_kernresult(kern_result* result, const char* logname);

 int main(int argc, char* argv[]) {
  prog_name = argv[0];
@ -108,6 +109,7 @@ int main(int argc, char* argv[]) {
  size_t size = get_size(size_arg);
  int runs = get_int(runs_arg);

+  // Allocating arrays
  printf("Will run with array sizes of %zu elements\n", size);
  printf("Will calculate min, max, avg for %d runs\n", runs);
  double* a = malloc(sizeof(double)*(size));
@ -120,6 +122,7 @@ int main(int argc, char* argv[]) {
  printf("Allocated 3 arrays (3*%.2f MB = %.2f GB)\n", (sizeof(double)*(size)/1024.0/1024.0), (sizeof(double)*(size)*3/1024.0/1024.0/1024));
  printf("Filling arrays with dummy values. This will also warm the cache\n");

+  // Filling arrays with arbitrary numbers
  #pragma omp parallel for
  for (size_t j=0; j<size; j++)
 	{
@ -134,13 +137,26 @@ int main(int argc, char* argv[]) {
  testkern(a,b,c, size);
  t = pin_time() - t;
  printf("Machine heating took %.4f microseconds = %.4f seconds (with test OI kernel)\n", (t*1.0E6), t);
+  printf("Starting tests...\n\n\n");

+  // Executing kernels
  kern_result simple16 = kernel_dispatch(SIMPLE_1_16, a, b, c, size, runs);
  kern_result fma16 = kernel_dispatch(FMA_1_16, a, b, c, size, runs);
  kern_result simple8 = kernel_dispatch(SIMPLE_8_1, a, b, c, size, runs);
  kern_result fma8 = kernel_dispatch(FMA_8_1, a, b, c, size, runs);
+  kern_result simple8fm = kernel_dispatch(SIMPLE_8_1_FASTMATH, a, b, c, size, runs);

-  print_kernresult(&simple16);
+  // Freeing arrays
+  free(a);
+  free(b);
+  free(c);
+
+  // Printing results
+  print_kernresult(&simple16, "simple16");
+  print_kernresult(&fma16, "fma16");
+  print_kernresult(&simple8, "simple8");
+  print_kernresult(&fma8, "fma8");
+  print_kernresult(&simple8fm, "simple8fastmath");

  exit(EXIT_SUCCESS);
 }
@ -233,6 +249,72 @@ static void bail_out(char* fmt, ...)
  exit(EXIT_FAILURE);
 }

-static void print_kernresult(kern_result* result){
-  return;
+/**
+ * @brief Prints a summary of a kern_result and logs every run as CSV
+ * @param result	Timing data of one kernel (runs, starts, ends, flops, size)
+ * @param logname	Name of the CSV file that is (re)written inside log/
+ *
+ * === Description ===
+ * Writes one line per run (run,start,end,delta,GFLOP/s) to log/<logname>
+ * and prints min, max and average runtime to stdout. Every I/O error ends
+ * the program through bail_out.
+ */
+static void print_kernresult(kern_result* result, const char* logname)
+{
+  // Without a run there is nothing to report and starts[0]/ends[0] must not be read
+  if(result->runs == 0)
+	bail_out("No runs recorded for %s", result->kern_name);
+
+  // An already existing log directory is fine, every other failure is fatal
+  if(mkdir("log", 0700) != 0 && errno != EEXIST)
+	{
+	  bail_out("Couldn't create log directory for %s", result->kern_name);
+	}
+
+  // Reject names that do not fit instead of silently opening a truncated path
+  char logpath[64];
+  int len = snprintf(logpath, sizeof(logpath), "%s/%s", "log", logname);
+  if(len < 0 || (size_t)len >= sizeof(logpath))
+	bail_out("Log file name too long for %s", result->kern_name);
+
+  FILE* log = fopen(logpath, "w");
+  if(log == NULL)
+	bail_out("Couldn't open log file for %s", result->kern_name);
+
+  if(fputs("run,start,end,delta,GFLOP/s\n", log) == EOF)
+	{
+	  fclose(log);
+	  bail_out("Couldn't write header to log file");
+	}
+
+  printf("=== %s ===\n", result->kern_name);
+
+  double min = 0.0;
+  double max = 0.0;
+  double sum = 0.0;
+
+  // Starts at run 0 so that every run counted in min/max/avg is logged too
+  for(size_t i=0; i<result->runs; i++)
+	{
+	  double delta = result->ends[i] - result->starts[i];
+	  sum += delta;
+
+	  if(i == 0 || delta < min) min = delta;
+	  if(i == 0 || delta > max) max = delta;
+
+	  double gflops = (((double)result->flops * (double)result->size) / delta) / 1.0E9;
+
+	  // fprintf reports failure with any negative value, not necessarily EOF
+	  if(fprintf(log, "%zu,%.4f,%.4f,%.4f,%.4f\n",
+				 i, result->starts[i],
+				 result->ends[i], delta,
+				 gflops) < 0)
+		{
+		  fclose(log);
+		  bail_out("Couldn't write to log file");
+		}
+	}
+
+  printf("%d flop(s) per run\t %zu run(s)\n\n", result->flops, result->runs);
+  printf("Min: %.4f \t Max: %.4f \t Avg: %.4f\n", min, max, (sum/result->runs));
+
+  printf("\n\n\n");
+
+  // fclose flushes buffered rows, so a failure here means lost log data
+  if(fclose(log))
+	{
+	  bail_out("Couldn't close log file for %s", result->kern_name);
+	}
+}
--- a/roofline/src/roofline_fma
+++ b/roofline/src/roofline_fma
--- a/roofline/src/roofline_fma_fast_fastmath_o3
+++ b/roofline/src/roofline_fma_fast_fastmath_o3
--- a/roofline/src/roofline_fma_fast_o3
+++ b/roofline/src/roofline_fma_fast_o3
--- a/roofline/src/roofline_fma_o3
+++ b/roofline/src/roofline_fma_o3
--- a/roofline/src/roofline_o3
+++ b/roofline/src/roofline_o3