nix wichtiges

2016-06-20 01:13:23 +02:00 · 2016-06-20 01:13:23 +02:00 · 4d407ada15
commit 4d407ada15
parent 352832d463
9 changed files with 30 additions and 15 deletions
--- a/roofline/src/Makefile
+++ b/roofline/src/Makefile
@ -31,7 +31,7 @@ aikern_o3avx.a: aikern.c aikern.h
 	gcc -O3 -mavx -c -o aikern_o3avx.o aikern.c
 	ar rcs aikern_o3avx.a aikern_o3avx.o
-# This is the only option that actually uses fma without optimizing the hell out of the kernel
+# This is the only version that actually uses FMA
 aikern_avxfma.a: aikern.c aikern.h
 	gcc -O2 -mavx -mfma -c -o aikern_avxfma.o aikern.c
 	ar rcs aikern_avxfma.a aikern_avxfma.o
--- a/roofline/src/aikern.c
+++ b/roofline/src/aikern.c
@ -98,15 +98,6 @@ void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size)
  }
 }
 /* Non-volatile variant of the 1/8 kernel: squares every a[i] into a
    local variable that is never read afterwards, so the loop has no
    observable effect and a compiler is free to drop it.
    b and c are accepted but not used by this kernel. */
 void kernel_1_8_vo(double* a, double* b, double* c, size_t size)
 {
  double tmp=0.0;
  for(size_t i=0; i<size; i++) {
 	tmp = a[i] * a[i];
  }
 }
 /* === FAILED KERNELS === */
 /*
@ -170,3 +161,23 @@ void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size)
 	      a[i] * a[i] * a[i] * a[i];
  }
 }
 void kernel_1_8_vo_dangerous(double* a, double* b, double* c, size_t size)
 {
  /* This is the 1/8 AI kernel from the lecture.
 	 === Problem ===
 	 Same as for kernel_1_16_simple_dangerous:
 	 without volatile, the loop is optimized away completely.
 	 With volatile, tmp is written to the stack in every loop
 	 iteration (-O3). tmp could be cached or not; this might
 	 depend on how large the array is and how the CPU works
 	 internally -> unpredictable.
 	 b and c are not used by this kernel.
  */
  volatile double tmp=0.0;
  for(size_t i=0; i<size; i++) {
 	tmp = a[i] * a[i];
  }
 }
--- a/roofline/src/aikern.h
+++ b/roofline/src/aikern.h
@ -2,4 +2,7 @@ void kernel_1_16_simple(double* a, double* b, double* c, size_t size);
 void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size);
 void kernel_8_1_simple(double* a, double* b, double* c, size_t size);
 void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size);
-void kernel_1_8_vo(double* a, double* b, double* c, size_t size);
+
 void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size);
 void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size);
 void kernel_1_8_vo_dangerous(double* a, double* b, double* c, size_t size);
--- a/roofline/src/roofline
+++ b/roofline/src/roofline
--- a/roofline/src/roofline.c
+++ b/roofline/src/roofline.c
@ -118,16 +118,17 @@ int main(int argc, char* argv[]) {
  printf("Filling arrays with dummy values\n");
-  #pragma omp parallel for
+  /*  #pragma omp parallel for
  for (size_t j=0; j<size; j++)
 	{
 	  a[j] = 1.0;
 	  b[j] = 2.0;
 	  c[j] = 3.0;
-	}
+	  }*/
  double t;
  printf("Warming up cache\n");
-  double t = mysecond();
+  t = mysecond();
  testkern(a,b,c, size);
  t = mysecond() - t;
  printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
@ -162,7 +163,7 @@ int main(int argc, char* argv[]) {
  printf("1/8 vo\n");
  t = mysecond();
-  kernel_1_8_vo(a,b,c, size);
+  kernel_1_8_vo_dangerous(a,b,c, size);
  t = mysecond() - t;
  printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
--- a/roofline/src/roofline_avx
+++ b/roofline/src/roofline_avx
--- a/roofline/src/roofline_avxfma
+++ b/roofline/src/roofline_avxfma
--- a/roofline/src/roofline_o3
+++ b/roofline/src/roofline_o3
--- a/roofline/src/roofline_o3avx
+++ b/roofline/src/roofline_o3avx