Kernels should work

2016-06-19 22:59:33 +02:00 · 2016-06-19 22:59:33 +02:00 · 352832d463
commit 352832d463
parent 68b0b82100
8 changed files with 176 additions and 37 deletions
--- a/roofline/src/Makefile
+++ b/roofline/src/Makefile
@ -1,14 +1,43 @@
-all: roofline aikern.a
+all: roofline roofline_avx roofline_o3avx roofline_o3 roofline_avxfma

 roofline: roofline.c aikern.a
 	gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@

+roofline_avx: roofline.c aikern_avx.a
+	gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
+
+roofline_o3avx: roofline.c aikern_o3avx.a
+	gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
+
+roofline_o3: roofline.c aikern_o3.a
+	gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
+
+roofline_avxfma: roofline.c aikern_avxfma.a
+	gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
+
 aikern.a: aikern.c aikern.h
-	gcc -O3 -c -o aikern.o aikern.c
+	gcc -c -o aikern.o aikern.c
 	ar rcs aikern.a aikern.o

+aikern_avx.a: aikern.c aikern.h
+	gcc -mavx -c -o aikern_avx.o aikern.c
+	ar rcs aikern_avx.a aikern_avx.o
+
+aikern_o3.a: aikern.c aikern.h
+	gcc -O3 -c -o aikern_o3.o aikern.c
+	ar rcs aikern_o3.a aikern_o3.o
+
+aikern_o3avx.a: aikern.c aikern.h
+	gcc -O3 -mavx -c -o aikern_o3avx.o aikern.c
+	ar rcs aikern_o3avx.a aikern_o3avx.o
+
+# -O2 with -mavx -mfma is the only combination that actually emits FMA instructions without over-optimizing the kernel
+aikern_avxfma.a: aikern.c aikern.h
+	gcc -O2 -mavx -mfma -c -o aikern_avxfma.o aikern.c
+	ar rcs aikern_avxfma.a aikern_avxfma.o
+
 clean:
-	rm -f roofline
+	rm -f roofline roofline_avx roofline_o3avx roofline_o3 roofline_avxfma
 	rm -f *.o
 	rm -f *.a
 	rm -f *.so
--- a/roofline/src/aikern.c
+++ b/roofline/src/aikern.c
@ -2,61 +2,171 @@

 void kernel_1_16_simple(double* a, double* b, double* c, size_t size)
 {
-  // volatile to prevent compiler from optimizing this away
-  // register to advise compiler to put this in register
-  volatile double tmp = 0.1;
-
  #pragma omp parallel for
  for(size_t i=0; i<size; i++){
-	/* COMM: 2 reads = 16 bytes, COMP: 1 FLOP -> AI = 1/16 */
-	tmp = a[i] * b[i];
+	/* 
+	   COMM: 1 read, 1 write = 16 bytes
+	   COMP: 1 FLOP 
+	   -> AI = 1/16 
+	*/
+	a[i] = a[i] * a[i];
  }
 }

 void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
 {
+  /* === Warning ===
+	 This is dangerous if FMA is not used or cannot be used: then there
+	 are intermediate writes (and reads) to the stack. With FMA:
+	 
+	 vmovsd xmm0,QWORD PTR [rdi+rax*8]				# 1 read
+	 vmovsd xmm1,QWORD PTR [rdx+rax*8]				# 1 read
+	 vfmadd132sd xmm0,xmm1,QWORD PTR [rsi+rax*8]	# 2 FLOPs + 1 read
+	 vmovsd QWORD PTR [rdi+rax*8],xmm0				# 1 write
+  */
+  
  #pragma omp parallel for
  for(size_t i=0; i<size; i++){
-	/* COMM: 3 reads, 1 write = 32 bytes, COMP: 2 FLOP -> AI = 2/32 = 1/16 */
+	/* 
+	   COMM: 3 reads, 1 write = 32 bytes
+	   COMP: 2 FLOP 
+	   -> AI = 2/32 = 1/16 
+	*/
 	a[i] = a[i] * b[i] + c[i];
  }
 }

 void kernel_8_1_simple(double* a, double* b, double* c, size_t size)
 {
+  /* === Warning ===
+	 Seems correct with -O3. Though -O3 does some loop unrolling.
+
+	 With -O0 this is dangerous: intermediate values are stored on the
+	 stack, and who knows if they survive in cache -> unpredictable.
+
+	 With AVX and -O2 (not necessarily FMA) best results 
+	 (obviously correct, only register shuffling). With FMA:
+
+	 vmovsd xmm1,QWORD PTR [rdi]					# 1 read
+	 vmulsd xmm0,xmm1,xmm1							# 1 FLOP+register shuffling
+	 vmulsd xmm0,xmm0,xmm1							# 15x 1 FLOP+register shuffling
+	 # [...]
+	 vmovsd QWORD PTR [rdi-0x8],xmm0				# 1 write
+  */
+  
+  #pragma omp parallel for
+  for(size_t i=0; i<size; i++){
+	/* 
+	   COMM: 1 read+1 write 
+	   COMP: 16 FLOPs 
+	   -> AI = 8 
+	*/
+	a[i] = a[i] * a[i] * a[i] *
+		   a[i] * a[i] * a[i] *
+		   a[i] * a[i] * a[i] *
+	 	   a[i] * a[i] * a[i] *
+		   a[i] * a[i] * a[i] *
+	       a[i] * a[i];
+  }
+}
+
+void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size)
+{
+  /*
+	With FMA (and -O2):
+
+	vmovsd xmm0,QWORD PTR [rdi]					# 1 read
+	vfmadd132sd xmm0,xmm0,xmm0					# 8x 2 FLOPs+register shuffling
+	vmovsd QWORD PTR [rdi-0x8],xmm0				# 1 write
+   */
+
+  #pragma omp parallel for
+  for(size_t i=0; i<size; i++){
+	/* 
+	   COMM: 1 read + 1 write
+	   COMP: 16 FLOP
+	   -> AI = 8 
+	*/
+	a[i] = a[i] * a[i] + a[i];
+	a[i] = a[i] * a[i] + a[i];
+	a[i] = a[i] * a[i] + a[i];
+	a[i] = a[i] * a[i] + a[i];
+	a[i] = a[i] * a[i] + a[i];
+	a[i] = a[i] * a[i] + a[i];
+	a[i] = a[i] * a[i] + a[i];
+	a[i] = a[i] * a[i] + a[i];
+  }
+}
+
+void kernel_1_8_vo(double* a, double* b, double* c, size_t size)
+{
+  double tmp=0.0;
+  for(size_t i=0; i<size; i++) {
+	tmp = a[i] * a[i];
+  }
+}
+
+
+/* === FAILED KERNELS === */
+
+/*
+  These are theoretically correct kernels but all of them yield
+  dangerous results with gcc 5.3.1 (checked the assembly).
+*/
+
+void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size)
+{
+  /* === Problem ===
+	As soon as volatile is used, gcc keeps tmp on the stack, even if
+	"register" is in place, resulting in one additional write per loop
+	iteration. Omitting volatile results in the whole loop being optimized away
+	(checked at -O2, which is necessary for FMA to eventually step in).
+	Maybe the value stays in cache, maybe not. It does not live in a register.
+
+	Even with -O3:
+	movsd  xmm0,QWORD PTR [rdi+rax*8]  # 1 read
+	mulsd  xmm0,QWORD PTR [rsi+rax*8]  # 1 read (+ write to xmm0, not counted)
+	# [...]							   # instructions for loop
+	movsd  QWORD PTR [rsp-0x8],xmm0    # malicious write
+
+	Without volatile (-O3):
+	repz ret						   # that's it
+  */
+
+  
  // volatile to prevent compiler from optimizing this away
  // register to advise compiler to put this in register
  volatile register double tmp = 0.1;

  #pragma omp parallel for
  for(size_t i=0; i<size; i++){
-	/* COMM: 1 read, COMP: 8 FLOP -> AI = 8	*/
+	/* 
+	   COMM: 2 reads = 16 bytes
+	   COMP: 1 FLOP
+	   -> AI = 1/16 
+	*/
+	tmp = a[i] * b[i];
+  }
+}
+
+void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size)
+{
+  /* === Problem ===
+	 Same as for kernel_1_16_simple_dangerous
+  */
+  
+  // volatile to prevent compiler from optimizing this away
+  // register to advise compiler to put this in register
+  volatile register double tmp = 0.1;
+  
+  #pragma omp parallel for
+  for(size_t i=0; i<size; i++){
+	/* 
+	   COMM: 1 read
+	   COMP: 8 FLOP
+	   -> AI = 8	
+	*/
 	tmp = a[i] * a[i] * a[i] * a[i] *
 	      a[i] * a[i] * a[i] * a[i];
  }
 }
-
-void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size)
-{
-  // volatile to prevent compiler from optimizing this away
-  // register to advise compiler to put this in register
-  register volatile double tmp = 0.1;
-
-  #pragma omp parallel for
-  for(size_t i=0; i<size; i++){
-	/* COMM: 1 read, COMP: 8 FLOP -> AI = 8 */
-	tmp = a[i];
-	tmp = tmp * tmp + tmp;
-	tmp = tmp * tmp + tmp;
-	tmp = tmp * tmp + tmp;
-	tmp = tmp * tmp + tmp;
-  }
-}
-
-void kernel_1_8_vo(double* a, double* b, double* c, size_t size)
-{
-  volatile double tmp=0.0;
-  for(size_t i=0; i<size; i++) {
-	tmp = a[i] * a[i];
-  }
-}
--- a/roofline/src/roofline
+++ b/roofline/src/roofline
--- a/roofline/src/roofline.c
+++ b/roofline/src/roofline.c
@ -229,7 +229,7 @@ static int get_int(char *oparg)

 static void usage()
 {
-  fprintf(stderr, "USAGE: ./roofline -s <size> -s <runs> \n");
+  fprintf(stderr, "USAGE: ./roofline -s <size> -r <runs> \n");
  bail_out(NULL);
 }

--- a/roofline/src/roofline_avx
+++ b/roofline/src/roofline_avx
--- a/roofline/src/roofline_avxfma
+++ b/roofline/src/roofline_avxfma
--- a/roofline/src/roofline_o3
+++ b/roofline/src/roofline_o3
--- a/roofline/src/roofline_o3avx
+++ b/roofline/src/roofline_o3avx