Kernels should work

2016-06-19 22:59:33 +02:00 · 2016-06-19 22:59:33 +02:00 · 352832d463
commit 352832d463
parent 68b0b82100
8 changed files with 176 additions and 37 deletions
--- a/roofline/src/Makefile
+++ b/roofline/src/Makefile
@ -1,14 +1,43 @@
-all: roofline aikern.a
+all: roofline roofline_avx roofline_o3avx roofline_o3 roofline_avxfma
 roofline: roofline.c aikern.a
 	gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
 roofline_avx: roofline.c aikern_avx.a
 	gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
 roofline_o3avx: roofline.c aikern_o3avx.a
 	gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
 roofline_o3: roofline.c aikern_o3.a
 	gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
 roofline_avxfma: roofline.c aikern_avxfma.a
 	gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
 aikern.a: aikern.c aikern.h
-	gcc -O3 -c -o aikern.o aikern.c
+	gcc -c -o aikern.o aikern.c
 	ar rcs aikern.a aikern.o
 aikern_avx.a: aikern.c aikern.h
 	gcc -mavx -c -o aikern_avx.o aikern.c
 	ar rcs aikern_avx.a aikern_avx.o
 aikern_o3.a: aikern.c aikern.h
 	gcc -O3 -c -o aikern_o3.o aikern.c
 	ar rcs aikern_o3.a aikern_o3.o
 aikern_o3avx.a: aikern.c aikern.h
 	gcc -O3 -mavx -c -o aikern_o3avx.o aikern.c
 	ar rcs aikern_o3avx.a aikern_o3avx.o
 # This is the only option that actually uses fma without optimizing the hell out of the kernel
 aikern_avxfma.a: aikern.c aikern.h
 	gcc -O2 -mavx -mfma -c -o aikern_avxfma.o aikern.c
 	ar rcs aikern_avxfma.a aikern_avxfma.o
 clean:
-	rm -f roofline
+	rm -f roofline roofline_avx roofline_o3avx roofline_o3 roofline_avxfma
 	rm -f *.o
 	rm -f *.a
 	rm -f *.so
--- a/roofline/src/aikern.c
+++ b/roofline/src/aikern.c
@ -2,61 +2,171 @@
 void kernel_1_16_simple(double* a, double* b, double* c, size_t size)
 {
  // volatile to prevent compiler from optimizing this away
  // register to advise compiler to put this in register
  volatile double tmp = 0.1;
  #pragma omp parallel for
  for(size_t i=0; i<size; i++){
-	/* COMM: 2 reads = 16 bytes, COMP: 1 FLOP -> AI = 1/16 */
+	/* 
-	tmp = a[i] * b[i];
+	   COMM: 1 reads, 1 write = 16 bytes
 	   COMP: 1 FLOP 
 	   -> AI = 1/16 
 	*/
 	a[i] = a[i] * a[i];
  }
 }
 void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
 {
  /* === Warning ===
 	 This is dangerous if FMA is not used/can't be used. Then there
 	 are intermediary writes (and reads) to the stack. With FMA:
 	 vmovsd xmm0,QWORD PTR [rdi+rax*8]				# 1 read
 	 vmovsd xmm1,QWORD PTR [rdx+rax*8]				# 1 read
 	 vfmadd132sd xmm0,xmm1,QWORD PTR [rsi+rax*8]	# 2 FLOPs + 1 read
 	 vmovsd QWORD PTR [rdi+rax*8],xmm0				# 1 write
  */
  #pragma omp parallel for
  for(size_t i=0; i<size; i++){
-	/* COMM: 3 reads, 1 write = 32 bytes, COMP: 2 FLOP -> AI = 2/32 = 1/16 */
+	/* 
 	   COMM: 3 reads, 1 write = 32 bytes
 	   COMP: 2 FLOP 
 	   -> AI = 2/32 = 1/16 
 	*/
 	a[i] = a[i] * b[i] + c[i];
  }
 }
 void kernel_8_1_simple(double* a, double* b, double* c, size_t size)
 {
  /* === Warning ===
 	 Seems correct with -O3. Though -O3 does some loop unrolling.
 	 With -O0 this is dangerous, intermediary values stored on stack
 	 who knows if they survive in cache -> unpredictable.
 	 With AVX and -O2 (not necessarily FMA) best results 
 	 (obviously correct, only register shuffling). With FMA:
 	 vmovsd xmm1,QWORD PTR [rdi]					# 1 read
 	 vmulsd xmm0,xmm1,xmm1							# 1 FLOP+register shuffling
 	 vmulsd xmm0,xmm0,xmm1							# 15x 1 FLOP+register shuffling
 	 # [...]
 	 vmovsd QWORD PTR [rdi-0x8],xmm0				# 1 write
  */
  #pragma omp parallel for
  for(size_t i=0; i<size; i++){
 	/* 
 	   COMM: 1 read+1 write 
 	   COMP: 16 FLOPs 
 	   -> AI = 8 
 	*/
 	a[i] = a[i] * a[i] * a[i] *
 		   a[i] * a[i] * a[i] *
 		   a[i] * a[i] * a[i] *
 	 	   a[i] * a[i] * a[i] *
 		   a[i] * a[i] * a[i] *
 	       a[i] * a[i];
  }
 }
 void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size)
 {
  /*
 	With FMA (and -O2):
 	vmovsd xmm0,QWORD PTR [rdi]					# 1 read
 	vfmadd132sd xmm0,xmm0,xmm0					# 8x 2 FLOPs+register shuffling
 	vmovsd QWORD PTR [rdi-0x8],xmm0				# 1 write
   */
  #pragma omp parallel for
  for(size_t i=0; i<size; i++){
 	/* 
 	   COMM: 1 read + 1 write
 	   COMP: 16 FLOP
 	   -> AI = 8 
 	*/
 	a[i] = a[i] * a[i] + a[i];
 	a[i] = a[i] * a[i] + a[i];
 	a[i] = a[i] * a[i] + a[i];
 	a[i] = a[i] * a[i] + a[i];
 	a[i] = a[i] * a[i] + a[i];
 	a[i] = a[i] * a[i] + a[i];
 	a[i] = a[i] * a[i] + a[i];
 	a[i] = a[i] * a[i] + a[i];
  }
 }
 void kernel_1_8_vo(double* a, double* b, double* c, size_t size)
 {
  double tmp=0.0;
  for(size_t i=0; i<size; i++) {
 	tmp = a[i] * a[i];
  }
 }
 /* === FAILED KERNELS === */
 /*
  These are theoretically correct kernels but all of them yield
  dangerous results with gcc 5.3.1 (checked the assembly).
 */
 void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size)
 {
  /* === Problem ===
 	As soon as volatile is used gcc uses the stack for tmp.
 	Even if "register" is in place. Resulting in one additional write per loop.
 	Omitting volatile results in optimizing away the whole loop 
 	(checked at -O2, which is necessary for FMA to eventually step in).
 	Maybe the value stays in cache, maybe not. It does not live a register.
 	Even with -O3:
 	movsd  xmm0,QWORD PTR [rdi+rax*8]  # 1 read
 	mulsd  xmm0,QWORD PTR [rsi+rax*8]  # 1 read (+ write to xmm0, not counted)
 	# [...]							   # instructions for loop
 	movsd  QWORD PTR [rsp-0x8],xmm0    # malicious write
 	Without volatile (-O3):
 	repz ret						   # that's it
  */
  // volatile to prevent compiler from optimizing this away
  // register to advise compiler to put this in register
  volatile register double tmp = 0.1;
  #pragma omp parallel for
  for(size_t i=0; i<size; i++){
-	/* COMM: 1 read, COMP: 8 FLOP -> AI = 8	*/
+	/* 
 	   COMM: 2 reads = 16 bytes
 	   COMP: 1 FLOP
 	   -> AI = 1/16 
 	*/
 	tmp = a[i] * b[i];
  }
 }
 void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size)
 {
  /* === Problem ==
 	 Same as for kernel_1_16_simple_dangerous
  */
  // volatile to prevent compiler from optimizing this away
  // register to advise compiler to put this in register
  volatile register double tmp = 0.1;
  #pragma omp parallel for
  for(size_t i=0; i<size; i++){
 	/* 
 	   COMM: 1 read
 	   COMP: 8 FLOP
 	   -> AI = 8	
 	*/
 	tmp = a[i] * a[i] * a[i] * a[i] *
 	      a[i] * a[i] * a[i] * a[i];
  }
 }
 void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size)
 {
  // volatile to prevent compiler from optimizing this away
  // register to advise compiler to put this in register
  register volatile double tmp = 0.1;
  #pragma omp parallel for
  for(size_t i=0; i<size; i++){
 	/* COMM: 1 read, COMP: 8 FLOP -> AI = 8 */
 	tmp = a[i];
 	tmp = tmp * tmp + tmp;
 	tmp = tmp * tmp + tmp;
 	tmp = tmp * tmp + tmp;
 	tmp = tmp * tmp + tmp;
  }
 }
 void kernel_1_8_vo(double* a, double* b, double* c, size_t size)
 {
  volatile double tmp=0.0;
  for(size_t i=0; i<size; i++) {
 	tmp = a[i] * a[i];
  }
 }
--- a/roofline/src/roofline
+++ b/roofline/src/roofline
--- a/roofline/src/roofline.c
+++ b/roofline/src/roofline.c
@ -229,7 +229,7 @@ static int get_int(char *oparg)
 static void usage()
 {
-  fprintf(stderr, "USAGE: ./roofline -s <size> -s <runs> \n");
+  fprintf(stderr, "USAGE: ./roofline -s <size> -r <runs> \n");
  bail_out(NULL);
 }
--- a/roofline/src/roofline_avx
+++ b/roofline/src/roofline_avx
--- a/roofline/src/roofline_avxfma
+++ b/roofline/src/roofline_avxfma
--- a/roofline/src/roofline_o3
+++ b/roofline/src/roofline_o3
--- a/roofline/src/roofline_o3avx
+++ b/roofline/src/roofline_o3avx