diff --git a/roofline/src/Makefile b/roofline/src/Makefile index 6a0ad16..742bf38 100644 --- a/roofline/src/Makefile +++ b/roofline/src/Makefile @@ -1,14 +1,43 @@ -all: roofline aikern.a +all: roofline roofline_avx roofline_o3avx roofline_o3 roofline_avxfma roofline: roofline.c aikern.a gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@ +roofline_avx: roofline.c aikern_avx.a + gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@ + +roofline_o3avx: roofline.c aikern_o3avx.a + gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@ + +roofline_o3: roofline.c aikern_o3.a + gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@ + +roofline_avxfma: roofline.c aikern_avxfma.a + gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@ + aikern.a: aikern.c aikern.h - gcc -O3 -c -o aikern.o aikern.c + gcc -c -o aikern.o aikern.c ar rcs aikern.a aikern.o +aikern_avx.a: aikern.c aikern.h + gcc -mavx -c -o aikern_avx.o aikern.c + ar rcs aikern_avx.a aikern_avx.o + +aikern_o3.a: aikern.c aikern.h + gcc -O3 -c -o aikern_o3.o aikern.c + ar rcs aikern_o3.a aikern_o3.o + +aikern_o3avx.a: aikern.c aikern.h + gcc -O3 -mavx -c -o aikern_o3avx.o aikern.c + ar rcs aikern_o3avx.a aikern_o3avx.o + +# This is the only option that actually uses fma without optimizing the hell out of the kernel +aikern_avxfma.a: aikern.c aikern.h + gcc -O2 -mavx -mfma -c -o aikern_avxfma.o aikern.c + ar rcs aikern_avxfma.a aikern_avxfma.o + clean: - rm -f roofline + rm -f roofline roofline_avx roofline_o3avx roofline_o3 roofline_avxfma rm -f *.o rm -f *.a rm -f *.so diff --git a/roofline/src/aikern.c b/roofline/src/aikern.c index 933ea6a..64e5c59 100644 --- a/roofline/src/aikern.c +++ b/roofline/src/aikern.c @@ -2,61 +2,171 @@ void kernel_1_16_simple(double* a, double* b, double* c, size_t size) { - // volatile to prevent compiler from optimizing this away - // register to advise compiler to put this in register - volatile double tmp = 0.1; - #pragma omp parallel for for(size_t i=0; i AI = 1/16 */ - tmp = a[i] * b[i]; + /* + COMM: 1 reads, 1 write = 16 bytes + COMP: 1 FLOP + -> AI = 1/16 + */ + a[i] = a[i] * a[i]; } } void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size) { + /* === Warning === + This is dangerous if FMA is not used/can't be used. Then there + are intermediary writes (and reads) to the stack. With FMA: + + vmovsd xmm0,QWORD PTR [rdi+rax*8] # 1 read + vmovsd xmm1,QWORD PTR [rdx+rax*8] # 1 read + vfmadd132sd xmm0,xmm1,QWORD PTR [rsi+rax*8] # 2 FLOPs + 1 read + vmovsd QWORD PTR [rdi+rax*8],xmm0 # 1 write + */ + #pragma omp parallel for for(size_t i=0; i AI = 2/32 = 1/16 */ + /* + COMM: 3 reads, 1 write = 32 bytes + COMP: 2 FLOP + -> AI = 2/32 = 1/16 + */ a[i] = a[i] * b[i] + c[i]; } } void kernel_8_1_simple(double* a, double* b, double* c, size_t size) { + /* === Warning === + Seems correct with -O3. Though -O3 does some loop unrolling. + + With -O0 this is dangerous, intermediary values stored on stack + who knows if they survive in cache -> unpredictable. + + With AVX and -O2 (not necessarily FMA) best results + (obviously correct, only register shuffling). With FMA: + + vmovsd xmm1,QWORD PTR [rdi] # 1 read + vmulsd xmm0,xmm1,xmm1 # 1 FLOP+register shuffling + vmulsd xmm0,xmm0,xmm1 # 15x 1 FLOP+register shuffling + # [...] + vmovsd QWORD PTR [rdi-0x8],xmm0 # 1 write + */ + + #pragma omp parallel for + for(size_t i=0; i AI = 8 + */ + a[i] = a[i] * a[i] * a[i] * + a[i] * a[i] * a[i] * + a[i] * a[i] * a[i] * + a[i] * a[i] * a[i] * + a[i] * a[i] * a[i] * + a[i] * a[i]; + } +} + +void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size) +{ + /* + With FMA (and -O2): + + vmovsd xmm0,QWORD PTR [rdi] # 1 read + vfmadd132sd xmm0,xmm0,xmm0 # 8x 2 FLOPs+register shuffling + vmovsd QWORD PTR [rdi-0x8],xmm0 # 1 write + */ + + #pragma omp parallel for + for(size_t i=0; i AI = 8 + */ + a[i] = a[i] * a[i] + a[i]; + a[i] = a[i] * a[i] + a[i]; + a[i] = a[i] * a[i] + a[i]; + a[i] = a[i] * a[i] + a[i]; + a[i] = a[i] * a[i] + a[i]; + a[i] = a[i] * a[i] + a[i]; + a[i] = a[i] * a[i] + a[i]; + a[i] = a[i] * a[i] + a[i]; + } +} + +void kernel_1_8_vo(double* a, double* b, double* c, size_t size) +{ + double tmp=0.0; + for(size_t i=0; i AI = 1/16 + */ + tmp = a[i] * b[i]; + } +} + +void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size) +{ + /* === Problem == + Same as for kernel_1_16_simple_dangerous + */ + // volatile to prevent compiler from optimizing this away // register to advise compiler to put this in register volatile register double tmp = 0.1; #pragma omp parallel for for(size_t i=0; i AI = 8 */ + /* + COMM: 1 read + COMP: 8 FLOP + -> AI = 8 + */ tmp = a[i] * a[i] * a[i] * a[i] * a[i] * a[i] * a[i] * a[i]; } } - -void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size) -{ - // volatile to prevent compiler from optimizing this away - // register to advise compiler to put this in register - register volatile double tmp = 0.1; - - #pragma omp parallel for - for(size_t i=0; i AI = 8 */ - tmp = a[i]; - tmp = tmp * tmp + tmp; - tmp = tmp * tmp + tmp; - tmp = tmp * tmp + tmp; - tmp = tmp * tmp + tmp; - } -} - -void kernel_1_8_vo(double* a, double* b, double* c, size_t size) -{ - volatile double tmp=0.0; - for(size_t i=0; i -s \n"); + fprintf(stderr, "USAGE: ./roofline -s -r \n"); bail_out(NULL); } diff --git a/roofline/src/roofline_avx b/roofline/src/roofline_avx new file mode 100755 index 0000000..4f798b1 Binary files /dev/null and b/roofline/src/roofline_avx differ diff --git a/roofline/src/roofline_avxfma b/roofline/src/roofline_avxfma new file mode 100755 index 0000000..cc9fee9 Binary files /dev/null and b/roofline/src/roofline_avxfma differ diff --git a/roofline/src/roofline_o3 b/roofline/src/roofline_o3 new file mode 100755 index 0000000..8dd3a95 Binary files /dev/null and b/roofline/src/roofline_o3 differ diff --git a/roofline/src/roofline_o3avx b/roofline/src/roofline_o3avx new file mode 100755 index 0000000..1b23d6f Binary files /dev/null and b/roofline/src/roofline_o3avx differ