Kernels should work

This commit is contained in:
Armin Friedl 2016-06-19 22:59:33 +02:00
parent 68b0b82100
commit 352832d463
8 changed files with 176 additions and 37 deletions

View file

@ -1,14 +1,43 @@
all: roofline aikern.a all: roofline roofline_avx roofline_o3avx roofline_o3 roofline_avxfma
roofline: roofline.c aikern.a roofline: roofline.c aikern.a
gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@ gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
roofline_avx: roofline.c aikern_avx.a
gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
roofline_o3avx: roofline.c aikern_o3avx.a
gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
roofline_o3: roofline.c aikern_o3.a
gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
roofline_avxfma: roofline.c aikern_avxfma.a
gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
aikern.a: aikern.c aikern.h aikern.a: aikern.c aikern.h
gcc -O3 -c -o aikern.o aikern.c gcc -c -o aikern.o aikern.c
ar rcs aikern.a aikern.o ar rcs aikern.a aikern.o
aikern_avx.a: aikern.c aikern.h
gcc -mavx -c -o aikern_avx.o aikern.c
ar rcs aikern_avx.a aikern_avx.o
aikern_o3.a: aikern.c aikern.h
gcc -O3 -c -o aikern_o3.o aikern.c
ar rcs aikern_o3.a aikern_o3.o
aikern_o3avx.a: aikern.c aikern.h
gcc -O3 -mavx -c -o aikern_o3avx.o aikern.c
ar rcs aikern_o3avx.a aikern_o3avx.o
# This is the only option that actually uses fma without optimizing the hell out of the kernel
aikern_avxfma.a: aikern.c aikern.h
gcc -O2 -mavx -mfma -c -o aikern_avxfma.o aikern.c
ar rcs aikern_avxfma.a aikern_avxfma.o
clean: clean:
rm -f roofline rm -f roofline roofline_avx roofline_o3avx roofline_o3 roofline_avxfma
rm -f *.o rm -f *.o
rm -f *.a rm -f *.a
rm -f *.so rm -f *.so

View file

@ -2,61 +2,171 @@
void kernel_1_16_simple(double* a, double* b, double* c, size_t size) void kernel_1_16_simple(double* a, double* b, double* c, size_t size)
{ {
// volatile to prevent compiler from optimizing this away
// register to advise compiler to put this in register
volatile double tmp = 0.1;
#pragma omp parallel for #pragma omp parallel for
for(size_t i=0; i<size; i++){ for(size_t i=0; i<size; i++){
/* COMM: 2 reads = 16 bytes, COMP: 1 FLOP -> AI = 1/16 */ /*
tmp = a[i] * b[i]; COMM: 1 reads, 1 write = 16 bytes
COMP: 1 FLOP
-> AI = 1/16
*/
a[i] = a[i] * a[i];
} }
} }
void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size) void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
{ {
/* === Warning ===
This is dangerous if FMA is not used/can't be used. Then there
are intermediary writes (and reads) to the stack. With FMA:
vmovsd xmm0,QWORD PTR [rdi+rax*8] # 1 read
vmovsd xmm1,QWORD PTR [rdx+rax*8] # 1 read
vfmadd132sd xmm0,xmm1,QWORD PTR [rsi+rax*8] # 2 FLOPs + 1 read
vmovsd QWORD PTR [rdi+rax*8],xmm0 # 1 write
*/
#pragma omp parallel for #pragma omp parallel for
for(size_t i=0; i<size; i++){ for(size_t i=0; i<size; i++){
/* COMM: 3 reads, 1 write = 32 bytes, COMP: 2 FLOP -> AI = 2/32 = 1/16 */ /*
COMM: 3 reads, 1 write = 32 bytes
COMP: 2 FLOP
-> AI = 2/32 = 1/16
*/
a[i] = a[i] * b[i] + c[i]; a[i] = a[i] * b[i] + c[i];
} }
} }
void kernel_8_1_simple(double* a, double* b, double* c, size_t size) void kernel_8_1_simple(double* a, double* b, double* c, size_t size)
{ {
/* === Warning ===
Seems correct with -O3. Though -O3 does some loop unrolling.
With -O0 this is dangerous, intermediary values stored on stack
who knows if they survive in cache -> unpredictable.
With AVX and -O2 (not necessarily FMA) best results
(obviously correct, only register shuffling). With FMA:
vmovsd xmm1,QWORD PTR [rdi] # 1 read
vmulsd xmm0,xmm1,xmm1 # 1 FLOP+register shuffling
vmulsd xmm0,xmm0,xmm1 # 15x 1 FLOP+register shuffling
# [...]
vmovsd QWORD PTR [rdi-0x8],xmm0 # 1 write
*/
#pragma omp parallel for
for(size_t i=0; i<size; i++){
/*
COMM: 1 read+1 write
COMP: 16 FLOPs
-> AI = 8
*/
a[i] = a[i] * a[i] * a[i] *
a[i] * a[i] * a[i] *
a[i] * a[i] * a[i] *
a[i] * a[i] * a[i] *
a[i] * a[i] * a[i] *
a[i] * a[i];
}
}
void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size)
{
/*
With FMA (and -O2):
vmovsd xmm0,QWORD PTR [rdi] # 1 read
vfmadd132sd xmm0,xmm0,xmm0 # 8x 2 FLOPs+register shuffling
vmovsd QWORD PTR [rdi-0x8],xmm0 # 1 write
*/
#pragma omp parallel for
for(size_t i=0; i<size; i++){
/*
COMM: 1 read + 1 write
COMP: 16 FLOP
-> AI = 8
*/
a[i] = a[i] * a[i] + a[i];
a[i] = a[i] * a[i] + a[i];
a[i] = a[i] * a[i] + a[i];
a[i] = a[i] * a[i] + a[i];
a[i] = a[i] * a[i] + a[i];
a[i] = a[i] * a[i] + a[i];
a[i] = a[i] * a[i] + a[i];
a[i] = a[i] * a[i] + a[i];
}
}
void kernel_1_8_vo(double* a, double* b, double* c, size_t size)
{
double tmp=0.0;
for(size_t i=0; i<size; i++) {
tmp = a[i] * a[i];
}
}
/* === FAILED KERNELS === */
/*
These are theoretically correct kernels but all of them yield
dangerous results with gcc 5.3.1 (checked the assembly).
*/
void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size)
{
/* === Problem ===
As soon as volatile is used gcc uses the stack for tmp.
Even if "register" is in place. Resulting in one additional write per loop.
Omitting volatile results in optimizing away the whole loop
(checked at -O2, which is necessary for FMA to eventually step in).
Maybe the value stays in cache, maybe not. It does not live a register.
Even with -O3:
movsd xmm0,QWORD PTR [rdi+rax*8] # 1 read
mulsd xmm0,QWORD PTR [rsi+rax*8] # 1 read (+ write to xmm0, not counted)
# [...] # instructions for loop
movsd QWORD PTR [rsp-0x8],xmm0 # malicious write
Without volatile (-O3):
repz ret # that's it
*/
// volatile to prevent compiler from optimizing this away // volatile to prevent compiler from optimizing this away
// register to advise compiler to put this in register // register to advise compiler to put this in register
volatile register double tmp = 0.1; volatile register double tmp = 0.1;
#pragma omp parallel for #pragma omp parallel for
for(size_t i=0; i<size; i++){ for(size_t i=0; i<size; i++){
/* COMM: 1 read, COMP: 8 FLOP -> AI = 8 */ /*
COMM: 2 reads = 16 bytes
COMP: 1 FLOP
-> AI = 1/16
*/
tmp = a[i] * b[i];
}
}
void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size)
{
/* === Problem ==
Same as for kernel_1_16_simple_dangerous
*/
// volatile to prevent compiler from optimizing this away
// register to advise compiler to put this in register
volatile register double tmp = 0.1;
#pragma omp parallel for
for(size_t i=0; i<size; i++){
/*
COMM: 1 read
COMP: 8 FLOP
-> AI = 8
*/
tmp = a[i] * a[i] * a[i] * a[i] * tmp = a[i] * a[i] * a[i] * a[i] *
a[i] * a[i] * a[i] * a[i]; a[i] * a[i] * a[i] * a[i];
} }
} }
void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size)
{
// volatile to prevent compiler from optimizing this away
// register to advise compiler to put this in register
register volatile double tmp = 0.1;
#pragma omp parallel for
for(size_t i=0; i<size; i++){
/* COMM: 1 read, COMP: 8 FLOP -> AI = 8 */
tmp = a[i];
tmp = tmp * tmp + tmp;
tmp = tmp * tmp + tmp;
tmp = tmp * tmp + tmp;
tmp = tmp * tmp + tmp;
}
}
void kernel_1_8_vo(double* a, double* b, double* c, size_t size)
{
volatile double tmp=0.0;
for(size_t i=0; i<size; i++) {
tmp = a[i] * a[i];
}
}

Binary file not shown.

View file

@ -229,7 +229,7 @@ static int get_int(char *oparg)
static void usage() static void usage()
{ {
fprintf(stderr, "USAGE: ./roofline -s <size> -s <runs> \n"); fprintf(stderr, "USAGE: ./roofline -s <size> -r <runs> \n");
bail_out(NULL); bail_out(NULL);
} }

BIN
roofline/src/roofline_avx Executable file

Binary file not shown.

BIN
roofline/src/roofline_avxfma Executable file

Binary file not shown.

BIN
roofline/src/roofline_o3 Executable file

Binary file not shown.

BIN
roofline/src/roofline_o3avx Executable file

Binary file not shown.