Kernels should work
This commit is contained in:
parent
68b0b82100
commit
352832d463
8 changed files with 176 additions and 37 deletions
|
@ -1,14 +1,43 @@
|
|||
all: roofline aikern.a
|
||||
all: roofline roofline_avx roofline_o3avx roofline_o3 roofline_avxfma
|
||||
|
||||
roofline: roofline.c aikern.a
|
||||
gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
|
||||
|
||||
roofline_avx: roofline.c aikern_avx.a
|
||||
gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
|
||||
|
||||
roofline_o3avx: roofline.c aikern_o3avx.a
|
||||
gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
|
||||
|
||||
roofline_o3: roofline.c aikern_o3.a
|
||||
gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
|
||||
|
||||
roofline_avxfma: roofline.c aikern_avxfma.a
|
||||
gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
|
||||
|
||||
aikern.a: aikern.c aikern.h
|
||||
gcc -O3 -c -o aikern.o aikern.c
|
||||
gcc -c -o aikern.o aikern.c
|
||||
ar rcs aikern.a aikern.o
|
||||
|
||||
aikern_avx.a: aikern.c aikern.h
|
||||
gcc -mavx -c -o aikern_avx.o aikern.c
|
||||
ar rcs aikern_avx.a aikern_avx.o
|
||||
|
||||
aikern_o3.a: aikern.c aikern.h
|
||||
gcc -O3 -c -o aikern_o3.o aikern.c
|
||||
ar rcs aikern_o3.a aikern_o3.o
|
||||
|
||||
aikern_o3avx.a: aikern.c aikern.h
|
||||
gcc -O3 -mavx -c -o aikern_o3avx.o aikern.c
|
||||
ar rcs aikern_o3avx.a aikern_o3avx.o
|
||||
|
||||
# This is the only option that actually uses fma without optimizing the hell out of the kernel
|
||||
aikern_avxfma.a: aikern.c aikern.h
|
||||
gcc -O2 -mavx -mfma -c -o aikern_avxfma.o aikern.c
|
||||
ar rcs aikern_avxfma.a aikern_avxfma.o
|
||||
|
||||
clean:
|
||||
rm -f roofline
|
||||
rm -f roofline roofline_avx roofline_o3avx roofline_o3 roofline_avxfma
|
||||
rm -f *.o
|
||||
rm -f *.a
|
||||
rm -f *.so
|
||||
|
|
|
@ -2,61 +2,171 @@
|
|||
|
||||
void kernel_1_16_simple(double* a, double* b, double* c, size_t size)
|
||||
{
|
||||
// volatile to prevent compiler from optimizing this away
|
||||
// register to advise compiler to put this in register
|
||||
volatile double tmp = 0.1;
|
||||
|
||||
#pragma omp parallel for
|
||||
for(size_t i=0; i<size; i++){
|
||||
/* COMM: 2 reads = 16 bytes, COMP: 1 FLOP -> AI = 1/16 */
|
||||
tmp = a[i] * b[i];
|
||||
/*
|
||||
COMM: 1 read, 1 write = 16 bytes
|
||||
COMP: 1 FLOP
|
||||
-> AI = 1/16
|
||||
*/
|
||||
a[i] = a[i] * a[i];
|
||||
}
|
||||
}
|
||||
|
||||
void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
|
||||
{
|
||||
/* === Warning ===
|
||||
This is dangerous if FMA is not used/can't be used. Then there
|
||||
are intermediary writes (and reads) to the stack. With FMA:
|
||||
|
||||
vmovsd xmm0,QWORD PTR [rdi+rax*8] # 1 read
|
||||
vmovsd xmm1,QWORD PTR [rdx+rax*8] # 1 read
|
||||
vfmadd132sd xmm0,xmm1,QWORD PTR [rsi+rax*8] # 2 FLOPs + 1 read
|
||||
vmovsd QWORD PTR [rdi+rax*8],xmm0 # 1 write
|
||||
*/
|
||||
|
||||
#pragma omp parallel for
|
||||
for(size_t i=0; i<size; i++){
|
||||
/* COMM: 3 reads, 1 write = 32 bytes, COMP: 2 FLOP -> AI = 2/32 = 1/16 */
|
||||
/*
|
||||
COMM: 3 reads, 1 write = 32 bytes
|
||||
COMP: 2 FLOP
|
||||
-> AI = 2/32 = 1/16
|
||||
*/
|
||||
a[i] = a[i] * b[i] + c[i];
|
||||
}
|
||||
}
|
||||
|
||||
void kernel_8_1_simple(double* a, double* b, double* c, size_t size)
|
||||
{
|
||||
/* === Warning ===
|
||||
Seems correct with -O3, though -O3 does some loop unrolling.
|
||||
|
||||
With -O0 this is dangerous: intermediate values are stored on the stack and
|
||||
who knows if they survive in cache -> unpredictable.
|
||||
|
||||
With AVX and -O2 (not necessarily FMA) best results
|
||||
(obviously correct, only register shuffling). With FMA:
|
||||
|
||||
vmovsd xmm1,QWORD PTR [rdi] # 1 read
|
||||
vmulsd xmm0,xmm1,xmm1 # 1 FLOP+register shuffling
|
||||
vmulsd xmm0,xmm0,xmm1 # 15x 1 FLOP+register shuffling
|
||||
# [...]
|
||||
vmovsd QWORD PTR [rdi-0x8],xmm0 # 1 write
|
||||
*/
|
||||
|
||||
#pragma omp parallel for
|
||||
for(size_t i=0; i<size; i++){
|
||||
/*
|
||||
COMM: 1 read+1 write
|
||||
COMP: 16 FLOPs
|
||||
-> AI = 8
|
||||
*/
|
||||
a[i] = a[i] * a[i] * a[i] *
|
||||
a[i] * a[i] * a[i] *
|
||||
a[i] * a[i] * a[i] *
|
||||
a[i] * a[i] * a[i] *
|
||||
a[i] * a[i] * a[i] *
|
||||
a[i] * a[i];
|
||||
}
|
||||
}
|
||||
|
||||
void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size)
|
||||
{
|
||||
/*
|
||||
With FMA (and -O2):
|
||||
|
||||
vmovsd xmm0,QWORD PTR [rdi] # 1 read
|
||||
vfmadd132sd xmm0,xmm0,xmm0 # 8x 2 FLOPs+register shuffling
|
||||
vmovsd QWORD PTR [rdi-0x8],xmm0 # 1 write
|
||||
*/
|
||||
|
||||
#pragma omp parallel for
|
||||
for(size_t i=0; i<size; i++){
|
||||
/*
|
||||
COMM: 1 read + 1 write
|
||||
COMP: 16 FLOP
|
||||
-> AI = 8
|
||||
*/
|
||||
a[i] = a[i] * a[i] + a[i];
|
||||
a[i] = a[i] * a[i] + a[i];
|
||||
a[i] = a[i] * a[i] + a[i];
|
||||
a[i] = a[i] * a[i] + a[i];
|
||||
a[i] = a[i] * a[i] + a[i];
|
||||
a[i] = a[i] * a[i] + a[i];
|
||||
a[i] = a[i] * a[i] + a[i];
|
||||
a[i] = a[i] * a[i] + a[i];
|
||||
}
|
||||
}
|
||||
|
||||
void kernel_1_8_vo(double* a, double* b, double* c, size_t size)
|
||||
{
|
||||
double tmp=0.0;
|
||||
for(size_t i=0; i<size; i++) {
|
||||
tmp = a[i] * a[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* === FAILED KERNELS === */
|
||||
|
||||
/*
|
||||
These are theoretically correct kernels but all of them yield
|
||||
dangerous results with gcc 5.3.1 (checked the assembly).
|
||||
*/
|
||||
|
||||
void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size)
|
||||
{
|
||||
/* === Problem ===
|
||||
As soon as volatile is used gcc uses the stack for tmp.
|
||||
Even with "register" in place, this results in one additional write per loop iteration.
|
||||
Omitting volatile results in optimizing away the whole loop
|
||||
(checked at -O2, which is necessary for FMA to eventually step in).
|
||||
Maybe the value stays in cache, maybe not. It does not live in a register.
|
||||
|
||||
Even with -O3:
|
||||
movsd xmm0,QWORD PTR [rdi+rax*8] # 1 read
|
||||
mulsd xmm0,QWORD PTR [rsi+rax*8] # 1 read (+ write to xmm0, not counted)
|
||||
# [...] # instructions for loop
|
||||
movsd QWORD PTR [rsp-0x8],xmm0 # malicious write
|
||||
|
||||
Without volatile (-O3):
|
||||
repz ret # that's it
|
||||
*/
|
||||
|
||||
|
||||
// volatile to prevent compiler from optimizing this away
|
||||
// register to advise compiler to put this in register
|
||||
volatile register double tmp = 0.1;
|
||||
|
||||
#pragma omp parallel for
|
||||
for(size_t i=0; i<size; i++){
|
||||
/*
|
||||
COMM: 2 reads = 16 bytes
|
||||
COMP: 1 FLOP
|
||||
-> AI = 1/16
|
||||
*/
|
||||
tmp = a[i] * b[i];
|
||||
}
|
||||
}
|
||||
|
||||
void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size)
|
||||
{
|
||||
/* === Problem ===
|
||||
Same as for kernel_1_16_simple_dangerous
|
||||
*/
|
||||
|
||||
// volatile to prevent compiler from optimizing this away
|
||||
// register to advise compiler to put this in register
|
||||
volatile register double tmp = 0.1;
|
||||
|
||||
#pragma omp parallel for
|
||||
for(size_t i=0; i<size; i++){
|
||||
/* COMM: 1 read, COMP: 8 FLOP -> AI = 8 */
|
||||
/*
|
||||
COMM: 1 read
|
||||
COMP: 8 FLOP
|
||||
-> AI = 8
|
||||
*/
|
||||
tmp = a[i] * a[i] * a[i] * a[i] *
|
||||
a[i] * a[i] * a[i] * a[i];
|
||||
}
|
||||
}
|
||||
|
||||
void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size)
|
||||
{
|
||||
// volatile to prevent compiler from optimizing this away
|
||||
// register to advise compiler to put this in register
|
||||
register volatile double tmp = 0.1;
|
||||
|
||||
#pragma omp parallel for
|
||||
for(size_t i=0; i<size; i++){
|
||||
/* COMM: 1 read, COMP: 8 FLOP -> AI = 8 */
|
||||
tmp = a[i];
|
||||
tmp = tmp * tmp + tmp;
|
||||
tmp = tmp * tmp + tmp;
|
||||
tmp = tmp * tmp + tmp;
|
||||
tmp = tmp * tmp + tmp;
|
||||
}
|
||||
}
|
||||
|
||||
void kernel_1_8_vo(double* a, double* b, double* c, size_t size)
|
||||
{
|
||||
volatile double tmp=0.0;
|
||||
for(size_t i=0; i<size; i++) {
|
||||
tmp = a[i] * a[i];
|
||||
}
|
||||
}
|
||||
|
|
Binary file not shown.
|
@ -229,7 +229,7 @@ static int get_int(char *oparg)
|
|||
|
||||
static void usage()
|
||||
{
|
||||
fprintf(stderr, "USAGE: ./roofline -s <size> -s <runs> \n");
|
||||
fprintf(stderr, "USAGE: ./roofline -s <size> -r <runs> \n");
|
||||
bail_out(NULL);
|
||||
}
|
||||
|
||||
|
|
BIN
roofline/src/roofline_avx
Executable file
BIN
roofline/src/roofline_avx
Executable file
Binary file not shown.
BIN
roofline/src/roofline_avxfma
Executable file
BIN
roofline/src/roofline_avxfma
Executable file
Binary file not shown.
BIN
roofline/src/roofline_o3
Executable file
BIN
roofline/src/roofline_o3
Executable file
Binary file not shown.
BIN
roofline/src/roofline_o3avx
Executable file
BIN
roofline/src/roofline_o3avx
Executable file
Binary file not shown.
Loading…
Reference in a new issue