Kernels should work
This commit is contained in:
parent
68b0b82100
commit
352832d463
8 changed files with 176 additions and 37 deletions
|
@ -1,14 +1,43 @@
|
||||||
all: roofline aikern.a
|
all: roofline roofline_avx roofline_o3avx roofline_o3 roofline_avxfma
|
||||||
|
|
||||||
roofline: roofline.c aikern.a
|
roofline: roofline.c aikern.a
|
||||||
gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
|
gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
|
||||||
|
|
||||||
|
roofline_avx: roofline.c aikern_avx.a
|
||||||
|
gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
|
||||||
|
|
||||||
|
roofline_o3avx: roofline.c aikern_o3avx.a
|
||||||
|
gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
|
||||||
|
|
||||||
|
roofline_o3: roofline.c aikern_o3.a
|
||||||
|
gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
|
||||||
|
|
||||||
|
roofline_avxfma: roofline.c aikern_avxfma.a
|
||||||
|
gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
|
||||||
|
|
||||||
aikern.a: aikern.c aikern.h
|
aikern.a: aikern.c aikern.h
|
||||||
gcc -O3 -c -o aikern.o aikern.c
|
gcc -c -o aikern.o aikern.c
|
||||||
ar rcs aikern.a aikern.o
|
ar rcs aikern.a aikern.o
|
||||||
|
|
||||||
|
aikern_avx.a: aikern.c aikern.h
|
||||||
|
gcc -mavx -c -o aikern_avx.o aikern.c
|
||||||
|
ar rcs aikern_avx.a aikern_avx.o
|
||||||
|
|
||||||
|
aikern_o3.a: aikern.c aikern.h
|
||||||
|
gcc -O3 -c -o aikern_o3.o aikern.c
|
||||||
|
ar rcs aikern_o3.a aikern_o3.o
|
||||||
|
|
||||||
|
aikern_o3avx.a: aikern.c aikern.h
|
||||||
|
gcc -O3 -mavx -c -o aikern_o3avx.o aikern.c
|
||||||
|
ar rcs aikern_o3avx.a aikern_o3avx.o
|
||||||
|
|
||||||
|
# This is the only option that actually uses fma without optimizing the hell out of the kernel
|
||||||
|
aikern_avxfma.a: aikern.c aikern.h
|
||||||
|
gcc -O2 -mavx -mfma -c -o aikern_avxfma.o aikern.c
|
||||||
|
ar rcs aikern_avxfma.a aikern_avxfma.o
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -f roofline
|
rm -f roofline roofline_avx roofline_o3avx roofline_o3 roofline_avxfma
|
||||||
rm -f *.o
|
rm -f *.o
|
||||||
rm -f *.a
|
rm -f *.a
|
||||||
rm -f *.so
|
rm -f *.so
|
||||||
|
|
|
@ -2,61 +2,171 @@
|
||||||
|
|
||||||
void kernel_1_16_simple(double* a, double* b, double* c, size_t size)
|
void kernel_1_16_simple(double* a, double* b, double* c, size_t size)
|
||||||
{
|
{
|
||||||
// volatile to prevent compiler from optimizing this away
|
|
||||||
// register to advise compiler to put this in register
|
|
||||||
volatile double tmp = 0.1;
|
|
||||||
|
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for(size_t i=0; i<size; i++){
|
for(size_t i=0; i<size; i++){
|
||||||
/* COMM: 2 reads = 16 bytes, COMP: 1 FLOP -> AI = 1/16 */
|
/*
|
||||||
tmp = a[i] * b[i];
|
COMM: 1 reads, 1 write = 16 bytes
|
||||||
|
COMP: 1 FLOP
|
||||||
|
-> AI = 1/16
|
||||||
|
*/
|
||||||
|
a[i] = a[i] * a[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
|
void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
|
||||||
{
|
{
|
||||||
|
/* === Warning ===
|
||||||
|
This is dangerous if FMA is not used/can't be used. Then there
|
||||||
|
are intermediary writes (and reads) to the stack. With FMA:
|
||||||
|
|
||||||
|
vmovsd xmm0,QWORD PTR [rdi+rax*8] # 1 read
|
||||||
|
vmovsd xmm1,QWORD PTR [rdx+rax*8] # 1 read
|
||||||
|
vfmadd132sd xmm0,xmm1,QWORD PTR [rsi+rax*8] # 2 FLOPs + 1 read
|
||||||
|
vmovsd QWORD PTR [rdi+rax*8],xmm0 # 1 write
|
||||||
|
*/
|
||||||
|
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for(size_t i=0; i<size; i++){
|
for(size_t i=0; i<size; i++){
|
||||||
/* COMM: 3 reads, 1 write = 32 bytes, COMP: 2 FLOP -> AI = 2/32 = 1/16 */
|
/*
|
||||||
|
COMM: 3 reads, 1 write = 32 bytes
|
||||||
|
COMP: 2 FLOP
|
||||||
|
-> AI = 2/32 = 1/16
|
||||||
|
*/
|
||||||
a[i] = a[i] * b[i] + c[i];
|
a[i] = a[i] * b[i] + c[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void kernel_8_1_simple(double* a, double* b, double* c, size_t size)
|
void kernel_8_1_simple(double* a, double* b, double* c, size_t size)
|
||||||
{
|
{
|
||||||
|
/* === Warning ===
|
||||||
|
Seems correct with -O3. Though -O3 does some loop unrolling.
|
||||||
|
|
||||||
|
With -O0 this is dangerous, intermediary values stored on stack
|
||||||
|
who knows if they survive in cache -> unpredictable.
|
||||||
|
|
||||||
|
With AVX and -O2 (not necessarily FMA) best results
|
||||||
|
(obviously correct, only register shuffling). With FMA:
|
||||||
|
|
||||||
|
vmovsd xmm1,QWORD PTR [rdi] # 1 read
|
||||||
|
vmulsd xmm0,xmm1,xmm1 # 1 FLOP+register shuffling
|
||||||
|
vmulsd xmm0,xmm0,xmm1 # 15x 1 FLOP+register shuffling
|
||||||
|
# [...]
|
||||||
|
vmovsd QWORD PTR [rdi-0x8],xmm0 # 1 write
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma omp parallel for
|
||||||
|
for(size_t i=0; i<size; i++){
|
||||||
|
/*
|
||||||
|
COMM: 1 read+1 write
|
||||||
|
COMP: 16 FLOPs
|
||||||
|
-> AI = 8
|
||||||
|
*/
|
||||||
|
a[i] = a[i] * a[i] * a[i] *
|
||||||
|
a[i] * a[i] * a[i] *
|
||||||
|
a[i] * a[i] * a[i] *
|
||||||
|
a[i] * a[i] * a[i] *
|
||||||
|
a[i] * a[i] * a[i] *
|
||||||
|
a[i] * a[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
With FMA (and -O2):
|
||||||
|
|
||||||
|
vmovsd xmm0,QWORD PTR [rdi] # 1 read
|
||||||
|
vfmadd132sd xmm0,xmm0,xmm0 # 8x 2 FLOPs+register shuffling
|
||||||
|
vmovsd QWORD PTR [rdi-0x8],xmm0 # 1 write
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma omp parallel for
|
||||||
|
for(size_t i=0; i<size; i++){
|
||||||
|
/*
|
||||||
|
COMM: 1 read + 1 write
|
||||||
|
COMP: 16 FLOP
|
||||||
|
-> AI = 8
|
||||||
|
*/
|
||||||
|
a[i] = a[i] * a[i] + a[i];
|
||||||
|
a[i] = a[i] * a[i] + a[i];
|
||||||
|
a[i] = a[i] * a[i] + a[i];
|
||||||
|
a[i] = a[i] * a[i] + a[i];
|
||||||
|
a[i] = a[i] * a[i] + a[i];
|
||||||
|
a[i] = a[i] * a[i] + a[i];
|
||||||
|
a[i] = a[i] * a[i] + a[i];
|
||||||
|
a[i] = a[i] * a[i] + a[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void kernel_1_8_vo(double* a, double* b, double* c, size_t size)
|
||||||
|
{
|
||||||
|
double tmp=0.0;
|
||||||
|
for(size_t i=0; i<size; i++) {
|
||||||
|
tmp = a[i] * a[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* === FAILED KERNELS === */
|
||||||
|
|
||||||
|
/*
|
||||||
|
These are theoretically correct kernels but all of them yield
|
||||||
|
dangerous results with gcc 5.3.1 (checked the assembly).
|
||||||
|
*/
|
||||||
|
|
||||||
|
void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size)
|
||||||
|
{
|
||||||
|
/* === Problem ===
|
||||||
|
As soon as volatile is used gcc uses the stack for tmp.
|
||||||
|
Even if "register" is in place. Resulting in one additional write per loop.
|
||||||
|
Omitting volatile results in optimizing away the whole loop
|
||||||
|
(checked at -O2, which is necessary for FMA to eventually step in).
|
||||||
|
Maybe the value stays in cache, maybe not. It does not live a register.
|
||||||
|
|
||||||
|
Even with -O3:
|
||||||
|
movsd xmm0,QWORD PTR [rdi+rax*8] # 1 read
|
||||||
|
mulsd xmm0,QWORD PTR [rsi+rax*8] # 1 read (+ write to xmm0, not counted)
|
||||||
|
# [...] # instructions for loop
|
||||||
|
movsd QWORD PTR [rsp-0x8],xmm0 # malicious write
|
||||||
|
|
||||||
|
Without volatile (-O3):
|
||||||
|
repz ret # that's it
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
// volatile to prevent compiler from optimizing this away
|
||||||
|
// register to advise compiler to put this in register
|
||||||
|
volatile register double tmp = 0.1;
|
||||||
|
|
||||||
|
#pragma omp parallel for
|
||||||
|
for(size_t i=0; i<size; i++){
|
||||||
|
/*
|
||||||
|
COMM: 2 reads = 16 bytes
|
||||||
|
COMP: 1 FLOP
|
||||||
|
-> AI = 1/16
|
||||||
|
*/
|
||||||
|
tmp = a[i] * b[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size)
|
||||||
|
{
|
||||||
|
/* === Problem ==
|
||||||
|
Same as for kernel_1_16_simple_dangerous
|
||||||
|
*/
|
||||||
|
|
||||||
// volatile to prevent compiler from optimizing this away
|
// volatile to prevent compiler from optimizing this away
|
||||||
// register to advise compiler to put this in register
|
// register to advise compiler to put this in register
|
||||||
volatile register double tmp = 0.1;
|
volatile register double tmp = 0.1;
|
||||||
|
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for(size_t i=0; i<size; i++){
|
for(size_t i=0; i<size; i++){
|
||||||
/* COMM: 1 read, COMP: 8 FLOP -> AI = 8 */
|
/*
|
||||||
|
COMM: 1 read
|
||||||
|
COMP: 8 FLOP
|
||||||
|
-> AI = 8
|
||||||
|
*/
|
||||||
tmp = a[i] * a[i] * a[i] * a[i] *
|
tmp = a[i] * a[i] * a[i] * a[i] *
|
||||||
a[i] * a[i] * a[i] * a[i];
|
a[i] * a[i] * a[i] * a[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size)
|
|
||||||
{
|
|
||||||
// volatile to prevent compiler from optimizing this away
|
|
||||||
// register to advise compiler to put this in register
|
|
||||||
register volatile double tmp = 0.1;
|
|
||||||
|
|
||||||
#pragma omp parallel for
|
|
||||||
for(size_t i=0; i<size; i++){
|
|
||||||
/* COMM: 1 read, COMP: 8 FLOP -> AI = 8 */
|
|
||||||
tmp = a[i];
|
|
||||||
tmp = tmp * tmp + tmp;
|
|
||||||
tmp = tmp * tmp + tmp;
|
|
||||||
tmp = tmp * tmp + tmp;
|
|
||||||
tmp = tmp * tmp + tmp;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void kernel_1_8_vo(double* a, double* b, double* c, size_t size)
|
|
||||||
{
|
|
||||||
volatile double tmp=0.0;
|
|
||||||
for(size_t i=0; i<size; i++) {
|
|
||||||
tmp = a[i] * a[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
Binary file not shown.
|
@ -229,7 +229,7 @@ static int get_int(char *oparg)
|
||||||
|
|
||||||
static void usage()
|
static void usage()
|
||||||
{
|
{
|
||||||
fprintf(stderr, "USAGE: ./roofline -s <size> -s <runs> \n");
|
fprintf(stderr, "USAGE: ./roofline -s <size> -r <runs> \n");
|
||||||
bail_out(NULL);
|
bail_out(NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
BIN
roofline/src/roofline_avx
Executable file
BIN
roofline/src/roofline_avx
Executable file
Binary file not shown.
BIN
roofline/src/roofline_avxfma
Executable file
BIN
roofline/src/roofline_avxfma
Executable file
Binary file not shown.
BIN
roofline/src/roofline_o3
Executable file
BIN
roofline/src/roofline_o3
Executable file
Binary file not shown.
BIN
roofline/src/roofline_o3avx
Executable file
BIN
roofline/src/roofline_o3avx
Executable file
Binary file not shown.
Loading…
Reference in a new issue