nix wichtiges
This commit is contained in:
parent
352832d463
commit
4d407ada15
9 changed files with 30 additions and 15 deletions
|
@ -31,7 +31,7 @@ aikern_o3avx.a: aikern.c aikern.h
|
||||||
gcc -O3 -mavx -c -o aikern_o3avx.o aikern.c
|
gcc -O3 -mavx -c -o aikern_o3avx.o aikern.c
|
||||||
ar rcs aikern_o3avx.a aikern_o3avx.o
|
ar rcs aikern_o3avx.a aikern_o3avx.o
|
||||||
|
|
||||||
# This is the only option that actually uses fma without optimizing the hell out of the kernel
|
# This is the only version that actually uses FMA
|
||||||
aikern_avxfma.a: aikern.c aikern.h
|
aikern_avxfma.a: aikern.c aikern.h
|
||||||
gcc -O2 -mavx -mfma -c -o aikern_avxfma.o aikern.c
|
gcc -O2 -mavx -mfma -c -o aikern_avxfma.o aikern.c
|
||||||
ar rcs aikern_avxfma.a aikern_avxfma.o
|
ar rcs aikern_avxfma.a aikern_avxfma.o
|
||||||
|
|
|
@ -98,15 +98,6 @@ void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void kernel_1_8_vo(double* a, double* b, double* c, size_t size)
|
|
||||||
{
|
|
||||||
double tmp=0.0;
|
|
||||||
for(size_t i=0; i<size; i++) {
|
|
||||||
tmp = a[i] * a[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* === FAILED KERNELS === */
|
/* === FAILED KERNELS === */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -170,3 +161,23 @@ void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size)
|
||||||
a[i] * a[i] * a[i] * a[i];
|
a[i] * a[i] * a[i] * a[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void kernel_1_8_vo_dangerous(double* a, double* b, double* c, size_t size)
|
||||||
|
{
|
||||||
|
/* This is the 1/8 AI kernel from the lecture
|
||||||
|
|
||||||
|
=== Problem ===
|
||||||
|
Same as for kernel_1_16_simple_dangerous
|
||||||
|
|
||||||
|
Without volatile the loop is optimized away completely.
|
||||||
|
With volatile, tmp is written to the stack in every loop iteration
|
||||||
|
(-O3). tmp could be cached or not. This might depend on
|
||||||
|
how large the array is and how the CPU works internally
|
||||||
|
-> unpredictable.
|
||||||
|
*/
|
||||||
|
|
||||||
|
volatile double tmp=0.0;
|
||||||
|
for(size_t i=0; i<size; i++) {
|
||||||
|
tmp = a[i] * a[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -2,4 +2,7 @@ void kernel_1_16_simple(double* a, double* b, double* c, size_t size);
|
||||||
void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size);
|
void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size);
|
||||||
void kernel_8_1_simple(double* a, double* b, double* c, size_t size);
|
void kernel_8_1_simple(double* a, double* b, double* c, size_t size);
|
||||||
void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size);
|
void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size);
|
||||||
void kernel_1_8_vo(double* a, double* b, double* c, size_t size);
|
|
||||||
|
void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size);
|
||||||
|
void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size);
|
||||||
|
void kernel_1_8_vo_dangerous(double* a, double* b, double* c, size_t size);
|
||||||
|
|
Binary file not shown.
|
@ -118,16 +118,17 @@ int main(int argc, char* argv[]) {
|
||||||
|
|
||||||
printf("Filling arrays with dummy values\n");
|
printf("Filling arrays with dummy values\n");
|
||||||
|
|
||||||
#pragma omp parallel for
|
/* #pragma omp parallel for
|
||||||
for (size_t j=0; j<size; j++)
|
for (size_t j=0; j<size; j++)
|
||||||
{
|
{
|
||||||
a[j] = 1.0;
|
a[j] = 1.0;
|
||||||
b[j] = 2.0;
|
b[j] = 2.0;
|
||||||
c[j] = 3.0;
|
c[j] = 3.0;
|
||||||
}
|
}*/
|
||||||
|
|
||||||
|
double t;
|
||||||
printf("Warming up cache\n");
|
printf("Warming up cache\n");
|
||||||
double t = mysecond();
|
t = mysecond();
|
||||||
testkern(a,b,c, size);
|
testkern(a,b,c, size);
|
||||||
t = mysecond() - t;
|
t = mysecond() - t;
|
||||||
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
|
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
|
||||||
|
@ -162,7 +163,7 @@ int main(int argc, char* argv[]) {
|
||||||
|
|
||||||
printf("1/8 vo\n");
|
printf("1/8 vo\n");
|
||||||
t = mysecond();
|
t = mysecond();
|
||||||
kernel_1_8_vo(a,b,c, size);
|
kernel_1_8_vo_dangerous(a,b,c, size);
|
||||||
t = mysecond() - t;
|
t = mysecond() - t;
|
||||||
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
|
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
|
||||||
|
|
||||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in a new issue