nix wichtiges
This commit is contained in:
parent
352832d463
commit
4d407ada15
9 changed files with 30 additions and 15 deletions
|
@ -31,7 +31,7 @@ aikern_o3avx.a: aikern.c aikern.h
|
|||
gcc -O3 -mavx -c -o aikern_o3avx.o aikern.c
|
||||
ar rcs aikern_o3avx.a aikern_o3avx.o
|
||||
|
||||
# This is the only option that actually uses fma without optimizing the hell out of the kernel
|
||||
# This is the only version that actually uses FMA
|
||||
aikern_avxfma.a: aikern.c aikern.h
|
||||
gcc -O2 -mavx -mfma -c -o aikern_avxfma.o aikern.c
|
||||
ar rcs aikern_avxfma.a aikern_avxfma.o
|
||||
|
|
|
@ -98,15 +98,6 @@ void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size)
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_1_8_vo(double* a, double* b, double* c, size_t size)
|
||||
{
|
||||
double tmp=0.0;
|
||||
for(size_t i=0; i<size; i++) {
|
||||
tmp = a[i] * a[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* === FAILED KERNELS === */
|
||||
|
||||
/*
|
||||
|
@ -170,3 +161,23 @@ void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size)
|
|||
a[i] * a[i] * a[i] * a[i];
|
||||
}
|
||||
}
|
||||
|
||||
void kernel_1_8_vo_dangerous(double* a, double* b, double* c, size_t size)
|
||||
{
|
||||
/* This is the 1/8 AI kernel from the lecture
|
||||
|
||||
=== Problem ===
|
||||
Same as for kernel_1_16_simple_dangerous
|
||||
|
||||
Without volatile, the loop is optimized away completely.
|
||||
With volatile, tmp is written to the stack in every loop iteration
|
||||
(-O3). tmp could be cached or not. This might depend on
|
||||
how large the array is and how the cpu work internally
|
||||
-> unpredictable.
|
||||
*/
|
||||
|
||||
volatile double tmp=0.0;
|
||||
for(size_t i=0; i<size; i++) {
|
||||
tmp = a[i] * a[i];
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,4 +2,7 @@ void kernel_1_16_simple(double* a, double* b, double* c, size_t size);
|
|||
void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size);
|
||||
void kernel_8_1_simple(double* a, double* b, double* c, size_t size);
|
||||
void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size);
|
||||
void kernel_1_8_vo(double* a, double* b, double* c, size_t size);
|
||||
|
||||
void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size);
|
||||
void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size);
|
||||
void kernel_1_8_vo_dangerous(double* a, double* b, double* c, size_t size);
|
||||
|
|
Binary file not shown.
|
@ -118,16 +118,17 @@ int main(int argc, char* argv[]) {
|
|||
|
||||
printf("Filling arrays with dummy values\n");
|
||||
|
||||
#pragma omp parallel for
|
||||
/* #pragma omp parallel for
|
||||
for (size_t j=0; j<size; j++)
|
||||
{
|
||||
a[j] = 1.0;
|
||||
b[j] = 2.0;
|
||||
c[j] = 3.0;
|
||||
}
|
||||
}*/
|
||||
|
||||
double t;
|
||||
printf("Warming up cache\n");
|
||||
double t = mysecond();
|
||||
t = mysecond();
|
||||
testkern(a,b,c, size);
|
||||
t = mysecond() - t;
|
||||
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
|
||||
|
@ -162,7 +163,7 @@ int main(int argc, char* argv[]) {
|
|||
|
||||
printf("1/8 vo\n");
|
||||
t = mysecond();
|
||||
kernel_1_8_vo(a,b,c, size);
|
||||
kernel_1_8_vo_dangerous(a,b,c, size);
|
||||
t = mysecond() - t;
|
||||
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
|
||||
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in a new issue