nix wichtiges

This commit is contained in:
Armin Friedl 2016-06-20 01:13:23 +02:00
parent 352832d463
commit 4d407ada15
9 changed files with 30 additions and 15 deletions

View file

@ -31,7 +31,7 @@ aikern_o3avx.a: aikern.c aikern.h
gcc -O3 -mavx -c -o aikern_o3avx.o aikern.c gcc -O3 -mavx -c -o aikern_o3avx.o aikern.c
ar rcs aikern_o3avx.a aikern_o3avx.o ar rcs aikern_o3avx.a aikern_o3avx.o
# This is the only option that actually uses fma without optimizing the hell out of the kernel # This is the only version that actually uses FMA
aikern_avxfma.a: aikern.c aikern.h aikern_avxfma.a: aikern.c aikern.h
gcc -O2 -mavx -mfma -c -o aikern_avxfma.o aikern.c gcc -O2 -mavx -mfma -c -o aikern_avxfma.o aikern.c
ar rcs aikern_avxfma.a aikern_avxfma.o ar rcs aikern_avxfma.a aikern_avxfma.o

View file

@ -98,15 +98,6 @@ void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size)
} }
} }
/*
 * Walks a[0..size-1] in ascending order and squares each element.
 *
 * The product is stored in a plain local that is never read afterwards, so
 * the function has no observable effect: nothing is written to a, b or c and
 * nothing is returned. An optimizing compiler is therefore allowed to drop
 * the whole loop.
 *
 * a    - input array, read only; needs at least `size` elements
 * b, c - not used by this kernel
 * size - number of elements of `a` to visit
 */
void kernel_1_8_vo(double* a, double* b, double* c, size_t size)
{
    (void)b;
    (void)c;

    double square = 0.0;
    size_t idx = 0;

    while (idx < size) {
        const double x = a[idx];
        square = x * x; /* one multiply per 8-byte load */
        ++idx;
    }

    (void)square; /* result is intentionally discarded */
}
/* === FAILED KERNELS === */ /* === FAILED KERNELS === */
/* /*
@ -170,3 +161,23 @@ void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size)
a[i] * a[i] * a[i] * a[i]; a[i] * a[i] * a[i] * a[i];
} }
} }
void kernel_1_8_vo_dangerous(double* a, double* b, double* c, size_t size)
{
    /* 1/8 arithmetic-intensity kernel from the lecture: one multiply
       per 8-byte element of a. b and c are not used.

       === Problem ===
       Same issue as kernel_1_16_simple_dangerous.
       Without `volatile` the compiler removes the loop completely.
       With `volatile` the result is written to the stack on every
       iteration (seen with -O3). Whether that store is served from
       cache may depend on the array size and on how the CPU works
       internally, so the measurement is unpredictable.
    */
    (void)b;
    (void)c;

    volatile double sink = 0.0;
    size_t idx = 0;

    while (idx < size) {
        const double x = a[idx];
        sink = x * x; /* volatile store: must be emitted every iteration */
        ++idx;
    }
}

View file

@ -2,4 +2,7 @@ void kernel_1_16_simple(double* a, double* b, double* c, size_t size);
void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size); void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size);
void kernel_8_1_simple(double* a, double* b, double* c, size_t size); void kernel_8_1_simple(double* a, double* b, double* c, size_t size);
void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size); void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size);
void kernel_1_8_vo(double* a, double* b, double* c, size_t size);
void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size);
void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size);
void kernel_1_8_vo_dangerous(double* a, double* b, double* c, size_t size);

Binary file not shown.

View file

@ -118,16 +118,17 @@ int main(int argc, char* argv[]) {
printf("Filling arrays with dummy values\n"); printf("Filling arrays with dummy values\n");
#pragma omp parallel for /* #pragma omp parallel for
for (size_t j=0; j<size; j++) for (size_t j=0; j<size; j++)
{ {
a[j] = 1.0; a[j] = 1.0;
b[j] = 2.0; b[j] = 2.0;
c[j] = 3.0; c[j] = 3.0;
} }*/
double t;
printf("Warming up cache\n"); printf("Warming up cache\n");
double t = mysecond(); t = mysecond();
testkern(a,b,c, size); testkern(a,b,c, size);
t = mysecond() - t; t = mysecond() - t;
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t); printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
@ -162,7 +163,7 @@ int main(int argc, char* argv[]) {
printf("1/8 vo\n"); printf("1/8 vo\n");
t = mysecond(); t = mysecond();
kernel_1_8_vo(a,b,c, size); kernel_1_8_vo_dangerous(a,b,c, size);
t = mysecond() - t; t = mysecond() - t;
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t); printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.