added logging, added fastmath undermining for simple8

This commit is contained in:
Armin Friedl 2016-06-24 03:45:10 +02:00
parent 31bfead054
commit aaf77c78f6
16 changed files with 202 additions and 18 deletions

View file

@ -21,6 +21,7 @@ while i<=64:
xlbl.append(repr(i))
i *= 2
print(xlbl)
# memory
values = []
bandwidth = 10.6

View file

@ -1,7 +1,7 @@
all: bin lib
all: clean bin lib
# Roofline Binary
bin: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_fastmath_o3
bin: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3
mkdir bin
mv $^ bin
@ -20,12 +20,15 @@ roofline_fma_o3: roofline.c aikern_fma_o3.a
roofline_fma_fast_o3: roofline.c aikern_fma_fast_o3.a
gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
roofline_fma_fast_o2: roofline.c aikern_fma_fast_o2.a
gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
roofline_fma_fast_fastmath_o3: roofline.c aikern_fma_fast_fastmath_o3.a
gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
# Static Libraries
lib: aikern.a aikern_o3.a aikern_fma.a aikern_fma_o3.a aikern_fma_fast_o3.a aikern_fma_fast_fastmath_o3.a
lib: aikern.a aikern_o3.a aikern_fma.a aikern_fma_o3.a aikern_fma_fast_o2.a aikern_fma_fast_o3.a aikern_fma_fast_fastmath_o3.a
mkdir lib
mv $^ lib
@ -49,6 +52,11 @@ aikern_fma_o3.a: aikern.c aikern.h
ar rcs $@ aikern_fma_o3.o
rm aikern_fma_o3.o
aikern_fma_fast_o2.a: aikern.c aikern.h
gcc -Wall -Wextra -Wno-unused -O2 -mavx -mfma -fopenmp -Ofast -c -o aikern_fma_fast_o2.o $<
ar rcs $@ aikern_fma_fast_o2.o
rm aikern_fma_fast_o2.o
aikern_fma_fast_o3.a: aikern.c aikern.h
gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -c -o aikern_fma_fast_o3.o $<
ar rcs $@ aikern_fma_fast_o3.o
@ -61,6 +69,9 @@ aikern_fma_fast_fastmath_o3.a: aikern.c aikern.h
# Cleanup
clean:
rm -f *.a
rm -f *.o
rm -f roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3
rm -fR bin
rm -fR lib

View file

@ -25,10 +25,11 @@ kern_result kernel_dispatch(kernel_t kernel,
size_t size, size_t runs)
{
kern_result result;
kern_result result = {0};
result.runs = runs;
result.starts = malloc(sizeof(double)*(runs));
result.ends = malloc(sizeof(double)*(runs));
result.size = size;
if(result.starts==NULL || result.ends==NULL)
{
@ -41,6 +42,7 @@ kern_result kernel_dispatch(kernel_t kernel,
case SIMPLE_1_16:
result.flops = 1;
result.kern_name = "Simple 1/16";
for(size_t r=0; r<runs; r++)
{
result.starts[r] = pin_time();
@ -50,6 +52,7 @@ kern_result kernel_dispatch(kernel_t kernel,
break;
case FMA_1_16:
result.flops = 2;
result.kern_name = "FMA aware 1/16";
for(size_t r=0; r<runs; r++)
{
result.starts[r] = pin_time();
@ -59,6 +62,7 @@ kern_result kernel_dispatch(kernel_t kernel,
break;
case SIMPLE_8_1:
result.flops = 128;
result.kern_name = "Simple 8";
for(size_t r=0; r<runs; r++)
{
result.starts[r] = pin_time();
@ -68,6 +72,7 @@ kern_result kernel_dispatch(kernel_t kernel,
break;
case FMA_8_1:
result.flops = 128;
result.kern_name = "FMA aware 8";
for(size_t r=0; r<runs; r++)
{
result.starts[r] = pin_time();
@ -75,6 +80,16 @@ kern_result kernel_dispatch(kernel_t kernel,
result.ends[r] = pin_time();
}
break;
case SIMPLE_8_1_FASTMATH:
result.flops = 128;
result.kern_name = "Simple 8 undermining fastmath";
for(size_t r=0; r<runs; r++)
{
result.starts[r] = pin_time();
kernel_8_1_simple_fastmath(a, size);
result.ends[r] = pin_time();
}
break;
default:
bail_out("No such kernel %s", kernel);
}
@ -82,7 +97,7 @@ kern_result kernel_dispatch(kernel_t kernel,
return result;
}
void kernel_1_16_simple(double* a, size_t size)
inline void kernel_1_16_simple(double* a, size_t size)
{
#pragma omp parallel for
for(size_t i=0; i<size; i++)
@ -91,7 +106,7 @@ void kernel_1_16_simple(double* a, size_t size)
}
}
void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
inline void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
{
#pragma omp parallel for
for(size_t i=0; i<size; i++)
@ -100,7 +115,7 @@ void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
}
}
void kernel_8_1_simple(double* a, size_t size)
inline void kernel_8_1_simple(double* a, size_t size)
{
#pragma omp parallel for
for(size_t i=0; i<size; i++)
@ -112,7 +127,18 @@ void kernel_8_1_simple(double* a, size_t size)
}
}
void kernel_8_1_fuseaware(double* a, size_t size)
/*
 * Simple 8/1 kernel variant: for every index i in [0, size) the element
 * a[i] is repeatedly replaced by its own square, in place.
 *
 * a    - array of doubles, read and overwritten element by element
 * size - number of elements of a that are processed
 *
 * The loop over the elements is an OpenMP "parallel for".
 */
inline void kernel_8_1_simple_fastmath(double* a, size_t size)
{
#pragma omp parallel for
for(size_t i=0; i<size; i++)
{
// Each repeated statement is a separate squaring of a[i].
// NOTE(review): REP100/REP20/REP8 are defined outside this block;
// presumably they repeat their argument 100, 20 and 8 times, i.e.
// 128 multiplications per element, which would match the flops = 128
// that kernel_dispatch records for this kernel - confirm against the
// macro definitions.
REP100(a[i]=a[i]*a[i];);
REP20(a[i]=a[i]*a[i];);
REP8(a[i]=a[i]*a[i];);
}
}
inline void kernel_8_1_fuseaware(double* a, size_t size)
{
#pragma omp parallel for
for(size_t i=0; i<size; i++)

View file

@ -5,11 +5,13 @@ typedef struct {
size_t runs; // also # of start-/endtimes
double* starts; // starttimes
double* ends; // endtimes
int flops; //flops per run
int flops; // flops per iteration
char* kern_name;
size_t size; // size of arrays handeld
} kern_result;
typedef enum {
SIMPLE_1_16, FMA_1_16, SIMPLE_8_1, FMA_8_1
SIMPLE_1_16, FMA_1_16, SIMPLE_8_1, FMA_8_1, SIMPLE_8_1_FASTMATH
} kernel_t;
/**
@ -152,6 +154,43 @@ void kernel_8_1_simple(double* a, size_t size);
*/
void kernel_8_1_fuseaware(double* a, size_t size);
/**
* @brief A simple 8/1 operational intensity kernel which
* undermines evil fastmath optimization
* @param a An array with double values of size param size
* @param size Number of elements in the array a
*
* === Warning ===
* Don't use with anything other than -Ofast / -ffast-math
*
* === Description ===
* Uses a simple floating point operation that more closely resembles
* that of 8_1_fuseaware:
* a[i] = a[i]*a[i]; # 128x
*
* Runs in a parallelized for loop.
*
* === Analysis ===
* -Ofast/-ffast-math does not preserve strict IEEE compliance. It
* therefore is allowed to ignore non-associativity of floating
* point operations.
*
* x = x*x*x*x*x*x*x*x; is optimized to x *= x;x *= x;x *= x;
*
* This clearly breaks the whole OI calculation of 8_1_simple.
*
* This kernel does not introduce more byte write-outs than
* 8_1_simple at a high optimization level since a[i] is held
* in a register and only written out once at the end of an
* iteration.
*
*
* === Optimization ===
* Nothing special
*/
void kernel_8_1_simple_fastmath(double* a, size_t size);
/********************************************
* Kernels which potentially compile to *

5
roofline/src/log/fma16 Normal file
View file

@ -0,0 +1,5 @@
run,start,end,delta,GFLOP/s
1,1466732365.5426,1466732366.1946,0.6520,0.9202
2,1466732366.1946,1466732366.8410,0.6464,0.9282
3,1466732366.8410,1466732367.4875,0.6465,0.9281
4,1466732367.4875,1466732368.1370,0.6495,0.9238

5
roofline/src/log/fma8 Normal file
View file

@ -0,0 +1,5 @@
run,start,end,delta,GFLOP/s
1,1466732371.5080,1466732373.2547,1.7468,21.9836
2,1466732373.2547,1466732375.0033,1.7486,21.9603
3,1466732375.0033,1466732376.7499,1.7465,21.9864
4,1466732376.7499,1466732378.4990,1.7492,21.9534

View file

@ -0,0 +1,5 @@
run,start,end,delta,GFLOP/s
1,1466732363.5905,1466732363.9157,0.3252,0.9226
2,1466732363.9157,1466732364.2410,0.3253,0.9223
3,1466732364.2410,1466732364.5659,0.3249,0.9234
4,1466732364.5659,1466732364.8925,0.3266,0.9186

5
roofline/src/log/simple8 Normal file
View file

@ -0,0 +1,5 @@
run,start,end,delta,GFLOP/s
1,1466732368.4592,1466732368.7843,0.3251,118.1190
2,1466732368.7843,1466732369.1090,0.3247,118.2713
3,1466732369.1090,1466732369.4353,0.3263,117.6684
4,1466732369.4353,1466732369.7596,0.3243,118.4045

View file

@ -0,0 +1,5 @@
run,start,end,delta,GFLOP/s
1,1466732382.7768,1466732387.0594,4.2827,8.9664
2,1466732387.0594,1466732391.3489,4.2895,8.9521
3,1466732391.3489,1466732395.6322,4.2834,8.9649
4,1466732395.6322,1466732399.9114,4.2791,8.9738

Binary file not shown.

View file

@ -3,6 +3,7 @@
# include <unistd.h>
# include <ctype.h>
# include <sys/time.h>
# include <sys/stat.h>
# include <errno.h>
# include <string.h>
# include <stdint.h>
@ -67,7 +68,7 @@ static void testkern(double* a, double* b, double* c, size_t size);
/**
* @brief pretty prints a kern_result
*/
static void print_kernresult(kern_result* result);
static void print_kernresult(kern_result* result, const char* logname);
int main(int argc, char* argv[]) {
prog_name = argv[0];
@ -108,6 +109,7 @@ int main(int argc, char* argv[]) {
size_t size = get_size(size_arg);
int runs = get_int(runs_arg);
// Allocating arrays
printf("Will run with array sizes of %zu elements\n", size);
printf("Will calculate min, max, avg for %d runs\n", runs);
double* a = malloc(sizeof(double)*(size));
@ -120,6 +122,7 @@ int main(int argc, char* argv[]) {
printf("Allocated 3 arrays (3*%.2f MB = %.2f GB)\n", (sizeof(double)*(size)/1024.0/1024.0), (sizeof(double)*(size)*3/1024.0/1024.0/1024));
printf("Filling arrays with dummy values. This will also warm the cache\n");
// Filling arrays with arbitrary numbers
#pragma omp parallel for
for (size_t j=0; j<size; j++)
{
@ -134,13 +137,26 @@ int main(int argc, char* argv[]) {
testkern(a,b,c, size);
t = pin_time() - t;
printf("Machine heating took %.4f microseconds = %.4f seconds (with test OI kernel)\n", (t*1.0E6), t);
printf("Starting tests...\n\n\n");
// Executing kernels
kern_result simple16 = kernel_dispatch(SIMPLE_1_16, a, b, c, size, runs);
kern_result fma16 = kernel_dispatch(FMA_1_16, a, b, c, size, runs);
kern_result simple8 = kernel_dispatch(SIMPLE_8_1, a, b, c, size, runs);
kern_result fma8 = kernel_dispatch(FMA_8_1, a, b, c, size, runs);
kern_result simple8fm = kernel_dispatch(SIMPLE_8_1_FASTMATH, a, b, c, size, runs);
print_kernresult(&simple16);
// Freeing arrays
free(a);
free(b);
free(c);
// Printing results
print_kernresult(&simple16, "simple16");
print_kernresult(&fma16, "fma16");
print_kernresult(&simple8, "simple8");
print_kernresult(&fma8, "fma8");
print_kernresult(&simple8fm, "simple8fastmath");
exit(EXIT_SUCCESS);
}
@ -233,6 +249,72 @@ static void bail_out(char* fmt, ...)
exit(EXIT_FAILURE);
}
static void print_kernresult(kern_result* result){
return;
/**
 * @brief Pretty prints a kern_result and logs its runs as CSV
 * @param result  The kernel result to report; result->starts and
 *                result->ends are read for result->runs entries
 * @param logname File name of the log, created inside the "log" directory
 *
 * Writes the header "run,start,end,delta,GFLOP/s" followed by one row
 * per run (including run 0) to log/<logname>, and prints the kernel
 * name, the flop and run counts and the min/max/avg run time to stdout.
 *
 * Any failure (no runs, log directory or file cannot be created, path
 * too long, write or close error) is reported through bail_out().
 */
static void print_kernresult(kern_result* result, const char* logname)
{
    /* Without at least one run there is no initial value for min/max
       and starts[0]/ends[0] must not be read. */
    if(result->runs == 0)
    {
        bail_out("No runs recorded for %s", result->kern_name);
    }

    struct stat st = {0};
    if(stat("log", &st) == -1)
    {
        if(mkdir("log", 0700))
        {
            bail_out("Couldn't create log directory for %s", result->kern_name);
        }
    }

    /* Generous buffer plus an explicit truncation check, so a longer
       log name can never silently open a differently named file. */
    char logpath[256];
    int pathlen = snprintf(logpath, sizeof(logpath), "%s/%s", "log", logname);
    if(pathlen < 0 || (size_t) pathlen >= sizeof(logpath))
    {
        bail_out("Log file name too long for %s", result->kern_name);
    }

    FILE* log = fopen(logpath, "w");
    if(log == NULL)
    {
        bail_out("Couldn't open log file for %s", result->kern_name);
    }

    if(fputs("run,start,end,delta,GFLOP/s\n", log) == EOF)
    {
        fclose(log);
        bail_out("Couldn't write header to log file");
    }

    printf("=== %s ===\n", result->kern_name);

    double min = 0.0;
    double max = 0.0;
    double sum = 0.0;

    /* Starts at run 0 so that every run that contributes to the
       statistics also shows up in the log. */
    for(size_t i=0; i<result->runs; i++)
    {
        double delta = result->ends[i] - result->starts[i];

        sum += delta;
        if(i == 0 || delta < min) min = delta;
        if(i == 0 || delta > max) max = delta;

        /* Computed in double so the product cannot wrap around. */
        double gflops = (((double) result->flops * (double) result->size) / delta) / 1.0E9;

        /* fprintf signals failure with a negative value, which is not
           guaranteed to be EOF. */
        if(fprintf(log, "%zu,%.4f,%.4f,%.4f,%.4f\n",
                   i, result->starts[i],
                   result->ends[i], delta,
                   gflops) < 0)
        {
            fclose(log);
            bail_out("Couldn't write to log file");
        }
    }

    printf("%d flop(s) per run\t %zu run(s)\n\n", result->flops, result->runs);
    printf("Min: %.4f \t Max: %.4f \t Avg: %.4f\n", min, max, (sum/result->runs));
    printf("\n\n\n");

    /* fclose flushes buffered rows; a failure here means lost data. */
    if(fclose(log))
    {
        bail_out("Couldn't close log file for %s", result->kern_name);
    }
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.