added logging, added fastmath undermining for simple8
This commit is contained in:
parent
31bfead054
commit
aaf77c78f6
16 changed files with 202 additions and 18 deletions
|
@ -21,6 +21,7 @@ while i<=64:
|
|||
xlbl.append(repr(i))
|
||||
i *= 2
|
||||
|
||||
print(xlbl)
|
||||
# memory
|
||||
values = []
|
||||
bandwidth = 10.6
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
all: bin lib
|
||||
all: clean bin lib
|
||||
|
||||
# Roofline Binary
|
||||
bin: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_fastmath_o3
|
||||
bin: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3
|
||||
mkdir bin
|
||||
mv $^ bin
|
||||
|
||||
|
@ -20,12 +20,15 @@ roofline_fma_o3: roofline.c aikern_fma_o3.a
|
|||
roofline_fma_fast_o3: roofline.c aikern_fma_fast_o3.a
|
||||
gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
|
||||
|
||||
roofline_fma_fast_o2: roofline.c aikern_fma_fast_o2.a
|
||||
gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
|
||||
|
||||
roofline_fma_fast_fastmath_o3: roofline.c aikern_fma_fast_fastmath_o3.a
|
||||
gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
|
||||
|
||||
|
||||
# Static Libraries
|
||||
lib: aikern.a aikern_o3.a aikern_fma.a aikern_fma_o3.a aikern_fma_fast_o3.a aikern_fma_fast_fastmath_o3.a
|
||||
lib: aikern.a aikern_o3.a aikern_fma.a aikern_fma_o3.a aikern_fma_fast_o2.a aikern_fma_fast_o3.a aikern_fma_fast_fastmath_o3.a
|
||||
mkdir lib
|
||||
mv $^ lib
|
||||
|
||||
|
@ -49,6 +52,11 @@ aikern_fma_o3.a: aikern.c aikern.h
|
|||
ar rcs $@ aikern_fma_o3.o
|
||||
rm aikern_fma_o3.o
|
||||
|
||||
# Static kernel library: AVX/FMA, fast-math, optimization level -O2.
# GCC only honours the LAST -O option on the command line, so combining
# -O2 with -Ofast silently built this library at -Ofast (-O3 + fast-math),
# making it a duplicate of aikern_fma_fast_o3.a. Use -ffast-math instead
# so the value-unsafe math optimizations are enabled while staying at -O2.
aikern_fma_fast_o2.a: aikern.c aikern.h
	gcc -Wall -Wextra -Wno-unused -O2 -mavx -mfma -fopenmp -ffast-math -c -o aikern_fma_fast_o2.o $<
	ar rcs $@ aikern_fma_fast_o2.o
	rm aikern_fma_fast_o2.o
|
||||
|
||||
aikern_fma_fast_o3.a: aikern.c aikern.h
|
||||
gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -c -o aikern_fma_fast_o3.o $<
|
||||
ar rcs $@ aikern_fma_fast_o3.o
|
||||
|
@ -61,6 +69,9 @@ aikern_fma_fast_fastmath_o3.a: aikern.c aikern.h
|
|||
|
||||
# Cleanup
|
||||
clean:
|
||||
rm -f *.a
|
||||
rm -f *.o
|
||||
rm -f roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3
|
||||
rm -fR bin
|
||||
rm -fR lib
|
||||
|
||||
|
|
|
@ -25,10 +25,11 @@ kern_result kernel_dispatch(kernel_t kernel,
|
|||
size_t size, size_t runs)
|
||||
{
|
||||
|
||||
kern_result result;
|
||||
kern_result result = {0};
|
||||
result.runs = runs;
|
||||
result.starts = malloc(sizeof(double)*(runs));
|
||||
result.ends = malloc(sizeof(double)*(runs));
|
||||
result.size = size;
|
||||
|
||||
if(result.starts==NULL || result.ends==NULL)
|
||||
{
|
||||
|
@ -41,6 +42,7 @@ kern_result kernel_dispatch(kernel_t kernel,
|
|||
|
||||
case SIMPLE_1_16:
|
||||
result.flops = 1;
|
||||
result.kern_name = "Simple 1/16";
|
||||
for(size_t r=0; r<runs; r++)
|
||||
{
|
||||
result.starts[r] = pin_time();
|
||||
|
@ -50,6 +52,7 @@ kern_result kernel_dispatch(kernel_t kernel,
|
|||
break;
|
||||
case FMA_1_16:
|
||||
result.flops = 2;
|
||||
result.kern_name = "FMA aware 1/16";
|
||||
for(size_t r=0; r<runs; r++)
|
||||
{
|
||||
result.starts[r] = pin_time();
|
||||
|
@ -59,6 +62,7 @@ kern_result kernel_dispatch(kernel_t kernel,
|
|||
break;
|
||||
case SIMPLE_8_1:
|
||||
result.flops = 128;
|
||||
result.kern_name = "Simple 8";
|
||||
for(size_t r=0; r<runs; r++)
|
||||
{
|
||||
result.starts[r] = pin_time();
|
||||
|
@ -68,6 +72,7 @@ kern_result kernel_dispatch(kernel_t kernel,
|
|||
break;
|
||||
case FMA_8_1:
|
||||
result.flops = 128;
|
||||
result.kern_name = "FMA aware 8";
|
||||
for(size_t r=0; r<runs; r++)
|
||||
{
|
||||
result.starts[r] = pin_time();
|
||||
|
@ -75,6 +80,16 @@ kern_result kernel_dispatch(kernel_t kernel,
|
|||
result.ends[r] = pin_time();
|
||||
}
|
||||
break;
|
||||
case SIMPLE_8_1_FASTMATH:
|
||||
result.flops = 128;
|
||||
result.kern_name = "Simple 8 undermining fastmath";
|
||||
for(size_t r=0; r<runs; r++)
|
||||
{
|
||||
result.starts[r] = pin_time();
|
||||
kernel_8_1_simple_fastmath(a, size);
|
||||
result.ends[r] = pin_time();
|
||||
}
|
||||
break;
|
||||
default:
|
||||
bail_out("No such kernel %s", kernel);
|
||||
}
|
||||
|
@ -82,7 +97,7 @@ kern_result kernel_dispatch(kernel_t kernel,
|
|||
return result;
|
||||
}
|
||||
|
||||
void kernel_1_16_simple(double* a, size_t size)
|
||||
inline void kernel_1_16_simple(double* a, size_t size)
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for(size_t i=0; i<size; i++)
|
||||
|
@ -91,7 +106,7 @@ void kernel_1_16_simple(double* a, size_t size)
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
|
||||
inline void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for(size_t i=0; i<size; i++)
|
||||
|
@ -100,7 +115,7 @@ void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_8_1_simple(double* a, size_t size)
|
||||
inline void kernel_8_1_simple(double* a, size_t size)
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for(size_t i=0; i<size; i++)
|
||||
|
@ -112,7 +127,18 @@ void kernel_8_1_simple(double* a, size_t size)
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_8_1_fuseaware(double* a, size_t size)
|
||||
/*
 * Repeatedly squares every element of a in place.
 * The outer loop over the size elements is split across threads by the
 * OpenMP "parallel for" pragma; each iteration only touches a[i].
 */
inline void kernel_8_1_simple_fastmath(double* a, size_t size)
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for(size_t i=0; i<size; i++)
|
||||
{
|
||||
/*
 * Every squaring consumes the result of the previous one.
 * REP100/REP20/REP8 are defined outside this view; presumably they
 * repeat their argument 100, 20 and 8 times, i.e. 128 multiplications
 * per element -- TODO confirm against the macro definitions.
 */
REP100(a[i]=a[i]*a[i];);
|
||||
REP20(a[i]=a[i]*a[i];);
|
||||
REP8(a[i]=a[i]*a[i];);
|
||||
}
|
||||
}
|
||||
|
||||
inline void kernel_8_1_fuseaware(double* a, size_t size)
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for(size_t i=0; i<size; i++)
|
||||
|
|
|
@ -5,11 +5,13 @@ typedef struct {
|
|||
size_t runs; // also # of start-/endtimes
|
||||
double* starts; // starttimes
|
||||
double* ends; // endtimes
|
||||
int flops; //flops per run
|
||||
int flops; // flops per iteration
|
||||
char* kern_name;
|
||||
size_t size; // size of arrays handeld
|
||||
} kern_result;
|
||||
|
||||
typedef enum {
|
||||
SIMPLE_1_16, FMA_1_16, SIMPLE_8_1, FMA_8_1
|
||||
SIMPLE_1_16, FMA_1_16, SIMPLE_8_1, FMA_8_1, SIMPLE_8_1_FASTMATH
|
||||
} kernel_t;
|
||||
|
||||
/**
|
||||
|
@ -152,6 +154,43 @@ void kernel_8_1_simple(double* a, size_t size);
|
|||
*/
|
||||
void kernel_8_1_fuseaware(double* a, size_t size);
|
||||
|
||||
/**
|
||||
* @brief A simple 8/1 operational intensity kernel which
|
||||
* undermines evil fastmath optimization
|
||||
* @param a An array with double values of size param size
|
||||
* @param size Number of elements in the array a
|
||||
* @return Nothing; the array a is overwritten in place
|
||||
*
|
||||
* === Warning ===
|
||||
* Don't use with anything other than -Ofast / -ffast-math
|
||||
*
|
||||
* === Description ===
|
||||
* Uses a simple floating point operation that more closely resembles
|
||||
* that of 8_1_fuseaware:
|
||||
* a[i] = a[i]*a[i]; # 128x
|
||||
*
|
||||
* Runs in a parallelized for loop.
|
||||
*
|
||||
* === Analysis ===
|
||||
* -Ofast/-ffast-math does not preserve strict IEEE compliance. It
|
||||
* therefore is allowed to ignore non-associativity of floating
|
||||
* point operations.
|
||||
*
|
||||
* x = x*x*x*x*x*x*x*x; is optimized to x *= x;x *= x;x *= x;
|
||||
*
|
||||
* This clearly breaks the whole OI calculation of 8_1_simple.
|
||||
*
|
||||
* This kernel does not introduce more byte write-outs than
|
||||
* 8_1_simple at a high optimization level since a[i] is held
|
||||
* in a register and only written out once at the end of an
|
||||
* iteration.
|
||||
*
|
||||
*
|
||||
* === Optimization ===
|
||||
* Nothing special
|
||||
*/
|
||||
void kernel_8_1_simple_fastmath(double* a, size_t size);
|
||||
|
||||
|
||||
/********************************************
|
||||
* Kernels which potentially compile to *
|
||||
|
|
5
roofline/src/log/fma16
Normal file
5
roofline/src/log/fma16
Normal file
|
@ -0,0 +1,5 @@
|
|||
run,start,end,delta,GFLOP/s
|
||||
1,1466732365.5426,1466732366.1946,0.6520,0.9202
|
||||
2,1466732366.1946,1466732366.8410,0.6464,0.9282
|
||||
3,1466732366.8410,1466732367.4875,0.6465,0.9281
|
||||
4,1466732367.4875,1466732368.1370,0.6495,0.9238
|
5
roofline/src/log/fma8
Normal file
5
roofline/src/log/fma8
Normal file
|
@ -0,0 +1,5 @@
|
|||
run,start,end,delta,GFLOP/s
|
||||
1,1466732371.5080,1466732373.2547,1.7468,21.9836
|
||||
2,1466732373.2547,1466732375.0033,1.7486,21.9603
|
||||
3,1466732375.0033,1466732376.7499,1.7465,21.9864
|
||||
4,1466732376.7499,1466732378.4990,1.7492,21.9534
|
5
roofline/src/log/simple16
Normal file
5
roofline/src/log/simple16
Normal file
|
@ -0,0 +1,5 @@
|
|||
run,start,end,delta,GFLOP/s
|
||||
1,1466732363.5905,1466732363.9157,0.3252,0.9226
|
||||
2,1466732363.9157,1466732364.2410,0.3253,0.9223
|
||||
3,1466732364.2410,1466732364.5659,0.3249,0.9234
|
||||
4,1466732364.5659,1466732364.8925,0.3266,0.9186
|
5
roofline/src/log/simple8
Normal file
5
roofline/src/log/simple8
Normal file
|
@ -0,0 +1,5 @@
|
|||
run,start,end,delta,GFLOP/s
|
||||
1,1466732368.4592,1466732368.7843,0.3251,118.1190
|
||||
2,1466732368.7843,1466732369.1090,0.3247,118.2713
|
||||
3,1466732369.1090,1466732369.4353,0.3263,117.6684
|
||||
4,1466732369.4353,1466732369.7596,0.3243,118.4045
|
5
roofline/src/log/simple8fastmath
Normal file
5
roofline/src/log/simple8fastmath
Normal file
|
@ -0,0 +1,5 @@
|
|||
run,start,end,delta,GFLOP/s
|
||||
1,1466732382.7768,1466732387.0594,4.2827,8.9664
|
||||
2,1466732387.0594,1466732391.3489,4.2895,8.9521
|
||||
3,1466732391.3489,1466732395.6322,4.2834,8.9649
|
||||
4,1466732395.6322,1466732399.9114,4.2791,8.9738
|
Binary file not shown.
|
@ -3,6 +3,7 @@
|
|||
# include <unistd.h>
|
||||
# include <ctype.h>
|
||||
# include <sys/time.h>
|
||||
# include <sys/stat.h>
|
||||
# include <errno.h>
|
||||
# include <string.h>
|
||||
# include <stdint.h>
|
||||
|
@ -67,7 +68,7 @@ static void testkern(double* a, double* b, double* c, size_t size);
|
|||
/**
|
||||
* @brief pretty prints a kern_result
|
||||
*/
|
||||
static void print_kernresult(kern_result* result);
|
||||
static void print_kernresult(kern_result* result, const char* logname);
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
prog_name = argv[0];
|
||||
|
@ -108,6 +109,7 @@ int main(int argc, char* argv[]) {
|
|||
size_t size = get_size(size_arg);
|
||||
int runs = get_int(runs_arg);
|
||||
|
||||
// Allocating arrays
|
||||
printf("Will run with array sizes of %zu elements\n", size);
|
||||
printf("Will calculate min, max, avg for %d runs\n", runs);
|
||||
double* a = malloc(sizeof(double)*(size));
|
||||
|
@ -120,6 +122,7 @@ int main(int argc, char* argv[]) {
|
|||
printf("Allocated 3 arrays (3*%.2f MB = %.2f GB)\n", (sizeof(double)*(size)/1024.0/1024.0), (sizeof(double)*(size)*3/1024.0/1024.0/1024));
|
||||
printf("Filling arrays with dummy values. This will also warm the cache\n");
|
||||
|
||||
// Filling arrays with arbitrary numbers
|
||||
#pragma omp parallel for
|
||||
for (size_t j=0; j<size; j++)
|
||||
{
|
||||
|
@ -134,13 +137,26 @@ int main(int argc, char* argv[]) {
|
|||
testkern(a,b,c, size);
|
||||
t = pin_time() - t;
|
||||
printf("Machine heating took %.4f microseconds = %.4f seconds (with test OI kernel)\n", (t*1.0E6), t);
|
||||
printf("Starting tests...\n\n\n");
|
||||
|
||||
// Executing kernels
|
||||
kern_result simple16 = kernel_dispatch(SIMPLE_1_16, a, b, c, size, runs);
|
||||
kern_result fma16 = kernel_dispatch(FMA_1_16, a, b, c, size, runs);
|
||||
kern_result simple8 = kernel_dispatch(SIMPLE_8_1, a, b, c, size, runs);
|
||||
kern_result fma8 = kernel_dispatch(FMA_8_1, a, b, c, size, runs);
|
||||
kern_result simple8fm = kernel_dispatch(SIMPLE_8_1_FASTMATH, a, b, c, size, runs);
|
||||
|
||||
print_kernresult(&simple16);
|
||||
// Freeing arrays
|
||||
free(a);
|
||||
free(b);
|
||||
free(c);
|
||||
|
||||
// Printing results
|
||||
print_kernresult(&simple16, "simple16");
|
||||
print_kernresult(&fma16, "fma16");
|
||||
print_kernresult(&simple8, "simple8");
|
||||
print_kernresult(&fma8, "fma8");
|
||||
print_kernresult(&simple8fm, "simple8fastmath");
|
||||
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
@ -233,6 +249,72 @@ static void bail_out(char* fmt, ...)
|
|||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
static void print_kernresult(kern_result* result){
|
||||
return;
|
||||
/**
 * @brief Prints a summary of a kern_result and logs every run as CSV
 * @param result  Timing data of one kernel; result->starts and
 *                result->ends are read for result->runs entries
 * @param logname File name (without directory) of the CSV log, which is
 *                created (or truncated) as log/<logname>
 *
 * Writes one CSV row per run (run,start,end,delta,GFLOP/s) and prints the
 * minimum, maximum and average run duration to stdout. Any failure is
 * reported through bail_out().
 */
static void print_kernresult(kern_result* result, const char* logname)
{
    /* Without a single run there is nothing to summarise or to log */
    if(result->runs == 0)
    {
        bail_out("No runs recorded for %s", result->kern_name);
    }

    /* Create the directory unconditionally; an already existing one is fine */
    if(mkdir("log", 0700) != 0 && errno != EEXIST)
    {
        bail_out("Couldn't create log directory for %s", result->kern_name);
    }

    /* Build the path and refuse to continue with a silently truncated name */
    char logpath[256];
    const int pathlen = snprintf(logpath, sizeof(logpath), "%s/%s", "log", logname);
    if(pathlen < 0 || (size_t)pathlen >= sizeof(logpath))
    {
        bail_out("Log file name too long for %s", result->kern_name);
    }

    FILE* log = fopen(logpath, "w");
    if(log == NULL)
    {
        bail_out("Couldn't open log file for %s", result->kern_name);
    }

    if(fputs("run,start,end,delta,GFLOP/s\n", log) == EOF)
    {
        fclose(log);
        bail_out("Couldn't write header to log file");
    }

    printf("=== %s ===\n", result->kern_name);

    double min = 0.0;
    double max = 0.0;
    double sum = 0.0;

    /* Start at run 0 so the first run is logged as well, not only summarised */
    for(size_t i=0; i<result->runs; i++)
    {
        const double delta = result->ends[i] - result->starts[i];
        sum += delta;

        if(i == 0 || delta < min) min = delta;
        if(i == 0 || delta > max) max = delta;

        /* flops is per array element, so one run performs flops*size operations */
        const double gflops = ((double)result->flops * (double)result->size) / delta / 1.0E9;

        /* fprintf signals failure with a negative value, not necessarily EOF */
        if(fprintf(log, "%zu,%.4f,%.4f,%.4f,%.4f\n",
                   i, result->starts[i],
                   result->ends[i], delta,
                   gflops) < 0)
        {
            fclose(log);
            bail_out("Couldn't write to log file");
        }
    }

    printf("%d flop(s) per run\t %zu run(s)\n\n", result->flops, result->runs);
    printf("Min: %.4f \t Max: %.4f \t Avg: %.4f\n", min, max, (sum/result->runs));

    printf("\n\n\n");

    /* fclose flushes buffered rows; a failure here means the log is incomplete */
    if(fclose(log))
    {
        bail_out("Couldn't close log file for %s", result->kern_name);
    }
}
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in a new issue