added logging, added fastmath undermining for simple8
This commit is contained in:
parent
31bfead054
commit
aaf77c78f6
16 changed files with 202 additions and 18 deletions
|
@ -21,6 +21,7 @@ while i<=64:
|
||||||
xlbl.append(repr(i))
|
xlbl.append(repr(i))
|
||||||
i *= 2
|
i *= 2
|
||||||
|
|
||||||
|
print(xlbl)
|
||||||
# memory
|
# memory
|
||||||
values = []
|
values = []
|
||||||
bandwidth = 10.6
|
bandwidth = 10.6
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
all: bin lib
|
all: clean bin lib
|
||||||
|
|
||||||
# Roofline Binary
|
# Roofline Binary
|
||||||
bin: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_fastmath_o3
|
bin: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3
|
||||||
mkdir bin
|
mkdir bin
|
||||||
mv $^ bin
|
mv $^ bin
|
||||||
|
|
||||||
|
@ -20,12 +20,15 @@ roofline_fma_o3: roofline.c aikern_fma_o3.a
|
||||||
roofline_fma_fast_o3: roofline.c aikern_fma_fast_o3.a
|
roofline_fma_fast_o3: roofline.c aikern_fma_fast_o3.a
|
||||||
gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
|
gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
|
||||||
|
|
||||||
|
roofline_fma_fast_o2: roofline.c aikern_fma_fast_o2.a
|
||||||
|
gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
|
||||||
|
|
||||||
roofline_fma_fast_fastmath_o3: roofline.c aikern_fma_fast_fastmath_o3.a
|
roofline_fma_fast_fastmath_o3: roofline.c aikern_fma_fast_fastmath_o3.a
|
||||||
gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
|
gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
|
||||||
|
|
||||||
|
|
||||||
# Static Libraries
|
# Static Libraries
|
||||||
lib: aikern.a aikern_o3.a aikern_fma.a aikern_fma_o3.a aikern_fma_fast_o3.a aikern_fma_fast_fastmath_o3.a
|
lib: aikern.a aikern_o3.a aikern_fma.a aikern_fma_o3.a aikern_fma_fast_o2.a aikern_fma_fast_o3.a aikern_fma_fast_fastmath_o3.a
|
||||||
mkdir lib
|
mkdir lib
|
||||||
mv $^ lib
|
mv $^ lib
|
||||||
|
|
||||||
|
@ -49,6 +52,11 @@ aikern_fma_o3.a: aikern.c aikern.h
|
||||||
ar rcs $@ aikern_fma_o3.o
|
ar rcs $@ aikern_fma_o3.o
|
||||||
rm aikern_fma_o3.o
|
rm aikern_fma_o3.o
|
||||||
|
|
||||||
|
aikern_fma_fast_o2.a: aikern.c aikern.h
|
||||||
|
gcc -Wall -Wextra -Wno-unused -O2 -mavx -mfma -fopenmp -Ofast -c -o aikern_fma_fast_o2.o $<
|
||||||
|
ar rcs $@ aikern_fma_fast_o2.o
|
||||||
|
rm aikern_fma_fast_o2.o
|
||||||
|
|
||||||
aikern_fma_fast_o3.a: aikern.c aikern.h
|
aikern_fma_fast_o3.a: aikern.c aikern.h
|
||||||
gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -c -o aikern_fma_fast_o3.o $<
|
gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -c -o aikern_fma_fast_o3.o $<
|
||||||
ar rcs $@ aikern_fma_fast_o3.o
|
ar rcs $@ aikern_fma_fast_o3.o
|
||||||
|
@ -61,6 +69,9 @@ aikern_fma_fast_fastmath_o3.a: aikern.c aikern.h
|
||||||
|
|
||||||
# Cleanup
|
# Cleanup
|
||||||
clean:
|
clean:
|
||||||
|
rm -f *.a
|
||||||
|
rm -f *.o
|
||||||
|
rm -f roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3
|
||||||
rm -fR bin
|
rm -fR bin
|
||||||
rm -fR lib
|
rm -fR lib
|
||||||
|
|
||||||
|
|
|
@ -25,10 +25,11 @@ kern_result kernel_dispatch(kernel_t kernel,
|
||||||
size_t size, size_t runs)
|
size_t size, size_t runs)
|
||||||
{
|
{
|
||||||
|
|
||||||
kern_result result;
|
kern_result result = {0};
|
||||||
result.runs = runs;
|
result.runs = runs;
|
||||||
result.starts = malloc(sizeof(double)*(runs));
|
result.starts = malloc(sizeof(double)*(runs));
|
||||||
result.ends = malloc(sizeof(double)*(runs));
|
result.ends = malloc(sizeof(double)*(runs));
|
||||||
|
result.size = size;
|
||||||
|
|
||||||
if(result.starts==NULL || result.ends==NULL)
|
if(result.starts==NULL || result.ends==NULL)
|
||||||
{
|
{
|
||||||
|
@ -41,6 +42,7 @@ kern_result kernel_dispatch(kernel_t kernel,
|
||||||
|
|
||||||
case SIMPLE_1_16:
|
case SIMPLE_1_16:
|
||||||
result.flops = 1;
|
result.flops = 1;
|
||||||
|
result.kern_name = "Simple 1/16";
|
||||||
for(size_t r=0; r<runs; r++)
|
for(size_t r=0; r<runs; r++)
|
||||||
{
|
{
|
||||||
result.starts[r] = pin_time();
|
result.starts[r] = pin_time();
|
||||||
|
@ -50,6 +52,7 @@ kern_result kernel_dispatch(kernel_t kernel,
|
||||||
break;
|
break;
|
||||||
case FMA_1_16:
|
case FMA_1_16:
|
||||||
result.flops = 2;
|
result.flops = 2;
|
||||||
|
result.kern_name = "FMA aware 1/16";
|
||||||
for(size_t r=0; r<runs; r++)
|
for(size_t r=0; r<runs; r++)
|
||||||
{
|
{
|
||||||
result.starts[r] = pin_time();
|
result.starts[r] = pin_time();
|
||||||
|
@ -59,6 +62,7 @@ kern_result kernel_dispatch(kernel_t kernel,
|
||||||
break;
|
break;
|
||||||
case SIMPLE_8_1:
|
case SIMPLE_8_1:
|
||||||
result.flops = 128;
|
result.flops = 128;
|
||||||
|
result.kern_name = "Simple 8";
|
||||||
for(size_t r=0; r<runs; r++)
|
for(size_t r=0; r<runs; r++)
|
||||||
{
|
{
|
||||||
result.starts[r] = pin_time();
|
result.starts[r] = pin_time();
|
||||||
|
@ -68,6 +72,7 @@ kern_result kernel_dispatch(kernel_t kernel,
|
||||||
break;
|
break;
|
||||||
case FMA_8_1:
|
case FMA_8_1:
|
||||||
result.flops = 128;
|
result.flops = 128;
|
||||||
|
result.kern_name = "FMA aware 8";
|
||||||
for(size_t r=0; r<runs; r++)
|
for(size_t r=0; r<runs; r++)
|
||||||
{
|
{
|
||||||
result.starts[r] = pin_time();
|
result.starts[r] = pin_time();
|
||||||
|
@ -75,6 +80,16 @@ kern_result kernel_dispatch(kernel_t kernel,
|
||||||
result.ends[r] = pin_time();
|
result.ends[r] = pin_time();
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case SIMPLE_8_1_FASTMATH:
|
||||||
|
result.flops = 128;
|
||||||
|
result.kern_name = "Simple 8 undermining fastmath";
|
||||||
|
for(size_t r=0; r<runs; r++)
|
||||||
|
{
|
||||||
|
result.starts[r] = pin_time();
|
||||||
|
kernel_8_1_simple_fastmath(a, size);
|
||||||
|
result.ends[r] = pin_time();
|
||||||
|
}
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
bail_out("No such kernel %s", kernel);
|
bail_out("No such kernel %s", kernel);
|
||||||
}
|
}
|
||||||
|
@ -82,7 +97,7 @@ kern_result kernel_dispatch(kernel_t kernel,
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
void kernel_1_16_simple(double* a, size_t size)
|
inline void kernel_1_16_simple(double* a, size_t size)
|
||||||
{
|
{
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for(size_t i=0; i<size; i++)
|
for(size_t i=0; i<size; i++)
|
||||||
|
@ -91,7 +106,7 @@ void kernel_1_16_simple(double* a, size_t size)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
|
inline void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
|
||||||
{
|
{
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for(size_t i=0; i<size; i++)
|
for(size_t i=0; i<size; i++)
|
||||||
|
@ -100,7 +115,7 @@ void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void kernel_8_1_simple(double* a, size_t size)
|
inline void kernel_8_1_simple(double* a, size_t size)
|
||||||
{
|
{
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for(size_t i=0; i<size; i++)
|
for(size_t i=0; i<size; i++)
|
||||||
|
@ -112,7 +127,18 @@ void kernel_8_1_simple(double* a, size_t size)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void kernel_8_1_fuseaware(double* a, size_t size)
|
inline void kernel_8_1_simple_fastmath(double* a, size_t size)
|
||||||
|
{
|
||||||
|
#pragma omp parallel for
|
||||||
|
for(size_t i=0; i<size; i++)
|
||||||
|
{
|
||||||
|
REP100(a[i]=a[i]*a[i];);
|
||||||
|
REP20(a[i]=a[i]*a[i];);
|
||||||
|
REP8(a[i]=a[i]*a[i];);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void kernel_8_1_fuseaware(double* a, size_t size)
|
||||||
{
|
{
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for(size_t i=0; i<size; i++)
|
for(size_t i=0; i<size; i++)
|
||||||
|
|
|
@ -2,14 +2,16 @@
|
||||||
#define AIKERN_H
|
#define AIKERN_H
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
size_t runs; // also # of start-/endtimes
|
size_t runs; // also # of start-/endtimes
|
||||||
double* starts; // starttimes
|
double* starts; // starttimes
|
||||||
double* ends; // endtimes
|
double* ends; // endtimes
|
||||||
int flops; //flops per run
|
int flops; // flops per iteration
|
||||||
|
char* kern_name;
|
||||||
|
size_t size; // size of arrays handled
|
||||||
} kern_result;
|
} kern_result;
|
||||||
|
|
||||||
typedef enum {
|
typedef enum {
|
||||||
SIMPLE_1_16, FMA_1_16, SIMPLE_8_1, FMA_8_1
|
SIMPLE_1_16, FMA_1_16, SIMPLE_8_1, FMA_8_1, SIMPLE_8_1_FASTMATH
|
||||||
} kernel_t;
|
} kernel_t;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -152,6 +154,43 @@ void kernel_8_1_simple(double* a, size_t size);
|
||||||
*/
|
*/
|
||||||
void kernel_8_1_fuseaware(double* a, size_t size);
|
void kernel_8_1_fuseaware(double* a, size_t size);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief A simple 8/1 operational intensity kernel which
|
||||||
|
* undermines evil fastmath optimization
|
||||||
|
* @param a An array with double values of size param size
|
||||||
|
* @param size Number of elements in array a
|
||||||
|
* @note There is no separate result parameter; a is updated in place
|
||||||
|
*
|
||||||
|
* === Warning ===
|
||||||
|
* Don't use with anything other than -Ofast / -ffast-math
|
||||||
|
*
|
||||||
|
* === Description ===
|
||||||
|
* Uses a simple floating point operation that more closely resembles
|
||||||
|
* that of 8_1_fuseaware:
|
||||||
|
* a[i] = a[i]*a[i]; # 128x
|
||||||
|
*
|
||||||
|
* Runs in a parallelized for loop.
|
||||||
|
*
|
||||||
|
* === Analysis ===
|
||||||
|
* -Ofast/-ffast-math does not preserve strict IEEE compliance. It
|
||||||
|
* therefore is allowed to ignore non-associativity of floating
|
||||||
|
* point operations.
|
||||||
|
*
|
||||||
|
* x = x*x*x*x*x*x*x*x; is optimized to x *= x;x *= x;x *= x;
|
||||||
|
*
|
||||||
|
* This clearly breaks the whole OI calculation of 8_1_simple.
|
||||||
|
*
|
||||||
|
* This kernel does not introduce more byte write-outs than
|
||||||
|
* 8_1_simple at a high optimization level since a[i] is held
|
||||||
|
* in a register and only written out once at the end of an
|
||||||
|
* iteration.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* === Optimization ===
|
||||||
|
* Nothing special
|
||||||
|
*/
|
||||||
|
void kernel_8_1_simple_fastmath(double* a, size_t size);
|
||||||
|
|
||||||
|
|
||||||
/********************************************
|
/********************************************
|
||||||
* Kernels which potentially compile to *
|
* Kernels which potentially compile to *
|
||||||
|
|
5
roofline/src/log/fma16
Normal file
5
roofline/src/log/fma16
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
run,start,end,delta,GFLOP/s
|
||||||
|
1,1466732365.5426,1466732366.1946,0.6520,0.9202
|
||||||
|
2,1466732366.1946,1466732366.8410,0.6464,0.9282
|
||||||
|
3,1466732366.8410,1466732367.4875,0.6465,0.9281
|
||||||
|
4,1466732367.4875,1466732368.1370,0.6495,0.9238
|
5
roofline/src/log/fma8
Normal file
5
roofline/src/log/fma8
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
run,start,end,delta,GFLOP/s
|
||||||
|
1,1466732371.5080,1466732373.2547,1.7468,21.9836
|
||||||
|
2,1466732373.2547,1466732375.0033,1.7486,21.9603
|
||||||
|
3,1466732375.0033,1466732376.7499,1.7465,21.9864
|
||||||
|
4,1466732376.7499,1466732378.4990,1.7492,21.9534
|
5
roofline/src/log/simple16
Normal file
5
roofline/src/log/simple16
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
run,start,end,delta,GFLOP/s
|
||||||
|
1,1466732363.5905,1466732363.9157,0.3252,0.9226
|
||||||
|
2,1466732363.9157,1466732364.2410,0.3253,0.9223
|
||||||
|
3,1466732364.2410,1466732364.5659,0.3249,0.9234
|
||||||
|
4,1466732364.5659,1466732364.8925,0.3266,0.9186
|
5
roofline/src/log/simple8
Normal file
5
roofline/src/log/simple8
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
run,start,end,delta,GFLOP/s
|
||||||
|
1,1466732368.4592,1466732368.7843,0.3251,118.1190
|
||||||
|
2,1466732368.7843,1466732369.1090,0.3247,118.2713
|
||||||
|
3,1466732369.1090,1466732369.4353,0.3263,117.6684
|
||||||
|
4,1466732369.4353,1466732369.7596,0.3243,118.4045
|
5
roofline/src/log/simple8fastmath
Normal file
5
roofline/src/log/simple8fastmath
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
run,start,end,delta,GFLOP/s
|
||||||
|
1,1466732382.7768,1466732387.0594,4.2827,8.9664
|
||||||
|
2,1466732387.0594,1466732391.3489,4.2895,8.9521
|
||||||
|
3,1466732391.3489,1466732395.6322,4.2834,8.9649
|
||||||
|
4,1466732395.6322,1466732399.9114,4.2791,8.9738
|
Binary file not shown.
|
@ -3,6 +3,7 @@
|
||||||
# include <unistd.h>
|
# include <unistd.h>
|
||||||
# include <ctype.h>
|
# include <ctype.h>
|
||||||
# include <sys/time.h>
|
# include <sys/time.h>
|
||||||
|
# include <sys/stat.h>
|
||||||
# include <errno.h>
|
# include <errno.h>
|
||||||
# include <string.h>
|
# include <string.h>
|
||||||
# include <stdint.h>
|
# include <stdint.h>
|
||||||
|
@ -67,7 +68,7 @@ static void testkern(double* a, double* b, double* c, size_t size);
|
||||||
/**
|
/**
|
||||||
* @brief pretty prints a kern_result
|
* @brief pretty prints a kern_result
|
||||||
*/
|
*/
|
||||||
static void print_kernresult(kern_result* result);
|
static void print_kernresult(kern_result* result, const char* logname);
|
||||||
|
|
||||||
int main(int argc, char* argv[]) {
|
int main(int argc, char* argv[]) {
|
||||||
prog_name = argv[0];
|
prog_name = argv[0];
|
||||||
|
@ -108,6 +109,7 @@ int main(int argc, char* argv[]) {
|
||||||
size_t size = get_size(size_arg);
|
size_t size = get_size(size_arg);
|
||||||
int runs = get_int(runs_arg);
|
int runs = get_int(runs_arg);
|
||||||
|
|
||||||
|
// Allocating arrays
|
||||||
printf("Will run with array sizes of %zu elements\n", size);
|
printf("Will run with array sizes of %zu elements\n", size);
|
||||||
printf("Will calculate min, max, avg for %d runs\n", runs);
|
printf("Will calculate min, max, avg for %d runs\n", runs);
|
||||||
double* a = malloc(sizeof(double)*(size));
|
double* a = malloc(sizeof(double)*(size));
|
||||||
|
@ -119,7 +121,8 @@ int main(int argc, char* argv[]) {
|
||||||
|
|
||||||
printf("Allocated 3 arrays (3*%.2f MB = %.2f GB)\n", (sizeof(double)*(size)/1024.0/1024.0), (sizeof(double)*(size)*3/1024.0/1024.0/1024));
|
printf("Allocated 3 arrays (3*%.2f MB = %.2f GB)\n", (sizeof(double)*(size)/1024.0/1024.0), (sizeof(double)*(size)*3/1024.0/1024.0/1024));
|
||||||
printf("Filling arrays with dummy values. This will also warm the cache\n");
|
printf("Filling arrays with dummy values. This will also warm the cache\n");
|
||||||
|
|
||||||
|
// Filling arrays with arbitrary numbers
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for (size_t j=0; j<size; j++)
|
for (size_t j=0; j<size; j++)
|
||||||
{
|
{
|
||||||
|
@ -134,13 +137,26 @@ int main(int argc, char* argv[]) {
|
||||||
testkern(a,b,c, size);
|
testkern(a,b,c, size);
|
||||||
t = pin_time() - t;
|
t = pin_time() - t;
|
||||||
printf("Machine heating took %.4f microseconds = %.4f seconds (with test OI kernel)\n", (t*1.0E6), t);
|
printf("Machine heating took %.4f microseconds = %.4f seconds (with test OI kernel)\n", (t*1.0E6), t);
|
||||||
|
printf("Starting tests...\n\n\n");
|
||||||
|
|
||||||
|
// Executing kernels
|
||||||
kern_result simple16 = kernel_dispatch(SIMPLE_1_16, a, b, c, size, runs);
|
kern_result simple16 = kernel_dispatch(SIMPLE_1_16, a, b, c, size, runs);
|
||||||
kern_result fma16 = kernel_dispatch(FMA_1_16, a, b, c, size, runs);
|
kern_result fma16 = kernel_dispatch(FMA_1_16, a, b, c, size, runs);
|
||||||
kern_result simple8 = kernel_dispatch(SIMPLE_8_1, a, b, c, size, runs);
|
kern_result simple8 = kernel_dispatch(SIMPLE_8_1, a, b, c, size, runs);
|
||||||
kern_result fma8 = kernel_dispatch(FMA_8_1, a, b, c, size, runs);
|
kern_result fma8 = kernel_dispatch(FMA_8_1, a, b, c, size, runs);
|
||||||
|
kern_result simple8fm = kernel_dispatch(SIMPLE_8_1_FASTMATH, a, b, c, size, runs);
|
||||||
|
|
||||||
print_kernresult(&simple16);
|
// Freeing arrays
|
||||||
|
free(a);
|
||||||
|
free(b);
|
||||||
|
free(c);
|
||||||
|
|
||||||
|
// Printing results
|
||||||
|
print_kernresult(&simple16, "simple16");
|
||||||
|
print_kernresult(&fma16, "fma16");
|
||||||
|
print_kernresult(&simple8, "simple8");
|
||||||
|
print_kernresult(&fma8, "fma8");
|
||||||
|
print_kernresult(&simple8fm, "simple8fastmath");
|
||||||
|
|
||||||
exit(EXIT_SUCCESS);
|
exit(EXIT_SUCCESS);
|
||||||
}
|
}
|
||||||
|
@ -233,6 +249,72 @@ static void bail_out(char* fmt, ...)
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void print_kernresult(kern_result* result){
|
static void print_kernresult(kern_result* result, const char* logname)
|
||||||
return;
|
{
|
||||||
|
struct stat st = {0};
|
||||||
|
|
||||||
|
if (stat("log", &st) == -1)
|
||||||
|
{
|
||||||
|
if(mkdir("log", 0700))
|
||||||
|
{
|
||||||
|
bail_out("Couldn't create log directory for %s", result->kern_name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
char logpath[20];
|
||||||
|
snprintf(logpath, sizeof(logpath), "%s/%s", "log", logname);
|
||||||
|
FILE* log = fopen(logpath, "w");
|
||||||
|
if(log == NULL)
|
||||||
|
bail_out("Couldn't open log file for %s", result->kern_name);
|
||||||
|
|
||||||
|
if(fputs("run,start,end,delta,GFLOP/s\n", log) == EOF)
|
||||||
|
{
|
||||||
|
fclose(log);
|
||||||
|
bail_out("Couldn't write header to log file");
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("=== %s ===\n", result->kern_name);
|
||||||
|
|
||||||
|
double min;
|
||||||
|
double max;
|
||||||
|
double sum = 0.0;
|
||||||
|
double deltas[result->runs];
|
||||||
|
|
||||||
|
deltas[0] = result->ends[0] - result->starts[0];
|
||||||
|
min=deltas[0];
|
||||||
|
max=deltas[0];
|
||||||
|
sum+=deltas[0];
|
||||||
|
|
||||||
|
for(size_t i=1; i<result->runs; i++)
|
||||||
|
{
|
||||||
|
deltas[i] = result->ends[i] - result->starts[i];
|
||||||
|
sum+=deltas[i];
|
||||||
|
|
||||||
|
if(deltas[i] < min) min=deltas[i];
|
||||||
|
if(deltas[i] > max) max=deltas[i];
|
||||||
|
|
||||||
|
double gflops = ((result->flops * result->size) / deltas[i]) / 1.0E9;
|
||||||
|
|
||||||
|
if(fprintf(log, "%zu,%.4f,%.4f,%.4f,%.4f\n",
|
||||||
|
i, result->starts[i],
|
||||||
|
result->ends[i], deltas[i],
|
||||||
|
gflops) == EOF)
|
||||||
|
{
|
||||||
|
fclose(log);
|
||||||
|
bail_out("Couldn't write to log file");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
printf("%d flop(s) per run\t %zu run(s)\n\n", result->flops, result->runs);
|
||||||
|
printf("Min: %.4f \t Max: %.4f \t Avg: %.4f\n", min, max, (sum/result->runs));
|
||||||
|
|
||||||
|
printf("\n\n\n");
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if(fclose(log))
|
||||||
|
{
|
||||||
|
bail_out("Couldn't close log file for %s", result->kern_name);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in a new issue