one last kernel
This commit is contained in:
parent
aaf77c78f6
commit
c8f6ca1989
11 changed files with 348 additions and 33 deletions
|
@ -1,7 +1,7 @@
|
|||
all: clean bin lib
|
||||
|
||||
# Roofline Binary
|
||||
bin: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3
|
||||
bin: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3 roofline_full roofline_profile roofline_full_clang roofline_full_manpack
|
||||
mkdir bin
|
||||
mv $^ bin
|
||||
|
||||
|
@ -26,9 +26,20 @@ roofline_fma_fast_o2: roofline.c aikern_fma_fast_o2.a
|
|||
roofline_fma_fast_fastmath_o3: roofline.c aikern_fma_fast_fastmath_o3.a
|
||||
gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
|
||||
|
||||
roofline_full: roofline.c aikern_full.a
|
||||
gcc -Wall -Wextra -std=c99 -fopenmp -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -malign-double -march=native $^ -o $@
|
||||
|
||||
roofline_full_clang: roofline.c aikern_full_clang.a
|
||||
clang -Wall -Wextra -std=c99 -fopenmp -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -march=native $^ -o $@
|
||||
|
||||
roofline_full_manpack: roofline.c aikern_full_manpack.a
|
||||
gcc -Wall -Wextra -std=c99 -fopenmp -DINTRINS -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -malign-double -march=native $^ -o $@
|
||||
|
||||
roofline_profile: roofline.c aikern_profile.a
|
||||
gcc -Wall -Wextra -std=c99 -fopenmp -O0 -g -pg $^ -o $@
|
||||
|
||||
# Static Libraries
|
||||
lib: aikern.a aikern_o3.a aikern_fma.a aikern_fma_o3.a aikern_fma_fast_o2.a aikern_fma_fast_o3.a aikern_fma_fast_fastmath_o3.a
|
||||
lib: aikern.a aikern_o3.a aikern_fma.a aikern_fma_o3.a aikern_fma_fast_o2.a aikern_fma_fast_o3.a aikern_fma_fast_fastmath_o3.a aikern_full.a aikern_full_clang.a aikern_profile.a aikern_full_manpack.a
|
||||
mkdir lib
|
||||
mv $^ lib
|
||||
|
||||
|
@ -67,11 +78,31 @@ aikern_fma_fast_fastmath_o3.a: aikern.c aikern.h
|
|||
ar rcs $@ aikern_fma_fast_fastmath_o3.o
|
||||
rm aikern_fma_fast_fastmath_o3.o
|
||||
|
||||
aikern_full.a: aikern.c aikern.h
|
||||
gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -malign-double -march=native -c -o aikern_full.o $<
|
||||
ar rcs $@ aikern_full.o
|
||||
rm aikern_full.o
|
||||
|
||||
aikern_full_clang.a: aikern.c aikern.h
|
||||
gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -march=native -c -o aikern_full_clang.o $<
|
||||
ar rcs $@ aikern_full_clang.o
|
||||
rm aikern_full_clang.o
|
||||
|
||||
aikern_profile.a: aikern.c aikern.h
|
||||
gcc -Wall -Wextra -Wno-unused -fopenmp -O0 -g -pg -c -o aikern_profile.o $<
|
||||
ar rcs $@ aikern_profile.o
|
||||
rm aikern_profile.o
|
||||
|
||||
aikern_full_manpack.a: aikern.c aikern.h
|
||||
gcc -Wall -Wextra -Wno-unused -DINTRINS -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -malign-double -march=native -c -o aikern_full_manpack.o $<
|
||||
ar rcs $@ aikern_full_manpack.o
|
||||
rm aikern_full_manpack.o
|
||||
|
||||
# Cleanup
|
||||
clean:
|
||||
rm -f *.a
|
||||
rm -f *.o
|
||||
rm -f roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3
|
||||
rm -f roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3 roofline_fma_fast_fastmath_aligned_o3 roofline_profile roofline_full_clang
|
||||
rm -fR bin
|
||||
rm -fR lib
|
||||
|
||||
|
|
|
@ -8,6 +8,14 @@
|
|||
|
||||
# include "aikern.h"
|
||||
|
||||
/* === Macros === */
|
||||
|
||||
#ifdef ENDEBUG
|
||||
#define DEBUG(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0)
|
||||
#else
|
||||
#define DEBUG(...)
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief terminate program on program error
|
||||
* @param msg additional message to print
|
||||
|
@ -81,6 +89,7 @@ kern_result kernel_dispatch(kernel_t kernel,
|
|||
}
|
||||
break;
|
||||
case SIMPLE_8_1_FASTMATH:
|
||||
DEBUG("AIKERN MANPACK");
|
||||
result.flops = 128;
|
||||
result.kern_name = "Simple 8 undermining fastmath";
|
||||
for(size_t r=0; r<runs; r++)
|
||||
|
@ -90,8 +99,28 @@ kern_result kernel_dispatch(kernel_t kernel,
|
|||
result.ends[r] = pin_time();
|
||||
}
|
||||
break;
|
||||
case FMA_8_1_MANPACK:
|
||||
DEBUG("AIKERN MANPACK");
|
||||
#ifdef INTRINS
|
||||
if(size%4 != 0)
|
||||
{
|
||||
bail_out("Must use multiple of 4 size for manpack");
|
||||
}
|
||||
|
||||
result.flops = 128;
|
||||
result.kern_name = "FMA aware 8 with manual packing";
|
||||
for(size_t r=0; r<runs; r++)
|
||||
{
|
||||
DEBUG("running manpack run %zu",r);
|
||||
result.starts[r] = pin_time();
|
||||
kernel_8_1_fuseaware_manpack(a, size);
|
||||
result.ends[r] = pin_time();
|
||||
}
|
||||
#endif
|
||||
break;
|
||||
default:
|
||||
bail_out("No such kernel %s", kernel);
|
||||
break;
|
||||
}
|
||||
|
||||
return result;
|
||||
|
@ -148,6 +177,32 @@ inline void kernel_8_1_fuseaware(double* a, size_t size)
|
|||
}
|
||||
}
|
||||
|
||||
#ifdef INTRINS
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
inline void kernel_8_1_fuseaware_manpack(double* a, size_t size)
|
||||
{
|
||||
|
||||
#pragma omp parallel for
|
||||
for(size_t i=0; i<(size-4); i+=4)
|
||||
{
|
||||
// pack doubles
|
||||
__m256d packvec = _mm256_set_pd(a[i], a[i+1], a[i+2], a[i+3]);
|
||||
|
||||
REP60(packvec = _mm256_fmadd_pd(packvec, packvec, packvec););
|
||||
REP4(packvec = _mm256_fmadd_pd(packvec, packvec, packvec););
|
||||
|
||||
a[i] = packvec[0];
|
||||
a[i+1] = packvec[1];
|
||||
a[i+2] = packvec[2];
|
||||
a[i+3] = packvec[3];
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* INTRINS */
|
||||
|
||||
|
||||
/********************************************
|
||||
* Kernels which potentially compile to *
|
||||
* different operational intensities than *
|
||||
|
@ -198,7 +253,9 @@ static double pin_time(void)
|
|||
i = gettimeofday(&tp,NULL);
|
||||
|
||||
if(i != 0)
|
||||
{
|
||||
bail_out("Time measurement impossible. gettimeofday error");
|
||||
}
|
||||
|
||||
return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
|
||||
}
|
||||
|
|
|
@ -11,7 +11,7 @@ typedef struct {
|
|||
} kern_result;
|
||||
|
||||
typedef enum {
|
||||
SIMPLE_1_16, FMA_1_16, SIMPLE_8_1, FMA_8_1, SIMPLE_8_1_FASTMATH
|
||||
SIMPLE_1_16, FMA_1_16, SIMPLE_8_1, FMA_8_1, SIMPLE_8_1_FASTMATH, FMA_8_1_MANPACK
|
||||
} kernel_t;
|
||||
|
||||
/**
|
||||
|
@ -249,6 +249,12 @@ void kernel_8_1_simple_dangerous(double* a, size_t size);
|
|||
void kernel_1_8_vo_dangerous(double* a, size_t size);
|
||||
|
||||
|
||||
#ifdef INTRINS
|
||||
void kernel_8_1_fuseaware_manpack(double* a, size_t size);
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/****************************************
|
||||
* Helper macros for repeating things *
|
||||
****************************************/
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
run,start,end,delta,GFLOP/s
|
||||
1,1466732365.5426,1466732366.1946,0.6520,0.9202
|
||||
2,1466732366.1946,1466732366.8410,0.6464,0.9282
|
||||
3,1466732366.8410,1466732367.4875,0.6465,0.9281
|
||||
4,1466732367.4875,1466732368.1370,0.6495,0.9238
|
||||
1,1466764717.8266,1466764718.1751,0.3486,0.8606
|
||||
2,1466764718.1751,1466764718.5235,0.3484,0.8611
|
||||
3,1466764718.5235,1466764718.8726,0.3491,0.8593
|
||||
4,1466764718.8726,1466764719.2248,0.3522,0.8518
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
run,start,end,delta,GFLOP/s
|
||||
1,1466732371.5080,1466732373.2547,1.7468,21.9836
|
||||
2,1466732373.2547,1466732375.0033,1.7486,21.9603
|
||||
3,1466732375.0033,1466732376.7499,1.7465,21.9864
|
||||
4,1466732376.7499,1466732378.4990,1.7492,21.9534
|
||||
1,1466764721.0839,1466764721.9589,0.8750,21.9434
|
||||
2,1466764721.9589,1466764722.8340,0.8752,21.9383
|
||||
3,1466764722.8340,1466764723.7090,0.8749,21.9451
|
||||
4,1466764723.7090,1466764724.5784,0.8694,22.0840
|
||||
|
|
5
roofline/src/log/fma8manpack
Normal file
5
roofline/src/log/fma8manpack
Normal file
|
@ -0,0 +1,5 @@
|
|||
run,start,end,delta,GFLOP/s
|
||||
1,1466764736.6256,1466764737.6487,1.0231,18.7665
|
||||
2,1466764737.6487,1466764738.6642,1.0155,18.9073
|
||||
3,1466764738.6642,1466764739.6867,1.0225,18.7770
|
||||
4,1466764739.6867,1466764740.7045,1.0178,18.8651
|
|
@ -1,5 +1,5 @@
|
|||
run,start,end,delta,GFLOP/s
|
||||
1,1466732363.5905,1466732363.9157,0.3252,0.9226
|
||||
2,1466732363.9157,1466732364.2410,0.3253,0.9223
|
||||
3,1466732364.2410,1466732364.5659,0.3249,0.9234
|
||||
4,1466732364.5659,1466732364.8925,0.3266,0.9186
|
||||
1,1466764716.7465,1466764716.9269,0.1804,0.8314
|
||||
2,1466764716.9269,1466764717.1069,0.1800,0.8334
|
||||
3,1466764717.1069,1466764717.2871,0.1801,0.8327
|
||||
4,1466764717.2871,1466764717.4767,0.1897,0.7908
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
run,start,end,delta,GFLOP/s
|
||||
1,1466732368.4592,1466732368.7843,0.3251,118.1190
|
||||
2,1466732368.7843,1466732369.1090,0.3247,118.2713
|
||||
3,1466732369.1090,1466732369.4353,0.3263,117.6684
|
||||
4,1466732369.4353,1466732369.7596,0.3243,118.4045
|
||||
1,1466764719.4042,1466764719.5845,0.1803,106.5045
|
||||
2,1466764719.5845,1466764719.7707,0.1861,103.1532
|
||||
3,1466764719.7707,1466764719.9487,0.1780,107.8675
|
||||
4,1466764719.9487,1466764720.1264,0.1777,108.0261
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
run,start,end,delta,GFLOP/s
|
||||
1,1466732382.7768,1466732387.0594,4.2827,8.9664
|
||||
2,1466732387.0594,1466732391.3489,4.2895,8.9521
|
||||
3,1466732391.3489,1466732395.6322,4.2834,8.9649
|
||||
4,1466732395.6322,1466732399.9114,4.2791,8.9738
|
||||
1,1466764726.7241,1466764728.9906,2.2664,8.4714
|
||||
2,1466764728.9906,1466764731.1306,2.1400,8.9720
|
||||
3,1466764731.1306,1466764733.3797,2.2491,8.5366
|
||||
4,1466764733.3797,1466764735.6046,2.2249,8.6298
|
||||
|
|
199
roofline/src/prof
Normal file
199
roofline/src/prof
Normal file
|
@ -0,0 +1,199 @@
|
|||
Flat profile:
|
||||
|
||||
Each sample counts as 0.01 seconds.
|
||||
% cumulative self self total
|
||||
time seconds seconds calls Ts/call Ts/call name
|
||||
100.01 0.74 0.74 bail_out
|
||||
0.00 0.74 0.00 50 0.00 0.00 pin_time
|
||||
0.00 0.74 0.00 5 0.00 0.00 kernel_1_16_fuseaware
|
||||
0.00 0.74 0.00 5 0.00 0.00 kernel_1_16_simple
|
||||
0.00 0.74 0.00 5 0.00 0.00 kernel_8_1_fuseaware
|
||||
0.00 0.74 0.00 5 0.00 0.00 kernel_8_1_simple
|
||||
0.00 0.74 0.00 5 0.00 0.00 kernel_8_1_simple_fastmath
|
||||
0.00 0.74 0.00 5 0.00 0.00 kernel_dispatch
|
||||
0.00 0.74 0.00 5 0.00 0.00 print_kernresult
|
||||
0.00 0.74 0.00 2 0.00 0.00 pin_time
|
||||
0.00 0.74 0.00 1 0.00 0.00 get_int
|
||||
0.00 0.74 0.00 1 0.00 0.00 get_size
|
||||
0.00 0.74 0.00 1 0.00 0.00 testkern
|
||||
|
||||
% the percentage of the total running time of the
|
||||
time program used by this function.
|
||||
|
||||
cumulative a running sum of the number of seconds accounted
|
||||
seconds for by this function and those listed above it.
|
||||
|
||||
self the number of seconds accounted for by this
|
||||
seconds function alone. This is the major sort for this
|
||||
listing.
|
||||
|
||||
calls the number of times this function was invoked, if
|
||||
this function is profiled, else blank.
|
||||
|
||||
self the average number of milliseconds spent in this
|
||||
ms/call function per call, if this function is profiled,
|
||||
else blank.
|
||||
|
||||
total the average number of milliseconds spent in this
|
||||
ms/call function and its descendents per call, if this
|
||||
function is profiled, else blank.
|
||||
|
||||
name the name of the function. This is the minor sort
|
||||
for this listing. The index shows the location of
|
||||
the function in the gprof listing. If the index is
|
||||
in parenthesis it shows where it would appear in
|
||||
the gprof listing if it were to be printed.
|
||||
|
||||
Copyright (C) 2012-2014 Free Software Foundation, Inc.
|
||||
|
||||
Copying and distribution of this file, with or without modification,
|
||||
are permitted in any medium without royalty provided the copyright
|
||||
notice and this notice are preserved.
|
||||
|
||||
Call graph (explanation follows)
|
||||
|
||||
|
||||
granularity: each sample hit covers 2 byte(s) for 1.35% of 0.74 seconds
|
||||
|
||||
index % time self children called name
|
||||
<spontaneous>
|
||||
[1] 100.0 0.74 0.00 bail_out [1]
|
||||
-----------------------------------------------
|
||||
0.00 0.00 50/50 kernel_dispatch [8]
|
||||
[2] 0.0 0.00 0.00 50 pin_time [2]
|
||||
-----------------------------------------------
|
||||
0.00 0.00 5/5 kernel_dispatch [8]
|
||||
[3] 0.0 0.00 0.00 5 kernel_1_16_fuseaware [3]
|
||||
-----------------------------------------------
|
||||
0.00 0.00 5/5 kernel_dispatch [8]
|
||||
[4] 0.0 0.00 0.00 5 kernel_1_16_simple [4]
|
||||
-----------------------------------------------
|
||||
0.00 0.00 5/5 kernel_dispatch [8]
|
||||
[5] 0.0 0.00 0.00 5 kernel_8_1_fuseaware [5]
|
||||
-----------------------------------------------
|
||||
0.00 0.00 5/5 kernel_dispatch [8]
|
||||
[6] 0.0 0.00 0.00 5 kernel_8_1_simple [6]
|
||||
-----------------------------------------------
|
||||
0.00 0.00 5/5 kernel_dispatch [8]
|
||||
[7] 0.0 0.00 0.00 5 kernel_8_1_simple_fastmath [7]
|
||||
-----------------------------------------------
|
||||
0.00 0.00 5/5 main [23]
|
||||
[8] 0.0 0.00 0.00 5 kernel_dispatch [8]
|
||||
0.00 0.00 50/50 pin_time [2]
|
||||
0.00 0.00 5/5 kernel_1_16_simple [4]
|
||||
0.00 0.00 5/5 kernel_1_16_fuseaware [3]
|
||||
0.00 0.00 5/5 kernel_8_1_simple [6]
|
||||
0.00 0.00 5/5 kernel_8_1_fuseaware [5]
|
||||
0.00 0.00 5/5 kernel_8_1_simple_fastmath [7]
|
||||
-----------------------------------------------
|
||||
0.00 0.00 5/5 main [23]
|
||||
[9] 0.0 0.00 0.00 5 print_kernresult [9]
|
||||
-----------------------------------------------
|
||||
0.00 0.00 2/2 main [23]
|
||||
[10] 0.0 0.00 0.00 2 pin_time [10]
|
||||
-----------------------------------------------
|
||||
0.00 0.00 1/1 main [23]
|
||||
[11] 0.0 0.00 0.00 1 get_int [11]
|
||||
-----------------------------------------------
|
||||
0.00 0.00 1/1 main [23]
|
||||
[12] 0.0 0.00 0.00 1 get_size [12]
|
||||
-----------------------------------------------
|
||||
0.00 0.00 1/1 main [23]
|
||||
[13] 0.0 0.00 0.00 1 testkern [13]
|
||||
-----------------------------------------------
|
||||
|
||||
This table describes the call tree of the program, and was sorted by
|
||||
the total amount of time spent in each function and its children.
|
||||
|
||||
Each entry in this table consists of several lines. The line with the
|
||||
index number at the left hand margin lists the current function.
|
||||
The lines above it list the functions that called this function,
|
||||
and the lines below it list the functions this one called.
|
||||
This line lists:
|
||||
index A unique number given to each element of the table.
|
||||
Index numbers are sorted numerically.
|
||||
The index number is printed next to every function name so
|
||||
it is easier to look up where the function is in the table.
|
||||
|
||||
% time This is the percentage of the `total' time that was spent
|
||||
in this function and its children. Note that due to
|
||||
different viewpoints, functions excluded by options, etc,
|
||||
these numbers will NOT add up to 100%.
|
||||
|
||||
self This is the total amount of time spent in this function.
|
||||
|
||||
children This is the total amount of time propagated into this
|
||||
function by its children.
|
||||
|
||||
called This is the number of times the function was called.
|
||||
If the function called itself recursively, the number
|
||||
only includes non-recursive calls, and is followed by
|
||||
a `+' and the number of recursive calls.
|
||||
|
||||
name The name of the current function. The index number is
|
||||
printed after it. If the function is a member of a
|
||||
cycle, the cycle number is printed between the
|
||||
function's name and the index number.
|
||||
|
||||
|
||||
For the function's parents, the fields have the following meanings:
|
||||
|
||||
self This is the amount of time that was propagated directly
|
||||
from the function into this parent.
|
||||
|
||||
children This is the amount of time that was propagated from
|
||||
the function's children into this parent.
|
||||
|
||||
called This is the number of times this parent called the
|
||||
function `/' the total number of times the function
|
||||
was called. Recursive calls to the function are not
|
||||
included in the number after the `/'.
|
||||
|
||||
name This is the name of the parent. The parent's index
|
||||
number is printed after it. If the parent is a
|
||||
member of a cycle, the cycle number is printed between
|
||||
the name and the index number.
|
||||
|
||||
If the parents of the function cannot be determined, the word
|
||||
`<spontaneous>' is printed in the `name' field, and all the other
|
||||
fields are blank.
|
||||
|
||||
For the function's children, the fields have the following meanings:
|
||||
|
||||
self This is the amount of time that was propagated directly
|
||||
from the child into the function.
|
||||
|
||||
children This is the amount of time that was propagated from the
|
||||
child's children to the function.
|
||||
|
||||
called This is the number of times the function called
|
||||
this child `/' the total number of times the child
|
||||
was called. Recursive calls by the child are not
|
||||
listed in the number after the `/'.
|
||||
|
||||
name This is the name of the child. The child's index
|
||||
number is printed after it. If the child is a
|
||||
member of a cycle, the cycle number is printed
|
||||
between the name and the index number.
|
||||
|
||||
If there are any cycles (circles) in the call graph, there is an
|
||||
entry for the cycle-as-a-whole. This entry shows who called the
|
||||
cycle (as parents) and the members of the cycle (as children.)
|
||||
The `+' recursive calls entry shows the number of function calls that
|
||||
were internal to the cycle, and the calls entry for each member shows,
|
||||
for that member, how many times it was called from other members of
|
||||
the cycle.
|
||||
|
||||
Copyright (C) 2012-2014 Free Software Foundation, Inc.
|
||||
|
||||
Copying and distribution of this file, with or without modification,
|
||||
are permitted in any medium without royalty provided the copyright
|
||||
notice and this notice are preserved.
|
||||
|
||||
Index by function name
|
||||
|
||||
[1] bail_out (aikern.c) [5] kernel_8_1_fuseaware [2] pin_time (aikern.c)
|
||||
[11] get_int (roofline.c) [6] kernel_8_1_simple [9] print_kernresult (roofline.c)
|
||||
[12] get_size (roofline.c) [7] kernel_8_1_simple_fastmath [13] testkern (roofline.c)
|
||||
[3] kernel_1_16_fuseaware [8] kernel_dispatch
|
||||
[4] kernel_1_16_simple [10] pin_time (roofline.c)
|
|
@ -140,12 +140,18 @@ int main(int argc, char* argv[]) {
|
|||
printf("Starting tests...\n\n\n");
|
||||
|
||||
// Executing kernels
|
||||
kern_result simple16 = kernel_dispatch(SIMPLE_1_16, a, b, c, size, runs);
|
||||
kern_result fma16 = kernel_dispatch(FMA_1_16, a, b, c, size, runs);
|
||||
kern_result simple8 = kernel_dispatch(SIMPLE_8_1, a, b, c, size, runs);
|
||||
kern_result fma8 = kernel_dispatch(FMA_8_1, a, b, c, size, runs);
|
||||
kern_result simple8fm = kernel_dispatch(SIMPLE_8_1_FASTMATH, a, b, c, size, runs);
|
||||
kern_result simple16 = kernel_dispatch(SIMPLE_1_16, a, b, c, size, runs);
|
||||
kern_result fma16 = kernel_dispatch(FMA_1_16, a, b, c, size, runs);
|
||||
kern_result simple8 = kernel_dispatch(SIMPLE_8_1, a, b, c, size, runs);
|
||||
kern_result fma8 = kernel_dispatch(FMA_8_1, a, b, c, size, runs);
|
||||
kern_result simple8fm = kernel_dispatch(SIMPLE_8_1_FASTMATH, a, b, c, size, runs);
|
||||
|
||||
#ifdef INTRINS
|
||||
DEBUG("Running manpack now");
|
||||
kern_result fma8manpack = kernel_dispatch(FMA_8_1_MANPACK, a, b, c, size, runs);
|
||||
DEBUG("manpack run successful");
|
||||
#endif
|
||||
|
||||
// Freeing arrays
|
||||
free(a);
|
||||
free(b);
|
||||
|
@ -158,6 +164,11 @@ int main(int argc, char* argv[]) {
|
|||
print_kernresult(&fma8, "fma8");
|
||||
print_kernresult(&simple8fm, "simple8fastmath");
|
||||
|
||||
#ifdef INTRINS
|
||||
print_kernresult(&fma8manpack, "fma8manpack");
|
||||
#endif
|
||||
|
||||
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
|
@ -183,7 +194,9 @@ static double pin_time(void)
|
|||
i = gettimeofday(&tp,NULL);
|
||||
|
||||
if(i != 0)
|
||||
{
|
||||
bail_out("Time measurement impossible. gettimeofday error");
|
||||
}
|
||||
|
||||
return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
|
||||
}
|
||||
|
@ -198,7 +211,9 @@ static size_t get_size(char *oparg)
|
|||
unsigned long long int u_llsize = (unsigned long long int) llsize;
|
||||
|
||||
if(u_llsize > SIZE_MAX)
|
||||
bail_out("Only size between 1 to %zu allowed.", SIZE_MAX);
|
||||
{
|
||||
bail_out("Only size between 1 to %zu allowed.", SIZE_MAX);
|
||||
}
|
||||
|
||||
return (size_t) llsize;
|
||||
}
|
||||
|
@ -213,7 +228,9 @@ static int get_int(char *oparg)
|
|||
unsigned long long int u_llsize = (unsigned long long int) llsize;
|
||||
|
||||
if(u_llsize > INT_MAX)
|
||||
bail_out("Only size between 1 to %d allowed.", INT_MAX);
|
||||
{
|
||||
bail_out("Only size between 1 to %d allowed.", INT_MAX);
|
||||
}
|
||||
|
||||
return (int) llsize;
|
||||
}
|
||||
|
@ -221,7 +238,7 @@ static int get_int(char *oparg)
|
|||
static void usage()
|
||||
{
|
||||
fprintf(stderr, "USAGE: ./roofline -s <size> -r <runs> \n");
|
||||
bail_out(NULL);
|
||||
bail_out("Invalid paramers");
|
||||
}
|
||||
|
||||
static void bail_out(char* fmt, ...)
|
||||
|
@ -251,7 +268,7 @@ static void bail_out(char* fmt, ...)
|
|||
|
||||
static void print_kernresult(kern_result* result, const char* logname)
|
||||
{
|
||||
struct stat st = {0};
|
||||
struct stat st = {};
|
||||
|
||||
if (stat("log", &st) == -1)
|
||||
{
|
||||
|
|
Loading…
Add table
Reference in a new issue