diff --git a/plot/plot.py b/plot/plot.py index 850ec95..14a5fd8 100644 --- a/plot/plot.py +++ b/plot/plot.py @@ -25,8 +25,10 @@ while i<=64: values = [] bandwidth = 10.6 peak = 86.4 +basepeak = 54.4 ymem = [] ypeak = [] +ybasepeak = [] for i in np.arange(0,64,0.1): if bandwidth*i < peak: @@ -47,10 +49,12 @@ while i<=64: ymem.append(None) i*=2 + + #plot data #data = pd.Series(data=values, name='Peak Memory Bandwidth', index=np.arange(0,64,0.1)) -data = {'Peak Memory Bandwidth': pd.Series(ymem, index=xlbl), 'Peak Floating-Point Performance': pd.Series(ypeak, index=xlbl)} +data = {'Peak Memory Bandwidth': pd.Series(ymem, index=xlbl), 'Peak Floating-Point Performance (Turbo)': pd.Series(ypeak, index=xlbl)} df = pd.DataFrame(data) ax = df.plot() diff --git a/roofline/report/inputs/kernels.tex b/roofline/report/inputs/kernels.tex index c69863b..deff509 100644 --- a/roofline/report/inputs/kernels.tex +++ b/roofline/report/inputs/kernels.tex @@ -4,9 +4,8 @@ However the effective operational intensity of a given kernel in a high-level la All kernels were compiled with \verb|gcc 5.3.1| and different options. The compilation was checked with \verb|objdump -d -M intel-mnemonics|. For a more elaborate analysis of the disassembly on the testers computer, please refer to the header file \verb|aikern.h| that should come with this report. Additionally \verb|Makefile| provides all informations about the used and tested compiler options. -Good results\footnote{all, including the special FMA kernels, use only expected memory access, doing everything else in registers} were achieved with \verb|-O2 -mavx -mfma|. But \verb|-O2 -maxv -mfma| is a tradeoff between the best possible results and obviously correct compiled code. In fact the assembly almost looks like handwritten. If even more optimization is wanted \verb|-O3| can be used. To fully utilize FMA with packed doubles \verb|-Ofast| or \verb|-Ofast -ffast-math| has to be used. 
Be aware that more optimization than \verb|-O2 -maxv -mfma| results in a very hard to understand disassembly. \verb|-ffast-math| can even introduce rounding errors. It is not completely obvious that the highly optimized compiled code still has the wanted operational intensity. \verb|-O0| never works out. +Good results\footnote{all, including the special FMA kernels, use only expected memory access, doing everything else in registers} were achieved with \verb|-O2 -mavx -mfma|. But \verb|-O2 -mavx -mfma| is a tradeoff between the best possible results and obviously correct compiled code. In fact the disassembly almost looks handwritten. If even more optimization is wanted \verb|-O3| can be used. To fully utilize FMA with packed doubles \verb|-Ofast| or \verb|-Ofast -ffast-math| has to be used. Be aware that more optimization than \verb|-O2 -mavx -mfma| results in a very hard to understand disassembly. \verb|-ffast-math| can even introduce rounding errors or reduce the executed FLOPs. It is not completely obvious that the highly optimized compiled code still has the wanted operational intensity. \verb|-O0| never works out. -\bigskip \bigskip \begin{footnotesize} \noindent\emph{Remark:} Contrary to popular believe the roofline model is built atop the notion of operational intensity\footnote{FLOPs against bytes written to DRAM} kernels. The differences to arithmetic intensities are outlined in~\textcite{williams2009}. Depending on the definition used these two terms are not necessarily interchangeable. The notion of operational intensity in the following sections might be what some would understand by the term arithmetic itensity. @@ -40,11 +39,13 @@ Two $\rfrac{1}{16}$ kernels have been implemented. The kernel in~\prettyref{lst: The simple kernel in~\prettyref{lst:1-16-simple} reads 8 bytes (a[i]) once for both operands of $*$ and writes 8 bytes (again to a[i]). This results in 16 byte operations. Only one FP instruction is executed, namely $*$. 
At \verb|-O2| the loop variable is held in a register. This results in an $\rfrac{1}{16}$ OI kernel. +\bigskip \begin{lstlisting}[caption={Simple $\rfrac{1}{16}$ OI kernel}, label=lst:1-16-simple] (*\textcolor{Orchid}{\#pragma omp parallel for}*) for(size_t i=0; i] Package hyperref Warning: Token not allowed in a PDF string (PDFDocEncoding): -(hyperref) removing `math shift' on input line 15. +(hyperref) removing `math shift' on input line 14. Package hyperref Warning: Token not allowed in a PDF string (PDFDocEncoding): -(hyperref) removing `\not' on input line 15. +(hyperref) removing `\not' on input line 14. Package hyperref Warning: Token not allowed in a PDF string (PDFDocEncoding): -(hyperref) removing `math shift' on input line 15. +(hyperref) removing `math shift' on input line 14. -[5] [6]) +[5] [6]) [7] Overfull \hbox (19.7725pt too wide) in paragraph at lines 116--116 \T1/cmtt/m/n/10.95 blob / e5aa9ca4a77623ff6f1c2d5daa7995565b944506 / stream . c # L286$[][] \T1/cmr/m/n/10.95 (-20) (vis-ited on 06/20/2016). [] -[7] + AED: lastpage setting LastPage [8] Package atveryend Info: Empty hook `BeforeClearDocument' on input line 117. @@ -1401,13 +1401,13 @@ Package logreq Info: Writing requests to 'report.run.xml'. Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 117. 
) Here is how much of TeX's memory you used: - 21436 strings out of 493339 - 338721 string characters out of 6141383 - 878402 words of memory out of 5000000 + 21442 strings out of 493339 + 338775 string characters out of 6141383 + 879402 words of memory out of 5000000 24309 multiletter control sequences out of 15000+600000 - 29876 words of font info for 133 fonts, out of 8000000 for 9000 + 30053 words of font info for 136 fonts, out of 8000000 for 9000 953 hyphenation exceptions out of 8191 - 48i,8n,76p,1008b,1880s stack positions out of 5000i,500n,10000p,200000b,80000s + 48i,8n,76p,1001b,1880s stack positions out of 5000i,500n,10000p,200000b,80000s {/usr/share/texlive/texmf-dist/fonts/enc/dvips/cm-super/cm-super-ts1.enc}{/us r/share/texlive/texmf-dist/fonts/enc/dvips/cm-super/cm-super-t1.enc} -Output written on report.pdf (8 pages, 328260 bytes). +Output written on report.pdf (8 pages, 328183 bytes). PDF statistics: 353 PDF objects out of 1000 (max. 8388607) 278 compressed objects within 3 object streams diff --git a/roofline/report/report.pdf b/roofline/report/report.pdf index aba6179..3548add 100644 Binary files a/roofline/report/report.pdf and b/roofline/report/report.pdf differ diff --git a/roofline/report/report.tex b/roofline/report/report.tex index 7688648..1e4c767 100644 --- a/roofline/report/report.tex +++ b/roofline/report/report.tex @@ -89,7 +89,7 @@ \maketitle \begin{abstract} - A \emph{roofline model} for a multicore-processor is obtained by calcuating the theoretical peak performance of the processor and benchmarking the peak memory bandwith. Two artificial computational kernels with arithmetic intensities of $\frac{1}{16}$ GFLOPs/Byte and $8$ GFLOPs/Byte are devised. The performance of the two kernels is then compared to the theoretical calculations in the roofline model. + A \emph{roofline model} for a multicore processor is obtained by calculating the theoretical peak performance of the processor and benchmarking the peak memory bandwidth. 
Two artificial computational kernels with operational intensities of $\frac{1}{16}$ GFLOPs/Byte and $8$ GFLOPs/Byte are devised. The performance of the two kernels is then compared to the theoretical calculations in the roofline model. \end{abstract} \tableofcontents diff --git a/roofline/src/Makefile b/roofline/src/Makefile index 8c2b285..adb4fb0 100644 --- a/roofline/src/Makefile +++ b/roofline/src/Makefile @@ -1,58 +1,66 @@ -all: roofline roofline_avx roofline_o3avx roofline_o3 roofline_avxfma roofline_avxfmafast +all: bin lib # Roofline Binary +bin: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_fastmath_o3 + mkdir bin + mv $^ bin + roofline: roofline.c aikern.a gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@ -roofline_avx: roofline.c aikern_avx.a - gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@ - -roofline_o3avx: roofline.c aikern_o3avx.a - gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@ - roofline_o3: roofline.c aikern_o3.a - gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@ + gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@ -roofline_avxfma: roofline.c aikern_avxfma.a - gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@ +roofline_fma: roofline.c aikern_fma.a + gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@ + +roofline_fma_o3: roofline.c aikern_fma_o3.a + gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@ + +roofline_fma_fast_o3: roofline.c aikern_fma_fast_o3.a + gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@ + +roofline_fma_fast_fastmath_o3: roofline.c aikern_fma_fast_fastmath_o3.a + gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@ -roofline_avxfmafast: roofline.c aikern_avxfmafast.a - gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@ # Static Libraries -aikern.a: aikern.c aikern.h - gcc -c -o aikern.o aikern.c - ar rcs aikern.a aikern.o +lib: aikern.a aikern_o3.a aikern_fma.a aikern_fma_o3.a aikern_fma_fast_o3.a aikern_fma_fast_fastmath_o3.a + mkdir lib + mv $^ lib -aikern_avx.a: aikern.c aikern.h - gcc -mavx -c -o aikern_avx.o aikern.c 
- ar rcs aikern_avx.a aikern_avx.o +aikern.a: aikern.c aikern.h + gcc -Wall -Wextra -Wno-unused -fopenmp -c -o aikern.o $< + ar rcs aikern.a aikern.o + rm aikern.o aikern_o3.a: aikern.c aikern.h - gcc -O3 -c -o aikern_o3.o aikern.c - ar rcs aikern_o3.a aikern_o3.o + gcc -Wall -Wextra -Wno-unused -O3 -fopenmp -c -o aikern_o3.o $< + ar rcs $@ aikern_o3.o + rm aikern_o3.o -aikern_o3avx.a: aikern.c aikern.h - gcc -O3 -mavx -c -o aikern_o3avx.o aikern.c - ar rcs aikern_o3avx.a aikern_o3avx.o +aikern_fma.a: aikern.c aikern.h + gcc -Wall -Wextra -Wno-unused -O2 -mavx -mfma -fopenmp -c -o aikern_fma.o $< + ar rcs $@ aikern_fma.o + rm aikern_fma.o -# This is the only version that actually uses FMA -aikern_avxfma.a: aikern.c aikern.h - gcc -O2 -mavx -mfma -c -o aikern_avxfma.o aikern.c - ar rcs aikern_avxfma.a aikern_avxfma.o +aikern_fma_o3.a: aikern.c aikern.h + gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -c -o aikern_fma_o3.o $< + ar rcs $@ aikern_fma_o3.o + rm aikern_fma_o3.o -aikern_avxfmafast.a: aikern.c aikern.h - gcc -O2 -mavx -mfma -Ofast -c -o aikern_avxfmafastmath.o aikern.c - ar rcs aikern_avxfmafast.a aikern_avxfmafastmath.o - -aikern_avxfmafastmath.a: aikern.c aikern.h - gcc -O2 -mavx -mfma -Ofast -ffast-math -c -o aikern_avxfmafastmath.o aikern.c - ar rcs aikern_avxfmafast.a aikern_avxfmafastmath.o +aikern_fma_fast_o3.a: aikern.c aikern.h + gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -c -o aikern_fma_fast_o3.o $< + ar rcs $@ aikern_fma_fast_o3.o + rm aikern_fma_fast_o3.o +aikern_fma_fast_fastmath_o3.a: aikern.c aikern.h + gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -c -o aikern_fma_fast_fastmath_o3.o $< + ar rcs $@ aikern_fma_fast_fastmath_o3.o + rm aikern_fma_fast_fastmath_o3.o +# Cleanup clean: - rm -f roofline roofline_avx roofline_o3avx roofline_o3 roofline_avxfma roofline_avxfmafast - rm -f *.o - rm -f *.a - rm -f *.so + rm -fR bin + rm -fR lib diff --git a/roofline/src/aikern.c 
b/roofline/src/aikern.c index 1ff84cb..72d53d3 100644 --- a/roofline/src/aikern.c +++ b/roofline/src/aikern.c @@ -1,88 +1,46 @@ # include +# include +# include +# include +# include +# include +# include + +# include "aikern.h" + +/** + * @brief terminate program on program error + * @param msg additional message to print + * @param ret exit value + */ +static void bail_out(char* fmt, ...); + +/** + * @brief microseconds since epoch + */ +static double pin_time(void); void kernel_1_16_simple(double* a, double* b, double* c, size_t size) { + double t = pin_time(); #pragma omp parallel for for(size_t i=0; i AI = 1/16 - */ a[i] = a[i] * a[i]; } } void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size) { - /* === Warning === - This is dangerous if FMA is not used/can't be used. Then there - are intermediary writes (and reads) to the stack. With FMA: - - vmovsd xmm0,QWORD PTR [rdi+rax*8] # 1 read - vmovsd xmm1,QWORD PTR [rdx+rax*8] # 1 read - vfmadd132sd xmm0,xmm1,QWORD PTR [rsi+rax*8] # 2 FLOPs + 1 read - vmovsd QWORD PTR [rdi+rax*8],xmm0 # 1 write - - Uses packed doubles with -Ofast. - */ - #pragma omp parallel for for(size_t i=0; i AI = 2/32 = 1/16 - */ a[i] = a[i] * b[i] + c[i]; } } -#define REP0(X) -#define REP1(X) X -#define REP2(X) REP1(X) REP1(X) -#define REP3(X) REP2(X) REP1(X) -#define REP4(X) REP3(X) REP1(X) -#define REP5(X) REP4(X) REP1(X) -#define REP6(X) REP5(X) REP1(X) -#define REP7(X) REP6(X) REP1(X) -#define REP8(X) REP7(X) REP1(X) -#define REP9(X) REP8(X) REP1(X) - -#define REP10(X) REP9(X) REP1(X) -#define REP20(X) REP10(X) REP10(X) -#define REP30(X) REP20(X) REP10(X) -#define REP40(X) REP30(X) REP10(X) -#define REP50(X) REP40(X) REP10(X) -#define REP60(X) REP50(X) REP10(X) - -#define REP100(X) REP50(X) REP50(X) - void kernel_8_1_simple(double* a, double* b, double* c, size_t size) { - /* === Warning === - Seems correct with -O3. Though -O3 does some loop unrolling. 
- - With -O0 this is dangerous, intermediary values stored on stack - who knows if they survive in cache -> unpredictable. - - With AVX and -O2 (not necessarily FMA) best results - (obviously correct, only register shuffling). With FMA: - - vmovsd xmm1,QWORD PTR [rdi] # 1 read - vmulsd xmm0,xmm1,xmm1 # 1 FLOP+register shuffling - vmulsd xmm0,xmm0,xmm1 # 127x 1 FLOP+register shuffling - # [...] - vmovsd QWORD PTR [rdi-0x8],xmm0 # 1 write - */ - #pragma omp parallel for for(size_t i=0; i AI = 128/16 = 8 - */ a[i] = REP100(a[i]*) REP20(a[i]*) REP8(a[i]*) @@ -92,88 +50,30 @@ void kernel_8_1_simple(double* a, double* b, double* c, size_t size) void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size) { - /* - With FMA (and -O2): - - vmovsd xmm0,QWORD PTR [rdi] # 1 read - vfmadd132sd xmm0,xmm0,xmm0 # 64 x 2 FLOPs+register shuffling - vmovsd QWORD PTR [rdi-0x8],xmm0 # 1 write - - Uses packed doubles with -Ofast. - */ - #pragma omp parallel for for(size_t i=0; i AI = 8 - */ REP60(a[i] = a[i] * a[i] + a[i];) REP4(a[i] = a[i] * a[i] + a[i];) } } -/* === FAILED KERNELS === */ - -/* - These are theoretically correct kernels but all of them yield - dangerous results with gcc 5.3.1 (checked the assembly). -*/ - void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size) { - /* === Problem === - As soon as volatile is used gcc uses the stack for tmp. - Even if "register" is in place. Resulting in one additional write per loop. - Omitting volatile results in optimizing away the whole loop - (checked at -O2, which is necessary for FMA to eventually step in). - Maybe the value stays in cache, maybe not. It does not live a register. - - Even with -O3: - movsd xmm0,QWORD PTR [rdi+rax*8] # 1 read - mulsd xmm0,QWORD PTR [rsi+rax*8] # 1 read (+ write to xmm0, not counted) - # [...] 
# instructions for loop - movsd QWORD PTR [rsp-0x8],xmm0 # malicious write - - Without volatile (-O3): - repz ret # that's it - */ - - - // volatile to prevent compiler from optimizing this away - // register to advise compiler to put this in register - double tmp = 0.1; + register volatile double tmp = 0.1; #pragma omp parallel for for(size_t i=0; i AI = 1/16 - */ tmp = a[i] * b[i]; } } void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size) { - /* === Problem == - Same as for kernel_1_16_simple_dangerous - */ - - // volatile to prevent compiler from optimizing this away - // register to advise compiler to put this in register - volatile register double tmp = 0.1; + register volatile double tmp = 0.1; #pragma omp parallel for for(size_t i=0; i AI = 8 - */ tmp = a[i] * a[i] * a[i] * a[i] * a[i] * a[i] * a[i] * a[i]; } @@ -183,18 +83,55 @@ void kernel_1_8_vo_dangerous(double* a, double* b, double* c, size_t size) { /* This is the 1/8 AI kernel from the lecture - === Problem == - Same as for kernel_1_16_simple_dangerous - - Without volatile the loop is optimized away completely. - With volatile tmp is written to the stack in every loop - (-O3). tmp could be cached or not. This might depend on - how large the array is and how the cpu work internally - -> unpredictable. 
*/ - volatile double tmp=0.0; + register volatile double tmp=0.0; + for(size_t i=0; i 0) + (void)fprintf(stderr, "%s: %s \n", prog_name, msgbuf); + + } + + if(errno != 0) + (void)fprintf(stderr, "%s: %s\n", prog_name, strerror(errno)); + + exit(EXIT_FAILURE); } diff --git a/roofline/src/aikern.h b/roofline/src/aikern.h index 92193a1..b0a6ff8 100644 --- a/roofline/src/aikern.h +++ b/roofline/src/aikern.h @@ -1,8 +1,224 @@ +#ifndef AIKERN_H +#define AIKERN_H + +/** + * @brief A simple 1/16 operational intensity kernel + * @param a An array with double values of size param size + * @param b An array with double values of size param size + * @param c An array with double values of size param size + * @param size Size of the three param arrays + * + * === Warning === + * Don't use with -O0: Stores everything on stack + * + * === Description === + * Uses a simple floating point operation: a[i] = a[i] * a[i]; + * + * Runs in a parallelized for loop. + * + * === Analysis === + * COMM: 1 read (8 byte), 1 write = 16 bytes + * COMP: 1 FLOP + * --------- + * OI: 1/16 + * + * === Optimization === + * Nothing special + * + */ void kernel_1_16_simple(double* a, double* b, double* c, size_t size); + + + +/** + * @brief A 1/16 operational intensity kernel utilizing FMA + * @param a An array with double values of size param size + * @param b An array with double values of size param size + * @param c An array with double values of size param size + * @param size Size of the three param arrays + * + * === Warning === + * This is dangerous if FMA is not used/can't be used. Then there + * are intermediary writes (and reads) to the stack. + * + * === Description === + * Uses a triad function: a[i] = a[i] * b[i] + c[i]; in order + * to utilize the FMA unit. + * + * Runs in a parallelized for loop. 
+ * + * === Analysis === + * With gcc -O2 -mavx -mfma FMA compiles to: + * vmovsd xmm0,QWORD PTR [rdi+rax*8] # 1 read (8 byte) + * vmovsd xmm1,QWORD PTR [rdx+rax*8] # 1 read + * vfmadd132sd xmm0,xmm1,QWORD PTR [rsi+rax*8] # 2 FLOPs + 1 read + * vmovsd QWORD PTR [rdi+rax*8],xmm0 # 1 write + * -------- + * 1/16 OI + * + * === Optimization === + * For packed doubles compile with -Ofast + * + */ void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size); + + + +/** + * @brief A simple 8/1 operational intensity kernel + * @param a An array with double values of size param size + * @param b An array with double values of size param size + * @param c An array with double values of size param size + * @param size Size of the three param arrays + * + * === Warning === + * Don't use with -O0: Stores everything on stack + * + * === Description === + * Uses a simple floating point operation: a[i] = a[i] * a[i] * ...* a[i]; + * + * Runs in a parallelized for loop. + * + * === Analysis === + * With AVX and -O2 (not necessarily FMA) best results (obviously correct + * easy to read disassembly). + * + * With gcc -O2 -mavx compiles to: + * vmovsd xmm1,QWORD PTR [rdi] # 1 read + * vmulsd xmm0,xmm1,xmm1 # 1 FLOP+register shuffling + * vmulsd xmm0,xmm0,xmm1 # 127x 1 FLOP+register shuffling + * # [...] + * vmovsd QWORD PTR [rdi-0x8],xmm0 # 1 write + * -------- + * 128/16 = 8/1 OI + * + * === Optimization === + * Nothing special + */ void kernel_8_1_simple(double* a, double* b, double* c, size_t size); + +/** + * @brief A 8/1 operational intensity kernel utilizing FMA + * @param a An array with double values of size param size + * @param b An array with double values of size param size + * @param c An array with double values of size param size + * @param size Size of the three param arrays + * + * === Warning === + * This is dangerous if FMA is not used/can't be used. Then there + * are intermediary writes (and reads) to the stack. 
+ * + * === Description === + * Uses multiple triad function: a[i] = a[i] * a[i] + a[i]; in order + * to utilize the FMA unit. + * + * Runs in a parallelized for loop. + * + * === Analysis === + * With gcc -O2 -mavx -mfma FMA compiles to: + * vmovsd xmm0,QWORD PTR [rdi] # 1 read + * vfmadd132sd xmm0,xmm0,xmm0 # 64 x 2 FLOPs+register shuffling + * vmovsd QWORD PTR [rdi-0x8],xmm0 # 1 write + * -------- + * 128/16 = 8/1 OI + * + * === Optimization === + * For packed doubles compile with -Ofast + * + */ void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size); + +/******************************************** + * Kernels which potentially compile to * + * different operational intensities than * + * specified * + ********************************************/ + +/** + * @brief A 1/16 operational intensity which might compile to a flawed oi kernel + * @param a An array with double values of size param size + * @param b An array with double values of size param size + * @param c An array with double values of size param size + * @param size Size of the three param arrays + * + * === Problem === + * As soon as volatile is used gcc uses the stack for tmp. + * Even if "register" is in place. Resulting in one additional write per loop. + * Omitting volatile results in optimizing away the whole loop + * (checked at -O2, which is necessary for FMA to eventually step in). + * Maybe the value stays in cache, maybe not. It does not live a register. + * + * Even with -O3: + * movsd xmm0,QWORD PTR [rdi+rax*8] # 1 read + * mulsd xmm0,QWORD PTR [rsi+rax*8] # 1 read (+ write to xmm0, not counted) + * # [...] 
# instructions for loop + * movsd QWORD PTR [rsp-0x8],xmm0 # malicious write + * + * Without volatile (-O3): + * repz ret # that's it + */ void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size); + +/** + * @brief A 8/1 operational intensity which might compile to a flawed oi kernel + * @param a An array with double values of size param size + * @param b An array with double values of size param size + * @param c An array with double values of size param size + * @param size Size of the three param arrays + * + * === Problem == + * Same as for kernel_1_16_simple_dangerous + */ void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size); + +/** + * @brief A 1/8 operational intensity which might compile to a flawed oi kernel + * @param a An array with double values of size param size + * @param b An array with double values of size param size + * @param c An array with double values of size param size + * @param size Size of the three param arrays + * + * === Problem == + * Same as for kernel_1_16_simple_dangerous + * + * Without volatile the loop is optimized away completely. + * With volatile tmp is written to the stack in every loop + * (-O3). tmp could be cached or not. This might depend on + * how large the array is and how the cpu work internally + * -> unpredictable. 
+ */ void kernel_1_8_vo_dangerous(double* a, double* b, double* c, size_t size); + + +/**************************************** + * Helper macros for repeating things * + ****************************************/ + +#define REP0(X) +#define REP1(X) X +#define REP2(X) REP1(X) REP1(X) +#define REP3(X) REP2(X) REP1(X) +#define REP4(X) REP3(X) REP1(X) +#define REP5(X) REP4(X) REP1(X) +#define REP6(X) REP5(X) REP1(X) +#define REP7(X) REP6(X) REP1(X) +#define REP8(X) REP7(X) REP1(X) +#define REP9(X) REP8(X) REP1(X) + +#define REP10(X) REP9(X) REP1(X) +#define REP20(X) REP10(X) REP10(X) +#define REP30(X) REP20(X) REP10(X) +#define REP40(X) REP30(X) REP10(X) +#define REP50(X) REP40(X) REP10(X) +#define REP60(X) REP50(X) REP10(X) + +#define REP100(X) REP50(X) REP50(X) + +#ifdef ENDEBUG +#define DEBUG(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0) +#else +#define DEBUG(...) +#endif + +#endif /* AIKERN_H */ diff --git a/roofline/src/roofline b/roofline/src/roofline deleted file mode 100755 index e16bf7a..0000000 Binary files a/roofline/src/roofline and /dev/null differ diff --git a/roofline/src/roofline.c b/roofline/src/roofline.c index 8fe4922..77539c4 100644 --- a/roofline/src/roofline.c +++ b/roofline/src/roofline.c @@ -57,7 +57,7 @@ static int get_int(char* oparg); /** * @brief microseconds since epoch */ -static double mysecond(void); +static double pin_time(void); /** * @brief a simple test kernel with ai of 1/16 @@ -103,10 +103,8 @@ int main(int argc, char* argv[]) { size_t size = get_size(size_arg); int runs = get_int(runs_arg); - printf("Will run with array sizes of %zu\n", size); + printf("Will run with array sizes of %zu elements\n", size); printf("Will calculate min, max, avg for %d runs\n", runs); - - /* Make this volatile so that nothing is optimized away here */ double* a = malloc(sizeof(double)*(size)); double* b = malloc(sizeof(double)*(size)); double* c = malloc(sizeof(double)*(size)); @@ -114,60 +112,53 @@ int main(int argc, char* 
argv[]) { if(a==NULL || b==NULL || c == NULL) bail_out("One of the mallocs failed\n. a = %p, b=%p, c=%p", a, b, c); - printf("Allocated 3 arrays\n"); - - printf("Filling arrays with dummy values\n"); + printf("Allocated 3 arrays (3*%.2f MB = %.2f GB)\n", (sizeof(double)*(size)/1024.0/1024.0), (sizeof(double)*(size)*3/1024.0/1024.0/1024)); + printf("Filling arrays with dummy values. This will also warm the cache\n"); - /* #pragma omp parallel for + #pragma omp parallel for for (size_t j=0; j