nodes.log
This commit is contained in:
parent
9cdac30176
commit
dc9ce4c306
16 changed files with 1217 additions and 56 deletions
1080
reduce/nodes.log
Normal file
1080
reduce/nodes.log
Normal file
File diff suppressed because it is too large
Load diff
|
@ -113,6 +113,39 @@ for(size_t i=0; i<size; i++){
|
|||
\end{lstlisting}
|
||||
\bigskip
|
||||
|
||||
\subsubsection{Some Further 8/1 Kernel}
|
||||
Since some effort was put into getting results near peak performance, \verb|-Ofast -ffast-math| was used to stretch compiler optimization to the maximum. Unfortunately, \verb|-ffast-math| does not preserve strict IEEE compliance; the compiler is therefore allowed to ignore the non-associativity of floating-point operations. For example, $x = x*x*x*x*x*x*x*x$ can be optimized to $x~*=~x; x~*=~x; x~*=~x;$. Clearly, this has an effect on the OI of the kernel. To test fastmath, the kernel in~\prettyref{lst:8-1-fma-fastmath} was introduced. Note that \verb|a[i]| is written out only once and held in registers during a single iteration.
|
||||
|
||||
\bigskip
|
||||
\begin{lstlisting}[caption={FMA aware $8$ OI kernel with fastmath correctness}, label=lst:8-1-fma-fastmath]
|
||||
(*\textcolor{Orchid}{\#pragma omp parallel for}*)
|
||||
for(size_t i=0; i<size; i++){
|
||||
REP100(a[i]=a[i]*a[i];);
|
||||
REP20(a[i]=a[i]*a[i];);
|
||||
REP8(a[i]=a[i]*a[i];);
|
||||
}
|
||||
\end{lstlisting}
|
||||
\bigskip
|
||||
|
||||
Since the results were still not satisfying, another 8/1 OI kernel, which makes use of handcrafted compiler intrinsics, was introduced as well. This kernel makes full use of the 256-bit packed-double fused multiply-add floating-point operation that the FMA unit of the processor provides. The kernel can be seen in~\prettyref{lst:8-1-intrinsics}. At least in theory, this should yield peak performance. The disassembly under full optimization (the options can be seen in the \verb|Makefile|) behaves very much like handwritten assembly.
|
||||
|
||||
\bigskip
|
||||
\begin{lstlisting}[caption={FMA aware $8$ OI kernel with intrinsics}, label=lst:8-1-intrinsics]
|
||||
(*\textcolor{Orchid}{\#pragma omp parallel for}*)
|
||||
for(size_t i=0; i<(size-4); i+=4){
|
||||
// pack doubles
|
||||
__m256d packvec = _mm256_set_pd(a[i], a[i+1], a[i+2], a[i+3]);
|
||||
|
||||
REP60(packvec = _mm256_fmadd_pd(packvec, packvec, packvec););
|
||||
REP4(packvec = _mm256_fmadd_pd(packvec, packvec, packvec););
|
||||
|
||||
a[i] = packvec[0];
|
||||
a[i+1] = packvec[1];
|
||||
a[i+2] = packvec[2];
|
||||
a[i+3] = packvec[3];
|
||||
}
|
||||
\end{lstlisting}
|
||||
\bigskip
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
|
|
24
roofline/report/inputs/results.tex
Normal file
24
roofline/report/inputs/results.tex
Normal file
|
@ -0,0 +1,24 @@
|
|||
The best results for the various kernels are given in~\prettyref{tbl:res-kernels}. The binary \verb|roofline_full_manpack| was used for these results; it is the binary with all optimizations and the intrinsics kernel enabled. The following parameters were used: \verb|roofline_full_manpack -s 150000000 -r 5|. Each double array was therefore 1144.41 MiB in size -- clearly too big for the cache.
|
||||
|
||||
\begin{table}[h!]
|
||||
\centering
|
||||
\begin{tabular}{ll}
|
||||
\toprule
|
||||
Kernel & Max. GFLOP/s \\
|
||||
\midrule
|
||||
simple16 & 0.9919 \\
|
||||
fma16 & 0.9891 \\
|
||||
simple8 & 123.4004 \\
|
||||
simple8fastmath & 8.7187 \\
|
||||
fma8 & 21.7866 \\
|
||||
fma8manpack & 18.9066 \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\caption{Results for various kernels}
|
||||
\label{tbl:res-kernels}
|
||||
\end{table}
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "../report"
|
||||
%%% End:
|
|
@ -68,6 +68,11 @@
|
|||
\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {6}Simple $8$ OI kernel}{7}{lstlisting.6}}
|
||||
\newlabel{lst:8-1-fma}{{7}{7}{FMA aware $8$ OI kernel}{lstlisting.7}{}}
|
||||
\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {7}FMA aware $8$ OI kernel}{7}{lstlisting.7}}
|
||||
\newlabel{LastPage}{{}{8}{}{page.8}{}}
|
||||
\xdef\lastpage@lastpage{8}
|
||||
\xdef\lastpage@lastpageHy{8}
|
||||
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.3.1}Some Further 8/1 Kernel}{8}{subsubsection.3.3.1}}
|
||||
\newlabel{lst:8-1-fma-fastmath}{{8}{8}{FMA aware $8$ OI kernel with fastmath correctness}{lstlisting.8}{}}
|
||||
\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {8}FMA aware $8$ OI kernel with fastmath correctness}{8}{lstlisting.8}}
|
||||
\newlabel{lst:8-1-intrinsics}{{9}{8}{FMA aware $8$ OI kernel with intrinsics}{lstlisting.9}{}}
|
||||
\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {9}FMA aware $8$ OI kernel with intrinsics}{8}{lstlisting.9}}
|
||||
\newlabel{LastPage}{{}{9}{}{page.9}{}}
|
||||
\xdef\lastpage@lastpage{9}
|
||||
\xdef\lastpage@lastpageHy{9}
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
# Fdb version 3
|
||||
["biber report"] 1466704438 "report.bcf" "report.bbl" "report" 1466711144
|
||||
"report.bcf" 1466711144 92382 2683b542d57d2326e3b37a6a44222b52 ""
|
||||
["biber report"] 1466704438 "report.bcf" "report.bbl" "report" 1466801023
|
||||
"report.bcf" 1466799693 92382 2683b542d57d2326e3b37a6a44222b52 ""
|
||||
"roofline.bib" 1466704433 4157 226e47c750579a202f66b6f0e4df67bb ""
|
||||
(generated)
|
||||
"report.bbl"
|
||||
"report.blg"
|
||||
["pdflatex"] 1466711143 "report.tex" "report.pdf" "report" 1466711144
|
||||
["pdflatex"] 1466799692 "report.tex" "report.pdf" "report" 1466801023
|
||||
"/usr/share/texlive/texmf-dist/fonts/enc/dvips/cm-super/cm-super-t1.enc" 1136849721 2971 def0b6c1f0b107b3b936def894055589 ""
|
||||
"/usr/share/texlive/texmf-dist/fonts/enc/dvips/cm-super/cm-super-ts1.enc" 1136849721 2900 1537cc8184ad1792082cd229ecc269f4 ""
|
||||
"/usr/share/texlive/texmf-dist/fonts/map/fontname/texfonts.map" 1272929888 3287 e6b82fe08f5336d4d5ebc73fb1152e87 ""
|
||||
|
@ -196,22 +196,22 @@
|
|||
"/usr/share/texlive/texmf-dist/web2c/texmf.cnf" 1455657841 31706 2be2b4306fae7fc20493e3b90c2ad04d ""
|
||||
"/usr/share/texlive/texmf-var/web2c/pdftex/pdflatex.fmt" 1457104667 3492982 6abaa3262ef9227a797168d32888676c ""
|
||||
"inputs/introduction.tex" 1466184626 76 eaf0f76fa74815989416f6f6d1c36f8b ""
|
||||
"inputs/kernels.tex" 1466711142 10273 94bc8e1ce2e538a2a1c74426512dcc37 ""
|
||||
"inputs/kernels.tex" 1466800173 12285 7459a5d3d19f8cfbe2ace9512c674169 ""
|
||||
"inputs/roofline.tex" 1466710567 5525 b96d99208485f5095cd10d50a150dff7 ""
|
||||
"report.aux" 1466711144 6200 b98bcd77a3a008a4d0f92ab6f355b22c ""
|
||||
"report.aux" 1466799693 6920 efd026f088aa74618447caae8f088925 ""
|
||||
"report.bbl" 1466704439 7655 4b5f697a70789470cde9f922b6440ee7 "biber report"
|
||||
"report.out" 1466711144 566 365a3bdfdb786abd7e70ca003f732afb ""
|
||||
"report.run.xml" 1466711144 2317 80d7743117fafc51b1e42b536d793f68 ""
|
||||
"report.out" 1466799693 649 906e25252ab8cb90aead774c66de15bf ""
|
||||
"report.run.xml" 1466799693 2317 80d7743117fafc51b1e42b536d793f68 ""
|
||||
"report.tex" 1466709836 4497 1f64f8ce17913e2b9dd71c7d6e896da8 ""
|
||||
"report.toc" 1466711144 1210 9050233c7a77a885db53f60f534c1c7a ""
|
||||
"report.toc" 1466799693 1343 b579331b0ae5f9f743ca0ceca6f78889 ""
|
||||
"res/rooftop-eps-converted-to.pdf" 1466670002 22114 f6f2c1d53d8b6a5f4042e202648c7b36 ""
|
||||
"res/rooftop.eps" 1466669975 36013 2a6358f72820d80a6e87ee15e92d5669 ""
|
||||
(generated)
|
||||
"report.toc"
|
||||
"report.run.xml"
|
||||
"report.bcf"
|
||||
"report.log"
|
||||
"report-blx.bib"
|
||||
"report.pdf"
|
||||
"report.aux"
|
||||
"report.log"
|
||||
"report.out"
|
||||
"report.aux"
|
||||
"report.bcf"
|
||||
"report.run.xml"
|
||||
"report.toc"
|
||||
"report.pdf"
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
This is pdfTeX, Version 3.14159265-2.6-1.40.15 (TeX Live 2014) (preloaded format=pdflatex 2016.3.4) 23 JUN 2016 21:45
|
||||
This is pdfTeX, Version 3.14159265-2.6-1.40.15 (TeX Live 2014) (preloaded format=pdflatex 2016.3.4) 24 JUN 2016 22:21
|
||||
entering extended mode
|
||||
restricted \write18 enabled.
|
||||
%&-line parsing enabled.
|
||||
|
@ -1359,7 +1359,7 @@ to.pdf res/rooftop.eps>
|
|||
(epstopdf) \includegraphics on input line 70.
|
||||
Package epstopdf Info: Output file is already uptodate.
|
||||
|
||||
<res/rooftop-eps-converted-to.pdf, id=98, 587.19376pt x 442.65375pt>
|
||||
<res/rooftop-eps-converted-to.pdf, id=103, 587.19376pt x 442.65375pt>
|
||||
File: res/rooftop-eps-converted-to.pdf Graphic file (type pdf)
|
||||
|
||||
<use res/rooftop-eps-converted-to.pdf>
|
||||
|
@ -1379,32 +1379,32 @@ Package hyperref Warning: Token not allowed in a PDF string (PDFDocEncoding):
|
|||
Package hyperref Warning: Token not allowed in a PDF string (PDFDocEncoding):
|
||||
(hyperref) removing `math shift' on input line 14.
|
||||
|
||||
[5] [6]) [7]
|
||||
[5] [6] [7])
|
||||
Overfull \hbox (19.7725pt too wide) in paragraph at lines 116--116
|
||||
\T1/cmtt/m/n/10.95 blob / e5aa9ca4a77623ff6f1c2d5daa7995565b944506 / stream . c
|
||||
# L286$[][] \T1/cmr/m/n/10.95 (-20) (vis-ited on 06/20/2016).
|
||||
[]
|
||||
|
||||
|
||||
[8]
|
||||
AED: lastpage setting LastPage
|
||||
[8]
|
||||
[9]
|
||||
Package atveryend Info: Empty hook `BeforeClearDocument' on input line 117.
|
||||
Package atveryend Info: Empty hook `AfterLastShipout' on input line 117.
|
||||
(./report.aux)
|
||||
Package atveryend Info: Executing hook `AtVeryEndDocument' on input line 117.
|
||||
Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 117.
|
||||
Package rerunfilecheck Info: File `report.out' has not changed.
|
||||
(rerunfilecheck) Checksum: 365A3BDFDB786ABD7E70CA003F732AFB;566.
|
||||
(rerunfilecheck) Checksum: 906E25252AB8CB90AEAD774C66DE15BF;649.
|
||||
Package logreq Info: Writing requests to 'report.run.xml'.
|
||||
\openout1 = `report.run.xml'.
|
||||
|
||||
Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 117.
|
||||
)
|
||||
Here is how much of TeX's memory you used:
|
||||
21442 strings out of 493339
|
||||
338775 string characters out of 6141383
|
||||
879402 words of memory out of 5000000
|
||||
24309 multiletter control sequences out of 15000+600000
|
||||
21477 strings out of 493339
|
||||
339286 string characters out of 6141383
|
||||
879545 words of memory out of 5000000
|
||||
24321 multiletter control sequences out of 15000+600000
|
||||
30053 words of font info for 136 fonts, out of 8000000 for 9000
|
||||
953 hyphenation exceptions out of 8191
|
||||
48i,8n,76p,1001b,1880s stack positions out of 5000i,500n,10000p,200000b,80000s
|
||||
|
@ -1427,10 +1427,10 @@ t/fonts/type1/public/cm-super/sfrm1440.pfb></usr/share/texlive/texmf-dist/fonts
|
|||
/type1/public/cm-super/sfti0900.pfb></usr/share/texlive/texmf-dist/fonts/type1/
|
||||
public/cm-super/sfti1095.pfb></usr/share/texlive/texmf-dist/fonts/type1/public/
|
||||
cm-super/sftt1095.pfb>
|
||||
Output written on report.pdf (8 pages, 328183 bytes).
|
||||
Output written on report.pdf (9 pages, 336309 bytes).
|
||||
PDF statistics:
|
||||
353 PDF objects out of 1000 (max. 8388607)
|
||||
278 compressed objects within 3 object streams
|
||||
81 named destinations out of 1000 (max. 500000)
|
||||
26190 words of extra memory for PDF output out of 29859 (max. 10000000)
|
||||
390 PDF objects out of 1000 (max. 8388607)
|
||||
313 compressed objects within 4 object streams
|
||||
104 named destinations out of 1000 (max. 500000)
|
||||
26198 words of extra memory for PDF output out of 29859 (max. 10000000)
|
||||
|
||||
|
|
Binary file not shown.
|
@ -19,3 +19,5 @@
|
|||
\contentsline {subsection}{\numberline {3.2}The 1/16 OI Kernel}{6}{subsection.3.2}
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {subsection}{\numberline {3.3}The 8 OI Kernel}{6}{subsection.3.3}
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {subsubsection}{\numberline {3.3.1}Some Further 8/1 Kernel}{8}{subsubsection.3.3.1}
|
||||
|
|
|
@ -1,6 +1,18 @@
|
|||
all: clean bin lib
|
||||
|
||||
# Roofline Binary
|
||||
|
||||
## This is the least demanding target, use it if nothing else works
|
||||
nofancy: roofline roofline_o3
|
||||
mkdir bin
|
||||
mv $^ bin
|
||||
|
||||
## Your processor needs an FMA unit for this target to work
|
||||
fmacap: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3
|
||||
mkdir bin
|
||||
mv $^ bin
|
||||
|
||||
## This target compiles every binary
|
||||
bin: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3 roofline_full roofline_profile roofline_full_clang roofline_full_manpack
|
||||
mkdir bin
|
||||
mv $^ bin
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
run,start,end,delta,GFLOP/s
|
||||
1,1466764717.8266,1466764718.1751,0.3486,0.8606
|
||||
2,1466764718.1751,1466764718.5235,0.3484,0.8611
|
||||
3,1466764718.5235,1466764718.8726,0.3491,0.8593
|
||||
4,1466764718.8726,1466764719.2248,0.3522,0.8518
|
||||
1,1466800735.6110,1466800735.9143,0.3033,0.9891
|
||||
2,1466800735.9143,1466800736.2212,0.3069,0.9776
|
||||
3,1466800736.2212,1466800736.5252,0.3040,0.9868
|
||||
4,1466800736.5252,1466800736.8295,0.3043,0.9858
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
run,start,end,delta,GFLOP/s
|
||||
1,1466764721.0839,1466764721.9589,0.8750,21.9434
|
||||
2,1466764721.9589,1466764722.8340,0.8752,21.9383
|
||||
3,1466764722.8340,1466764723.7090,0.8749,21.9451
|
||||
4,1466764723.7090,1466764724.5784,0.8694,22.0840
|
||||
1,1466800738.5621,1466800739.4551,0.8930,21.5008
|
||||
2,1466800739.4551,1466800740.3364,0.8813,21.7866
|
||||
3,1466800740.3364,1466800741.2678,0.9314,20.6144
|
||||
4,1466800741.2678,1466800742.1560,0.8882,21.6173
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
run,start,end,delta,GFLOP/s
|
||||
1,1466764736.6256,1466764737.6487,1.0231,18.7665
|
||||
2,1466764737.6487,1466764738.6642,1.0155,18.9073
|
||||
3,1466764738.6642,1466764739.6867,1.0225,18.7770
|
||||
4,1466764739.6867,1466764740.7045,1.0178,18.8651
|
||||
1,1466800754.3877,1466800755.4812,1.0935,17.5586
|
||||
2,1466800755.4812,1466800756.4967,1.0155,18.9066
|
||||
3,1466800756.4967,1466800757.5917,1.0949,17.5351
|
||||
4,1466800757.5917,1466800758.6499,1.0582,18.1442
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
run,start,end,delta,GFLOP/s
|
||||
1,1466764716.7465,1466764716.9269,0.1804,0.8314
|
||||
2,1466764716.9269,1466764717.1069,0.1800,0.8334
|
||||
3,1466764717.1069,1466764717.2871,0.1801,0.8327
|
||||
4,1466764717.2871,1466764717.4767,0.1897,0.7908
|
||||
1,1466800734.6958,1466800734.8498,0.1540,0.9743
|
||||
2,1466800734.8498,1466800735.0010,0.1512,0.9919
|
||||
3,1466800735.0010,1466800735.1535,0.1525,0.9835
|
||||
4,1466800735.1535,1466800735.3052,0.1517,0.9890
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
run,start,end,delta,GFLOP/s
|
||||
1,1466764719.4042,1466764719.5845,0.1803,106.5045
|
||||
2,1466764719.5845,1466764719.7707,0.1861,103.1532
|
||||
3,1466764719.7707,1466764719.9487,0.1780,107.8675
|
||||
4,1466764719.9487,1466764720.1264,0.1777,108.0261
|
||||
1,1466800736.9802,1466800737.1358,0.1556,123.4004
|
||||
2,1466800737.1358,1466800737.2968,0.1609,119.3029
|
||||
3,1466800737.2968,1466800737.4573,0.1605,119.6037
|
||||
4,1466800737.4573,1466800737.6192,0.1619,118.5706
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
run,start,end,delta,GFLOP/s
|
||||
1,1466764726.7241,1466764728.9906,2.2664,8.4714
|
||||
2,1466764728.9906,1466764731.1306,2.1400,8.9720
|
||||
3,1466764731.1306,1466764733.3797,2.2491,8.5366
|
||||
4,1466764733.3797,1466764735.6046,2.2249,8.6298
|
||||
1,1466800744.3973,1466800746.6229,2.2256,8.6270
|
||||
2,1466800746.6229,1466800748.8499,2.2270,8.6216
|
||||
3,1466800748.8499,1466800751.1237,2.2738,8.4440
|
||||
4,1466800751.1237,1466800753.3258,2.2022,8.7187
|
||||
|
|
|
@ -169,6 +169,10 @@ int main(int argc, char* argv[]) {
|
|||
#endif
|
||||
|
||||
|
||||
printf("\n\n\n");
|
||||
printf("Please refer to the log files in the log/ folder for details about the GFLOP/s of every kernel.");
|
||||
printf("\n");
|
||||
printf("Exiting...");
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
|
@ -238,6 +242,7 @@ static int get_int(char *oparg)
|
|||
static void usage()
|
||||
{
|
||||
fprintf(stderr, "USAGE: ./roofline -s <size> -r <runs> \n");
|
||||
fprintf(stderr, "e.g.: ./roofline -s 100000 -r 5 \n");
|
||||
bail_out("Invalid paramers");
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue