nodes.log

This commit is contained in:
Armin Friedl 2016-06-24 22:48:12 +02:00
parent 9cdac30176
commit dc9ce4c306
16 changed files with 1217 additions and 56 deletions

1080
reduce/nodes.log Normal file

File diff suppressed because it is too large Load diff

View file

@ -113,6 +113,39 @@ for(size_t i=0; i<size; i++){
\end{lstlisting}
\bigskip
\subsubsection{Some Further 8/1 Kernels}
Since some effort was put into getting results near peak performance, \verb|-Ofast -ffast-math| was used to stretch compiler optimization to the maximum. Unfortunately, \verb|-ffast-math| does not preserve strict IEEE compliance; the compiler is therefore allowed to ignore the non-associativity of floating-point operations. For example, $x = x*x*x*x*x*x*x*x$ can be optimized to $x~*=~x; x~*=~x; x~*=~x;$, which needs only three multiplications instead of seven. Clearly this has an effect on the OI of the kernel. To test fastmath, the kernel in~\prettyref{lst:8-1-fma-fastmath} was introduced. Mind that \verb|a[i]| is written out only once and held in registers during a single iteration.
\bigskip
\begin{lstlisting}[caption={FMA aware $8$ OI kernel with fastmath correctness}, label=lst:8-1-fma-fastmath]
(*\textcolor{Orchid}{\#pragma omp parallel for}*)
for(size_t i=0; i<size; i++){
REP100(a[i]=a[i]*a[i];);
REP20(a[i]=a[i]*a[i];);
REP8(a[i]=a[i]*a[i];);
}
\end{lstlisting}
\bigskip
Since the results were still not satisfying, another 8/1 OI kernel, which makes use of handcrafted compiler intrinsics, was introduced too. This kernel makes full use of the 256-bit packed-double fused multiply-add floating-point operation that the FMA unit of the processor provides. The kernel can be seen in~\prettyref{lst:8-1-intrinsics}. At least in theory, this should yield peak performance. The disassembly under full optimization (options can be seen in the \verb|Makefile|) behaves very much like handwritten assembly.
\bigskip
\begin{lstlisting}[caption={FMA aware $8$ OI kernel with intrinsics}, label=lst:8-1-intrinsics]
(*\textcolor{Orchid}{\#pragma omp parallel for}*)
for(size_t i=0; i<(size-4); i+=4){
// pack doubles
__m256d packvec = _mm256_set_pd(a[i], a[i+1], a[i+2], a[i+3]);
REP60(packvec = _mm256_fmadd_pd(packvec, packvec, packvec););
REP4(packvec = _mm256_fmadd_pd(packvec, packvec, packvec););
a[i] = packvec[0];
a[i+1] = packvec[1];
a[i+2] = packvec[2];
a[i+3] = packvec[3];
}
\end{lstlisting}
\bigskip
%%% Local Variables:
%%% mode: latex

View file

@ -0,0 +1,24 @@
The best results for the various kernels are given in~\prettyref{tbl:res-kernels}. The optimized binary \verb|roofline_full_manpack| was used for these results. This is the binary with all optimizations and the intrinsics kernel enabled. The following parameters were used: \verb|roofline_full_manpack -s 150000000 -r 5|. One double array was therefore 1144.41 MiB in size -- clearly too big for the cache.
\begin{table}[h!]
\centering
\begin{tabular}{ll}
\toprule
Kernel & Max. GFLOP/s \\
\midrule
simple16 & 0.9919 \\
fma16 & 0.9891 \\
simple8 & 123.4004 \\
simple8fastmath & 8.7187 \\
fma8 & 21.7866 \\
fma8manpack & 18.9066 \\
\bottomrule
\end{tabular}
\caption{Results for various kernels}
\label{tbl:res-kernels}
\end{table}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../report"
%%% End:

View file

@ -68,6 +68,11 @@
\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {6}Simple $8$ OI kernel}{7}{lstlisting.6}}
\newlabel{lst:8-1-fma}{{7}{7}{FMA aware $8$ OI kernel}{lstlisting.7}{}}
\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {7}FMA aware $8$ OI kernel}{7}{lstlisting.7}}
\newlabel{LastPage}{{}{8}{}{page.8}{}}
\xdef\lastpage@lastpage{8}
\xdef\lastpage@lastpageHy{8}
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.3.1}Some Further 8/1 Kernel}{8}{subsubsection.3.3.1}}
\newlabel{lst:8-1-fma-fastmath}{{8}{8}{FMA aware $8$ OI kernel with fastmath correctness}{lstlisting.8}{}}
\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {8}FMA aware $8$ OI kernel with fastmath correctness}{8}{lstlisting.8}}
\newlabel{lst:8-1-intrinsics}{{9}{8}{FMA aware $8$ OI kernel with intrinsics}{lstlisting.9}{}}
\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {9}FMA aware $8$ OI kernel with intrinsics}{8}{lstlisting.9}}
\newlabel{LastPage}{{}{9}{}{page.9}{}}
\xdef\lastpage@lastpage{9}
\xdef\lastpage@lastpageHy{9}

View file

@ -1,11 +1,11 @@
# Fdb version 3
["biber report"] 1466704438 "report.bcf" "report.bbl" "report" 1466711144
"report.bcf" 1466711144 92382 2683b542d57d2326e3b37a6a44222b52 ""
["biber report"] 1466704438 "report.bcf" "report.bbl" "report" 1466801023
"report.bcf" 1466799693 92382 2683b542d57d2326e3b37a6a44222b52 ""
"roofline.bib" 1466704433 4157 226e47c750579a202f66b6f0e4df67bb ""
(generated)
"report.bbl"
"report.blg"
["pdflatex"] 1466711143 "report.tex" "report.pdf" "report" 1466711144
["pdflatex"] 1466799692 "report.tex" "report.pdf" "report" 1466801023
"/usr/share/texlive/texmf-dist/fonts/enc/dvips/cm-super/cm-super-t1.enc" 1136849721 2971 def0b6c1f0b107b3b936def894055589 ""
"/usr/share/texlive/texmf-dist/fonts/enc/dvips/cm-super/cm-super-ts1.enc" 1136849721 2900 1537cc8184ad1792082cd229ecc269f4 ""
"/usr/share/texlive/texmf-dist/fonts/map/fontname/texfonts.map" 1272929888 3287 e6b82fe08f5336d4d5ebc73fb1152e87 ""
@ -196,22 +196,22 @@
"/usr/share/texlive/texmf-dist/web2c/texmf.cnf" 1455657841 31706 2be2b4306fae7fc20493e3b90c2ad04d ""
"/usr/share/texlive/texmf-var/web2c/pdftex/pdflatex.fmt" 1457104667 3492982 6abaa3262ef9227a797168d32888676c ""
"inputs/introduction.tex" 1466184626 76 eaf0f76fa74815989416f6f6d1c36f8b ""
"inputs/kernels.tex" 1466711142 10273 94bc8e1ce2e538a2a1c74426512dcc37 ""
"inputs/kernels.tex" 1466800173 12285 7459a5d3d19f8cfbe2ace9512c674169 ""
"inputs/roofline.tex" 1466710567 5525 b96d99208485f5095cd10d50a150dff7 ""
"report.aux" 1466711144 6200 b98bcd77a3a008a4d0f92ab6f355b22c ""
"report.aux" 1466799693 6920 efd026f088aa74618447caae8f088925 ""
"report.bbl" 1466704439 7655 4b5f697a70789470cde9f922b6440ee7 "biber report"
"report.out" 1466711144 566 365a3bdfdb786abd7e70ca003f732afb ""
"report.run.xml" 1466711144 2317 80d7743117fafc51b1e42b536d793f68 ""
"report.out" 1466799693 649 906e25252ab8cb90aead774c66de15bf ""
"report.run.xml" 1466799693 2317 80d7743117fafc51b1e42b536d793f68 ""
"report.tex" 1466709836 4497 1f64f8ce17913e2b9dd71c7d6e896da8 ""
"report.toc" 1466711144 1210 9050233c7a77a885db53f60f534c1c7a ""
"report.toc" 1466799693 1343 b579331b0ae5f9f743ca0ceca6f78889 ""
"res/rooftop-eps-converted-to.pdf" 1466670002 22114 f6f2c1d53d8b6a5f4042e202648c7b36 ""
"res/rooftop.eps" 1466669975 36013 2a6358f72820d80a6e87ee15e92d5669 ""
(generated)
"report.toc"
"report.run.xml"
"report.bcf"
"report.log"
"report-blx.bib"
"report.pdf"
"report.aux"
"report.log"
"report.out"
"report.aux"
"report.bcf"
"report.run.xml"
"report.toc"
"report.pdf"

View file

@ -1,4 +1,4 @@
This is pdfTeX, Version 3.14159265-2.6-1.40.15 (TeX Live 2014) (preloaded format=pdflatex 2016.3.4) 23 JUN 2016 21:45
This is pdfTeX, Version 3.14159265-2.6-1.40.15 (TeX Live 2014) (preloaded format=pdflatex 2016.3.4) 24 JUN 2016 22:21
entering extended mode
restricted \write18 enabled.
%&-line parsing enabled.
@ -1359,7 +1359,7 @@ to.pdf res/rooftop.eps>
(epstopdf) \includegraphics on input line 70.
Package epstopdf Info: Output file is already uptodate.
<res/rooftop-eps-converted-to.pdf, id=98, 587.19376pt x 442.65375pt>
<res/rooftop-eps-converted-to.pdf, id=103, 587.19376pt x 442.65375pt>
File: res/rooftop-eps-converted-to.pdf Graphic file (type pdf)
<use res/rooftop-eps-converted-to.pdf>
@ -1379,32 +1379,32 @@ Package hyperref Warning: Token not allowed in a PDF string (PDFDocEncoding):
Package hyperref Warning: Token not allowed in a PDF string (PDFDocEncoding):
(hyperref) removing `math shift' on input line 14.
[5] [6]) [7]
[5] [6] [7])
Overfull \hbox (19.7725pt too wide) in paragraph at lines 116--116
\T1/cmtt/m/n/10.95 blob / e5aa9ca4a77623ff6f1c2d5daa7995565b944506 / stream . c
# L286$[][] \T1/cmr/m/n/10.95 (-20) (vis-ited on 06/20/2016).
[]
AED: lastpage setting LastPage
[8]
AED: lastpage setting LastPage
[9]
Package atveryend Info: Empty hook `BeforeClearDocument' on input line 117.
Package atveryend Info: Empty hook `AfterLastShipout' on input line 117.
(./report.aux)
Package atveryend Info: Executing hook `AtVeryEndDocument' on input line 117.
Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 117.
Package rerunfilecheck Info: File `report.out' has not changed.
(rerunfilecheck) Checksum: 365A3BDFDB786ABD7E70CA003F732AFB;566.
(rerunfilecheck) Checksum: 906E25252AB8CB90AEAD774C66DE15BF;649.
Package logreq Info: Writing requests to 'report.run.xml'.
\openout1 = `report.run.xml'.
Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 117.
)
Here is how much of TeX's memory you used:
21442 strings out of 493339
338775 string characters out of 6141383
879402 words of memory out of 5000000
24309 multiletter control sequences out of 15000+600000
21477 strings out of 493339
339286 string characters out of 6141383
879545 words of memory out of 5000000
24321 multiletter control sequences out of 15000+600000
30053 words of font info for 136 fonts, out of 8000000 for 9000
953 hyphenation exceptions out of 8191
48i,8n,76p,1001b,1880s stack positions out of 5000i,500n,10000p,200000b,80000s
@ -1427,10 +1427,10 @@ t/fonts/type1/public/cm-super/sfrm1440.pfb></usr/share/texlive/texmf-dist/fonts
/type1/public/cm-super/sfti0900.pfb></usr/share/texlive/texmf-dist/fonts/type1/
public/cm-super/sfti1095.pfb></usr/share/texlive/texmf-dist/fonts/type1/public/
cm-super/sftt1095.pfb>
Output written on report.pdf (8 pages, 328183 bytes).
Output written on report.pdf (9 pages, 336309 bytes).
PDF statistics:
353 PDF objects out of 1000 (max. 8388607)
278 compressed objects within 3 object streams
81 named destinations out of 1000 (max. 500000)
26190 words of extra memory for PDF output out of 29859 (max. 10000000)
390 PDF objects out of 1000 (max. 8388607)
313 compressed objects within 4 object streams
104 named destinations out of 1000 (max. 500000)
26198 words of extra memory for PDF output out of 29859 (max. 10000000)

Binary file not shown.

View file

@ -19,3 +19,5 @@
\contentsline {subsection}{\numberline {3.2}The 1/16 OI Kernel}{6}{subsection.3.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.3}The 8 OI Kernel}{6}{subsection.3.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{\numberline {3.3.1}Some Further 8/1 Kernel}{8}{subsubsection.3.3.1}

View file

@ -1,6 +1,18 @@
all: clean bin lib
# Roofline Binary
## This is the least demanding target, use it if nothing else works
nofancy: roofline roofline_o3
mkdir bin
mv $^ bin
## Your processor needs an FMA unit for this target to work
fmacap: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3
mkdir bin
mv $^ bin
## This will compile just everything
bin: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3 roofline_full roofline_profile roofline_full_clang roofline_full_manpack
mkdir bin
mv $^ bin

View file

@ -1,5 +1,5 @@
run,start,end,delta,GFLOP/s
1,1466764717.8266,1466764718.1751,0.3486,0.8606
2,1466764718.1751,1466764718.5235,0.3484,0.8611
3,1466764718.5235,1466764718.8726,0.3491,0.8593
4,1466764718.8726,1466764719.2248,0.3522,0.8518
1,1466800735.6110,1466800735.9143,0.3033,0.9891
2,1466800735.9143,1466800736.2212,0.3069,0.9776
3,1466800736.2212,1466800736.5252,0.3040,0.9868
4,1466800736.5252,1466800736.8295,0.3043,0.9858

View file

@ -1,5 +1,5 @@
run,start,end,delta,GFLOP/s
1,1466764721.0839,1466764721.9589,0.8750,21.9434
2,1466764721.9589,1466764722.8340,0.8752,21.9383
3,1466764722.8340,1466764723.7090,0.8749,21.9451
4,1466764723.7090,1466764724.5784,0.8694,22.0840
1,1466800738.5621,1466800739.4551,0.8930,21.5008
2,1466800739.4551,1466800740.3364,0.8813,21.7866
3,1466800740.3364,1466800741.2678,0.9314,20.6144
4,1466800741.2678,1466800742.1560,0.8882,21.6173

View file

@ -1,5 +1,5 @@
run,start,end,delta,GFLOP/s
1,1466764736.6256,1466764737.6487,1.0231,18.7665
2,1466764737.6487,1466764738.6642,1.0155,18.9073
3,1466764738.6642,1466764739.6867,1.0225,18.7770
4,1466764739.6867,1466764740.7045,1.0178,18.8651
1,1466800754.3877,1466800755.4812,1.0935,17.5586
2,1466800755.4812,1466800756.4967,1.0155,18.9066
3,1466800756.4967,1466800757.5917,1.0949,17.5351
4,1466800757.5917,1466800758.6499,1.0582,18.1442

View file

@ -1,5 +1,5 @@
run,start,end,delta,GFLOP/s
1,1466764716.7465,1466764716.9269,0.1804,0.8314
2,1466764716.9269,1466764717.1069,0.1800,0.8334
3,1466764717.1069,1466764717.2871,0.1801,0.8327
4,1466764717.2871,1466764717.4767,0.1897,0.7908
1,1466800734.6958,1466800734.8498,0.1540,0.9743
2,1466800734.8498,1466800735.0010,0.1512,0.9919
3,1466800735.0010,1466800735.1535,0.1525,0.9835
4,1466800735.1535,1466800735.3052,0.1517,0.9890

View file

@ -1,5 +1,5 @@
run,start,end,delta,GFLOP/s
1,1466764719.4042,1466764719.5845,0.1803,106.5045
2,1466764719.5845,1466764719.7707,0.1861,103.1532
3,1466764719.7707,1466764719.9487,0.1780,107.8675
4,1466764719.9487,1466764720.1264,0.1777,108.0261
1,1466800736.9802,1466800737.1358,0.1556,123.4004
2,1466800737.1358,1466800737.2968,0.1609,119.3029
3,1466800737.2968,1466800737.4573,0.1605,119.6037
4,1466800737.4573,1466800737.6192,0.1619,118.5706

View file

@ -1,5 +1,5 @@
run,start,end,delta,GFLOP/s
1,1466764726.7241,1466764728.9906,2.2664,8.4714
2,1466764728.9906,1466764731.1306,2.1400,8.9720
3,1466764731.1306,1466764733.3797,2.2491,8.5366
4,1466764733.3797,1466764735.6046,2.2249,8.6298
1,1466800744.3973,1466800746.6229,2.2256,8.6270
2,1466800746.6229,1466800748.8499,2.2270,8.6216
3,1466800748.8499,1466800751.1237,2.2738,8.4440
4,1466800751.1237,1466800753.3258,2.2022,8.7187

View file

@ -169,6 +169,10 @@ int main(int argc, char* argv[]) {
#endif
printf("\n\n\n");
printf("Please refer to the log files in the log/ folder for details about the GFLOP/s of every kernel.");
printf("\n");
printf("Exiting...");
exit(EXIT_SUCCESS);
}
@ -238,6 +242,7 @@ static int get_int(char *oparg)
/*
 * Print a short usage synopsis and an example invocation to stderr, then
 * hand over to bail_out() with an error message.
 *
 * NOTE(review): bail_out() is defined elsewhere in this file; presumably it
 * reports the message and terminates the program -- confirm.
 */
static void usage(void)
{
    fprintf(stderr, "USAGE: ./roofline -s <size> -r <runs> \n");
    fprintf(stderr, "e.g.: ./roofline -s 100000 -r 5 \n");
    /* Fixed typo in the user-facing message ("paramers" -> "parameters"). */
    bail_out("Invalid parameters");
}