According to the definition used, the arithmetic intensity is measured in operations per byte. This might not be adequate for Haswell processors (and later). Due to the fused multiply-add\footnote{Although called multiply-add, there are 36 slightly different instructions.} extension, two floating-point operations can be performed with a single instruction.
\subsection{Theoretical Peak Performance}
The CPU under test was an Intel\textregistered{} Core\texttrademark{} i5-4210U. \prettyref{tbl:spec-4210} shows the relevant specifications for this processor according to \textcite{ark4210}.
+ \centering
+ \begin{tabular}{ll}
+ \toprule
+ Specification & Value \\
+ \midrule
+ Instruction Set Extension & SSE4.1/4.2, AVX 2.0 \\
+ \# of Cores & 2 \\
+ Processor Base Frequency & 1.7 GHz \\
+ Max Turbo Frequency & 2.7 GHz \\
+ Microarchitecture & Haswell \\
+ \bottomrule
+ \end{tabular}
\caption{Intel\textregistered{} Core\texttrademark{} i5-4210U processor specifications~\cite{ark4210}}
+ \label{tbl:spec-4210}
+The 4th generation Intel Core processors provide FMA\footnote{Fused Multiply Add} and AVX\footnote{Advanced Vector Extension} extensions~\cite[5-2 Vol.1]{intel2016}. An FMA unit is capable of ``[...] 256-bit floating-point instructions to perform computation on
+256-bit vectors''~\cite[5-28 Vol.1]{intel2016}. A 256-bit vector holds 4 double-precision values, so one FMA unit can execute 2 (multiply and add) times 4 double-precision floating-point operations each cycle. This results in 8 DP FLOPs per cycle.
Unfortunately, no definitive source could be found, but according to \textcite{shimpi2012} the Haswell architecture has 2 FMA units per core, which amounts to $2 \cdot 8 = 16$ DP FLOPs per cycle per core. Furthermore, there are 2 cores in this Core i5 processor. Taken together, this results in $16 \cdot 2 = 32$ DP FLOPs per cycle for both cores.
At the maximum turbo frequency of 2.7 GHz the processor is therefore capable of a theoretical peak performance of $32 \cdot 2.7 = 86.4$ GFLOP/s.
diff --git a/roofline/report/report.tex b/roofline/report/report.tex
new file mode 100644
index 0000000..6ee5fef
--- /dev/null
+++ b/roofline/report/report.tex
@@ -0,0 +1,123 @@
+\documentclass[a4paper, DIV=12]{scrartcl}
+ linkcolor=blue,
+ urlcolor=blue,
+ breaklinks=true,
+ citecolor=blue]{hyperref}
+\newcommand\bigforall{\mbox{\Large $\mathsurround0pt\forall$}}
+\lstset{ %
+ backgroundcolor=\color{white}, % choose the background color; you must add \usepackage{color} or
+ basicstyle=\ttfamily, % the size of the fonts that are used for the code
+ breakatwhitespace=true, % sets if automatic breaks should only happen at whitespace
+ breaklines=true, % sets automatic line breaking
+ captionpos=b, % sets the caption-position to bottom
+ escapeinside={(*}{*)}, % if you want to add LaTeX within your code
+ extendedchars=true, % lets you use non-ASCII characters; for 8-bits encodings only, does not work with UTF-8
+ frame=single, % adds a frame around the code
+ keepspaces=true, % keeps spaces in text, useful for keeping indentation of code (possibly needs columns=flexible)
+ language=TeX, % the language of the code
+ numbers=left, % where to put the line-numbers; possible values are (none, left, right)
+ numbersep=5pt, % how far the line-numbers are from the code
+ numberstyle=\tiny\color{gray}, % the style that is used for the line-numbers
+ rulecolor=\color{black}, % if not set, the frame-color may be changed on line-breaks within not-black text (e.g. comments (green here))
+ showspaces=false, % show spaces everywhere adding particular underscores; it overrides 'showstringspaces'
+ showstringspaces=false, % underline spaces within strings only
+ showtabs=false, % show tabs within strings adding particular underscores
+ stepnumber=1, % the step between two line-numbers. If it's 1, each line will be numbered
+ tabsize=2, % sets default tabsize to 2 spaces
+ title=\lstname, % show the filename of files included with \lstinputlisting; also try caption instead of title
+ emph=[3]{int:,array,set,of,int,if,then,else,constraint,var,union,endif,function,where,in,div,predicate,let,opt,full,format,def,for,True,False,return,or},
+ emphstyle=[3]\color{ForestGreen},
+ emph=[2]{length,max,forall,startEmptyBuffer,fix,startEmptyBufferShow,exactly,cumulative,occurs,deopt,sum,,all},
+ emphstyle=[2]\color{blue},
+ commentstyle=\color{BrickRed},
+ stringstyle =\color{red},
+\subject{High Performance Computing}
+\subtitle{Project 3}
+\author{Johannes Winklehner\\1226104 \and Armin Friedl\\1053597}
+ A \emph{roofline model} for a multicore processor is obtained by calculating the theoretical peak performance of the processor and benchmarking the peak memory bandwidth. Two artificial computational kernels with arithmetic intensities of $\frac{1}{16}$ FLOPs/Byte and $8$ FLOPs/Byte are devised. The performance of the two kernels is then compared to the theoretical calculations in the roofline model.
+\section{Roofline Model}
+%%% Local Variables:
+%%% mode: latex
+%%% TeX-master: t
+%%% End:
diff --git a/roofline/report/roofline.bib b/roofline/report/roofline.bib
new file mode 100644
index 0000000..da75f3e
--- /dev/null
+++ b/roofline/report/roofline.bib
@@ -0,0 +1,52 @@
+% This file was created with JabRef 2.10.
+% Encoding: UTF-8
+ Title = {Intel® 64 and IA-32 Architectures Software Developer’s Manual},
+ Author = {Intel},
+ Url = {https://www-ssl.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf},
+ Year = {2016},
+ Month = {April},
+ Organization = {Intel},
+ Subtitle = {Combined Volumes: 1, 2A, 2B, 2C, 3A, 3B, 3C and 3D},
+ Owner = {armin},
+ Timestamp = {2016.06.19}
+ Title = {Intel® Core™ i5-4210U Processor Specifications},
+ Author = {{Intel Ark}},
+ Url = {http://ark.intel.com/products/81016/},
+ Urldate = {2016-06-19},
+ Owner = {armin},
+ Timestamp = {2016.06.19}
+ Title = {Haswell's Wide Execution Engine},
+ Author = {Anand Lal Shimpi},
+ Date = {2012-10-05},
+ Url = {http://www.anandtech.com/show/6355/intels-haswell-architecture/8},
+ Urldate = {2016-06-19},
+ Owner = {armin},
+ Timestamp = {2016.06.19}
+ Title = {Roofline: an insightful visual performance model for multicore architectures},
+ Author = {Williams, Samuel and Waterman, Andrew and Patterson, David},
+ Year = {2009},
+ Number = {4},
+ Pages = {65--76},
+ Volume = {52},
+ Journal = {Communications of the ACM},
+ Owner = {armin},
+ Publisher = {ACM},
+ Timestamp = {2016.06.17}
diff --git a/roofline/src/Makefile b/roofline/src/Makefile
new file mode 100644
index 0000000..6a0ad16
--- /dev/null
+++ b/roofline/src/Makefile
@@ -0,0 +1,15 @@
+all: roofline aikern.a
+roofline: roofline.c aikern.a
+ gcc -Wall -Wextra -O3 -std=c99 -fopenmp $^ -o $@
+aikern.a: aikern.c aikern.h
+ gcc -O3 -c -o aikern.o aikern.c
+ ar rcs aikern.a aikern.o
+ rm -f roofline
+ rm -f *.o
+ rm -f *.a
+ rm -f *.so
diff --git a/roofline/src/aikern.c b/roofline/src/aikern.c
new file mode 100644
index 0000000..933ea6a
--- /dev/null
+++ b/roofline/src/aikern.c
@@ -0,0 +1,62 @@
+# include
+void kernel_1_16_simple(double* a, double* b, double* c, size_t size)
+ // volatile to prevent compiler from optimizing this away
+ // register to advise compiler to put this in register
+ volatile double tmp = 0.1;
+ #pragma omp parallel for
+ for(size_t i=0; i AI = 1/16 */
+ tmp = a[i] * b[i];
+ }
+void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
+ #pragma omp parallel for
+ for(size_t i=0; i AI = 2/32 = 1/16 */
+ a[i] = a[i] * b[i] + c[i];
+ }
+void kernel_8_1_simple(double* a, double* b, double* c, size_t size)
+ // volatile to prevent compiler from optimizing this away
+ // register to advise compiler to put this in register
+ volatile register double tmp = 0.1;
+ #pragma omp parallel for
+ for(size_t i=0; i AI = 8 */
+ tmp = a[i] * a[i] * a[i] * a[i] *
+ a[i] * a[i] * a[i] * a[i];
+ }
+void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size)
+ // volatile to prevent compiler from optimizing this away
+ // register to advise compiler to put this in register
+ register volatile double tmp = 0.1;
+ #pragma omp parallel for
+ for(size_t i=0; i AI = 8 */
+ tmp = a[i];
+ tmp = tmp * tmp + tmp;
+ tmp = tmp * tmp + tmp;
+ tmp = tmp * tmp + tmp;
+ tmp = tmp * tmp + tmp;
+ }
+void kernel_1_8_vo(double* a, double* b, double* c, size_t size)
+ volatile double tmp=0.0;
+ for(size_t i=0; i
+# include
+# include
+# include
+# include
+# include
+# include
+# include
+# include
+# include
+# include
+# include "aikern.h"
+/* === Macros === */
+#ifdef ENDEBUG
+#define DEBUG(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0)
+#define DEBUG(...)
+/* === Constants === */
+/* === Global Variables === */
+char* prog_name;
+/* === Prototypes === */
+ * @brief print usage message
+ */
+static void usage(void);
+ * @brief terminate program on program error
+ * @param fmt printf-style format string for an additional message to print, or NULL for no message
+ * @param ... arguments consumed by the format string
+ */
+static void bail_out(char* fmt, ...);
+ * @brief converts the argument to size_t if possible.
+ * bails out on error.
+ * @param oparg the argument to convert
+ */
+static size_t get_size(char* oparg);
+ * @brief converts the argument to int if possible.
+ * bails out on error.
+ * @param oparg the argument to convert
+ */
+static int get_int(char* oparg);
+ * @brief wall-clock time in seconds since the epoch, as a double with microsecond resolution
+ */
+static double mysecond(void);
+ * @brief a simple test kernel with ai of 1/16
+ */
+static void testkern(double* a, double* b, double* c, size_t size);
+int main(int argc, char* argv[]) {
+ prog_name = argv[0];
+ int opt;
+ char *size_arg = NULL;
+ char *runs_arg = NULL;
+ while((opt = getopt(argc, argv, "s:r:")) != -1)
+ {
+ switch(opt)
+ {
+ case 's':
+ size_arg = optarg;
+ break;
+ case 'r':
+ runs_arg = optarg;
+ break;
+ case '?':
+ usage();
+ default:
+ usage();
+ }
+ }
+ if(optind < argc)
+ {
+ for (int index = optind; index < argc; index++)
+ bail_out ("Non-option argument %s\n", argv[index]);
+ usage();
+ }
+ if(size_arg == NULL || runs_arg == NULL)
+ usage();
+ size_t size = get_size(size_arg);
+ int runs = get_int(runs_arg);
+ printf("Will run with array sizes of %zu\n", size);
+ printf("Will calculate min, max, avg for %d runs\n", runs);
+ /* Make this volatile so that nothing is optimized away here */
+ double* a = malloc(sizeof(double)*(size));
+ double* b = malloc(sizeof(double)*(size));
+ double* c = malloc(sizeof(double)*(size));
+ if(a==NULL || b==NULL || c == NULL)
+ bail_out("One of the mallocs failed\n. a = %p, b=%p, c=%p", a, b, c);
+ printf("Allocated 3 arrays\n");
+ printf("Filling arrays with dummy values\n");
+ #pragma omp parallel for
+ for (size_t j=0; j AI = 3/(2*3*8) = 1/16 */
+ a[j] = 2.0E0 * a[j];
+ b[j] = 2.0E0 * b[j];
+ c[j] = 2.0E0 * c[j];
+ }
+/* === Helper Functions === */
+static double mysecond(void)
+ struct timeval tp;
+ int i;
+ i = gettimeofday(&tp,NULL);
+ if(i != 0)
+ bail_out("Time measurement impossible. gettimeofday error");
+ return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
+static size_t get_size(char *oparg)
+ long long int llsize = strtoll(oparg, NULL, 10);
+ if(llsize <= 0)
+ usage();
+ unsigned long long int u_llsize = (unsigned long long int) llsize;
+ if(u_llsize > SIZE_MAX)
+ bail_out("Only size between 1 to %zu allowed.", SIZE_MAX);
+ return (size_t) llsize;
+static int get_int(char *oparg)
+ long long int llsize = strtoll(oparg, NULL, 10);
+ if(llsize <= 0)
+ usage();
+ unsigned long long int u_llsize = (unsigned long long int) llsize;
+ if(u_llsize > INT_MAX)
+ bail_out("Only size between 1 to %d allowed.", INT_MAX);
+ return (int) llsize;
+/* Print the command-line synopsis to stderr, then hand over to bail_out(NULL).
+ * The synopsis names the two mandatory options accepted by getopt ("s:r:"):
+ * -s for the array size and -r for the number of runs. */
+static void usage()
+ fprintf(stderr, "USAGE: ./roofline -s <size> -r <runs>\n");
+ bail_out(NULL);
+static void bail_out(char* fmt, ...)
+ if(fmt != NULL)
+ {
+ char msgbuf[150];
+ va_list vl;
+ va_start(vl, fmt);
+ if(vsnprintf(msgbuf, sizeof(msgbuf), fmt, vl) < 0)
+ msgbuf[0] = '\0';
+ va_end( vl);
+ if(strlen(msgbuf) > 0)
+ (void)fprintf(stderr, "%s: %s \n", prog_name, msgbuf);
+ }
+ if(errno != 0)
+ (void)fprintf(stderr, "%s: %s\n", prog_name, strerror(errno));