225 lines
9.2 KiB
TeX
Executable file
225 lines
9.2 KiB
TeX
Executable file
\documentclass[a4paper, DIV=12]{scrartcl}
|
|
\usepackage[english]{babel}
|
|
\usepackage[utf8]{inputenc}
|
|
\usepackage[dvipsnames]{xcolor}
|
|
\usepackage{amsmath}
|
|
\usepackage{amssymb}
|
|
\usepackage{stmaryrd}
|
|
\usepackage{graphicx}
|
|
\usepackage{pdflscape}
|
|
\usepackage{listingsutf8}
|
|
\usepackage{spverbatim}
|
|
\usepackage{placeins}
|
|
\usepackage{lmodern}
|
|
%\usepackage{helvet}
|
|
\usepackage{booktabs}
|
|
\usepackage[T1]{fontenc}
|
|
\usepackage{microtype}
|
|
\usepackage{framed}
|
|
\usepackage[colorlinks=true,
|
|
linkcolor=blue,
|
|
urlcolor=blue,
|
|
breaklinks=true,
|
|
citecolor=blue]{hyperref}
|
|
\usepackage{prettyref}
|
|
\usepackage{lastpage}
|
|
\usepackage{subcaption}
|
|
\usepackage{tabularx}
|
|
\usepackage{adjustbox}
|
|
\usepackage{pdfpages}
|
|
\usepackage{xspace}
|
|
\usepackage[inline]{enumitem}
|
|
\usepackage[abbreviate=false,maxbibnames=99,backend=biber]{biblatex}
|
|
\usepackage{textcomp}
|
|
\usepackage{tikz}
|
|
\usepackage[ruled,linesnumbered]{algorithm2e}
|
|
|
|
\setkomafont{disposition}{\normalfont\bfseries}
|
|
|
|
\setlist[itemize]{itemsep=0.1em}
|
|
\setlist[enumerate]{itemsep=0.1em}
|
|
|
|
|
|
\newrefformat{tbl}{\hyperref[#1]{Table~\ref*{#1}}}
|
|
\newrefformat{fig}{\hyperref[#1]{Figure~\ref*{#1}}}
|
|
\newrefformat{lst}{\hyperref[#1]{Listing~\ref*{#1}}}
|
|
\newrefformat{equ}{\hyperref[#1]{Equation~\ref*{#1}}}
|
|
\newrefformat{sec}{\hyperref[#1]{Section~\ref*{#1}}}
|
|
\newrefformat{alg}{\hyperref[#1]{Algorithm~\ref*{#1}}}
|
|
\renewcommand{\arraystretch}{1.2}
|
|
|
|
\newcommand\bigforall{\mbox{\Large $\mathsurround0pt\forall$}}
|
|
\everymath{\displaystyle}
|
|
|
|
\lstset{ %
|
|
backgroundcolor=\color{white}, % choose the background color; requires \usepackage{color} or \usepackage{xcolor}
|
|
basicstyle=\ttfamily, % the size of the fonts that are used for the code
|
|
breakatwhitespace=true, % sets if automatic breaks should only happen at whitespace
|
|
breaklines=true, % sets automatic line breaking
|
|
captionpos=b, % sets the caption-position to bottom
|
|
escapeinside={(*}{*)}, % if you want to add LaTeX within your code
|
|
extendedchars=true, % lets you use non-ASCII characters; for 8-bits encodings only, does not work with UTF-8
|
|
frame=single, % adds a frame around the code
|
|
keepspaces=true, % keeps spaces in text, useful for keeping indentation of code (possibly needs columns=flexible)
|
|
language=TeX, % the language of the code
|
|
numbers=left, % where to put the line-numbers; possible values are (none, left, right)
|
|
numbersep=5pt, % how far the line-numbers are from the code
|
|
numberstyle=\tiny\color{gray}, % the style that is used for the line-numbers
|
|
rulecolor=\color{black}, % if not set, the frame-color may be changed on line-breaks within not-black text (e.g. comments (green here))
|
|
showspaces=false, % show spaces everywhere adding particular underscores; it overrides 'showstringspaces'
|
|
showstringspaces=false, % underline spaces within strings only
|
|
showtabs=false, % show tabs within strings adding particular underscores
|
|
stepnumber=1, % the step between two line-numbers. If it's 1, each line will be numbered
|
|
tabsize=2, % sets default tabsize to 2 spaces
|
|
title=\lstname, % show the filename of files included with \lstinputlisting; also try caption instead of title
|
|
emph=[3]{int:,array,set,of,int,if,then,else,constraint,var,union,endif,function,where,in,div,predicate,let,opt,full,format,def,for,True,False,return,or},
|
|
emphstyle=[3]\color{ForestGreen},
|
|
emph=[2]{length,max,forall,startEmptyBuffer,fix,startEmptyBufferShow,exactly,cumulative,occurs,deopt,sum,all},
|
|
emphstyle=[2]\color{blue},
|
|
commentstyle=\color{BrickRed},
|
|
stringstyle =\color{red},
|
|
}
|
|
|
|
\begin{document}
|
|
|
|
\subject{High Performance Computing}
|
|
\title{Reduction trees for MPI Reductions}
|
|
\subtitle{Project 2}
|
|
|
|
\author{Johannes Winklehner\\1226104 \and Armin Friedl\\1053597}
|
|
\date{\today}
|
|
|
|
\maketitle
|
|
|
|
\tableofcontents
|
|
|
|
\newpage
|
|
|
|
\section{Problem Description}
|
|
\label{sec:description}
|
|
|
|
The purpose of this project is to compare different implementations of the collective communication call MPI\_Reduce.
|
|
The compared implementations should all use different forms of Tree Reduction algorithms.
|
|
A given implementation of the MPI standard, in our case NEC MPI, serves as the baseline for the comparison.
|
|
\begin{description}
|
|
\item[Binomial Tree]
|
|
A binomial tree has no fixed degree: each tree $B_i$ has exactly $i$ subtrees, namely $B_0$ to $B_{i-1}$.
|
|
The number of nodes in such a tree is equal to $2^i$ and the depth is $i$.
|
|
\item[Fibonacci Tree]
|
|
The Fibonacci tree uses a fixed degree of $2$ where a tree $F_i$ has one subtree $F_{i-1}$ and one subtree $F_{i-2}$.
|
|
Therefore, the number of nodes in this kind of tree is $\mathit{fib}(i+3)-1$, using the Fibonacci function $\mathit{fib}(x) = \mathit{fib}(x-1)+\mathit{fib}(x-2)$, and its depth is also $i$.
|
|
\item[Binary Tree]
|
|
The binary tree used for reduction is a common complete binary tree where a tree $T_i$ has two subtrees $T_{i-1}$.
|
|
Such a tree has $2^{i+1}-1$ nodes and, as for the other types, its depth is $i$.
|
|
\end{description}
|
|
|
|
\begin{center}
|
|
\begin{minipage}{.4\textwidth}
|
|
\begin{tikzpicture}
|
|
\node [circle,draw]{$B_i$}
|
|
child { node [circle,draw]{$B_{i-1}$}}
|
|
child {node [circle,draw] {$B_{i-2}$}}
|
|
child {node {\dots} edge from parent[draw=none]}
|
|
child {node [circle,draw] {$B_0$}};
|
|
\end{tikzpicture}
|
|
%\caption{Binomial Tree of size $i$}
|
|
\end{minipage}
|
|
\begin{minipage}{.2\textwidth}
|
|
\begin{tikzpicture}
|
|
\node [circle,draw]{$F_i$}
|
|
child { node [circle,draw]{$F_{i-1}$}}
|
|
child {node [circle,draw] {$F_{i-2}$}};
|
|
\end{tikzpicture}
|
|
%\caption{Fibonacci Tree of size $i$}
|
|
\end{minipage}
|
|
\begin{minipage}{.2\textwidth}
|
|
\begin{tikzpicture}
|
|
\node [circle,draw]{$T_i$}
|
|
child { node [circle,draw]{$T_{i-1}$}}
|
|
child {node [circle,draw] {$T_{i-1}$}};
|
|
\end{tikzpicture}
|
|
%\caption{Complete Binary Tree of size $i$}
|
|
\end{minipage}
|
|
\end{center}
|
|
|
|
All three implementations of the reduce function must use exactly the same interface as the MPI standard defines it.
|
|
This interface is shown in \prettyref{lst:reduce}.
|
|
This requires that all implementations support arbitrary MPI datatypes as well as arbitrary MPI operations.
|
|
The standard also provides some constraints regarding the associativity and commutativity of executable operations.
|
|
Every MPI operation must be associative, but does not necessarily have to be commutative.
|
|
This means that the operation must be applied to the partial results in the MPI rank order of the processes.
|
|
|
|
\begin{lstlisting}[language=C, caption=MPI Reduce interface, label=lst:reduce]
|
|
int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm)
|
|
\end{lstlisting}
|
|
|
|
The standard also defines additional features of the reduce function, for example, an in-place option for the root process.
|
|
However, since those details were not mentioned in the assignment description, we did not consider them as part of the project.
|
|
|
|
The basic algorithm for a tree reduction, which will be shown in the next section, is very similar for all kinds of trees and uses Point-to-Point communication between tree nodes.
|
|
The assumption for our implementations to be efficient is that the underlying communication network is fully connected and allows for bidirectional communication.
|
|
|
|
\FloatBarrier
|
|
|
|
\section{Implemented Algorithms}
|
|
\label{sec:algorithms}
|
|
|
|
The basic algorithm for a tree reduction is very simple and is shown in \prettyref{alg:reduce}.
|
|
At first the parent and all child nodes have to be determined to know the communication partners of each process.
|
|
Then each process receives the partial results from all of its children and calculates its own result from the received data.
|
|
To ensure the correctness of the result for non-commutative operations, the iteration over the child nodes has to be done in rank order.
|
|
Processes which are leaf nodes in the tree have no children and therefore skip the receiving part of the algorithm.
|
|
If a process has a parent and is therefore not the root process, it sends its result to the determined parent node.
|
|
However, if the process is the root process, the reduction is finished and the result can be returned.
|
|
|
|
\begin{algorithm}
|
|
\caption{Tree Reduce}
|
|
\label{alg:reduce}
|
|
\KwIn{An array $\vec{a}$ of a given $datatype$ with size $count$ for each process}
|
|
\KwOut{The result of the reduction on the $root$ process}
|
|
determine $parent$ and $children$\;
|
|
$result = \vec{a}$\;
|
|
\ForAll{child in children}{
|
|
receive array $\vec{b}$ from $child$\;
|
|
$result =$ local reduce of received array and $result$\;
|
|
}
|
|
\eIf{parent exists}{
|
|
send $result$ to $parent$\;
|
|
}{
|
|
$output = result$\;
|
|
}
|
|
\end{algorithm}
|
|
|
|
The calculation of the parent and child nodes is the only aspect that has to be adapted for each kind of tree.
|
|
However, certain optimizations that exploit knowledge of a concrete tree structure are of course possible.
|
|
|
|
\FloatBarrier
|
|
|
|
\section{Implementation Details}
|
|
\label{sec:kernels}
|
|
|
|
\FloatBarrier
|
|
|
|
\section{Results}
|
|
\label{sec:results}
|
|
|
|
\FloatBarrier
|
|
|
|
\section{Analysis}
|
|
\label{sec:analysis}
|
|
|
|
\section{Appendix}
|
|
|
|
\lstinputlisting[language=C]{../binom_reduce.c}
|
|
|
|
\lstinputlisting[language=C]{../fib_reduce.c}
|
|
|
|
\lstinputlisting[language=C]{../bin_reduce.c}
|
|
|
|
\end{document}
|
|
|
|
|
|
%%% Local Variables:
|
|
%%% mode: latex
|
|
%%% TeX-master: t
|
|
%%% End:
|