Init
This commit is contained in:
commit
c61491e713
5 changed files with 1237 additions and 0 deletions
120
Makefile
Normal file
120
Makefile
Normal file
|
@ -0,0 +1,120 @@
|
||||||
|
all: clean bin lib
|
||||||
|
|
||||||
|
# Roofline Binary
|
||||||
|
|
||||||
|
## This is the least demanding target, use it if nothing else works
|
||||||
|
nofancy: roofline roofline_o3
|
||||||
|
mkdir bin
|
||||||
|
mv $^ bin
|
||||||
|
|
||||||
|
## Your processor needs an FMA unit for this target to work
|
||||||
|
fmacap: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3
|
||||||
|
mkdir bin
|
||||||
|
mv $^ bin
|
||||||
|
|
||||||
|
## This will compile just everything
|
||||||
|
bin: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3 roofline_full roofline_profile roofline_full_clang roofline_full_manpack
|
||||||
|
mkdir bin
|
||||||
|
mv $^ bin
|
||||||
|
|
||||||
|
roofline: roofline.c aikern.a
|
||||||
|
gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
|
||||||
|
|
||||||
|
roofline_o3: roofline.c aikern_o3.a
|
||||||
|
gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
|
||||||
|
|
||||||
|
roofline_fma: roofline.c aikern_fma.a
|
||||||
|
gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
|
||||||
|
|
||||||
|
roofline_fma_o3: roofline.c aikern_fma_o3.a
|
||||||
|
gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
|
||||||
|
|
||||||
|
roofline_fma_fast_o3: roofline.c aikern_fma_fast_o3.a
|
||||||
|
gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
|
||||||
|
|
||||||
|
roofline_fma_fast_o2: roofline.c aikern_fma_fast_o2.a
|
||||||
|
gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
|
||||||
|
|
||||||
|
roofline_fma_fast_fastmath_o3: roofline.c aikern_fma_fast_fastmath_o3.a
|
||||||
|
gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@
|
||||||
|
|
||||||
|
roofline_full: roofline.c aikern_full.a
|
||||||
|
gcc -Wall -Wextra -std=c99 -fopenmp -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -malign-double -march=native $^ -o $@
|
||||||
|
|
||||||
|
roofline_full_clang: roofline.c aikern_full_clang.a
|
||||||
|
clang -Wall -Wextra -std=c99 -fopenmp -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -march=native $^ -o $@
|
||||||
|
|
||||||
|
roofline_full_manpack: roofline.c aikern_full_manpack.a
|
||||||
|
gcc -Wall -Wextra -std=c99 -fopenmp -DINTRINS -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -malign-double -march=native $^ -o $@
|
||||||
|
|
||||||
|
roofline_profile: roofline.c aikern_profile.a
|
||||||
|
gcc -Wall -Wextra -std=c99 -fopenmp -O0 -g -pg $^ -o $@
|
||||||
|
|
||||||
|
# Static Libraries
|
||||||
|
lib: aikern.a aikern_o3.a aikern_fma.a aikern_fma_o3.a aikern_fma_fast_o2.a aikern_fma_fast_o3.a aikern_fma_fast_fastmath_o3.a aikern_full.a aikern_full_clang.a aikern_profile.a aikern_full_manpack.a
|
||||||
|
mkdir lib
|
||||||
|
mv $^ lib
|
||||||
|
|
||||||
|
aikern.a: aikern.c aikern.h
|
||||||
|
gcc -Wall -Wextra -Wno-unused -fopenmp -c -o aikern.o $<
|
||||||
|
ar rcs aikern.a aikern.o
|
||||||
|
rm aikern.o
|
||||||
|
|
||||||
|
aikern_o3.a: aikern.c aikern.h
|
||||||
|
gcc -Wall -Wextra -Wno-unused -O3 -fopenmp -c -o aikern_o3.o $<
|
||||||
|
ar rcs $@ aikern_o3.o
|
||||||
|
rm aikern_o3.o
|
||||||
|
|
||||||
|
aikern_fma.a: aikern.c aikern.h
|
||||||
|
gcc -Wall -Wextra -Wno-unused -O2 -mavx -mfma -fopenmp -c -o aikern_fma.o $<
|
||||||
|
ar rcs $@ aikern_fma.o
|
||||||
|
rm aikern_fma.o
|
||||||
|
|
||||||
|
aikern_fma_o3.a: aikern.c aikern.h
|
||||||
|
gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -c -o aikern_fma_o3.o $<
|
||||||
|
ar rcs $@ aikern_fma_o3.o
|
||||||
|
rm aikern_fma_o3.o
|
||||||
|
|
||||||
|
aikern_fma_fast_o2.a: aikern.c aikern.h
|
||||||
|
gcc -Wall -Wextra -Wno-unused -O2 -mavx -mfma -fopenmp -Ofast -c -o aikern_fma_fast_o2.o $<
|
||||||
|
ar rcs $@ aikern_fma_fast_o2.o
|
||||||
|
rm aikern_fma_fast_o2.o
|
||||||
|
|
||||||
|
aikern_fma_fast_o3.a: aikern.c aikern.h
|
||||||
|
gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -c -o aikern_fma_fast_o3.o $<
|
||||||
|
ar rcs $@ aikern_fma_fast_o3.o
|
||||||
|
rm aikern_fma_fast_o3.o
|
||||||
|
|
||||||
|
aikern_fma_fast_fastmath_o3.a: aikern.c aikern.h
|
||||||
|
gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -c -o aikern_fma_fast_fastmath_o3.o $<
|
||||||
|
ar rcs $@ aikern_fma_fast_fastmath_o3.o
|
||||||
|
rm aikern_fma_fast_fastmath_o3.o
|
||||||
|
|
||||||
|
aikern_full.a: aikern.c aikern.h
|
||||||
|
gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -malign-double -march=native -c -o aikern_full.o $<
|
||||||
|
ar rcs $@ aikern_full.o
|
||||||
|
rm aikern_full.o
|
||||||
|
|
||||||
|
aikern_full_clang.a: aikern.c aikern.h
|
||||||
|
gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -march=native -c -o aikern_full_clang.o $<
|
||||||
|
ar rcs $@ aikern_full_clang.o
|
||||||
|
rm aikern_full_clang.o
|
||||||
|
|
||||||
|
aikern_profile.a: aikern.c aikern.h
|
||||||
|
gcc -Wall -Wextra -Wno-unused -fopenmp -O0 -g -pg -c -o aikern_profile.o $<
|
||||||
|
ar rcs $@ aikern_profile.o
|
||||||
|
rm aikern_profile.o
|
||||||
|
|
||||||
|
aikern_full_manpack.a: aikern.c aikern.h
|
||||||
|
gcc -Wall -Wextra -Wno-unused -DINTRINS -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -malign-double -march=native -c -o aikern_full_manpack.o $<
|
||||||
|
ar rcs $@ aikern_full_manpack.o
|
||||||
|
rm aikern_full_manpack.o
|
||||||
|
|
||||||
|
# Cleanup
|
||||||
|
clean:
|
||||||
|
rm -f *.a
|
||||||
|
rm -f *.o
|
||||||
|
rm -f roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3 roofline_fma_fast_fastmath_aligned_o3 roofline_profile roofline_full_clang
|
||||||
|
rm -fR bin
|
||||||
|
rm -fR lib
|
||||||
|
|
288
aikern.c
Normal file
288
aikern.c
Normal file
|
@ -0,0 +1,288 @@
|
||||||
|
# include <stdlib.h>
|
||||||
|
# include <stdio.h>
|
||||||
|
# include <unistd.h>
|
||||||
|
# include <stdarg.h>
|
||||||
|
# include <errno.h>
|
||||||
|
# include <string.h>
|
||||||
|
# include <sys/time.h>
|
||||||
|
|
||||||
|
# include "aikern.h"
|
||||||
|
|
||||||
|
/* === Macros === */
|
||||||
|
|
||||||
|
#ifdef ENDEBUG
|
||||||
|
#define DEBUG(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0)
|
||||||
|
#else
|
||||||
|
#define DEBUG(...)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief terminate program on program error
|
||||||
|
* @param msg additional message to print
|
||||||
|
* @param ret exit value
|
||||||
|
*/
|
||||||
|
static void bail_out(char* fmt, ...);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief microseconds since epoch
|
||||||
|
*/
|
||||||
|
static double pin_time(void);
|
||||||
|
|
||||||
|
kern_result kernel_dispatch(kernel_t kernel,
|
||||||
|
double* a, double* b, double* c,
|
||||||
|
size_t size, size_t runs)
|
||||||
|
{
|
||||||
|
|
||||||
|
kern_result result = {0};
|
||||||
|
result.runs = runs;
|
||||||
|
result.starts = malloc(sizeof(double)*(runs));
|
||||||
|
result.ends = malloc(sizeof(double)*(runs));
|
||||||
|
result.size = size;
|
||||||
|
|
||||||
|
if(result.starts==NULL || result.ends==NULL)
|
||||||
|
{
|
||||||
|
bail_out("One of the mallocs failed\n. starts = %p, ends=%p", result.starts, result.ends);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
switch(kernel)
|
||||||
|
{
|
||||||
|
|
||||||
|
case SIMPLE_1_16:
|
||||||
|
result.flops = 1;
|
||||||
|
result.kern_name = "Simple 1/16";
|
||||||
|
for(size_t r=0; r<runs; r++)
|
||||||
|
{
|
||||||
|
result.starts[r] = pin_time();
|
||||||
|
kernel_1_16_simple(a, size);
|
||||||
|
result.ends[r] = pin_time();
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case FMA_1_16:
|
||||||
|
result.flops = 2;
|
||||||
|
result.kern_name = "FMA aware 1/16";
|
||||||
|
for(size_t r=0; r<runs; r++)
|
||||||
|
{
|
||||||
|
result.starts[r] = pin_time();
|
||||||
|
kernel_1_16_fuseaware(a, b, c, size);
|
||||||
|
result.ends[r] = pin_time();
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case SIMPLE_8_1:
|
||||||
|
result.flops = 128;
|
||||||
|
result.kern_name = "Simple 8";
|
||||||
|
for(size_t r=0; r<runs; r++)
|
||||||
|
{
|
||||||
|
result.starts[r] = pin_time();
|
||||||
|
kernel_8_1_simple(a, size);
|
||||||
|
result.ends[r] = pin_time();
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case FMA_8_1:
|
||||||
|
result.flops = 128;
|
||||||
|
result.kern_name = "FMA aware 8";
|
||||||
|
for(size_t r=0; r<runs; r++)
|
||||||
|
{
|
||||||
|
result.starts[r] = pin_time();
|
||||||
|
kernel_8_1_fuseaware(a, size);
|
||||||
|
result.ends[r] = pin_time();
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case SIMPLE_8_1_FASTMATH:
|
||||||
|
DEBUG("AIKERN MANPACK");
|
||||||
|
result.flops = 128;
|
||||||
|
result.kern_name = "Simple 8 undermining fastmath";
|
||||||
|
for(size_t r=0; r<runs; r++)
|
||||||
|
{
|
||||||
|
result.starts[r] = pin_time();
|
||||||
|
kernel_8_1_simple_fastmath(a, size);
|
||||||
|
result.ends[r] = pin_time();
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case FMA_8_1_MANPACK:
|
||||||
|
DEBUG("AIKERN MANPACK");
|
||||||
|
#ifdef INTRINS
|
||||||
|
if(size%4 != 0)
|
||||||
|
{
|
||||||
|
bail_out("Must use multiple of 4 size for manpack");
|
||||||
|
}
|
||||||
|
|
||||||
|
result.flops = 128;
|
||||||
|
result.kern_name = "FMA aware 8 with manual packing";
|
||||||
|
for(size_t r=0; r<runs; r++)
|
||||||
|
{
|
||||||
|
DEBUG("running manpack run %zu",r);
|
||||||
|
result.starts[r] = pin_time();
|
||||||
|
kernel_8_1_fuseaware_manpack(a, size);
|
||||||
|
result.ends[r] = pin_time();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
bail_out("No such kernel %s", kernel);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void kernel_1_16_simple(double* a, size_t size)
|
||||||
|
{
|
||||||
|
#pragma omp parallel for
|
||||||
|
for(size_t i=0; i<size; i++)
|
||||||
|
{
|
||||||
|
a[i] = a[i] * a[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
|
||||||
|
{
|
||||||
|
#pragma omp parallel for
|
||||||
|
for(size_t i=0; i<size; i++)
|
||||||
|
{
|
||||||
|
a[i] = a[i] * b[i] + c[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void kernel_8_1_simple(double* a, size_t size)
|
||||||
|
{
|
||||||
|
#pragma omp parallel for
|
||||||
|
for(size_t i=0; i<size; i++)
|
||||||
|
{
|
||||||
|
a[i] = REP100(a[i]*)
|
||||||
|
REP20(a[i]*)
|
||||||
|
REP8(a[i]*)
|
||||||
|
REP1(a[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void kernel_8_1_simple_fastmath(double* a, size_t size)
|
||||||
|
{
|
||||||
|
#pragma omp parallel for
|
||||||
|
for(size_t i=0; i<size; i++)
|
||||||
|
{
|
||||||
|
REP100(a[i]=a[i]*a[i];);
|
||||||
|
REP20(a[i]=a[i]*a[i];);
|
||||||
|
REP8(a[i]=a[i]*a[i];);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void kernel_8_1_fuseaware(double* a, size_t size)
|
||||||
|
{
|
||||||
|
#pragma omp parallel for
|
||||||
|
for(size_t i=0; i<size; i++)
|
||||||
|
{
|
||||||
|
REP60(a[i] = a[i] * a[i] + a[i];)
|
||||||
|
REP4(a[i] = a[i] * a[i] + a[i];)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef INTRINS
|
||||||
|
|
||||||
|
#include <immintrin.h>
|
||||||
|
|
||||||
|
inline void kernel_8_1_fuseaware_manpack(double* a, size_t size)
|
||||||
|
{
|
||||||
|
|
||||||
|
#pragma omp parallel for
|
||||||
|
for(size_t i=0; i<(size-4); i+=4)
|
||||||
|
{
|
||||||
|
// pack doubles
|
||||||
|
__m256d packvec = _mm256_set_pd(a[i], a[i+1], a[i+2], a[i+3]);
|
||||||
|
|
||||||
|
REP60(packvec = _mm256_fmadd_pd(packvec, packvec, packvec););
|
||||||
|
REP4(packvec = _mm256_fmadd_pd(packvec, packvec, packvec););
|
||||||
|
|
||||||
|
a[i] = packvec[0];
|
||||||
|
a[i+1] = packvec[1];
|
||||||
|
a[i+2] = packvec[2];
|
||||||
|
a[i+3] = packvec[3];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /* INTRINS */
|
||||||
|
|
||||||
|
|
||||||
|
/********************************************
|
||||||
|
* Kernels which potentially compile to *
|
||||||
|
* different operational intensities than *
|
||||||
|
* specified *
|
||||||
|
********************************************/
|
||||||
|
|
||||||
|
void kernel_1_16_simple_dangerous(double* a, double* b, size_t size)
|
||||||
|
{
|
||||||
|
register volatile double tmp = 0.1;
|
||||||
|
|
||||||
|
#pragma omp parallel for
|
||||||
|
for(size_t i=0; i<size; i++){
|
||||||
|
tmp = a[i] * b[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void kernel_8_1_simple_dangerous(double* a, size_t size)
|
||||||
|
{
|
||||||
|
register volatile double tmp = 0.1;
|
||||||
|
|
||||||
|
#pragma omp parallel for
|
||||||
|
for(size_t i=0; i<size; i++)
|
||||||
|
{
|
||||||
|
tmp = a[i] * a[i] * a[i] * a[i] *
|
||||||
|
a[i] * a[i] * a[i] * a[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void kernel_1_8_vo_dangerous(double* a, size_t size)
|
||||||
|
{
|
||||||
|
register volatile double tmp=0.0;
|
||||||
|
|
||||||
|
#pragma omp parallel for
|
||||||
|
for(size_t i=0; i<size; i++)
|
||||||
|
{
|
||||||
|
tmp = a[i] * a[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* === Helper Functions === */
|
||||||
|
|
||||||
|
static double pin_time(void)
|
||||||
|
{
|
||||||
|
struct timeval tp;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
i = gettimeofday(&tp,NULL);
|
||||||
|
|
||||||
|
if(i != 0)
|
||||||
|
{
|
||||||
|
bail_out("Time measurement impossible. gettimeofday error");
|
||||||
|
}
|
||||||
|
|
||||||
|
return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
|
||||||
|
}
|
||||||
|
|
||||||
|
static void bail_out(char* fmt, ...)
|
||||||
|
{
|
||||||
|
char* prog_name = "aikern";
|
||||||
|
|
||||||
|
if(fmt != NULL)
|
||||||
|
{
|
||||||
|
char msgbuf[150];
|
||||||
|
|
||||||
|
va_list vl;
|
||||||
|
va_start(vl, fmt);
|
||||||
|
|
||||||
|
if(vsnprintf(msgbuf, sizeof(msgbuf), fmt, vl) < 0)
|
||||||
|
msgbuf[0] = '\0';
|
||||||
|
|
||||||
|
va_end( vl);
|
||||||
|
|
||||||
|
if(strlen(msgbuf) > 0)
|
||||||
|
(void)fprintf(stderr, "%s: %s \n", prog_name, msgbuf);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
if(errno != 0)
|
||||||
|
(void)fprintf(stderr, "%s: %s\n", prog_name, strerror(errno));
|
||||||
|
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
288
aikern.h
Normal file
288
aikern.h
Normal file
|
@ -0,0 +1,288 @@
|
||||||
|
#ifndef AIKERN_H
|
||||||
|
#define AIKERN_H
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
size_t runs; // also # of start-/endtimes
|
||||||
|
double* starts; // starttimes
|
||||||
|
double* ends; // endtimes
|
||||||
|
int flops; // flops per iteration
|
||||||
|
char* kern_name;
|
||||||
|
size_t size; // size of arrays handeld
|
||||||
|
} kern_result;
|
||||||
|
|
||||||
|
typedef enum {
|
||||||
|
SIMPLE_1_16, FMA_1_16, SIMPLE_8_1, FMA_8_1, SIMPLE_8_1_FASTMATH, FMA_8_1_MANPACK
|
||||||
|
} kernel_t;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief main entry point. Dispatches the kernel calls
|
||||||
|
* @param kernel the kernel to run
|
||||||
|
* @param a An array with double values of size param size
|
||||||
|
* @param b An array with double values of size param size
|
||||||
|
* @param c An array with double values of size param size
|
||||||
|
* @param size The size of the arrays
|
||||||
|
* @param runs How often the kernel should be executed
|
||||||
|
* @return kern_result containing information about the kernel execution
|
||||||
|
*
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
kern_result kernel_dispatch(kernel_t kernel,
|
||||||
|
double* a, double* b, double* c,
|
||||||
|
size_t size, size_t runs);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief A simple 1/16 operational intensity kernel
|
||||||
|
* @param a An array with double values of size param size
|
||||||
|
* @param size Size of the three param arrays
|
||||||
|
* @param result Pointer to result storage
|
||||||
|
*
|
||||||
|
* === Warning ===
|
||||||
|
* Don't use with -O0: Stores everything on stack
|
||||||
|
*
|
||||||
|
* === Description ===
|
||||||
|
* Uses a simple floating point operation: a[i] = a[i] * a[i];
|
||||||
|
*
|
||||||
|
* Runs in a parallelized for loop.
|
||||||
|
*
|
||||||
|
* === Analysis ===
|
||||||
|
* COMM: 1 read (8 byte), 1 write = 16 bytes
|
||||||
|
* COMP: 1 FLOP
|
||||||
|
* ---------
|
||||||
|
* OI: 1/16
|
||||||
|
*
|
||||||
|
* === Optimization ===
|
||||||
|
* Nothing special
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
void kernel_1_16_simple(double* a, size_t size);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief A 1/16 operational intensity kernel utilizing FMA
|
||||||
|
* @param a An array with double values of size param size
|
||||||
|
* @param b An array with double values of size param size
|
||||||
|
* @param c An array with double values of size param size
|
||||||
|
* @param size Size of the three param arrays
|
||||||
|
* @param result Pointer to result storage
|
||||||
|
*
|
||||||
|
* === Warning ===
|
||||||
|
* This is dangerous if FMA is not used/can't be used. Then there
|
||||||
|
* are intermediary writes (and reads) to the stack.
|
||||||
|
*
|
||||||
|
* === Description ===
|
||||||
|
* Uses a triad function: a[i] = a[i] * b[i] + c[i]; in order
|
||||||
|
* to utilize the FMA unit.
|
||||||
|
*
|
||||||
|
* Runs in a parallelized for loop.
|
||||||
|
*
|
||||||
|
* === Analysis ===
|
||||||
|
* With gcc -O2 -mavx -mfma FMA compiles to:
|
||||||
|
* vmovsd xmm0,QWORD PTR [rdi+rax*8] # 1 read (8 byte)
|
||||||
|
* vmovsd xmm1,QWORD PTR [rdx+rax*8] # 1 read
|
||||||
|
* vfmadd132sd xmm0,xmm1,QWORD PTR [rsi+rax*8] # 2 FLOPs + 1 read
|
||||||
|
* vmovsd QWORD PTR [rdi+rax*8],xmm0 # 1 write
|
||||||
|
* --------
|
||||||
|
* 1/16 OI
|
||||||
|
*
|
||||||
|
* === Optimization ===
|
||||||
|
* For packed doubles compile with -Ofast
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief A simple 8/1 operational intensity kernel
|
||||||
|
* @param a An array with double values of size param size
|
||||||
|
* @param size Size of the three param arrays
|
||||||
|
* @param result Pointer to result storage
|
||||||
|
*
|
||||||
|
* === Warning ===
|
||||||
|
* Don't use with -O0: Stores everything on stack
|
||||||
|
*
|
||||||
|
* === Description ===
|
||||||
|
* Uses a simple floating point operation: a[i] = a[i] * a[i] * ...* a[i];
|
||||||
|
*
|
||||||
|
* Runs in a parallelized for loop.
|
||||||
|
*
|
||||||
|
* === Analysis ===
|
||||||
|
* With AVX and -O2 (not necessarily FMA) best results (obviously correct
|
||||||
|
* easy to read disassembly).
|
||||||
|
*
|
||||||
|
* With gcc -O2 -mavx compiles to:
|
||||||
|
* vmovsd xmm1,QWORD PTR [rdi] # 1 read
|
||||||
|
* vmulsd xmm0,xmm1,xmm1 # 1 FLOP+register shuffling
|
||||||
|
* vmulsd xmm0,xmm0,xmm1 # 127x 1 FLOP+register shuffling
|
||||||
|
* # [...]
|
||||||
|
* vmovsd QWORD PTR [rdi-0x8],xmm0 # 1 write
|
||||||
|
* --------
|
||||||
|
* 128/16 = 8/1 OI
|
||||||
|
*
|
||||||
|
* === Optimization ===
|
||||||
|
* Nothing special
|
||||||
|
*/
|
||||||
|
void kernel_8_1_simple(double* a, size_t size);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief A 8/1 operational intensity kernel utilizing FMA
|
||||||
|
* @param a An array with double values of size param size
|
||||||
|
* @param size Size of the three param arrays
|
||||||
|
* @param result Pointer to result storage
|
||||||
|
*
|
||||||
|
* === Warning ===
|
||||||
|
* This is dangerous if FMA is not used/can't be used. Then there
|
||||||
|
* are intermediary writes (and reads) to the stack.
|
||||||
|
*
|
||||||
|
* === Description ===
|
||||||
|
* Uses multiple triad function: a[i] = a[i] * a[i] + a[i]; in order
|
||||||
|
* to utilize the FMA unit.
|
||||||
|
*
|
||||||
|
* Runs in a parallelized for loop.
|
||||||
|
*
|
||||||
|
* === Analysis ===
|
||||||
|
* With gcc -O2 -mavx -mfma FMA compiles to:
|
||||||
|
* vmovsd xmm0,QWORD PTR [rdi] # 1 read
|
||||||
|
* vfmadd132sd xmm0,xmm0,xmm0 # 64 x 2 FLOPs+register shuffling
|
||||||
|
* vmovsd QWORD PTR [rdi-0x8],xmm0 # 1 write
|
||||||
|
* --------
|
||||||
|
* 128/16 = 8/1 OI
|
||||||
|
*
|
||||||
|
* === Optimization ===
|
||||||
|
* For packed doubles compile with -Ofast
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
void kernel_8_1_fuseaware(double* a, size_t size);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief A simple 8/1 operational intensity kernel which
|
||||||
|
* undermines evil fastmath optimization
|
||||||
|
* @param a An array with double values of size param size
|
||||||
|
* @param size Size of the three param arrays
|
||||||
|
* @param result Pointer to result storage
|
||||||
|
*
|
||||||
|
* === Warning ===
|
||||||
|
* Don't use with anything other than -Ofast / -ffast-math
|
||||||
|
*
|
||||||
|
* === Description ===
|
||||||
|
* Uses a simple floating point operation that more closely resembles
|
||||||
|
* that of 8_1_fuseaware:
|
||||||
|
* a[i] = a[i]*a[i]; # 128x
|
||||||
|
*
|
||||||
|
* Runs in a parallelized for loop.
|
||||||
|
*
|
||||||
|
* === Analysis ===
|
||||||
|
* -Ofast/-ffast-math does not preserve strict IEEE compliance. It
|
||||||
|
* therefore is allowed to ignore non-associativity of floating
|
||||||
|
* point operations.
|
||||||
|
*
|
||||||
|
* x = x*x*x*x*x*x*x*x; is optimized to x *= x;x *= x;x *= x;
|
||||||
|
*
|
||||||
|
* This cleary breaks the whole OI calculation of 8_1_simple.
|
||||||
|
*
|
||||||
|
* This kernel does not introduce more byte write-outs than
|
||||||
|
* 8_1_simple at a high optimization level since a[i] is held
|
||||||
|
* in a register and only written out once at the end of an
|
||||||
|
* iteration.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* === Optimization ===
|
||||||
|
* Nothing special
|
||||||
|
*/
|
||||||
|
void kernel_8_1_simple_fastmath(double* a, size_t size);
|
||||||
|
|
||||||
|
|
||||||
|
/********************************************
|
||||||
|
* Kernels which potentially compile to *
|
||||||
|
* different operational intensities than *
|
||||||
|
* specified *
|
||||||
|
********************************************/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief A 1/16 operational intensity which might compile to a flawed oi kernel
|
||||||
|
* @param a An array with double values of size param size
|
||||||
|
* @param b An array with double values of size param size
|
||||||
|
* @param size Size of the three param arrays
|
||||||
|
*
|
||||||
|
* === Problem ===
|
||||||
|
* As soon as volatile is used gcc uses the stack for tmp.
|
||||||
|
* Even if "register" is in place. Resulting in one additional write per loop.
|
||||||
|
* Omitting volatile results in optimizing away the whole loop
|
||||||
|
* (checked at -O2, which is necessary for FMA to eventually step in).
|
||||||
|
* Maybe the value stays in cache, maybe not. It does not live a register.
|
||||||
|
*
|
||||||
|
* Even with -O3:
|
||||||
|
* movsd xmm0,QWORD PTR [rdi+rax*8] # 1 read
|
||||||
|
* mulsd xmm0,QWORD PTR [rsi+rax*8] # 1 read (+ write to xmm0, not counted)
|
||||||
|
* # [...] # instructions for loop
|
||||||
|
* movsd QWORD PTR [rsp-0x8],xmm0 # malicious write
|
||||||
|
*
|
||||||
|
* Without volatile (-O3):
|
||||||
|
* repz ret # that's it
|
||||||
|
*/
|
||||||
|
void kernel_1_16_simple_dangerous(double* a, double* b, size_t size);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief A 8/1 operational intensity which might compile to a flawed oi kernel
|
||||||
|
* @param a An array with double values of size param size
|
||||||
|
* @param size Size of the three param arrays
|
||||||
|
*
|
||||||
|
* === Problem ==
|
||||||
|
* Same as for kernel_1_16_simple_dangerous
|
||||||
|
*/
|
||||||
|
void kernel_8_1_simple_dangerous(double* a, size_t size);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief A 1/8 operational intensity which might compile to a flawed oi kernel
|
||||||
|
* @param a An array with double values of size param size
|
||||||
|
* @param size Size of the three param arrays
|
||||||
|
*
|
||||||
|
* === Problem ==
|
||||||
|
* Same as for kernel_1_16_simple_dangerous
|
||||||
|
*
|
||||||
|
* Without volatile the loop is optimized away completely.
|
||||||
|
* With volatile tmp is written to the stack in every loop
|
||||||
|
* (-O3). tmp could be cached or not. This might depend on
|
||||||
|
* how large the array is and how the cpu work internally
|
||||||
|
* -> unpredictable.
|
||||||
|
*/
|
||||||
|
void kernel_1_8_vo_dangerous(double* a, size_t size);
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef INTRINS
|
||||||
|
void kernel_8_1_fuseaware_manpack(double* a, size_t size);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/****************************************
|
||||||
|
* Helper macros for repeating things *
|
||||||
|
****************************************/
|
||||||
|
|
||||||
|
#define REP0(X)
|
||||||
|
#define REP1(X) X
|
||||||
|
#define REP2(X) REP1(X) REP1(X)
|
||||||
|
#define REP3(X) REP2(X) REP1(X)
|
||||||
|
#define REP4(X) REP3(X) REP1(X)
|
||||||
|
#define REP5(X) REP4(X) REP1(X)
|
||||||
|
#define REP6(X) REP5(X) REP1(X)
|
||||||
|
#define REP7(X) REP6(X) REP1(X)
|
||||||
|
#define REP8(X) REP7(X) REP1(X)
|
||||||
|
#define REP9(X) REP8(X) REP1(X)
|
||||||
|
|
||||||
|
#define REP10(X) REP9(X) REP1(X)
|
||||||
|
#define REP20(X) REP10(X) REP10(X)
|
||||||
|
#define REP30(X) REP20(X) REP10(X)
|
||||||
|
#define REP40(X) REP30(X) REP10(X)
|
||||||
|
#define REP50(X) REP40(X) REP10(X)
|
||||||
|
#define REP60(X) REP50(X) REP10(X)
|
||||||
|
|
||||||
|
#define REP100(X) REP50(X) REP50(X)
|
||||||
|
|
||||||
|
#ifdef ENDEBUG
|
||||||
|
#define DEBUG(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0)
|
||||||
|
#else
|
||||||
|
#define DEBUG(...)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* AIKERN_H */
|
199
prof
Normal file
199
prof
Normal file
|
@ -0,0 +1,199 @@
|
||||||
|
Flat profile:
|
||||||
|
|
||||||
|
Each sample counts as 0.01 seconds.
|
||||||
|
% cumulative self self total
|
||||||
|
time seconds seconds calls Ts/call Ts/call name
|
||||||
|
100.01 0.74 0.74 bail_out
|
||||||
|
0.00 0.74 0.00 50 0.00 0.00 pin_time
|
||||||
|
0.00 0.74 0.00 5 0.00 0.00 kernel_1_16_fuseaware
|
||||||
|
0.00 0.74 0.00 5 0.00 0.00 kernel_1_16_simple
|
||||||
|
0.00 0.74 0.00 5 0.00 0.00 kernel_8_1_fuseaware
|
||||||
|
0.00 0.74 0.00 5 0.00 0.00 kernel_8_1_simple
|
||||||
|
0.00 0.74 0.00 5 0.00 0.00 kernel_8_1_simple_fastmath
|
||||||
|
0.00 0.74 0.00 5 0.00 0.00 kernel_dispatch
|
||||||
|
0.00 0.74 0.00 5 0.00 0.00 print_kernresult
|
||||||
|
0.00 0.74 0.00 2 0.00 0.00 pin_time
|
||||||
|
0.00 0.74 0.00 1 0.00 0.00 get_int
|
||||||
|
0.00 0.74 0.00 1 0.00 0.00 get_size
|
||||||
|
0.00 0.74 0.00 1 0.00 0.00 testkern
|
||||||
|
|
||||||
|
% the percentage of the total running time of the
|
||||||
|
time program used by this function.
|
||||||
|
|
||||||
|
cumulative a running sum of the number of seconds accounted
|
||||||
|
seconds for by this function and those listed above it.
|
||||||
|
|
||||||
|
self the number of seconds accounted for by this
|
||||||
|
seconds function alone. This is the major sort for this
|
||||||
|
listing.
|
||||||
|
|
||||||
|
calls the number of times this function was invoked, if
|
||||||
|
this function is profiled, else blank.
|
||||||
|
|
||||||
|
self the average number of milliseconds spent in this
|
||||||
|
ms/call function per call, if this function is profiled,
|
||||||
|
else blank.
|
||||||
|
|
||||||
|
total the average number of milliseconds spent in this
|
||||||
|
ms/call function and its descendents per call, if this
|
||||||
|
function is profiled, else blank.
|
||||||
|
|
||||||
|
name the name of the function. This is the minor sort
|
||||||
|
for this listing. The index shows the location of
|
||||||
|
the function in the gprof listing. If the index is
|
||||||
|
in parenthesis it shows where it would appear in
|
||||||
|
the gprof listing if it were to be printed.
|
||||||
|
|
||||||
|
Copyright (C) 2012-2014 Free Software Foundation, Inc.
|
||||||
|
|
||||||
|
Copying and distribution of this file, with or without modification,
|
||||||
|
are permitted in any medium without royalty provided the copyright
|
||||||
|
notice and this notice are preserved.
|
||||||
|
|
||||||
|
Call graph (explanation follows)
|
||||||
|
|
||||||
|
|
||||||
|
granularity: each sample hit covers 2 byte(s) for 1.35% of 0.74 seconds
|
||||||
|
|
||||||
|
index % time self children called name
|
||||||
|
<spontaneous>
|
||||||
|
[1] 100.0 0.74 0.00 bail_out [1]
|
||||||
|
-----------------------------------------------
|
||||||
|
0.00 0.00 50/50 kernel_dispatch [8]
|
||||||
|
[2] 0.0 0.00 0.00 50 pin_time [2]
|
||||||
|
-----------------------------------------------
|
||||||
|
0.00 0.00 5/5 kernel_dispatch [8]
|
||||||
|
[3] 0.0 0.00 0.00 5 kernel_1_16_fuseaware [3]
|
||||||
|
-----------------------------------------------
|
||||||
|
0.00 0.00 5/5 kernel_dispatch [8]
|
||||||
|
[4] 0.0 0.00 0.00 5 kernel_1_16_simple [4]
|
||||||
|
-----------------------------------------------
|
||||||
|
0.00 0.00 5/5 kernel_dispatch [8]
|
||||||
|
[5] 0.0 0.00 0.00 5 kernel_8_1_fuseaware [5]
|
||||||
|
-----------------------------------------------
|
||||||
|
0.00 0.00 5/5 kernel_dispatch [8]
|
||||||
|
[6] 0.0 0.00 0.00 5 kernel_8_1_simple [6]
|
||||||
|
-----------------------------------------------
|
||||||
|
0.00 0.00 5/5 kernel_dispatch [8]
|
||||||
|
[7] 0.0 0.00 0.00 5 kernel_8_1_simple_fastmath [7]
|
||||||
|
-----------------------------------------------
|
||||||
|
0.00 0.00 5/5 main [23]
|
||||||
|
[8] 0.0 0.00 0.00 5 kernel_dispatch [8]
|
||||||
|
0.00 0.00 50/50 pin_time [2]
|
||||||
|
0.00 0.00 5/5 kernel_1_16_simple [4]
|
||||||
|
0.00 0.00 5/5 kernel_1_16_fuseaware [3]
|
||||||
|
0.00 0.00 5/5 kernel_8_1_simple [6]
|
||||||
|
0.00 0.00 5/5 kernel_8_1_fuseaware [5]
|
||||||
|
0.00 0.00 5/5 kernel_8_1_simple_fastmath [7]
|
||||||
|
-----------------------------------------------
|
||||||
|
0.00 0.00 5/5 main [23]
|
||||||
|
[9] 0.0 0.00 0.00 5 print_kernresult [9]
|
||||||
|
-----------------------------------------------
|
||||||
|
0.00 0.00 2/2 main [23]
|
||||||
|
[10] 0.0 0.00 0.00 2 pin_time [10]
|
||||||
|
-----------------------------------------------
|
||||||
|
0.00 0.00 1/1 main [23]
|
||||||
|
[11] 0.0 0.00 0.00 1 get_int [11]
|
||||||
|
-----------------------------------------------
|
||||||
|
0.00 0.00 1/1 main [23]
|
||||||
|
[12] 0.0 0.00 0.00 1 get_size [12]
|
||||||
|
-----------------------------------------------
|
||||||
|
0.00 0.00 1/1 main [23]
|
||||||
|
[13] 0.0 0.00 0.00 1 testkern [13]
|
||||||
|
-----------------------------------------------
|
||||||
|
|
||||||
|
This table describes the call tree of the program, and was sorted by
|
||||||
|
the total amount of time spent in each function and its children.
|
||||||
|
|
||||||
|
Each entry in this table consists of several lines. The line with the
|
||||||
|
index number at the left hand margin lists the current function.
|
||||||
|
The lines above it list the functions that called this function,
|
||||||
|
and the lines below it list the functions this one called.
|
||||||
|
This line lists:
|
||||||
|
index A unique number given to each element of the table.
|
||||||
|
Index numbers are sorted numerically.
|
||||||
|
The index number is printed next to every function name so
|
||||||
|
it is easier to look up where the function is in the table.
|
||||||
|
|
||||||
|
% time This is the percentage of the `total' time that was spent
|
||||||
|
in this function and its children. Note that due to
|
||||||
|
different viewpoints, functions excluded by options, etc,
|
||||||
|
these numbers will NOT add up to 100%.
|
||||||
|
|
||||||
|
self This is the total amount of time spent in this function.
|
||||||
|
|
||||||
|
children This is the total amount of time propagated into this
|
||||||
|
function by its children.
|
||||||
|
|
||||||
|
called This is the number of times the function was called.
|
||||||
|
If the function called itself recursively, the number
|
||||||
|
only includes non-recursive calls, and is followed by
|
||||||
|
a `+' and the number of recursive calls.
|
||||||
|
|
||||||
|
name The name of the current function. The index number is
|
||||||
|
printed after it. If the function is a member of a
|
||||||
|
cycle, the cycle number is printed between the
|
||||||
|
function's name and the index number.
|
||||||
|
|
||||||
|
|
||||||
|
For the function's parents, the fields have the following meanings:
|
||||||
|
|
||||||
|
self This is the amount of time that was propagated directly
|
||||||
|
from the function into this parent.
|
||||||
|
|
||||||
|
children This is the amount of time that was propagated from
|
||||||
|
the function's children into this parent.
|
||||||
|
|
||||||
|
called This is the number of times this parent called the
|
||||||
|
function `/' the total number of times the function
|
||||||
|
was called. Recursive calls to the function are not
|
||||||
|
included in the number after the `/'.
|
||||||
|
|
||||||
|
name This is the name of the parent. The parent's index
|
||||||
|
number is printed after it. If the parent is a
|
||||||
|
member of a cycle, the cycle number is printed between
|
||||||
|
the name and the index number.
|
||||||
|
|
||||||
|
If the parents of the function cannot be determined, the word
|
||||||
|
`<spontaneous>' is printed in the `name' field, and all the other
|
||||||
|
fields are blank.
|
||||||
|
|
||||||
|
For the function's children, the fields have the following meanings:
|
||||||
|
|
||||||
|
self This is the amount of time that was propagated directly
|
||||||
|
from the child into the function.
|
||||||
|
|
||||||
|
children This is the amount of time that was propagated from the
|
||||||
|
child's children to the function.
|
||||||
|
|
||||||
|
called This is the number of times the function called
|
||||||
|
this child `/' the total number of times the child
|
||||||
|
was called. Recursive calls by the child are not
|
||||||
|
listed in the number after the `/'.
|
||||||
|
|
||||||
|
name This is the name of the child. The child's index
|
||||||
|
number is printed after it. If the child is a
|
||||||
|
member of a cycle, the cycle number is printed
|
||||||
|
between the name and the index number.
|
||||||
|
|
||||||
|
If there are any cycles (circles) in the call graph, there is an
|
||||||
|
entry for the cycle-as-a-whole. This entry shows who called the
|
||||||
|
cycle (as parents) and the members of the cycle (as children.)
|
||||||
|
The `+' recursive calls entry shows the number of function calls that
|
||||||
|
were internal to the cycle, and the calls entry for each member shows,
|
||||||
|
for that member, how many times it was called from other members of
|
||||||
|
the cycle.
|
||||||
|
|
||||||
|
Copyright (C) 2012-2014 Free Software Foundation, Inc.
|
||||||
|
|
||||||
|
Copying and distribution of this file, with or without modification,
|
||||||
|
are permitted in any medium without royalty provided the copyright
|
||||||
|
notice and this notice are preserved.
|
||||||
|
|
||||||
|
Index by function name
|
||||||
|
|
||||||
|
[1] bail_out (aikern.c) [5] kernel_8_1_fuseaware [2] pin_time (aikern.c)
|
||||||
|
[11] get_int (roofline.c) [6] kernel_8_1_simple [9] print_kernresult (roofline.c)
|
||||||
|
[12] get_size (roofline.c) [7] kernel_8_1_simple_fastmath [13] testkern (roofline.c)
|
||||||
|
[3] kernel_1_16_fuseaware [8] kernel_dispatch
|
||||||
|
[4] kernel_1_16_simple [10] pin_time (roofline.c)
|
342
roofline.c
Normal file
342
roofline.c
Normal file
|
@ -0,0 +1,342 @@
|
||||||
|
# include <stdlib.h>
|
||||||
|
# include <stdio.h>
|
||||||
|
# include <unistd.h>
|
||||||
|
# include <ctype.h>
|
||||||
|
# include <sys/time.h>
|
||||||
|
# include <sys/stat.h>
|
||||||
|
# include <errno.h>
|
||||||
|
# include <string.h>
|
||||||
|
# include <stdint.h>
|
||||||
|
# include <getopt.h>
|
||||||
|
# include <stdarg.h>
|
||||||
|
# include <limits.h>
|
||||||
|
|
||||||
|
# include "aikern.h"
|
||||||
|
|
||||||
|
|
||||||
|
/* === Macros === */
|
||||||
|
|
||||||
|
#ifdef ENDEBUG
|
||||||
|
#define DEBUG(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0)
|
||||||
|
#else
|
||||||
|
#define DEBUG(...)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* === Constants === */
|
||||||
|
|
||||||
|
/* === Global Variables === */
|
||||||
|
char* prog_name;
|
||||||
|
|
||||||
|
/* === Prototypes === */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief print usage message
|
||||||
|
*/
|
||||||
|
static void usage(void);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief terminate program on program error
|
||||||
|
* @param msg additional message to print
|
||||||
|
* @param ret exit value
|
||||||
|
*/
|
||||||
|
static void bail_out(char* fmt, ...);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief converts the argument to size_t if possible.
|
||||||
|
* bails out on error.
|
||||||
|
* @param oparg the argument to convert
|
||||||
|
*/
|
||||||
|
static size_t get_size(char* oparg);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief converts the argument to int if possible.
|
||||||
|
* bails out on error.
|
||||||
|
* @param oparg the argument to convert
|
||||||
|
*/
|
||||||
|
static int get_int(char* oparg);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief microseconds since epoch
|
||||||
|
*/
|
||||||
|
static double pin_time(void);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief a simple test kernel with ai of 1/16
|
||||||
|
*/
|
||||||
|
static void testkern(double* a, double* b, double* c, size_t size);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief pretty prints a kern_result
|
||||||
|
*/
|
||||||
|
static void print_kernresult(kern_result* result, const char* logname);
|
||||||
|
|
||||||
|
int main(int argc, char* argv[]) {
|
||||||
|
prog_name = argv[0];
|
||||||
|
|
||||||
|
int opt;
|
||||||
|
char *size_arg = NULL;
|
||||||
|
char *runs_arg = NULL;
|
||||||
|
|
||||||
|
while((opt = getopt(argc, argv, "s:r:")) != -1)
|
||||||
|
{
|
||||||
|
switch(opt)
|
||||||
|
{
|
||||||
|
case 's':
|
||||||
|
size_arg = optarg;
|
||||||
|
break;
|
||||||
|
case 'r':
|
||||||
|
runs_arg = optarg;
|
||||||
|
break;
|
||||||
|
case '?':
|
||||||
|
usage();
|
||||||
|
default:
|
||||||
|
usage();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if(optind < argc)
|
||||||
|
{
|
||||||
|
|
||||||
|
for (int index = optind; index < argc; index++)
|
||||||
|
bail_out ("Non-option argument %s\n", argv[index]);
|
||||||
|
|
||||||
|
usage();
|
||||||
|
}
|
||||||
|
|
||||||
|
if(size_arg == NULL || runs_arg == NULL)
|
||||||
|
usage();
|
||||||
|
|
||||||
|
size_t size = get_size(size_arg);
|
||||||
|
int runs = get_int(runs_arg);
|
||||||
|
|
||||||
|
// Allocating arrays
|
||||||
|
printf("Will run with array sizes of %zu elements\n", size);
|
||||||
|
printf("Will calculate min, max, avg for %d runs\n", runs);
|
||||||
|
double* a = malloc(sizeof(double)*(size));
|
||||||
|
double* b = malloc(sizeof(double)*(size));
|
||||||
|
double* c = malloc(sizeof(double)*(size));
|
||||||
|
|
||||||
|
if(a==NULL || b==NULL || c == NULL)
|
||||||
|
bail_out("One of the mallocs failed\n. a = %p, b=%p, c=%p", a, b, c);
|
||||||
|
|
||||||
|
printf("Allocated 3 arrays (3*%.2f MB = %.2f GB)\n", (sizeof(double)*(size)/1024.0/1024.0), (sizeof(double)*(size)*3/1024.0/1024.0/1024));
|
||||||
|
printf("Filling arrays with dummy values. This will also warm the cache\n");
|
||||||
|
|
||||||
|
// Filling arrays with arbitrary numbers
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (size_t j=0; j<size; j++)
|
||||||
|
{
|
||||||
|
a[j] = 1.0;
|
||||||
|
b[j] = 2.0;
|
||||||
|
c[j] = 3.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
double t;
|
||||||
|
printf("Heating up machine\n");
|
||||||
|
t = pin_time();
|
||||||
|
testkern(a,b,c, size);
|
||||||
|
t = pin_time() - t;
|
||||||
|
printf("Machine heating took %.4f microseconds = %.4f seconds (with test OI kernel)\n", (t*1.0E6), t);
|
||||||
|
printf("Starting tests...\n\n\n");
|
||||||
|
|
||||||
|
// Executing kernels
|
||||||
|
kern_result simple16 = kernel_dispatch(SIMPLE_1_16, a, b, c, size, runs);
|
||||||
|
kern_result fma16 = kernel_dispatch(FMA_1_16, a, b, c, size, runs);
|
||||||
|
kern_result simple8 = kernel_dispatch(SIMPLE_8_1, a, b, c, size, runs);
|
||||||
|
kern_result fma8 = kernel_dispatch(FMA_8_1, a, b, c, size, runs);
|
||||||
|
kern_result simple8fm = kernel_dispatch(SIMPLE_8_1_FASTMATH, a, b, c, size, runs);
|
||||||
|
|
||||||
|
#ifdef INTRINS
|
||||||
|
DEBUG("Running manpack now");
|
||||||
|
kern_result fma8manpack = kernel_dispatch(FMA_8_1_MANPACK, a, b, c, size, runs);
|
||||||
|
DEBUG("manpack run successful");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Freeing arrays
|
||||||
|
free(a);
|
||||||
|
free(b);
|
||||||
|
free(c);
|
||||||
|
|
||||||
|
// Printing results
|
||||||
|
print_kernresult(&simple16, "simple16");
|
||||||
|
print_kernresult(&fma16, "fma16");
|
||||||
|
print_kernresult(&simple8, "simple8");
|
||||||
|
print_kernresult(&fma8, "fma8");
|
||||||
|
print_kernresult(&simple8fm, "simple8fastmath");
|
||||||
|
|
||||||
|
#ifdef INTRINS
|
||||||
|
print_kernresult(&fma8manpack, "fma8manpack");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
printf("\n\n\n");
|
||||||
|
printf("Please refer to the log files in the log/ folder for details about the GFLOP/s of every kernel.");
|
||||||
|
printf("\n");
|
||||||
|
printf("Exiting...");
|
||||||
|
exit(EXIT_SUCCESS);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void testkern(double* a, double* b, double* c, size_t size)
|
||||||
|
{
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (size_t j = 0; j < size; j++)
|
||||||
|
{
|
||||||
|
/* 3*8 Bytes read + 3*8 Bytes write, 3 FLOPs -> AI = 3/(2*3*8) = 1/16 */
|
||||||
|
a[j] = 2.0E0 * a[j];
|
||||||
|
b[j] = 2.0E0 * b[j];
|
||||||
|
c[j] = 2.0E0 * c[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* === Helper Functions === */
|
||||||
|
|
||||||
|
static double pin_time(void)
|
||||||
|
{
|
||||||
|
struct timeval tp;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
i = gettimeofday(&tp,NULL);
|
||||||
|
|
||||||
|
if(i != 0)
|
||||||
|
{
|
||||||
|
bail_out("Time measurement impossible. gettimeofday error");
|
||||||
|
}
|
||||||
|
|
||||||
|
return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
|
||||||
|
}
|
||||||
|
|
||||||
|
static size_t get_size(char *oparg)
|
||||||
|
{
|
||||||
|
long long int llsize = strtoll(oparg, NULL, 10);
|
||||||
|
|
||||||
|
if(llsize <= 0)
|
||||||
|
usage();
|
||||||
|
|
||||||
|
unsigned long long int u_llsize = (unsigned long long int) llsize;
|
||||||
|
|
||||||
|
if(u_llsize > SIZE_MAX)
|
||||||
|
{
|
||||||
|
bail_out("Only size between 1 to %zu allowed.", SIZE_MAX);
|
||||||
|
}
|
||||||
|
|
||||||
|
return (size_t) llsize;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int get_int(char *oparg)
|
||||||
|
{
|
||||||
|
long long int llsize = strtoll(oparg, NULL, 10);
|
||||||
|
|
||||||
|
if(llsize <= 0)
|
||||||
|
usage();
|
||||||
|
|
||||||
|
unsigned long long int u_llsize = (unsigned long long int) llsize;
|
||||||
|
|
||||||
|
if(u_llsize > INT_MAX)
|
||||||
|
{
|
||||||
|
bail_out("Only size between 1 to %d allowed.", INT_MAX);
|
||||||
|
}
|
||||||
|
|
||||||
|
return (int) llsize;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void usage()
|
||||||
|
{
|
||||||
|
fprintf(stderr, "USAGE: ./roofline -s <size> -r <runs> \n");
|
||||||
|
fprintf(stderr, "e.g.: ./roofline -s 100000 -r 5 \n");
|
||||||
|
bail_out("Invalid paramers");
|
||||||
|
}
|
||||||
|
|
||||||
|
static void bail_out(char* fmt, ...)
|
||||||
|
{
|
||||||
|
if(fmt != NULL)
|
||||||
|
{
|
||||||
|
char msgbuf[150];
|
||||||
|
|
||||||
|
va_list vl;
|
||||||
|
va_start(vl, fmt);
|
||||||
|
|
||||||
|
if(vsnprintf(msgbuf, sizeof(msgbuf), fmt, vl) < 0)
|
||||||
|
msgbuf[0] = '\0';
|
||||||
|
|
||||||
|
va_end( vl);
|
||||||
|
|
||||||
|
if(strlen(msgbuf) > 0)
|
||||||
|
(void)fprintf(stderr, "%s: %s \n", prog_name, msgbuf);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
if(errno != 0)
|
||||||
|
(void)fprintf(stderr, "%s: %s\n", prog_name, strerror(errno));
|
||||||
|
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void print_kernresult(kern_result* result, const char* logname)
|
||||||
|
{
|
||||||
|
struct stat st = {};
|
||||||
|
|
||||||
|
if (stat("log", &st) == -1)
|
||||||
|
{
|
||||||
|
if(mkdir("log", 0700))
|
||||||
|
{
|
||||||
|
bail_out("Couldn't create log directory for %s", result->kern_name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
char logpath[20];
|
||||||
|
snprintf(logpath, sizeof(logpath), "%s/%s", "log", logname);
|
||||||
|
FILE* log = fopen(logpath, "w");
|
||||||
|
if(log == NULL)
|
||||||
|
bail_out("Couldn't open log file for %s", result->kern_name);
|
||||||
|
|
||||||
|
if(fputs("run,start,end,delta,GFLOP/s\n", log) == EOF)
|
||||||
|
{
|
||||||
|
fclose(log);
|
||||||
|
bail_out("Couldn't write header to log file");
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("=== %s ===\n", result->kern_name);
|
||||||
|
|
||||||
|
double min;
|
||||||
|
double max;
|
||||||
|
double sum = 0.0;
|
||||||
|
double deltas[result->runs];
|
||||||
|
|
||||||
|
deltas[0] = result->ends[0] - result->starts[0];
|
||||||
|
min=deltas[0];
|
||||||
|
max=deltas[0];
|
||||||
|
sum+=deltas[0];
|
||||||
|
|
||||||
|
for(size_t i=1; i<result->runs; i++)
|
||||||
|
{
|
||||||
|
deltas[i] = result->ends[i] - result->starts[i];
|
||||||
|
sum+=deltas[i];
|
||||||
|
|
||||||
|
if(deltas[i] < min) min=deltas[i];
|
||||||
|
if(deltas[i] > max) max=deltas[i];
|
||||||
|
|
||||||
|
double gflops = ((result->flops * result->size) / deltas[i]) / 1.0E9;
|
||||||
|
|
||||||
|
if(fprintf(log, "%zu,%.4f,%.4f,%.4f,%.4f\n",
|
||||||
|
i, result->starts[i],
|
||||||
|
result->ends[i], deltas[i],
|
||||||
|
gflops) == EOF)
|
||||||
|
{
|
||||||
|
fclose(log);
|
||||||
|
bail_out("Couldn't write to log file");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
printf("%d flop(s) per run\t %zu run(s)\n\n", result->flops, result->runs);
|
||||||
|
printf("Min: %.4f \t Max: %.4f \t Avg: %.4f\n", min, max, (sum/result->runs));
|
||||||
|
|
||||||
|
printf("\n\n\n");
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if(fclose(log))
|
||||||
|
{
|
||||||
|
bail_out("Couldn't close log file for %s", result->kern_name);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in a new issue