major refactoring

This commit is contained in:
Armin Friedl 2016-06-24 00:55:50 +02:00
parent ba7a732d31
commit 31bfead054
9 changed files with 163 additions and 89 deletions

View file

@ -20,11 +20,73 @@ static void bail_out(char* fmt, ...);
*/
static double pin_time(void);
void kernel_1_16_simple(double* a, double* b, double* c, size_t size)
/**
 * @brief Runs the selected kernel `runs` times and records a timestamp
 *        (taken with pin_time()) immediately before and after every run.
 *
 * @param kernel Which kernel to execute.
 * @param a      Array of `size` doubles; passed to every kernel.
 * @param b      Array of `size` doubles; only passed to the FMA_1_16 kernel.
 * @param c      Array of `size` doubles; only passed to the FMA_1_16 kernel.
 * @param size   Number of elements in each array.
 * @param runs   Number of timed executions of the kernel.
 * @return A kern_result whose `starts` and `ends` arrays each hold `runs`
 *         timestamps. Both arrays are allocated with malloc() here and are
 *         not freed by this function, so the caller has to free() them.
 *
 * Calls bail_out() if an allocation fails or if `kernel` is not one of the
 * handled enum values.
 */
kern_result kernel_dispatch(kernel_t kernel,
                            double* a, double* b, double* c,
                            size_t size, size_t runs)
{
    kern_result result;
    result.runs = runs;
    result.flops = 0; /* defined value even if no case below matches */
    result.starts = malloc(runs * sizeof *result.starts);
    result.ends = malloc(runs * sizeof *result.ends);

    /* malloc(0) is allowed to return NULL, so a NULL pointer is only an
     * error when at least one timestamp slot was actually requested. */
    if(runs > 0 && (result.starts == NULL || result.ends == NULL))
    {
        /* %p requires a void* argument */
        bail_out("One of the mallocs failed\n. starts = %p, ends=%p",
                 (void*)result.starts, (void*)result.ends);
    }

    switch(kernel)
    {
    case SIMPLE_1_16:
        result.flops = 1;
        for(size_t r = 0; r < runs; r++)
        {
            result.starts[r] = pin_time();
            kernel_1_16_simple(a, size);
            result.ends[r] = pin_time();
        }
        break;

    case FMA_1_16:
        result.flops = 2;
        for(size_t r = 0; r < runs; r++)
        {
            result.starts[r] = pin_time();
            kernel_1_16_fuseaware(a, b, c, size);
            result.ends[r] = pin_time();
        }
        break;

    case SIMPLE_8_1:
        result.flops = 128;
        for(size_t r = 0; r < runs; r++)
        {
            result.starts[r] = pin_time();
            kernel_8_1_simple(a, size);
            result.ends[r] = pin_time();
        }
        break;

    case FMA_8_1:
        result.flops = 128;
        for(size_t r = 0; r < runs; r++)
        {
            result.starts[r] = pin_time();
            kernel_8_1_fuseaware(a, size);
            result.ends[r] = pin_time();
        }
        break;

    default:
        /* kernel is an enum value, not a string: print it as an integer */
        bail_out("No such kernel %d", (int)kernel);
    }

    return result;
}
/*
 * Squares every element of a in place: a[i] = a[i] * a[i] for 0 <= i < size.
 * The loop is annotated with "#pragma omp parallel for".
 *
 * NOTE(review): this span comes from a diff view without +/- markers; the two
 * consecutive loop headers look like the old and the new spelling of the same
 * line rather than a nested loop -- confirm against the real file.
 */
void kernel_1_16_simple(double* a, size_t size)
{
/* t is not read anywhere in the visible body */
double t = pin_time();
#pragma omp parallel for
for(size_t i=0; i<size; i++){
for(size_t i=0; i<size; i++)
{
/* one multiplication per element, result stored back into a */
a[i] = a[i] * a[i];
}
}
@ -32,15 +94,17 @@ void kernel_1_16_simple(double* a, double* b, double* c, size_t size)
/*
 * Computes a[i] = a[i] * b[i] + c[i] for 0 <= i < size, storing the result
 * in a. b and c are only read. The loop is annotated with
 * "#pragma omp parallel for".
 *
 * NOTE(review): the two consecutive loop headers look like the old and new
 * spelling of one line in a diff view without +/- markers -- confirm.
 */
void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
{
#pragma omp parallel for
for(size_t i=0; i<size; i++){
for(size_t i=0; i<size; i++)
{
/* multiply followed by add; presumably meant to compile to a single FMA */
a[i] = a[i] * b[i] + c[i];
}
}
void kernel_8_1_simple(double* a, double* b, double* c, size_t size)
void kernel_8_1_simple(double* a, size_t size)
{
#pragma omp parallel for
for(size_t i=0; i<size; i++){
for(size_t i=0; i<size; i++)
{
a[i] = REP100(a[i]*)
REP20(a[i]*)
REP8(a[i]*)
@ -48,17 +112,23 @@ void kernel_8_1_simple(double* a, double* b, double* c, size_t size)
}
}
/*
 * Repeatedly applies a[i] = a[i] * a[i] + a[i] to every element of a, in
 * place, inside a loop annotated with "#pragma omp parallel for".
 *
 * The statement is wrapped in REP60(...) and REP4(...); those macros are
 * defined outside this view. Presumably they repeat their argument 60 and 4
 * times (64 multiply-add statements per element) -- TODO confirm.
 *
 * NOTE(review): the doubled signature and loop header look like old/new lines
 * of a diff view without +/- markers; the two-parameter form is the one the
 * visible dispatcher calls.
 */
void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size)
void kernel_8_1_fuseaware(double* a, size_t size)
{
#pragma omp parallel for
for(size_t i=0; i<size; i++){
for(size_t i=0; i<size; i++)
{
REP60(a[i] = a[i] * a[i] + a[i];)
REP4(a[i] = a[i] * a[i] + a[i];)
}
}
/********************************************
* Kernels which potentially compile to *
* different operational intensities than *
* specified *
********************************************/
void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size)
void kernel_1_16_simple_dangerous(double* a, double* b, size_t size)
{
register volatile double tmp = 0.1;
@ -68,29 +138,27 @@ void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size)
}
}
/*
 * For every element of a, multiplies a[i] by itself eight times over (seven
 * multiplications) and stores the product in the local variable tmp. The
 * array a itself is never written.
 *
 * NOTE(review): the doubled signature and loop header look like old/new lines
 * of a diff view without +/- markers -- confirm against the real file.
 */
void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size)
void kernel_8_1_simple_dangerous(double* a, size_t size)
{
/* volatile sink for the result.
 * NOTE(review): tmp is declared outside the parallel loop, so it looks like
 * it is shared between the OpenMP threads -- confirm this is intended. */
register volatile double tmp = 0.1;
#pragma omp parallel for
for(size_t i=0; i<size; i++){
for(size_t i=0; i<size; i++)
{
tmp = a[i] * a[i] * a[i] * a[i] *
a[i] * a[i] * a[i] * a[i];
}
}
/*
 * Stores a[i] * a[i] into the local variable tmp for every element of a.
 * The array a itself is never written.
 *
 * NOTE(review): the doubled signature and loop header look like old/new lines
 * of a diff view without +/- markers; in that reading the OpenMP pragma was
 * added in front of the new loop header -- confirm against the real file.
 */
void kernel_1_8_vo_dangerous(double* a, double* b, double* c, size_t size)
void kernel_1_8_vo_dangerous(double* a, size_t size)
{
/* This is the kernel with an arithmetic intensity (AI) of 1/8 from the
* lecture.
*/
/* volatile sink for the result.
 * NOTE(review): declared outside the loop that carries the OpenMP pragma, so
 * it looks shared between threads -- confirm this is intended. */
register volatile double tmp=0.0;
for(size_t i=0; i<size; i++) {
#pragma omp parallel for
for(size_t i=0; i<size; i++)
{
tmp = a[i] * a[i];
}
}

View file

@ -1,12 +1,38 @@
#ifndef AIKERN_H
#define AIKERN_H
/**
 * Timing record returned by kernel_dispatch() for one kernel.
 */
typedef struct {
size_t runs; // number of timed runs; also the element count of starts and ends
double* starts; // starts[r]: timestamp taken right before run r
double* ends; // ends[r]: timestamp taken right after run r
int flops; // per-kernel constant set by kernel_dispatch() (1, 2 or 128).
           // NOTE(review): looks like flops per array element, not per whole run -- confirm
} kern_result;
/**
 * Selects which kernel kernel_dispatch() runs:
 *   SIMPLE_1_16 -> kernel_1_16_simple
 *   FMA_1_16    -> kernel_1_16_fuseaware
 *   SIMPLE_8_1  -> kernel_8_1_simple
 *   FMA_8_1     -> kernel_8_1_fuseaware
 */
typedef enum {
SIMPLE_1_16, FMA_1_16, SIMPLE_8_1, FMA_8_1
} kernel_t;
/**
* @brief Main entry point: runs the selected kernel `runs` times and takes a
*        timestamp before and after every run.
* @param kernel The kernel to run
* @param a An array of `size` double values; passed to every kernel
* @param b An array of `size` double values; only passed to the FMA_1_16 kernel
* @param c An array of `size` double values; only passed to the FMA_1_16 kernel
* @param size The number of elements in each array
* @param runs How often the kernel should be executed
* @return kern_result with `runs` start and end timestamps and the flops
*         constant of the chosen kernel
*
* The `starts` and `ends` arrays of the result are allocated with malloc()
* inside this function and are not freed by it, so the caller has to free()
* them.
*
* Calls bail_out() if one of the allocations fails or if `kernel` is not a
* handled enum value.
*/
kern_result kernel_dispatch(kernel_t kernel,
double* a, double* b, double* c,
size_t size, size_t runs);
/**
* @brief A simple 1/16 operational intensity kernel
* @param a An array with double values of size param size
* @param size Number of elements in a
*
* === Warning ===
* Don't use with -O0: Stores everything on stack
@ -26,8 +52,7 @@
* Nothing special
*
*/
void kernel_1_16_simple(double* a, double* b, double* c, size_t size);
void kernel_1_16_simple(double* a, size_t size);
/**
@ -36,6 +61,7 @@ void kernel_1_16_simple(double* a, double* b, double* c, size_t size);
* @param b An array with double values of size param size
* @param c An array with double values of size param size
* @param size Size of the three param arrays
*
* === Warning ===
* This is dangerous if FMA is not used/can't be used. Then there
@ -67,9 +93,8 @@ void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size);
/**
* @brief A simple 8/1 operational intensity kernel
* @param a An array with double values of size param size
* @param b An array with double values of size param size
* @param c An array with double values of size param size
* @param size Number of elements in a
*
* === Warning ===
* Don't use with -O0: Stores everything on stack
@ -95,14 +120,13 @@ void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size);
* === Optimization ===
* Nothing special
*/
void kernel_8_1_simple(double* a, double* b, double* c, size_t size);
void kernel_8_1_simple(double* a, size_t size);
/**
* @brief A 8/1 operational intensity kernel utilizing FMA
* @param a An array with double values of size param size
* @param b An array with double values of size param size
* @param c An array with double values of size param size
* @param size Number of elements in a
*
* === Warning ===
* This is dangerous if FMA is not used/can't be used. Then there
@ -126,7 +150,7 @@ void kernel_8_1_simple(double* a, double* b, double* c, size_t size);
* For packed doubles compile with -Ofast
*
*/
void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size);
void kernel_8_1_fuseaware(double* a, size_t size);
/********************************************
@ -139,7 +163,6 @@ void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size);
* @brief A 1/16 operational intensity which might compile to a flawed oi kernel
* @param a An array with double values of size param size
* @param b An array with double values of size param size
* @param c An array with double values of size param size
* @param size Size of the three param arrays
*
* === Problem ===
@ -158,25 +181,21 @@ void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size);
* Without volatile (-O3):
* repz ret # that's it
*/
void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size);
void kernel_1_16_simple_dangerous(double* a, double* b, size_t size);
/**
* @brief A 8/1 operational intensity which might compile to a flawed oi kernel
* @param a An array with double values of size param size
* @param b An array with double values of size param size
* @param c An array with double values of size param size
* @param size Size of the three param arrays
*
* === Problem ===
* Same as for kernel_1_16_simple_dangerous
*/
void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size);
void kernel_8_1_simple_dangerous(double* a, size_t size);
/**
* @brief A 1/8 operational intensity which might compile to a flawed oi kernel
* @param a An array with double values of size param size
* @param b An array with double values of size param size
* @param c An array with double values of size param size
* @param size Size of the three param arrays
*
* === Problem ===
@ -188,7 +207,7 @@ void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size);
* how large the array is and how the CPU works internally
* -> unpredictable.
*/
void kernel_1_8_vo_dangerous(double* a, double* b, double* c, size_t size);
void kernel_1_8_vo_dangerous(double* a, size_t size);
/****************************************

BIN
roofline/src/roofline Executable file

Binary file not shown.

View file

@ -64,6 +64,11 @@ static double pin_time(void);
*/
static void testkern(double* a, double* b, double* c, size_t size);
/**
* @brief pretty prints a kern_result
*/
static void print_kernresult(kern_result* result);
int main(int argc, char* argv[]) {
prog_name = argv[0];
@ -130,34 +135,12 @@ int main(int argc, char* argv[]) {
t = pin_time() - t;
printf("Machine heating took %.4f microseconds = %.4f seconds (with test OI kernel)\n", (t*1.0E6), t);
kern_result simple16 = kernel_dispatch(SIMPLE_1_16, a, b, c, size, runs);
kern_result fma16 = kernel_dispatch(FMA_1_16, a, b, c, size, runs);
kern_result simple8 = kernel_dispatch(SIMPLE_8_1, a, b, c, size, runs);
kern_result fma8 = kernel_dispatch(FMA_8_1, a, b, c, size, runs);
/*
TESTS!!
*/
printf("1/16 simple\n");
t = pin_time();
kernel_1_16_simple(a,b,c, size);
t = pin_time() - t;
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
printf("1/16 fuseaware\n");
t = pin_time();
kernel_1_16_fuseaware(a,b,c, size);
t = pin_time() - t;
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
printf("8 simple\n");
t = pin_time();
kernel_8_1_simple(a,b,c, size);
t = pin_time() - t;
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
printf("8 fuseaware\n");
t = pin_time();
kernel_8_1_fuseaware(a,b,c, size);
t = pin_time() - t;
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
print_kernresult(&simple16);
exit(EXIT_SUCCESS);
}
@ -249,3 +232,7 @@ static void bail_out(char* fmt, ...)
exit(EXIT_FAILURE);
}
/*
 * Placeholder for pretty-printing a kern_result: at the moment it does
 * nothing with its argument and produces no output.
 */
static void print_kernresult(kern_result* result)
{
    (void)result; /* intentionally unused until printing is implemented */
}

BIN
roofline/src/roofline_fma Executable file

Binary file not shown.

Binary file not shown.

BIN
roofline/src/roofline_fma_fast_o3 Executable file

Binary file not shown.

BIN
roofline/src/roofline_fma_o3 Executable file

Binary file not shown.

BIN
roofline/src/roofline_o3 Executable file

Binary file not shown.