major refactoring

This commit is contained in:
Armin Friedl 2016-06-24 00:55:50 +02:00
parent ba7a732d31
commit 31bfead054
9 changed files with 163 additions and 89 deletions

View file

@ -20,11 +20,73 @@ static void bail_out(char* fmt, ...);
*/ */
static double pin_time(void); static double pin_time(void);
/**
 * @brief Runs the selected kernel `runs` times and records one start and one
 *        end timestamp (from pin_time()) around every single run.
 * @param kernel The kernel to execute
 * @param a Array of `size` doubles; passed to (and overwritten by) every kernel
 * @param b Array of `size` doubles; only passed to the FMA_1_16 kernel
 * @param c Array of `size` doubles; only passed to the FMA_1_16 kernel
 * @param size Number of elements in each array
 * @param runs Number of timed executions of the kernel
 * @return kern_result with `runs` start/end timestamps. The starts and ends
 *         arrays are heap-allocated here and are not freed by this function.
 *
 * Calls bail_out() when an allocation fails or when `kernel` is not one of
 * the known kernel_t values.
 */
kern_result kernel_dispatch(kernel_t kernel,
                            double* a, double* b, double* c,
                            size_t size, size_t runs)
{
    kern_result result;
    result.runs = runs;
    result.flops = 0;

    /* calloc checks runs * sizeof(double) for overflow */
    result.starts = calloc(runs, sizeof *result.starts);
    result.ends = calloc(runs, sizeof *result.ends);

    /* A zero-sized allocation may legitimately yield NULL, so only treat
     * NULL as a failure when memory was actually requested. */
    if(runs > 0 && (result.starts == NULL || result.ends == NULL))
    {
        /* %p requires void* arguments */
        bail_out("One of the allocations failed. starts = %p, ends = %p",
                 (void*)result.starts, (void*)result.ends);
    }

    /* Every case has its own timing loop so that no dispatch branch sits
     * between the two pin_time() calls of a run. */
    switch(kernel)
    {
    case SIMPLE_1_16:
        result.flops = 1;
        for(size_t r=0; r<runs; r++)
        {
            result.starts[r] = pin_time();
            kernel_1_16_simple(a, size);
            result.ends[r] = pin_time();
        }
        break;

    case FMA_1_16:
        result.flops = 2;
        for(size_t r=0; r<runs; r++)
        {
            result.starts[r] = pin_time();
            kernel_1_16_fuseaware(a, b, c, size);
            result.ends[r] = pin_time();
        }
        break;

    case SIMPLE_8_1:
        result.flops = 128;
        for(size_t r=0; r<runs; r++)
        {
            result.starts[r] = pin_time();
            kernel_8_1_simple(a, size);
            result.ends[r] = pin_time();
        }
        break;

    case FMA_8_1:
        result.flops = 128;
        for(size_t r=0; r<runs; r++)
        {
            result.starts[r] = pin_time();
            kernel_8_1_fuseaware(a, size);
            result.ends[r] = pin_time();
        }
        break;

    default:
        /* kernel is an enum value, not a string: print it as an integer */
        bail_out("No such kernel %d", (int)kernel);
    }

    return result;
}
/**
 * @brief Squares every element of a in place (one multiplication per element).
 * @param a Array of `size` doubles; each element is replaced by its square
 * @param size Number of elements in a
 *
 * The kernel takes no timestamps itself; kernel_dispatch() wraps each call
 * in its own pin_time() pair.
 */
void kernel_1_16_simple(double* a, size_t size)
{
    #pragma omp parallel for
    for(size_t i=0; i<size; i++)
    {
        a[i] = a[i] * a[i];
    }
}
@ -32,15 +94,17 @@ void kernel_1_16_simple(double* a, double* b, double* c, size_t size)
/**
 * @brief Multiply-add kernel: a[k] = a[k] * b[k] + c[k] for every element.
 * @param a Array of `size` doubles; read and overwritten with the result
 * @param b Array of `size` doubles; read only
 * @param c Array of `size` doubles; read only
 * @param size Number of elements in each array
 */
void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
{
    #pragma omp parallel for
    for(size_t k = 0; k < size; k++)
    {
        /* multiply and add stay in one expression, as in the original */
        a[k] = a[k] * b[k] + c[k];
    }
}
void kernel_8_1_simple(double* a, double* b, double* c, size_t size) void kernel_8_1_simple(double* a, size_t size)
{ {
#pragma omp parallel for #pragma omp parallel for
for(size_t i=0; i<size; i++){ for(size_t i=0; i<size; i++)
{
a[i] = REP100(a[i]*) a[i] = REP100(a[i]*)
REP20(a[i]*) REP20(a[i]*)
REP8(a[i]*) REP8(a[i]*)
@ -48,17 +112,23 @@ void kernel_8_1_simple(double* a, double* b, double* c, size_t size)
} }
} }
void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size) void kernel_8_1_fuseaware(double* a, size_t size)
{ {
#pragma omp parallel for #pragma omp parallel for
for(size_t i=0; i<size; i++){ for(size_t i=0; i<size; i++)
{
REP60(a[i] = a[i] * a[i] + a[i];) REP60(a[i] = a[i] * a[i] + a[i];)
REP4(a[i] = a[i] * a[i] + a[i];) REP4(a[i] = a[i] * a[i] + a[i];)
} }
} }
/********************************************
* Kernels which potentially compile to *
* different operational intensities than *
* specified *
********************************************/
void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size) void kernel_1_16_simple_dangerous(double* a, double* b, size_t size)
{ {
register volatile double tmp = 0.1; register volatile double tmp = 0.1;
@ -68,29 +138,27 @@ void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size)
} }
} }
/**
 * @brief Multiplies each element of a with itself eight times over and stores
 *        the product in a volatile local; a itself is never written.
 * @param a Array of `size` doubles; read only
 * @param size Number of elements in a
 *
 * NOTE(review): sink is declared outside the parallel loop, so with OpenMP
 * enabled all threads presumably write the same variable without
 * synchronization - confirm this is intended.
 */
void kernel_8_1_simple_dangerous(double* a, size_t size)
{
    register volatile double sink = 0.1;

    #pragma omp parallel for
    for(size_t k = 0; k < size; k++)
    {
        sink = a[k] * a[k] * a[k] * a[k] *
               a[k] * a[k] * a[k] * a[k];
    }
}
/**
 * @brief Squares each element of a and stores the product in a volatile
 *        local; a itself is never written.
 * @param a Array of `size` doubles; read only
 * @param size Number of elements in a
 *
 * NOTE(review): sink is declared outside the parallel loop, so with OpenMP
 * enabled all threads presumably write the same variable without
 * synchronization - confirm this is intended.
 */
void kernel_1_8_vo_dangerous(double* a, size_t size)
{
    /* This is the 1/8 AI kernel from the lecture
     */
    register volatile double sink = 0.0;

    #pragma omp parallel for
    for(size_t k = 0; k < size; k++)
    {
        sink = a[k] * a[k];
    }
}

View file

@ -1,12 +1,38 @@
#ifndef AIKERN_H #ifndef AIKERN_H
#define AIKERN_H #define AIKERN_H
/**
 * @brief Timing record for one kernel, as filled in by kernel_dispatch().
 *
 * NOTE(review): kernel_dispatch() stores 1, 2 or 128 in flops, which looks
 * like a per-element rather than a per-run count - confirm.
 */
typedef struct kern_result {
    size_t runs;     /* number of runs; also the length of starts and ends */
    double* starts;  /* start time of each run */
    double* ends;    /* end time of each run */
    int flops;       /* flops per run */
} kern_result;
/**
 * @brief Selects which kernel kernel_dispatch() executes.
 */
typedef enum {
    SIMPLE_1_16,  /* runs kernel_1_16_simple */
    FMA_1_16,     /* runs kernel_1_16_fuseaware */
    SIMPLE_8_1,   /* runs kernel_8_1_simple */
    FMA_8_1       /* runs kernel_8_1_fuseaware */
} kernel_t;
/**
 * @brief Main entry point. Runs the selected kernel `runs` times and takes a
 *        start and an end timestamp around every run
 * @param kernel the kernel to run
 * @param a An array with double values of size param size; passed to every kernel
 * @param b An array with double values of size param size; only passed to the FMA_1_16 kernel
 * @param c An array with double values of size param size; only passed to the FMA_1_16 kernel
 * @param size The size of the arrays
 * @param runs How often the kernel should be executed
 * @return kern_result containing information about the kernel execution.
 *         Its starts and ends arrays are allocated by this function and are
 *         not freed by it.
 *
 * Calls bail_out() if an allocation fails or if kernel is not a known
 * kernel_t value.
 */
kern_result kernel_dispatch(kernel_t kernel,
double* a, double* b, double* c,
size_t size, size_t runs);
/** /**
* @brief A simple 1/16 operational intensity kernel * @brief A simple 1/16 operational intensity kernel
* @param a An array with double values of size param size * @param a An array with double values of size param size
* @param b An array with double values of size param size
* @param c An array with double values of size param size
* @param size Size of the three param arrays * @param size Size of the three param arrays
* @note There is no result parameter; kernel_dispatch() records the run timings
* *
* === Warning === * === Warning ===
* Don't use with -O0: Stores everything on stack * Don't use with -O0: Stores everything on stack
@ -26,8 +52,7 @@
* Nothing special * Nothing special
* *
*/ */
void kernel_1_16_simple(double* a, double* b, double* c, size_t size); void kernel_1_16_simple(double* a, size_t size);
/** /**
@ -36,6 +61,7 @@ void kernel_1_16_simple(double* a, double* b, double* c, size_t size);
* @param b An array with double values of size param size * @param b An array with double values of size param size
* @param c An array with double values of size param size * @param c An array with double values of size param size
* @param size Size of the three param arrays * @param size Size of the three param arrays
* @note There is no result parameter; kernel_dispatch() records the run timings
* *
* === Warning === * === Warning ===
* This is dangerous if FMA is not used/can't be used. Then there * This is dangerous if FMA is not used/can't be used. Then there
@ -67,9 +93,8 @@ void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size);
/** /**
* @brief A simple 8/1 operational intensity kernel * @brief A simple 8/1 operational intensity kernel
* @param a An array with double values of size param size * @param a An array with double values of size param size
* @param b An array with double values of size param size
* @param c An array with double values of size param size
* @param size Size of the three param arrays * @param size Size of the three param arrays
* @note There is no result parameter; kernel_dispatch() records the run timings
* *
* === Warning === * === Warning ===
* Don't use with -O0: Stores everything on stack * Don't use with -O0: Stores everything on stack
@ -95,14 +120,13 @@ void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size);
* === Optimization === * === Optimization ===
* Nothing special * Nothing special
*/ */
void kernel_8_1_simple(double* a, double* b, double* c, size_t size); void kernel_8_1_simple(double* a, size_t size);
/** /**
* @brief A 8/1 operational intensity kernel utilizing FMA * @brief A 8/1 operational intensity kernel utilizing FMA
* @param a An array with double values of size param size * @param a An array with double values of size param size
* @param b An array with double values of size param size
* @param c An array with double values of size param size
* @param size Size of the three param arrays * @param size Size of the three param arrays
* @note There is no result parameter; kernel_dispatch() records the run timings
* *
* === Warning === * === Warning ===
* This is dangerous if FMA is not used/can't be used. Then there * This is dangerous if FMA is not used/can't be used. Then there
@ -126,7 +150,7 @@ void kernel_8_1_simple(double* a, double* b, double* c, size_t size);
* For packed doubles compile with -Ofast * For packed doubles compile with -Ofast
* *
*/ */
void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size); void kernel_8_1_fuseaware(double* a, size_t size);
/******************************************** /********************************************
@ -139,7 +163,6 @@ void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size);
* @brief A 1/16 operational intensity which might compile to a flawed oi kernel * @brief A 1/16 operational intensity which might compile to a flawed oi kernel
* @param a An array with double values of size param size * @param a An array with double values of size param size
* @param b An array with double values of size param size * @param b An array with double values of size param size
* @param c An array with double values of size param size
* @param size Size of the three param arrays * @param size Size of the three param arrays
* *
* === Problem === * === Problem ===
@ -158,25 +181,21 @@ void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size);
* Without volatile (-O3): * Without volatile (-O3):
* repz ret # that's it * repz ret # that's it
*/ */
void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size); void kernel_1_16_simple_dangerous(double* a, double* b, size_t size);
/** /**
* @brief A 8/1 operational intensity which might compile to a flawed oi kernel * @brief A 8/1 operational intensity which might compile to a flawed oi kernel
* @param a An array with double values of size param size * @param a An array with double values of size param size
* @param b An array with double values of size param size
* @param c An array with double values of size param size
* @param size Size of the three param arrays * @param size Size of the three param arrays
* *
* === Problem == * === Problem ==
* Same as for kernel_1_16_simple_dangerous * Same as for kernel_1_16_simple_dangerous
*/ */
void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size); void kernel_8_1_simple_dangerous(double* a, size_t size);
/** /**
* @brief A 1/8 operational intensity which might compile to a flawed oi kernel * @brief A 1/8 operational intensity which might compile to a flawed oi kernel
* @param a An array with double values of size param size * @param a An array with double values of size param size
* @param b An array with double values of size param size
* @param c An array with double values of size param size
* @param size Size of the three param arrays * @param size Size of the three param arrays
* *
* === Problem == * === Problem ==
@ -188,7 +207,7 @@ void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size);
* how large the array is and how the cpu work internally * how large the array is and how the cpu work internally
* -> unpredictable. * -> unpredictable.
*/ */
void kernel_1_8_vo_dangerous(double* a, double* b, double* c, size_t size); void kernel_1_8_vo_dangerous(double* a, size_t size);
/**************************************** /****************************************

BIN
roofline/src/roofline Executable file

Binary file not shown.

View file

@ -64,6 +64,11 @@ static double pin_time(void);
*/ */
static void testkern(double* a, double* b, double* c, size_t size); static void testkern(double* a, double* b, double* c, size_t size);
/**
 * @brief pretty prints a kern_result
 * @param result the kern_result to print
 *
 * NOTE: the definition in this file is currently an empty stub that
 * returns without printing anything.
 */
static void print_kernresult(kern_result* result);
int main(int argc, char* argv[]) { int main(int argc, char* argv[]) {
prog_name = argv[0]; prog_name = argv[0];
@ -130,34 +135,12 @@ int main(int argc, char* argv[]) {
t = pin_time() - t; t = pin_time() - t;
printf("Machine heating took %.4f microseconds = %.4f seconds (with test OI kernel)\n", (t*1.0E6), t); printf("Machine heating took %.4f microseconds = %.4f seconds (with test OI kernel)\n", (t*1.0E6), t);
kern_result simple16 = kernel_dispatch(SIMPLE_1_16, a, b, c, size, runs);
kern_result fma16 = kernel_dispatch(FMA_1_16, a, b, c, size, runs);
kern_result simple8 = kernel_dispatch(SIMPLE_8_1, a, b, c, size, runs);
kern_result fma8 = kernel_dispatch(FMA_8_1, a, b, c, size, runs);
/* print_kernresult(&simple16);
TESTS!!
*/
printf("1/16 simple\n");
t = pin_time();
kernel_1_16_simple(a,b,c, size);
t = pin_time() - t;
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
printf("1/16 fuseaware\n");
t = pin_time();
kernel_1_16_fuseaware(a,b,c, size);
t = pin_time() - t;
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
printf("8 simple\n");
t = pin_time();
kernel_8_1_simple(a,b,c, size);
t = pin_time() - t;
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
printf("8 fuseaware\n");
t = pin_time();
kernel_8_1_fuseaware(a,b,c, size);
t = pin_time() - t;
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }
@ -249,3 +232,7 @@ static void bail_out(char* fmt, ...)
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
/**
 * @brief Placeholder for the kern_result pretty printer; prints nothing yet.
 * @param result The result that is meant to be printed (currently unused)
 */
static void print_kernresult(kern_result* result)
{
    /* Intentionally empty: no output is produced so far. */
    (void)result;
}

BIN
roofline/src/roofline_fma Executable file

Binary file not shown.

Binary file not shown.

BIN
roofline/src/roofline_fma_fast_o3 Executable file

Binary file not shown.

BIN
roofline/src/roofline_fma_o3 Executable file

Binary file not shown.

BIN
roofline/src/roofline_o3 Executable file

Binary file not shown.