major refactoring
This commit is contained in:
parent
ba7a732d31
commit
31bfead054
9 changed files with 163 additions and 89 deletions
|
@ -20,11 +20,73 @@ static void bail_out(char* fmt, ...);
|
||||||
*/
|
*/
|
||||||
static double pin_time(void);
|
static double pin_time(void);
|
||||||
|
|
||||||
void kernel_1_16_simple(double* a, double* b, double* c, size_t size)
|
/**
 * Runs the selected kernel `runs` times and times every invocation.
 *
 * For each run r a timestamp from pin_time() is stored in result.starts[r]
 * right before the kernel call and in result.ends[r] right after it.
 * result.flops is set to the value associated with the chosen kernel.
 *
 * @param kernel which kernel to execute
 * @param a      array handed to every kernel
 * @param b      array that is only passed on to the FMA_1_16 kernel
 * @param c      array that is only passed on to the FMA_1_16 kernel
 * @param size   element count forwarded to the kernel
 * @param runs   number of timed invocations
 * @return kern_result whose starts/ends arrays hold `runs` entries each.
 *         Both arrays are heap-allocated here and are not freed by this
 *         function.
 *
 * Calls bail_out() if the timestamp arrays cannot be allocated or if
 * `kernel` is not one of the known kernel_t values.
 */
kern_result kernel_dispatch(kernel_t kernel,
                            double* a, double* b, double* c,
                            size_t size, size_t runs)
{
    kern_result result;
    result.runs = runs;
    result.flops = 0;

    /* calloc refuses a runs * sizeof(double) product that would overflow
     * instead of silently allocating a too-small buffer */
    result.starts = calloc(runs, sizeof *result.starts);
    result.ends = calloc(runs, sizeof *result.ends);

    /* calloc(0, ...) is allowed to return NULL, so NULL only signals a
     * failure when at least one slot was requested */
    if(runs > 0 && (result.starts==NULL || result.ends==NULL))
    {
        /* %p requires void* arguments */
        bail_out("One of the allocations failed. starts = %p, ends = %p",
                 (void*)result.starts, (void*)result.ends);
    }

    switch(kernel)
    {
        case SIMPLE_1_16:
            result.flops = 1;
            for(size_t r=0; r<runs; r++)
            {
                result.starts[r] = pin_time();
                kernel_1_16_simple(a, size);
                result.ends[r] = pin_time();
            }
            break;

        case FMA_1_16:
            result.flops = 2;
            for(size_t r=0; r<runs; r++)
            {
                result.starts[r] = pin_time();
                kernel_1_16_fuseaware(a, b, c, size);
                result.ends[r] = pin_time();
            }
            break;

        case SIMPLE_8_1:
            result.flops = 128;
            for(size_t r=0; r<runs; r++)
            {
                result.starts[r] = pin_time();
                kernel_8_1_simple(a, size);
                result.ends[r] = pin_time();
            }
            break;

        case FMA_8_1:
            result.flops = 128;
            for(size_t r=0; r<runs; r++)
            {
                result.starts[r] = pin_time();
                kernel_8_1_fuseaware(a, size);
                result.ends[r] = pin_time();
            }
            break;

        default:
            /* kernel is an enum value, not a string: print its number */
            bail_out("No such kernel %d", (int)kernel);
    }

    return result;
}
|
||||||
|
|
||||||
|
void kernel_1_16_simple(double* a, size_t size)
|
||||||
{
|
{
|
||||||
double t = pin_time();
|
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for(size_t i=0; i<size; i++){
|
for(size_t i=0; i<size; i++)
|
||||||
|
{
|
||||||
a[i] = a[i] * a[i];
|
a[i] = a[i] * a[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -32,15 +94,17 @@ void kernel_1_16_simple(double* a, double* b, double* c, size_t size)
|
||||||
void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
|
void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
|
||||||
{
|
{
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for(size_t i=0; i<size; i++){
|
for(size_t i=0; i<size; i++)
|
||||||
|
{
|
||||||
a[i] = a[i] * b[i] + c[i];
|
a[i] = a[i] * b[i] + c[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void kernel_8_1_simple(double* a, double* b, double* c, size_t size)
|
void kernel_8_1_simple(double* a, size_t size)
|
||||||
{
|
{
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for(size_t i=0; i<size; i++){
|
for(size_t i=0; i<size; i++)
|
||||||
|
{
|
||||||
a[i] = REP100(a[i]*)
|
a[i] = REP100(a[i]*)
|
||||||
REP20(a[i]*)
|
REP20(a[i]*)
|
||||||
REP8(a[i]*)
|
REP8(a[i]*)
|
||||||
|
@ -48,17 +112,23 @@ void kernel_8_1_simple(double* a, double* b, double* c, size_t size)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size)
|
void kernel_8_1_fuseaware(double* a, size_t size)
|
||||||
{
|
{
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for(size_t i=0; i<size; i++){
|
for(size_t i=0; i<size; i++)
|
||||||
|
{
|
||||||
REP60(a[i] = a[i] * a[i] + a[i];)
|
REP60(a[i] = a[i] * a[i] + a[i];)
|
||||||
REP4(a[i] = a[i] * a[i] + a[i];)
|
REP4(a[i] = a[i] * a[i] + a[i];)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/********************************************
|
||||||
|
* Kernels which potentially compile to *
|
||||||
|
* different operational intensities than *
|
||||||
|
* specified *
|
||||||
|
********************************************/
|
||||||
|
|
||||||
void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size)
|
void kernel_1_16_simple_dangerous(double* a, double* b, size_t size)
|
||||||
{
|
{
|
||||||
register volatile double tmp = 0.1;
|
register volatile double tmp = 0.1;
|
||||||
|
|
||||||
|
@ -68,29 +138,27 @@ void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size)
|
void kernel_8_1_simple_dangerous(double* a, size_t size)
|
||||||
{
|
{
|
||||||
register volatile double tmp = 0.1;
|
register volatile double tmp = 0.1;
|
||||||
|
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for(size_t i=0; i<size; i++){
|
for(size_t i=0; i<size; i++)
|
||||||
|
{
|
||||||
tmp = a[i] * a[i] * a[i] * a[i] *
|
tmp = a[i] * a[i] * a[i] * a[i] *
|
||||||
a[i] * a[i] * a[i] * a[i];
|
a[i] * a[i] * a[i] * a[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void kernel_1_8_vo_dangerous(double* a, double* b, double* c, size_t size)
|
void kernel_1_8_vo_dangerous(double* a, size_t size)
|
||||||
{
|
{
|
||||||
/* This is the 1/8 AI kernel from the lecture
|
|
||||||
|
|
||||||
*/
|
|
||||||
|
|
||||||
register volatile double tmp=0.0;
|
register volatile double tmp=0.0;
|
||||||
|
|
||||||
for(size_t i=0; i<size; i++) {
|
#pragma omp parallel for
|
||||||
|
for(size_t i=0; i<size; i++)
|
||||||
|
{
|
||||||
tmp = a[i] * a[i];
|
tmp = a[i] * a[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,12 +1,38 @@
|
||||||
#ifndef AIKERN_H
|
#ifndef AIKERN_H
|
||||||
#define AIKERN_H
|
#define AIKERN_H
|
||||||
|
|
||||||
|
#include <stddef.h> /* size_t: lets this header be included on its own */

/**
 * Timing record filled in by kernel_dispatch() for one kernel.
 */
typedef struct {
    size_t runs;    // number of timed kernel invocations; also # of start-/endtimes
    double* starts; // starttimes, one entry per run
    double* ends;   // endtimes, one entry per run
    int flops;      // flops per array element and run, e.g. 1 for the single
                    // multiply of kernel_1_16_simple (not a total per run)
} kern_result;

/**
 * Selects the kernel that kernel_dispatch() executes.
 */
typedef enum {
    SIMPLE_1_16, // runs kernel_1_16_simple
    FMA_1_16,    // runs kernel_1_16_fuseaware
    SIMPLE_8_1,  // runs kernel_8_1_simple
    FMA_8_1      // runs kernel_8_1_fuseaware
} kernel_t;
|
||||||
|
|
||||||
|
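/**
 * @brief Main entry point. Dispatches the kernel calls: runs the selected
 *        kernel `runs` times and records a start and an end timestamp
 *        around every invocation.
 * @param kernel the kernel to run
 * @param a An array with double values of size param size (passed to every kernel)
 * @param b An array with double values of size param size (only passed to the FMA_1_16 kernel)
 * @param c An array with double values of size param size (only passed to the FMA_1_16 kernel)
 * @param size The size of the arrays
 * @param runs How often the kernel should be executed
 * @return kern_result containing information about the kernel execution.
 *         Its starts/ends arrays are heap-allocated with `runs` entries each
 *         and are not freed by this function.
 *
 * Calls bail_out() if the timestamp arrays cannot be allocated or if
 * `kernel` is not a known kernel_t value.
 */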
|
||||||
|
kern_result kernel_dispatch(kernel_t kernel,
|
||||||
|
double* a, double* b, double* c,
|
||||||
|
size_t size, size_t runs);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief A simple 1/16 operational intensity kernel
|
* @brief A simple 1/16 operational intensity kernel
|
||||||
* @param a An array with double values of size param size
|
* @param a An array with double values of size param size
|
||||||
* @param b An array with double values of size param size
|
|
||||||
* @param c An array with double values of size param size
|
|
||||||
* @param size Size of the three param arrays
|
* @param size Number of elements in the array a
|
||||||
|
* @param result Pointer to result storage
|
||||||
*
|
*
|
||||||
* === Warning ===
|
* === Warning ===
|
||||||
* Don't use with -O0: Stores everything on stack
|
* Don't use with -O0: Stores everything on stack
|
||||||
|
@ -26,8 +52,7 @@
|
||||||
* Nothing special
|
* Nothing special
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
void kernel_1_16_simple(double* a, double* b, double* c, size_t size);
|
void kernel_1_16_simple(double* a, size_t size);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -36,6 +61,7 @@ void kernel_1_16_simple(double* a, double* b, double* c, size_t size);
|
||||||
* @param b An array with double values of size param size
|
* @param b An array with double values of size param size
|
||||||
* @param c An array with double values of size param size
|
* @param c An array with double values of size param size
|
||||||
* @param size Size of the three param arrays
|
* @param size Size of the three param arrays
|
||||||
|
* @param result Pointer to result storage
|
||||||
*
|
*
|
||||||
* === Warning ===
|
* === Warning ===
|
||||||
* This is dangerous if FMA is not used/can't be used. Then there
|
* This is dangerous if FMA is not used/can't be used. Then there
|
||||||
|
@ -67,9 +93,8 @@ void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size);
|
||||||
/**
|
/**
|
||||||
* @brief A simple 8/1 operational intensity kernel
|
* @brief A simple 8/1 operational intensity kernel
|
||||||
* @param a An array with double values of size param size
|
* @param a An array with double values of size param size
|
||||||
* @param b An array with double values of size param size
|
|
||||||
* @param c An array with double values of size param size
|
|
||||||
* @param size Size of the three param arrays
|
* @param size Number of elements in the array a
|
||||||
|
* @param result Pointer to result storage
|
||||||
*
|
*
|
||||||
* === Warning ===
|
* === Warning ===
|
||||||
* Don't use with -O0: Stores everything on stack
|
* Don't use with -O0: Stores everything on stack
|
||||||
|
@ -95,14 +120,13 @@ void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size);
|
||||||
* === Optimization ===
|
* === Optimization ===
|
||||||
* Nothing special
|
* Nothing special
|
||||||
*/
|
*/
|
||||||
void kernel_8_1_simple(double* a, double* b, double* c, size_t size);
|
void kernel_8_1_simple(double* a, size_t size);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief A 8/1 operational intensity kernel utilizing FMA
|
* @brief A 8/1 operational intensity kernel utilizing FMA
|
||||||
* @param a An array with double values of size param size
|
* @param a An array with double values of size param size
|
||||||
* @param b An array with double values of size param size
|
|
||||||
* @param c An array with double values of size param size
|
|
||||||
* @param size Size of the three param arrays
|
* @param size Number of elements in the array a
|
||||||
|
* @param result Pointer to result storage
|
||||||
*
|
*
|
||||||
* === Warning ===
|
* === Warning ===
|
||||||
* This is dangerous if FMA is not used/can't be used. Then there
|
* This is dangerous if FMA is not used/can't be used. Then there
|
||||||
|
@ -126,7 +150,7 @@ void kernel_8_1_simple(double* a, double* b, double* c, size_t size);
|
||||||
* For packed doubles compile with -Ofast
|
* For packed doubles compile with -Ofast
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size);
|
void kernel_8_1_fuseaware(double* a, size_t size);
|
||||||
|
|
||||||
|
|
||||||
/********************************************
|
/********************************************
|
||||||
|
@ -139,7 +163,6 @@ void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size);
|
||||||
* @brief A 1/16 operational intensity which might compile to a flawed oi kernel
|
* @brief A 1/16 operational intensity which might compile to a flawed oi kernel
|
||||||
* @param a An array with double values of size param size
|
* @param a An array with double values of size param size
|
||||||
* @param b An array with double values of size param size
|
* @param b An array with double values of size param size
|
||||||
* @param c An array with double values of size param size
|
|
||||||
* @param size Size of the three param arrays
|
* @param size Size of the two param arrays
|
||||||
*
|
*
|
||||||
* === Problem ===
|
* === Problem ===
|
||||||
|
@ -158,25 +181,21 @@ void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size);
|
||||||
* Without volatile (-O3):
|
* Without volatile (-O3):
|
||||||
* repz ret # that's it
|
* repz ret # that's it
|
||||||
*/
|
*/
|
||||||
void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size);
|
void kernel_1_16_simple_dangerous(double* a, double* b, size_t size);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief A 8/1 operational intensity which might compile to a flawed oi kernel
|
* @brief A 8/1 operational intensity which might compile to a flawed oi kernel
|
||||||
* @param a An array with double values of size param size
|
* @param a An array with double values of size param size
|
||||||
* @param b An array with double values of size param size
|
|
||||||
* @param c An array with double values of size param size
|
|
||||||
* @param size Size of the three param arrays
|
* @param size Number of elements in the array a
|
||||||
*
|
*
|
||||||
* === Problem ==
|
* === Problem ==
|
||||||
* Same as for kernel_1_16_simple_dangerous
|
* Same as for kernel_1_16_simple_dangerous
|
||||||
*/
|
*/
|
||||||
void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size);
|
void kernel_8_1_simple_dangerous(double* a, size_t size);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief A 1/8 operational intensity which might compile to a flawed oi kernel
|
* @brief A 1/8 operational intensity which might compile to a flawed oi kernel
|
||||||
* @param a An array with double values of size param size
|
* @param a An array with double values of size param size
|
||||||
* @param b An array with double values of size param size
|
|
||||||
* @param c An array with double values of size param size
|
|
||||||
* @param size Size of the three param arrays
|
* @param size Number of elements in the array a
|
||||||
*
|
*
|
||||||
* === Problem ==
|
* === Problem ==
|
||||||
|
@ -188,7 +207,7 @@ void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size);
|
||||||
* how large the array is and how the cpu work internally
|
* how large the array is and how the cpu work internally
|
||||||
* -> unpredictable.
|
* -> unpredictable.
|
||||||
*/
|
*/
|
||||||
void kernel_1_8_vo_dangerous(double* a, double* b, double* c, size_t size);
|
void kernel_1_8_vo_dangerous(double* a, size_t size);
|
||||||
|
|
||||||
|
|
||||||
/****************************************
|
/****************************************
|
||||||
|
|
BIN
roofline/src/roofline
Executable file
BIN
roofline/src/roofline
Executable file
Binary file not shown.
|
@ -64,6 +64,11 @@ static double pin_time(void);
|
||||||
*/
|
*/
|
||||||
static void testkern(double* a, double* b, double* c, size_t size);
|
static void testkern(double* a, double* b, double* c, size_t size);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief pretty prints a kern_result
|
||||||
|
*/
|
||||||
|
static void print_kernresult(kern_result* result);
|
||||||
|
|
||||||
int main(int argc, char* argv[]) {
|
int main(int argc, char* argv[]) {
|
||||||
prog_name = argv[0];
|
prog_name = argv[0];
|
||||||
|
|
||||||
|
@ -130,34 +135,12 @@ int main(int argc, char* argv[]) {
|
||||||
t = pin_time() - t;
|
t = pin_time() - t;
|
||||||
printf("Machine heating took %.4f microseconds = %.4f seconds (with test OI kernel)\n", (t*1.0E6), t);
|
printf("Machine heating took %.4f microseconds = %.4f seconds (with test OI kernel)\n", (t*1.0E6), t);
|
||||||
|
|
||||||
|
kern_result simple16 = kernel_dispatch(SIMPLE_1_16, a, b, c, size, runs);
|
||||||
|
kern_result fma16 = kernel_dispatch(FMA_1_16, a, b, c, size, runs);
|
||||||
|
kern_result simple8 = kernel_dispatch(SIMPLE_8_1, a, b, c, size, runs);
|
||||||
|
kern_result fma8 = kernel_dispatch(FMA_8_1, a, b, c, size, runs);
|
||||||
|
|
||||||
/*
|
print_kernresult(&simple16);
|
||||||
TESTS!!
|
|
||||||
|
|
||||||
*/
|
|
||||||
printf("1/16 simple\n");
|
|
||||||
t = pin_time();
|
|
||||||
kernel_1_16_simple(a,b,c, size);
|
|
||||||
t = pin_time() - t;
|
|
||||||
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
|
|
||||||
|
|
||||||
printf("1/16 fuseaware\n");
|
|
||||||
t = pin_time();
|
|
||||||
kernel_1_16_fuseaware(a,b,c, size);
|
|
||||||
t = pin_time() - t;
|
|
||||||
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
|
|
||||||
|
|
||||||
printf("8 simple\n");
|
|
||||||
t = pin_time();
|
|
||||||
kernel_8_1_simple(a,b,c, size);
|
|
||||||
t = pin_time() - t;
|
|
||||||
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
|
|
||||||
|
|
||||||
printf("8 fuseaware\n");
|
|
||||||
t = pin_time();
|
|
||||||
kernel_8_1_fuseaware(a,b,c, size);
|
|
||||||
t = pin_time() - t;
|
|
||||||
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
|
|
||||||
|
|
||||||
exit(EXIT_SUCCESS);
|
exit(EXIT_SUCCESS);
|
||||||
}
|
}
|
||||||
|
@ -249,3 +232,7 @@ static void bail_out(char* fmt, ...)
|
||||||
|
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Pretty prints a kern_result: the number of runs, the stored flops value
 * and, for every run, the elapsed time ends[r] - starts[r].
 *
 * The elapsed time is printed in microseconds and seconds, using the same
 * conversion main() applies to pin_time() differences.
 *
 * @param result the record to print; a NULL pointer prints a placeholder
 */
static void print_kernresult(kern_result* result){
    if(result == NULL)
    {
        printf("(no kernel result)\n");
        return;
    }

    printf("%zu runs, %d flops\n", result->runs, result->flops);

    /* without both timestamp arrays there is nothing to list per run */
    if(result->starts == NULL || result->ends == NULL)
    {
        return;
    }

    for(size_t r=0; r<result->runs; r++)
    {
        double elapsed = result->ends[r] - result->starts[r];
        printf("  run %zu took %.4f microseconds = %.4f seconds\n", r, (elapsed*1.0E6), elapsed);
    }
}
|
||||||
|
|
BIN
roofline/src/roofline_fma
Executable file
BIN
roofline/src/roofline_fma
Executable file
Binary file not shown.
BIN
roofline/src/roofline_fma_fast_fastmath_o3
Executable file
BIN
roofline/src/roofline_fma_fast_fastmath_o3
Executable file
Binary file not shown.
BIN
roofline/src/roofline_fma_fast_o3
Executable file
BIN
roofline/src/roofline_fma_fast_o3
Executable file
Binary file not shown.
BIN
roofline/src/roofline_fma_o3
Executable file
BIN
roofline/src/roofline_fma_o3
Executable file
Binary file not shown.
BIN
roofline/src/roofline_o3
Executable file
BIN
roofline/src/roofline_o3
Executable file
Binary file not shown.
Loading…
Reference in a new issue