major refactoring
This commit is contained in:
parent
ba7a732d31
commit
31bfead054
9 changed files with 163 additions and 89 deletions
|
@ -20,45 +20,115 @@ static void bail_out(char* fmt, ...);
|
|||
*/
|
||||
static double pin_time(void);
|
||||
|
||||
void kernel_1_16_simple(double* a, double* b, double* c, size_t size)
|
||||
kern_result kernel_dispatch(kernel_t kernel,
|
||||
double* a, double* b, double* c,
|
||||
size_t size, size_t runs)
|
||||
{
|
||||
|
||||
kern_result result;
|
||||
result.runs = runs;
|
||||
result.starts = malloc(sizeof(double)*(runs));
|
||||
result.ends = malloc(sizeof(double)*(runs));
|
||||
|
||||
if(result.starts==NULL || result.ends==NULL)
|
||||
{
|
||||
bail_out("One of the mallocs failed\n. starts = %p, ends=%p", result.starts, result.ends);
|
||||
}
|
||||
|
||||
|
||||
switch(kernel)
|
||||
{
|
||||
|
||||
case SIMPLE_1_16:
|
||||
result.flops = 1;
|
||||
for(size_t r=0; r<runs; r++)
|
||||
{
|
||||
result.starts[r] = pin_time();
|
||||
kernel_1_16_simple(a, size);
|
||||
result.ends[r] = pin_time();
|
||||
}
|
||||
break;
|
||||
case FMA_1_16:
|
||||
result.flops = 2;
|
||||
for(size_t r=0; r<runs; r++)
|
||||
{
|
||||
result.starts[r] = pin_time();
|
||||
kernel_1_16_fuseaware(a, b, c, size);
|
||||
result.ends[r] = pin_time();
|
||||
}
|
||||
break;
|
||||
case SIMPLE_8_1:
|
||||
result.flops = 128;
|
||||
for(size_t r=0; r<runs; r++)
|
||||
{
|
||||
result.starts[r] = pin_time();
|
||||
kernel_8_1_simple(a, size);
|
||||
result.ends[r] = pin_time();
|
||||
}
|
||||
break;
|
||||
case FMA_8_1:
|
||||
result.flops = 128;
|
||||
for(size_t r=0; r<runs; r++)
|
||||
{
|
||||
result.starts[r] = pin_time();
|
||||
kernel_8_1_fuseaware(a, size);
|
||||
result.ends[r] = pin_time();
|
||||
}
|
||||
break;
|
||||
default:
|
||||
bail_out("No such kernel %s", kernel);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
void kernel_1_16_simple(double* a, size_t size)
|
||||
{
|
||||
double t = pin_time();
|
||||
#pragma omp parallel for
|
||||
for(size_t i=0; i<size; i++){
|
||||
a[i] = a[i] * a[i];
|
||||
}
|
||||
for(size_t i=0; i<size; i++)
|
||||
{
|
||||
a[i] = a[i] * a[i];
|
||||
}
|
||||
}
|
||||
|
||||
void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for(size_t i=0; i<size; i++){
|
||||
a[i] = a[i] * b[i] + c[i];
|
||||
}
|
||||
for(size_t i=0; i<size; i++)
|
||||
{
|
||||
a[i] = a[i] * b[i] + c[i];
|
||||
}
|
||||
}
|
||||
|
||||
void kernel_8_1_simple(double* a, double* b, double* c, size_t size)
|
||||
void kernel_8_1_simple(double* a, size_t size)
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for(size_t i=0; i<size; i++){
|
||||
a[i] = REP100(a[i]*)
|
||||
REP20(a[i]*)
|
||||
REP8(a[i]*)
|
||||
REP1(a[i]);
|
||||
for(size_t i=0; i<size; i++)
|
||||
{
|
||||
a[i] = REP100(a[i]*)
|
||||
REP20(a[i]*)
|
||||
REP8(a[i]*)
|
||||
REP1(a[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size)
|
||||
void kernel_8_1_fuseaware(double* a, size_t size)
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for(size_t i=0; i<size; i++){
|
||||
REP60(a[i] = a[i] * a[i] + a[i];)
|
||||
REP4(a[i] = a[i] * a[i] + a[i];)
|
||||
}
|
||||
for(size_t i=0; i<size; i++)
|
||||
{
|
||||
REP60(a[i] = a[i] * a[i] + a[i];)
|
||||
REP4(a[i] = a[i] * a[i] + a[i];)
|
||||
}
|
||||
}
|
||||
|
||||
/********************************************
|
||||
* Kernels which potentially compile to *
|
||||
* different operational intensities than *
|
||||
* specified *
|
||||
********************************************/
|
||||
|
||||
void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size)
|
||||
void kernel_1_16_simple_dangerous(double* a, double* b, size_t size)
|
||||
{
|
||||
register volatile double tmp = 0.1;
|
||||
|
||||
|
@ -68,29 +138,27 @@ void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size)
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size)
|
||||
void kernel_8_1_simple_dangerous(double* a, size_t size)
|
||||
{
|
||||
register volatile double tmp = 0.1;
|
||||
|
||||
#pragma omp parallel for
|
||||
for(size_t i=0; i<size; i++){
|
||||
tmp = a[i] * a[i] * a[i] * a[i] *
|
||||
a[i] * a[i] * a[i] * a[i];
|
||||
}
|
||||
for(size_t i=0; i<size; i++)
|
||||
{
|
||||
tmp = a[i] * a[i] * a[i] * a[i] *
|
||||
a[i] * a[i] * a[i] * a[i];
|
||||
}
|
||||
}
|
||||
|
||||
void kernel_1_8_vo_dangerous(double* a, double* b, double* c, size_t size)
|
||||
void kernel_1_8_vo_dangerous(double* a, size_t size)
|
||||
{
|
||||
/* This is the 1/8 AI kernel from the lecture
|
||||
|
||||
*/
|
||||
|
||||
register volatile double tmp=0.0;
|
||||
|
||||
for(size_t i=0; i<size; i++) {
|
||||
tmp = a[i] * a[i];
|
||||
}
|
||||
|
||||
#pragma omp parallel for
|
||||
for(size_t i=0; i<size; i++)
|
||||
{
|
||||
tmp = a[i] * a[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -1,12 +1,38 @@
|
|||
#ifndef AIKERN_H
|
||||
#define AIKERN_H
|
||||
|
||||
typedef struct {
|
||||
size_t runs; // also # of start-/endtimes
|
||||
double* starts; // starttimes
|
||||
double* ends; // endtimes
|
||||
int flops; //flops per run
|
||||
} kern_result;
|
||||
|
||||
typedef enum {
|
||||
SIMPLE_1_16, FMA_1_16, SIMPLE_8_1, FMA_8_1
|
||||
} kernel_t;
|
||||
|
||||
/**
|
||||
* @brief main entry point. Dispatches the kernel calls
|
||||
* @param kernel the kernel to run
|
||||
* @param a An array with double values of size param size
|
||||
* @param b An array with double values of size param size
|
||||
* @param c An array with double values of size param size
|
||||
* @param size The size of the arrays
|
||||
* @param runs How often the kernel should be executed
|
||||
* @return kern_result containing information about the kernel execution
|
||||
*
|
||||
*
|
||||
*/
|
||||
kern_result kernel_dispatch(kernel_t kernel,
|
||||
double* a, double* b, double* c,
|
||||
size_t size, size_t runs);
|
||||
|
||||
/**
|
||||
* @brief A simple 1/16 operational intensity kernel
|
||||
* @param a An array with double values of size param size
|
||||
* @param b An array with double values of size param size
|
||||
* @param c An array with double values of size param size
|
||||
* @param size Size of the three param arrays
|
||||
* @param a An array with double values of size param size
|
||||
* @param size Size of the three param arrays
|
||||
* @param result Pointer to result storage
|
||||
*
|
||||
* === Warning ===
|
||||
* Don't use with -O0: Stores everything on stack
|
||||
|
@ -26,16 +52,16 @@
|
|||
* Nothing special
|
||||
*
|
||||
*/
|
||||
void kernel_1_16_simple(double* a, double* b, double* c, size_t size);
|
||||
|
||||
void kernel_1_16_simple(double* a, size_t size);
|
||||
|
||||
|
||||
/**
|
||||
* @brief A 1/16 operational intensity kernel utilizing FMA
|
||||
* @param a An array with double values of size param size
|
||||
* @param b An array with double values of size param size
|
||||
* @param c An array with double values of size param size
|
||||
* @param size Size of the three param arrays
|
||||
* @param a An array with double values of size param size
|
||||
* @param b An array with double values of size param size
|
||||
* @param c An array with double values of size param size
|
||||
* @param size Size of the three param arrays
|
||||
* @param result Pointer to result storage
|
||||
*
|
||||
* === Warning ===
|
||||
* This is dangerous if FMA is not used/can't be used. Then there
|
||||
|
@ -66,10 +92,9 @@ void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size);
|
|||
|
||||
/**
|
||||
* @brief A simple 8/1 operational intensity kernel
|
||||
* @param a An array with double values of size param size
|
||||
* @param b An array with double values of size param size
|
||||
* @param c An array with double values of size param size
|
||||
* @param size Size of the three param arrays
|
||||
* @param a An array with double values of size param size
|
||||
* @param size Size of the three param arrays
|
||||
* @param result Pointer to result storage
|
||||
*
|
||||
* === Warning ===
|
||||
* Don't use with -O0: Stores everything on stack
|
||||
|
@ -95,14 +120,13 @@ void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size);
|
|||
* === Optimization ===
|
||||
* Nothing special
|
||||
*/
|
||||
void kernel_8_1_simple(double* a, double* b, double* c, size_t size);
|
||||
void kernel_8_1_simple(double* a, size_t size);
|
||||
|
||||
/**
|
||||
* @brief A 8/1 operational intensity kernel utilizing FMA
|
||||
* @param a An array with double values of size param size
|
||||
* @param b An array with double values of size param size
|
||||
* @param c An array with double values of size param size
|
||||
* @param size Size of the three param arrays
|
||||
* @param a An array with double values of size param size
|
||||
* @param size Size of the three param arrays
|
||||
* @param result Pointer to result storage
|
||||
*
|
||||
* === Warning ===
|
||||
* This is dangerous if FMA is not used/can't be used. Then there
|
||||
|
@ -126,7 +150,7 @@ void kernel_8_1_simple(double* a, double* b, double* c, size_t size);
|
|||
* For packed doubles compile with -Ofast
|
||||
*
|
||||
*/
|
||||
void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size);
|
||||
void kernel_8_1_fuseaware(double* a, size_t size);
|
||||
|
||||
|
||||
/********************************************
|
||||
|
@ -139,7 +163,6 @@ void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size);
|
|||
* @brief A 1/16 operational intensity which might compile to a flawed oi kernel
|
||||
* @param a An array with double values of size param size
|
||||
* @param b An array with double values of size param size
|
||||
* @param c An array with double values of size param size
|
||||
* @param size Size of the three param arrays
|
||||
*
|
||||
* === Problem ===
|
||||
|
@ -158,25 +181,21 @@ void kernel_8_1_fuseaware(double* a, double* b, double* c, size_t size);
|
|||
* Without volatile (-O3):
|
||||
* repz ret # that's it
|
||||
*/
|
||||
void kernel_1_16_simple_dangerous(double* a, double* b, double* c, size_t size);
|
||||
void kernel_1_16_simple_dangerous(double* a, double* b, size_t size);
|
||||
|
||||
/**
|
||||
* @brief A 8/1 operational intensity which might compile to a flawed oi kernel
|
||||
* @param a An array with double values of size param size
|
||||
* @param b An array with double values of size param size
|
||||
* @param c An array with double values of size param size
|
||||
* @param size Size of the three param arrays
|
||||
*
|
||||
* === Problem ==
|
||||
* Same as for kernel_1_16_simple_dangerous
|
||||
*/
|
||||
void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size);
|
||||
void kernel_8_1_simple_dangerous(double* a, size_t size);
|
||||
|
||||
/**
|
||||
* @brief A 1/8 operational intensity which might compile to a flawed oi kernel
|
||||
* @param a An array with double values of size param size
|
||||
* @param b An array with double values of size param size
|
||||
* @param c An array with double values of size param size
|
||||
* @param size Size of the three param arrays
|
||||
*
|
||||
* === Problem ==
|
||||
|
@ -188,7 +207,7 @@ void kernel_8_1_simple_dangerous(double* a, double* b, double* c, size_t size);
|
|||
* how large the array is and how the cpu work internally
|
||||
* -> unpredictable.
|
||||
*/
|
||||
void kernel_1_8_vo_dangerous(double* a, double* b, double* c, size_t size);
|
||||
void kernel_1_8_vo_dangerous(double* a, size_t size);
|
||||
|
||||
|
||||
/****************************************
|
||||
|
|
BIN
roofline/src/roofline
Executable file
BIN
roofline/src/roofline
Executable file
Binary file not shown.
|
@ -64,6 +64,11 @@ static double pin_time(void);
|
|||
*/
|
||||
static void testkern(double* a, double* b, double* c, size_t size);
|
||||
|
||||
/**
|
||||
* @brief pretty prints a kern_result
|
||||
*/
|
||||
static void print_kernresult(kern_result* result);
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
prog_name = argv[0];
|
||||
|
||||
|
@ -130,34 +135,12 @@ int main(int argc, char* argv[]) {
|
|||
t = pin_time() - t;
|
||||
printf("Machine heating took %.4f microseconds = %.4f seconds (with test OI kernel)\n", (t*1.0E6), t);
|
||||
|
||||
|
||||
/*
|
||||
TESTS!!
|
||||
kern_result simple16 = kernel_dispatch(SIMPLE_1_16, a, b, c, size, runs);
|
||||
kern_result fma16 = kernel_dispatch(FMA_1_16, a, b, c, size, runs);
|
||||
kern_result simple8 = kernel_dispatch(SIMPLE_8_1, a, b, c, size, runs);
|
||||
kern_result fma8 = kernel_dispatch(FMA_8_1, a, b, c, size, runs);
|
||||
|
||||
*/
|
||||
printf("1/16 simple\n");
|
||||
t = pin_time();
|
||||
kernel_1_16_simple(a,b,c, size);
|
||||
t = pin_time() - t;
|
||||
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
|
||||
|
||||
printf("1/16 fuseaware\n");
|
||||
t = pin_time();
|
||||
kernel_1_16_fuseaware(a,b,c, size);
|
||||
t = pin_time() - t;
|
||||
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
|
||||
|
||||
printf("8 simple\n");
|
||||
t = pin_time();
|
||||
kernel_8_1_simple(a,b,c, size);
|
||||
t = pin_time() - t;
|
||||
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
|
||||
|
||||
printf("8 fuseaware\n");
|
||||
t = pin_time();
|
||||
kernel_8_1_fuseaware(a,b,c, size);
|
||||
t = pin_time() - t;
|
||||
printf("Cache warming took %.4f microseconds = %.4f seconds (with test AI of 1/16 FLOPs/Byte)\n", (t*1.0E6), t);
|
||||
print_kernresult(&simple16);
|
||||
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
@ -249,3 +232,7 @@ static void bail_out(char* fmt, ...)
|
|||
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
static void print_kernresult(kern_result* result){
|
||||
return;
|
||||
}
|
||||
|
|
BIN
roofline/src/roofline_fma
Executable file
BIN
roofline/src/roofline_fma
Executable file
Binary file not shown.
BIN
roofline/src/roofline_fma_fast_fastmath_o3
Executable file
BIN
roofline/src/roofline_fma_fast_fastmath_o3
Executable file
Binary file not shown.
BIN
roofline/src/roofline_fma_fast_o3
Executable file
BIN
roofline/src/roofline_fma_fast_o3
Executable file
Binary file not shown.
BIN
roofline/src/roofline_fma_o3
Executable file
BIN
roofline/src/roofline_fma_o3
Executable file
Binary file not shown.
BIN
roofline/src/roofline_o3
Executable file
BIN
roofline/src/roofline_o3
Executable file
Binary file not shown.
Loading…
Reference in a new issue