i5-roofline/aikern.c

289 lines
5.6 KiB
C
Raw Normal View History

2020-09-03 16:49:50 +00:00
# include <stdlib.h>
# include <stdio.h>
# include <unistd.h>
# include <stdarg.h>
# include <errno.h>
# include <string.h>
# include <sys/time.h>
# include "aikern.h"
/* === Macros === */
/*
 * DEBUG(...): printf-style trace macro.
 * When ENDEBUG is defined, the arguments are forwarded to fprintf(stderr, ...)
 * and a newline is printed afterwards. Otherwise the macro expands to nothing,
 * so its arguments are not evaluated.
 */
#ifdef ENDEBUG
#define DEBUG(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0)
#else
#define DEBUG(...)
#endif
/**
* @brief print an error message to stderr and terminate the program
* @details The formatted message is prefixed with the program name. If errno
*          is non-zero, the matching strerror() text is printed as well.
*          The process then exits with EXIT_FAILURE; this function does not
*          return.
* @param fmt printf-style format string for the message, or NULL for none
* @param ... arguments consumed by fmt
*/
static void bail_out(char* fmt, ...);
/**
* @brief current time of day as reported by gettimeofday()
* @return seconds as a double; the microsecond field is added as the
*         fractional part
*/
static double pin_time(void);
/**
 * @brief Run the selected kernel `runs` times and record a start and an end
 *        timestamp for every run.
 *
 * The returned result carries the run count, the problem size, the per-element
 * flop count and the display name of the kernel, plus two arrays of `runs`
 * timestamps each (starts / ends, taken with pin_time()). The arrays are
 * allocated here and never released by this function, so releasing them is
 * presumably the caller's job.
 *
 * The program is terminated through bail_out() when an allocation fails, when
 * the kernel id is unknown, or when the manually packed kernel is requested
 * with a size that is not a multiple of 4.
 *
 * @param kernel kernel selector
 * @param a      first data array, handed to every kernel
 * @param b      second data array, only handed to the FMA 1/16 kernel
 * @param c      third data array, only handed to the FMA 1/16 kernel
 * @param size   number of elements handed to the kernel
 * @param runs   number of timed repetitions
 * @return filled-in kern_result
 */
kern_result kernel_dispatch(kernel_t kernel,
                            double* a, double* b, double* c,
                            size_t size, size_t runs)
{
    kern_result result = {0};

    result.runs = runs;
    result.size = size;
    /* calloc checks runs * sizeof(double) for overflow and zero-fills the
     * arrays, so a path that records no timings never exposes indeterminate
     * values to the caller. */
    result.starts = calloc(runs, sizeof(double));
    result.ends = calloc(runs, sizeof(double));
    if (result.starts == NULL || result.ends == NULL)
    {
        /* %p expects void pointers. */
        bail_out("One of the mallocs failed. starts = %p, ends=%p",
                 (void *)result.starts, (void *)result.ends);
    }

    switch (kernel)
    {
    case SIMPLE_1_16:
        result.flops = 1;
        result.kern_name = "Simple 1/16";
        for (size_t r = 0; r < runs; r++)
        {
            result.starts[r] = pin_time();
            kernel_1_16_simple(a, size);
            result.ends[r] = pin_time();
        }
        break;

    case FMA_1_16:
        result.flops = 2;
        result.kern_name = "FMA aware 1/16";
        for (size_t r = 0; r < runs; r++)
        {
            result.starts[r] = pin_time();
            kernel_1_16_fuseaware(a, b, c, size);
            result.ends[r] = pin_time();
        }
        break;

    case SIMPLE_8_1:
        result.flops = 128;
        result.kern_name = "Simple 8";
        for (size_t r = 0; r < runs; r++)
        {
            result.starts[r] = pin_time();
            kernel_8_1_simple(a, size);
            result.ends[r] = pin_time();
        }
        break;

    case FMA_8_1:
        result.flops = 128;
        result.kern_name = "FMA aware 8";
        for (size_t r = 0; r < runs; r++)
        {
            result.starts[r] = pin_time();
            kernel_8_1_fuseaware(a, size);
            result.ends[r] = pin_time();
        }
        break;

    case SIMPLE_8_1_FASTMATH:
        /* The trace label used to be a copy of the MANPACK one. */
        DEBUG("AIKERN FASTMATH");
        result.flops = 128;
        result.kern_name = "Simple 8 undermining fastmath";
        for (size_t r = 0; r < runs; r++)
        {
            result.starts[r] = pin_time();
            kernel_8_1_simple_fastmath(a, size);
            result.ends[r] = pin_time();
        }
        break;

    case FMA_8_1_MANPACK:
        DEBUG("AIKERN MANPACK");
        /* Without INTRINS this case does no work: the result keeps
         * flops == 0, kern_name == NULL and zeroed timestamps. */
#ifdef INTRINS
        if (size % 4 != 0)
        {
            bail_out("Must use multiple of 4 size for manpack");
        }
        result.flops = 128;
        result.kern_name = "FMA aware 8 with manual packing";
        for (size_t r = 0; r < runs; r++)
        {
            DEBUG("running manpack run %zu", r);
            result.starts[r] = pin_time();
            kernel_8_1_fuseaware_manpack(a, size);
            result.ends[r] = pin_time();
        }
#endif
        break;

    default:
        /* The selector is an integer value, not a string: print it with %d. */
        bail_out("No such kernel %d", (int)kernel);
        break;
    }

    return result;
}
/**
 * @brief Square every element of a in place: one multiplication per element.
 *
 * Deliberately not declared `inline`: since C99 a plain `inline` definition
 * does not by itself provide an external definition, so any call the compiler
 * chooses not to inline could fail to link.
 *
 * @param a    array updated in place
 * @param size number of elements in a
 */
void kernel_1_16_simple(double* a, size_t size)
{
    #pragma omp parallel for
    for (size_t i = 0; i < size; i++)
    {
        a[i] = a[i] * a[i];
    }
}
/**
 * @brief Compute a[i] = a[i] * b[i] + c[i] for every element: one multiply
 *        and one add per element.
 *
 * Deliberately not declared `inline`: since C99 a plain `inline` definition
 * does not by itself provide an external definition, so any call the compiler
 * chooses not to inline could fail to link.
 *
 * @param a    array updated in place; also the first factor
 * @param b    second factor, read only
 * @param c    addend, read only
 * @param size number of elements in each array
 */
void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size)
{
    #pragma omp parallel for
    for (size_t i = 0; i < size; i++)
    {
        a[i] = a[i] * b[i] + c[i];
    }
}
/**
 * @brief Replace every a[i] with a long product of a[i] with itself.
 *
 * The product is assembled by the REP100/REP20/REP8/REP1 macros, which are
 * defined outside this file section. It is presumably a chain of 128
 * multiplications, matching the flop count kernel_dispatch reports for this
 * kernel; confirm against the macro definitions.
 *
 * Deliberately not declared `inline`: since C99 a plain `inline` definition
 * does not by itself provide an external definition, so any call the compiler
 * chooses not to inline could fail to link.
 *
 * @param a    array updated in place
 * @param size number of elements in a
 */
void kernel_8_1_simple(double* a, size_t size)
{
    #pragma omp parallel for
    for (size_t i = 0; i < size; i++)
    {
        a[i] = REP100(a[i]*)
               REP20(a[i]*)
               REP8(a[i]*)
               REP1(a[i]);
    }
}
/**
 * @brief Repeatedly square every a[i] in place, one assignment per step.
 *
 * The squaring statement is replicated by the REP100/REP20/REP8 macros, which
 * are defined outside this file section; presumably 128 times in total —
 * confirm against the macro definitions. Each step consumes the result of the
 * previous one.
 *
 * Deliberately not declared `inline`: since C99 a plain `inline` definition
 * does not by itself provide an external definition, so any call the compiler
 * chooses not to inline could fail to link.
 *
 * @param a    array updated in place
 * @param size number of elements in a
 */
void kernel_8_1_simple_fastmath(double* a, size_t size)
{
    #pragma omp parallel for
    for (size_t i = 0; i < size; i++)
    {
        REP100(a[i]=a[i]*a[i];);
        REP20(a[i]=a[i]*a[i];);
        REP8(a[i]=a[i]*a[i];);
    }
}
/**
 * @brief Repeatedly apply a[i] = a[i] * a[i] + a[i] to every element.
 *
 * The statement is replicated by the REP60 and REP4 macros, which are defined
 * outside this file section; presumably 64 multiply-add steps in total —
 * confirm against the macro definitions.
 *
 * Deliberately not declared `inline`: since C99 a plain `inline` definition
 * does not by itself provide an external definition, so any call the compiler
 * chooses not to inline could fail to link.
 *
 * @param a    array updated in place
 * @param size number of elements in a
 */
void kernel_8_1_fuseaware(double* a, size_t size)
{
    #pragma omp parallel for
    for (size_t i = 0; i < size; i++)
    {
        REP60(a[i] = a[i] * a[i] + a[i];)
        REP4(a[i] = a[i] * a[i] + a[i];)
    }
}
#ifdef INTRINS
#include <immintrin.h>
/**
 * @brief Variant of the multiply-add kernel that handles four doubles at a
 *        time in one 256-bit vector.
 *
 * Every group of four elements is loaded into a vector, run through the
 * packvec = packvec * packvec + packvec steps replicated by REP60 and REP4
 * (macros defined outside this file section), and stored back to the same
 * four positions. A trailing remainder of fewer than four elements is left
 * untouched.
 *
 * Fixes over the previous version:
 *  - the bound `size - 4` skipped the last group and wrapped around for
 *    size < 4, running the loop out of bounds;
 *  - _mm256_set_pd takes the highest lane first, so reading the lanes back
 *    by index wrote each group in reversed order;
 *  - the function is no longer a plain `inline` definition, which in C99 and
 *    later does not by itself provide an external definition.
 *
 * @param a    array updated in place
 * @param size number of elements in a
 */
void kernel_8_1_fuseaware_manpack(double* a, size_t size)
{
    /* Loop-invariant bound covering whole groups of four only. */
    const size_t limit = size - (size % 4);

    #pragma omp parallel for
    for (size_t i = 0; i < limit; i += 4)
    {
        /* Unaligned load/store: nothing here guarantees 32-byte alignment. */
        __m256d packvec = _mm256_loadu_pd(&a[i]);
        REP60(packvec = _mm256_fmadd_pd(packvec, packvec, packvec););
        REP4(packvec = _mm256_fmadd_pd(packvec, packvec, packvec););
        _mm256_storeu_pd(&a[i], packvec);
    }
}
#endif /* INTRINS */
/********************************************
* Kernels which potentially compile to *
* different operational intensities than *
* specified *
********************************************/
/**
* @brief Multiply a[i] by b[i] for every i below size and discard the result.
*
* Each product is assigned to a volatile local; neither array is written to.
* NOTE(review): tmp is declared outside the OpenMP loop, so it is presumably
* shared by all threads — confirm that this is intended.
*
* @param a first factor array, only read
* @param b second factor array, only read
* @param size number of elements processed
*/
void kernel_1_16_simple_dangerous(double* a, double* b, size_t size)
{
register volatile double tmp = 0.1;
#pragma omp parallel for
for(size_t i=0; i<size; i++){
/* the product only lands in tmp; nothing is stored back to memory */
tmp = a[i] * b[i];
}
}
/**
* @brief Multiply eight copies of a[i] together for every i below size and
*        discard the result.
*
* The expression contains seven multiplications per element. Each product is
* assigned to a volatile local; the array is not written to.
* NOTE(review): tmp is declared outside the OpenMP loop, so it is presumably
* shared by all threads — confirm that this is intended.
*
* @param a input array, only read
* @param size number of elements processed
*/
void kernel_8_1_simple_dangerous(double* a, size_t size)
{
register volatile double tmp = 0.1;
#pragma omp parallel for
for(size_t i=0; i<size; i++)
{
/* the product only lands in tmp; nothing is stored back to memory */
tmp = a[i] * a[i] * a[i] * a[i] *
a[i] * a[i] * a[i] * a[i];
}
}
/**
* @brief Square a[i] for every i below size and discard the result.
*
* Each square is assigned to a volatile local; the array is not written to.
* NOTE(review): tmp is declared outside the OpenMP loop, so it is presumably
* shared by all threads — confirm that this is intended.
*
* @param a input array, only read
* @param size number of elements processed
*/
void kernel_1_8_vo_dangerous(double* a, size_t size)
{
register volatile double tmp=0.0;
#pragma omp parallel for
for(size_t i=0; i<size; i++)
{
/* the square only lands in tmp; nothing is stored back to memory */
tmp = a[i] * a[i];
}
}
/* === Helper Functions === */
/**
 * @brief Take a timestamp with gettimeofday().
 *
 * Terminates the program through bail_out() when the clock cannot be read.
 *
 * @return seconds as a double, with the microsecond field added as the
 *         fractional part
 */
static double pin_time(void)
{
    struct timeval now;

    if (gettimeofday(&now, NULL) != 0)
    {
        bail_out("Time measurement impossible. gettimeofday error");
    }

    const double whole_seconds = (double) now.tv_sec;
    const double fraction = (double) now.tv_usec * 1.e-6;

    return whole_seconds + fraction;
}
/**
 * @brief Print an error message to stderr and terminate the program.
 *
 * The formatted message (truncated to the size of the local buffer) is
 * printed with the program name as prefix. If errno was non-zero when the
 * function was entered, the matching strerror() text is printed on a second
 * line. The process then exits with EXIT_FAILURE; this function does not
 * return.
 *
 * @param fmt printf-style format string for the message, or NULL for none
 * @param ... arguments consumed by fmt
 */
static void bail_out(char* fmt, ...)
{
    /* Capture errno first: the stdio calls below may overwrite it, which
     * would report the wrong error (or a spurious one). */
    const int saved_errno = errno;
    const char *prog_name = "aikern";

    if (fmt != NULL)
    {
        char msgbuf[150];
        va_list vl;

        va_start(vl, fmt);
        if (vsnprintf(msgbuf, sizeof(msgbuf), fmt, vl) < 0)
        {
            /* Formatting failed: make sure the buffer is an empty string. */
            msgbuf[0] = '\0';
        }
        va_end(vl);

        if (msgbuf[0] != '\0')
        {
            (void)fprintf(stderr, "%s: %s \n", prog_name, msgbuf);
        }
    }

    if (saved_errno != 0)
    {
        (void)fprintf(stderr, "%s: %s\n", prog_name, strerror(saved_errno));
    }

    exit(EXIT_FAILURE);
}