commit c61491e7138f3074d02abcc5984f4a4294c9fd67 Author: Armin Friedl Date: Thu Sep 3 18:49:50 2020 +0200 Init diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..d906396 --- /dev/null +++ b/Makefile @@ -0,0 +1,120 @@ +all: clean bin lib + +# Roofline Binary + +## This is the least demanding target, use it if nothing else works +nofancy: roofline roofline_o3 + mkdir bin + mv $^ bin + +## Your processor needs an FMA unit for this target to work +fmacap: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3 + mkdir bin + mv $^ bin + +## This will compile just everything +bin: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3 roofline_full roofline_profile roofline_full_clang roofline_full_manpack + mkdir bin + mv $^ bin + +roofline: roofline.c aikern.a + gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@ + +roofline_o3: roofline.c aikern_o3.a + gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@ + +roofline_fma: roofline.c aikern_fma.a + gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@ + +roofline_fma_o3: roofline.c aikern_fma_o3.a + gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@ + +roofline_fma_fast_o3: roofline.c aikern_fma_fast_o3.a + gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@ + +roofline_fma_fast_o2: roofline.c aikern_fma_fast_o2.a + gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@ + +roofline_fma_fast_fastmath_o3: roofline.c aikern_fma_fast_fastmath_o3.a + gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@ + +roofline_full: roofline.c aikern_full.a + gcc -Wall -Wextra -std=c99 -fopenmp -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -malign-double -march=native $^ -o $@ + +roofline_full_clang: roofline.c aikern_full_clang.a + clang -Wall -Wextra -std=c99 -fopenmp -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -march=native $^ -o $@ + +roofline_full_manpack: roofline.c aikern_full_manpack.a + gcc -Wall -Wextra -std=c99 -fopenmp -DINTRINS -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -malign-double -march=native $^ -o $@ + +roofline_profile: roofline.c aikern_profile.a + gcc -Wall -Wextra -std=c99 -fopenmp -O0 -g -pg $^ -o $@ + +# Static Libraries +lib: aikern.a aikern_o3.a aikern_fma.a aikern_fma_o3.a aikern_fma_fast_o2.a aikern_fma_fast_o3.a aikern_fma_fast_fastmath_o3.a aikern_full.a aikern_full_clang.a aikern_profile.a aikern_full_manpack.a + mkdir lib + mv $^ lib + +aikern.a: aikern.c aikern.h + gcc -Wall -Wextra -Wno-unused -fopenmp -c -o aikern.o $< + ar rcs aikern.a aikern.o + rm aikern.o + +aikern_o3.a: aikern.c aikern.h + gcc -Wall -Wextra -Wno-unused -O3 -fopenmp -c -o aikern_o3.o $< + ar rcs $@ aikern_o3.o + rm aikern_o3.o + +aikern_fma.a: aikern.c aikern.h + gcc -Wall -Wextra -Wno-unused -O2 -mavx -mfma -fopenmp -c -o aikern_fma.o $< + ar rcs $@ aikern_fma.o + rm aikern_fma.o + +aikern_fma_o3.a: aikern.c aikern.h + gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -c -o aikern_fma_o3.o $< + ar rcs $@ aikern_fma_o3.o + rm aikern_fma_o3.o + +aikern_fma_fast_o2.a: aikern.c aikern.h + gcc -Wall -Wextra -Wno-unused -O2 -mavx -mfma -fopenmp -Ofast -c -o aikern_fma_fast_o2.o $< + ar rcs $@ aikern_fma_fast_o2.o + rm aikern_fma_fast_o2.o + +aikern_fma_fast_o3.a: aikern.c aikern.h + gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -c -o aikern_fma_fast_o3.o $< + ar rcs $@ aikern_fma_fast_o3.o + rm aikern_fma_fast_o3.o + +aikern_fma_fast_fastmath_o3.a: aikern.c aikern.h + gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -c -o aikern_fma_fast_fastmath_o3.o $< + ar rcs $@ aikern_fma_fast_fastmath_o3.o + rm aikern_fma_fast_fastmath_o3.o + +aikern_full.a: aikern.c aikern.h + gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -malign-double -march=native -c -o aikern_full.o $< + ar rcs $@ aikern_full.o + rm aikern_full.o + +aikern_full_clang.a: aikern.c aikern.h + gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -march=native -c -o aikern_full_clang.o $< + ar rcs $@ aikern_full_clang.o + rm aikern_full_clang.o + +aikern_profile.a: aikern.c aikern.h + gcc -Wall -Wextra -Wno-unused -fopenmp -O0 -g -pg -c -o aikern_profile.o $< + ar rcs $@ aikern_profile.o + rm aikern_profile.o + +aikern_full_manpack.a: aikern.c aikern.h + gcc -Wall -Wextra -Wno-unused -DINTRINS -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -malign-double -march=native -c -o aikern_full_manpack.o $< + ar rcs $@ aikern_full_manpack.o + rm aikern_full_manpack.o + +# Cleanup +clean: + rm -f *.a + rm -f *.o + rm -f roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3 roofline_fma_fast_fastmath_aligned_o3 roofline_profile roofline_full_clang + rm -fR bin + rm -fR lib + diff --git a/aikern.c b/aikern.c new file mode 100644 index 0000000..5bd6b0e --- /dev/null +++ b/aikern.c @@ -0,0 +1,288 @@ +# include +# include +# include +# include +# include +# include +# include + +# include "aikern.h" + +/* === Macros === */ + +#ifdef ENDEBUG +#define DEBUG(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0) +#else +#define DEBUG(...) +#endif + +/** + * @brief terminate program on program error + * @param msg additional message to print + * @param ret exit value + */ +static void bail_out(char* fmt, ...); + +/** + * @brief microseconds since epoch + */ +static double pin_time(void); + +kern_result kernel_dispatch(kernel_t kernel, + double* a, double* b, double* c, + size_t size, size_t runs) +{ + + kern_result result = {0}; + result.runs = runs; + result.starts = malloc(sizeof(double)*(runs)); + result.ends = malloc(sizeof(double)*(runs)); + result.size = size; + + if(result.starts==NULL || result.ends==NULL) + { + bail_out("One of the mallocs failed\n. starts = %p, ends=%p", result.starts, result.ends); + } + + + switch(kernel) + { + + case SIMPLE_1_16: + result.flops = 1; + result.kern_name = "Simple 1/16"; + for(size_t r=0; r + +inline void kernel_8_1_fuseaware_manpack(double* a, size_t size) +{ + + #pragma omp parallel for + for(size_t i=0; i<(size-4); i+=4) + { + // pack doubles + __m256d packvec = _mm256_set_pd(a[i], a[i+1], a[i+2], a[i+3]); + + REP60(packvec = _mm256_fmadd_pd(packvec, packvec, packvec);); + REP4(packvec = _mm256_fmadd_pd(packvec, packvec, packvec);); + + a[i] = packvec[0]; + a[i+1] = packvec[1]; + a[i+2] = packvec[2]; + a[i+3] = packvec[3]; + } +} + +#endif /* INTRINS */ + + +/******************************************** + * Kernels which potentially compile to * + * different operational intensities than * + * specified * + ********************************************/ + +void kernel_1_16_simple_dangerous(double* a, double* b, size_t size) +{ + register volatile double tmp = 0.1; + + #pragma omp parallel for + for(size_t i=0; i 0) + (void)fprintf(stderr, "%s: %s \n", prog_name, msgbuf); + + } + + if(errno != 0) + (void)fprintf(stderr, "%s: %s\n", prog_name, strerror(errno)); + + exit(EXIT_FAILURE); +} diff --git a/aikern.h b/aikern.h new file mode 100644 index 0000000..c6ec2d2 --- /dev/null +++ b/aikern.h @@ -0,0 +1,288 @@ +#ifndef AIKERN_H +#define AIKERN_H + +typedef struct { + size_t runs; // also # of start-/endtimes + double* starts; // starttimes + double* ends; // endtimes + int flops; // flops per iteration + char* kern_name; + size_t size; // size of arrays handeld +} kern_result; + +typedef enum { + SIMPLE_1_16, FMA_1_16, SIMPLE_8_1, FMA_8_1, SIMPLE_8_1_FASTMATH, FMA_8_1_MANPACK +} kernel_t; + +/** + * @brief main entry point. Dispatches the kernel calls + * @param kernel the kernel to run + * @param a An array with double values of size param size + * @param b An array with double values of size param size + * @param c An array with double values of size param size + * @param size The size of the arrays + * @param runs How often the kernel should be executed + * @return kern_result containing information about the kernel execution + * + * + */ +kern_result kernel_dispatch(kernel_t kernel, + double* a, double* b, double* c, + size_t size, size_t runs); + +/** + * @brief A simple 1/16 operational intensity kernel + * @param a An array with double values of size param size + * @param size Size of the three param arrays + * @param result Pointer to result storage + * + * === Warning === + * Don't use with -O0: Stores everything on stack + * + * === Description === + * Uses a simple floating point operation: a[i] = a[i] * a[i]; + * + * Runs in a parallelized for loop. + * + * === Analysis === + * COMM: 1 read (8 byte), 1 write = 16 bytes + * COMP: 1 FLOP + * --------- + * OI: 1/16 + * + * === Optimization === + * Nothing special + * + */ +void kernel_1_16_simple(double* a, size_t size); + + +/** + * @brief A 1/16 operational intensity kernel utilizing FMA + * @param a An array with double values of size param size + * @param b An array with double values of size param size + * @param c An array with double values of size param size + * @param size Size of the three param arrays + * @param result Pointer to result storage + * + * === Warning === + * This is dangerous if FMA is not used/can't be used. Then there + * are intermediary writes (and reads) to the stack. + * + * === Description === + * Uses a triad function: a[i] = a[i] * b[i] + c[i]; in order + * to utilize the FMA unit. + * + * Runs in a parallelized for loop. + * + * === Analysis === + * With gcc -O2 -mavx -mfma FMA compiles to: + * vmovsd xmm0,QWORD PTR [rdi+rax*8] # 1 read (8 byte) + * vmovsd xmm1,QWORD PTR [rdx+rax*8] # 1 read + * vfmadd132sd xmm0,xmm1,QWORD PTR [rsi+rax*8] # 2 FLOPs + 1 read + * vmovsd QWORD PTR [rdi+rax*8],xmm0 # 1 write + * -------- + * 1/16 OI + * + * === Optimization === + * For packed doubles compile with -Ofast + * + */ +void kernel_1_16_fuseaware(double* a, double* b, double* c, size_t size); + + + +/** + * @brief A simple 8/1 operational intensity kernel + * @param a An array with double values of size param size + * @param size Size of the three param arrays + * @param result Pointer to result storage + * + * === Warning === + * Don't use with -O0: Stores everything on stack + * + * === Description === + * Uses a simple floating point operation: a[i] = a[i] * a[i] * ...* a[i]; + * + * Runs in a parallelized for loop. + * + * === Analysis === + * With AVX and -O2 (not necessarily FMA) best results (obviously correct + * easy to read disassembly). + * + * With gcc -O2 -mavx compiles to: + * vmovsd xmm1,QWORD PTR [rdi] # 1 read + * vmulsd xmm0,xmm1,xmm1 # 1 FLOP+register shuffling + * vmulsd xmm0,xmm0,xmm1 # 127x 1 FLOP+register shuffling + * # [...] + * vmovsd QWORD PTR [rdi-0x8],xmm0 # 1 write + * -------- + * 128/16 = 8/1 OI + * + * === Optimization === + * Nothing special + */ +void kernel_8_1_simple(double* a, size_t size); + +/** + * @brief A 8/1 operational intensity kernel utilizing FMA + * @param a An array with double values of size param size + * @param size Size of the three param arrays + * @param result Pointer to result storage + * + * === Warning === + * This is dangerous if FMA is not used/can't be used. Then there + * are intermediary writes (and reads) to the stack. + * + * === Description === + * Uses multiple triad function: a[i] = a[i] * a[i] + a[i]; in order + * to utilize the FMA unit. + * + * Runs in a parallelized for loop. + * + * === Analysis === + * With gcc -O2 -mavx -mfma FMA compiles to: + * vmovsd xmm0,QWORD PTR [rdi] # 1 read + * vfmadd132sd xmm0,xmm0,xmm0 # 64 x 2 FLOPs+register shuffling + * vmovsd QWORD PTR [rdi-0x8],xmm0 # 1 write + * -------- + * 128/16 = 8/1 OI + * + * === Optimization === + * For packed doubles compile with -Ofast + * + */ +void kernel_8_1_fuseaware(double* a, size_t size); + +/** + * @brief A simple 8/1 operational intensity kernel which + * undermines evil fastmath optimization + * @param a An array with double values of size param size + * @param size Size of the three param arrays + * @param result Pointer to result storage + * + * === Warning === + * Don't use with anything other than -Ofast / -ffast-math + * + * === Description === + * Uses a simple floating point operation that more closely resembles + * that of 8_1_fuseaware: + * a[i] = a[i]*a[i]; # 128x + * + * Runs in a parallelized for loop. + * + * === Analysis === + * -Ofast/-ffast-math does not preserve strict IEEE compliance. It + * therefore is allowed to ignore non-associativity of floating + * point operations. + * + * x = x*x*x*x*x*x*x*x; is optimized to x *= x;x *= x;x *= x; + * + * This cleary breaks the whole OI calculation of 8_1_simple. + * + * This kernel does not introduce more byte write-outs than + * 8_1_simple at a high optimization level since a[i] is held + * in a register and only written out once at the end of an + * iteration. + * + * + * === Optimization === + * Nothing special + */ +void kernel_8_1_simple_fastmath(double* a, size_t size); + + +/******************************************** + * Kernels which potentially compile to * + * different operational intensities than * + * specified * + ********************************************/ + +/** + * @brief A 1/16 operational intensity which might compile to a flawed oi kernel + * @param a An array with double values of size param size + * @param b An array with double values of size param size + * @param size Size of the three param arrays + * + * === Problem === + * As soon as volatile is used gcc uses the stack for tmp. + * Even if "register" is in place. Resulting in one additional write per loop. + * Omitting volatile results in optimizing away the whole loop + * (checked at -O2, which is necessary for FMA to eventually step in). + * Maybe the value stays in cache, maybe not. It does not live a register. + * + * Even with -O3: + * movsd xmm0,QWORD PTR [rdi+rax*8] # 1 read + * mulsd xmm0,QWORD PTR [rsi+rax*8] # 1 read (+ write to xmm0, not counted) + * # [...] # instructions for loop + * movsd QWORD PTR [rsp-0x8],xmm0 # malicious write + * + * Without volatile (-O3): + * repz ret # that's it + */ +void kernel_1_16_simple_dangerous(double* a, double* b, size_t size); + +/** + * @brief A 8/1 operational intensity which might compile to a flawed oi kernel + * @param a An array with double values of size param size + * @param size Size of the three param arrays + * + * === Problem == + * Same as for kernel_1_16_simple_dangerous + */ +void kernel_8_1_simple_dangerous(double* a, size_t size); + +/** + * @brief A 1/8 operational intensity which might compile to a flawed oi kernel + * @param a An array with double values of size param size + * @param size Size of the three param arrays + * + * === Problem == + * Same as for kernel_1_16_simple_dangerous + * + * Without volatile the loop is optimized away completely. + * With volatile tmp is written to the stack in every loop + * (-O3). tmp could be cached or not. This might depend on + * how large the array is and how the cpu work internally + * -> unpredictable. + */ +void kernel_1_8_vo_dangerous(double* a, size_t size); + + +#ifdef INTRINS +void kernel_8_1_fuseaware_manpack(double* a, size_t size); +#endif + + + +/**************************************** + * Helper macros for repeating things * + ****************************************/ + +#define REP0(X) +#define REP1(X) X +#define REP2(X) REP1(X) REP1(X) +#define REP3(X) REP2(X) REP1(X) +#define REP4(X) REP3(X) REP1(X) +#define REP5(X) REP4(X) REP1(X) +#define REP6(X) REP5(X) REP1(X) +#define REP7(X) REP6(X) REP1(X) +#define REP8(X) REP7(X) REP1(X) +#define REP9(X) REP8(X) REP1(X) + +#define REP10(X) REP9(X) REP1(X) +#define REP20(X) REP10(X) REP10(X) +#define REP30(X) REP20(X) REP10(X) +#define REP40(X) REP30(X) REP10(X) +#define REP50(X) REP40(X) REP10(X) +#define REP60(X) REP50(X) REP10(X) + +#define REP100(X) REP50(X) REP50(X) + +#ifdef ENDEBUG +#define DEBUG(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0) +#else +#define DEBUG(...) +#endif + +#endif /* AIKERN_H */ diff --git a/prof b/prof new file mode 100644 index 0000000..1cc892d --- /dev/null +++ b/prof @@ -0,0 +1,199 @@ +Flat profile: + +Each sample counts as 0.01 seconds. + % cumulative self self total + time seconds seconds calls Ts/call Ts/call name +100.01 0.74 0.74 bail_out + 0.00 0.74 0.00 50 0.00 0.00 pin_time + 0.00 0.74 0.00 5 0.00 0.00 kernel_1_16_fuseaware + 0.00 0.74 0.00 5 0.00 0.00 kernel_1_16_simple + 0.00 0.74 0.00 5 0.00 0.00 kernel_8_1_fuseaware + 0.00 0.74 0.00 5 0.00 0.00 kernel_8_1_simple + 0.00 0.74 0.00 5 0.00 0.00 kernel_8_1_simple_fastmath + 0.00 0.74 0.00 5 0.00 0.00 kernel_dispatch + 0.00 0.74 0.00 5 0.00 0.00 print_kernresult + 0.00 0.74 0.00 2 0.00 0.00 pin_time + 0.00 0.74 0.00 1 0.00 0.00 get_int + 0.00 0.74 0.00 1 0.00 0.00 get_size + 0.00 0.74 0.00 1 0.00 0.00 testkern + + % the percentage of the total running time of the +time program used by this function. + +cumulative a running sum of the number of seconds accounted + seconds for by this function and those listed above it. + + self the number of seconds accounted for by this +seconds function alone. This is the major sort for this + listing. + +calls the number of times this function was invoked, if + this function is profiled, else blank. + + self the average number of milliseconds spent in this +ms/call function per call, if this function is profiled, + else blank. + + total the average number of milliseconds spent in this +ms/call function and its descendents per call, if this + function is profiled, else blank. + +name the name of the function. This is the minor sort + for this listing. The index shows the location of + the function in the gprof listing. If the index is + in parenthesis it shows where it would appear in + the gprof listing if it were to be printed. + +Copyright (C) 2012-2014 Free Software Foundation, Inc. + +Copying and distribution of this file, with or without modification, +are permitted in any medium without royalty provided the copyright +notice and this notice are preserved. + + Call graph (explanation follows) + + +granularity: each sample hit covers 2 byte(s) for 1.35% of 0.74 seconds + +index % time self children called name + +[1] 100.0 0.74 0.00 bail_out [1] +----------------------------------------------- + 0.00 0.00 50/50 kernel_dispatch [8] +[2] 0.0 0.00 0.00 50 pin_time [2] +----------------------------------------------- + 0.00 0.00 5/5 kernel_dispatch [8] +[3] 0.0 0.00 0.00 5 kernel_1_16_fuseaware [3] +----------------------------------------------- + 0.00 0.00 5/5 kernel_dispatch [8] +[4] 0.0 0.00 0.00 5 kernel_1_16_simple [4] +----------------------------------------------- + 0.00 0.00 5/5 kernel_dispatch [8] +[5] 0.0 0.00 0.00 5 kernel_8_1_fuseaware [5] +----------------------------------------------- + 0.00 0.00 5/5 kernel_dispatch [8] +[6] 0.0 0.00 0.00 5 kernel_8_1_simple [6] +----------------------------------------------- + 0.00 0.00 5/5 kernel_dispatch [8] +[7] 0.0 0.00 0.00 5 kernel_8_1_simple_fastmath [7] +----------------------------------------------- + 0.00 0.00 5/5 main [23] +[8] 0.0 0.00 0.00 5 kernel_dispatch [8] + 0.00 0.00 50/50 pin_time [2] + 0.00 0.00 5/5 kernel_1_16_simple [4] + 0.00 0.00 5/5 kernel_1_16_fuseaware [3] + 0.00 0.00 5/5 kernel_8_1_simple [6] + 0.00 0.00 5/5 kernel_8_1_fuseaware [5] + 0.00 0.00 5/5 kernel_8_1_simple_fastmath [7] +----------------------------------------------- + 0.00 0.00 5/5 main [23] +[9] 0.0 0.00 0.00 5 print_kernresult [9] +----------------------------------------------- + 0.00 0.00 2/2 main [23] +[10] 0.0 0.00 0.00 2 pin_time [10] +----------------------------------------------- + 0.00 0.00 1/1 main [23] +[11] 0.0 0.00 0.00 1 get_int [11] +----------------------------------------------- + 0.00 0.00 1/1 main [23] +[12] 0.0 0.00 0.00 1 get_size [12] +----------------------------------------------- + 0.00 0.00 1/1 main [23] +[13] 0.0 0.00 0.00 1 testkern [13] +----------------------------------------------- + + This table describes the call tree of the program, and was sorted by + the total amount of time spent in each function and its children. + + Each entry in this table consists of several lines. The line with the + index number at the left hand margin lists the current function. + The lines above it list the functions that called this function, + and the lines below it list the functions this one called. + This line lists: + index A unique number given to each element of the table. + Index numbers are sorted numerically. + The index number is printed next to every function name so + it is easier to look up where the function is in the table. + + % time This is the percentage of the `total' time that was spent + in this function and its children. Note that due to + different viewpoints, functions excluded by options, etc, + these numbers will NOT add up to 100%. + + self This is the total amount of time spent in this function. + + children This is the total amount of time propagated into this + function by its children. + + called This is the number of times the function was called. + If the function called itself recursively, the number + only includes non-recursive calls, and is followed by + a `+' and the number of recursive calls. + + name The name of the current function. The index number is + printed after it. If the function is a member of a + cycle, the cycle number is printed between the + function's name and the index number. + + + For the function's parents, the fields have the following meanings: + + self This is the amount of time that was propagated directly + from the function into this parent. + + children This is the amount of time that was propagated from + the function's children into this parent. + + called This is the number of times this parent called the + function `/' the total number of times the function + was called. Recursive calls to the function are not + included in the number after the `/'. + + name This is the name of the parent. The parent's index + number is printed after it. If the parent is a + member of a cycle, the cycle number is printed between + the name and the index number. + + If the parents of the function cannot be determined, the word + `' is printed in the `name' field, and all the other + fields are blank. + + For the function's children, the fields have the following meanings: + + self This is the amount of time that was propagated directly + from the child into the function. + + children This is the amount of time that was propagated from the + child's children to the function. + + called This is the number of times the function called + this child `/' the total number of times the child + was called. Recursive calls by the child are not + listed in the number after the `/'. + + name This is the name of the child. The child's index + number is printed after it. If the child is a + member of a cycle, the cycle number is printed + between the name and the index number. + + If there are any cycles (circles) in the call graph, there is an + entry for the cycle-as-a-whole. This entry shows who called the + cycle (as parents) and the members of the cycle (as children.) + The `+' recursive calls entry shows the number of function calls that + were internal to the cycle, and the calls entry for each member shows, + for that member, how many times it was called from other members of + the cycle. + +Copyright (C) 2012-2014 Free Software Foundation, Inc. + +Copying and distribution of this file, with or without modification, +are permitted in any medium without royalty provided the copyright +notice and this notice are preserved. + +Index by function name + + [1] bail_out (aikern.c) [5] kernel_8_1_fuseaware [2] pin_time (aikern.c) + [11] get_int (roofline.c) [6] kernel_8_1_simple [9] print_kernresult (roofline.c) + [12] get_size (roofline.c) [7] kernel_8_1_simple_fastmath [13] testkern (roofline.c) + [3] kernel_1_16_fuseaware [8] kernel_dispatch + [4] kernel_1_16_simple [10] pin_time (roofline.c) diff --git a/roofline.c b/roofline.c new file mode 100644 index 0000000..84b6b30 --- /dev/null +++ b/roofline.c @@ -0,0 +1,342 @@ +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include "aikern.h" + + +/* === Macros === */ + +#ifdef ENDEBUG +#define DEBUG(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0) +#else +#define DEBUG(...) +#endif + +/* === Constants === */ + +/* === Global Variables === */ +char* prog_name; + +/* === Prototypes === */ + +/** + * @brief print usage message + */ +static void usage(void); + +/** + * @brief terminate program on program error + * @param msg additional message to print + * @param ret exit value + */ +static void bail_out(char* fmt, ...); + +/** + * @brief converts the argument to size_t if possible. + * bails out on error. + * @param oparg the argument to convert + */ +static size_t get_size(char* oparg); + +/** + * @brief converts the argument to int if possible. + * bails out on error. + * @param oparg the argument to convert + */ +static int get_int(char* oparg); + +/** + * @brief microseconds since epoch + */ +static double pin_time(void); + +/** + * @brief a simple test kernel with ai of 1/16 + */ +static void testkern(double* a, double* b, double* c, size_t size); + +/** + * @brief pretty prints a kern_result + */ +static void print_kernresult(kern_result* result, const char* logname); + +int main(int argc, char* argv[]) { + prog_name = argv[0]; + + int opt; + char *size_arg = NULL; + char *runs_arg = NULL; + + while((opt = getopt(argc, argv, "s:r:")) != -1) + { + switch(opt) + { + case 's': + size_arg = optarg; + break; + case 'r': + runs_arg = optarg; + break; + case '?': + usage(); + default: + usage(); + } + } + + if(optind < argc) + { + + for (int index = optind; index < argc; index++) + bail_out ("Non-option argument %s\n", argv[index]); + + usage(); + } + + if(size_arg == NULL || runs_arg == NULL) + usage(); + + size_t size = get_size(size_arg); + int runs = get_int(runs_arg); + + // Allocating arrays + printf("Will run with array sizes of %zu elements\n", size); + printf("Will calculate min, max, avg for %d runs\n", runs); + double* a = malloc(sizeof(double)*(size)); + double* b = malloc(sizeof(double)*(size)); + double* c = malloc(sizeof(double)*(size)); + + if(a==NULL || b==NULL || c == NULL) + bail_out("One of the mallocs failed\n. a = %p, b=%p, c=%p", a, b, c); + + printf("Allocated 3 arrays (3*%.2f MB = %.2f GB)\n", (sizeof(double)*(size)/1024.0/1024.0), (sizeof(double)*(size)*3/1024.0/1024.0/1024)); + printf("Filling arrays with dummy values. This will also warm the cache\n"); + + // Filling arrays with arbitrary numbers + #pragma omp parallel for + for (size_t j=0; j AI = 3/(2*3*8) = 1/16 */ + a[j] = 2.0E0 * a[j]; + b[j] = 2.0E0 * b[j]; + c[j] = 2.0E0 * c[j]; + } +} + +/* === Helper Functions === */ + +static double pin_time(void) +{ + struct timeval tp; + int i; + + i = gettimeofday(&tp,NULL); + + if(i != 0) + { + bail_out("Time measurement impossible. gettimeofday error"); + } + + return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); +} + +static size_t get_size(char *oparg) +{ + long long int llsize = strtoll(oparg, NULL, 10); + + if(llsize <= 0) + usage(); + + unsigned long long int u_llsize = (unsigned long long int) llsize; + + if(u_llsize > SIZE_MAX) + { + bail_out("Only size between 1 to %zu allowed.", SIZE_MAX); + } + + return (size_t) llsize; +} + +static int get_int(char *oparg) +{ + long long int llsize = strtoll(oparg, NULL, 10); + + if(llsize <= 0) + usage(); + + unsigned long long int u_llsize = (unsigned long long int) llsize; + + if(u_llsize > INT_MAX) + { + bail_out("Only size between 1 to %d allowed.", INT_MAX); + } + + return (int) llsize; +} + +static void usage() +{ + fprintf(stderr, "USAGE: ./roofline -s -r \n"); + fprintf(stderr, "e.g.: ./roofline -s 100000 -r 5 \n"); + bail_out("Invalid paramers"); +} + +static void bail_out(char* fmt, ...) +{ + if(fmt != NULL) + { + char msgbuf[150]; + + va_list vl; + va_start(vl, fmt); + + if(vsnprintf(msgbuf, sizeof(msgbuf), fmt, vl) < 0) + msgbuf[0] = '\0'; + + va_end( vl); + + if(strlen(msgbuf) > 0) + (void)fprintf(stderr, "%s: %s \n", prog_name, msgbuf); + + } + + if(errno != 0) + (void)fprintf(stderr, "%s: %s\n", prog_name, strerror(errno)); + + exit(EXIT_FAILURE); +} + +static void print_kernresult(kern_result* result, const char* logname) +{ + struct stat st = {}; + + if (stat("log", &st) == -1) + { + if(mkdir("log", 0700)) + { + bail_out("Couldn't create log directory for %s", result->kern_name); + } + } + + char logpath[20]; + snprintf(logpath, sizeof(logpath), "%s/%s", "log", logname); + FILE* log = fopen(logpath, "w"); + if(log == NULL) + bail_out("Couldn't open log file for %s", result->kern_name); + + if(fputs("run,start,end,delta,GFLOP/s\n", log) == EOF) + { + fclose(log); + bail_out("Couldn't write header to log file"); + } + + printf("=== %s ===\n", result->kern_name); + + double min; + double max; + double sum = 0.0; + double deltas[result->runs]; + + deltas[0] = result->ends[0] - result->starts[0]; + min=deltas[0]; + max=deltas[0]; + sum+=deltas[0]; + + for(size_t i=1; iruns; i++) + { + deltas[i] = result->ends[i] - result->starts[i]; + sum+=deltas[i]; + + if(deltas[i] < min) min=deltas[i]; + if(deltas[i] > max) max=deltas[i]; + + double gflops = ((result->flops * result->size) / deltas[i]) / 1.0E9; + + if(fprintf(log, "%zu,%.4f,%.4f,%.4f,%.4f\n", + i, result->starts[i], + result->ends[i], deltas[i], + gflops) == EOF) + { + fclose(log); + bail_out("Couldn't write to log file"); + } + } + + + printf("%d flop(s) per run\t %zu run(s)\n\n", result->flops, result->runs); + printf("Min: %.4f \t Max: %.4f \t Avg: %.4f\n", min, max, (sum/result->runs)); + + printf("\n\n\n"); + + + + if(fclose(log)) + { + bail_out("Couldn't close log file for %s", result->kern_name); + } +}