From c8f6ca1989cea988035b933d2ff60b472721290b Mon Sep 17 00:00:00 2001 From: Armin Friedl Date: Fri, 24 Jun 2016 18:50:59 +0200 Subject: [PATCH] one last kernel --- roofline/src/Makefile | 37 +++++- roofline/src/aikern.c | 57 +++++++++ roofline/src/aikern.h | 8 +- roofline/src/log/fma16 | 8 +- roofline/src/log/fma8 | 8 +- roofline/src/log/fma8manpack | 5 + roofline/src/log/simple16 | 8 +- roofline/src/log/simple8 | 8 +- roofline/src/log/simple8fastmath | 8 +- roofline/src/prof | 199 +++++++++++++++++++++++++++++++ roofline/src/roofline.c | 35 ++++-- 11 files changed, 348 insertions(+), 33 deletions(-) create mode 100644 roofline/src/log/fma8manpack create mode 100644 roofline/src/prof diff --git a/roofline/src/Makefile b/roofline/src/Makefile index 845fe45..1386f56 100644 --- a/roofline/src/Makefile +++ b/roofline/src/Makefile @@ -1,7 +1,7 @@ all: clean bin lib # Roofline Binary -bin: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3 +bin: roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3 roofline_full roofline_profile roofline_full_clang roofline_full_manpack mkdir bin mv $^ bin @@ -26,9 +26,20 @@ roofline_fma_fast_o2: roofline.c aikern_fma_fast_o2.a roofline_fma_fast_fastmath_o3: roofline.c aikern_fma_fast_fastmath_o3.a gcc -Wall -Wextra -std=c99 -fopenmp $^ -o $@ +roofline_full: roofline.c aikern_full.a + gcc -Wall -Wextra -std=c99 -fopenmp -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -malign-double -march=native $^ -o $@ + +roofline_full_clang: roofline.c aikern_full_clang.a + clang -Wall -Wextra -std=c99 -fopenmp -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -march=native $^ -o $@ + +roofline_full_manpack: roofline.c aikern_full_manpack.a + gcc -Wall -Wextra -std=c99 -fopenmp -DINTRINS -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -malign-double -march=native $^ -o $@ + +roofline_profile: roofline.c aikern_profile.a + gcc -Wall -Wextra -std=c99 -fopenmp -O0 -g -pg $^ -o $@ # Static Libraries -lib: aikern.a aikern_o3.a aikern_fma.a aikern_fma_o3.a aikern_fma_fast_o2.a aikern_fma_fast_o3.a aikern_fma_fast_fastmath_o3.a +lib: aikern.a aikern_o3.a aikern_fma.a aikern_fma_o3.a aikern_fma_fast_o2.a aikern_fma_fast_o3.a aikern_fma_fast_fastmath_o3.a aikern_full.a aikern_full_clang.a aikern_profile.a aikern_full_manpack.a mkdir lib mv $^ lib @@ -67,11 +78,31 @@ aikern_fma_fast_fastmath_o3.a: aikern.c aikern.h ar rcs $@ aikern_fma_fast_fastmath_o3.o rm aikern_fma_fast_fastmath_o3.o +aikern_full.a: aikern.c aikern.h + gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -malign-double -march=native -c -o aikern_full.o $< + ar rcs $@ aikern_full.o + rm aikern_full.o + +aikern_full_clang.a: aikern.c aikern.h + gcc -Wall -Wextra -Wno-unused -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -march=native -c -o aikern_full_clang.o $< + ar rcs $@ aikern_full_clang.o + rm aikern_full_clang.o + +aikern_profile.a: aikern.c aikern.h + gcc -Wall -Wextra -Wno-unused -fopenmp -O0 -g -pg -c -o aikern_profile.o $< + ar rcs $@ aikern_profile.o + rm aikern_profile.o + +aikern_full_manpack.a: aikern.c aikern.h + gcc -Wall -Wextra -Wno-unused -DINTRINS -O3 -mavx -mfma -fopenmp -Ofast -ffast-math -malign-double -march=native -c -o aikern_full_manpack.o $< + ar rcs $@ aikern_full_manpack.o + rm aikern_full_manpack.o + # Cleanup clean: rm -f *.a rm -f *.o - rm -f roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3 + rm -f roofline roofline_o3 roofline_fma roofline_fma_o3 roofline_fma_fast_o3 roofline_fma_fast_o2 roofline_fma_fast_fastmath_o3 roofline_fma_fast_fastmath_aligned_o3 roofline_profile roofline_full_clang rm -fR bin rm -fR lib diff --git a/roofline/src/aikern.c b/roofline/src/aikern.c index 1068a41..5bd6b0e 100644 --- a/roofline/src/aikern.c +++ b/roofline/src/aikern.c @@ -8,6 +8,14 @@ # include "aikern.h" +/* === Macros === */ + +#ifdef ENDEBUG +#define DEBUG(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0) +#else +#define DEBUG(...) +#endif + /** * @brief terminate program on program error * @param msg additional message to print @@ -81,6 +89,7 @@ kern_result kernel_dispatch(kernel_t kernel, } break; case SIMPLE_8_1_FASTMATH: + DEBUG("AIKERN MANPACK"); result.flops = 128; result.kern_name = "Simple 8 undermining fastmath"; for(size_t r=0; r + +inline void kernel_8_1_fuseaware_manpack(double* a, size_t size) +{ + + #pragma omp parallel for + for(size_t i=0; i<(size-4); i+=4) + { + // pack doubles + __m256d packvec = _mm256_set_pd(a[i], a[i+1], a[i+2], a[i+3]); + + REP60(packvec = _mm256_fmadd_pd(packvec, packvec, packvec);); + REP4(packvec = _mm256_fmadd_pd(packvec, packvec, packvec);); + + a[i] = packvec[0]; + a[i+1] = packvec[1]; + a[i+2] = packvec[2]; + a[i+3] = packvec[3]; + } +} + +#endif /* INTRINS */ + + /******************************************** * Kernels which potentially compile to * * different operational intensities than * @@ -198,7 +253,9 @@ static double pin_time(void) i = gettimeofday(&tp,NULL); if(i != 0) + { bail_out("Time measurement impossible. gettimeofday error"); + } return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); } diff --git a/roofline/src/aikern.h b/roofline/src/aikern.h index 2749a43..c6ec2d2 100644 --- a/roofline/src/aikern.h +++ b/roofline/src/aikern.h @@ -11,7 +11,7 @@ typedef struct { } kern_result; typedef enum { - SIMPLE_1_16, FMA_1_16, SIMPLE_8_1, FMA_8_1, SIMPLE_8_1_FASTMATH + SIMPLE_1_16, FMA_1_16, SIMPLE_8_1, FMA_8_1, SIMPLE_8_1_FASTMATH, FMA_8_1_MANPACK } kernel_t; /** @@ -249,6 +249,12 @@ void kernel_8_1_simple_dangerous(double* a, size_t size); void kernel_1_8_vo_dangerous(double* a, size_t size); +#ifdef INTRINS +void kernel_8_1_fuseaware_manpack(double* a, size_t size); +#endif + + + /**************************************** * Helper macros for repeating things * ****************************************/ diff --git a/roofline/src/log/fma16 b/roofline/src/log/fma16 index 00e53e9..eb6b6ae 100644 --- a/roofline/src/log/fma16 +++ b/roofline/src/log/fma16 @@ -1,5 +1,5 @@ run,start,end,delta,GFLOP/s -1,1466732365.5426,1466732366.1946,0.6520,0.9202 -2,1466732366.1946,1466732366.8410,0.6464,0.9282 -3,1466732366.8410,1466732367.4875,0.6465,0.9281 -4,1466732367.4875,1466732368.1370,0.6495,0.9238 +1,1466764717.8266,1466764718.1751,0.3486,0.8606 +2,1466764718.1751,1466764718.5235,0.3484,0.8611 +3,1466764718.5235,1466764718.8726,0.3491,0.8593 +4,1466764718.8726,1466764719.2248,0.3522,0.8518 diff --git a/roofline/src/log/fma8 b/roofline/src/log/fma8 index 18e1f22..595952c 100644 --- a/roofline/src/log/fma8 +++ b/roofline/src/log/fma8 @@ -1,5 +1,5 @@ run,start,end,delta,GFLOP/s -1,1466732371.5080,1466732373.2547,1.7468,21.9836 -2,1466732373.2547,1466732375.0033,1.7486,21.9603 -3,1466732375.0033,1466732376.7499,1.7465,21.9864 -4,1466732376.7499,1466732378.4990,1.7492,21.9534 +1,1466764721.0839,1466764721.9589,0.8750,21.9434 +2,1466764721.9589,1466764722.8340,0.8752,21.9383 +3,1466764722.8340,1466764723.7090,0.8749,21.9451 +4,1466764723.7090,1466764724.5784,0.8694,22.0840 diff --git a/roofline/src/log/fma8manpack b/roofline/src/log/fma8manpack new file mode 100644 index 0000000..48ec305 --- /dev/null +++ b/roofline/src/log/fma8manpack @@ -0,0 +1,5 @@ +run,start,end,delta,GFLOP/s +1,1466764736.6256,1466764737.6487,1.0231,18.7665 +2,1466764737.6487,1466764738.6642,1.0155,18.9073 +3,1466764738.6642,1466764739.6867,1.0225,18.7770 +4,1466764739.6867,1466764740.7045,1.0178,18.8651 diff --git a/roofline/src/log/simple16 b/roofline/src/log/simple16 index d480224..3112314 100644 --- a/roofline/src/log/simple16 +++ b/roofline/src/log/simple16 @@ -1,5 +1,5 @@ run,start,end,delta,GFLOP/s -1,1466732363.5905,1466732363.9157,0.3252,0.9226 -2,1466732363.9157,1466732364.2410,0.3253,0.9223 -3,1466732364.2410,1466732364.5659,0.3249,0.9234 -4,1466732364.5659,1466732364.8925,0.3266,0.9186 +1,1466764716.7465,1466764716.9269,0.1804,0.8314 +2,1466764716.9269,1466764717.1069,0.1800,0.8334 +3,1466764717.1069,1466764717.2871,0.1801,0.8327 +4,1466764717.2871,1466764717.4767,0.1897,0.7908 diff --git a/roofline/src/log/simple8 b/roofline/src/log/simple8 index d04184f..4f34abe 100644 --- a/roofline/src/log/simple8 +++ b/roofline/src/log/simple8 @@ -1,5 +1,5 @@ run,start,end,delta,GFLOP/s -1,1466732368.4592,1466732368.7843,0.3251,118.1190 -2,1466732368.7843,1466732369.1090,0.3247,118.2713 -3,1466732369.1090,1466732369.4353,0.3263,117.6684 -4,1466732369.4353,1466732369.7596,0.3243,118.4045 +1,1466764719.4042,1466764719.5845,0.1803,106.5045 +2,1466764719.5845,1466764719.7707,0.1861,103.1532 +3,1466764719.7707,1466764719.9487,0.1780,107.8675 +4,1466764719.9487,1466764720.1264,0.1777,108.0261 diff --git a/roofline/src/log/simple8fastmath b/roofline/src/log/simple8fastmath index 6ee022d..7db0011 100644 --- a/roofline/src/log/simple8fastmath +++ b/roofline/src/log/simple8fastmath @@ -1,5 +1,5 @@ run,start,end,delta,GFLOP/s -1,1466732382.7768,1466732387.0594,4.2827,8.9664 -2,1466732387.0594,1466732391.3489,4.2895,8.9521 -3,1466732391.3489,1466732395.6322,4.2834,8.9649 -4,1466732395.6322,1466732399.9114,4.2791,8.9738 +1,1466764726.7241,1466764728.9906,2.2664,8.4714 +2,1466764728.9906,1466764731.1306,2.1400,8.9720 +3,1466764731.1306,1466764733.3797,2.2491,8.5366 +4,1466764733.3797,1466764735.6046,2.2249,8.6298 diff --git a/roofline/src/prof b/roofline/src/prof new file mode 100644 index 0000000..1cc892d --- /dev/null +++ b/roofline/src/prof @@ -0,0 +1,199 @@ +Flat profile: + +Each sample counts as 0.01 seconds. + % cumulative self self total + time seconds seconds calls Ts/call Ts/call name +100.01 0.74 0.74 bail_out + 0.00 0.74 0.00 50 0.00 0.00 pin_time + 0.00 0.74 0.00 5 0.00 0.00 kernel_1_16_fuseaware + 0.00 0.74 0.00 5 0.00 0.00 kernel_1_16_simple + 0.00 0.74 0.00 5 0.00 0.00 kernel_8_1_fuseaware + 0.00 0.74 0.00 5 0.00 0.00 kernel_8_1_simple + 0.00 0.74 0.00 5 0.00 0.00 kernel_8_1_simple_fastmath + 0.00 0.74 0.00 5 0.00 0.00 kernel_dispatch + 0.00 0.74 0.00 5 0.00 0.00 print_kernresult + 0.00 0.74 0.00 2 0.00 0.00 pin_time + 0.00 0.74 0.00 1 0.00 0.00 get_int + 0.00 0.74 0.00 1 0.00 0.00 get_size + 0.00 0.74 0.00 1 0.00 0.00 testkern + + % the percentage of the total running time of the +time program used by this function. + +cumulative a running sum of the number of seconds accounted + seconds for by this function and those listed above it. + + self the number of seconds accounted for by this +seconds function alone. This is the major sort for this + listing. + +calls the number of times this function was invoked, if + this function is profiled, else blank. + + self the average number of milliseconds spent in this +ms/call function per call, if this function is profiled, + else blank. + + total the average number of milliseconds spent in this +ms/call function and its descendents per call, if this + function is profiled, else blank. + +name the name of the function. This is the minor sort + for this listing. The index shows the location of + the function in the gprof listing. If the index is + in parenthesis it shows where it would appear in + the gprof listing if it were to be printed. + +Copyright (C) 2012-2014 Free Software Foundation, Inc. + +Copying and distribution of this file, with or without modification, +are permitted in any medium without royalty provided the copyright +notice and this notice are preserved. + + Call graph (explanation follows) + + +granularity: each sample hit covers 2 byte(s) for 1.35% of 0.74 seconds + +index % time self children called name + +[1] 100.0 0.74 0.00 bail_out [1] +----------------------------------------------- + 0.00 0.00 50/50 kernel_dispatch [8] +[2] 0.0 0.00 0.00 50 pin_time [2] +----------------------------------------------- + 0.00 0.00 5/5 kernel_dispatch [8] +[3] 0.0 0.00 0.00 5 kernel_1_16_fuseaware [3] +----------------------------------------------- + 0.00 0.00 5/5 kernel_dispatch [8] +[4] 0.0 0.00 0.00 5 kernel_1_16_simple [4] +----------------------------------------------- + 0.00 0.00 5/5 kernel_dispatch [8] +[5] 0.0 0.00 0.00 5 kernel_8_1_fuseaware [5] +----------------------------------------------- + 0.00 0.00 5/5 kernel_dispatch [8] +[6] 0.0 0.00 0.00 5 kernel_8_1_simple [6] +----------------------------------------------- + 0.00 0.00 5/5 kernel_dispatch [8] +[7] 0.0 0.00 0.00 5 kernel_8_1_simple_fastmath [7] +----------------------------------------------- + 0.00 0.00 5/5 main [23] +[8] 0.0 0.00 0.00 5 kernel_dispatch [8] + 0.00 0.00 50/50 pin_time [2] + 0.00 0.00 5/5 kernel_1_16_simple [4] + 0.00 0.00 5/5 kernel_1_16_fuseaware [3] + 0.00 0.00 5/5 kernel_8_1_simple [6] + 0.00 0.00 5/5 kernel_8_1_fuseaware [5] + 0.00 0.00 5/5 kernel_8_1_simple_fastmath [7] +----------------------------------------------- + 0.00 0.00 5/5 main [23] +[9] 0.0 0.00 0.00 5 print_kernresult [9] +----------------------------------------------- + 0.00 0.00 2/2 main [23] +[10] 0.0 0.00 0.00 2 pin_time [10] +----------------------------------------------- + 0.00 0.00 1/1 main [23] +[11] 0.0 0.00 0.00 1 get_int [11] +----------------------------------------------- + 0.00 0.00 1/1 main [23] +[12] 0.0 0.00 0.00 1 get_size [12] +----------------------------------------------- + 0.00 0.00 1/1 main [23] +[13] 0.0 0.00 0.00 1 testkern [13] +----------------------------------------------- + + This table describes the call tree of the program, and was sorted by + the total amount of time spent in each function and its children. + + Each entry in this table consists of several lines. The line with the + index number at the left hand margin lists the current function. + The lines above it list the functions that called this function, + and the lines below it list the functions this one called. + This line lists: + index A unique number given to each element of the table. + Index numbers are sorted numerically. + The index number is printed next to every function name so + it is easier to look up where the function is in the table. + + % time This is the percentage of the `total' time that was spent + in this function and its children. Note that due to + different viewpoints, functions excluded by options, etc, + these numbers will NOT add up to 100%. + + self This is the total amount of time spent in this function. + + children This is the total amount of time propagated into this + function by its children. + + called This is the number of times the function was called. + If the function called itself recursively, the number + only includes non-recursive calls, and is followed by + a `+' and the number of recursive calls. + + name The name of the current function. The index number is + printed after it. If the function is a member of a + cycle, the cycle number is printed between the + function's name and the index number. + + + For the function's parents, the fields have the following meanings: + + self This is the amount of time that was propagated directly + from the function into this parent. + + children This is the amount of time that was propagated from + the function's children into this parent. + + called This is the number of times this parent called the + function `/' the total number of times the function + was called. Recursive calls to the function are not + included in the number after the `/'. + + name This is the name of the parent. The parent's index + number is printed after it. If the parent is a + member of a cycle, the cycle number is printed between + the name and the index number. + + If the parents of the function cannot be determined, the word + `' is printed in the `name' field, and all the other + fields are blank. + + For the function's children, the fields have the following meanings: + + self This is the amount of time that was propagated directly + from the child into the function. + + children This is the amount of time that was propagated from the + child's children to the function. + + called This is the number of times the function called + this child `/' the total number of times the child + was called. Recursive calls by the child are not + listed in the number after the `/'. + + name This is the name of the child. The child's index + number is printed after it. If the child is a + member of a cycle, the cycle number is printed + between the name and the index number. + + If there are any cycles (circles) in the call graph, there is an + entry for the cycle-as-a-whole. This entry shows who called the + cycle (as parents) and the members of the cycle (as children.) + The `+' recursive calls entry shows the number of function calls that + were internal to the cycle, and the calls entry for each member shows, + for that member, how many times it was called from other members of + the cycle. + +Copyright (C) 2012-2014 Free Software Foundation, Inc. + +Copying and distribution of this file, with or without modification, +are permitted in any medium without royalty provided the copyright +notice and this notice are preserved. + +Index by function name + + [1] bail_out (aikern.c) [5] kernel_8_1_fuseaware [2] pin_time (aikern.c) + [11] get_int (roofline.c) [6] kernel_8_1_simple [9] print_kernresult (roofline.c) + [12] get_size (roofline.c) [7] kernel_8_1_simple_fastmath [13] testkern (roofline.c) + [3] kernel_1_16_fuseaware [8] kernel_dispatch + [4] kernel_1_16_simple [10] pin_time (roofline.c) diff --git a/roofline/src/roofline.c b/roofline/src/roofline.c index 5f8bcd4..1b8f580 100644 --- a/roofline/src/roofline.c +++ b/roofline/src/roofline.c @@ -140,12 +140,18 @@ int main(int argc, char* argv[]) { printf("Starting tests...\n\n\n"); // Executing kernels - kern_result simple16 = kernel_dispatch(SIMPLE_1_16, a, b, c, size, runs); - kern_result fma16 = kernel_dispatch(FMA_1_16, a, b, c, size, runs); - kern_result simple8 = kernel_dispatch(SIMPLE_8_1, a, b, c, size, runs); - kern_result fma8 = kernel_dispatch(FMA_8_1, a, b, c, size, runs); - kern_result simple8fm = kernel_dispatch(SIMPLE_8_1_FASTMATH, a, b, c, size, runs); + kern_result simple16 = kernel_dispatch(SIMPLE_1_16, a, b, c, size, runs); + kern_result fma16 = kernel_dispatch(FMA_1_16, a, b, c, size, runs); + kern_result simple8 = kernel_dispatch(SIMPLE_8_1, a, b, c, size, runs); + kern_result fma8 = kernel_dispatch(FMA_8_1, a, b, c, size, runs); + kern_result simple8fm = kernel_dispatch(SIMPLE_8_1_FASTMATH, a, b, c, size, runs); +#ifdef INTRINS + DEBUG("Running manpack now"); + kern_result fma8manpack = kernel_dispatch(FMA_8_1_MANPACK, a, b, c, size, runs); + DEBUG("manpack run successful"); +#endif + // Freeing arrays free(a); free(b); @@ -158,6 +164,11 @@ int main(int argc, char* argv[]) { print_kernresult(&fma8, "fma8"); print_kernresult(&simple8fm, "simple8fastmath"); +#ifdef INTRINS + print_kernresult(&fma8manpack, "fma8manpack"); +#endif + + exit(EXIT_SUCCESS); } @@ -183,7 +194,9 @@ static double pin_time(void) i = gettimeofday(&tp,NULL); if(i != 0) + { bail_out("Time measurement impossible. gettimeofday error"); + } return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); } @@ -198,7 +211,9 @@ static size_t get_size(char *oparg) unsigned long long int u_llsize = (unsigned long long int) llsize; if(u_llsize > SIZE_MAX) - bail_out("Only size between 1 to %zu allowed.", SIZE_MAX); + { + bail_out("Only size between 1 to %zu allowed.", SIZE_MAX); + } return (size_t) llsize; } @@ -213,7 +228,9 @@ static int get_int(char *oparg) unsigned long long int u_llsize = (unsigned long long int) llsize; if(u_llsize > INT_MAX) - bail_out("Only size between 1 to %d allowed.", INT_MAX); + { + bail_out("Only size between 1 to %d allowed.", INT_MAX); + } return (int) llsize; } @@ -221,7 +238,7 @@ static int get_int(char *oparg) static void usage() { fprintf(stderr, "USAGE: ./roofline -s -r \n"); - bail_out(NULL); + bail_out("Invalid paramers"); } static void bail_out(char* fmt, ...) @@ -251,7 +268,7 @@ static void bail_out(char* fmt, ...) static void print_kernresult(kern_result* result, const char* logname) { - struct stat st = {0}; + struct stat st = {}; if (stat("log", &st) == -1) {