#if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC))
	#define X86ASM

	/* gcc 2.95 royally screws up stack alignments on variables */
	#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS6PP)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000)))
		#define X86ASM_SSE
		#define X86ASM_SSE2
	#endif
	#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2005)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102)))
		#define X86ASM_SSSE3
	#endif
	#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40400)))
		#define X86ASM_AVX
		#define X86ASM_XOP
	#endif
	#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2012)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40700)))
		#define X86ASM_AVX2
	#endif
#endif

#if defined(CPU_X86_64) && defined(COMPILER_GCC)
	#define X86_64ASM
	#define X86_64ASM_SSE2
	#if (COMPILER_GCC >= 40102)
		#define X86_64ASM_SSSE3
	#endif
	#if (COMPILER_GCC >= 40400)
		#define X86_64ASM_AVX
		#define X86_64ASM_XOP
	#endif
	#if (COMPILER_GCC >= 40700)
		#define X86_64ASM_AVX2
	#endif
#endif

#if defined(COMPILER_MSVC) && (defined(CPU_X86_FORCE_INTRINSICS) || defined(CPU_X86_64))
	#define X86_INTRINSIC
	#if defined(CPU_X86_64) || defined(X86ASM_SSE)
		#define X86_INTRINSIC_SSE
	#endif
	#if defined(CPU_X86_64) || defined(X86ASM_SSE2)
		#define X86_INTRINSIC_SSE2
	#endif
	#if (COMPILER_MSVC >= COMPILER_MSVC_VS2005)
		#define X86_INTRINSIC_SSSE3
	#endif
	#if (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)
		#define X86_INTRINSIC_AVX
		#define X86_INTRINSIC_XOP
	#endif
	#if (COMPILER_MSVC >= COMPILER_MSVC_VS2012)
		#define X86_INTRINSIC_AVX2
	#endif
#endif

#if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS)
	#define X86_INTRINSIC
	#if defined(__SSE__)
		#define X86_INTRINSIC_SSE
	#endif
	#if defined(__SSE2__)
		#define X86_INTRINSIC_SSE2
	#endif
	#if defined(__SSSE3__)
		#define X86_INTRINSIC_SSSE3
	#endif
	#if defined(__AVX__)
		#define X86_INTRINSIC_AVX
	#endif
	#if defined(__XOP__)
		#define X86_INTRINSIC_XOP
	#endif
	#if defined(__AVX2__)
		#define X86_INTRINSIC_AVX2
	#endif
#endif

/* only use simd on windows (or SSE2 on gcc)! */
#if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC)
	#if defined(X86_INTRINSIC_SSE)
		#include <mmintrin.h>
		#include <xmmintrin.h>
		typedef __m64 qmm;
		typedef __m128 xmm;
		typedef __m128d xmmd;
	#endif
	#if defined(X86_INTRINSIC_SSE2)
		#include <emmintrin.h>
		typedef __m128i xmmi;
	#endif
	#if defined(X86_INTRINSIC_SSSE3)
		#include <tmmintrin.h>
	#endif
	#if defined(X86_INTRINSIC_AVX)
		#include <immintrin.h>
	#endif
	#if defined(X86_INTRINSIC_XOP)
		#if defined(COMPILER_MSVC)
			#include <intrin.h>
		#else
			#include <x86intrin.h>
		#endif
	#endif
	#if defined(X86_INTRINSIC_AVX2)
		typedef __m256i ymmi;
	#endif
#endif

#if defined(X86_INTRINSIC_SSE2)
	typedef union packedelem8_t {
		uint8_t u[16];
		xmmi v;
	} packedelem8;

	typedef union packedelem32_t {
		uint32_t u[4];
		xmmi v;
	} packedelem32;

	typedef union packedelem64_t {
		uint64_t u[2];
		xmmi v;
	} packedelem64;
#else
	typedef union packedelem8_t {
		uint8_t u[16];
		uint32_t dw[4];
	} packedelem8;

	typedef union packedelem32_t {
		uint32_t u[4];
		uint8_t b[16];
	} packedelem32;

	typedef union packedelem64_t {
		uint64_t u[2];
		uint8_t b[16];
	} packedelem64;
#endif

#if defined(X86_INTRINSIC_SSSE3)
	/* pshufb control vectors: rotate each 32-bit lane left by 16 / by 8 bits */
	static const packedelem8 ALIGN(16) ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}};
	static const packedelem8 ALIGN(16) ssse3_rotl8_32bit  = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}};
#endif

/*
	x86 inline asm for gcc/msvc. usage:

	asm_naked_fn_proto(return_type, name) (type parm1, type parm2..)
	asm_naked_fn(name)
		a1(..)
		a2(.., ..)
		a3(.., .., ..)
		64bit OR 0 parameters: a1(ret)
		32bit AND n parameters: aret(4n), eg aret(16) for 4 parameters
	asm_naked_fn_end(name)
*/
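/*
	illustrative sketch (hypothetical routine, not part of this file), showing how the
	macros described above compose on a 32-bit stdcall target where arguments live on
	the stack:

		asm_naked_fn_proto(uint32_t, example_add2)(uint32_t a, uint32_t b)
		asm_naked_fn(example_add2)
			a2(mov eax, [esp + 4])
			a2(add eax, [esp + 8])
			aret(8)
		asm_naked_fn_end(example_add2)

	on 64-bit targets the arguments arrive in registers instead, so the body ends with
	a plain a1(ret); has_cpuid further down is a real instantiation of this pattern
*/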
#if defined(X86ASM) || defined(X86_64ASM)

#if defined(COMPILER_MSVC)
	#pragma warning(disable : 4731) /* frame pointer modified by inline assembly */
	#define a1(x) __asm {x}
	#define a2(x, y) __asm {x, y}
	#define a3(x, y, z) __asm {x, y, z}
	#define a4(x, y, z, w) __asm {x, y, z, w}
	#define aj(x) __asm {x}
	#define asm_align8 a1(ALIGN 8)
	#define asm_align16 a1(ALIGN 16)

	#define asm_calling_convention STDCALL
	#define aret(n) a1(ret n)
	#define asm_naked_fn_proto(type, fn) static NAKED type asm_calling_convention fn
	#define asm_naked_fn(fn) {
	#define asm_naked_fn_end(fn) }
#elif defined(COMPILER_GCC)
	#define GNU_AS1(x) #x ";\n"
	#define GNU_AS2(x, y) #x ", " #y ";\n"
	#define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n"
	#define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n"
	#define GNU_ASFN(x) "\n_" #x ":\n" #x ":\n"
	#define GNU_ASJ(x) ".att_syntax prefix\n" #x "\n.intel_syntax noprefix\n"

	#define a1(x) GNU_AS1(x)
	#define a2(x, y) GNU_AS2(x, y)
	#define a3(x, y, z) GNU_AS3(x, y, z)
	#define a4(x, y, z, w) GNU_AS4(x, y, z, w)
	#define aj(x) GNU_ASJ(x)
	#define asm_align8 ".p2align 3,,7"
	#define asm_align16 ".p2align 4,,15"

	#if defined(OS_WINDOWS)
		#define asm_calling_convention CDECL
		#define aret(n) a1(ret)

		#if defined(X86_64ASM)
			/* win64 passes args in rcx/rdx/r8/r9; shuffle them into the sysv argument
			   registers the function bodies expect, preserving the win64 callee-saved
			   xmm6..xmm12, rdi and rsi around the call */
			#define asm_naked_fn(fn) ; __asm__ ( \
				".text\n" \
				asm_align16 GNU_ASFN(fn) \
				"subq $136, %rsp;" \
				"movdqa %xmm6, 0(%rsp);" \
				"movdqa %xmm7, 16(%rsp);" \
				"movdqa %xmm8, 32(%rsp);" \
				"movdqa %xmm9, 48(%rsp);" \
				"movdqa %xmm10, 64(%rsp);" \
				"movdqa %xmm11, 80(%rsp);" \
				"movdqa %xmm12, 96(%rsp);" \
				"movq %rdi, 112(%rsp);" \
				"movq %rsi, 120(%rsp);" \
				"movq %rcx, %rdi;" \
				"movq %rdx, %rsi;" \
				"movq %r8, %rdx;" \
				"movq %r9, %rcx;" \
				"call 1f;" \
				"movdqa 0(%rsp), %xmm6;" \
				"movdqa 16(%rsp), %xmm7;" \
				"movdqa 32(%rsp), %xmm8;" \
				"movdqa 48(%rsp), %xmm9;" \
				"movdqa 64(%rsp), %xmm10;" \
				"movdqa 80(%rsp), %xmm11;" \
				"movdqa 96(%rsp), %xmm12;" \
				"movq 112(%rsp), %rdi;" \
				"movq 120(%rsp), %rsi;" \
				"addq $136, %rsp;" \
				"ret;" \
				".intel_syntax noprefix;" \
				".p2align 4,,15;" \
				"1:;"
		#else
			#define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn)
		#endif
	#else
		#define asm_calling_convention STDCALL
		#define aret(n) a1(ret n)
		#define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn)
	#endif

	#define asm_naked_fn_proto(type, fn) extern type asm_calling_convention fn
	#define asm_naked_fn_end(fn) ".att_syntax prefix;\n" );

	#define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n"
	#define asm_gcc_parms() ".att_syntax prefix;"
	#define asm_gcc_trashed() __asm__ __volatile__("" :::
	#define asm_gcc_end() );
#else
	need x86 asm /* deliberate compile error: no inline-asm support for this compiler */
#endif

#endif /* X86ASM || X86_64ASM */


#if defined(CPU_X86) || defined(CPU_X86_64)

typedef enum cpu_flags_x86_t {
	cpu_mmx = 1 << 0,
	cpu_sse = 1 << 1,
	cpu_sse2 = 1 << 2,
	cpu_sse3 = 1 << 3,
	cpu_ssse3 = 1 << 4,
	cpu_sse4_1 = 1 << 5,
	cpu_sse4_2 = 1 << 6,
	cpu_avx = 1 << 7,
	cpu_xop = 1 << 8,
	cpu_avx2 = 1 << 9
} cpu_flags_x86;

typedef enum cpu_vendors_x86_t {
	cpu_nobody,
	cpu_intel,
	cpu_amd
} cpu_vendors_x86;

typedef struct x86_regs_t {
	uint32_t eax, ebx, ecx, edx;
} x86_regs;

#if defined(X86ASM)

/* has_cpuid: flip bit 21 (the ID flag) of EFLAGS; if the change sticks, the CPUID instruction is available */
asm_naked_fn_proto(int, has_cpuid)(void)
asm_naked_fn(has_cpuid)
	a1(pushfd)
	a1(pop eax)
	a2(mov ecx, eax)
	a2(xor eax, 0x200000)
	a1(push eax)
	a1(popfd)
	a1(pushfd)
	a1(pop eax)
	a2(xor eax, ecx)
	a2(shr eax, 21)
	a2(and eax, 1)
	a1(push ecx)
	a1(popfd)
	a1(ret)
asm_naked_fn_end(has_cpuid)

#endif /* X86ASM */
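/*
	illustrative sketch (hypothetical, not part of this file): on reasonably recent
	gcc/clang the same leaf query could also be done without inline asm via <cpuid.h>,
	e.g.

		#include <cpuid.h>

		unsigned int eax, ebx, ecx, edx;
		if (__get_cpuid(0, &eax, &ebx, &ecx, &edx)) {
			// eax = max_level, ebx/edx/ecx = vendor string, as used by detect_cpu()
		}

	the hand-rolled get_cpuid() below avoids relying on that header being present
*/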
static void NOINLINE get_cpuid(x86_regs *regs, uint32_t flags) {
#if defined(COMPILER_MSVC)
	__cpuid((int *)regs, (int)flags);
#else
	#if defined(CPU_X86_64)
		#define cpuid_bx rbx
	#else
		#define cpuid_bx ebx
	#endif

	asm_gcc()
		a1(push cpuid_bx)
		a2(xor ecx, ecx)
		a1(cpuid)
		a2(mov [%1 + 0], eax)
		a2(mov [%1 + 4], ebx)
		a2(mov [%1 + 8], ecx)
		a2(mov [%1 + 12], edx)
		a1(pop cpuid_bx)
	asm_gcc_parms() : "+a"(flags) : "S"(regs) : "%ecx", "%edx", "cc"
	asm_gcc_end()
#endif
}

#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
static uint64_t NOINLINE get_xgetbv(uint32_t flags) {
#if defined(COMPILER_MSVC)
	return _xgetbv(flags);
#else
	uint32_t lo, hi;
	asm_gcc()
		a1(xgetbv)
	asm_gcc_parms() : "+c"(flags), "=a" (lo), "=d" (hi)
	asm_gcc_end()
	return ((uint64_t)lo | ((uint64_t)hi << 32));
#endif
}
#endif // AVX support

#if defined(SCRYPT_TEST_SPEED)
size_t cpu_detect_mask = (size_t)-1;
#endif

static size_t detect_cpu(void) {
	union { uint8_t s[12]; uint32_t i[3]; } vendor_string;
	cpu_vendors_x86 vendor = cpu_nobody;
	x86_regs regs;
	uint32_t max_level, max_ext_level;
	size_t cpu_flags = 0;
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
	uint64_t xgetbv_flags;
#endif

#if defined(CPU_X86)
	if (!has_cpuid())
		return cpu_flags;
#endif

	get_cpuid(&regs, 0);
	max_level = regs.eax;
	vendor_string.i[0] = regs.ebx;
	vendor_string.i[1] = regs.edx;
	vendor_string.i[2] = regs.ecx;

	if (scrypt_verify(vendor_string.s, (const uint8_t *)"GenuineIntel", 12))
		vendor = cpu_intel;
	else if (scrypt_verify(vendor_string.s, (const uint8_t *)"AuthenticAMD", 12))
		vendor = cpu_amd;

	if (max_level & 0x00000500) {
		/* "Intel P5 pre-B0" */
		cpu_flags |= cpu_mmx;
		return cpu_flags;
	}

	if (max_level < 1)
		return cpu_flags;

	get_cpuid(&regs, 1);
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
	/* xsave/xrestore: require OSXSAVE, AVX, and XMM+YMM state enabled in XCR0 */
	if (regs.ecx & (1 << 27)) {
		xgetbv_flags = get_xgetbv(0);
		if ((regs.ecx & (1 << 28)) && ((xgetbv_flags & 0x6) == 0x6))
			cpu_flags |= cpu_avx;
	}
#endif
	if (regs.ecx & (1 << 20)) cpu_flags |= cpu_sse4_2;
	if (regs.ecx & (1 << 19)) cpu_flags |= cpu_sse4_1;
	if (regs.ecx & (1 <<  9)) cpu_flags |= cpu_ssse3;
	if (regs.ecx & (1 <<  0)) cpu_flags |= cpu_sse3;
	if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2;
	if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse;
	if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx;

	if (cpu_flags & cpu_avx) {
		if (max_level >= 7) {
			get_cpuid(&regs, 7);
			if (regs.ebx & (1 << 5))
				cpu_flags |= cpu_avx2;
		}

		get_cpuid(&regs, 0x80000000);
		max_ext_level = regs.eax;
		if (max_ext_level >= 0x80000001) {
			get_cpuid(&regs, 0x80000001);
			if (regs.ecx & (1 << 11))
				cpu_flags |= cpu_xop;
		}
	}

#if defined(SCRYPT_TEST_SPEED)
	cpu_flags &= cpu_detect_mask;
#endif

	return cpu_flags;
}

#if defined(SCRYPT_TEST_SPEED)
static const char * get_top_cpuflag_desc(size_t flag) {
	if (flag & cpu_avx2)        return "AVX2";
	else if (flag & cpu_xop)    return "XOP";
	else if (flag & cpu_avx)    return "AVX";
	else if (flag & cpu_sse4_2) return "SSE4.2";
	else if (flag & cpu_sse4_1) return "SSE4.1";
	else if (flag & cpu_ssse3)  return "SSSE3";
	else if (flag & cpu_sse2)   return "SSE2";
	else if (flag & cpu_sse)    return "SSE";
	else if (flag & cpu_mmx)    return "MMX";
	else                        return "Basic";
}
#endif
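/*
	illustrative sketch (hypothetical caller and kernel names, not part of this file):
	the mask returned by detect_cpu() is meant to be tested from the most to the least
	capable feature when picking an implementation at run time, e.g.

		size_t flags = detect_cpu();
		if (flags & cpu_avx2)
			scrypt_mix_avx2(blocks, r);    // hypothetical AVX2 kernel
		else if (flags & cpu_ssse3)
			scrypt_mix_ssse3(blocks, r);   // hypothetical SSSE3 kernel
		else if (flags & cpu_sse2)
			scrypt_mix_sse2(blocks, r);    // hypothetical SSE2 kernel
		else
			scrypt_mix_basic(blocks, r);   // portable fallback

	get_top_cpuflag_desc() above maps the same mask to a printable name for the
	SCRYPT_TEST_SPEED benchmark output
*/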
/* enable the highest system-wide option */
#if defined(SCRYPT_CHOOSE_COMPILETIME)
	#if !defined(__AVX2__)
		#undef X86_64ASM_AVX2
		#undef X86ASM_AVX2
		#undef X86_INTRINSIC_AVX2
	#endif
	#if !defined(__XOP__)
		#undef X86_64ASM_XOP
		#undef X86ASM_XOP
		#undef X86_INTRINSIC_XOP
	#endif
	#if !defined(__AVX__)
		#undef X86_64ASM_AVX
		#undef X86ASM_AVX
		#undef X86_INTRINSIC_AVX
	#endif
	#if !defined(__SSSE3__)
		#undef X86_64ASM_SSSE3
		#undef X86ASM_SSSE3
		#undef X86_INTRINSIC_SSSE3
	#endif
	#if !defined(__SSE2__)
		#undef X86_64ASM_SSE2
		#undef X86ASM_SSE2
		#undef X86_INTRINSIC_SSE2
	#endif
#endif

#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
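/*
	illustrative note (hypothetical build commands, not part of this file): when
	SCRYPT_CHOOSE_COMPILETIME is defined, the block above strips every option the
	compiler itself was not told to target, so for example

		gcc -O2 -msse2 -DSCRYPT_CHOOSE_COMPILETIME ...   leaves only the SSE2-level defines
		gcc -O2 -mavx2 -DSCRYPT_CHOOSE_COMPILETIME ...   keeps AVX2 and everything below it

	without that macro, detect_cpu() above is expected to pick the best available code
	path at run time
*/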