#include #include #include #include "blake3_impl.h" #if defined(IS_X86) #if defined(_MSC_VER) #include #elif defined(__GNUC__) #include #else #error "Unimplemented!" #endif #endif #if defined(IS_X86) static uint64_t xgetbv() { #if defined(_MSC_VER) return _xgetbv(0); #else uint32_t eax = 0, edx = 0; __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); return ((uint64_t)edx << 32) | eax; #endif } static void cpuid(uint32_t out[4], uint32_t id) { #if defined(_MSC_VER) __cpuid((int *)out, id); #elif defined(__i386__) || defined(_M_IX86) __asm__ __volatile__("movl %%ebx, %1\n" "cpuid\n" "xchgl %1, %%ebx\n" : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(id)); #else __asm__ __volatile__("cpuid\n" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(id)); #endif } static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { #if defined(_MSC_VER) __cpuidex((int *)out, id, sid); #elif defined(__i386__) || defined(_M_IX86) __asm__ __volatile__("movl %%ebx, %1\n" "cpuid\n" "xchgl %1, %%ebx\n" : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(id), "c"(sid)); #else __asm__ __volatile__("cpuid\n" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(id), "c"(sid)); #endif } #endif enum cpu_feature { SSE2 = 1 << 0, SSSE3 = 1 << 1, SSE41 = 1 << 2, AVX = 1 << 3, AVX2 = 1 << 4, AVX512F = 1 << 5, AVX512VL = 1 << 6, /* ... */ UNDEFINED = 1 << 30 }; #if !defined(BLAKE3_TESTING) static /* Allow the variable to be controlled manually for testing */ #endif enum cpu_feature g_cpu_features = UNDEFINED; #if !defined(BLAKE3_TESTING) static #endif enum cpu_feature get_cpu_features() { if (g_cpu_features != UNDEFINED) { return g_cpu_features; } else { #if defined(IS_X86) uint32_t regs[4] = {0}; uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3]; (void)edx; enum cpu_feature features = 0; cpuid(regs, 0); const int max_id = *eax; cpuid(regs, 1); #if defined(__amd64__) || defined(_M_X64) features |= SSE2; #else if (*edx & (1UL << 26)) features |= SSE2; #endif if (*ecx & (1UL << 0)) features |= SSSE3; if (*ecx & (1UL << 19)) features |= SSE41; if (*ecx & (1UL << 27)) { // OSXSAVE const uint64_t mask = xgetbv(); if ((mask & 6) == 6) { // SSE and AVX states if (*ecx & (1UL << 28)) features |= AVX; if (max_id >= 7) { cpuidex(regs, 7, 0); if (*ebx & (1UL << 5)) features |= AVX2; if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm if (*ebx & (1UL << 31)) features |= AVX512VL; if (*ebx & (1UL << 16)) features |= AVX512F; } } } } g_cpu_features = features; return features; #else /* How to detect NEON? */ return 0; #endif } } void blake3_compress_in_place(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { #if defined(IS_X86) const enum cpu_feature features = get_cpu_features(); #if !defined(BLAKE3_NO_AVX512) if (features & AVX512VL) { blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); return; } #endif #if !defined(BLAKE3_NO_SSE41) if (features & SSE41) { blake3_compress_in_place_sse41(cv, block, block_len, counter, flags); return; } #endif #endif blake3_compress_in_place_portable(cv, block, block_len, counter, flags); } void blake3_compress_xof(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]) { #if defined(IS_X86) const enum cpu_feature features = get_cpu_features(); #if !defined(BLAKE3_NO_AVX512) if (features & AVX512VL) { blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); return; } #endif #if !defined(BLAKE3_NO_SSE41) if (features & SSE41) { blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out); return; } #endif #endif blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); } void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { #if defined(IS_X86) const enum cpu_feature features = get_cpu_features(); #if !defined(BLAKE3_NO_AVX512) if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); return; } #endif #if !defined(BLAKE3_NO_AVX2) if (features & AVX2) { blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); return; } #endif #if !defined(BLAKE3_NO_SSE41) if (features & SSE41) { blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); return; } #endif #endif #if defined(BLAKE3_USE_NEON) blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); return; #endif blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); } // The dynamically detected SIMD degree of the current platform. size_t blake3_simd_degree(void) { #if defined(IS_X86) const enum cpu_feature features = get_cpu_features(); #if !defined(BLAKE3_NO_AVX512) if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { return 16; } #endif #if !defined(BLAKE3_NO_AVX2) if (features & AVX2) { return 8; } #endif #if !defined(BLAKE3_NO_SSE41) if (features & SSE41) { return 4; } #endif #endif #if defined(BLAKE3_USE_NEON) return 4; #endif return 1; }