diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c index 19d0aeaaa..3e6b028fe 100644 --- a/frame/base/bli_arch.c +++ b/frame/base/bli_arch.c @@ -40,15 +40,21 @@ arch_t bli_arch_query_id( void ) { arch_t id = -1; - // Architecture families + // Architecture families. #ifdef BLIS_FAMILY_INTEL64 - bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); + id = bli_cpuid_query_id(); #endif #ifdef BLIS_FAMILY_AMD64 - bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); + id = bli_cpuid_query_id(); #endif - // Intel architectures + // Intel microarchitectures. +#ifdef BLIS_FAMILY_KNL + id = BLIS_ARCH_KNL; +#endif +#ifdef BLIS_FAMILY_KNC + id = BLIS_ARCH_KNC; +#endif #ifdef BLIS_FAMILY_HASWELL id = BLIS_ARCH_HASWELL; #endif @@ -58,14 +64,8 @@ arch_t bli_arch_query_id( void ) #ifdef BLIS_FAMILY_PENRYN id = BLIS_ARCH_PENRYN; #endif -#ifdef BLIS_FAMILY_KNL - id = BLIS_ARCH_KNL; -#endif -#ifdef BLIS_FAMILY_KNC - id = BLIS_ARCH_KNC; -#endif - // AMD architectures + // AMD microarchitectures. #ifdef BLIS_FAMILY_ZEN id = BLIS_ARCH_ZEN; #endif @@ -82,7 +82,7 @@ arch_t bli_arch_query_id( void ) id = BLIS_ARCH_BULLDOZER; #endif - // ARM architectures + // ARM microarchitectures. #ifdef BLIS_FAMILY_CORTEXA57 id = BLIS_ARCH_CORTEXA57; #endif @@ -93,7 +93,7 @@ arch_t bli_arch_query_id( void ) id = BLIS_ARCH_CORTEXA9; #endif - // IBM architectures + // IBM microarchitectures. #ifdef BLIS_FAMILY_POWER7 id = BLIS_ARCH_POWER7; #endif @@ -101,11 +101,14 @@ arch_t bli_arch_query_id( void ) id = BLIS_ARCH_BGQ; #endif - // Generic architecture + // Generic microarchitecture. #ifdef BLIS_FAMILY_GENERIC id = BLIS_ARCH_GENERIC; #endif + //printf( "blis_arch_query_id(): id = %u\n", id ); + //exit(1); + return id; } diff --git a/frame/base/bli_cpuid.c b/frame/base/bli_cpuid.c new file mode 100644 index 000000000..7b41c52bf --- /dev/null +++ b/frame/base/bli_cpuid.c @@ -0,0 +1,676 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +arch_t bli_cpuid_query_id( void ) +{ + uint32_t vendor, family, model, features; + + // Call the CPUID instruction and parse its results into a family id, + // model id, and a feature bit field. The return value encodes the + // vendor. + vendor = bli_cpuid_query( &family, &model, &features ); + + if ( vendor == VENDOR_INTEL ) + { + // Check for each Intel configuration that is enabled, check for that + // microarchitecture. 
We check from most recent to most dated. +#ifdef BLIS_CONFIG_KNL + if ( bli_cpuid_is_knl( family, model, features ) ) + return BLIS_ARCH_KNL; +#endif +#ifdef BLIS_CONFIG_HASWELL + if ( bli_cpuid_is_haswell( family, model, features ) ) + return BLIS_ARCH_HASWELL; +#endif +#ifdef BLIS_CONFIG_SANDYBRIDGE + if ( bli_cpuid_is_sandybridge( family, model, features ) ) + return BLIS_ARCH_SANDYBRIDGE; +#endif +#ifdef BLIS_CONFIG_PENRYN + if ( bli_cpuid_is_penryn( family, model, features ) ) + return BLIS_ARCH_PENRYN; +#endif + } + else if ( vendor == VENDOR_AMD ) + { + + // Check for each AMD configuration that is enabled, check for that + // microarchitecture. We check from most recent to most dated. +#ifdef BLIS_CONFIG_ZEN + if ( bli_cpuid_is_zen( family, model, features ) ) + return BLIS_ARCH_ZEN; +#endif +#ifdef BLIS_CONFIG_EXCAVATOR + if ( bli_cpuid_is_excavator( family, model, features ) ) + return BLIS_ARCH_EXCAVATOR; +#endif +#ifdef BLIS_CONFIG_STEAMROLLER + if ( bli_cpuid_is_steamroller( family, model, features ) ) + return BLIS_ARCH_STEAMROLLER; +#endif +#ifdef BLIS_CONFIG_PILEDRIVER + if ( bli_cpuid_is_piledriver( family, model, features ) ) + return BLIS_ARCH_PILEDRIVER; +#endif +#ifdef BLIS_CONFIG_BULLDOZER + if ( bli_cpuid_is_bulldozer( family, model, features ) ) + return BLIS_ARCH_BULLDOZER; +#endif + } + else if ( vendor == VENDOR_UNKNOWN ) + { + return BLIS_ARCH_GENERIC; + } + + return BLIS_ARCH_GENERIC; +} + +// ----------------------------------------------------------------------------- + +bool_t bli_cpuid_is_knl + ( + uint32_t family, + uint32_t model, + uint32_t features + ) +{ + // Check for expected CPU features. 
+ const uint32_t expected = FEATURE_AVX | + FEATURE_FMA3 | + FEATURE_AVX2 | + FEATURE_AVX512F | + FEATURE_AVX512PF; + + if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; + + return TRUE; +} + +bool_t bli_cpuid_is_haswell + ( + uint32_t family, + uint32_t model, + uint32_t features + ) +{ + // Check for expected CPU features. + const uint32_t expected = FEATURE_AVX | + FEATURE_FMA3 | + FEATURE_AVX2; + + if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; + + return TRUE; +} + +bool_t bli_cpuid_is_sandybridge + ( + uint32_t family, + uint32_t model, + uint32_t features + ) +{ + // Check for expected CPU features. + const uint32_t expected = FEATURE_AVX; + + if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; + + return TRUE; +} + +bool_t bli_cpuid_is_penryn + ( + uint32_t family, + uint32_t model, + uint32_t features + ) +{ + // Check for expected CPU features. + const uint32_t expected = FEATURE_SSE3 | + FEATURE_SSSE3; + + if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; + + return TRUE; +} + + +// ----------------------------------------------------------------------------- + +bool_t bli_cpuid_is_zen + ( + uint32_t family, + uint32_t model, + uint32_t features + ) +{ + // Check for expected CPU features. + const uint32_t expected = FEATURE_AVX | + FEATURE_FMA3 | + FEATURE_AVX2; + + if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; + + // All Zen cores have a family of 0x17. + if ( family != 0x17 ) return FALSE; + + // Finally, check for specific models: + // - 0x00-0xff (THIS NEEDS UPDATING) + const bool_t is_arch + = + ( 0x00 <= model && model <= 0xff ); + + if ( !is_arch ) return FALSE; + + return TRUE; +} + +bool_t bli_cpuid_is_excavator + ( + uint32_t family, + uint32_t model, + uint32_t features + ) +{ + // Check for expected CPU features. 
+ const uint32_t expected = FEATURE_AVX | + FEATURE_FMA3 | + FEATURE_AVX2; + + if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; + + // All Excavator cores have a family of 0x15. + if ( family != 0x15 ) return FALSE; + + // Finally, check for specific models: + // - 0x60-0x7f + const bool_t is_arch + = + ( 0x60 <= model && model <= 0x7f ); + + if ( !is_arch ) return FALSE; + + return TRUE; +} + +bool_t bli_cpuid_is_steamroller + ( + uint32_t family, + uint32_t model, + uint32_t features + ) +{ + // Check for expected CPU features. + const uint32_t expected = FEATURE_AVX | + FEATURE_FMA3 | + FEATURE_FMA4; + + if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; + + // All Steamroller cores have a family of 0x15. + if ( family != 0x15 ) return FALSE; + + // Finally, check for specific models: + // - 0x30-0x3f + const bool_t is_arch + = + ( 0x30 <= model && model <= 0x3f ); + + if ( !is_arch ) return FALSE; + + return TRUE; +} + +bool_t bli_cpuid_is_piledriver + ( + uint32_t family, + uint32_t model, + uint32_t features + ) +{ + // Check for expected CPU features. + const uint32_t expected = FEATURE_AVX | + FEATURE_FMA3 | + FEATURE_FMA4; + + if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; + + // All Piledriver cores have a family of 0x15. + if ( family != 0x15 ) return FALSE; + + // Finally, check for specific models: + // - 0x02 + // - 0x10-0x1f + const bool_t is_arch + = + model == 0x02 || ( 0x10 <= model && model <= 0x1f ); + + if ( !is_arch ) return FALSE; + + return TRUE; +} + +bool_t bli_cpuid_is_bulldozer + ( + uint32_t family, + uint32_t model, + uint32_t features + ) +{ + // Check for expected CPU features. + const uint32_t expected = FEATURE_AVX | + FEATURE_FMA4; + + if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; + + // All Bulldozer cores have a family of 0x15. 
+ if ( family != 0x15 ) return FALSE; + + // Finally, check for specific models: + // - 0x00 + // - 0x01 + const bool_t is_arch + = + ( model == 0x00 || model == 0x01 ); + + if ( !is_arch ) return FALSE; + + return TRUE; +} + +// ----------------------------------------------------------------------------- + +// +// This section of the file was based off of cpuid.cxx from TBLIS [1]. +// +// [1] https://github.com/devinamatthews/tblis +// + +/* + + Copyright (C) 2017, The University of Texas at Austin + Copyright (C) 2017, Devin Matthews + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) + +enum +{ + // input register(s) output register + FEATURE_MASK_SSE3 = (1u<< 0), // cpuid[eax=1] :ecx[0] + FEATURE_MASK_SSSE3 = (1u<< 9), // cpuid[eax=1] :ecx[9] + FEATURE_MASK_SSE41 = (1u<<19), // cpuid[eax=1] :ecx[19] + FEATURE_MASK_SSE42 = (1u<<20), // cpuid[eax=1] :ecx[20] + FEATURE_MASK_AVX = (1u<<28), // cpuid[eax=1] :ecx[28] + FEATURE_MASK_AVX2 = (1u<< 5), // cpuid[eax=7,ecx=0] :ebx[5] + FEATURE_MASK_FMA3 = (1u<<12), // cpuid[eax=1] :ecx[12] + FEATURE_MASK_FMA4 = (1u<<16), // cpuid[eax=0x80000001]:ecx[16] + FEATURE_MASK_AVX512F = (1u<<16), // cpuid[eax=7,ecx=0] :ebx[16] + FEATURE_MASK_AVX512DQ = (1u<<17), // cpuid[eax=7,ecx=0] :ebx[17] + FEATURE_MASK_AVX512PF = (1u<<26), // cpuid[eax=7,ecx=0] :ebx[26] + FEATURE_MASK_AVX512ER = (1u<<27), // cpuid[eax=7,ecx=0] :ebx[27] + FEATURE_MASK_AVX512CD = (1u<<28), // cpuid[eax=7,ecx=0] :ebx[28] + FEATURE_MASK_AVX512BW = (1u<<30), // cpuid[eax=7,ecx=0] :ebx[30] + FEATURE_MASK_AVX512VL = (1u<<31), // cpuid[eax=7,ecx=0] :ebx[31] + FEATURE_MASK_XGETBV = (1u<<26)| + (1u<<27), // cpuid[eax=1] :ecx[27:26] + XGETBV_MASK_XMM = 0x02u, // xcr0[1] + XGETBV_MASK_YMM = 0x04u, // xcr0[2] + XGETBV_MASK_ZMM = 0xe0u // xcr0[7:5] +}; + + +uint32_t bli_cpuid_query + ( + uint32_t* family, + uint32_t* model, + uint32_t* features + ) +{ + uint32_t eax, ebx, ecx, edx; + + uint32_t old_model = 0; + uint32_t old_family = 0; + uint32_t 
ext_model = 0; + uint32_t ext_family = 0; + + *family = 0; + *model = 0; + *features = 0; + + //fprintf( stderr, "checking cpuid\n" ); + + uint32_t cpuid_max = __get_cpuid_max( 0, 0 ); + uint32_t cpuid_max_ext = __get_cpuid_max( 0x80000000u, 0 ); + + //fprintf( stderr, "max cpuid leaf: %d\n", cpuid_max ); + //fprintf( stderr, "max extended cpuid leaf: %08x\n", cpuid_max_ext ); + + if ( cpuid_max < 1 ) return VENDOR_UNKNOWN; + + // The fourth '0' serves as the NULL-terminator for the vendor string. + uint32_t vendor_string[4] = { 0, 0, 0, 0 }; + + // This is actually a macro that modifies the last four operands, + // hence why they are not passed by address. + __cpuid( 0, eax, vendor_string[0], + vendor_string[2], + vendor_string[1] ); + + // Check extended feature bits for post-AVX2 features. + if ( cpuid_max >= 7 ) + { + // This is actually a macro that modifies the last four operands, + // hence why they are not passed by address. + __cpuid_count( 7, 0, eax, ebx, ecx, edx ); + + //fprintf( stderr, "cpuid leaf 7:\n" ); + //print_binary( eax ); + //print_binary( ebx ); + //print_binary( ecx ); + //print_binary( edx ); + + if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX2 ) ) *features |= FEATURE_AVX2; + if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX512F ) ) *features |= FEATURE_AVX512F; + if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX512DQ ) ) *features |= FEATURE_AVX512DQ; + if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX512PF ) ) *features |= FEATURE_AVX512PF; + if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX512ER ) ) *features |= FEATURE_AVX512ER; + if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX512CD ) ) *features |= FEATURE_AVX512CD; + if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX512BW ) ) *features |= FEATURE_AVX512BW; + if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX512VL ) ) *features |= FEATURE_AVX512VL; + } + + // Check extended processor info / features bits for AMD-specific features. 
+ if ( cpuid_max_ext >= 0x80000001u ) + { + // This is actually a macro that modifies the last four operands, + // hence why they are not passed by address. + __cpuid( 0x80000001u, eax, ebx, ecx, edx ); + + //fprintf(stderr, "extended cpuid leaf 0x80000001:\n"); + //print_binary(eax); + //print_binary(ebx); + //print_binary(ecx); + //print_binary(edx); + + if ( bli_cpuid_has_features( ecx, FEATURE_MASK_FMA4 ) ) *features |= FEATURE_FMA4; + } + + // Unconditionally check processor info / features bits. + { + // This is actually a macro that modifies the last four operands, + // hence why they are not passed by address. + __cpuid( 1, eax, ebx, ecx, edx ); + + //fprintf(stderr, "cpuid leaf 1:\n"); + //print_binary(eax); + //print_binary(ebx); + //print_binary(ecx); + //print_binary(edx); + + /* + cpuid(eax=1): eax[27:0] + + 3: 0 - Stepping + 7: 4 - Model + 11: 8 - Family + 13:12 - Processor Type + 19:16 - Extended Model + 27:20 - Extended Family + + Intel and AMD have suggested applications to display the family of a + CPU as the sum of the "Family" and the "Extended Family" fields shown + above, and the model as the sum of the "Model" and the 4-bit + left-shifted "Extended Model" fields. If "Family" is different than + 6 or 15, only the "Family" and "Model" fields should be used while the + "Extended Family" and "Extended Model" bits are reserved. If "Family" + is set to 15, then "Extended Family" and the 4-bit left-shifted + "Extended Model" should be added to the respective base values, and if + "Family" is set to 6, then only the 4-bit left-shifted "Extended Model" + should be added to "Model". + */ + + old_model = ( eax >> 4 ) & ( 0xF ); // bits 7:4 + old_family = ( eax >> 8 ) & ( 0xF ); // bits 11:8 + + ext_model = ( eax >> 16 ) & ( 0xF ); // bits 19:16 + ext_family = ( eax >> 20 ) & ( 0xFF ); // bits 27:20 + + // Set the display model and family values based on the original family + // value. See explanation above. 
+ if ( old_family == 6 ) + { + *model = ( ext_model << 4 ) + old_model; + *family = old_family; + } + else if ( old_family == 15 ) + { + *model = ( ext_model << 4 ) + old_model; + *family = ( ext_family ) + old_family; + } + else + { + *model = old_model; + *family = old_family; + } + + // Check for SSE, AVX, and FMA3 features. + if ( bli_cpuid_has_features( ecx, FEATURE_MASK_SSE3 ) ) *features |= FEATURE_SSE3; + if ( bli_cpuid_has_features( ecx, FEATURE_MASK_SSSE3 ) ) *features |= FEATURE_SSSE3; + if ( bli_cpuid_has_features( ecx, FEATURE_MASK_SSE41 ) ) *features |= FEATURE_SSE41; + if ( bli_cpuid_has_features( ecx, FEATURE_MASK_SSE42 ) ) *features |= FEATURE_SSE42; + if ( bli_cpuid_has_features( ecx, FEATURE_MASK_AVX ) ) *features |= FEATURE_AVX; + if ( bli_cpuid_has_features( ecx, FEATURE_MASK_FMA3 ) ) *features |= FEATURE_FMA3; + + // Check whether the hardware supports xsave/xrestor/xsetbv/xgetbv AND + // support for these is enabled by the OS. If so, then we proceed with + // checking that various register-state saving features are available. + if ( bli_cpuid_has_features( ecx, FEATURE_MASK_XGETBV ) ) + { + uint32_t xcr = 0; + + // Call xgetbv to get xcr0 (the extended control register) copied + // to [edx:eax]. This encodes whether software supports various + // register state-saving features. + __asm__ __volatile__ + ( + ".byte 0x0F, 0x01, 0xD0" + : "=a" (eax), + "=d" (edx) + : "c" (xcr) + : "cc" + ); + + //fprintf(stderr, "xcr0:\n"); + //print_binary(eax); + //print_binary(edx); + + //fprintf(stderr, "xgetbv: xmm: %d\n", bli_cpuid_has_features(eax, XGETBV_MASK_XMM)); + //fprintf(stderr, "xgetbv: ymm: %d\n", bli_cpuid_has_features(eax, XGETBV_MASK_XMM| + // XGETBV_MASK_YMM)); + //fprintf(stderr, "xgetbv: zmm: %d\n", bli_cpuid_has_features(eax, XGETBV_MASK_XMM| + // XGETBV_MASK_YMM| + // XGETBV_MASK_ZMM)); + + // The OS can manage the state of 512-bit zmm (AVX-512) registers + // only if the xcr[7:5] bits are set. 
If they are not set, then + // clear all feature bits related to AVX-512. + if ( !bli_cpuid_has_features( eax, XGETBV_MASK_XMM | + XGETBV_MASK_YMM | + XGETBV_MASK_ZMM ) ) + { + *features &= ~( FEATURE_AVX512F | + FEATURE_AVX512DQ | + FEATURE_AVX512PF | + FEATURE_AVX512ER | + FEATURE_AVX512CD | + FEATURE_AVX512BW | + FEATURE_AVX512VL ); + } + + // The OS can manage the state of 256-bit ymm (AVX) registers + // only if the xcr[2] bit is set. If it is not set, then + // clear all feature bits related to AVX. + if ( !bli_cpuid_has_features( eax, XGETBV_MASK_XMM | + XGETBV_MASK_YMM ) ) + { + *features &= ~( FEATURE_AVX | + FEATURE_AVX2 | + FEATURE_FMA3 | + FEATURE_FMA4 ); + } + + // The OS can manage the state of 128-bit xmm (SSE) registers + // only if the xcr[1] bit is set. If it is not set, then + // clear all feature bits related to SSE (which means the + // entire bitfield is clear). + if ( !bli_cpuid_has_features( eax, XGETBV_MASK_XMM ) ) + { + *features = 0; + } + } + else + { + // If the hardware does not support xsave/xrestor/xsetbv/xgetbv, + // OR these features are not enabled by the OS, then we clear + // the bitfield, because it means that not even xmm support is + // present. 
+
+			//fprintf(stderr, "xgetbv: no\n");
+			*features = 0; // Clear the caller's bitfield; assigning to 'features' would only null the local pointer.
+		}
+	}
+
+	//fprintf(stderr, "vendor: %12s\n", ( char* )vendor_string);
+	//fprintf(stderr, "family: %d\n", *family);
+	//fprintf(stderr, "model: %d\n", *model);
+	//fprintf(stderr, "sse3: %d\n", bli_cpuid_has_features(*features, FEATURE_SSE3));
+	//fprintf(stderr, "ssse3: %d\n", bli_cpuid_has_features(*features, FEATURE_SSSE3));
+	//fprintf(stderr, "sse4.1: %d\n", bli_cpuid_has_features(*features, FEATURE_SSE41));
+	//fprintf(stderr, "sse4.2: %d\n", bli_cpuid_has_features(*features, FEATURE_SSE42));
+	//fprintf(stderr, "avx: %d\n", bli_cpuid_has_features(*features, FEATURE_AVX));
+	//fprintf(stderr, "avx2: %d\n", bli_cpuid_has_features(*features, FEATURE_AVX2));
+	//fprintf(stderr, "fma3: %d\n", bli_cpuid_has_features(*features, FEATURE_FMA3));
+	//fprintf(stderr, "fma4: %d\n", bli_cpuid_has_features(*features, FEATURE_FMA4));
+	//fprintf(stderr, "avx512f: %d\n", bli_cpuid_has_features(*features, FEATURE_AVX512F));
+	//fprintf(stderr, "avx512pf: %d\n", bli_cpuid_has_features(*features, FEATURE_AVX512PF));
+	//fprintf(stderr, "avx512dq: %d\n", bli_cpuid_has_features(*features, FEATURE_AVX512DQ));
+
+	// Check the vendor string and return a value to indicate Intel or AMD.
+	if      ( strcmp( ( char* )vendor_string, "AuthenticAMD" ) == 0 )
+		return VENDOR_AMD;
+	else if ( strcmp( ( char* )vendor_string, "GenuineIntel" ) == 0 )
+		return VENDOR_INTEL;
+	else
+		return VENDOR_UNKNOWN;
+}
+
+#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM)
+
+// Query /proc/cpuinfo for the ARM architecture model, the CPU part number,
+// and NEON/ASIMD support, writing them through model, part, and features
+// (defaults: MODEL_UNKNOWN, 0, 0). Always returns VENDOR_ARM.
+int get_cpu_type( int* model, int* part, int* features )
+{
+	char         proc[ 256 ]  = "";
+	char         ptno[ 256 ]  = "";
+	char         feat[ 2048 ] = "";
+	char*        pos;
+	unsigned int part_id      = 0;
+
+	*model    = MODEL_UNKNOWN;
+	*part     = 0;
+	*features = 0;
+
+	FILE* fd1 = popen( "grep -m 1 Processor /proc/cpuinfo", "r" );
+	if ( !fd1 ) return VENDOR_ARM;
+	FILE* fd2 = popen( "grep -m 1 'CPU part' /proc/cpuinfo", "r" );
+	if ( !fd2 ) { pclose( fd1 ); return VENDOR_ARM; }
+	FILE* fd3 = popen( "grep -m 1 Features /proc/cpuinfo", "r" );
+	if ( !fd3 ) { pclose( fd1 ); pclose( fd2 ); return VENDOR_ARM; }
+
+	// Each grep emits at most one line (-m 1). If grep printed nothing,
+	// fgets() returns NULL and the buffer is reset to the empty string.
+	if ( fgets( proc, sizeof( proc ), fd1 ) == NULL ) proc[ 0 ] = '\0';
+	if ( fgets( ptno, sizeof( ptno ), fd2 ) == NULL ) ptno[ 0 ] = '\0';
+	if ( fgets( feat, sizeof( feat ), fd3 ) == NULL ) feat[ 0 ] = '\0';
+
+	pclose( fd1 );
+	pclose( fd2 );
+	pclose( fd3 );
+
+	if ( strstr( feat, "neon"  ) != NULL ||
+	     strstr( feat, "asimd" ) != NULL )
+		*features |= FEATURE_NEON;
+
+	if      ( strstr( proc, "ARMv7"   ) != NULL )
+		*model = MODEL_ARMV7;
+	else if ( strstr( proc, "AArch64" ) != NULL )
+		*model = MODEL_ARMV8;
+
+	// Parse the hexadecimal part number; *part stays 0 if none is found.
+	pos = strstr( ptno, "0x" );
+	if ( pos != NULL && sscanf( pos, "%x", &part_id ) == 1 ) *part = ( int )part_id;
+
+	return VENDOR_ARM;
+}
+
+#endif
diff --git a/frame/base/bli_cpuid.h b/frame/base/bli_cpuid.h
new file mode 100644
index 000000000..a9c99fef4
--- /dev/null
+++ b/frame/base/bli_cpuid.h
@@ -0,0 +1,151 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_CPUID_H +#define BLIS_CPUID_H + +arch_t bli_cpuid_query_id( void ); + +bool_t bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features ); +bool_t bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features ); +bool_t bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features ); +bool_t bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features ); + +bool_t bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features ); +bool_t bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features ); +bool_t bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t features ); +bool_t bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features ); +bool_t bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features ); + +uint32_t bli_cpuid_query( uint32_t* family, uint32_t* model, uint32_t* features ); + +// ----------------------------------------------------------------------------- + +// +// This section of the file was based off of cpuid.hpp from TBLIS [1]. +// +// [1] https://github.com/devinamatthews/tblis +// + +/* + + Copyright (C) 2017, The University of Texas at Austin + Copyright (C) 2017, Devin Matthews + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +static bool_t bli_cpuid_has_features( uint32_t have, uint32_t want ) +{ + return ( have & want ) == want; +} + +// ----------------------------------------------------------------------------- + +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) + +#include "cpuid.h" + +enum +{ + VENDOR_INTEL, + VENDOR_AMD, + VENDOR_UNKNOWN +}; +enum +{ + FEATURE_SSE3 = 0x0001, + FEATURE_SSSE3 = 0x0002, + FEATURE_SSE41 = 0x0004, + FEATURE_SSE42 = 0x0008, + FEATURE_AVX = 0x0010, + FEATURE_AVX2 = 0x0020, + FEATURE_FMA3 = 0x0040, + FEATURE_FMA4 = 0x0080, + FEATURE_AVX512F = 0x0100, + FEATURE_AVX512DQ = 0x0200, + FEATURE_AVX512PF = 0x0400, + FEATURE_AVX512ER = 0x0800, + FEATURE_AVX512CD = 0x1000, + FEATURE_AVX512BW = 0x2000, + FEATURE_AVX512VL = 0x4000 +}; + +#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) + +enum +{ + VENDOR_ARM, + VENDOR_UNKNOWN +}; +enum +{ + MODEL_ARMV7, + MODEL_ARMV8, + MODEL_UNKNOWN +}; +enum +{ + FEATURE_NEON = 0x1 +}; + +#endif + + + +#endif + diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h index e8c046192..d0cfbc6aa 100644 --- a/frame/include/bli_arch_config.h +++ b/frame/include/bli_arch_config.h @@ -42,6 
+42,12 @@ // -- Intel64 architectures -- +#ifdef BLIS_CONFIG_KNL +CNTX_INIT_PROTS( knl ) +#endif +#ifdef BLIS_CONFIG_KNC +CNTX_INIT_PROTS( knc ) +#endif #ifdef BLIS_CONFIG_HASWELL CNTX_INIT_PROTS( haswell ) #endif @@ -52,13 +58,6 @@ CNTX_INIT_PROTS( sandybridge ) CNTX_INIT_PROTS( penryn ) #endif -#ifdef BLIS_CONFIG_KNL -CNTX_INIT_PROTS( knl ) -#endif -#ifdef BLIS_CONFIG_KNC -CNTX_INIT_PROTS( knc ) -#endif - // -- AMD64 architectures -- #ifdef BLIS_CONFIG_ZEN @@ -109,12 +108,23 @@ CNTX_INIT_PROTS( generic ) // -- Architecture family-specific headers ------------------------------------- // -// -- Intel64 architectures -- +// -- x86_64 families -- #ifdef BLIS_FAMILY_INTEL64 #include "bli_family_intel64.h" #endif +#ifdef BLIS_FAMILY_AMD64 +#include "bli_family_amd64.h" +#endif + +// -- Intel64 architectures -- +#ifdef BLIS_FAMILY_KNL +#include "bli_family_knl.h" +#endif +#ifdef BLIS_FAMILY_KNC +#include "bli_family_knc.h" +#endif #ifdef BLIS_FAMILY_HASWELL #include "bli_family_haswell.h" #endif @@ -125,19 +135,8 @@ CNTX_INIT_PROTS( generic ) #include "bli_family_penryn.h" #endif -#ifdef BLIS_FAMILY_KNL -#include "bli_family_knl.h" -#endif -#ifdef BLIS_FAMILY_KNC -#include "bli_family_knc.h" -#endif - // -- AMD64 architectures -- -#ifdef BLIS_FAMILY_AMD64 -#include "bli_family_amd64.h" -#endif - #ifdef BLIS_FAMILY_ZEN #include "bli_family_zen.h" #endif @@ -188,6 +187,12 @@ CNTX_INIT_PROTS( generic ) // -- Intel64 architectures -- +#ifdef BLIS_KERNELS_KNL +#include "bli_kernels_knl.h" +#endif +#ifdef BLIS_KERNELS_KNC +#include "bli_kernels_knc.h" +#endif #ifdef BLIS_KERNELS_HASWELL #include "bli_kernels_haswell.h" #endif @@ -198,13 +203,6 @@ CNTX_INIT_PROTS( generic ) #include "bli_kernels_penryn.h" #endif -#ifdef BLIS_KERNELS_KNL -#include "bli_kernels_knl.h" -#endif -#ifdef BLIS_KERNELS_KNC -#include "bli_kernels_knc.h" -#endif - // -- AMD64 architectures -- //#ifdef BLIS_KERNELS_ZEN diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 
34b0fb7b3..c15b0ed17 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -815,11 +815,11 @@ typedef enum typedef enum { // Intel - BLIS_ARCH_HASWELL = 0, + BLIS_ARCH_KNL = 0, + BLIS_ARCH_KNC, + BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, BLIS_ARCH_PENRYN, - BLIS_ARCH_KNL, - BLIS_ARCH_KNC, // AMD BLIS_ARCH_ZEN, diff --git a/frame/include/blis.h b/frame/include/blis.h index 01d809ac8..84308bf45 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -123,6 +123,7 @@ extern "C" { #include "bli_cntl.h" #include "bli_info.h" #include "bli_arch.h" +#include "bli_cpuid.h" // -- Level-0 operations --