Source code
Revision control
Copy as Markdown
Other Tools
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/targets.h"
#include <stdint.h>
#include <stdio.h>
#include "hwy/base.h"
#include "hwy/detect_targets.h"
#include "hwy/highway.h"
#include "hwy/x86_cpuid.h"
#if HWY_ARCH_X86
#include <xmmintrin.h>
#elif (HWY_ARCH_ARM || HWY_ARCH_PPC || HWY_ARCH_S390X || HWY_ARCH_RISCV || \
HWY_ARCH_LOONGARCH) && \
HWY_OS_LINUX
// sys/auxv.h does not always include asm/hwcap.h, or define HWCAP*, hence we
// still include this directly. See #1199.
#if HWY_HAVE_ASM_HWCAP
#include <asm/hwcap.h>
#endif
#if HWY_HAVE_AUXV
#include <sys/auxv.h>
#endif
#endif // HWY_ARCH_*
#if HWY_OS_APPLE
#include <sys/sysctl.h>
#include <sys/utsname.h>
#endif // HWY_OS_APPLE
namespace hwy {
#if HWY_OS_APPLE
// Queries the sysctl key `feature_name` as an int. Returns true only if
// sysctlbyname() succeeds and the reported value is nonzero; a failed query
// (e.g. an unknown key) is treated as "feature not present".
static HWY_INLINE HWY_MAYBE_UNUSED bool HasCpuFeature(
    const char* feature_name) {
  int value = 0;
  size_t value_size = sizeof(int);
  const int status =
      sysctlbyname(feature_name, &value, &value_size, nullptr, 0);
  if (status != 0) {
    return false;
  }
  return value != 0;
}
// Parses a run of decimal digits starting at `ptr` into `parsed_val`.
//
// On return, `ptr` has been advanced past the digits that were consumed (it
// stops at the first non-digit or at the terminating NUL).
// Returns true if at least one digit was consumed and the value fits in 32
// bits. If no digit was found, `parsed_val` is set to 0 and false is returned.
// If the value exceeds 0xFFFFFFFF, false is returned immediately, `parsed_val`
// is left untouched and `ptr` points at the digit that caused the overflow.
static HWY_INLINE HWY_MAYBE_UNUSED bool ParseU32(const char*& ptr,
                                                 uint32_t& parsed_val) {
  const char* const first = ptr;
  // 64-bit accumulator so that the overflow check below cannot itself wrap.
  uint64_t accum = 0;

  while (*ptr >= '0' && *ptr <= '9') {
    accum = (accum * 10u) + static_cast<unsigned>(*ptr - '0');
    if (accum > 0xFFFFFFFFu) {
      return false;
    }
    ++ptr;
  }

  parsed_val = static_cast<uint32_t>(accum);
  return ptr != first;
}
// Returns true if the kernel release string reported by uname() begins with
// "<major>.<minor>" and that version is 21.3 or later. Returns false if
// uname() fails or the release string cannot be parsed.
static HWY_INLINE HWY_MAYBE_UNUSED bool IsMacOs12_2OrLater() {
  utsname sys_info;
  ZeroBytes(&sys_info, sizeof(utsname));
  if (uname(&sys_info) != 0) return false;

  const char* cursor = sys_info.release;
  if (cursor == nullptr) return false;

  // Major version, which must be followed by a '.' separator.
  uint32_t darwin_major;
  if (!ParseU32(cursor, darwin_major) || *cursor != '.') return false;
  ++cursor;  // skip the '.'

  // Minor version; anything after it is ignored.
  uint32_t darwin_minor;
  if (!ParseU32(cursor, darwin_minor)) return false;

  // We are running on macOS 12.2 or later if the Darwin kernel version is 21.3
  // or later
  if (darwin_major != 21) return darwin_major > 21;
  return darwin_minor >= 3;
}
#endif // HWY_OS_APPLE
#if HWY_ARCH_X86 && HWY_HAVE_RUNTIME_DISPATCH
namespace x86 {
// Returns the lower 32 bits of extended control register 0.
// Requires CPU support for "OSXSAVE" (see below).
static uint32_t ReadXCR0() {
#if HWY_COMPILER_MSVC
  const auto xcr0_full = _xgetbv(0);
  return static_cast<uint32_t>(xcr0_full);
#else   // !HWY_COMPILER_MSVC
  uint32_t eax_out, edx_out;
  const uint32_t xcr_index = 0;
  // The instruction is emitted as raw opcode bytes. It takes the register
  // index in ECX and writes its result to EAX (low half) and EDX (high half);
  // only the low half is returned.
  asm volatile(".byte 0x0F, 0x01, 0xD0"
               : "=a"(eax_out), "=d"(edx_out)
               : "c"(xcr_index));
  return eax_out;
#endif  // HWY_COMPILER_MSVC
}
// Arbitrary bit indices indicating which instruction set extensions are
// supported. Use enum to ensure values are distinct.
// Each enumerator is a bit position within the uint64_t built by
// FlagsFromCPUID(); Bit() converts an enumerator into its single-bit mask.
enum class FeatureIndex : uint32_t {
// Features that make up the SSE2..AVX2 groups.
kSSE = 0,
kSSE2,
kSSE3,
kSSSE3,
kSSE41,
kSSE42,
kCLMUL,
kAES,
kAVX,
kAVX2,
kF16C,
kFMA,
kLZCNT,
kBMI,
kBMI2,
// Features that make up the AVX3 group.
kAVX512F,
kAVX512VL,
kAVX512CD,
kAVX512DQ,
kAVX512BW,
// Additional features used by the AVX3_DL/AVX3_ZEN4/AVX3_SPR groups.
kAVX512FP16,
kAVX512BF16,
kVNNI,
kVPCLMULQDQ,
kVBMI,
kVBMI2,
kVAES,
kPOPCNTDQ,
kBITALG,
kGFNI,
// Features used by the AVX10 group.
kAVX10,
kAPX,
// Not a feature: one past the last valid index, used only for the size check
// below. Must remain the last enumerator.
kSentinel
};
// All indices must fit into the 64-bit flags word.
static_assert(static_cast<size_t>(FeatureIndex::kSentinel) < 64,
"Too many bits for u64");
// Returns a 64-bit mask in which only the bit at position `index` is set.
static HWY_INLINE constexpr uint64_t Bit(FeatureIndex index) {
  return uint64_t{1} << static_cast<uint32_t>(index);
}
// Returns bit array of FeatureIndex from CPUID feature flags.
// Leaf 1 and leaf 0x80000001 are always queried; leaf 7 (subleaves 0 and 1)
// is only queried if leaf 0 reports a maximum level of at least 7.
static uint64_t FlagsFromCPUID() {
  uint64_t detected = 0;  // return value

  // Sets the bit for `feature` in `detected` if bit `pos` of `reg` is set.
  const auto add_if_set = [&detected](uint32_t reg, int pos,
                                      FeatureIndex feature) {
    if (IsBitSet(reg, pos)) {
      detected |= Bit(feature);
    }
  };

  uint32_t regs[4];

  Cpuid(0, 0, regs);
  const uint32_t highest_leaf = regs[0];

  // Standard feature flags
  Cpuid(1, 0, regs);
  add_if_set(regs[3], 25, FeatureIndex::kSSE);
  add_if_set(regs[3], 26, FeatureIndex::kSSE2);
  add_if_set(regs[2], 0, FeatureIndex::kSSE3);
  add_if_set(regs[2], 1, FeatureIndex::kCLMUL);
  add_if_set(regs[2], 9, FeatureIndex::kSSSE3);
  add_if_set(regs[2], 12, FeatureIndex::kFMA);
  add_if_set(regs[2], 19, FeatureIndex::kSSE41);
  add_if_set(regs[2], 20, FeatureIndex::kSSE42);
  add_if_set(regs[2], 25, FeatureIndex::kAES);
  add_if_set(regs[2], 28, FeatureIndex::kAVX);
  add_if_set(regs[2], 29, FeatureIndex::kF16C);

  // Extended feature flags
  Cpuid(0x80000001U, 0, regs);
  add_if_set(regs[2], 5, FeatureIndex::kLZCNT);

  if (highest_leaf < 7) {
    return detected;
  }

  // Extended features, subleaf 0
  Cpuid(7, 0, regs);
  add_if_set(regs[1], 3, FeatureIndex::kBMI);
  add_if_set(regs[1], 5, FeatureIndex::kAVX2);
  add_if_set(regs[1], 8, FeatureIndex::kBMI2);

  add_if_set(regs[1], 16, FeatureIndex::kAVX512F);
  add_if_set(regs[1], 17, FeatureIndex::kAVX512DQ);
  add_if_set(regs[1], 28, FeatureIndex::kAVX512CD);
  add_if_set(regs[1], 30, FeatureIndex::kAVX512BW);
  add_if_set(regs[1], 31, FeatureIndex::kAVX512VL);

  add_if_set(regs[2], 1, FeatureIndex::kVBMI);
  add_if_set(regs[2], 6, FeatureIndex::kVBMI2);
  add_if_set(regs[2], 8, FeatureIndex::kGFNI);
  add_if_set(regs[2], 9, FeatureIndex::kVAES);
  add_if_set(regs[2], 10, FeatureIndex::kVPCLMULQDQ);
  add_if_set(regs[2], 11, FeatureIndex::kVNNI);
  add_if_set(regs[2], 12, FeatureIndex::kBITALG);
  add_if_set(regs[2], 14, FeatureIndex::kPOPCNTDQ);

  add_if_set(regs[3], 23, FeatureIndex::kAVX512FP16);

  // Extended features, subleaf 1
  Cpuid(7, 1, regs);
  add_if_set(regs[0], 5, FeatureIndex::kAVX512BF16);
  add_if_set(regs[3], 19, FeatureIndex::kAVX10);
  add_if_set(regs[3], 21, FeatureIndex::kAPX);

  return detected;
}
// Each Highway target requires a 'group' of multiple features/flags.
// A group is the OR of the FeatureIndex bits it needs plus the group of the
// next-lower target, so a target is only reported if everything beneath it is
// also present. DetectTargets() tests `(flags & group) == group`.
static constexpr uint64_t kGroupSSE2 =
Bit(FeatureIndex::kSSE) | Bit(FeatureIndex::kSSE2);
static constexpr uint64_t kGroupSSSE3 =
Bit(FeatureIndex::kSSE3) | Bit(FeatureIndex::kSSSE3) | kGroupSSE2;
// SSE4 additionally requires CLMUL and AES unless HWY_DISABLE_PCLMUL_AES is
// defined.
#ifdef HWY_DISABLE_PCLMUL_AES
static constexpr uint64_t kGroupSSE4 =
Bit(FeatureIndex::kSSE41) | Bit(FeatureIndex::kSSE42) | kGroupSSSE3;
#else
static constexpr uint64_t kGroupSSE4 =
Bit(FeatureIndex::kSSE41) | Bit(FeatureIndex::kSSE42) |
Bit(FeatureIndex::kCLMUL) | Bit(FeatureIndex::kAES) | kGroupSSSE3;
#endif // HWY_DISABLE_PCLMUL_AES
// We normally assume BMI/BMI2/FMA are available if AVX2 is. This allows us to
// use BZHI and (compiler-generated) MULX. However, VirtualBox lacks them.
// Defining HWY_DISABLE_BMI2_FMA therefore provides the option of avoiding
// using and requiring these so AVX2 can still be used.
#ifdef HWY_DISABLE_BMI2_FMA
static constexpr uint64_t kGroupBMI2_FMA = 0;
#else
static constexpr uint64_t kGroupBMI2_FMA = Bit(FeatureIndex::kBMI) |
Bit(FeatureIndex::kBMI2) |
Bit(FeatureIndex::kFMA);
#endif
// Likewise, F16C is required for AVX2 unless HWY_DISABLE_F16C is defined.
#ifdef HWY_DISABLE_F16C
static constexpr uint64_t kGroupF16C = 0;
#else
static constexpr uint64_t kGroupF16C = Bit(FeatureIndex::kF16C);
#endif
static constexpr uint64_t kGroupAVX2 =
Bit(FeatureIndex::kAVX) | Bit(FeatureIndex::kAVX2) |
Bit(FeatureIndex::kLZCNT) | kGroupBMI2_FMA | kGroupF16C | kGroupSSE4;
static constexpr uint64_t kGroupAVX3 =
Bit(FeatureIndex::kAVX512F) | Bit(FeatureIndex::kAVX512VL) |
Bit(FeatureIndex::kAVX512DQ) | Bit(FeatureIndex::kAVX512BW) |
Bit(FeatureIndex::kAVX512CD) | kGroupAVX2;
static constexpr uint64_t kGroupAVX3_DL =
Bit(FeatureIndex::kVNNI) | Bit(FeatureIndex::kVPCLMULQDQ) |
Bit(FeatureIndex::kVBMI) | Bit(FeatureIndex::kVBMI2) |
Bit(FeatureIndex::kVAES) | Bit(FeatureIndex::kPOPCNTDQ) |
Bit(FeatureIndex::kBITALG) | Bit(FeatureIndex::kGFNI) | kGroupAVX3;
static constexpr uint64_t kGroupAVX3_ZEN4 =
Bit(FeatureIndex::kAVX512BF16) | kGroupAVX3_DL;
static constexpr uint64_t kGroupAVX3_SPR =
Bit(FeatureIndex::kAVX512FP16) | kGroupAVX3_ZEN4;
// Note that the AVX10 group builds on kGroupAVX2, not on any AVX3 group: it
// does not require the individual AVX512* flags.
static constexpr uint64_t kGroupAVX10 =
Bit(FeatureIndex::kAVX10) | Bit(FeatureIndex::kAPX) |
Bit(FeatureIndex::kVPCLMULQDQ) | Bit(FeatureIndex::kVAES) |
Bit(FeatureIndex::kGFNI) | kGroupAVX2;
// Returns the bitfield of x86 targets (HWY_SSE2 .. HWY_AVX10_2) usable on the
// current CPU and OS. Works in three phases whose order matters:
// 1) set a target bit for each group whose CPUID flags are all present,
// 2) clear the AVX2-and-better / AVX3-and-better bits if the OS does not
//    preserve the corresponding register state (XSAVE/XCR0 checks),
// 3) add HWY_AVX3_ZEN4 on AMD CPUs if HWY_AVX3_DL survived phase 2.
static int64_t DetectTargets() {
int64_t bits = 0; // return value of supported targets.
HWY_IF_CONSTEXPR(HWY_ARCH_X86_64) {
bits |= HWY_SSE2; // always present in x64
}
const uint64_t flags = FlagsFromCPUID();
// Set target bit(s) if all their group's flags are all set.
if ((flags & kGroupAVX3_SPR) == kGroupAVX3_SPR) {
bits |= HWY_AVX3_SPR;
}
if ((flags & kGroupAVX3_DL) == kGroupAVX3_DL) {
bits |= HWY_AVX3_DL;
}
if ((flags & kGroupAVX3) == kGroupAVX3) {
bits |= HWY_AVX3;
}
if ((flags & kGroupAVX2) == kGroupAVX2) {
bits |= HWY_AVX2;
}
if ((flags & kGroupSSE4) == kGroupSSE4) {
bits |= HWY_SSE4;
}
if ((flags & kGroupSSSE3) == kGroupSSSE3) {
bits |= HWY_SSSE3;
}
// On 32-bit x86, SSE2 is not guaranteed and must be detected via CPUID.
HWY_IF_CONSTEXPR(HWY_ARCH_X86_32) {
if ((flags & kGroupSSE2) == kGroupSSE2) {
bits |= HWY_SSE2;
}
}
uint32_t abcd[4];
// CPUID leaf 0x24 is only queried once the AVX10 feature flag (and the rest
// of its group) has been reported.
if ((flags & kGroupAVX10) == kGroupAVX10) {
Cpuid(0x24, 0, abcd);
// AVX10 version is in lower 8 bits of abcd[1]
const uint32_t avx10_ver = abcd[1] & 0xFFu;
// 512-bit vectors are supported if avx10_ver >= 1 is true and bit 18 of
// abcd[1] is set
const bool has_avx10_with_512bit_vectors =
(avx10_ver >= 1) && IsBitSet(abcd[1], 18);
if (has_avx10_with_512bit_vectors) {
// AVX10.1 or later with support for 512-bit vectors implies support for
// the AVX3/AVX3_DL/AVX3_SPR targets
bits |= (HWY_AVX3_SPR | HWY_AVX3_DL | HWY_AVX3);
if (avx10_ver >= 2) {
// AVX10.2 is supported if avx10_ver >= 2 is true
bits |= HWY_AVX10_2;
}
}
}
// Clear AVX2/AVX3 bits if the CPU or OS does not support XSAVE - otherwise,
// YMM/ZMM registers are not preserved across context switches.
// The lower 128 bits of XMM0-XMM15 are guaranteed to be preserved across
// context switches on x86_64
// The following OS's are known to preserve the lower 128 bits of XMM
// registers across context switches on x86 CPUs that support SSE (even in
// 32-bit mode):
// - Windows 2000 or later
// - Linux 2.4.0 or later
// - Mac OS X 10.4 or later
// - FreeBSD 4.4 or later
// - NetBSD 1.6 or later
// - OpenBSD 3.5 or later
// - UnixWare 7 Release 7.1.1 or later
// - Solaris 9 4/04 or later
Cpuid(1, 0, abcd);
const bool has_xsave = IsBitSet(abcd[2], 26);
const bool has_osxsave = IsBitSet(abcd[2], 27);
// Mask of HWY_AVX2 and all lower-valued target bits. Clearing it removes AVX2
// together with the AVX3*/AVX10 targets set above, but keeps the SSE targets.
constexpr int64_t min_avx2 = HWY_AVX2 | (HWY_AVX2 - 1);
if (has_xsave && has_osxsave) {
#if HWY_OS_APPLE
// On macOS, check for AVX3 XSAVE support by checking that we are running on
// macOS 12.2 or later and HasCpuFeature("hw.optional.avx512f") returns true.
// There is a bug in macOS 12.1 or earlier that can cause ZMM16-ZMM31, the
// upper 256 bits of the ZMM registers, and K0-K7 (the AVX512 mask
// registers) to not be properly preserved across a context switch on
// x86_64 CPU's with AVX3 support.
// In addition to that bug, bits 5, 6, and 7 of XCR0 can be set to 0 on
// x86_64 CPU's with AVX3 support on macOS until the first AVX512 instruction
// is executed, as macOS only preserves ZMM16-ZMM31, the upper 256 bits of
// the ZMM registers, and K0-K7 across a context switch on threads that have
// executed an AVX512 instruction.
// Checking for AVX3 XSAVE support on macOS using
// HasCpuFeature("hw.optional.avx512f") avoids false negative results
// on x86_64 CPU's that have AVX3 support.
const bool have_avx3_xsave_support =
IsMacOs12_2OrLater() && HasCpuFeature("hw.optional.avx512f");
#endif
// Only read XCR0 after OSXSAVE has been confirmed (see ReadXCR0).
const uint32_t xcr0 = ReadXCR0();
// Same construction as min_avx2: HWY_AVX3 and all lower-valued target bits.
constexpr int64_t min_avx3 = HWY_AVX3 | (HWY_AVX3 - 1);
// XMM/YMM
if (!IsBitSet(xcr0, 1) || !IsBitSet(xcr0, 2)) {
// Clear the AVX2/AVX3 bits if XMM/YMM XSAVE is not enabled
bits &= ~min_avx2;
}
#if !HWY_OS_APPLE
// On OS's other than macOS, check for AVX3 XSAVE support by checking that
// bits 5, 6, and 7 of XCR0 are set.
const bool have_avx3_xsave_support =
IsBitSet(xcr0, 5) && IsBitSet(xcr0, 6) && IsBitSet(xcr0, 7);
#endif
// opmask, ZMM lo/hi
if (!have_avx3_xsave_support) {
bits &= ~min_avx3;
}
} else { // !has_xsave || !has_osxsave
// Clear the AVX2/AVX3 bits if the CPU or OS does not support XSAVE
bits &= ~min_avx2;
}
// This is mainly to work around the slow Zen4 CompressStore. It's unclear
// whether subsequent AMD models will be affected; assume yes.
// Must come after the XSAVE checks: it relies on HWY_AVX3_DL still being set.
if ((bits & HWY_AVX3_DL) && (flags & kGroupAVX3_ZEN4) == kGroupAVX3_ZEN4 &&
IsAMD()) {
bits |= HWY_AVX3_ZEN4;
}
return bits;
}
} // namespace x86
#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
namespace arm {
#if HWY_ARCH_ARM_A64 && !HWY_OS_APPLE && \
(HWY_COMPILER_GCC || HWY_COMPILER_CLANG) && \
((HWY_TARGETS & HWY_ALL_SVE) != 0)
HWY_PUSH_ATTRIBUTES("+sve")
// Returns the extra fixed-size SVE target implied by the vector length read
// at runtime: HWY_SVE_256 for 32-byte vectors, HWY_SVE2_128 for 16-byte
// vectors when `detected_targets` already contains HWY_SVE2, otherwise 0.
static int64_t DetectAdditionalSveTargets(int64_t detected_targets) {
  uint64_t bytes_per_vector;

  // Use inline assembly instead of svcntb_pat(SV_ALL) as GCC or Clang might
  // possibly optimize a svcntb_pat(SV_ALL) call to a constant if the
  // -msve-vector-bits option is specified
  asm("cntb %0" : "=r"(bytes_per_vector)::);

  if (bytes_per_vector == 32) {
    return HWY_SVE_256;
  }

  const bool has_sve2 = (detected_targets & HWY_SVE2) != 0;
  if (has_sve2 && bytes_per_vector == 16) {
    return HWY_SVE2_128;
  }

  return 0;
}
HWY_POP_ATTRIBUTES
#endif
// Returns the bitfield of Arm targets detected at runtime. On Apple platforms
// the features are queried via HasCpuFeature() (sysctl keys); elsewhere they
// come from the AT_HWCAP/AT_HWCAP2 words returned by getauxval(). Capability
// bits whose macros are not defined by the system headers are treated as
// unsupported, except for the SVE2 bits, which have fallback definitions below.
static int64_t DetectTargets() {
int64_t bits = 0; // return value of supported targets.
using CapBits = unsigned long; // NOLINT
#if HWY_OS_APPLE
const CapBits hw = 0UL;
#else
// For Android, this has been supported since API 20 (2014).
const CapBits hw = getauxval(AT_HWCAP);
#endif
// hw is not read in every preprocessor configuration (e.g. on Apple, or when
// the HWCAP_* macros are missing); the cast avoids an unused-variable warning.
(void)hw;
#if HWY_ARCH_ARM_A64
bits |= HWY_NEON_WITHOUT_AES; // aarch64 always has NEON and VFPv4..
#if HWY_OS_APPLE
if (HasCpuFeature("hw.optional.arm.FEAT_AES")) {
bits |= HWY_NEON;
// HWY_NEON_BF16 additionally requires half-precision conversion, dot-product
// and BF16 support.
// Some macOS versions report AdvSIMD_HPFPCvt under a different key.
// Check both known variants for compatibility.
if ((HasCpuFeature("hw.optional.AdvSIMD_HPFPCvt") ||
HasCpuFeature("hw.optional.arm.AdvSIMD_HPFPCvt")) &&
HasCpuFeature("hw.optional.arm.FEAT_DotProd") &&
HasCpuFeature("hw.optional.arm.FEAT_BF16")) {
bits |= HWY_NEON_BF16;
}
}
#else // !HWY_OS_APPLE
// .. but not necessarily AES, which is required for HWY_NEON.
#if defined(HWCAP_AES)
if (hw & HWCAP_AES) {
bits |= HWY_NEON;
#if defined(HWCAP_ASIMDHP) && defined(HWCAP_ASIMDDP) && defined(HWCAP2_BF16)
// HWY_NEON_BF16 requires both AT_HWCAP bits in kGroupF16Dot plus the BF16
// bit from AT_HWCAP2.
const CapBits hw2 = getauxval(AT_HWCAP2);
const int64_t kGroupF16Dot = HWCAP_ASIMDHP | HWCAP_ASIMDDP;
if ((hw & kGroupF16Dot) == kGroupF16Dot && (hw2 & HWCAP2_BF16)) {
bits |= HWY_NEON_BF16;
}
#endif // HWCAP_ASIMDHP && HWCAP_ASIMDDP && HWCAP2_BF16
}
#endif // HWCAP_AES
#if defined(HWCAP_SVE)
if (hw & HWCAP_SVE) {
bits |= HWY_SVE;
}
#endif
// Fallback definitions for headers that do not provide the SVE2 bits.
#ifndef HWCAP2_SVE2
#define HWCAP2_SVE2 (1 << 1)
#endif
#ifndef HWCAP2_SVEAES
#define HWCAP2_SVEAES (1 << 2)
#endif
// HWY_SVE2 requires both the SVE2 and the SVE AES capability bits.
const CapBits hw2 = getauxval(AT_HWCAP2);
if ((hw2 & HWCAP2_SVE2) && (hw2 & HWCAP2_SVEAES)) {
bits |= HWY_SVE2;
}
#if (HWY_COMPILER_GCC || HWY_COMPILER_CLANG) && \
((HWY_TARGETS & HWY_ALL_SVE) != 0)
// Only query the vector length if some SVE target was detected above.
if ((bits & HWY_ALL_SVE) != 0) {
bits |= DetectAdditionalSveTargets(bits);
}
#endif // (HWY_COMPILER_GCC || HWY_COMPILER_CLANG) &&
// ((HWY_TARGETS & HWY_ALL_SVE) != 0)
#endif // HWY_OS_APPLE
#else // !HWY_ARCH_ARM_A64
// Some old auxv.h / hwcap.h do not define these. If not, treat as unsupported.
#if defined(HWCAP_NEON) && defined(HWCAP_VFPv4)
if ((hw & HWCAP_NEON) && (hw & HWCAP_VFPv4)) {
bits |= HWY_NEON_WITHOUT_AES;
}
#endif
// aarch32 would check getauxval(AT_HWCAP2) & HWCAP2_AES, but we do not yet
// support that platform, and Armv7 lacks AES entirely. Because HWY_NEON
// requires native AES instructions, we do not enable that target here.
#endif // HWY_ARCH_ARM_A64
return bits;
}
} // namespace arm
#elif HWY_ARCH_PPC && HWY_HAVE_RUNTIME_DISPATCH
namespace ppc {
// Fallback definitions of the capability bits tested below, used only if the
// included system headers did not already define them.
#ifndef PPC_FEATURE_HAS_ALTIVEC
#define PPC_FEATURE_HAS_ALTIVEC 0x10000000
#endif
#ifndef PPC_FEATURE_HAS_VSX
#define PPC_FEATURE_HAS_VSX 0x00000080
#endif
#ifndef PPC_FEATURE2_ARCH_2_07
#define PPC_FEATURE2_ARCH_2_07 0x80000000
#endif
#ifndef PPC_FEATURE2_VEC_CRYPTO
#define PPC_FEATURE2_VEC_CRYPTO 0x02000000
#endif
#ifndef PPC_FEATURE2_ARCH_3_00
#define PPC_FEATURE2_ARCH_3_00 0x00800000
#endif
#ifndef PPC_FEATURE2_ARCH_3_1
#define PPC_FEATURE2_ARCH_3_1 0x00040000
#endif
using CapBits = unsigned long; // NOLINT
// For AT_HWCAP, the others are for AT_HWCAP2
static constexpr CapBits kGroupVSX =
PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_VSX;
// PPC8 additionally requires the vector crypto bit unless
// HWY_DISABLE_PPC8_CRYPTO is defined.
#if defined(HWY_DISABLE_PPC8_CRYPTO)
static constexpr CapBits kGroupPPC8 = PPC_FEATURE2_ARCH_2_07;
#else
static constexpr CapBits kGroupPPC8 =
PPC_FEATURE2_ARCH_2_07 | PPC_FEATURE2_VEC_CRYPTO;
#endif
// Each newer group includes all bits of the previous one.
static constexpr CapBits kGroupPPC9 = kGroupPPC8 | PPC_FEATURE2_ARCH_3_00;
static constexpr CapBits kGroupPPC10 = kGroupPPC9 | PPC_FEATURE2_ARCH_3_1;
// Returns the bitfield of PPC targets (HWY_PPC8/9/10) detected via
// getauxval(). No target is reported unless AT_HWCAP contains all of
// kGroupVSX; AT_HWCAP2 is only queried in that case. Returns 0 if AT_HWCAP or
// AT_HWCAP2 is not defined at compile time.
static int64_t DetectTargets() {
  int64_t bits = 0;  // return value of supported targets.

#if defined(AT_HWCAP) && defined(AT_HWCAP2)
  const CapBits hwcap = getauxval(AT_HWCAP);
  const bool has_vsx = (hwcap & kGroupVSX) == kGroupVSX;

  if (has_vsx) {
    const CapBits hwcap2 = getauxval(AT_HWCAP2);
    // True if every bit of `group` is set in AT_HWCAP2.
    const auto has_all = [hwcap2](CapBits group) {
      return (hwcap2 & group) == group;
    };

    if (has_all(kGroupPPC8)) bits |= HWY_PPC8;
    if (has_all(kGroupPPC9)) bits |= HWY_PPC9;
    if (has_all(kGroupPPC10)) bits |= HWY_PPC10;
  }  // VSX
#endif  // defined(AT_HWCAP) && defined(AT_HWCAP2)

  return bits;
}
} // namespace ppc
#elif HWY_ARCH_S390X && HWY_HAVE_RUNTIME_DISPATCH
namespace s390x {
// Fallback definitions of the AT_HWCAP bits tested below, used only if the
// included system headers did not already define them.
#ifndef HWCAP_S390_VX
#define HWCAP_S390_VX 2048
#endif
#ifndef HWCAP_S390_VXE
#define HWCAP_S390_VXE 8192
#endif
#ifndef HWCAP_S390_VXRS_EXT2
#define HWCAP_S390_VXRS_EXT2 32768
#endif
using CapBits = unsigned long; // NOLINT
// Capability bits required for each target; Z15 is a superset of Z14.
static constexpr CapBits kGroupZ14 = HWCAP_S390_VX | HWCAP_S390_VXE;
static constexpr CapBits kGroupZ15 =
HWCAP_S390_VX | HWCAP_S390_VXE | HWCAP_S390_VXRS_EXT2;
// Returns the bitfield of s390x targets (HWY_Z14/HWY_Z15) whose capability
// groups are fully present in getauxval(AT_HWCAP). Returns 0 if AT_HWCAP is
// not defined at compile time.
static int64_t DetectTargets() {
  int64_t bits = 0;

#if defined(AT_HWCAP)
  const CapBits hwcap = getauxval(AT_HWCAP);
  const bool has_z14 = (hwcap & kGroupZ14) == kGroupZ14;
  const bool has_z15 = (hwcap & kGroupZ15) == kGroupZ15;

  if (has_z14) bits |= HWY_Z14;
  if (has_z15) bits |= HWY_Z15;
#endif

  return bits;
}
} // namespace s390x
#elif HWY_ARCH_RISCV && HWY_HAVE_RUNTIME_DISPATCH
namespace rvv {
// Fallback definition of the AT_HWCAP bit tested by DetectTargets() below,
// used only if the included system headers did not already define it.
// The guard must test the macro that is actually defined here: guarding on an
// unrelated name would either redefine a macro the headers already provide,
// or leave COMPAT_HWCAP_ISA_V undefined (breaking the build) if only that
// other name happened to be defined.
#ifndef COMPAT_HWCAP_ISA_V
#define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
#endif
using CapBits = unsigned long;  // NOLINT
static int64_t DetectTargets() {
int64_t bits = 0;
const CapBits hw = getauxval(AT_HWCAP);
if ((hw & COMPAT_HWCAP_ISA_V) == COMPAT_HWCAP_ISA_V) {
size_t e8m1_vec_len;
#if HWY_ARCH_RISCV_64
int64_t vtype_reg_val;
#else
int32_t vtype_reg_val;
#endif
// Check that a vuint8m1_t vector is at least 16 bytes and that tail
// agnostic and mask agnostic mode are supported
asm volatile(
// Avoid compiler error on GCC or Clang if -march=rv64gcv1p0 or
// -march=rv32gcv1p0 option is not specified on the command line
".option push\n\t"
".option arch, +v\n\t"
"vsetvli %0, zero, e8, m1, ta, ma\n\t"
"csrr %1, vtype\n\t"
".option pop"
: "=r"(e8m1_vec_len), "=r"(vtype_reg_val));
// The RVV target is supported if the VILL bit of VTYPE (the MSB bit of
// VTYPE) is not set and the length of a vuint8m1_t vector is at least 16
// bytes
if (vtype_reg_val >= 0 && e8m1_vec_len >= 16) {
bits |= HWY_RVV;
}
}
return bits;
}
} // namespace rvv
#elif HWY_ARCH_LOONGARCH && HWY_HAVE_RUNTIME_DISPATCH
namespace loongarch {
// Fallback definitions of the AT_HWCAP bits tested by DetectTargets() below,
// used only if the included system headers did not already define them.
#ifndef LA_HWCAP_LSX
#define LA_HWCAP_LSX (1u << 4)
#endif
#ifndef LA_HWCAP_LASX
#define LA_HWCAP_LASX (1u << 5)
#endif
using CapBits = unsigned long; // NOLINT
static int64_t DetectTargets() {
int64_t bits = 0;
const CapBits hw = getauxval(AT_HWCAP);
if (hw & LA_HWCAP_LSX) bits |= HWY_LSX;
if (hw & LA_HWCAP_LASX) bits |= HWY_LASX;
return bits;
}
} // namespace loongarch
#endif // HWY_ARCH_*
// Returns targets supported by the CPU, independently of DisableTargets.
// Factored out of SupportedTargets to make its structure more obvious. Note
// that x86 CPUID may take several hundred cycles.
// Emits a warning (but still returns the detected bits) if any target in
// HWY_ENABLED_BASELINE was not detected.
static int64_t DetectTargets() {
  // Apps will use only one of these (the default is EMU128), but compile flags
  // for this TU may differ from that of the app, so allow both.
  int64_t bits = HWY_SCALAR | HWY_EMU128;

#if HWY_ARCH_X86 && HWY_HAVE_RUNTIME_DISPATCH
  bits |= x86::DetectTargets();
#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
  bits |= arm::DetectTargets();
#elif HWY_ARCH_PPC && HWY_HAVE_RUNTIME_DISPATCH
  bits |= ppc::DetectTargets();
#elif HWY_ARCH_S390X && HWY_HAVE_RUNTIME_DISPATCH
  bits |= s390x::DetectTargets();
#elif HWY_ARCH_RISCV && HWY_HAVE_RUNTIME_DISPATCH
  bits |= rvv::DetectTargets();
#elif HWY_ARCH_LOONGARCH && HWY_HAVE_RUNTIME_DISPATCH
  bits |= loongarch::DetectTargets();
#else
  // TODO(janwas): detect support for WASM.
  // This file is typically compiled without HWY_IS_TEST, but targets_test has
  // it set, and will expect all of its HWY_TARGETS (= all attainable) to be
  // supported.
  bits |= HWY_ENABLED_BASELINE;
#endif  // HWY_ARCH_*

  // Nothing to report if every baseline target was detected.
  if ((bits & HWY_ENABLED_BASELINE) == HWY_ENABLED_BASELINE) {
    return bits;
  }

  // Print both 64-bit masks as two 32-bit halves each, upper half first.
  const uint64_t detected = static_cast<uint64_t>(bits);
  const uint64_t required = static_cast<uint64_t>(HWY_ENABLED_BASELINE);
  const uint32_t detected_hi = static_cast<uint32_t>(detected >> 32);
  const uint32_t detected_lo = static_cast<uint32_t>(detected & 0xFFFFFFFF);
  const uint32_t required_hi = static_cast<uint32_t>(required >> 32);
  const uint32_t required_lo = static_cast<uint32_t>(required & 0xFFFFFFFF);
  HWY_WARN("CPU supports 0x%08x%08x, software requires 0x%08x%08x\n",
           detected_hi, detected_lo, required_hi, required_lo);

  return bits;
}
// When running tests, this value can be set to the mocked supported targets
// mask. Only written to from a single thread before the test starts.
// A value of 0 means no mock is active, in which case SupportedTargets()
// detects the targets itself.
static int64_t supported_targets_for_test_ = 0;
// Mask of targets disabled at runtime with DisableTargets.
// Stored in inverted form: DisableTargets() writes the complement of the
// disabled bits, and SupportedTargets() ANDs its result with this mask.
static int64_t supported_mask_ = LimitsMax<int64_t>();
// Excludes the targets whose bits are set in `disabled_targets` from the
// result of subsequent SupportedTargets() calls. Each call replaces the
// previous mask rather than adding to it.
HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets) {
  const int64_t still_allowed = static_cast<int64_t>(~disabled_targets);
  supported_mask_ = still_allowed;

  // This will take effect on the next call to SupportedTargets, which is
  // called right before GetChosenTarget::Update. However, calling Update here
  // would make it appear that HWY_DYNAMIC_DISPATCH was called, which we want
  // to check in tests. We instead de-initialize such that the next
  // HWY_DYNAMIC_DISPATCH calls GetChosenTarget::Update via FunctionCache.
  ChosenTarget& chosen = GetChosenTarget();
  chosen.DeInit();
}
// Installs `targets` as the mocked result of target detection, used by
// SupportedTargets() in place of the real detection. Passing 0 removes the
// mock again.
HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets) {
  supported_targets_for_test_ = targets;

  // De-initialize rather than calling Update, for the same reason as in
  // DisableTargets: the next HWY_DYNAMIC_DISPATCH will trigger the update.
  ChosenTarget& chosen = GetChosenTarget();
  chosen.DeInit();
}
// Returns the bitfield of targets that may be used: the mocked targets if set
// via SetSupportedTargetsForTest, otherwise the freshly detected ones, in
// both cases restricted by the mask from DisableTargets. If that leaves no
// target at all, HWY_STATIC_TARGET is returned instead.
HWY_DLLEXPORT int64_t SupportedTargets() {
  int64_t targets = supported_targets_for_test_;

  if (HWY_LIKELY(targets == 0)) {
    // Mock not active. Re-detect instead of caching just in case we're on a
    // heterogeneous ISA (also requires some app support to pin threads). This
    // is only reached on the first HWY_DYNAMIC_DISPATCH or after each call to
    // DisableTargets or SetSupportedTargetsForTest.
    targets = DetectTargets();

    // VectorBytes invokes HWY_DYNAMIC_DISPATCH. To prevent infinite recursion,
    // first set up ChosenTarget. No need to Update() again afterwards with the
    // final targets - that will be done by a caller of this function.
    ChosenTarget& chosen = GetChosenTarget();
    chosen.Update(targets);
  }

  const int64_t enabled = targets & supported_mask_;
  if (enabled != 0) {
    return enabled;
  }
  return HWY_STATIC_TARGET;
}
// Returns a reference to the single ChosenTarget instance shared by all
// callers in this file (DisableTargets, SetSupportedTargetsForTest and
// SupportedTargets).
HWY_DLLEXPORT ChosenTarget& GetChosenTarget() {
// Function-local static: constructed on the first call, then reused.
static ChosenTarget chosen_target;
return chosen_target;
}
} // namespace hwy