// Copyright 2024 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/thread_pool/topology.h"
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h> // strchr
#include <array>
#include <map>
#include <string>
#include <vector>
#include "hwy/base.h" // HWY_OS_WIN, HWY_WARN
#if HWY_OS_APPLE
#include <sys/sysctl.h>
#include "hwy/aligned_allocator.h" // HWY_ALIGNMENT
#endif
#if HWY_OS_WIN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#ifndef _WIN32_WINNT
#define _WIN32_WINNT 0x0601 // Windows 7 / Server 2008 R2
#endif
#include <windows.h>
#endif // HWY_OS_WIN
#if HWY_OS_LINUX || HWY_OS_FREEBSD
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <sched.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h> // sysconf
#endif // HWY_OS_LINUX || HWY_OS_FREEBSD
#if HWY_OS_FREEBSD
#include <sys/param.h>
// After param.h / types.h.
#include <sys/cpuset.h>
#endif // HWY_OS_FREEBSD
#if HWY_ARCH_WASM
#include <emscripten/threading.h>
#endif
namespace hwy {
HWY_CONTRIB_DLLEXPORT bool HaveThreadingSupport() {
#if HWY_ARCH_WASM
return emscripten_has_threading_support() != 0;
#else
return true;
#endif
}
namespace {
// Returns `whole / part`, with a check that `part` evenly divides `whole`,
// which implies the result is exact.
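// For example, DivByFactor(12, 4) returns 3, whereas DivByFactor(12, 5)
// aborts because 5 does not evenly divide 12.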
HWY_MAYBE_UNUSED size_t DivByFactor(size_t whole, size_t part) {
HWY_ASSERT(part != 0);
const size_t div = whole / part;
const size_t mul = div * part;
if (mul != whole) {
HWY_ABORT("%zu / %zu = %zu; *%zu = %zu\n", whole, part, div, part, mul);
}
return div;
}
#if HWY_OS_WIN
using SLPI = SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX;
template <class Func>
bool ForEachSLPI(LOGICAL_PROCESSOR_RELATIONSHIP rel, Func&& func) {
// Get required buffer size.
DWORD buf_bytes = 0;
HWY_ASSERT(!GetLogicalProcessorInformationEx(rel, nullptr, &buf_bytes));
// Observed when `rel` is not supported:
if (HWY_UNLIKELY(buf_bytes == 0 && GetLastError() == ERROR_GEN_FAILURE)) {
if (rel != RelationNumaNodeEx && rel != RelationProcessorDie) {
HWY_WARN("Unexpected err %lx for GLPI relationship %d\n", GetLastError(),
static_cast<int>(rel));
}
return false;
}
HWY_ASSERT(GetLastError() == ERROR_INSUFFICIENT_BUFFER);
// Note: `buf_bytes` may be less than `sizeof(SLPI)`, which has padding.
uint8_t* buf = static_cast<uint8_t*>(malloc(buf_bytes));
HWY_ASSERT(buf);
// Fill the buffer.
SLPI* info = reinterpret_cast<SLPI*>(buf);
if (HWY_UNLIKELY(!GetLogicalProcessorInformationEx(rel, info, &buf_bytes))) {
free(buf);
return false;
}
// Iterate over each SLPI. `sizeof(SLPI)` is unreliable, see above.
uint8_t* pos = buf;
while (pos < buf + buf_bytes) {
info = reinterpret_cast<SLPI*>(pos);
HWY_ASSERT(rel == RelationAll || info->Relationship == rel);
func(*info);
pos += info->Size;
}
if (pos != buf + buf_bytes) {
HWY_WARN("unexpected pos %p, end %p, buf_bytes %lu, sizeof(SLPI) %zu\n",
pos, buf + buf_bytes, buf_bytes, sizeof(SLPI));
}
free(buf);
return true;
}
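// Returns the total number of set bits across `num_groups` affinity masks.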
size_t NumBits(size_t num_groups, const GROUP_AFFINITY* affinity) {
size_t total_bits = 0;
for (size_t i = 0; i < num_groups; ++i) {
size_t bits = 0;
hwy::CopyBytes<sizeof(bits)>(&affinity[i].Mask, &bits);
total_bits += hwy::PopCount(bits);
}
return total_bits;
}
// Calls `func(lp, lps)` for each bit set in `affinity`, after clamping `lp`
// to `lps.size() - 1`. `line` identifies the call site for `Warn()`.
template <class Func>
void ForeachBit(size_t num_groups, const GROUP_AFFINITY* affinity,
std::vector<Topology::LP>& lps, int line, const Func& func) {
for (size_t group = 0; group < num_groups; ++group) {
size_t bits = 0;
hwy::CopyBytes<sizeof(bits)>(&affinity[group].Mask, &bits);
while (bits != 0) {
size_t lp = Num0BitsBelowLS1Bit_Nonzero64(bits);
bits &= bits - 1; // clear LSB
if (HWY_UNLIKELY(lp >= lps.size())) {
Warn(__FILE__, line, "Clamping lp %zu to lps.size() %zu, groups %zu\n",
lp, lps.size(), num_groups);
lp = lps.size() - 1;
}
func(lp, lps);
}
}
}
#elif HWY_OS_APPLE
// Returns whether sysctlbyname() succeeded; if so, writes `val / div` to
// `out`, otherwise sets `err`.
template <typename T>
bool Sysctl(const char* name, size_t div, int& err, T* out) {
size_t val = 0;
size_t size = sizeof(val);
// Last two arguments are for updating the value, which we do not want.
const int ret = sysctlbyname(name, &val, &size, nullptr, 0);
if (HWY_UNLIKELY(ret != 0)) {
// Do not print warnings because some `name` are expected to fail.
err = ret;
return false;
}
*out = static_cast<T>(DivByFactor(val, div));
return true;
}
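// Example (a sketch of the calls made below): query the number of P-core LPs.
//   size_t lps = 0;
//   int err = 0;
//   if (Sysctl("hw.perflevel0.logicalcpu", /*div=*/1, err, &lps)) { ... }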
#endif // HWY_OS_*
} // namespace
HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors() {
size_t total_lps = 0;
#if HWY_ARCH_WASM
const int num_cores = emscripten_num_logical_cores();
if (num_cores > 0) total_lps = static_cast<size_t>(num_cores);
#elif HWY_OS_WIN
// If there are multiple processor groups, this returns LPs from all of them,
// not just the first 64; note that VMs may report fewer.
(void)ForEachSLPI(RelationProcessorCore, [&total_lps](const SLPI& info) {
const PROCESSOR_RELATIONSHIP& p = info.Processor;
total_lps += NumBits(p.GroupCount, p.GroupMask);
});
#elif HWY_OS_LINUX
// Only count "online" CPUs: sysfs entries such as topology are missing for
// offline CPUs, which would cause `DetectPackages` to fail.
const long ret = sysconf(_SC_NPROCESSORS_ONLN); // NOLINT(runtime/int)
if (ret < 0) {
HWY_WARN("Unexpected _SC_NPROCESSORS_ONLN = %d\n", static_cast<int>(ret));
} else {
total_lps = static_cast<size_t>(ret);
}
#elif HWY_OS_APPLE
int err;
// Only count LPs of P (performance) cores, i.e., sysctl "perflevel0".
if (!Sysctl("hw.perflevel0.logicalcpu", 1, err, &total_lps)) {
total_lps = 0;
}
#endif
if (HWY_UNLIKELY(total_lps == 0)) { // Failed to detect.
HWY_WARN(
"Unknown TotalLogicalProcessors, assuming 1. "
"HWY_OS_: WIN=%d LINUX=%d APPLE=%d;\n"
"HWY_ARCH_: WASM=%d X86=%d PPC=%d ARM=%d RISCV=%d S390X=%d\n",
HWY_OS_WIN, HWY_OS_LINUX, HWY_OS_APPLE, HWY_ARCH_WASM, HWY_ARCH_X86,
HWY_ARCH_PPC, HWY_ARCH_ARM, HWY_ARCH_RISCV, HWY_ARCH_S390X);
return 1;
}
// Warn that we are clamping.
if (HWY_UNLIKELY(total_lps > kMaxLogicalProcessors)) {
HWY_WARN("OS reports %zu processors but clamping to %zu\n", total_lps,
kMaxLogicalProcessors);
total_lps = kMaxLogicalProcessors;
}
return total_lps;
}
// ------------------------------ Affinity
#if HWY_OS_LINUX || HWY_OS_FREEBSD
namespace {
#if HWY_OS_LINUX
using CpuSet = cpu_set_t;
#else
using CpuSet = cpuset_t;
#endif
// Helper functions reduce the number of #if in Get/SetThreadAffinity.
int GetAffinity(CpuSet* set) {
// To specify the current thread, pass 0 on Linux/Android and -1 on FreeBSD.
#if defined(__ANDROID__) && __ANDROID_API__ < 12
return syscall(__NR_sched_getaffinity, 0, sizeof(CpuSet), set);
#elif HWY_OS_FREEBSD
return cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(CpuSet),
set);
#else // normal Linux
return sched_getaffinity(0, sizeof(CpuSet), set);
#endif
}
int SetAffinity(CpuSet* set) {
// To specify the current thread, pass 0 on Linux/Android and -1 on FreeBSD.
#if defined(__ANDROID__) && __ANDROID_API__ < 12
return syscall(__NR_sched_setaffinity, 0, sizeof(CpuSet), set);
#elif HWY_OS_FREEBSD
return cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(CpuSet),
set);
#else // normal Linux
return sched_setaffinity(0, sizeof(CpuSet), set);
#endif
}
bool IsSet(size_t lp, const CpuSet* set) {
#if HWY_COMPILER_GCC_ACTUAL
// Workaround for GCC compiler warning with CPU_ISSET macro
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion")
#endif
const int is_set = CPU_ISSET(static_cast<int>(lp), set);
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS(pop)
#endif
return is_set != 0;
}
void Set(size_t lp, CpuSet* set) {
#if HWY_COMPILER_GCC_ACTUAL
// Workaround for GCC compiler warning with CPU_SET macro
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion")
#endif
CPU_SET(static_cast<int>(lp), set);
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS(pop)
#endif
}
} // namespace
#endif // HWY_OS_LINUX || HWY_OS_FREEBSD
HWY_CONTRIB_DLLEXPORT bool GetThreadAffinity(LogicalProcessorSet& lps) {
#if HWY_OS_WIN
// Only support the first 64 because WINE does not support processor groups.
const HANDLE hThread = GetCurrentThread();
const DWORD_PTR prev = SetThreadAffinityMask(hThread, ~DWORD_PTR(0));
if (!prev) return false;
(void)SetThreadAffinityMask(hThread, prev);
lps = LogicalProcessorSet(); // clear all
lps.SetNonzeroBitsFrom64(prev);
return true;
#elif HWY_OS_LINUX || HWY_OS_FREEBSD
CpuSet set;
CPU_ZERO(&set);
const int err = GetAffinity(&set);
if (err != 0) return false;
for (size_t lp = 0; lp < kMaxLogicalProcessors; ++lp) {
if (IsSet(lp, &set)) lps.Set(lp);
}
return true;
#else
// For HWY_OS_APPLE, affinity is not supported. Do not even set lp=0 to force
// callers to handle this case.
(void)lps;
return false;
#endif
}
HWY_CONTRIB_DLLEXPORT bool SetThreadAffinity(const LogicalProcessorSet& lps) {
#if HWY_OS_WIN
const HANDLE hThread = GetCurrentThread();
const DWORD_PTR prev = SetThreadAffinityMask(hThread, lps.Get64());
return prev != 0;
#elif HWY_OS_LINUX || HWY_OS_FREEBSD
CpuSet set;
CPU_ZERO(&set);
lps.Foreach([&set](size_t lp) { Set(lp, &set); });
const int err = SetAffinity(&set);
if (err != 0) return false;
return true;
#else
// Apple THREAD_AFFINITY_POLICY is only an (often ignored) hint.
(void)lps;
return false;
#endif
}
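// Usage sketch (hypothetical caller): pin the current thread to the first LP
// it is currently allowed to run on. On Apple, both calls return false.
//   hwy::LogicalProcessorSet lps;
//   if (hwy::GetThreadAffinity(lps)) {
//     hwy::LogicalProcessorSet pin;
//     pin.Set(lps.First());
//     (void)hwy::SetThreadAffinity(pin);
//   }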
namespace {
struct PackageSizes {
size_t num_clusters;
size_t num_cores;
};
#if HWY_OS_LINUX
class File {
public:
explicit File(const char* path) {
for (;;) {
fd_ = open(path, O_RDONLY);
if (fd_ > 0) return; // success
if (errno == EINTR) continue; // signal: retry
if (errno == ENOENT) return; // not found, give up
HWY_WARN("Unexpected error opening %s: %d\n", path, errno);
return; // unknown error, give up
}
}
~File() {
if (fd_ > 0) {
for (;;) {
const int ret = close(fd_);
if (ret == 0) break; // success
if (errno == EINTR) continue; // signal: retry
HWY_WARN("Unexpected error closing file: %d\n", errno);
return; // unknown error, ignore
}
}
}
// Returns number of bytes read or 0 on failure.
size_t Read(char* buf200) const {
if (fd_ <= 0) return 0;
size_t pos = 0;
for (;;) {
// `read` instead of `pread`, which might not work for sysfs. Reserve one
// byte for the '\0' terminator appended below.
const auto bytes_read = read(fd_, buf200 + pos, 199 - pos);
if (bytes_read == 0) { // EOF: done
buf200[pos++] = '\0';
return pos;
}
if (bytes_read == -1) {
if (errno == EINTR) continue; // signal: retry
HWY_WARN("Unexpected error reading file: %d\n", errno);
return 0;
}
pos += static_cast<size_t>(bytes_read);
HWY_ASSERT(pos < 200);
}
}
private:
int fd_;
};
// Returns bytes read, or 0 on failure.
size_t ReadSysfs(const char* format, size_t lp, char* buf200) {
char path[200];
const int bytes_written = snprintf(path, sizeof(path), format, lp);
HWY_ASSERT(0 < bytes_written &&
bytes_written < static_cast<int>(sizeof(path) - 1));
const File file(path);
return file.Read(buf200);
}
// Interprets [str + pos, str + end) as base-10 ASCII. Stops when any non-digit
// is found, or at end. Returns false if no digits found.
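// For example, parsing "3-5" with pos = 0 sets *out = 3 and leaves pos = 1.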
bool ParseDigits(const char* str, const size_t end, size_t& pos, size_t* out) {
HWY_ASSERT(pos <= end);
const size_t begin = pos; // for detecting whether any digits were found
// 9 digits cannot overflow even 32-bit size_t.
const size_t stop = begin + 9;
*out = 0;
for (; pos < HWY_MIN(end, stop); ++pos) {
const int c = str[pos];
if (c < '0' || c > '9') break;
*out *= 10;
*out += static_cast<size_t>(c - '0');
}
if (pos == begin) { // No digits found
*out = 0;
return false;
}
return true;
}
// Number, plus optional K or M suffix, plus terminator.
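// For example, "512K\n" yields 512 << 10 = 524288.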
bool ParseNumberWithOptionalSuffix(const char* str, size_t len, size_t* out) {
size_t pos = 0;
if (!ParseDigits(str, len, pos, out)) return false;
if (str[pos] == 'K') {
*out <<= 10;
++pos;
}
if (str[pos] == 'M') {
*out <<= 20;
++pos;
}
if (str[pos] != '\0' && str[pos] != '\n') {
HWY_ABORT("Expected [suffix] terminator at %zu %s\n", pos, str);
}
return true;
}
bool ReadNumberWithOptionalSuffix(const char* format, size_t lp, size_t* out) {
char buf200[200];
const size_t pos = ReadSysfs(format, lp, buf200);
if (pos == 0) return false;
return ParseNumberWithOptionalSuffix(buf200, pos, out);
}
const char* kPackage =
"/sys/devices/system/cpu/cpu%zu/topology/physical_package_id";
const char* kCluster = "/sys/devices/system/cpu/cpu%zu/cache/index3/id";
const char* kCore = "/sys/devices/system/cpu/cpu%zu/topology/core_id";
const char* kL2Size = "/sys/devices/system/cpu/cpu%zu/cache/index2/size";
const char* kL3Size = "/sys/devices/system/cpu/cpu%zu/cache/index3/size";
const char* kNode = "/sys/devices/system/node/node%zu/cpulist";
// sysfs values can be arbitrarily large, so store in a map and replace with
// indices in order of appearance.
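// For example, raw values 36, 4, 36 map to indices 0, 1, 0, and Num() == 2.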
class Remapper {
public:
// Returns false on error, or sets `out_index` to the index of the sysfs
// value selected by `format` and `lp`.
template <typename T>
bool operator()(const char* format, size_t lp, T* HWY_RESTRICT out_index) {
size_t opaque;
if (!ReadNumberWithOptionalSuffix(format, lp, &opaque)) return false;
const auto ib = indices_.insert({opaque, num_});
num_ += ib.second; // increment if inserted
const size_t index = ib.first->second; // new or existing
HWY_ASSERT(index < num_);
HWY_ASSERT(index < hwy::LimitsMax<T>());
*out_index = static_cast<T>(index);
return true;
}
size_t Num() const { return num_; }
private:
std::map<size_t, size_t> indices_;
size_t num_ = 0;
};
// For internal use by `DetectPackages`.
struct PerPackage {
Remapper clusters;
Remapper cores;
// We rely on this zero-init and increment it below.
uint8_t smt_per_core[kMaxLogicalProcessors] = {0};
};
// Initializes `lps` and returns a `PackageSizes` vector (empty on failure)
// indicating the number of clusters and cores per package.
std::vector<PackageSizes> DetectPackages(std::vector<Topology::LP>& lps) {
std::vector<PackageSizes> empty;
Remapper packages;
for (size_t lp = 0; lp < lps.size(); ++lp) {
if (!packages(kPackage, lp, &lps[lp].package)) {
HWY_WARN("Failed to read sysfs package for LP %zu\n", lp);
return empty;
}
}
std::vector<PerPackage> per_package(packages.Num());
HWY_ASSERT(!per_package.empty());
for (size_t lp = 0; lp < lps.size(); ++lp) {
PerPackage& pp = per_package[lps[lp].package];
// Not a failure: some CPUs lack a (shared) L3 cache.
if (!pp.clusters(kCluster, lp, &lps[lp].cluster)) {
lps[lp].cluster = 0;
}
if (!pp.cores(kCore, lp, &lps[lp].core)) {
HWY_WARN("Failed to read sysfs core for LP %zu\n", lp);
return empty;
}
// SMT ID is how many LPs we have already seen assigned to the same core.
HWY_ASSERT(lps[lp].core < kMaxLogicalProcessors);
lps[lp].smt = pp.smt_per_core[lps[lp].core]++;
HWY_ASSERT(lps[lp].smt < 16);
}
std::vector<PackageSizes> package_sizes(per_package.size());
for (size_t p = 0; p < package_sizes.size(); ++p) {
// Was zero if the package has no shared L3, see above.
package_sizes[p].num_clusters = HWY_MAX(1, per_package[p].clusters.Num());
package_sizes[p].num_cores = per_package[p].cores.Num();
HWY_ASSERT(package_sizes[p].num_cores != 0);
}
return package_sizes;
}
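// Expands a sysfs cpulist such as "0-3,5" into the individual LP indices
// {0, 1, 2, 3, 5}. `list_end` is the buffer length including the terminator
// and `max_lp` is the highest valid LP index.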
std::vector<size_t> ExpandList(const char* list, size_t list_end,
size_t max_lp) {
std::vector<size_t> expanded;
constexpr size_t kNotFound = ~size_t{0};
size_t pos = 0;
// Returns first `found_pos >= pos` where `list[found_pos] == c`, or
// `kNotFound`.
const auto find = [list, list_end, &pos](char c) -> size_t {
const char* found_ptr = strchr(list + pos, c);
if (found_ptr == nullptr) return kNotFound;
const size_t found_pos = static_cast<size_t>(found_ptr - list);
HWY_ASSERT(found_pos < list_end && list[found_pos] == c);
return found_pos;
};
// Reads LP number and advances `pos`. `end` is for verifying we did not
// read past a known terminator, or the end of string.
const auto parse_lp = [list, list_end, &pos, max_lp](size_t end) -> size_t {
end = HWY_MIN(end, list_end);
size_t lp;
HWY_ASSERT(ParseDigits(list, end, pos, &lp));
HWY_IF_CONSTEXPR(HWY_ARCH_RISCV) {
// On RISC-V, both TotalLogicalProcessors and GetThreadAffinity may
// under-report the count, hence clamp.
lp = HWY_MIN(lp, max_lp);
}
HWY_ASSERT(lp <= max_lp);
HWY_ASSERT(pos <= end);
return lp;
};
// Parse all [first-]last separated by commas.
for (;;) {
// Single number or first of range: ends with dash, comma, or end.
const size_t lp_range_first = parse_lp(HWY_MIN(find('-'), find(',')));
if (list[pos] == '-') { // range
++pos; // skip dash
// Last of range ends with comma or end.
const size_t lp_range_last = parse_lp(find(','));
expanded.reserve(expanded.size() + lp_range_last - lp_range_first + 1);
for (size_t lp = lp_range_first; lp <= lp_range_last; ++lp) {
expanded.push_back(lp);
}
} else { // single number
expanded.push_back(lp_range_first);
}
// Done if reached end of string.
if (pos == list_end || list[pos] == '\0' || list[pos] == '\n') {
break;
}
// Comma means at least one more term is coming.
if (list[pos] == ',') {
++pos;
continue;
}
HWY_ABORT("Unexpected character at %zu in %s\n", pos, list);
} // for pos
return expanded;
}
// Sets LP.node for all `lps`.
void SetNodes(std::vector<Topology::LP>& lps) {
// For each NUMA node found via sysfs:
for (size_t node = 0;; node++) {
// Read its cpulist so we can scatter `node` to all its `lps`.
char buf200[200];
const size_t bytes_read = ReadSysfs(kNode, node, buf200);
if (bytes_read == 0) break;
const std::vector<size_t> list =
ExpandList(buf200, bytes_read, lps.size() - 1);
for (size_t lp : list) {
lps[lp].node = static_cast<uint8_t>(node);
}
}
}
void SetClusterCacheSizes(std::vector<Topology::Package>& packages) {
for (size_t ip = 0; ip < packages.size(); ++ip) {
Topology::Package& p = packages[ip];
for (size_t ic = 0; ic < p.clusters.size(); ++ic) {
Topology::Cluster& c = p.clusters[ic];
const size_t lp = c.lps.First();
size_t bytes;
if (ReadNumberWithOptionalSuffix(kL2Size, lp, &bytes)) {
c.private_kib = bytes >> 10;
}
if (ReadNumberWithOptionalSuffix(kL3Size, lp, &bytes)) {
c.shared_kib = bytes >> 10;
}
}
}
}
#elif HWY_OS_WIN
// Also sets LP.core and LP.smt.
size_t MaxLpsPerCore(std::vector<Topology::LP>& lps) {
size_t max_lps_per_core = 0;
size_t core_idx = 0;
(void)ForEachSLPI(RelationProcessorCore, [&max_lps_per_core, &core_idx,
&lps](const SLPI& info) {
const PROCESSOR_RELATIONSHIP& p = info.Processor;
const size_t lps_per_core = NumBits(p.GroupCount, p.GroupMask);
max_lps_per_core = HWY_MAX(max_lps_per_core, lps_per_core);
size_t smt = 0;
ForeachBit(p.GroupCount, p.GroupMask, lps, __LINE__,
[core_idx, &smt](size_t lp, std::vector<Topology::LP>& lps) {
lps[lp].core = static_cast<uint16_t>(core_idx);
lps[lp].smt = static_cast<uint8_t>(smt++);
});
++core_idx;
});
HWY_ASSERT(max_lps_per_core != 0);
return max_lps_per_core;
}
// Treats each "processor die", or failing that each shared L3 cache, as a
// cluster. Also sets LP.cluster.
size_t MaxCoresPerCluster(const size_t max_lps_per_core,
std::vector<Topology::LP>& lps) {
size_t max_cores_per_cluster = 0;
size_t cluster_idx = 0;
// Shared between `foreach_die` and `foreach_l3`.
const auto foreach_cluster = [&](size_t num_groups,
const GROUP_AFFINITY* groups) {
const size_t lps_per_cluster = NumBits(num_groups, groups);
// `max_lps_per_core` is an upper bound, hence round up. It is not an error
// if there is only one core per cluster; this can happen for L3.
const size_t cores_per_cluster = DivCeil(lps_per_cluster, max_lps_per_core);
max_cores_per_cluster = HWY_MAX(max_cores_per_cluster, cores_per_cluster);
ForeachBit(num_groups, groups, lps, __LINE__,
[cluster_idx](size_t lp, std::vector<Topology::LP>& lps) {
lps[lp].cluster = static_cast<uint16_t>(cluster_idx);
});
++cluster_idx;
};
// Passes group bits to `foreach_cluster`, depending on relationship type.
const auto foreach_die = [&foreach_cluster](const SLPI& info) {
const PROCESSOR_RELATIONSHIP& p = info.Processor;
foreach_cluster(p.GroupCount, p.GroupMask);
};
const auto foreach_l3 = [&foreach_cluster](const SLPI& info) {
const CACHE_RELATIONSHIP& cr = info.Cache;
if (cr.Type != CacheUnified && cr.Type != CacheData) return;
if (cr.Level != 3) return;
foreach_cluster(cr.GroupCount, cr.GroupMasks);
};
if (!ForEachSLPI(RelationProcessorDie, foreach_die)) {
// Has been observed to fail; also check for shared L3 caches.
(void)ForEachSLPI(RelationCache, foreach_l3);
}
if (max_cores_per_cluster == 0) {
HWY_WARN("All clusters empty, assuming 1 core each\n");
max_cores_per_cluster = 1;
}
return max_cores_per_cluster;
}
// Initializes `lps` and returns a `PackageSizes` vector (empty on failure)
// indicating the number of clusters and cores per package.
std::vector<PackageSizes> DetectPackages(std::vector<Topology::LP>& lps) {
const size_t max_lps_per_core = MaxLpsPerCore(lps);
const size_t max_cores_per_cluster =
MaxCoresPerCluster(max_lps_per_core, lps);
std::vector<PackageSizes> packages;
size_t package_idx = 0;
(void)ForEachSLPI(RelationProcessorPackage, [&](const SLPI& info) {
const PROCESSOR_RELATIONSHIP& p = info.Processor;
const size_t lps_per_package = NumBits(p.GroupCount, p.GroupMask);
PackageSizes ps; // avoid designated initializers for MSVC
ps.num_clusters = max_cores_per_cluster;
// `max_lps_per_core` is an upper bound, hence round up.
ps.num_cores = DivCeil(lps_per_package, max_lps_per_core);
packages.push_back(ps);
ForeachBit(p.GroupCount, p.GroupMask, lps, __LINE__,
[package_idx](size_t lp, std::vector<Topology::LP>& lps) {
lps[lp].package = static_cast<uint8_t>(package_idx);
});
++package_idx;
});
return packages;
}
// Sets LP.node for all `lps`.
void SetNodes(std::vector<Topology::LP>& lps) {
// Zero-initialize all nodes in case the below fails.
for (size_t lp = 0; lp < lps.size(); ++lp) {
lps[lp].node = 0;
}
// We want the full NUMA nodes, but Windows Server 2022 truncates the results
// of `RelationNumaNode` to a single 64-LP group. To get the old, unlimited
// behavior without using the new `RelationNumaNodeEx` symbol, use the old
// `RelationAll` and filter the SLPI we want.
(void)ForEachSLPI(RelationAll, [&](const SLPI& info) {
if (info.Relationship != RelationNumaNode) return;
const NUMA_NODE_RELATIONSHIP& nn = info.NumaNode;
// This field was previously reserved/zero. There is at least one group.
const size_t num_groups = HWY_MAX(1, nn.GroupCount);
const uint8_t node = static_cast<uint8_t>(nn.NodeNumber);
ForeachBit(num_groups, nn.GroupMasks, lps, __LINE__,
[node](size_t lp, std::vector<Topology::LP>& lps) {
lps[lp].node = node;
});
});
}
#elif HWY_OS_APPLE
// Initializes `lps` and returns a `PackageSizes` vector (empty on failure)
// indicating the number of clusters and cores per package.
std::vector<PackageSizes> DetectPackages(std::vector<Topology::LP>& lps) {
int err;
size_t total_cores = 0;
if (!Sysctl("hw.perflevel0.physicalcpu", 1, err, &total_cores)) {
HWY_WARN("Error %d detecting total_cores, assuming one per LP\n", err);
total_cores = lps.size();
}
if (lps.size() % total_cores != 0) {
HWY_WARN("LPs %zu not a multiple of total_cores %zu\n", lps.size(),
total_cores);
}
const size_t lp_per_core = DivCeil(lps.size(), total_cores);
size_t cores_per_cluster = 0;
if (!Sysctl("hw.perflevel0.cpusperl2", 1, err, &cores_per_cluster)) {
HWY_WARN("Error %d detecting cores_per_cluster\n", err);
cores_per_cluster = HWY_MIN(4, total_cores);
}
if (total_cores % cores_per_cluster != 0) {
HWY_WARN("total_cores %zu not a multiple of cores_per_cluster %zu\n",
total_cores, cores_per_cluster);
}
for (size_t lp = 0; lp < lps.size(); ++lp) {
lps[lp].package = 0; // single package
lps[lp].core = static_cast<uint16_t>(lp / lp_per_core);
lps[lp].smt = static_cast<uint8_t>(lp % lp_per_core);
lps[lp].cluster = static_cast<uint16_t>(lps[lp].core / cores_per_cluster);
}
PackageSizes ps;
ps.num_clusters = DivCeil(total_cores, cores_per_cluster);
ps.num_cores = total_cores;
return std::vector<PackageSizes>{ps};
}
// Sets LP.node for all `lps`.
void SetNodes(std::vector<Topology::LP>& lps) {
for (size_t lp = 0; lp < lps.size(); ++lp) {
lps[lp].node = 0; // no NUMA
}
}
#endif // HWY_OS_*
#if HWY_OS_WIN || HWY_OS_APPLE
void SetClusterCacheSizes(std::vector<Topology::Package>& packages) {
// Assumes clusters are homogeneous. Otherwise, we would have to scan
// `RelationCache` again and find the corresponding package_idx.
const Cache* caches = DataCaches();
const size_t private_kib = caches ? caches[2].size_kib : 0;
const size_t shared_kib = caches ? caches[3].size_kib : 0;
for (size_t ip = 0; ip < packages.size(); ++ip) {
Topology::Package& p = packages[ip];
for (size_t ic = 0; ic < p.clusters.size(); ++ic) {
Topology::Cluster& c = p.clusters[ic];
c.private_kib = private_kib;
c.shared_kib = shared_kib;
}
}
}
#endif // HWY_OS_WIN || HWY_OS_APPLE
} // namespace
HWY_CONTRIB_DLLEXPORT Topology::Topology() {
#if HWY_OS_LINUX || HWY_OS_WIN || HWY_OS_APPLE
lps.resize(TotalLogicalProcessors());
const std::vector<PackageSizes>& package_sizes = DetectPackages(lps);
if (package_sizes.empty()) return;
SetNodes(lps);
// Allocate per-package/cluster/core vectors. This indicates to callers that
// detection succeeded.
packages.resize(package_sizes.size());
for (size_t p = 0; p < packages.size(); ++p) {
packages[p].clusters.resize(package_sizes[p].num_clusters);
packages[p].cores.resize(package_sizes[p].num_cores);
}
// Populate the per-cluster/core sets of LP.
for (size_t lp = 0; lp < lps.size(); ++lp) {
Package& p = packages[lps[lp].package];
p.clusters[lps[lp].cluster].lps.Set(lp);
p.cores[lps[lp].core].lps.Set(lp);
}
SetClusterCacheSizes(packages);
#endif // HWY_OS_*
}
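// Usage sketch (hypothetical caller): `packages` is non-empty iff detection
// succeeded.
//   hwy::Topology topology;
//   if (!topology.packages.empty()) {
//     const hwy::Topology::Cluster& c0 = topology.packages[0].clusters[0];
//     const size_t first_lp = c0.lps.First(); // first LP in cluster 0
//   }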
// ------------------------------ Cache detection
namespace {
using Caches = std::array<Cache, 4>;
// We assume homogeneous caches across all clusters because some OS APIs return
// a single value for a class of CPUs.
#if HWY_OS_LINUX
std::string ReadString(const char* name, size_t index) {
// First CPU is usually a P core.
const std::string path("/sys/devices/system/cpu/cpu0/cache/index%zu/");
char buf200[200];
size_t end = ReadSysfs((path + name).c_str(), index, buf200);
// Remove trailing newline/null to simplify string comparison.
for (; end != 0; --end) {
if (buf200[end - 1] != '\0' && buf200[end - 1] != '\n') break;
}
return std::string(buf200, buf200 + end);
}
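// Reads the sysfs attribute `name` of cache `index` and writes the parsed
// number to `out`; returns false if the file is missing or has no digits.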
template <typename T>
bool WriteSysfs(const char* name, size_t index, T* out) {
const std::string str = ReadString(name, index);
// Do not call `ParseNumberWithOptionalSuffix`: it would expand the K suffix
// of "size" into bytes, but we want the value in KiB.
size_t pos = 0;
size_t val;
if (!ParseDigits(str.c_str(), str.length(), pos, &val)) return false;
HWY_ASSERT(pos <= str.length());
*out = static_cast<T>(val);
return true;
}
// Reading from sysfs is preferred because sysconf returns L3 associativity = 0
// on some CPUs, and does not indicate sharing across cores.
bool InitCachesSysfs(Caches& caches) {
// For computing shared cache sizes.
std::vector<hwy::Topology::LP> lps(TotalLogicalProcessors());
const std::vector<PackageSizes> package_sizes = DetectPackages(lps);
// `package_sizes` is only used to check that `lps` were filled.
if (package_sizes.empty()) {
HWY_WARN("no packages, shared cache sizes may be incorrect\n");
return false;
}
for (size_t i = 0;; ++i) {
const std::string type = ReadString("type", i);
if (type.empty()) break; // done, no more entries
if (type != "Data" && type != "Unified") continue;
uint32_t level;
if (!WriteSysfs("level", i, &level)) continue;
if (level != 1 && level != 2 && level != 3) continue;
Cache& c = caches[level];
// Check before overwriting any fields.
if (c.size_kib != 0) {
HWY_WARN("ignoring another L%u, first size %u\n", level, c.size_kib);
continue;
}
const bool ok = WriteSysfs("size", i, &c.size_kib) &&
WriteSysfs("ways_of_associativity", i, &c.associativity) &&
WriteSysfs("number_of_sets", i, &c.sets);
if (HWY_UNLIKELY(!ok)) {
HWY_WARN("skipping partially-detected L%u, error %d\n", level, errno);
c = Cache();
continue;
}
// Compute line size *before* adjusting the size for sharing. Note that
// `coherency_line_size` exists, but we are not sure that is the line size.
const size_t bytes = static_cast<size_t>(c.size_kib) * 1024;
const size_t lines = c.associativity * c.sets;
c.bytes_per_line = static_cast<uint16_t>(DivByFactor(bytes, lines));
// Divide by number of *cores* sharing the cache.
const std::string shared_str = ReadString("shared_cpu_list", i);
if (HWY_UNLIKELY(shared_str.empty())) {
HWY_WARN("no shared_cpu_list for L%u %s\n", level, type.c_str());
c.cores_sharing = 1;
} else {
const std::vector<size_t> shared_lps =
ExpandList(shared_str.c_str(), shared_str.length(), lps.size() - 1);
size_t num_cores = 0;
for (size_t lp : shared_lps) {
if (HWY_LIKELY(lp < lps.size())) {
num_cores += lps[lp].smt == 0;
} else {
HWY_WARN("out of bounds lp %zu of %zu from %s\n", lp, lps.size(),
shared_str.c_str());
}
}
if (num_cores == 0) {
HWY_WARN("no cores sharing L%u %s, setting to 1\n", level,
type.c_str());
num_cores = 1;
}
c.cores_sharing = static_cast<uint16_t>(num_cores);
// There exist CPUs for which L3 is not evenly divisible by `num_cores`,
// hence do not use `DivByFactor`. It is safer to round down.
c.size_kib = static_cast<uint32_t>(c.size_kib / num_cores);
c.sets = static_cast<uint32_t>(c.sets / num_cores);
}
}
// Require L1 and L2 cache.
if (HWY_UNLIKELY(caches[1].size_kib == 0 || caches[2].size_kib == 0)) {
// Don't complain on Android because this is known to happen there. We are
// unaware of good alternatives: `getauxval(AT_L1D_CACHEGEOMETRY)` and
// `sysconf(_SC_LEVEL1_DCACHE_SIZE)` are unreliable, detecting via timing
// seems difficult to do reliably, and we do not want to maintain lists of
// known CPUs and their properties. Returning false is OK because callers
// are responsible for falling back to reasonable defaults.
#ifndef __ANDROID__
HWY_WARN("sysfs detected L1=%u L2=%u, err %x\n", caches[1].size_kib,
caches[2].size_kib, errno);
#endif
return false;
}
// L3 is optional; if not found, its size is already zero from static init.
return true;
}
#elif HWY_OS_WIN
bool InitCachesWin(Caches& caches) {
std::vector<hwy::Topology::LP> lps(TotalLogicalProcessors());
const size_t max_lps_per_core = MaxLpsPerCore(lps);
(void)ForEachSLPI(RelationCache, [max_lps_per_core,
&caches](const SLPI& info) {
const CACHE_RELATIONSHIP& cr = info.Cache;
if (cr.Type != CacheUnified && cr.Type != CacheData) return;
if (1 <= cr.Level && cr.Level <= 3) {
Cache& c = caches[cr.Level];
// If the size is non-zero then we (probably) have already detected this
// cache and can skip the CR.
if (c.size_kib > 0) return;
c.size_kib = static_cast<uint32_t>(DivByFactor(cr.CacheSize, 1024));
c.bytes_per_line = static_cast<uint16_t>(cr.LineSize);
c.associativity = (cr.Associativity == CACHE_FULLY_ASSOCIATIVE)
? Cache::kMaxAssociativity
: cr.Associativity;
// How many cores share this cache?
size_t shared_with = NumBits(cr.GroupCount, cr.GroupMasks);
// Divide out hyperthreads. This core may have fewer than
// `max_lps_per_core`, hence round up.
shared_with = DivCeil(shared_with, max_lps_per_core);
if (shared_with == 0) {
HWY_WARN("no cores sharing L%u, setting to 1\n", cr.Level);
shared_with = 1;
}
// Update `size_kib` to *per-core* portion.
// There exist CPUs for which L3 is not evenly divisible by `shared_with`,
// hence do not use `DivByFactor`. It is safer to round down.
c.size_kib = static_cast<uint32_t>(c.size_kib / shared_with);
c.cores_sharing = static_cast<uint16_t>(shared_with);
}
});
// Require L1 and L2 cache.
if (HWY_UNLIKELY(caches[1].size_kib == 0 || caches[2].size_kib == 0)) {
HWY_WARN("Windows detected L1=%u, L2=%u, err %lx\n", caches[1].size_kib,
caches[2].size_kib, GetLastError());
return false;
}
// L3 is optional; if not found, its size is already zero from static init.
return true;
}
#elif HWY_OS_APPLE
bool InitCachesApple(Caches& caches) {
int err = 0;
Cache& L1 = caches[1];
Cache& L2 = caches[2];
Cache& L3 = caches[3];
// Total L1 and L2 size can be reliably queried, but prefer perflevel0
// (P-cores) because hw.l1dcachesize etc. are documented to describe the
// "least performant core".
bool ok = Sysctl("hw.perflevel0.l1dcachesize", 1024, err, &L1.size_kib) ||
Sysctl("hw.l1dcachesize", 1024, err, &L1.size_kib);
ok &= Sysctl("hw.perflevel0.l2cachesize", 1024, err, &L2.size_kib) ||
Sysctl("hw.l2cachesize", 1024, err, &L2.size_kib);
if (HWY_UNLIKELY(!ok)) {
HWY_WARN("Apple cache detection failed, error %d\n", err);
return false;
}
L1.cores_sharing = 1;
if (Sysctl("hw.perflevel0.cpusperl2", 1, err, &L2.cores_sharing)) {
// There exist CPUs for which L2 is not evenly divisible by `cores_sharing`,
// hence do not use `DivByFactor`. It is safer to round down.
L2.size_kib /= L2.cores_sharing;
} else {
L2.cores_sharing = 1;
}
// Other properties are not always reported. Set `associativity` and
// `bytes_per_line` based on known models.
char brand[128] = {0};
size_t size = sizeof(brand);
if (!sysctlbyname("machdep.cpu.brand_string", brand, &size, nullptr, 0)) {
if (strncmp(brand, "Apple ", 6) != 0) {
// Unexpected, but continue checking the string suffixes below.
HWY_WARN("unexpected Apple brand %s\n", brand);
}
if (brand[6] == 'M') {
L1.bytes_per_line = 64;
L1.associativity = 8;
L2.bytes_per_line = 128;
if (brand[7] == '1') { // M1
L2.associativity = 12;
} else if ('2' <= brand[7] && brand[7] <= '4') { // M2/M3, maybe also M4
L2.associativity = 16;
} else {
L2.associativity = 0; // Unknown, set below via sysctl.
}
// Although Wikipedia lists SLC sizes per model, we do not know how it is
// partitioned/allocated, so do not treat it as a reliable L3.
} // M*
} // brand string
// This sysctl does not distinguish between L1 and L2 line sizes, so only use
// it if we have not already set `bytes_per_line` above.
uint16_t bytes_per_line;
if (!Sysctl("hw.cachelinesize", 1, err, &bytes_per_line)) {
bytes_per_line = static_cast<uint16_t>(HWY_ALIGNMENT); // guess
}
for (size_t level = 1; level <= 3; ++level) {
if (caches[level].bytes_per_line == 0) {
caches[level].bytes_per_line = bytes_per_line;
}
}
// Fill in associativity if not already set. Unfortunately this is only
// reported on x86, not on M*.
if (L1.associativity == 0 && !Sysctl("machdep.cpu.cache.L1_associativity", 1,
err, &L1.associativity)) {
L1.associativity = 8; // guess
}
if (L2.associativity == 0 && !Sysctl("machdep.cpu.cache.L2_associativity", 1,
err, &L2.associativity)) {
L2.associativity = 12; // guess
}
// There is no L3_associativity.
if (L3.associativity == 0) {
L3.associativity = 12; // guess
}
// Now attempt to query L3. Although this sysctl is documented, M3 does not
// report an L3 cache.
if (L3.size_kib == 0 &&
(Sysctl("hw.perflevel0.l3cachesize", 1024, err, &L3.size_kib) ||
Sysctl("hw.l3cachesize", 1024, err, &L3.size_kib))) {
// There exist CPUs for which L3 is not evenly divisible by `cores_sharing`,
// hence do not use `DivByFactor`. It is safer to round down.
if (Sysctl("hw.perflevel0.cpusperl3", 1, err, &L3.cores_sharing)) {
L3.size_kib /= L3.cores_sharing;
} else {
L3.cores_sharing = 1;
}
}
// If no L3 cache, reset all fields for consistency.
if (L3.size_kib == 0) {
L3 = Cache();
}
// Are there other useful sysctls? hw.cacheconfig appears to report how many
// cores share memory and each cache level, but this is not documented and
// duplicates the information in hw.perflevel0.cpusperl*.
return true;
}
#endif // HWY_OS_*
// Most APIs do not set the `sets` field, so compute it from the size, line
// size, and associativity; if a value is already set, ensure it matches.
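// For example, a 32 KiB cache with 64-byte lines and 8-way associativity has
// 32768 / 64 = 512 lines and 512 / 8 = 64 sets.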
HWY_MAYBE_UNUSED void ComputeSets(Cache& c) {
// If there is no such cache, avoid division by zero.
if (HWY_UNLIKELY(c.size_kib == 0)) {
c.sets = 0;
return;
}
const size_t bytes = static_cast<size_t>(c.size_kib) * 1024;
// `size_kib` may have been rounded down, hence `lines` and `sets` are not
// necessarily evenly divisible, so round down instead of `DivByFactor`.
const size_t lines = bytes / c.bytes_per_line;
const size_t sets = lines / c.associativity;
if (c.sets == 0) {
c.sets = static_cast<uint32_t>(sets);
} else {
// `size_kib` may have been rounded down above, so tolerate off-by-one in
// either direction; note an unsigned subtraction could otherwise wrap.
const size_t diff = (c.sets > sets) ? (c.sets - sets) : (sets - c.sets);
if (diff > 1) {
HWY_ABORT("Inconsistent cache sets %u != %zu\n", c.sets, sets);
}
}
}
const Cache* InitDataCaches() {
alignas(64) static Caches caches;
// On failure, return immediately because InitCaches*() already warn.
#if HWY_OS_LINUX
if (HWY_UNLIKELY(!InitCachesSysfs(caches))) return nullptr;
#elif HWY_OS_WIN
if (HWY_UNLIKELY(!InitCachesWin(caches))) return nullptr;
#elif HWY_OS_APPLE
if (HWY_UNLIKELY(!InitCachesApple(caches))) return nullptr;
#else
HWY_WARN("Cache detection not implemented for this platform.\n");
(void)caches;
return nullptr;
#define HWY_NO_CACHE_DETECTION
#endif
// Prevents "code not reached" warnings on WASM.
#ifndef HWY_NO_CACHE_DETECTION
for (size_t level = 1; level <= 3; ++level) {
ComputeSets(caches[level]);
}
// Heuristic to ignore SLCs such as on Ampere Altra, which should not be
// treated as a reliable L3 because of their cache inclusion policy.
// On Apple M*, these are not even reported as an L3.
if (caches[3].cores_sharing >= 16 && caches[3].size_kib <= 512) {
caches[3] = Cache();
}
return &caches[0];
#endif // HWY_NO_CACHE_DETECTION
}
} // namespace
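// Returns a pointer to an array of `Cache` indexed by level (1..3), or
// nullptr if detection failed. Usage sketch (hypothetical caller):
//   const hwy::Cache* caches = hwy::DataCaches();
//   if (caches != nullptr) {
//     const size_t l1_bytes = size_t{caches[1].size_kib} << 10;
//   }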
HWY_CONTRIB_DLLEXPORT const Cache* DataCaches() {
static const Cache* caches = InitDataCaches();
return caches;
}
} // namespace hwy