// Copyright 2024 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/perf_counters.h"
#include "hwy/detect_compiler_arch.h" // HWY_OS_LINUX
#if HWY_OS_LINUX || HWY_IDE
#include <errno.h>
#include <fcntl.h> // open
#include <linux/perf_event.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h> // strcmp
#include <sys/ioctl.h>
#include <sys/prctl.h>
#include <sys/stat.h> // O_RDONLY
#include <sys/syscall.h>
#include <sys/utsname.h>
#include <unistd.h>

#include <string>
#include <vector>

#include "hwy/base.h" // HWY_ASSERT
#include "hwy/bit_set.h"
#include "hwy/timer.h"
#endif // HWY_OS_LINUX || HWY_IDE

namespace hwy {
namespace platform {

#if HWY_OS_LINUX || HWY_IDE
namespace {

bool PerfCountersSupported() {
// Checking for the existence of this file is the documented way to detect
// support, see perf_event_open(2).
struct stat s;
return stat("/proc/sys/kernel/perf_event_paranoid", &s) == 0;
}
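
// Note: the value in that file further restricts unprivileged use. Per
// perf_event_open(2), a value >= 2 allows only user-space measurement, hence
// `exclude_kernel` in `MakeAttr` below.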

// If we detect Linux < 6.9 and AMD EPYC, use cycles instead of ref-cycles,
// because the latter is not supported there and returns 0.
uint64_t RefCyclesOrCycles() {
const uint32_t ref_cycles = PERF_COUNT_HW_REF_CPU_CYCLES;
utsname buf;
if (uname(&buf) != 0) return ref_cycles;
if (std::string(buf.sysname) != "Linux") return ref_cycles;
int major, minor;
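// Parse "major.minor" from the release string; e.g. "6.8.0-49-generic"
// yields major = 6, minor = 8.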
if (sscanf(buf.release, "%d.%d", &major, &minor) != 2) return ref_cycles;
if (major > 6 || (major == 6 && minor >= 9)) return ref_cycles;
// Now check the CPU brand string for AMD EPYC (Zen 4).
char cpu100[100];
if (!GetCpuString(cpu100)) return ref_cycles;
if (std::string(cpu100).rfind("AMD EPYC", 0) != 0) return ref_cycles;
return PERF_COUNT_HW_CPU_CYCLES;
}

struct CounterConfig { // for perf_event_open
uint64_t config;
uint32_t type;
PerfCounters::Counter c;
};

std::vector<CounterConfig> AllCounterConfigs() {
constexpr uint32_t kHW = PERF_TYPE_HARDWARE;
constexpr uint32_t kSW = PERF_TYPE_SOFTWARE;
constexpr uint32_t kC = PERF_TYPE_HW_CACHE;
constexpr uint64_t kL3 = PERF_COUNT_HW_CACHE_LL;
constexpr uint64_t kLoad = uint64_t{PERF_COUNT_HW_CACHE_OP_READ} << 8;
constexpr uint64_t kStore = uint64_t{PERF_COUNT_HW_CACHE_OP_WRITE} << 8;
constexpr uint64_t kAcc = uint64_t{PERF_COUNT_HW_CACHE_RESULT_ACCESS} << 16;
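// PERF_TYPE_HW_CACHE configs are encoded per perf_event_open(2) as
// cache id | (op id << 8) | (result id << 16); the constants above are
// pre-shifted so they can simply be ORed together below.
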
// Order is important for bin-packing event groups. x86 can only handle two
// LLC-related events per group, so spread them out and arrange SW events
// such that they do not start a new group. This list of counters may change.
return {{RefCyclesOrCycles(), kHW, PerfCounters::kRefCycles},
{PERF_COUNT_HW_INSTRUCTIONS, kHW, PerfCounters::kInstructions},
{PERF_COUNT_SW_PAGE_FAULTS, kSW, PerfCounters::kPageFaults},
{kL3 | kLoad | kAcc, kC, PerfCounters::kL3Loads},
{kL3 | kStore | kAcc, kC, PerfCounters::kL3Stores},
{PERF_COUNT_HW_BRANCH_INSTRUCTIONS, kHW, PerfCounters::kBranches},
{PERF_COUNT_HW_BRANCH_MISSES, kHW, PerfCounters::kBranchMispredicts},
// Second group:
{PERF_COUNT_HW_BUS_CYCLES, kHW, PerfCounters::kBusCycles},
{PERF_COUNT_SW_CPU_MIGRATIONS, kSW, PerfCounters::kMigrations},
{PERF_COUNT_HW_CACHE_REFERENCES, kHW, PerfCounters::kCacheRefs},
{PERF_COUNT_HW_CACHE_MISSES, kHW, PerfCounters::kCacheMisses}};
}
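
// Returns a reference to the packed index of counter `c`. Counter enum values
// double as bit positions in `valid_` (hence the 64 entries), whereas packed
// indices address the dense array of successfully opened counters.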
size_t& PackedIdx(PerfCounters::Counter c) {
static size_t packed_idx[64];
return packed_idx[static_cast<size_t>(c)];
}

class PMU {
static perf_event_attr MakeAttr(const CounterConfig& cc) {
perf_event_attr attr = {};
attr.type = cc.type;
attr.size = sizeof(attr);
attr.config = cc.config;
// We request more counters than the HW may support. If so, they are
// multiplexed and only active for a fraction of the runtime. Recording the
// times lets us extrapolate. GROUP enables a single syscall to reduce the
// cost of reading.
attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_GROUP;
// Do not set inherit=1 because that conflicts with PERF_FORMAT_GROUP.
// Do not set disabled=1, so that perf_event_open verifies all events in the
// group can be scheduled together.
attr.exclude_kernel = 1; // required if perf_event_paranoid >= 2
attr.exclude_hv = 1; // = hypervisor
return attr;
}

static int SysPerfEventOpen(const CounterConfig& cc, int leader_fd) {
perf_event_attr attr = MakeAttr(cc);
const int pid = 0;  // current process (pid and cpu must not both be -1)
const int cpu = -1; // any CPU
// Retry if interrupted by signals; this actually happens (b/64774091).
for (int retry = 0; retry < 10; ++retry) {
const int flags = 0;
const int fd = static_cast<int>(
syscall(__NR_perf_event_open, &attr, pid, cpu, leader_fd, flags));
if (!(fd == -1 && errno == EINTR)) return fd;
}
HWY_WARN("perf_event_open retries were insufficient.");
return -1;
}
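
// Note: glibc provides no wrapper for perf_event_open, hence the raw
// syscall() above; see perf_event_open(2).
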
// Reads from `fd`; recovers from interruptions before/during the read.
static bool ReadBytes(int fd, ssize_t size, void* to) {
uint8_t* bytes = reinterpret_cast<uint8_t*>(to);
ssize_t pos = 0;
for (int retry = 0; retry < 10; ++retry) {
const ssize_t bytes_read =
read(fd, bytes + pos, static_cast<size_t>(size - pos));
if (HWY_UNLIKELY(bytes_read <= 0)) {
if (errno == EINTR) continue;
HWY_WARN("perf read() failed, errno %d.", errno);
return false;
}
pos += bytes_read;
HWY_ASSERT(pos <= size);
if (HWY_LIKELY(pos == size)) return true; // success
}
HWY_WARN("perf read() wanted %d bytes, got %d.", static_cast<int>(size),
static_cast<int>(pos));
return false;
}
// Array size in Buf; this is another upper bound on group size. It should be
// loose because it only wastes a bit of stack space, whereas an unnecessary
// extra group decreases coverage. Most HW supports 4-8 counters per group.
static constexpr size_t kMaxEventsPerGroup = PerfCounters::kCapacity;
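
// Layout written by the kernel for PERF_FORMAT_GROUP plus both time fields,
// per perf_event_open(2): nr, time_enabled, time_running, then one u64 value
// per event (no ids because PERF_FORMAT_ID is not set). `Buf` mirrors this
// with a fixed-capacity values array.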
#pragma pack(push, 1)
struct Buf {
uint64_t num_events;
uint64_t time_enabled;
uint64_t time_running;
uint64_t values[kMaxEventsPerGroup];
};
#pragma pack(pop)

// Returns false on error, otherwise sets `extrapolate` and `values`.
static bool ReadAndExtrapolate(int fd, size_t num_events, double& extrapolate,
double* HWY_RESTRICT values) {
Buf buf;
// Size of the variable-length `Buf` actually written by the kernel: three
// leading uint64_t fields (24 bytes) plus one per event.
const ssize_t want_bytes =
static_cast<ssize_t>(24 + num_events * sizeof(uint64_t));
if (HWY_UNLIKELY(!ReadBytes(fd, want_bytes, &buf))) return false;
HWY_DASSERT(num_events == buf.num_events);
HWY_DASSERT(buf.time_running <= buf.time_enabled);
// If the group was not yet scheduled, we must avoid division by zero.
// In case counters were previously running and not reset, their current
// values may be nonzero. Returning zero could be interpreted as counters
// running backwards, so we instead treat this as a failure and mark the
// counters as invalid.
if (HWY_UNLIKELY(buf.time_running == 0)) return false;
// Extrapolate each value: scale raw counts by enabled/running time. For
// example, a group scheduled for a quarter of the enabled time has its
// counts multiplied by 4.
extrapolate = static_cast<double>(buf.time_enabled) /
static_cast<double>(buf.time_running);
for (size_t i = 0; i < buf.num_events; ++i) {
values[i] = static_cast<double>(buf.values[i]) * extrapolate;
}
return true;
}

public:
bool Init() {
// Allow callers who do not know about each other to each call `Init`.
// If this already succeeded, we're done; if not, we will try again.
if (HWY_UNLIKELY(!fds_.empty())) return true;
if (HWY_UNLIKELY(!PerfCountersSupported())) {
HWY_WARN(
"This Linux does not support perf counters. The program will "
"continue, but counters will return zero.");
return false;
}
groups_.push_back(Group());
fds_.reserve(PerfCounters::kCapacity);
for (const CounterConfig& config : AllCounterConfigs()) {
// If the group is limited by our buffer size, add a new one.
if (HWY_UNLIKELY(groups_.back().num_events == kMaxEventsPerGroup)) {
groups_.push_back(Group());
}
int fd = SysPerfEventOpen(config, groups_.back().leader_fd);
// Retry in case the group is limited by HW capacity. Do not check
// errno because it is too inconsistent (ENOSPC, EINVAL, others?).
if (HWY_UNLIKELY(fd < 0)) {
fd = SysPerfEventOpen(config, /*leader_fd=*/-1);
if (fd >= 0 && groups_.back().num_events != 0) {
groups_.push_back(Group());
}
}
if (HWY_UNLIKELY(fd < 0)) {
HWY_WARN("perf_event_open %d errno %d for counter %s.", fd, errno,
PerfCounters::Name(config.c));
} else {
// Add to group and set as leader if empty.
if (groups_.back().leader_fd == -1) {
groups_.back().leader_fd = fd;
// Ensure the leader is not a SW event, because adding an HW
// event to a group with only SW events is slow, and starting
// with SW may trigger a bug.
if (HWY_UNLIKELY(config.type == PERF_TYPE_SOFTWARE)) {
HWY_WARN("SW event %s should not be leader.",
PerfCounters::Name(config.c));
}
}
PackedIdx(config.c) = fds_.size();
groups_.back().num_events += 1;
valid_.Set(static_cast<size_t>(config.c));
fds_.push_back(fd);
}
}
// If no counters are available, remove the empty group.
if (HWY_UNLIKELY(fds_.empty())) {
HWY_ASSERT(groups_.size() == 1);
HWY_ASSERT(groups_.back().num_events == 0);
HWY_ASSERT(groups_.back().leader_fd == -1);
groups_.clear();
}
size_t num_valid = 0;
for (const Group& group : groups_) {
num_valid += group.num_events;
// All groups have a leader and are not empty.
HWY_ASSERT(group.leader_fd >= 0);
HWY_ASSERT(0 != group.num_events &&
group.num_events <= kMaxEventsPerGroup);
}
// Total `num_events` matches `fds_` and `Valid()`.
HWY_ASSERT(num_valid == fds_.size());
HWY_ASSERT(num_valid == valid_.Count());
HWY_ASSERT(num_valid <= PerfCounters::kCapacity);
if (num_valid) {
StopAllAndReset();
return true;
} else {
HWY_WARN("No valid counters found.");
return false;
}
}

bool StartAll() {
if (HWY_UNLIKELY(fds_.empty())) return false;
HWY_ASSERT(prctl(PR_TASK_PERF_EVENTS_ENABLE) == 0);
return true;
}

void StopAllAndReset() {
HWY_ASSERT(prctl(PR_TASK_PERF_EVENTS_DISABLE) == 0);
for (int fd : fds_) {
HWY_ASSERT(ioctl(fd, PERF_EVENT_IOC_RESET, 0) == 0);
}
}

// Returns false on error, otherwise sets `valid`, `max_extrapolate`, and
// `values`.
bool Read(BitSet64& valid, double& max_extrapolate, double* values) {
if (HWY_UNLIKELY(!valid_.Any())) return false;
// Read all counters into buffer in the order in which they were opened.
max_extrapolate = 1.0;
double* pos = values;
for (const Group& group : groups_) {
double extrapolate;
if (HWY_UNLIKELY(!ReadAndExtrapolate(group.leader_fd, group.num_events,
extrapolate, pos))) {
return false;
}
max_extrapolate = HWY_MAX(max_extrapolate, extrapolate);
pos += group.num_events;
}
valid = valid_;
HWY_DASSERT(pos == values + valid.Count());
return true;
}

private:
std::vector<int> fds_; // one fd per set bit in `valid_`
BitSet64 valid_;

struct Group {
size_t num_events = 0;
int leader_fd = -1;
};
std::vector<Group> groups_;
};

// Monostate, see header.
PMU& GetPMU() {
static PMU pmu;
return pmu;
}
} // namespace

HWY_DLLEXPORT bool PerfCounters::Init() { return GetPMU().Init(); }
HWY_DLLEXPORT bool PerfCounters::StartAll() { return GetPMU().StartAll(); }
HWY_DLLEXPORT void PerfCounters::StopAllAndReset() {
GetPMU().StopAllAndReset();
}

HWY_DLLEXPORT PerfCounters::PerfCounters() {
if (HWY_UNLIKELY(!GetPMU().Read(valid_, max_extrapolate_, values_))) {
valid_ = BitSet64();
max_extrapolate_ = 0.0;
hwy::ZeroBytes(values_, sizeof(values_));
}
}

HWY_DLLEXPORT size_t PerfCounters::IndexForCounter(Counter c) {
return PackedIdx(c);
}
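
// Usage sketch (illustrative, not part of the library): initialize once,
// start counting, run the region of interest, then construct a snapshot.
// `Workload()` is a placeholder for caller code; everything else is the API
// defined in this file.
//
//   if (hwy::platform::PerfCounters::Init() &&
//       hwy::platform::PerfCounters::StartAll()) {
//     Workload();
//     const hwy::platform::PerfCounters counters; // snapshot of all groups
//     hwy::platform::PerfCounters::StopAllAndReset();
//   }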

#else

HWY_DLLEXPORT bool PerfCounters::Init() { return false; }
HWY_DLLEXPORT bool PerfCounters::StartAll() { return false; }
HWY_DLLEXPORT void PerfCounters::StopAllAndReset() {}
HWY_DLLEXPORT PerfCounters::PerfCounters()
: max_extrapolate_(1.0), values_{0.0} {}
HWY_DLLEXPORT size_t PerfCounters::IndexForCounter(Counter) { return 0; }
#endif // HWY_OS_LINUX || HWY_IDE

} // namespace platform
} // namespace hwy