// Copyright 2025 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_AUTO_TUNE_H_
#define HIGHWAY_HWY_AUTO_TUNE_H_
#include <stddef.h>
#include <stdint.h>
#include <string.h> // memmove
#include <cmath>
#include <vector>
#include "hwy/aligned_allocator.h" // Span
#include "hwy/base.h" // HWY_MIN
#include "hwy/contrib/sort/vqsort.h"
// Infrastructure for auto-tuning (choosing optimal parameters at runtime).
namespace hwy {
// O(1) storage to estimate the central tendency of hundreds of independent
// distributions (one per configuration). The number of samples per distribution
// (`kMinSamples`) varies from a few to dozens. We support both regimes by
// first storing values in a buffer and, when full, switching to online
// variance estimation.
// Modified from `hwy/stats.h`.
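//
// Typical usage (a minimal sketch; the sample values are purely illustrative,
// not from real measurements):
//
//   CostDistribution dist;
//   for (double elapsed : {120.0, 118.0, 500.0, 121.0}) {  // hypothetical
//     dist.Notify(elapsed);
//   }
//   const double cost = dist.EstimateCost();  // mitigates the 500.0 outlier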
class CostDistribution {
public:
static constexpr size_t kMaxValues = 14; // for total size of 128 bytes
void Notify(const double x) {
if (HWY_UNLIKELY(x < 0.0)) {
HWY_WARN("Ignoring negative cost %f.", x);
return;
}
// Online phase after filling and warm-up.
if (HWY_LIKELY(IsOnline())) return OnlineNotify(x);
// Fill phase: store up to `kMaxValues` values.
values_[num_values_++] = x;
HWY_DASSERT(num_values_ <= kMaxValues);
if (HWY_UNLIKELY(num_values_ == kMaxValues)) {
WarmUpOnline();
HWY_DASSERT(IsOnline());
}
}
// Returns an estimate of the true cost, mitigating the impact of noise.
//
// Background and observations from time measurements in `thread_pool.h`:
// - We aim for O(1) storage because there may be hundreds of instances.
// - The mean is biased upwards by mostly additive noise: particularly
// interruptions such as context switches, but also contention.
// - The minimum is not a robust estimator because there are also "lucky
// shots" (1.2-1.6x lower values) where interruptions or contention happen
// to be low.
// - We want to preserve information about contention and a configuration's
// sensitivity to it. Otherwise, we are optimizing for the best-case, not
// the common case.
// - It is still important to minimize the influence of outliers, such as page
// faults, which can cause multiple times larger measurements.
// - Detecting outliers based only on the initial variance is too brittle. If
// the sample is narrow, measurements will fluctuate across runs because
// too many measurements are considered outliers. This would cause the
// 'best' configuration to vary.
//
// Approach:
// - Use Winsorization to reduce the impact of outliers, while preserving
// information on the central tendency.
// - Continually update the thresholds based on the online variance, with
// exponential smoothing for stability.
// - Trim the initial sample via MAD or skewness for a robust estimate of the
// variance.
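//
// Illustrative numbers (not actual measurements): with smoothed Mean() = 100
// and Stddev() = 10, `OnlineNotify` clamps subsequent samples to
// [100 - 3.5 * 10, 100 + 3.5 * 10] = [65, 135], so a page-fault outlier of
// 500 contributes at most 135 to the running mean.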
double EstimateCost() {
if (!IsOnline()) {
WarmUpOnline();
HWY_DASSERT(IsOnline());
}
return Mean();
}
// Multiplex online state into values_ to allow higher `kMaxValues`.
// Public for inspection in tests. Do not use directly.
double& M1() { return values_[0]; } // Moments for variance.
double& M2() { return values_[1]; }
double& Mean() { return values_[2]; } // Exponential smoothing.
double& Stddev() { return values_[3]; }
double& Lower() { return values_[4]; }
double& Upper() { return values_[5]; }
private:
static double Median(double* to_sort, size_t n) {
HWY_DASSERT(n >= 2);
// F64 is supported everywhere except Armv7.
#if !HWY_ARCH_ARM_V7
VQSort(to_sort, n, SortAscending());
#else
// Values are known to be finite and non-negative, hence sorting as U64 is
// equivalent.
VQSort(reinterpret_cast<uint64_t*>(to_sort), n, SortAscending());
#endif
if (n & 1) return to_sort[n / 2];
// Even length: average of two middle elements.
return (to_sort[n / 2] + to_sort[n / 2 - 1]) * 0.5;
}
static double MAD(const double* values, size_t n, const double median) {
double abs_dev[kMaxValues];
for (size_t i = 0; i < n; ++i) {
abs_dev[i] = ScalarAbs(values[i] - median);
}
return Median(abs_dev, n);
}
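// Illustration (hypothetical sample): for {1, 2, 2, 2, 100}, the median is 2
// and the MAD is 0 because more than half of the values equal the median;
// this is the `mad == 0.0` case handled by `RemoveOutliers` below.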
// If `num_values_` is large enough, sorts and discards outliers: either via
// MAD, or if too many values are equal, by trimming according to skewness.
void RemoveOutliers() {
if (num_values_ < 3) return; // Not enough to discard two.
HWY_DASSERT(num_values_ <= kMaxValues);
// Given the noise level in `auto_tune_test`, it can happen that 1/4 of the
// sample is an outlier *in either direction*. Use median absolute
// deviation, which is robust to almost half of the sample being outliers.
const double median = Median(values_, num_values_); // sorts in-place.
const double mad = MAD(values_, num_values_, median);
// At least half the sample is equal.
if (mad == 0.0) {
// Estimate skewness to decide which side to trim more.
const double skewness =
(values_[num_values_ - 1] - median) - (median - values_[0]);
const size_t trim = HWY_MAX(num_values_ / 2, size_t{2});
const size_t left =
HWY_MAX(skewness < 0.0 ? trim * 3 / 4 : trim / 4, size_t{1});
num_values_ -= trim;
HWY_DASSERT(num_values_ >= 1);
memmove(values_, values_ + left, num_values_ * sizeof(values_[0]));
return;
}
const double upper = median + 5.0 * mad;
const double lower = median - 5.0 * mad;
size_t right = num_values_ - 1;
while (values_[right] > upper) --right;
// Nonzero MAD implies no more than half are equal, so we did not advance
// beyond the median.
HWY_DASSERT(right >= num_values_ / 2);
size_t left = 0;
while (left < right && values_[left] < lower) ++left;
HWY_DASSERT(left <= num_values_ / 2);
num_values_ = right - left + 1;
memmove(values_, values_ + left, num_values_ * sizeof(values_[0]));
}
double SampleMean() const {
// Only called in non-online phase, but buffer might not be full.
HWY_DASSERT(!IsOnline() && 0 != num_values_ && num_values_ <= kMaxValues);
double sum = 0.0;
for (size_t i = 0; i < num_values_; ++i) {
sum += values_[i];
}
return sum / static_cast<double>(num_values_);
}
// Unbiased estimator for population variance even for small `num_values_`.
double SampleVariance(double sample_mean) const {
HWY_DASSERT(sample_mean >= 0.0); // we checked costs are non-negative.
// Only called in non-online phase, but buffer might not be full.
HWY_DASSERT(!IsOnline() && 0 != num_values_ && num_values_ <= kMaxValues);
if (HWY_UNLIKELY(num_values_ == 1)) return 0.0; // prevent divide-by-zero.
double sum2 = 0.0;
for (size_t i = 0; i < num_values_; ++i) {
const double d = values_[i] - sample_mean;
sum2 += d * d;
}
return sum2 / static_cast<double>(num_values_ - 1);
}
bool IsOnline() const { return online_n_ > 0.0; }
void OnlineNotify(double x) {
// Winsorize.
x = HWY_MIN(HWY_MAX(Lower(), x), Upper());
// Welford's online variance estimator.
const double n_minus_1 = online_n_;
online_n_ += 1.0;
const double d = x - M1();
const double d_div_n = d / online_n_;
M1() += d_div_n;
HWY_DASSERT(M1() >= Lower());
M2() += d * n_minus_1 * d_div_n; // d^2 * (N-1)/N
// HWY_MAX avoids divide-by-zero.
const double stddev = std::sqrt(M2() / HWY_MAX(1.0, n_minus_1));
// Exponential smoothing.
constexpr double kNew = 0.2; // relatively fast update
constexpr double kOld = 1.0 - kNew;
Mean() = M1() * kNew + Mean() * kOld;
Stddev() = stddev * kNew + Stddev() * kOld;
// Update thresholds from the smoothed mean and stddev to enable recovery
// from a too-narrow initial range caused by excessive trimming.
Lower() = Mean() - 3.5 * Stddev();
Upper() = Mean() + 3.5 * Stddev();
}
void WarmUpOnline() {
RemoveOutliers();
// Compute and copy before writing to `M1`, which overwrites `values_`!
const double sample_mean = SampleMean();
const double sample_variance = SampleVariance(sample_mean);
double copy[kMaxValues];
hwy::CopyBytes(values_, copy, num_values_ * sizeof(values_[0]));
M1() = M2() = 0.0;
Mean() = sample_mean;
Stddev() = std::sqrt(sample_variance);
// For a single-value or all-equal sample, widen the range; otherwise we
// would only accept that same value.
if (Stddev() == 0.0) Stddev() = Mean() / 2;
// High tolerance because the distribution is not actually Gaussian, and
// we trimmed up to *half*, and do not want to reject too many values in
// the online phase.
Lower() = Mean() - 4.0 * Stddev();
Upper() = Mean() + 4.0 * Stddev();
// Feed copied values into online estimator.
for (size_t i = 0; i < num_values_; ++i) {
OnlineNotify(copy[i]);
}
HWY_DASSERT(IsOnline());
#if SIZE_MAX == 0xFFFFFFFFu
(void)padding_;
#endif
}
size_t num_values_ = 0; // size of `values_` <= `kMaxValues`
#if SIZE_MAX == 0xFFFFFFFFu
uint32_t padding_ = 0;
#endif
double online_n_ = 0.0; // number of calls to `OnlineNotify`.
double values_[kMaxValues];
};
static_assert(sizeof(CostDistribution) == 128, "");
// Implements a counter with wrap-around, plus the ability to skip values.
// O(1) time, O(N) space via doubly-linked list of indices.
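//
// Example (a small sketch with illustrative values): for `num` = 4, `Next`
// cycles 0 -> 1 -> 2 -> 3 -> 0; after `Skip(2)`, it cycles 0 -> 1 -> 3 -> 0.
//
//   NextWithSkip list(4);
//   size_t pos = list.Next(1);  // == 2
//   list.Skip(2);
//   pos = list.Next(1);         // == 3; position 2 is no longer returned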
class NextWithSkip {
public:
NextWithSkip() {}
explicit NextWithSkip(size_t num) {
links_.reserve(num);
for (size_t i = 0; i < num; ++i) {
links_.emplace_back(i, num);
}
}
size_t Next(size_t pos) {
HWY_DASSERT(pos < links_.size());
HWY_DASSERT(!links_[pos].IsRemoved());
return links_[pos].Next();
}
// Must not be called for an already skipped position. Ignores an attempt to
// skip the last remaining position.
void Skip(size_t pos) {
HWY_DASSERT(!links_[pos].IsRemoved()); // not already skipped.
const size_t prev = links_[pos].Prev();
const size_t next = links_[pos].Next();
if (prev == pos || next == pos) return; // last remaining position.
links_[next].SetPrev(prev);
links_[prev].SetNext(next);
links_[pos].Remove();
}
private:
// Combine prev/next into one array to improve locality/reduce allocations.
class Link {
// Bit-shifts avoid potentially expensive 16-bit loads. Store `next` at the
// top and `prev` at the bottom for extraction with a single shift/AND.
// There may be hundreds of configurations, so 8 bits are not enough.
static constexpr size_t kBits = 14;
static constexpr size_t kShift = 32 - kBits;
static constexpr uint32_t kMaxNum = 1u << kBits;
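// Resulting layout: bits [31:18] hold `next`, bit 14 is the removed flag,
// bits [13:0] hold `prev`; bits [17:15] are unused.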
public:
Link(size_t pos, size_t num) {
HWY_DASSERT(num < kMaxNum);
const size_t prev = pos == 0 ? num - 1 : pos - 1;
const size_t next = pos == num - 1 ? 0 : pos + 1;
bits_ =
(static_cast<uint32_t>(next) << kShift) | static_cast<uint32_t>(prev);
HWY_DASSERT(Next() == next && Prev() == prev);
HWY_DASSERT(!IsRemoved());
}
bool IsRemoved() const { return (bits_ & kMaxNum) != 0; }
void Remove() { bits_ |= kMaxNum; }
size_t Next() const { return bits_ >> kShift; }
size_t Prev() const { return bits_ & (kMaxNum - 1); }
void SetNext(size_t next) {
HWY_DASSERT(next < kMaxNum);
bits_ &= (~0u >> kBits); // clear old next
bits_ |= static_cast<uint32_t>(next) << kShift;
HWY_DASSERT(Next() == next);
HWY_DASSERT(!IsRemoved());
}
void SetPrev(size_t prev) {
HWY_DASSERT(prev < kMaxNum);
bits_ &= ~(kMaxNum - 1); // clear old prev
bits_ |= static_cast<uint32_t>(prev);
HWY_DASSERT(Prev() == prev);
HWY_DASSERT(!IsRemoved());
}
private:
uint32_t bits_;
};
std::vector<Link> links_;
};
// State machine for choosing at runtime the lowest-cost `Config`, which is
// typically a struct containing multiple parameters. For an introduction, see
// "Auto-Tuning and Performance Portability on Heterogeneous Hardware".
//
// **Which parameters**
// Note that simple parameters such as the L2 cache size can be directly queried
// via `hwy/contrib/thread_pool/topology.h`. Difficult to predict parameters
// such as task granularity are more appropriate for auto-tuning. We also
// suggest that at least some parameters should also be 'algorithm variants'
// such as parallel vs. serial, or 2D tiling vs. 1D striping.
//
// **Search strategy**
// To guarantee the optimal result, we use exhaustive search, which is suitable
// for around 10 parameters and a few hundred combinations of 'candidate'
// configurations.
//
// **How to generate candidates**
// To keep this framework simple and generic, applications enumerate the search
// space and pass the list of all feasible candidates to `SetCandidates` before
// the first call to `NextConfig`. Applications should prune the space as much
// as possible, e.g. by upper-bounding parameters based on the known cache
// sizes, and applying constraints such as one being a multiple of another.
//
// **Usage**
// Applications typically branch to the code implementing the configuration
// returned by `NextConfig`, measure the cost of running it, and pass that to
// `NotifyCost`; see the example sketch below this comment block. Branching
// avoids the complexity and
// opaqueness of a JIT. The number of branches can be reduced (at the cost of
// code size) by inlining low-level decisions into larger code regions, e.g. by
// hoisting them outside hot loops.
//
// **What is cost**
// Cost is an arbitrary `uint64_t`, with lower values being better. Most
// applications will use the elapsed time. If the tasks being tuned are short,
// it is important to use a high-resolution timer such as `hwy/timer.h`.
// Energy consumption would also be a valid cost where it can be measured.
//
// **Online vs. offline**
// Although applications can auto-tune once, offline, it may be difficult to
// ensure the stored configuration still applies to the current circumstances.
// Thus we recommend online auto-tuning, re-discovering the configuration on
// each run. We assume the overhead of bookkeeping and measuring cost is
// negligible relative to the actual work. The cost of auto-tuning is then that
// of running sub-optimal configurations. Assuming the best configuration is
// better than baseline, and the work is performed many thousands of times, the
// cost is outweighed by the benefits.
//
// **kMinSamples**
// To further reduce overhead, after `kMinSamples` rounds (= measurements of
// each configuration) we start excluding configurations from further
// measurements if they are sufficiently worse than the current best.
// `kMinSamples` can be several dozen when the tasks being tuned take a few
// microseconds. Even for longer tasks, it should be at least 2 for some noise
// tolerance. After this, there are another `kMinSamples / 2 + 1` rounds before
// declaring the winner.
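//
// Example usage (a minimal sketch; `MyConfig`, `EnumerateMyConfigs`, `RunWith`
// and `TimerTicks` are hypothetical placeholders, not part of this header):
//
//   AutoTune<MyConfig, 2> tuner;
//   if (!tuner.HasCandidates()) tuner.SetCandidates(EnumerateMyConfigs());
//   while (!tuner.Best()) {
//     const MyConfig& config = tuner.NextConfig();
//     const uint64_t t0 = TimerTicks();  // high-resolution timestamp
//     RunWith(config);                   // branch to the code for `config`
//     tuner.NotifyCost(TimerTicks() - t0);
//   }
//   RunWith(*tuner.Best());  // e.g. with kMinSamples = 2, the winner is
//                            // declared after 3 * 2 / 2 + 1 = 4 rounds.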
template <typename Config, size_t kMinSamples = 2>
class AutoTune {
public:
// Returns non-null best configuration if auto-tuning has already finished.
// Otherwise, callers continue calling `NextConfig` and `NotifyCost`.
// Points into `Candidates()`.
const Config* Best() const { return best_; }
// If false, caller must call `SetCandidates` before `NextConfig`.
bool HasCandidates() const {
HWY_DASSERT(!Best());
return !candidates_.empty();
}
// WARNING: invalidates `Best()`, do not call if that is non-null.
void SetCandidates(std::vector<Config> candidates) {
HWY_DASSERT(!Best() && !HasCandidates());
candidates_.swap(candidates);
HWY_DASSERT(HasCandidates());
costs_.resize(candidates_.size());
list_ = NextWithSkip(candidates_.size());
}
// Typically called after Best() is non-null to compare all candidates' costs.
Span<const Config> Candidates() const {
HWY_DASSERT(HasCandidates());
return Span<const Config>(candidates_.data(), candidates_.size());
}
Span<CostDistribution> Costs() {
return Span<CostDistribution>(costs_.data(), costs_.size());
}
// Returns the current `Config` to measure.
const Config& NextConfig() const {
HWY_DASSERT(!Best() && HasCandidates());
return candidates_[config_idx_];
}
// O(1) except at the end of each round, which is O(N).
void NotifyCost(uint64_t cost) {
HWY_DASSERT(!Best() && HasCandidates());
costs_[config_idx_].Notify(static_cast<double>(cost));
// Save now before we update `config_idx_`.
const size_t my_idx = config_idx_;
// Only estimate once we have enough samples; otherwise we would switch to
// online variance before the buffer is fully populated.
const double my_cost = rounds_complete_ >= kMinSamples
? costs_[config_idx_].EstimateCost()
: 0.0;
// Advance to next non-skipped config with wrap-around. This decorrelates
// measurements by not immediately re-measuring the same config.
config_idx_ = list_.Next(config_idx_);
// Might still equal `my_idx` if this is the only non-skipped config.
// Disqualify from future `NextConfig` if cost was too far beyond the
// current best. This reduces the number of measurements, while tolerating
// noise in the first few measurements. Must happen after advancing.
if (my_cost > skip_if_above_) {
list_.Skip(my_idx);
}
// Wrap-around indicates the round is complete.
if (HWY_UNLIKELY(config_idx_ <= my_idx)) {
++rounds_complete_;
// Enough samples for stable estimates: update the thresholds.
if (rounds_complete_ >= kMinSamples) {
double best_cost = HighestValue<double>();
size_t idx_min = 0;
for (size_t i = 0; i < candidates_.size(); ++i) {
const double estimate = costs_[i].EstimateCost();
if (estimate < best_cost) {
best_cost = estimate;
idx_min = i;
}
}
skip_if_above_ = best_cost * 1.25;
// After sufficient rounds, declare the winner.
if (HWY_UNLIKELY(rounds_complete_ == 3 * kMinSamples / 2 + 1)) {
best_ = &candidates_[idx_min];
HWY_DASSERT(Best());
}
}
}
}
// Avoid printing during the first few rounds, because measurements might
// still be noisy and poor configurations not yet skipped.
bool ShouldPrint() { return rounds_complete_ > kMinSamples; }
private:
const Config* best_ = nullptr;
std::vector<Config> candidates_;
std::vector<CostDistribution> costs_; // one per candidate
size_t config_idx_ = 0; // [0, candidates_.size())
NextWithSkip list_;
size_t rounds_complete_ = 0;
double skip_if_above_ = 0.0;
};
} // namespace hwy
#endif // HIGHWAY_HWY_AUTO_TUNE_H_