/*
 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
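
// AVX2 forward transforms: vpx_fdct16x16_avx2 is implemented below, and the
// 32x32 forward DCT variants are pulled in from fwd_dct32x32_impl_avx2.h at
// the bottom of this file (non-high-bit-depth builds only).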
#include <immintrin.h> // AVX2
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/txfm_common.h"
#define ADD256_EPI16 _mm256_add_epi16
#define SUB256_EPI16 _mm256_sub_epi16
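
// Load out_size rows of 16 int16 values each. In the first pass (pass == 0)
// rows are read from the caller's input at the given stride and pre-scaled up
// by 4 (x << 2); in the second pass they are read from the 16-wide
// intermediate buffer and scaled back down with rounding ((x + 1) >> 2).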
static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in,
                                                    int stride, __m256i *out,
                                                    int out_size, int pass) {
  int i;
  const __m256i kOne = _mm256_set1_epi16(1);
  if (pass == 0) {
    for (i = 0; i < out_size; i++) {
      out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride));
      // x = x << 2
      out[i] = _mm256_slli_epi16(out[i], 2);
    }
  } else {
    for (i = 0; i < out_size; i++) {
      out[i] = _mm256_loadu_si256((const __m256i *)(in + i * 16));
      // x = (x + 1) >> 2
      out[i] = _mm256_add_epi16(out[i], kOne);
      out[i] = _mm256_srai_epi16(out[i], 2);
    }
  }
}
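
// Transpose two 8x8 blocks of 16-bit values in parallel: the AVX2 unpack
// instructions operate within 128-bit lanes, so the low lanes of in[0..7]
// form one 8x8 block and the high lanes form another, and each lane's block
// is transposed independently.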
static INLINE void transpose2_8x8_avx2(const __m256i *const in,
                                       __m256i *const out) {
  int i;
  __m256i t[16], u[16];
  // (1st, 2nd) ==> (lo, hi)
  // (0, 1) ==> (0, 1)
  // (2, 3) ==> (2, 3)
  // (4, 5) ==> (4, 5)
  // (6, 7) ==> (6, 7)
  for (i = 0; i < 4; i++) {
    t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]);
    t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]);
  }
  // (1st, 2nd) ==> (lo, hi)
  // (0, 2) ==> (0, 2)
  // (1, 3) ==> (1, 3)
  // (4, 6) ==> (4, 6)
  // (5, 7) ==> (5, 7)
  for (i = 0; i < 2; i++) {
    u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]);
    u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]);
    u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]);
    u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]);
  }
  // (1st, 2nd) ==> (lo, hi)
  // (0, 4) ==> (0, 1)
  // (1, 5) ==> (4, 5)
  // (2, 6) ==> (2, 3)
  // (3, 7) ==> (6, 7)
  for (i = 0; i < 2; i++) {
    out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]);
    out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]);
    out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]);
    out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]);
  }
}
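
// Transpose a 16x16 block of 16-bit values. LOADL gathers the left 8 columns
// of rows idx and idx + 8 into one register (one row per 128-bit lane) and
// LOADR gathers the right 8 columns; two lane-parallel 8x8 transposes then
// produce the top and bottom halves of the transposed block.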
static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
                                              __m256i *const out) {
  __m256i t[16];

#define LOADL(idx) \
  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
  t[idx] = _mm256_inserti128_si256( \
      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);

#define LOADR(idx) \
  t[8 + idx] = \
      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
  t[8 + idx] = _mm256_inserti128_si256( \
      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);

  // load left 8x16
  LOADL(0)
  LOADL(1)
  LOADL(2)
  LOADL(3)
  LOADL(4)
  LOADL(5)
  LOADL(6)
  LOADL(7)
  // load right 8x16
  LOADR(0)
  LOADR(1)
  LOADR(2)
  LOADR(3)
  LOADR(4)
  LOADR(5)
  LOADR(6)
  LOADR(7)
  // get the top 16x8 result
  transpose2_8x8_avx2(t, out);
  // get the bottom 16x8 result
  transpose2_8x8_avx2(&t[8], &out[8]);
}

// Store out_size rows of 16 16-bit values each. The values are written as
// packed 16-bit words, so this path effectively assumes tran_low_t is 16 bits
// wide.
static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in,
                                                        tran_low_t *out,
                                                        const int stride,
                                                        const int out_size) {
  int i;
  for (i = 0; i < out_size; ++i) {
    _mm256_storeu_si256((__m256i *)(out), in[i]);
    out += stride;
  }
}
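
// Build a vector whose 16-bit lanes alternate (a, b, a, b, ...). Paired with
// data interleaved by unpacklo/unpackhi_epi16, _mm256_madd_epi16 then computes
// a * x + b * y in each 32-bit lane.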
#define PAIR256_SET_EPI16(a, b) \
  _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
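
// For interleaved inputs pin0/pin1 and an interleaved coefficient pair, this
// computes the scalar equivalent of (x * c1 + y * c2 + rounding) >> shift for
// each 16-bit result, using madd, rounding add, arithmetic shift and a signed
// saturating pack back to 16 bits.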
static INLINE __m256i mult256_round_shift(const __m256i *pin0,
                                          const __m256i *pin1,
                                          const __m256i *pmultiplier,
                                          const __m256i *prounding,
                                          const int shift) {
  const __m256i u0 = _mm256_madd_epi16(*pin0, *pmultiplier);
  const __m256i u1 = _mm256_madd_epi16(*pin1, *pmultiplier);
  const __m256i v0 = _mm256_add_epi32(u0, *prounding);
  const __m256i v1 = _mm256_add_epi32(u1, *prounding);
  const __m256i w0 = _mm256_srai_epi32(v0, shift);
  const __m256i w1 = _mm256_srai_epi32(v1, shift);
  return _mm256_packs_epi32(w0, w1);
}
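
// One pass of the 16-point forward DCT. Each of the 16 input registers holds
// one row of 16 int16 values and the transform is applied down the rows, so
// all 16 columns are processed in parallel. Outputs are written in coefficient
// order; the caller transposes between passes.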
static INLINE void fdct16x16_1D_avx2(__m256i *input, __m256i *output) {
  int i;
  __m256i step2[4];
  __m256i in[8];
  __m256i step1[8];
  __m256i step3[8];
  const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64);
  const __m256i k__cospi_p16_m16 = PAIR256_SET_EPI16(cospi_16_64, -cospi_16_64);
  const __m256i k__cospi_p24_p08 = PAIR256_SET_EPI16(cospi_24_64, cospi_8_64);
  const __m256i k__cospi_p08_m24 = PAIR256_SET_EPI16(cospi_8_64, -cospi_24_64);
  const __m256i k__cospi_m08_p24 = PAIR256_SET_EPI16(-cospi_8_64, cospi_24_64);
  const __m256i k__cospi_p28_p04 = PAIR256_SET_EPI16(cospi_28_64, cospi_4_64);
  const __m256i k__cospi_m04_p28 = PAIR256_SET_EPI16(-cospi_4_64, cospi_28_64);
  const __m256i k__cospi_p12_p20 = PAIR256_SET_EPI16(cospi_12_64, cospi_20_64);
  const __m256i k__cospi_m20_p12 = PAIR256_SET_EPI16(-cospi_20_64, cospi_12_64);
  const __m256i k__cospi_p30_p02 = PAIR256_SET_EPI16(cospi_30_64, cospi_2_64);
  const __m256i k__cospi_p14_p18 = PAIR256_SET_EPI16(cospi_14_64, cospi_18_64);
  const __m256i k__cospi_m02_p30 = PAIR256_SET_EPI16(-cospi_2_64, cospi_30_64);
  const __m256i k__cospi_m18_p14 = PAIR256_SET_EPI16(-cospi_18_64, cospi_14_64);
  const __m256i k__cospi_p22_p10 = PAIR256_SET_EPI16(cospi_22_64, cospi_10_64);
  const __m256i k__cospi_p06_p26 = PAIR256_SET_EPI16(cospi_6_64, cospi_26_64);
  const __m256i k__cospi_m10_p22 = PAIR256_SET_EPI16(-cospi_10_64, cospi_22_64);
  const __m256i k__cospi_m26_p06 = PAIR256_SET_EPI16(-cospi_26_64, cospi_6_64);
  const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
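  // The cospi_N_64 constants from txfm_common.h hold cos(N * pi / 64) in Q14
  // fixed point; DCT_CONST_BITS (14) shifts each product back to the 16-bit
  // working range after a multiply stage.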
  // Calculate input for the first 8 results.
  for (i = 0; i < 8; i++) {
    in[i] = ADD256_EPI16(input[i], input[15 - i]);
  }
  // Calculate input for the next 8 results.
  for (i = 0; i < 8; i++) {
    step1[i] = SUB256_EPI16(input[7 - i], input[8 + i]);
  }
  // Work on the first eight values; fdct8(input, even_results);
  {
    // Add/subtract
    const __m256i q0 = ADD256_EPI16(in[0], in[7]);
    const __m256i q1 = ADD256_EPI16(in[1], in[6]);
    const __m256i q2 = ADD256_EPI16(in[2], in[5]);
    const __m256i q3 = ADD256_EPI16(in[3], in[4]);
    const __m256i q4 = SUB256_EPI16(in[3], in[4]);
    const __m256i q5 = SUB256_EPI16(in[2], in[5]);
    const __m256i q6 = SUB256_EPI16(in[1], in[6]);
    const __m256i q7 = SUB256_EPI16(in[0], in[7]);
    // Work on first four results
    {
      // Add/subtract
      const __m256i r0 = ADD256_EPI16(q0, q3);
      const __m256i r1 = ADD256_EPI16(q1, q2);
      const __m256i r2 = SUB256_EPI16(q1, q2);
      const __m256i r3 = SUB256_EPI16(q0, q3);
      // Interleave to do the multiply by constants which gets us
      // into 32 bits.
      {
        const __m256i t0 = _mm256_unpacklo_epi16(r0, r1);
        const __m256i t1 = _mm256_unpackhi_epi16(r0, r1);
        const __m256i t2 = _mm256_unpacklo_epi16(r2, r3);
        const __m256i t3 = _mm256_unpackhi_epi16(r2, r3);
        output[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16,
                                        &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
        output[8] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16,
                                        &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
        output[4] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08,
                                        &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
        output[12] =
            mult256_round_shift(&t2, &t3, &k__cospi_m08_p24,
                                &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
      }
    }
    // Work on next four results
    {
      // Interleave to do the multiply by constants which gets us
      // into 32 bits.
      const __m256i d0 = _mm256_unpacklo_epi16(q6, q5);
      const __m256i d1 = _mm256_unpackhi_epi16(q6, q5);
      const __m256i r0 = mult256_round_shift(
          &d0, &d1, &k__cospi_p16_m16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
      const __m256i r1 = mult256_round_shift(
          &d0, &d1, &k__cospi_p16_p16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
      {
        // Add/subtract
        const __m256i x0 = ADD256_EPI16(q4, r0);
        const __m256i x1 = SUB256_EPI16(q4, r0);
        const __m256i x2 = SUB256_EPI16(q7, r1);
        const __m256i x3 = ADD256_EPI16(q7, r1);
        // Interleave to do the multiply by constants which gets us
        // into 32 bits.
        {
          const __m256i t0 = _mm256_unpacklo_epi16(x0, x3);
          const __m256i t1 = _mm256_unpackhi_epi16(x0, x3);
          const __m256i t2 = _mm256_unpacklo_epi16(x1, x2);
          const __m256i t3 = _mm256_unpackhi_epi16(x1, x2);
          output[2] =
              mult256_round_shift(&t0, &t1, &k__cospi_p28_p04,
                                  &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
          output[14] =
              mult256_round_shift(&t0, &t1, &k__cospi_m04_p28,
                                  &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
          output[10] =
              mult256_round_shift(&t2, &t3, &k__cospi_p12_p20,
                                  &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
          output[6] =
              mult256_round_shift(&t2, &t3, &k__cospi_m20_p12,
                                  &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
        }
      }
    }
  }
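  // step1[] holds input[7 - i] - input[8 + i]; the step 2..6 stages below
  // rotate and combine these differences to produce the odd-indexed outputs.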
  // Work on the next eight values; step1 -> odd_results
  {  // step 2
    {
      const __m256i t0 = _mm256_unpacklo_epi16(step1[5], step1[2]);
      const __m256i t1 = _mm256_unpackhi_epi16(step1[5], step1[2]);
      const __m256i t2 = _mm256_unpacklo_epi16(step1[4], step1[3]);
      const __m256i t3 = _mm256_unpackhi_epi16(step1[4], step1[3]);
      step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16,
                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
      step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p16_m16,
                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
      step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16,
                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
      step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p16_p16,
                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
    }
    // step 3
    {
      step3[0] = ADD256_EPI16(step1[0], step2[1]);
      step3[1] = ADD256_EPI16(step1[1], step2[0]);
      step3[2] = SUB256_EPI16(step1[1], step2[0]);
      step3[3] = SUB256_EPI16(step1[0], step2[1]);
      step3[4] = SUB256_EPI16(step1[7], step2[3]);
      step3[5] = SUB256_EPI16(step1[6], step2[2]);
      step3[6] = ADD256_EPI16(step1[6], step2[2]);
      step3[7] = ADD256_EPI16(step1[7], step2[3]);
    }
    // step 4
    {
      const __m256i t0 = _mm256_unpacklo_epi16(step3[1], step3[6]);
      const __m256i t1 = _mm256_unpackhi_epi16(step3[1], step3[6]);
      const __m256i t2 = _mm256_unpacklo_epi16(step3[2], step3[5]);
      const __m256i t3 = _mm256_unpackhi_epi16(step3[2], step3[5]);
      step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_m08_p24,
                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
      step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08,
                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
      step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p24_p08,
                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
      step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p08_m24,
                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
    }
    // step 5
    {
      step1[0] = ADD256_EPI16(step3[0], step2[0]);
      step1[1] = SUB256_EPI16(step3[0], step2[0]);
      step1[2] = ADD256_EPI16(step3[3], step2[1]);
      step1[3] = SUB256_EPI16(step3[3], step2[1]);
      step1[4] = SUB256_EPI16(step3[4], step2[3]);
      step1[5] = ADD256_EPI16(step3[4], step2[3]);
      step1[6] = SUB256_EPI16(step3[7], step2[2]);
      step1[7] = ADD256_EPI16(step3[7], step2[2]);
    }
    // step 6
    {
      const __m256i t0 = _mm256_unpacklo_epi16(step1[0], step1[7]);
      const __m256i t1 = _mm256_unpackhi_epi16(step1[0], step1[7]);
      const __m256i t2 = _mm256_unpacklo_epi16(step1[1], step1[6]);
      const __m256i t3 = _mm256_unpackhi_epi16(step1[1], step1[6]);
      output[1] = mult256_round_shift(&t0, &t1, &k__cospi_p30_p02,
                                      &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
      output[9] = mult256_round_shift(&t2, &t3, &k__cospi_p14_p18,
                                      &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
      output[15] = mult256_round_shift(&t0, &t1, &k__cospi_m02_p30,
                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
      output[7] = mult256_round_shift(&t2, &t3, &k__cospi_m18_p14,
                                      &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
    }
    {
      const __m256i t0 = _mm256_unpacklo_epi16(step1[2], step1[5]);
      const __m256i t1 = _mm256_unpackhi_epi16(step1[2], step1[5]);
      const __m256i t2 = _mm256_unpacklo_epi16(step1[3], step1[4]);
      const __m256i t3 = _mm256_unpackhi_epi16(step1[3], step1[4]);
      output[5] = mult256_round_shift(&t0, &t1, &k__cospi_p22_p10,
                                      &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
      output[13] = mult256_round_shift(&t2, &t3, &k__cospi_p06_p26,
                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
      output[11] = mult256_round_shift(&t0, &t1, &k__cospi_m10_p22,
                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
      output[3] = mult256_round_shift(&t2, &t3, &k__cospi_m26_p06,
                                      &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
    }
  }
}
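
// 16x16 forward transform: two passes of the 1-D DCT with a 16x16 transpose
// after each pass. Pass 0 reads the caller's residual block and writes a
// 16-bit intermediate; pass 1 reads that intermediate and writes the final
// coefficients. A hypothetical call site (buffer names are illustrative):
//   DECLARE_ALIGNED(32, int16_t, residual[16 * 16]);
//   DECLARE_ALIGNED(32, tran_low_t, coeff[16 * 16]);
//   vpx_fdct16x16_avx2(residual, coeff, 16);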
void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride) {
  int pass;
  DECLARE_ALIGNED(32, int16_t, intermediate[256]);
  int16_t *out0 = intermediate;
  tran_low_t *out1 = output;
  const int width = 16;
  const int height = 16;
  __m256i buf0[16], buf1[16];

  // Two transform and transpose passes
  // Process 16 columns (transposed rows in second pass) at a time.
  for (pass = 0; pass < 2; ++pass) {
    // Load and pre-condition input.
    load_buffer_16bit_to_16bit_avx2(input, stride, buf1, height, pass);
    // Calculate dct for 16x16 values
    fdct16x16_1D_avx2(buf1, buf0);
    // Transpose the results.
    transpose_16bit_16x16_avx2(buf0, buf1);
    if (pass == 0) {
      store_buffer_16bit_to_32bit_w16_avx2(buf1, (tran_low_t *)out0, width,
                                           height);
    } else {
      store_buffer_16bit_to_32bit_w16_avx2(buf1, out1, width, height);
    }
    // Setup in/out for next pass.
    input = intermediate;
  }
}
#if !CONFIG_VP9_HIGHBITDEPTH
#define FDCT32x32_2D_AVX2 vpx_fdct32x32_rd_avx2
#define FDCT32x32_HIGH_PRECISION 0
#include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h"
#undef FDCT32x32_2D_AVX2
#undef FDCT32x32_HIGH_PRECISION
#define FDCT32x32_2D_AVX2 vpx_fdct32x32_avx2
#define FDCT32x32_HIGH_PRECISION 1
#include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h" // NOLINT
#undef FDCT32x32_2D_AVX2
#undef FDCT32x32_HIGH_PRECISION
#endif // !CONFIG_VP9_HIGHBITDEPTH