1465 lines
61 KiB
C++
1465 lines
61 KiB
C++
/*
|
|
* Copyright 2013 The LibYuv Project Authors. All rights reserved.
|
|
*
|
|
* Use of this source code is governed by a BSD-style license
|
|
* that can be found in the LICENSE file in the root of the source
|
|
* tree. An additional intellectual property rights grant can be found
|
|
* in the file PATENTS. All contributing project authors may
|
|
* be found in the AUTHORS file in the root of the source tree.
|
|
*/
|
|
|
|
#include "libyuv/row.h"
|
|
#include "libyuv/scale_row.h"
|
|
|
|
#ifdef __cplusplus
|
|
namespace libyuv {
|
|
extern "C" {
|
|
#endif
|
|
|
|
// This module is for GCC x86 and x64.
|
|
#if !defined(LIBYUV_DISABLE_X86) && \
|
|
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
|
|
|
|
// Shuffle / multiply-add / scale tables used by the 3/4 and 3/8 scalers below.
// An index of 128 in a pshufb table has its top bit set, so that output byte
// is zeroed.

// Offsets for source bytes 0 to 9
|
|
static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
|
|
128, 128, 128, 128, 128, 128, 128, 128};
|
|
|
|
// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
|
|
static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
|
|
128, 128, 128, 128, 128, 128, 128, 128};
|
|
|
|
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
|
|
static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
|
|
128, 128, 128, 128, 128, 128, 128, 128};
|
|
|
|
// Offsets for source bytes 0 to 10
|
|
static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
|
|
|
|
// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
|
|
static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7,
|
|
8, 9, 9, 10, 10, 11, 12, 13};
|
|
|
|
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
|
|
static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
|
|
10, 11, 12, 13, 13, 14, 14, 15};
|
|
|
|
// Coefficients for source bytes 0 to 10
|
|
static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
|
|
|
|
// Coefficients for source bytes 10 to 21
|
|
static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
|
|
|
|
// Coefficients for source bytes 21 to 31
|
|
static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
|
|
|
|
// Rounding term (2) added to each 16-bit weighted sum before the ">> 2" in the
// 3/4 box filters (the kMadd* weights of each pair sum to 4).
|
|
static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
|
|
|
|
// Picks source bytes 0,3,6,8,11,14 into output bytes 0..5 (rest zeroed).
static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
|
|
128, 128, 128, 128, 128, 128, 128, 128};
|
|
|
|
// Picks source bytes 0,3,6,8,11,14 into output bytes 6..11 (rest zeroed).
static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
|
|
6, 8, 11, 14, 128, 128, 128, 128};
|
|
|
|
// Arrange words 0,3,6 into 0,1,2
|
|
static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
|
|
128, 128, 128, 128, 128, 128, 128, 128};
|
|
|
|
// Arrange words 0,3,6 into 3,4,5
|
|
static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
|
|
6, 7, 12, 13, 128, 128, 128, 128};
|
|
|
|
// Scaling values for boxes of 3x3 and 2x3
// (used with pmulhuw, i.e. sum * (65536 / n) >> 16 approximates sum / n).
|
|
static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
|
|
65536 / 9, 65536 / 6, 0, 0};
|
|
|
|
// Arrange first value for pixels 0,1,2,3,4,5
|
|
static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
|
|
11, 128, 14, 128, 128, 128, 128, 128};
|
|
|
|
// Arrange second value for pixels 0,1,2,3,4,5
|
|
static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
|
|
12, 128, 15, 128, 128, 128, 128, 128};
|
|
|
|
// Arrange third value for pixels 0,1,2,3,4,5
// (pixels 2 and 5 have no third value, so their slots are zeroed).
|
|
static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
|
|
13, 128, 128, 128, 128, 128, 128, 128};
|
|
|
|
// Scaling values for boxes of 3x2 and 2x2
// (the two rows are pre-averaged, so the divisors are 3 and 2).
|
|
static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
|
|
65536 / 3, 65536 / 2, 0, 0};
|
|
|
|
// GCC versions of row functions are verbatim conversions from Visual C.
|
|
// Generated using gcc disassembly on Visual C object file:
|
|
// objdump -D yuvscaler.obj >yuvscaler.txt
|
|
|
|
// Point-samples one row down by 2: keeps the odd-indexed byte of every source
// byte pair.
//   src_ptr    - source row; 32 bytes are read per loop iteration.
//   src_stride - unused.
//   dst_ptr    - destination row; 16 bytes are written per loop iteration.
//   dst_width  - output bytes requested. The loop body always runs at least
//                once and steps by 16, so the amount written is rounded up to
//                a multiple of 16.
void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
uint8_t* dst_ptr,
|
|
int dst_width) {
|
|
(void)src_stride;
|
|
asm volatile(
|
|
// 16 pixel loop.
|
|
LABELALIGN
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x10(%0),%%xmm1 \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
// Shifting each 16-bit word right by 8 leaves the high (odd) byte.
"psrlw $0x8,%%xmm0 \n"
|
|
"psrlw $0x8,%%xmm1 \n"
|
|
"packuswb %%xmm1,%%xmm0 \n"
|
|
"movdqu %%xmm0,(%1) \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"sub $0x10,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_ptr), // %0
|
|
"+r"(dst_ptr), // %1
|
|
"+r"(dst_width) // %2
|
|
::"memory",
|
|
"cc", "xmm0", "xmm1");
|
|
}
|
|
|
|
// Scales one row down by 2 horizontally, averaging each pair of adjacent
// source bytes with rounding: (a + b + 1) >> 1.
//   src_ptr    - source row; 32 bytes are read per loop iteration.
//   src_stride - unused.
//   dst_ptr    - destination row; 16 bytes are written per loop iteration.
//   dst_width  - output bytes requested; processed in steps of 16 with the
//                loop body running at least once.
void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
uint8_t* dst_ptr,
|
|
int dst_width) {
|
|
(void)src_stride;
|
|
asm volatile(
|
|
// Build xmm4 = sixteen bytes of 0x01 and xmm5 = zero.
"pcmpeqb %%xmm4,%%xmm4 \n"
|
|
"psrlw $0xf,%%xmm4 \n"
|
|
"packuswb %%xmm4,%%xmm4 \n"
|
|
"pxor %%xmm5,%%xmm5 \n"
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x10(%0),%%xmm1 \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
// Multiply-add by 1s sums each adjacent byte pair into a 16-bit word.
"pmaddubsw %%xmm4,%%xmm0 \n"
|
|
"pmaddubsw %%xmm4,%%xmm1 \n"
|
|
// Averaging with zero gives (sum + 1) >> 1.
"pavgw %%xmm5,%%xmm0 \n"
|
|
"pavgw %%xmm5,%%xmm1 \n"
|
|
"packuswb %%xmm1,%%xmm0 \n"
|
|
"movdqu %%xmm0,(%1) \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"sub $0x10,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_ptr), // %0
|
|
"+r"(dst_ptr), // %1
|
|
"+r"(dst_width) // %2
|
|
::"memory",
|
|
"cc", "xmm0", "xmm1", "xmm4", "xmm5");
|
|
}
|
|
|
|
void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
uint8_t* dst_ptr,
|
|
int dst_width) {
|
|
asm volatile(
|
|
"pcmpeqb %%xmm4,%%xmm4 \n"
|
|
"psrlw $0xf,%%xmm4 \n"
|
|
"packuswb %%xmm4,%%xmm4 \n"
|
|
"pxor %%xmm5,%%xmm5 \n"
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x10(%0),%%xmm1 \n"
|
|
"movdqu 0x00(%0,%3,1),%%xmm2 \n"
|
|
"movdqu 0x10(%0,%3,1),%%xmm3 \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
"pmaddubsw %%xmm4,%%xmm0 \n"
|
|
"pmaddubsw %%xmm4,%%xmm1 \n"
|
|
"pmaddubsw %%xmm4,%%xmm2 \n"
|
|
"pmaddubsw %%xmm4,%%xmm3 \n"
|
|
"paddw %%xmm2,%%xmm0 \n"
|
|
"paddw %%xmm3,%%xmm1 \n"
|
|
"psrlw $0x1,%%xmm0 \n"
|
|
"psrlw $0x1,%%xmm1 \n"
|
|
"pavgw %%xmm5,%%xmm0 \n"
|
|
"pavgw %%xmm5,%%xmm1 \n"
|
|
"packuswb %%xmm1,%%xmm0 \n"
|
|
"movdqu %%xmm0,(%1) \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"sub $0x10,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_ptr), // %0
|
|
"+r"(dst_ptr), // %1
|
|
"+r"(dst_width) // %2
|
|
: "r"((intptr_t)(src_stride)) // %3
|
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
|
|
}
|
|
|
|
#ifdef HAS_SCALEROWDOWN2_AVX2
|
|
// AVX2 point-sampling 1/2 scaler: keeps the odd-indexed byte of every source
// byte pair.
//   src_ptr    - source row; 64 bytes are read per loop iteration.
//   src_stride - unused.
//   dst_ptr    - destination row; 32 bytes are written per loop iteration.
//   dst_width  - output bytes requested; processed in steps of 32 with the
//                loop body running at least once.
void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
uint8_t* dst_ptr,
|
|
int dst_width) {
|
|
(void)src_stride;
|
|
asm volatile(
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
"vmovdqu (%0),%%ymm0 \n"
|
|
"vmovdqu 0x20(%0),%%ymm1 \n"
|
|
"lea 0x40(%0),%0 \n"
|
|
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
|
|
"vpsrlw $0x8,%%ymm1,%%ymm1 \n"
|
|
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
|
|
// vpackuswb packs within each 128-bit lane; reorder the 64-bit quarters
// (0,2,1,3) to put the bytes back in linear order.
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
|
|
"vmovdqu %%ymm0,(%1) \n"
|
|
"lea 0x20(%1),%1 \n"
|
|
"sub $0x20,%2 \n"
|
|
"jg 1b \n"
|
|
"vzeroupper \n"
|
|
: "+r"(src_ptr), // %0
|
|
"+r"(dst_ptr), // %1
|
|
"+r"(dst_width) // %2
|
|
::"memory",
|
|
"cc", "xmm0", "xmm1");
|
|
}
|
|
|
|
// AVX2 1/2 horizontal scaler that averages each adjacent byte pair with
// rounding: (a + b + 1) >> 1.
//   src_ptr    - source row; 64 bytes are read per loop iteration.
//   src_stride - unused.
//   dst_ptr    - destination row; 32 bytes are written per loop iteration.
//   dst_width  - output bytes requested; processed in steps of 32 with the
//                loop body running at least once.
void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
uint8_t* dst_ptr,
|
|
int dst_width) {
|
|
(void)src_stride;
|
|
asm volatile(
|
|
// ymm4 = bytes of 0x01, ymm5 = zero.
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
|
|
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
|
|
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
|
|
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
"vmovdqu (%0),%%ymm0 \n"
|
|
"vmovdqu 0x20(%0),%%ymm1 \n"
|
|
"lea 0x40(%0),%0 \n"
|
|
// Pair sums, then (sum + 1) >> 1 via average with zero.
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
|
|
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
|
|
"vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
|
|
"vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
|
|
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
|
|
// Undo the per-lane interleave left by vpackuswb.
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
|
|
"vmovdqu %%ymm0,(%1) \n"
|
|
"lea 0x20(%1),%1 \n"
|
|
"sub $0x20,%2 \n"
|
|
"jg 1b \n"
|
|
"vzeroupper \n"
|
|
: "+r"(src_ptr), // %0
|
|
"+r"(dst_ptr), // %1
|
|
"+r"(dst_width) // %2
|
|
::"memory",
|
|
"cc", "xmm0", "xmm1", "xmm4", "xmm5");
|
|
}
|
|
|
|
void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
uint8_t* dst_ptr,
|
|
int dst_width) {
|
|
asm volatile(
|
|
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
|
|
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
|
|
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
|
|
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
"vmovdqu (%0),%%ymm0 \n"
|
|
"vmovdqu 0x20(%0),%%ymm1 \n"
|
|
"vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
|
|
"vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
|
|
"lea 0x40(%0),%0 \n"
|
|
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
|
|
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
|
|
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
|
|
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
|
|
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
|
|
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
|
|
"vpsrlw $0x1,%%ymm0,%%ymm0 \n"
|
|
"vpsrlw $0x1,%%ymm1,%%ymm1 \n"
|
|
"vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
|
|
"vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
|
|
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
|
|
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
|
|
"vmovdqu %%ymm0,(%1) \n"
|
|
"lea 0x20(%1),%1 \n"
|
|
"sub $0x20,%2 \n"
|
|
"jg 1b \n"
|
|
"vzeroupper \n"
|
|
: "+r"(src_ptr), // %0
|
|
"+r"(dst_ptr), // %1
|
|
"+r"(dst_width) // %2
|
|
: "r"((intptr_t)(src_stride)) // %3
|
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
|
|
}
|
|
#endif // HAS_SCALEROWDOWN2_AVX2
|
|
|
|
// Point-samples one row down by 4: keeps byte 2 of every group of 4 source
// bytes.
//   src_ptr    - source row; 32 bytes are read per loop iteration.
//   src_stride - unused.
//   dst_ptr    - destination row; 8 bytes are written per loop iteration.
//   dst_width  - output bytes requested; processed in steps of 8 with the
//                loop body running at least once.
void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
uint8_t* dst_ptr,
|
|
int dst_width) {
|
|
(void)src_stride;
|
|
asm volatile(
|
|
// xmm5 = 0x00ff0000 in every dword (mask selecting byte 2 of each 4).
"pcmpeqb %%xmm5,%%xmm5 \n"
|
|
"psrld $0x18,%%xmm5 \n"
|
|
"pslld $0x10,%%xmm5 \n"
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x10(%0),%%xmm1 \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
"pand %%xmm5,%%xmm0 \n"
|
|
"pand %%xmm5,%%xmm1 \n"
|
|
// First pack leaves the kept byte in the high half of each word; shift it
// down and pack again to get 8 contiguous bytes.
"packuswb %%xmm1,%%xmm0 \n"
|
|
"psrlw $0x8,%%xmm0 \n"
|
|
"packuswb %%xmm0,%%xmm0 \n"
|
|
"movq %%xmm0,(%1) \n"
|
|
"lea 0x8(%1),%1 \n"
|
|
"sub $0x8,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_ptr), // %0
|
|
"+r"(dst_ptr), // %1
|
|
"+r"(dst_width) // %2
|
|
::"memory",
|
|
"cc", "xmm0", "xmm1", "xmm5");
|
|
}
|
|
|
|
// Scales four adjacent rows down by 4 in both directions with a 4x4 box
// filter: each output byte is (sum of 16 source bytes + 8) >> 4.
//   src_ptr    - first source row; 32 bytes are read from each of 4 rows per
//                loop iteration.
//   src_stride - byte offset between consecutive source rows.
//   dst_ptr    - destination row; 8 bytes are written per loop iteration.
//   dst_width  - output bytes requested; processed in steps of 8 with the
//                loop body running at least once.
void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
uint8_t* dst_ptr,
|
|
int dst_width) {
|
|
// Scratch register that the asm fills with src_stride * 3.
intptr_t stridex3;
|
|
asm volatile(
|
|
// xmm4 = bytes of 0x01 (pair-sum multiplier), xmm5 = words of 8 (rounding).
"pcmpeqb %%xmm4,%%xmm4 \n"
|
|
"psrlw $0xf,%%xmm4 \n"
|
|
"movdqa %%xmm4,%%xmm5 \n"
|
|
"packuswb %%xmm4,%%xmm4 \n"
|
|
"psllw $0x3,%%xmm5 \n"
|
|
// %3 = stride + stride * 2.
"lea 0x00(%4,%4,2),%3 \n"
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
// Rows 0 and 1.
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x10(%0),%%xmm1 \n"
|
|
"movdqu 0x00(%0,%4,1),%%xmm2 \n"
|
|
"movdqu 0x10(%0,%4,1),%%xmm3 \n"
|
|
"pmaddubsw %%xmm4,%%xmm0 \n"
|
|
"pmaddubsw %%xmm4,%%xmm1 \n"
|
|
"pmaddubsw %%xmm4,%%xmm2 \n"
|
|
"pmaddubsw %%xmm4,%%xmm3 \n"
|
|
"paddw %%xmm2,%%xmm0 \n"
|
|
"paddw %%xmm3,%%xmm1 \n"
|
|
// Row 2.
"movdqu 0x00(%0,%4,2),%%xmm2 \n"
|
|
"movdqu 0x10(%0,%4,2),%%xmm3 \n"
|
|
"pmaddubsw %%xmm4,%%xmm2 \n"
|
|
"pmaddubsw %%xmm4,%%xmm3 \n"
|
|
"paddw %%xmm2,%%xmm0 \n"
|
|
"paddw %%xmm3,%%xmm1 \n"
|
|
// Row 3.
"movdqu 0x00(%0,%3,1),%%xmm2 \n"
|
|
"movdqu 0x10(%0,%3,1),%%xmm3 \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
"pmaddubsw %%xmm4,%%xmm2 \n"
|
|
"pmaddubsw %%xmm4,%%xmm3 \n"
|
|
"paddw %%xmm2,%%xmm0 \n"
|
|
"paddw %%xmm3,%%xmm1 \n"
|
|
// Add adjacent 2x4 column sums to get 4x4 sums, then round and divide by 16.
"phaddw %%xmm1,%%xmm0 \n"
|
|
"paddw %%xmm5,%%xmm0 \n"
|
|
"psrlw $0x4,%%xmm0 \n"
|
|
"packuswb %%xmm0,%%xmm0 \n"
|
|
"movq %%xmm0,(%1) \n"
|
|
"lea 0x8(%1),%1 \n"
|
|
"sub $0x8,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_ptr), // %0
|
|
"+r"(dst_ptr), // %1
|
|
"+r"(dst_width), // %2
|
|
"=&r"(stridex3) // %3
|
|
: "r"((intptr_t)(src_stride)) // %4
|
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
|
|
}
|
|
|
|
#ifdef HAS_SCALEROWDOWN4_AVX2
|
|
// AVX2 point-sampling 1/4 scaler: keeps byte 2 of every group of 4 source
// bytes.
//   src_ptr    - source row; 64 bytes are read per loop iteration.
//   src_stride - unused.
//   dst_ptr    - destination row; 16 bytes are written per loop iteration.
//   dst_width  - output bytes requested; processed in steps of 16 with the
//                loop body running at least once.
void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
uint8_t* dst_ptr,
|
|
int dst_width) {
|
|
(void)src_stride;
|
|
asm volatile(
|
|
// ymm5 = 0x00ff0000 in every dword.
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
|
|
"vpsrld $0x18,%%ymm5,%%ymm5 \n"
|
|
"vpslld $0x10,%%ymm5,%%ymm5 \n"
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
"vmovdqu (%0),%%ymm0 \n"
|
|
"vmovdqu 0x20(%0),%%ymm1 \n"
|
|
"lea 0x40(%0),%0 \n"
|
|
"vpand %%ymm5,%%ymm0,%%ymm0 \n"
|
|
"vpand %%ymm5,%%ymm1,%%ymm1 \n"
|
|
// Each vpackuswb works per 128-bit lane, so each is followed by a vpermq
// that restores linear order.
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
|
|
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
|
|
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
|
|
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
|
|
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
|
|
// Only the low 16 bytes hold results.
"vmovdqu %%xmm0,(%1) \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"sub $0x10,%2 \n"
|
|
"jg 1b \n"
|
|
"vzeroupper \n"
|
|
: "+r"(src_ptr), // %0
|
|
"+r"(dst_ptr), // %1
|
|
"+r"(dst_width) // %2
|
|
::"memory",
|
|
"cc", "xmm0", "xmm1", "xmm5");
|
|
}
|
|
|
|
// AVX2 4x4 box-filter scaler: each output byte is
// (sum of 16 source bytes + 8) >> 4, taken from four adjacent rows.
//   src_ptr    - first source row; 64 bytes are read from each of 4 rows per
//                loop iteration.
//   src_stride - byte offset between consecutive source rows.
//   dst_ptr    - destination row; 16 bytes are written per loop iteration.
//   dst_width  - output bytes requested; processed in steps of 16 with the
//                loop body running at least once.
void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
uint8_t* dst_ptr,
|
|
int dst_width) {
|
|
asm volatile(
|
|
// ymm4 = bytes of 0x01 (pair-sum multiplier), ymm5 = words of 8 (rounding).
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
|
|
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
|
|
"vpsllw $0x3,%%ymm4,%%ymm5 \n"
|
|
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
// Rows 0 and 1.
"vmovdqu (%0),%%ymm0 \n"
|
|
"vmovdqu 0x20(%0),%%ymm1 \n"
|
|
"vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
|
|
"vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
|
|
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
|
|
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
|
|
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
|
|
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
|
|
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
|
|
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
|
|
// Row 2.
"vmovdqu 0x00(%0,%3,2),%%ymm2 \n"
|
|
"vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
|
|
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
|
|
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
|
|
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
|
|
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
|
|
// Row 3 (offset src_stride * 3, passed in as %4).
"vmovdqu 0x00(%0,%4,1),%%ymm2 \n"
|
|
"vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
|
|
"lea 0x40(%0),%0 \n"
|
|
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
|
|
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
|
|
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
|
|
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
|
|
// Add adjacent column sums to get 4x4 sums, then round and divide by 16.
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
|
|
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
|
|
"vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
|
|
"vpsrlw $0x4,%%ymm0,%%ymm0 \n"
|
|
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
|
|
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
|
|
"vmovdqu %%xmm0,(%1) \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"sub $0x10,%2 \n"
|
|
"jg 1b \n"
|
|
"vzeroupper \n"
|
|
: "+r"(src_ptr), // %0
|
|
"+r"(dst_ptr), // %1
|
|
"+r"(dst_width) // %2
|
|
: "r"((intptr_t)(src_stride)), // %3
|
|
"r"((intptr_t)(src_stride * 3)) // %4
|
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
|
|
}
|
|
#endif // HAS_SCALEROWDOWN4_AVX2
|
|
|
|
void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
uint8_t* dst_ptr,
|
|
int dst_width) {
|
|
(void)src_stride;
|
|
asm volatile(
|
|
"movdqa %0,%%xmm3 \n"
|
|
"movdqa %1,%%xmm4 \n"
|
|
"movdqa %2,%%xmm5 \n"
|
|
:
|
|
: "m"(kShuf0), // %0
|
|
"m"(kShuf1), // %1
|
|
"m"(kShuf2) // %2
|
|
);
|
|
asm volatile(
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x10(%0),%%xmm2 \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
"movdqa %%xmm2,%%xmm1 \n"
|
|
"palignr $0x8,%%xmm0,%%xmm1 \n"
|
|
"pshufb %%xmm3,%%xmm0 \n"
|
|
"pshufb %%xmm4,%%xmm1 \n"
|
|
"pshufb %%xmm5,%%xmm2 \n"
|
|
"movq %%xmm0,(%1) \n"
|
|
"movq %%xmm1,0x8(%1) \n"
|
|
"movq %%xmm2,0x10(%1) \n"
|
|
"lea 0x18(%1),%1 \n"
|
|
"sub $0x18,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_ptr), // %0
|
|
"+r"(dst_ptr), // %1
|
|
"+r"(dst_width) // %2
|
|
::"memory",
|
|
"cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
|
|
}
|
|
|
|
void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
uint8_t* dst_ptr,
|
|
int dst_width) {
|
|
asm volatile(
|
|
"movdqa %0,%%xmm2 \n" // kShuf01
|
|
"movdqa %1,%%xmm3 \n" // kShuf11
|
|
"movdqa %2,%%xmm4 \n" // kShuf21
|
|
:
|
|
: "m"(kShuf01), // %0
|
|
"m"(kShuf11), // %1
|
|
"m"(kShuf21) // %2
|
|
);
|
|
asm volatile(
|
|
"movdqa %0,%%xmm5 \n" // kMadd01
|
|
"movdqa %1,%%xmm0 \n" // kMadd11
|
|
"movdqa %2,%%xmm1 \n" // kRound34
|
|
:
|
|
: "m"(kMadd01), // %0
|
|
"m"(kMadd11), // %1
|
|
"m"(kRound34) // %2
|
|
);
|
|
asm volatile(
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm6 \n"
|
|
"movdqu 0x00(%0,%3,1),%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm6 \n"
|
|
"pshufb %%xmm2,%%xmm6 \n"
|
|
"pmaddubsw %%xmm5,%%xmm6 \n"
|
|
"paddsw %%xmm1,%%xmm6 \n"
|
|
"psrlw $0x2,%%xmm6 \n"
|
|
"packuswb %%xmm6,%%xmm6 \n"
|
|
"movq %%xmm6,(%1) \n"
|
|
"movdqu 0x8(%0),%%xmm6 \n"
|
|
"movdqu 0x8(%0,%3,1),%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm6 \n"
|
|
"pshufb %%xmm3,%%xmm6 \n"
|
|
"pmaddubsw %%xmm0,%%xmm6 \n"
|
|
"paddsw %%xmm1,%%xmm6 \n"
|
|
"psrlw $0x2,%%xmm6 \n"
|
|
"packuswb %%xmm6,%%xmm6 \n"
|
|
"movq %%xmm6,0x8(%1) \n"
|
|
"movdqu 0x10(%0),%%xmm6 \n"
|
|
"movdqu 0x10(%0,%3,1),%%xmm7 \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
"pavgb %%xmm7,%%xmm6 \n"
|
|
"pshufb %%xmm4,%%xmm6 \n"
|
|
"pmaddubsw %4,%%xmm6 \n"
|
|
"paddsw %%xmm1,%%xmm6 \n"
|
|
"psrlw $0x2,%%xmm6 \n"
|
|
"packuswb %%xmm6,%%xmm6 \n"
|
|
"movq %%xmm6,0x10(%1) \n"
|
|
"lea 0x18(%1),%1 \n"
|
|
"sub $0x18,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_ptr), // %0
|
|
"+r"(dst_ptr), // %1
|
|
"+r"(dst_width) // %2
|
|
: "r"((intptr_t)(src_stride)), // %3
|
|
"m"(kMadd21) // %4
|
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
|
"xmm7");
|
|
}
|
|
|
|
void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
uint8_t* dst_ptr,
|
|
int dst_width) {
|
|
asm volatile(
|
|
"movdqa %0,%%xmm2 \n" // kShuf01
|
|
"movdqa %1,%%xmm3 \n" // kShuf11
|
|
"movdqa %2,%%xmm4 \n" // kShuf21
|
|
:
|
|
: "m"(kShuf01), // %0
|
|
"m"(kShuf11), // %1
|
|
"m"(kShuf21) // %2
|
|
);
|
|
asm volatile(
|
|
"movdqa %0,%%xmm5 \n" // kMadd01
|
|
"movdqa %1,%%xmm0 \n" // kMadd11
|
|
"movdqa %2,%%xmm1 \n" // kRound34
|
|
:
|
|
: "m"(kMadd01), // %0
|
|
"m"(kMadd11), // %1
|
|
"m"(kRound34) // %2
|
|
);
|
|
|
|
asm volatile(
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm6 \n"
|
|
"movdqu 0x00(%0,%3,1),%%xmm7 \n"
|
|
"pavgb %%xmm6,%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm6 \n"
|
|
"pshufb %%xmm2,%%xmm6 \n"
|
|
"pmaddubsw %%xmm5,%%xmm6 \n"
|
|
"paddsw %%xmm1,%%xmm6 \n"
|
|
"psrlw $0x2,%%xmm6 \n"
|
|
"packuswb %%xmm6,%%xmm6 \n"
|
|
"movq %%xmm6,(%1) \n"
|
|
"movdqu 0x8(%0),%%xmm6 \n"
|
|
"movdqu 0x8(%0,%3,1),%%xmm7 \n"
|
|
"pavgb %%xmm6,%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm6 \n"
|
|
"pshufb %%xmm3,%%xmm6 \n"
|
|
"pmaddubsw %%xmm0,%%xmm6 \n"
|
|
"paddsw %%xmm1,%%xmm6 \n"
|
|
"psrlw $0x2,%%xmm6 \n"
|
|
"packuswb %%xmm6,%%xmm6 \n"
|
|
"movq %%xmm6,0x8(%1) \n"
|
|
"movdqu 0x10(%0),%%xmm6 \n"
|
|
"movdqu 0x10(%0,%3,1),%%xmm7 \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
"pavgb %%xmm6,%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm6 \n"
|
|
"pshufb %%xmm4,%%xmm6 \n"
|
|
"pmaddubsw %4,%%xmm6 \n"
|
|
"paddsw %%xmm1,%%xmm6 \n"
|
|
"psrlw $0x2,%%xmm6 \n"
|
|
"packuswb %%xmm6,%%xmm6 \n"
|
|
"movq %%xmm6,0x10(%1) \n"
|
|
"lea 0x18(%1),%1 \n"
|
|
"sub $0x18,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_ptr), // %0
|
|
"+r"(dst_ptr), // %1
|
|
"+r"(dst_width) // %2
|
|
: "r"((intptr_t)(src_stride)), // %3
|
|
"m"(kMadd21) // %4
|
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
|
"xmm7");
|
|
}
|
|
|
|
// Point-samples one row down to 3/8 width: from each 16 source bytes, bytes
// 0, 3, 6, 8, 11 and 14 are kept (see kShuf38a / kShuf38b).
//   src_ptr    - source row; 32 bytes are read per loop iteration.
//   src_stride - unused.
//   dst_ptr    - destination row; 12 bytes are written per loop iteration
//                (an 8-byte store followed by a 4-byte store).
//   dst_width  - output bytes requested; processed in steps of 12 with the
//                loop body running at least once.
void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
uint8_t* dst_ptr,
|
|
int dst_width) {
|
|
(void)src_stride;
|
|
asm volatile(
|
|
"movdqa %3,%%xmm4 \n"
|
|
"movdqa %4,%%xmm5 \n"
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x10(%0),%%xmm1 \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
"pshufb %%xmm4,%%xmm0 \n"
|
|
"pshufb %%xmm5,%%xmm1 \n"
|
|
// The two shuffles fill disjoint byte positions (0..5 and 6..11) and zero
// the rest, so the add simply merges them.
"paddusb %%xmm1,%%xmm0 \n"
|
|
"movq %%xmm0,(%1) \n"
|
|
"movhlps %%xmm0,%%xmm1 \n"
|
|
"movd %%xmm1,0x8(%1) \n"
|
|
"lea 0xc(%1),%1 \n"
|
|
"sub $0xc,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_ptr), // %0
|
|
"+r"(dst_ptr), // %1
|
|
"+r"(dst_width) // %2
|
|
: "m"(kShuf38a), // %3
|
|
"m"(kShuf38b) // %4
|
|
: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
|
|
}
|
|
|
|
void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
uint8_t* dst_ptr,
|
|
int dst_width) {
|
|
asm volatile(
|
|
"movdqa %0,%%xmm2 \n"
|
|
"movdqa %1,%%xmm3 \n"
|
|
"movdqa %2,%%xmm4 \n"
|
|
"movdqa %3,%%xmm5 \n"
|
|
:
|
|
: "m"(kShufAb0), // %0
|
|
"m"(kShufAb1), // %1
|
|
"m"(kShufAb2), // %2
|
|
"m"(kScaleAb2) // %3
|
|
);
|
|
asm volatile(
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x00(%0,%3,1),%%xmm1 \n"
|
|
"lea 0x10(%0),%0 \n"
|
|
"pavgb %%xmm1,%%xmm0 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"pshufb %%xmm2,%%xmm1 \n"
|
|
"movdqa %%xmm0,%%xmm6 \n"
|
|
"pshufb %%xmm3,%%xmm6 \n"
|
|
"paddusw %%xmm6,%%xmm1 \n"
|
|
"pshufb %%xmm4,%%xmm0 \n"
|
|
"paddusw %%xmm0,%%xmm1 \n"
|
|
"pmulhuw %%xmm5,%%xmm1 \n"
|
|
"packuswb %%xmm1,%%xmm1 \n"
|
|
"movd %%xmm1,(%1) \n"
|
|
"psrlq $0x10,%%xmm1 \n"
|
|
"movd %%xmm1,0x2(%1) \n"
|
|
"lea 0x6(%1),%1 \n"
|
|
"sub $0x6,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_ptr), // %0
|
|
"+r"(dst_ptr), // %1
|
|
"+r"(dst_width) // %2
|
|
: "r"((intptr_t)(src_stride)) // %3
|
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
|
|
}
|
|
|
|
void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
uint8_t* dst_ptr,
|
|
int dst_width) {
|
|
asm volatile(
|
|
"movdqa %0,%%xmm2 \n"
|
|
"movdqa %1,%%xmm3 \n"
|
|
"movdqa %2,%%xmm4 \n"
|
|
"pxor %%xmm5,%%xmm5 \n"
|
|
:
|
|
: "m"(kShufAc), // %0
|
|
"m"(kShufAc3), // %1
|
|
"m"(kScaleAc33) // %2
|
|
);
|
|
asm volatile(
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x00(%0,%3,1),%%xmm6 \n"
|
|
"movhlps %%xmm0,%%xmm1 \n"
|
|
"movhlps %%xmm6,%%xmm7 \n"
|
|
"punpcklbw %%xmm5,%%xmm0 \n"
|
|
"punpcklbw %%xmm5,%%xmm1 \n"
|
|
"punpcklbw %%xmm5,%%xmm6 \n"
|
|
"punpcklbw %%xmm5,%%xmm7 \n"
|
|
"paddusw %%xmm6,%%xmm0 \n"
|
|
"paddusw %%xmm7,%%xmm1 \n"
|
|
"movdqu 0x00(%0,%3,2),%%xmm6 \n"
|
|
"lea 0x10(%0),%0 \n"
|
|
"movhlps %%xmm6,%%xmm7 \n"
|
|
"punpcklbw %%xmm5,%%xmm6 \n"
|
|
"punpcklbw %%xmm5,%%xmm7 \n"
|
|
"paddusw %%xmm6,%%xmm0 \n"
|
|
"paddusw %%xmm7,%%xmm1 \n"
|
|
"movdqa %%xmm0,%%xmm6 \n"
|
|
"psrldq $0x2,%%xmm0 \n"
|
|
"paddusw %%xmm0,%%xmm6 \n"
|
|
"psrldq $0x2,%%xmm0 \n"
|
|
"paddusw %%xmm0,%%xmm6 \n"
|
|
"pshufb %%xmm2,%%xmm6 \n"
|
|
"movdqa %%xmm1,%%xmm7 \n"
|
|
"psrldq $0x2,%%xmm1 \n"
|
|
"paddusw %%xmm1,%%xmm7 \n"
|
|
"psrldq $0x2,%%xmm1 \n"
|
|
"paddusw %%xmm1,%%xmm7 \n"
|
|
"pshufb %%xmm3,%%xmm7 \n"
|
|
"paddusw %%xmm7,%%xmm6 \n"
|
|
"pmulhuw %%xmm4,%%xmm6 \n"
|
|
"packuswb %%xmm6,%%xmm6 \n"
|
|
"movd %%xmm6,(%1) \n"
|
|
"psrlq $0x10,%%xmm6 \n"
|
|
"movd %%xmm6,0x2(%1) \n"
|
|
"lea 0x6(%1),%1 \n"
|
|
"sub $0x6,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_ptr), // %0
|
|
"+r"(dst_ptr), // %1
|
|
"+r"(dst_width) // %2
|
|
: "r"((intptr_t)(src_stride)) // %3
|
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
|
"xmm7");
|
|
}
|
|
|
|
// Accumulates one row of bytes into a row of 16-bit sums:
// dst_ptr[i] = saturating_add_u16(dst_ptr[i], src_ptr[i]).
// Reads 16 source bytes and updates 16 shorts per loop iteration.
//   src_ptr   - source bytes.
//   dst_ptr   - 16-bit accumulators, read and written in place.
//   src_width - number of source bytes; processed in steps of 16 with the
//               loop body running at least once.
|
|
void ScaleAddRow_SSE2(const uint8_t* src_ptr,
|
|
uint16_t* dst_ptr,
|
|
int src_width) {
|
|
asm volatile(
|
|
|
|
// xmm5 = zero, used to widen bytes to words.
"pxor %%xmm5,%%xmm5 \n"
|
|
|
|
// 16 pixel loop.
|
|
LABELALIGN
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm3 \n"
|
|
"lea 0x10(%0),%0 \n" // src_ptr += 16
|
|
"movdqu (%1),%%xmm0 \n"
|
|
"movdqu 0x10(%1),%%xmm1 \n"
|
|
"movdqa %%xmm3,%%xmm2 \n"
|
|
"punpcklbw %%xmm5,%%xmm2 \n"
|
|
"punpckhbw %%xmm5,%%xmm3 \n"
|
|
"paddusw %%xmm2,%%xmm0 \n"
|
|
"paddusw %%xmm3,%%xmm1 \n"
|
|
"movdqu %%xmm0,(%1) \n"
|
|
"movdqu %%xmm1,0x10(%1) \n"
|
|
"lea 0x20(%1),%1 \n"
|
|
"sub $0x10,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_ptr), // %0
|
|
"+r"(dst_ptr), // %1
|
|
"+r"(src_width) // %2
|
|
:
|
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
|
|
}
|
|
|
|
#ifdef HAS_SCALEADDROW_AVX2
|
|
// Reads 32 bytes and accumulates to 32 shorts at a time.
// dst_ptr[i] = saturating_add_u16(dst_ptr[i], src_ptr[i]).
//   src_ptr   - source bytes.
//   dst_ptr   - 16-bit accumulators, read and written in place.
//   src_width - number of source bytes; processed in steps of 32 with the
//               loop body running at least once.
|
|
void ScaleAddRow_AVX2(const uint8_t* src_ptr,
|
|
uint16_t* dst_ptr,
|
|
int src_width) {
|
|
asm volatile(
|
|
|
|
// ymm5 = zero, used to widen bytes to words.
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
"vmovdqu (%0),%%ymm3 \n"
|
|
"lea 0x20(%0),%0 \n" // src_ptr += 32
|
|
// Pre-permute the 64-bit quarters (0,2,1,3) so that the per-lane unpacks
// below yield words in linear order.
"vpermq $0xd8,%%ymm3,%%ymm3 \n"
|
|
"vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
|
|
"vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
|
|
"vpaddusw (%1),%%ymm2,%%ymm0 \n"
|
|
"vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
|
|
"vmovdqu %%ymm0,(%1) \n"
|
|
"vmovdqu %%ymm1,0x20(%1) \n"
|
|
"lea 0x40(%1),%1 \n"
|
|
"sub $0x20,%2 \n"
|
|
"jg 1b \n"
|
|
"vzeroupper \n"
|
|
: "+r"(src_ptr), // %0
|
|
"+r"(dst_ptr), // %1
|
|
"+r"(src_width) // %2
|
|
:
|
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
|
|
}
|
|
#endif // HAS_SCALEADDROW_AVX2
|
|
|
|
// Constant for making pixels signed to avoid pmaddubsw
|
|
// saturation. Subtracted from the pixel bytes with psubb in
// ScaleFilterCols_SSSE3.
|
|
static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
|
|
|
|
// Constant for making pixels unsigned and adding .5 for rounding.
// 0x4000 (= 128 * 128) undoes the 0x80 bias after the weighted sum, whose two
// weights add up to 128; 0x40 is one half ahead of the final ">> 7".
|
|
static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
|
|
0x4040, 0x4040, 0x4040, 0x4040};
|
|
|
|
// Bilinear column filtering. SSSE3 version.
// For each output byte, blends the two adjacent source bytes at the current
// position: out = (p0 * (128 - f) + p1 * f + 64) >> 7, where f is a 7-bit
// fraction.
//   dst_ptr   - destination row.
//   src_ptr   - source row.
//   dst_width - number of output bytes; handled two at a time with a single
//               leftover output at the end.
//   x         - starting source position. Bits 16..31 are used as the source
//               byte index (pextrw of word 1) and the top 7 bits of the low
//               word as the fraction, i.e. it is treated as 16.16 fixed point.
//   dx        - per-output step added to x, in the same format.
|
|
void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
|
|
const uint8_t* src_ptr,
|
|
int dst_width,
|
|
int x,
|
|
int dx) {
|
|
// Scratch registers: x0/x1 hold the source indices of the two pixels in
// flight, temp_pixel is a general-purpose temporary pinned to eax/rax.
intptr_t x0, x1, temp_pixel;
|
|
asm volatile(
|
|
"movd %6,%%xmm2 \n"
|
|
"movd %7,%%xmm3 \n"
|
|
// xmm5 = pshufb control that broadcasts fraction bytes 0 and 4 into the
// byte pairs of the first two words.
"movl $0x04040000,%k2 \n"
|
|
"movd %k2,%%xmm5 \n"
|
|
"pcmpeqb %%xmm6,%%xmm6 \n"
|
|
"psrlw $0x9,%%xmm6 \n" // 0x007f007f
|
|
"pcmpeqb %%xmm7,%%xmm7 \n"
|
|
"psrlw $15,%%xmm7 \n" // 0x00010001
|
|
|
|
|
// x0 = integer part of x.
"pextrw $0x1,%%xmm2,%k3 \n"
|
|
// Fewer than 2 outputs requested: skip the pair loop.
"subl $0x2,%5 \n"
|
|
"jl 29f \n"
|
|
// xmm2 = (x, x + dx), xmm3 = (2 * dx, 2 * dx); x1 = integer part of x + dx.
"movdqa %%xmm2,%%xmm0 \n"
|
|
"paddd %%xmm3,%%xmm0 \n"
|
|
"punpckldq %%xmm0,%%xmm2 \n"
|
|
"punpckldq %%xmm3,%%xmm3 \n"
|
|
"paddd %%xmm3,%%xmm3 \n"
|
|
"pextrw $0x3,%%xmm2,%k4 \n"
|
|
|
|
// Two outputs per iteration.
LABELALIGN
|
|
"2: \n"
|
|
"movdqa %%xmm2,%%xmm1 \n"
|
|
"paddd %%xmm3,%%xmm2 \n"
|
|
// Load the byte pair at each of the two source positions.
"movzwl 0x00(%1,%3,1),%k2 \n"
|
|
"movd %k2,%%xmm0 \n"
|
|
"psrlw $0x9,%%xmm1 \n"
|
|
"movzwl 0x00(%1,%4,1),%k2 \n"
|
|
"movd %k2,%%xmm4 \n"
|
|
"pshufb %%xmm5,%%xmm1 \n"
|
|
"punpcklwd %%xmm4,%%xmm0 \n"
|
|
"psubb %8,%%xmm0 \n" // make pixels signed.
|
|
"pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) +
|
|
// 1
|
|
"paddusb %%xmm7,%%xmm1 \n"
|
|
"pmaddubsw %%xmm0,%%xmm1 \n"
|
|
// Fetch the source indices for the next pair.
"pextrw $0x1,%%xmm2,%k3 \n"
|
|
"pextrw $0x3,%%xmm2,%k4 \n"
|
|
"paddw %9,%%xmm1 \n" // make pixels unsigned.
|
|
"psrlw $0x7,%%xmm1 \n"
|
|
"packuswb %%xmm1,%%xmm1 \n"
|
|
"movd %%xmm1,%k2 \n"
|
|
"mov %w2,(%0) \n"
|
|
"lea 0x2(%0),%0 \n"
|
|
"subl $0x2,%5 \n"
|
|
"jge 2b \n"
|
|
|
|
// Leftover single output, if any.
LABELALIGN
|
|
"29: \n"
|
|
"addl $0x1,%5 \n"
|
|
"jl 99f \n"
|
|
"movzwl 0x00(%1,%3,1),%k2 \n"
|
|
"movd %k2,%%xmm0 \n"
|
|
"psrlw $0x9,%%xmm2 \n"
|
|
"pshufb %%xmm5,%%xmm2 \n"
|
|
"psubb %8,%%xmm0 \n" // make pixels signed.
|
|
"pxor %%xmm6,%%xmm2 \n"
|
|
"paddusb %%xmm7,%%xmm2 \n"
|
|
"pmaddubsw %%xmm0,%%xmm2 \n"
|
|
"paddw %9,%%xmm2 \n" // make pixels unsigned.
|
|
"psrlw $0x7,%%xmm2 \n"
|
|
"packuswb %%xmm2,%%xmm2 \n"
|
|
"movd %%xmm2,%k2 \n"
|
|
"mov %b2,(%0) \n"
|
|
"99: \n"
|
|
: "+r"(dst_ptr), // %0
|
|
"+r"(src_ptr), // %1
|
|
"=&a"(temp_pixel), // %2
|
|
"=&r"(x0), // %3
|
|
"=&r"(x1), // %4
|
|
#if defined(__x86_64__)
|
|
"+rm"(dst_width) // %5
|
|
#else
|
|
"+m"(dst_width) // %5
|
|
#endif
|
|
: "rm"(x), // %6
|
|
"rm"(dx), // %7
|
|
#if defined(__x86_64__)
|
|
"x"(kFsub80), // %8
|
|
"x"(kFadd40) // %9
|
|
#else
|
|
"m"(kFsub80), // %8
|
|
"m"(kFadd40) // %9
|
|
#endif
|
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
|
"xmm7");
|
|
}
|
|
|
|
// Reads 16 bytes, duplicates each one and writes 32 bytes per loop iteration
// (2x horizontal upscale by replication).
|
|
// No alignment requirement: loads and stores use movdqu.
//   dst_ptr   - destination row.
//   src_ptr   - source row.
//   dst_width - output bytes requested; processed in steps of 32 with the
//               loop body running at least once.
//   x, dx     - unused.
|
|
void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
|
|
const uint8_t* src_ptr,
|
|
int dst_width,
|
|
int x,
|
|
int dx) {
|
|
(void)x;
|
|
(void)dx;
|
|
asm volatile(
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
"movdqu (%1),%%xmm0 \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
// Interleaving a register with itself doubles every byte.
"punpcklbw %%xmm0,%%xmm0 \n"
|
|
"punpckhbw %%xmm1,%%xmm1 \n"
|
|
"movdqu %%xmm0,(%0) \n"
|
|
"movdqu %%xmm1,0x10(%0) \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
"sub $0x20,%2 \n"
|
|
"jg 1b \n"
|
|
|
|
: "+r"(dst_ptr), // %0
|
|
"+r"(src_ptr), // %1
|
|
"+r"(dst_width) // %2
|
|
::"memory",
|
|
"cc", "xmm0", "xmm1");
|
|
}
|
|
|
|
// Point-samples a row of 4-byte pixels down by 2: keeps the odd-indexed
// pixel of every pair.
//   src_argb   - source row; 8 pixels (32 bytes) are read per loop iteration.
//   src_stride - unused.
//   dst_argb   - destination row; 4 pixels (16 bytes) are written per
//                iteration.
//   dst_width  - output pixels requested; processed in steps of 4 with the
//                loop body running at least once.
void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
|
|
ptrdiff_t src_stride,
|
|
uint8_t* dst_argb,
|
|
int dst_width) {
|
|
(void)src_stride;
|
|
asm volatile(
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x10(%0),%%xmm1 \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
// 0xdd selects dwords 1 and 3 from each source register.
"shufps $0xdd,%%xmm1,%%xmm0 \n"
|
|
"movdqu %%xmm0,(%1) \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"sub $0x4,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_argb), // %0
|
|
"+r"(dst_argb), // %1
|
|
"+r"(dst_width) // %2
|
|
::"memory",
|
|
"cc", "xmm0", "xmm1");
|
|
}
|
|
|
|
void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
|
|
ptrdiff_t src_stride,
|
|
uint8_t* dst_argb,
|
|
int dst_width) {
|
|
(void)src_stride;
|
|
asm volatile(
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x10(%0),%%xmm1 \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
"movdqa %%xmm0,%%xmm2 \n"
|
|
"shufps $0x88,%%xmm1,%%xmm0 \n"
|
|
"shufps $0xdd,%%xmm1,%%xmm2 \n"
|
|
"pavgb %%xmm2,%%xmm0 \n"
|
|
"movdqu %%xmm0,(%1) \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"sub $0x4,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_argb), // %0
|
|
"+r"(dst_argb), // %1
|
|
"+r"(dst_width) // %2
|
|
::"memory",
|
|
"cc", "xmm0", "xmm1");
|
|
}
|
|
|
|
// Scales two adjacent rows of 4-byte pixels down by 2 in both directions.
// The rows are averaged first (pavgb), then each pair of adjacent pixels is
// averaged (pavgb again), per byte channel.
//   src_argb   - first source row; 8 pixels (32 bytes) are read per row per
//                loop iteration.
//   src_stride - byte offset from src_argb to the second source row.
//   dst_argb   - destination row; 4 pixels (16 bytes) are written per
//                iteration.
//   dst_width  - output pixels requested; processed in steps of 4 with the
//                loop body running at least once.
void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
|
|
ptrdiff_t src_stride,
|
|
uint8_t* dst_argb,
|
|
int dst_width) {
|
|
asm volatile(
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x10(%0),%%xmm1 \n"
|
|
"movdqu 0x00(%0,%3,1),%%xmm2 \n"
|
|
"movdqu 0x10(%0,%3,1),%%xmm3 \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
// Vertical average of the two rows.
"pavgb %%xmm2,%%xmm0 \n"
|
|
"pavgb %%xmm3,%%xmm1 \n"
|
|
// Horizontal average of even and odd pixels.
"movdqa %%xmm0,%%xmm2 \n"
|
|
"shufps $0x88,%%xmm1,%%xmm0 \n"
|
|
"shufps $0xdd,%%xmm1,%%xmm2 \n"
|
|
"pavgb %%xmm2,%%xmm0 \n"
|
|
"movdqu %%xmm0,(%1) \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"sub $0x4,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_argb), // %0
|
|
"+r"(dst_argb), // %1
|
|
"+r"(dst_width) // %2
|
|
: "r"((intptr_t)(src_stride)) // %3
|
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
|
|
}
|
|
|
|
// Reads 4 pixels at a time, each src_stepx pixels apart, and writes them as
// 4 consecutive pixels.
|
|
// No alignment requirement: the 16-byte store uses movdqu.
//   src_argb   - source row of 4-byte pixels.
//   src_stride - unused.
//   src_stepx  - distance between sampled pixels, in pixels (multiplied by 4
//                inside the asm to get bytes).
//   dst_argb   - destination row; 16 bytes are written per loop iteration.
//   dst_width  - output pixels requested; processed in steps of 4 with the
//                loop body running at least once.
|
|
void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
|
|
ptrdiff_t src_stride,
|
|
int src_stepx,
|
|
uint8_t* dst_argb,
|
|
int dst_width) {
|
|
// Holds src_stepx on entry; the asm scales it to a byte step (x4).
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
|
|
// Scratch register the asm fills with the byte step times 3 (x12).
intptr_t src_stepx_x12;
|
|
(void)src_stride;
|
|
asm volatile(
|
|
"lea 0x00(,%1,4),%1 \n"
|
|
"lea 0x00(%1,%1,2),%4 \n"
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
// Gather the pixels at byte offsets 0, step, 2*step and 3*step.
"movd (%0),%%xmm0 \n"
|
|
"movd 0x00(%0,%1,1),%%xmm1 \n"
|
|
"punpckldq %%xmm1,%%xmm0 \n"
|
|
"movd 0x00(%0,%1,2),%%xmm2 \n"
|
|
"movd 0x00(%0,%4,1),%%xmm3 \n"
|
|
"lea 0x00(%0,%1,4),%0 \n"
|
|
"punpckldq %%xmm3,%%xmm2 \n"
|
|
"punpcklqdq %%xmm2,%%xmm0 \n"
|
|
"movdqu %%xmm0,(%2) \n"
|
|
"lea 0x10(%2),%2 \n"
|
|
"sub $0x4,%3 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_argb), // %0
|
|
"+r"(src_stepx_x4), // %1
|
|
"+r"(dst_argb), // %2
|
|
"+r"(dst_width), // %3
|
|
"=&r"(src_stepx_x12) // %4
|
|
::"memory",
|
|
"cc", "xmm0", "xmm1", "xmm2", "xmm3");
|
|
}
|
|
|
|
// Blends four 2x2 to 4x1: at each of 4 sample positions, src_stepx pixels
// apart, a 2x2 block of pixels from two rows is averaged (rows first, then
// the two columns, both with pavgb).
|
|
// No alignment requirement: the 16-byte store uses movdqu.
//   src_argb   - first source row of 4-byte pixels.
//   src_stride - byte offset from src_argb to the second source row.
//   src_stepx  - distance between sample positions, in pixels.
//   dst_argb   - destination row; 16 bytes are written per loop iteration.
//   dst_width  - output pixels requested; processed in steps of 4 with the
//                loop body running at least once.
|
|
void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
|
|
ptrdiff_t src_stride,
|
|
int src_stepx,
|
|
uint8_t* dst_argb,
|
|
int dst_width) {
|
|
// Holds src_stepx on entry; the asm scales it to a byte step (x4).
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
|
|
// Scratch register the asm fills with the byte step times 3 (x12).
intptr_t src_stepx_x12;
|
|
// Holds src_stride on entry; the asm turns it into a pointer to the second
// row (src_argb + src_stride).
intptr_t row1 = (intptr_t)(src_stride);
|
|
asm volatile(
|
|
"lea 0x00(,%1,4),%1 \n"
|
|
"lea 0x00(%1,%1,2),%4 \n"
|
|
"lea 0x00(%0,%5,1),%5 \n"
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
// First row: two adjacent pixels (8 bytes) at each of the 4 positions.
"movq (%0),%%xmm0 \n"
|
|
"movhps 0x00(%0,%1,1),%%xmm0 \n"
|
|
"movq 0x00(%0,%1,2),%%xmm1 \n"
|
|
"movhps 0x00(%0,%4,1),%%xmm1 \n"
|
|
"lea 0x00(%0,%1,4),%0 \n"
|
|
// Second row, same positions.
"movq (%5),%%xmm2 \n"
|
|
"movhps 0x00(%5,%1,1),%%xmm2 \n"
|
|
"movq 0x00(%5,%1,2),%%xmm3 \n"
|
|
"movhps 0x00(%5,%4,1),%%xmm3 \n"
|
|
"lea 0x00(%5,%1,4),%5 \n"
|
|
// Vertical average, then average of the left and right pixel of each pair.
"pavgb %%xmm2,%%xmm0 \n"
|
|
"pavgb %%xmm3,%%xmm1 \n"
|
|
"movdqa %%xmm0,%%xmm2 \n"
|
|
"shufps $0x88,%%xmm1,%%xmm0 \n"
|
|
"shufps $0xdd,%%xmm1,%%xmm2 \n"
|
|
"pavgb %%xmm2,%%xmm0 \n"
|
|
"movdqu %%xmm0,(%2) \n"
|
|
"lea 0x10(%2),%2 \n"
|
|
"sub $0x4,%3 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_argb), // %0
|
|
"+r"(src_stepx_x4), // %1
|
|
"+r"(dst_argb), // %2
|
|
"+rm"(dst_width), // %3
|
|
"=&r"(src_stepx_x12), // %4
|
|
"+r"(row1) // %5
|
|
::"memory",
|
|
"cc", "xmm0", "xmm1", "xmm2", "xmm3");
|
|
}
|
|
|
|
void ScaleARGBCols_SSE2(uint8_t* dst_argb,
|
|
const uint8_t* src_argb,
|
|
int dst_width,
|
|
int x,
|
|
int dx) {
|
|
intptr_t x0, x1;
|
|
asm volatile(
|
|
"movd %5,%%xmm2 \n"
|
|
"movd %6,%%xmm3 \n"
|
|
"pshufd $0x0,%%xmm2,%%xmm2 \n"
|
|
"pshufd $0x11,%%xmm3,%%xmm0 \n"
|
|
"paddd %%xmm0,%%xmm2 \n"
|
|
"paddd %%xmm3,%%xmm3 \n"
|
|
"pshufd $0x5,%%xmm3,%%xmm0 \n"
|
|
"paddd %%xmm0,%%xmm2 \n"
|
|
"paddd %%xmm3,%%xmm3 \n"
|
|
"pshufd $0x0,%%xmm3,%%xmm3 \n"
|
|
"pextrw $0x1,%%xmm2,%k0 \n"
|
|
"pextrw $0x3,%%xmm2,%k1 \n"
|
|
"cmp $0x0,%4 \n"
|
|
"jl 99f \n"
|
|
"sub $0x4,%4 \n"
|
|
"jl 49f \n"
|
|
|
|
LABELALIGN
|
|
"40: \n"
|
|
"movd 0x00(%3,%0,4),%%xmm0 \n"
|
|
"movd 0x00(%3,%1,4),%%xmm1 \n"
|
|
"pextrw $0x5,%%xmm2,%k0 \n"
|
|
"pextrw $0x7,%%xmm2,%k1 \n"
|
|
"paddd %%xmm3,%%xmm2 \n"
|
|
"punpckldq %%xmm1,%%xmm0 \n"
|
|
"movd 0x00(%3,%0,4),%%xmm1 \n"
|
|
"movd 0x00(%3,%1,4),%%xmm4 \n"
|
|
"pextrw $0x1,%%xmm2,%k0 \n"
|
|
"pextrw $0x3,%%xmm2,%k1 \n"
|
|
"punpckldq %%xmm4,%%xmm1 \n"
|
|
"punpcklqdq %%xmm1,%%xmm0 \n"
|
|
"movdqu %%xmm0,(%2) \n"
|
|
"lea 0x10(%2),%2 \n"
|
|
"sub $0x4,%4 \n"
|
|
"jge 40b \n"
|
|
|
|
"49: \n"
|
|
"test $0x2,%4 \n"
|
|
"je 29f \n"
|
|
"movd 0x00(%3,%0,4),%%xmm0 \n"
|
|
"movd 0x00(%3,%1,4),%%xmm1 \n"
|
|
"pextrw $0x5,%%xmm2,%k0 \n"
|
|
"punpckldq %%xmm1,%%xmm0 \n"
|
|
"movq %%xmm0,(%2) \n"
|
|
"lea 0x8(%2),%2 \n"
|
|
"29: \n"
|
|
"test $0x1,%4 \n"
|
|
"je 99f \n"
|
|
"movd 0x00(%3,%0,4),%%xmm0 \n"
|
|
"movd %%xmm0,(%2) \n"
|
|
"99: \n"
|
|
: "=&a"(x0), // %0
|
|
"=&d"(x1), // %1
|
|
"+r"(dst_argb), // %2
|
|
"+r"(src_argb), // %3
|
|
"+r"(dst_width) // %4
|
|
: "rm"(x), // %5
|
|
"rm"(dx) // %6
|
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
|
|
}
|
|
|
|
// Reads 4 pixels, duplicates them and writes 8 pixels.
|
|
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
|
|
void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
|
|
const uint8_t* src_argb,
|
|
int dst_width,
|
|
int x,
|
|
int dx) {
|
|
(void)x;
|
|
(void)dx;
|
|
asm volatile(
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
"movdqu (%1),%%xmm0 \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"punpckldq %%xmm0,%%xmm0 \n"
|
|
"punpckhdq %%xmm1,%%xmm1 \n"
|
|
"movdqu %%xmm0,(%0) \n"
|
|
"movdqu %%xmm1,0x10(%0) \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
"sub $0x8,%2 \n"
|
|
"jg 1b \n"
|
|
|
|
: "+r"(dst_argb), // %0
|
|
"+r"(src_argb), // %1
|
|
"+r"(dst_width) // %2
|
|
::"memory",
|
|
"cc", "xmm0", "xmm1");
|
|
}
|
|
|
|
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
|
|
static const uvec8 kShuffleColARGB = {
|
|
0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
|
|
8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
|
|
};
|
|
|
|
// Shuffle table for duplicating 2 fractions into 8 bytes each
|
|
static const uvec8 kShuffleFractions = {
|
|
0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
|
|
};
|
|
|
|
// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
|
|
void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
|
|
const uint8_t* src_argb,
|
|
int dst_width,
|
|
int x,
|
|
int dx) {
|
|
intptr_t x0, x1;
|
|
asm volatile(
|
|
"movdqa %0,%%xmm4 \n"
|
|
"movdqa %1,%%xmm5 \n"
|
|
:
|
|
: "m"(kShuffleColARGB), // %0
|
|
"m"(kShuffleFractions) // %1
|
|
);
|
|
|
|
asm volatile(
|
|
"movd %5,%%xmm2 \n"
|
|
"movd %6,%%xmm3 \n"
|
|
"pcmpeqb %%xmm6,%%xmm6 \n"
|
|
"psrlw $0x9,%%xmm6 \n"
|
|
"pextrw $0x1,%%xmm2,%k3 \n"
|
|
"sub $0x2,%2 \n"
|
|
"jl 29f \n"
|
|
"movdqa %%xmm2,%%xmm0 \n"
|
|
"paddd %%xmm3,%%xmm0 \n"
|
|
"punpckldq %%xmm0,%%xmm2 \n"
|
|
"punpckldq %%xmm3,%%xmm3 \n"
|
|
"paddd %%xmm3,%%xmm3 \n"
|
|
"pextrw $0x3,%%xmm2,%k4 \n"
|
|
|
|
LABELALIGN
|
|
"2: \n"
|
|
"movdqa %%xmm2,%%xmm1 \n"
|
|
"paddd %%xmm3,%%xmm2 \n"
|
|
"movq 0x00(%1,%3,4),%%xmm0 \n"
|
|
"psrlw $0x9,%%xmm1 \n"
|
|
"movhps 0x00(%1,%4,4),%%xmm0 \n"
|
|
"pshufb %%xmm5,%%xmm1 \n"
|
|
"pshufb %%xmm4,%%xmm0 \n"
|
|
"pxor %%xmm6,%%xmm1 \n"
|
|
"pmaddubsw %%xmm1,%%xmm0 \n"
|
|
"psrlw $0x7,%%xmm0 \n"
|
|
"pextrw $0x1,%%xmm2,%k3 \n"
|
|
"pextrw $0x3,%%xmm2,%k4 \n"
|
|
"packuswb %%xmm0,%%xmm0 \n"
|
|
"movq %%xmm0,(%0) \n"
|
|
"lea 0x8(%0),%0 \n"
|
|
"sub $0x2,%2 \n"
|
|
"jge 2b \n"
|
|
|
|
LABELALIGN
|
|
"29: \n"
|
|
"add $0x1,%2 \n"
|
|
"jl 99f \n"
|
|
"psrlw $0x9,%%xmm2 \n"
|
|
"movq 0x00(%1,%3,4),%%xmm0 \n"
|
|
"pshufb %%xmm5,%%xmm2 \n"
|
|
"pshufb %%xmm4,%%xmm0 \n"
|
|
"pxor %%xmm6,%%xmm2 \n"
|
|
"pmaddubsw %%xmm2,%%xmm0 \n"
|
|
"psrlw $0x7,%%xmm0 \n"
|
|
"packuswb %%xmm0,%%xmm0 \n"
|
|
"movd %%xmm0,(%0) \n"
|
|
|
|
LABELALIGN
|
|
"99: \n" // clang-format error.
|
|
|
|
: "+r"(dst_argb), // %0
|
|
"+r"(src_argb), // %1
|
|
"+rm"(dst_width), // %2
|
|
"=&r"(x0), // %3
|
|
"=&r"(x1) // %4
|
|
: "rm"(x), // %5
|
|
"rm"(dx) // %6
|
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
|
|
}
|
|
|
|
// Returns (num << 16) / div, i.e. num / div as a 16.16 fixed point value.
// The 64-bit dividend is built in edx:eax and divided with a single idiv.
// No check for div == 0 is made here.
int FixedDiv_X86(int num, int div) {
  int quotient = num;  // pinned to eax: dividend low half in, quotient out
  __asm__ volatile(
      "cdq \n"                       // sign-extend eax into edx:eax
      "shld $0x10,%%eax,%%edx \n"    // edx:eax <<= 16 (high half)
      "shl $0x10,%%eax \n"           // edx:eax <<= 16 (low half)
      "idiv %1 \n"                   // eax = edx:eax / div
      "mov %0, %%eax \n"             // %0 is eax, so this leaves eax as is
      : "+a"(quotient)  // %0
      : "c"(div)        // %1
      : "memory", "cc", "edx");
  return quotient;
}
|
|
|
|
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
// Precisely: returns ((num << 16) - 0x10001) / (div - 1), with the 64-bit
// dividend held in edx:eax. No check for div == 1 is made here.
//
// div is declared as a read-write operand because the asm decrements it;
// an input-only operand must not be modified by the asm body.
// The __asm__ spelling also compiles under strict ISO C modes.
int FixedDiv1_X86(int num, int div) {
  __asm__ volatile(
      "cdq \n"                       // sign-extend eax into edx:eax
      "shld $0x10,%%eax,%%edx \n"    // edx:eax <<= 16 (high half)
      "shl $0x10,%%eax \n"           // edx:eax <<= 16 (low half)
      "sub $0x10001,%%eax \n"        // edx:eax -= 0x10001 (low half)
      "sbb $0x0,%%edx \n"            // propagate the borrow
      "sub $0x1,%1 \n"               // div - 1
      "idiv %1 \n"                   // eax = edx:eax / (div - 1)
      "mov %0, %%eax \n"             // %0 is eax, so this leaves eax as is
      : "+a"(num),  // %0
        "+c"(div)   // %1
      :
      : "memory", "cc", "edx");
  return num;
}
|
|
|
|
#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
|
|
// Shuffle table for splitting UV into upper and lower part of register.
|
|
static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
|
|
1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
|
|
// pshufb control that re-interleaves the low byte of each 16-bit word:
// words 0..3 are alternated with words 4..7 into the low 8 bytes, and the
// high 8 bytes are zeroed (0x80 selects zero).
static const uvec8 kShuffleMergeUV = {
    0u,   8u,   2u,   10u,  4u,   12u,  6u,   14u,   // interleaved low bytes
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80   // zero fill
};
|
|
|
|
void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
uint8_t* dst_ptr,
|
|
int dst_width) {
|
|
asm volatile(
|
|
"pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
|
|
"psrlw $0xf,%%xmm4 \n"
|
|
"packuswb %%xmm4,%%xmm4 \n"
|
|
"pxor %%xmm5, %%xmm5 \n" // zero
|
|
"movdqa %4,%%xmm1 \n" // split shuffler
|
|
"movdqa %5,%%xmm3 \n" // merge shuffler
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n" // 8 UV row 0
|
|
"movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1
|
|
"lea 0x10(%0),%0 \n"
|
|
"pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv
|
|
"pshufb %%xmm1,%%xmm2 \n"
|
|
"pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add
|
|
"pmaddubsw %%xmm4,%%xmm2 \n"
|
|
"paddw %%xmm2,%%xmm0 \n" // vertical add
|
|
"psrlw $0x1,%%xmm0 \n" // round
|
|
"pavgw %%xmm5,%%xmm0 \n"
|
|
"pshufb %%xmm3,%%xmm0 \n" // merge uv
|
|
"movq %%xmm0,(%1) \n"
|
|
"lea 0x8(%1),%1 \n" // 4 UV
|
|
"sub $0x4,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_ptr), // %0
|
|
"+r"(dst_ptr), // %1
|
|
"+r"(dst_width) // %2
|
|
: "r"((intptr_t)(src_stride)), // %3
|
|
"m"(kShuffleSplitUV), // %4
|
|
"m"(kShuffleMergeUV) // %5
|
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
|
|
}
|
|
#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3
|
|
|
|
#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
|
|
void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
uint8_t* dst_ptr,
|
|
int dst_width) {
|
|
asm volatile(
|
|
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
|
|
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
|
|
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
|
|
"vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero
|
|
"vbroadcastf128 %4,%%ymm1 \n" // split shuffler
|
|
"vbroadcastf128 %5,%%ymm3 \n" // merge shuffler
|
|
|
|
LABELALIGN
|
|
"1: \n"
|
|
"vmovdqu (%0),%%ymm0 \n" // 16 UV row 0
|
|
"vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1
|
|
"lea 0x20(%0),%0 \n"
|
|
"vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv
|
|
"vpshufb %%ymm1,%%ymm2,%%ymm2 \n"
|
|
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add
|
|
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
|
|
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add
|
|
"vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round
|
|
"vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
|
|
"vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv
|
|
"vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords
|
|
"vmovdqu %%xmm0,(%1) \n"
|
|
"lea 0x10(%1),%1 \n" // 8 UV
|
|
"sub $0x8,%2 \n"
|
|
"jg 1b \n"
|
|
"vzeroupper \n"
|
|
: "+r"(src_ptr), // %0
|
|
"+r"(dst_ptr), // %1
|
|
"+r"(dst_width) // %2
|
|
: "r"((intptr_t)(src_stride)), // %3
|
|
"m"(kShuffleSplitUV), // %4
|
|
"m"(kShuffleMergeUV) // %5
|
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
|
|
}
|
|
#endif // HAS_SCALEUVROWDOWN2BOX_AVX2
|
|
|
|
#endif // defined(__x86_64__) || defined(__i386__)
|
|
|
|
#ifdef __cplusplus
|
|
} // extern "C"
|
|
} // namespace libyuv
|
|
#endif
|