1*fb1b10abSAndroid Build Coastguard Worker /*
2*fb1b10abSAndroid Build Coastguard Worker * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3*fb1b10abSAndroid Build Coastguard Worker *
4*fb1b10abSAndroid Build Coastguard Worker * Use of this source code is governed by a BSD-style license
5*fb1b10abSAndroid Build Coastguard Worker * that can be found in the LICENSE file in the root of the source
6*fb1b10abSAndroid Build Coastguard Worker * tree. An additional intellectual property rights grant can be found
7*fb1b10abSAndroid Build Coastguard Worker * in the file PATENTS. All contributing project authors may
8*fb1b10abSAndroid Build Coastguard Worker * be found in the AUTHORS file in the root of the source tree.
9*fb1b10abSAndroid Build Coastguard Worker */
10*fb1b10abSAndroid Build Coastguard Worker
11*fb1b10abSAndroid Build Coastguard Worker #include <arm_neon.h>
12*fb1b10abSAndroid Build Coastguard Worker #include <assert.h>
13*fb1b10abSAndroid Build Coastguard Worker
14*fb1b10abSAndroid Build Coastguard Worker #include "./vpx_config.h"
15*fb1b10abSAndroid Build Coastguard Worker #include "./vpx_dsp_rtcd.h"
16*fb1b10abSAndroid Build Coastguard Worker #include "vpx/vpx_integer.h"
17*fb1b10abSAndroid Build Coastguard Worker #include "vpx_dsp/arm/mem_neon.h"
18*fb1b10abSAndroid Build Coastguard Worker
/*
 * Writes the residual diff = src - pred for a rows x cols block of 8-bit
 * samples. Each difference is produced by a widening subtract (u8 -> u16) and
 * stored reinterpreted as int16_t, so negative differences come out signed.
 *
 * All three strides are counted in elements of their own buffer.
 *
 * The kernel is selected by block width:
 *   cols > 16 : rows are walked in steps of 32 pixels,
 *   cols > 8  : one 16-pixel vector per row,
 *   cols > 4  : one 8-pixel vector per row,
 *   otherwise : cols must equal 4 (asserted); two rows are handled per step.
 *
 * Every row loop is a do/while, so the body runs at least once; rows is
 * expected to be positive. NOTE(review): the 4-wide path decrements the row
 * counter by two and stops only at exactly zero, so it presumably relies on
 * rows being even - confirm against callers.
 */
void vpx_subtract_block_neon(int rows, int cols, int16_t *diff,
                             ptrdiff_t diff_stride, const uint8_t *src,
                             ptrdiff_t src_stride, const uint8_t *pred,
                             ptrdiff_t pred_stride) {
  int rows_left = rows;

  if (cols > 16) {
    do {
      int x;
      for (x = 0; x < cols; x += 32) {
        const uint8x16_t src_a = vld1q_u8(src + x);
        const uint8x16_t src_b = vld1q_u8(src + x + 16);
        const uint8x16_t pred_a = vld1q_u8(pred + x);
        const uint8x16_t pred_b = vld1q_u8(pred + x + 16);
        /* Widen each 8-lane half to 16 bits while subtracting. */
        const int16x8_t res0 = vreinterpretq_s16_u16(
            vsubl_u8(vget_low_u8(src_a), vget_low_u8(pred_a)));
        const int16x8_t res1 = vreinterpretq_s16_u16(
            vsubl_u8(vget_high_u8(src_a), vget_high_u8(pred_a)));
        const int16x8_t res2 = vreinterpretq_s16_u16(
            vsubl_u8(vget_low_u8(src_b), vget_low_u8(pred_b)));
        const int16x8_t res3 = vreinterpretq_s16_u16(
            vsubl_u8(vget_high_u8(src_b), vget_high_u8(pred_b)));
        vst1q_s16(diff + x, res0);
        vst1q_s16(diff + x + 8, res1);
        vst1q_s16(diff + x + 16, res2);
        vst1q_s16(diff + x + 24, res3);
      }
      src += src_stride;
      pred += pred_stride;
      diff += diff_stride;
    } while (--rows_left != 0);
    return;
  }

  if (cols > 8) {
    do {
      const uint8x16_t src_px = vld1q_u8(src);
      const uint8x16_t pred_px = vld1q_u8(pred);
      const int16x8_t res_lo = vreinterpretq_s16_u16(
          vsubl_u8(vget_low_u8(src_px), vget_low_u8(pred_px)));
      const int16x8_t res_hi = vreinterpretq_s16_u16(
          vsubl_u8(vget_high_u8(src_px), vget_high_u8(pred_px)));
      vst1q_s16(diff, res_lo);
      vst1q_s16(diff + 8, res_hi);
      src += src_stride;
      pred += pred_stride;
      diff += diff_stride;
    } while (--rows_left != 0);
    return;
  }

  if (cols > 4) {
    do {
      const uint8x8_t src_px = vld1_u8(src);
      const uint8x8_t pred_px = vld1_u8(pred);
      const int16x8_t res = vreinterpretq_s16_u16(vsubl_u8(src_px, pred_px));
      vst1q_s16(diff, res);
      src += src_stride;
      pred += pred_stride;
      diff += diff_stride;
    } while (--rows_left != 0);
    return;
  }

  assert(cols == 4);
  do {
    /*
     * NOTE(review): load_unaligned_u8 presumably packs 4 bytes from each of
     * two consecutive rows into one 8-lane vector; the stores below treat the
     * low half as the first row and the high half as the second.
     */
    const uint8x8_t src_px = load_unaligned_u8(src, (int)src_stride);
    const uint8x8_t pred_px = load_unaligned_u8(pred, (int)pred_stride);
    const uint16x8_t res = vsubl_u8(src_px, pred_px);
    vst1_s16(diff, vreinterpret_s16_u16(vget_low_u16(res)));
    vst1_s16(diff + diff_stride, vreinterpret_s16_u16(vget_high_u16(res)));
    src += 2 * src_stride;
    pred += 2 * pred_stride;
    diff += 2 * diff_stride;
    rows_left -= 2;
  } while (rows_left != 0);
}
82*fb1b10abSAndroid Build Coastguard Worker
83*fb1b10abSAndroid Build Coastguard Worker #if CONFIG_VP9_HIGHBITDEPTH
/*
 * High-bitdepth counterpart of the block subtraction: writes
 * diff_ptr = src - pred for a rows x cols block of 16-bit samples.
 *
 * src8_ptr and pred8_ptr are passed through CONVERT_TO_SHORTPTR to obtain the
 * uint16_t sample pointers that are actually read. The subtraction is done in
 * 16 bits and the result is stored reinterpreted as int16_t. All strides are
 * counted in elements of their own buffer. bd is accepted but not used.
 *
 * The kernel is selected by block width:
 *   cols >= 16 : rows are walked in steps of 16 samples,
 *   cols >= 8  : rows are walked in steps of 8 samples,
 *   cols >= 4  : rows are walked in steps of 4 samples,
 *   cols <  4  : nothing is written.
 *
 * Every row loop is a do/while, so the body runs at least once; rows is
 * expected to be positive.
 */
void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr,
                                    ptrdiff_t diff_stride,
                                    const uint8_t *src8_ptr,
                                    ptrdiff_t src_stride,
                                    const uint8_t *pred8_ptr,
                                    ptrdiff_t pred_stride, int bd) {
  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src8_ptr);
  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(pred8_ptr);
  int rows_left = rows;
  (void)bd;

  if (cols >= 16) {
    do {
      int x;
      for (x = 0; x < cols; x += 16) {
        const uint16x8_t src_a = vld1q_u16(src_row + x);
        const uint16x8_t src_b = vld1q_u16(src_row + x + 8);
        const uint16x8_t pred_a = vld1q_u16(pred_row + x);
        const uint16x8_t pred_b = vld1q_u16(pred_row + x + 8);
        const int16x8_t res_a = vreinterpretq_s16_u16(vsubq_u16(src_a, pred_a));
        const int16x8_t res_b = vreinterpretq_s16_u16(vsubq_u16(src_b, pred_b));
        vst1q_s16(diff_ptr + x, res_a);
        vst1q_s16(diff_ptr + x + 8, res_b);
      }
      src_row += src_stride;
      pred_row += pred_stride;
      diff_ptr += diff_stride;
    } while (--rows_left != 0);
    return;
  }

  if (cols >= 8) {
    do {
      int x;
      for (x = 0; x < cols; x += 8) {
        const uint16x8_t src_px = vld1q_u16(src_row + x);
        const uint16x8_t pred_px = vld1q_u16(pred_row + x);
        const int16x8_t res = vreinterpretq_s16_u16(vsubq_u16(src_px, pred_px));
        vst1q_s16(diff_ptr + x, res);
      }
      src_row += src_stride;
      pred_row += pred_stride;
      diff_ptr += diff_stride;
    } while (--rows_left != 0);
    return;
  }

  if (cols >= 4) {
    do {
      int x;
      for (x = 0; x < cols; x += 4) {
        const uint16x4_t src_px = vld1_u16(src_row + x);
        const uint16x4_t pred_px = vld1_u16(pred_row + x);
        const int16x4_t res = vreinterpret_s16_u16(vsub_u16(src_px, pred_px));
        vst1_s16(diff_ptr + x, res);
      }
      src_row += src_stride;
      pred_row += pred_stride;
      diff_ptr += diff_stride;
    } while (--rows_left != 0);
  }
}
137*fb1b10abSAndroid Build Coastguard Worker #endif // CONFIG_VP9_HIGHBITDEPTH
138