1*dfc6aa5cSAndroid Build Coastguard Worker /*
2*dfc6aa5cSAndroid Build Coastguard Worker * jdsample-neon.c - upsampling (Arm Neon)
3*dfc6aa5cSAndroid Build Coastguard Worker *
4*dfc6aa5cSAndroid Build Coastguard Worker * Copyright (C) 2020, Arm Limited. All Rights Reserved.
5*dfc6aa5cSAndroid Build Coastguard Worker * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
6*dfc6aa5cSAndroid Build Coastguard Worker *
7*dfc6aa5cSAndroid Build Coastguard Worker * This software is provided 'as-is', without any express or implied
8*dfc6aa5cSAndroid Build Coastguard Worker * warranty. In no event will the authors be held liable for any damages
9*dfc6aa5cSAndroid Build Coastguard Worker * arising from the use of this software.
10*dfc6aa5cSAndroid Build Coastguard Worker *
11*dfc6aa5cSAndroid Build Coastguard Worker * Permission is granted to anyone to use this software for any purpose,
12*dfc6aa5cSAndroid Build Coastguard Worker * including commercial applications, and to alter it and redistribute it
13*dfc6aa5cSAndroid Build Coastguard Worker * freely, subject to the following restrictions:
14*dfc6aa5cSAndroid Build Coastguard Worker *
15*dfc6aa5cSAndroid Build Coastguard Worker * 1. The origin of this software must not be misrepresented; you must not
16*dfc6aa5cSAndroid Build Coastguard Worker * claim that you wrote the original software. If you use this software
17*dfc6aa5cSAndroid Build Coastguard Worker * in a product, an acknowledgment in the product documentation would be
18*dfc6aa5cSAndroid Build Coastguard Worker * appreciated but is not required.
19*dfc6aa5cSAndroid Build Coastguard Worker * 2. Altered source versions must be plainly marked as such, and must not be
20*dfc6aa5cSAndroid Build Coastguard Worker * misrepresented as being the original software.
21*dfc6aa5cSAndroid Build Coastguard Worker * 3. This notice may not be removed or altered from any source distribution.
22*dfc6aa5cSAndroid Build Coastguard Worker */
23*dfc6aa5cSAndroid Build Coastguard Worker
24*dfc6aa5cSAndroid Build Coastguard Worker #define JPEG_INTERNALS
25*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jinclude.h"
26*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jpeglib.h"
27*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jsimd.h"
28*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jdct.h"
29*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jsimddct.h"
30*dfc6aa5cSAndroid Build Coastguard Worker #include "../jsimd.h"
31*dfc6aa5cSAndroid Build Coastguard Worker
32*dfc6aa5cSAndroid Build Coastguard Worker #include <arm_neon.h>
33*dfc6aa5cSAndroid Build Coastguard Worker
34*dfc6aa5cSAndroid Build Coastguard Worker
/* The diagram below shows a row of samples produced by h2v1 downsampling.
 *
 *                s0        s1        s2
 *            +---------+---------+---------+
 *            |         |         |         |
 *            | p0   p1 | p2   p3 | p4   p5 |
 *            |         |         |         |
 *            +---------+---------+---------+
 *
 * Samples s0-s2 were created by averaging the original pixel component values
 * centered at positions p0-p5 above. To approximate those original pixel
 * component values, we proportionally blend the adjacent samples in each row.
 *
 * An upsampled pixel component value is computed by blending the sample
 * containing the pixel center with the nearest neighboring sample, in the
 * ratio 3:1. For example:
 *     p1(upsampled) = 3/4 * s0 + 1/4 * s1
 *     p2(upsampled) = 3/4 * s1 + 1/4 * s0
 * When computing the first and last pixel component values in the row, there
 * is no adjacent sample to blend, so:
 *     p0(upsampled) = s0
 *     p5(upsampled) = s2
 */
58*dfc6aa5cSAndroid Build Coastguard Worker
/* "Fancy" (3:1 blended) h2v1 upsampling using Arm Neon.
 *
 * max_v_samp_factor  number of rows to process
 * downsampled_width  number of samples in each input row
 * input_data         input rows; row i is read from input_data[i]
 * output_data_ptr    pointer to the output row array; row i receives
 *                    2 * downsampled_width samples in (*output_data_ptr)[i]
 *
 * Each row is handled 16 input samples at a time.  Loads and stores are whole
 * 128-bit vectors, so they can extend past downsampled_width; like the
 * original code, this relies on the row padding described under "Creation of
 * 2-D sample arrays" in jmemmgr.c.
 */
void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  const uint8x8_t k3_u8 = vdup_n_u8(3);
  const uint16x8_t k1_u16 = vdupq_n_u16(1);
  int row;

  for (row = 0; row < max_v_samp_factor; row++) {
    JSAMPROW src = input_data[row];
    JSAMPROW dst = output_data[row];
    /* Column at which the next group of 16 input samples starts. */
    unsigned next_col = 16;
    /* Output index for the pending store.  It is 1 for the first group
     * (dst[0] is written separately) and 2 * column - 1 for later groups.
     */
    unsigned store_at = 1;
    uint8x16_t lhs, rhs;
    uint8x8_t lhs_lo, lhs_hi, rhs_lo, rhs_hi;
    /* 16-bit weighted sums; "_lo"/"_hi" are the low/high 8 lanes.
     *   lhs3_rhs1 = 3 * lhs + rhs      (right-hand pixel of the lhs sample)
     *   lhs1_rhs3 = lhs + 3 * rhs + 1  (left-hand pixel of the rhs sample)
     */
    uint16x8_t lhs3_rhs1_lo, lhs3_rhs1_hi, lhs1_rhs3_lo, lhs1_rhs3_hi;
    uint8x16x2_t interleaved;

    /* The leftmost output pixel has no left neighbor: copy the sample. */
    dst[0] = (JSAMPLE)GETJSAMPLE(src[0]);

    /* Pipeline prologue: weighted sums for the first 16 sample pairs. */
    lhs = vld1q_u8(src);
    rhs = vld1q_u8(src + 1);
    lhs_lo = vget_low_u8(lhs);
    lhs_hi = vget_high_u8(lhs);
    rhs_lo = vget_low_u8(rhs);
    rhs_hi = vget_high_u8(rhs);
    lhs3_rhs1_lo = vmlal_u8(vmovl_u8(rhs_lo), lhs_lo, k3_u8);
    lhs3_rhs1_hi = vmlal_u8(vmovl_u8(rhs_hi), lhs_hi, k3_u8);
    /* This sum is narrowed with a truncating shift, so it carries an
     * explicit bias of 1; the other sum uses a rounding shift instead.
     */
    lhs1_rhs3_lo =
      vaddq_u16(vmlal_u8(vmovl_u8(lhs_lo), rhs_lo, k3_u8), k1_u16);
    lhs1_rhs3_hi =
      vaddq_u16(vmlal_u8(vmovl_u8(lhs_hi), rhs_hi, k3_u8), k1_u16);

    /* Software-pipelined main loop: each pass loads the next group, finishes
     * and stores the previous group, and computes the sums for the group
     * just loaded.
     */
    while (next_col < downsampled_width) {
      /* Start of the next group: fetch its samples early. */
      lhs = vld1q_u8(src + next_col - 1);
      rhs = vld1q_u8(src + next_col);

      /* Finish the previous group: divide by 4 and narrow to 8 bits. */
      interleaved.val[0] = vcombine_u8(vrshrn_n_u16(lhs3_rhs1_lo, 2),
                                       vrshrn_n_u16(lhs3_rhs1_hi, 2));
      interleaved.val[1] = vcombine_u8(vshrn_n_u16(lhs1_rhs3_lo, 2),
                                       vshrn_n_u16(lhs1_rhs3_hi, 2));

      /* Weighted sums for the group just loaded. */
      lhs_lo = vget_low_u8(lhs);
      lhs_hi = vget_high_u8(lhs);
      rhs_lo = vget_low_u8(rhs);
      rhs_hi = vget_high_u8(rhs);
      lhs3_rhs1_lo = vmlal_u8(vmovl_u8(rhs_lo), lhs_lo, k3_u8);
      lhs3_rhs1_hi = vmlal_u8(vmovl_u8(rhs_hi), lhs_hi, k3_u8);
      lhs1_rhs3_lo =
        vaddq_u16(vmlal_u8(vmovl_u8(lhs_lo), rhs_lo, k3_u8), k1_u16);
      lhs1_rhs3_hi =
        vaddq_u16(vmlal_u8(vmovl_u8(lhs_hi), rhs_hi, k3_u8), k1_u16);

      /* Interleaved store of the previous group's 32 output pixels. */
      vst2q_u8(dst + store_at, interleaved);
      store_at = 2 * next_col - 1;
      next_col += 16;
    }

    /* Pipeline epilogue: narrow and store the final group. */
    interleaved.val[0] = vcombine_u8(vrshrn_n_u16(lhs3_rhs1_lo, 2),
                                     vrshrn_n_u16(lhs3_rhs1_hi, 2));
    interleaved.val[1] = vcombine_u8(vshrn_n_u16(lhs1_rhs3_lo, 2),
                                     vshrn_n_u16(lhs1_rhs3_hi, 2));
    vst2q_u8(dst + store_at, interleaved);

    /* The rightmost output pixel has no right neighbor: copy the sample.
     * This is written last so that it replaces whatever the final vector
     * store put at that position.
     */
    dst[2 * downsampled_width - 1] = GETJSAMPLE(src[downsampled_width - 1]);
  }
}
158*dfc6aa5cSAndroid Build Coastguard Worker
159*dfc6aa5cSAndroid Build Coastguard Worker
/* The diagram below shows an array of samples produced by h2v2 downsampling.
 *
 *                s0        s1        s2
 *            +---------+---------+---------+
 *            | p0   p1 | p2   p3 | p4   p5 |
 *       sA   |         |         |         |
 *            | p6   p7 | p8   p9 | p10  p11|
 *            +---------+---------+---------+
 *            | p12  p13| p14  p15| p16  p17|
 *       sB   |         |         |         |
 *            | p18  p19| p20  p21| p22  p23|
 *            +---------+---------+---------+
 *            | p24  p25| p26  p27| p28  p29|
 *       sC   |         |         |         |
 *            | p30  p31| p32  p33| p34  p35|
 *            +---------+---------+---------+
 *
 * Samples s0A-s2C were created by averaging the original pixel component
 * values centered at positions p0-p35 above. To approximate one of those
 * original pixel component values, we proportionally blend the sample
 * containing the pixel center with the nearest neighboring samples in each
 * row, column, and diagonal.
 *
 * An upsampled pixel component value is computed by first blending the sample
 * containing the pixel center with the nearest neighboring samples in the
 * same column, in the ratio 3:1, and then blending each column sum with the
 * nearest neighboring column sum, in the ratio 3:1. For example:
 *     p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
 *                      1/4 * (3/4 * s0B + 1/4 * s0A)
 *                    = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
 * When computing the first and last pixel component values in the row, there
 * is no horizontally adjacent sample to blend, so:
 *     p12(upsampled) = 3/4 * s0B + 1/4 * s0A
 *     p23(upsampled) = 3/4 * s2B + 1/4 * s2C
 * When computing the first and last pixel component values in the column,
 * there is no vertically adjacent sample to blend, so:
 *     p2(upsampled) = 3/4 * s1A + 1/4 * s0A
 *     p33(upsampled) = 3/4 * s1C + 1/4 * s2C
 * When computing the corner pixel component values, there is no adjacent
 * sample to blend, so:
 *     p0(upsampled) = s0A
 *     p35(upsampled) = s2C
 */
203*dfc6aa5cSAndroid Build Coastguard Worker
/* "Fancy" (triangle-filter) h2v2 upsampling using Arm Neon.
 *
 * max_v_samp_factor  number of output rows to produce; two are written for
 *                    every input row consumed
 * downsampled_width  number of samples in each input row
 * input_data         input rows; for input row i the rows input_data[i - 1]
 *                    and input_data[i + 1] are read as well, so index -1 is
 *                    accessed for the first row
 * output_data_ptr    pointer to the output row array; each output row
 *                    receives 2 * downsampled_width samples
 *
 * Loads and stores are whole 128-bit vectors, so they can extend past
 * downsampled_width; like the original code, this relies on the row padding
 * described under "Creation of 2-D sample arrays" in jmemmgr.c.
 */
void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  const uint8x8_t k3_u8 = vdup_n_u8(3);
  const uint16x8_t k3_u16 = vdupq_n_u16(3);
  const uint16x8_t k7_u16 = vdupq_n_u16(7);
  int inrow, outrow;

  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor;
       inrow++, outrow += 2) {
    JSAMPROW above = input_data[inrow - 1];
    JSAMPROW cur = input_data[inrow];
    JSAMPROW below = input_data[inrow + 1];
    /* Upper and lower output rows generated from the current input row. */
    JSAMPROW out_top = output_data[outrow];
    JSAMPROW out_bot = output_data[outrow + 1];
    /* Input/output offsets of the current 16-column group.  The first group
     * starts at input column 0 and output column 1 (output column 0 is
     * written separately); later groups start at input column c - 1 and
     * output column 2 * c - 1 for c = 16, 32, ...
     */
    unsigned in_off = 0;
    unsigned out_off = 1;
    unsigned next_col = 16;
    int edge_sum;

    /* Leftmost output pixels: vertical 3:1 blend only, rounded with +8. */
    edge_sum = GETJSAMPLE(cur[0]) * 3 + GETJSAMPLE(above[0]);
    out_top[0] = (JSAMPLE)((edge_sum * 4 + 8) >> 4);
    edge_sum = GETJSAMPLE(cur[0]) * 3 + GETJSAMPLE(below[0]);
    out_bot[0] = (JSAMPLE)((edge_sum * 4 + 8) >> 4);

    for (;;) {
      /* Step 1: vertical blend.  "l" is the left sample of each horizontal
       * pair and "r" its right-hand neighbor.  Division is deferred until
       * both dimensions have been blended.
       */
      uint8x16_t lA = vld1q_u8(above + in_off);
      uint8x16_t lB = vld1q_u8(cur + in_off);
      uint8x16_t lC = vld1q_u8(below + in_off);
      uint8x16_t rA = vld1q_u8(above + in_off + 1);
      uint8x16_t rB = vld1q_u8(cur + in_off + 1);
      uint8x16_t rC = vld1q_u8(below + in_off + 1);
      uint8x8_t lB_lo = vget_low_u8(lB);
      uint8x8_t lB_hi = vget_high_u8(lB);
      uint8x8_t rB_lo = vget_low_u8(rB);
      uint8x8_t rB_hi = vget_high_u8(rB);
      /* Column sums: 3 * current row + row above (top) or below (bot). */
      uint16x8_t l_top_lo = vmlal_u8(vmovl_u8(vget_low_u8(lA)), lB_lo, k3_u8);
      uint16x8_t l_top_hi = vmlal_u8(vmovl_u8(vget_high_u8(lA)), lB_hi, k3_u8);
      uint16x8_t l_bot_lo = vmlal_u8(vmovl_u8(vget_low_u8(lC)), lB_lo, k3_u8);
      uint16x8_t l_bot_hi = vmlal_u8(vmovl_u8(vget_high_u8(lC)), lB_hi, k3_u8);
      uint16x8_t r_top_lo = vmlal_u8(vmovl_u8(vget_low_u8(rA)), rB_lo, k3_u8);
      uint16x8_t r_top_hi = vmlal_u8(vmovl_u8(vget_high_u8(rA)), rB_hi, k3_u8);
      uint16x8_t r_bot_lo = vmlal_u8(vmovl_u8(vget_low_u8(rC)), rB_lo, k3_u8);
      uint16x8_t r_bot_hi = vmlal_u8(vmovl_u8(vget_high_u8(rC)), rB_hi, k3_u8);

      /* Step 2: horizontal blend of the column sums.
       *   near_l = 3 * left + right + 7  (truncating shift, explicit bias)
       *   near_r = left + 3 * right      (rounding shift)
       */
      uint16x8_t top_near_l_lo =
        vaddq_u16(vmlaq_u16(r_top_lo, l_top_lo, k3_u16), k7_u16);
      uint16x8_t top_near_l_hi =
        vaddq_u16(vmlaq_u16(r_top_hi, l_top_hi, k3_u16), k7_u16);
      uint16x8_t top_near_r_lo = vmlaq_u16(l_top_lo, r_top_lo, k3_u16);
      uint16x8_t top_near_r_hi = vmlaq_u16(l_top_hi, r_top_hi, k3_u16);
      uint16x8_t bot_near_l_lo =
        vaddq_u16(vmlaq_u16(r_bot_lo, l_bot_lo, k3_u16), k7_u16);
      uint16x8_t bot_near_l_hi =
        vaddq_u16(vmlaq_u16(r_bot_hi, l_bot_hi, k3_u16), k7_u16);
      uint16x8_t bot_near_r_lo = vmlaq_u16(l_bot_lo, r_bot_lo, k3_u16);
      uint16x8_t bot_near_r_hi = vmlaq_u16(l_bot_hi, r_bot_hi, k3_u16);

      /* Divide by 16, narrow to 8 bits, and pair up for interleaving. */
      uint8x16x2_t top_pixels, bot_pixels;

      top_pixels.val[0] = vcombine_u8(vshrn_n_u16(top_near_l_lo, 4),
                                      vshrn_n_u16(top_near_l_hi, 4));
      top_pixels.val[1] = vcombine_u8(vrshrn_n_u16(top_near_r_lo, 4),
                                      vrshrn_n_u16(top_near_r_hi, 4));
      bot_pixels.val[0] = vcombine_u8(vshrn_n_u16(bot_near_l_lo, 4),
                                      vshrn_n_u16(bot_near_l_hi, 4));
      bot_pixels.val[1] = vcombine_u8(vrshrn_n_u16(bot_near_r_lo, 4),
                                      vrshrn_n_u16(bot_near_r_hi, 4));

      /* Interleaved store of 32 output pixels to each output row. */
      vst2q_u8(out_top + out_off, top_pixels);
      vst2q_u8(out_bot + out_off, bot_pixels);

      if (next_col >= downsampled_width)
        break;
      /* Advance to the next group of 16 input columns. */
      in_off = next_col - 1;
      out_off = 2 * next_col - 1;
      next_col += 16;
    }

    /* Rightmost output pixels: vertical 3:1 blend only, biased with +7.
     * These are written last so that they replace whatever the final vector
     * stores put at that position.
     */
    edge_sum = GETJSAMPLE(cur[downsampled_width - 1]) * 3 +
               GETJSAMPLE(above[downsampled_width - 1]);
    out_top[2 * downsampled_width - 1] = (JSAMPLE)((edge_sum * 4 + 7) >> 4);
    edge_sum = GETJSAMPLE(cur[downsampled_width - 1]) * 3 +
               GETJSAMPLE(below[downsampled_width - 1]);
    out_bot[2 * downsampled_width - 1] = (JSAMPLE)((edge_sum * 4 + 7) >> 4);
  }
}
375*dfc6aa5cSAndroid Build Coastguard Worker
376*dfc6aa5cSAndroid Build Coastguard Worker
/* The diagram below shows a column of samples produced by h1v2 downsampling
 * (or by losslessly rotating or transposing an h2v1-downsampled image).
 *
 *     +---------+
 *     |   p0    |
 * sA  |         |
 *     |   p1    |
 *     +---------+
 *     |   p2    |
 * sB  |         |
 *     |   p3    |
 *     +---------+
 *     |   p4    |
 * sC  |         |
 *     |   p5    |
 *     +---------+
 *
 * Samples sA-sC were created by averaging the original pixel component values
 * centered at positions p0-p5 above.  To approximate those original pixel
 * component values, we proportionally blend the adjacent samples in each
 * column.
 *
 * An upsampled pixel component value is computed by blending the sample
 * containing the pixel center with the nearest neighboring sample, in the
 * ratio 3:1.  For example:
 *     p1(upsampled) = 3/4 * sA + 1/4 * sB
 *     p2(upsampled) = 3/4 * sB + 1/4 * sA
 * When computing the first and last pixel component values in the column,
 * there is no adjacent sample to blend, so:
 *     p0(upsampled) = sA
 *     p5(upsampled) = sC
 */
409*dfc6aa5cSAndroid Build Coastguard Worker
/* "Fancy" (3:1 vertically blended) upsampling for h1v2-downsampled data.
 *
 * Each input row yields two output rows: the upper one blends the row with
 * the row above it, and the lower one blends it with the row below it.  The
 * rows are walked until max_v_samp_factor output rows have been written into
 * *output_data_ptr.
 *
 * Note that the row above (input_data[-1] on the first pass) and the row
 * below the last processed row are read, so the caller must make those
 * indices valid.
 */
void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  /* Loop-invariant vectors: the weight applied to the nearer sample and the
   * bias added to the upper output row before its truncating shift.
   */
  const uint8x8_t weight3 = vdup_n_u8(3);
  const uint16x8_t bias1 = vdupq_n_u16(1);
  int src_row, dst_row;

  for (src_row = 0, dst_row = 0; dst_row < max_v_samp_factor;
       src_row++, dst_row += 2) {
    JSAMPROW row_above = input_data[src_row - 1];
    JSAMPROW row_cur = input_data[src_row];
    JSAMPROW row_below = input_data[src_row + 1];
    /* One input row expands into an upper and a lower output row. */
    JSAMPROW out_upper = output_data[dst_row];
    JSAMPROW out_lower = output_data[dst_row + 1];
    unsigned col;

    /* Whole 16-sample vectors are processed even when the width is not a
     * multiple of 16.  This relies on the row buffers being padded to a
     * multiple of 32 bytes (see "Creation of 2-D sample arrays" in
     * jmemmgr.c), so the tail reads/writes stay inside the allocation.
     */
    for (col = 0; col < downsampled_width; col += 16) {
      /* Fetch 16 samples from each of the three contributing rows. */
      uint8x16_t above = vld1q_u8(row_above + col);
      uint8x16_t cur = vld1q_u8(row_cur + col);
      uint8x16_t below = vld1q_u8(row_below + col);
      uint8x8_t cur_lo = vget_low_u8(cur);
      uint8x8_t cur_hi = vget_high_u8(cur);

      /* Widen to 16 bits and form neighbor + 3 * current for both halves. */
      uint16x8_t upper_lo = vmlal_u8(vmovl_u8(vget_low_u8(above)), cur_lo,
                                     weight3);
      uint16x8_t upper_hi = vmlal_u8(vmovl_u8(vget_high_u8(above)), cur_hi,
                                     weight3);
      uint16x8_t lower_lo = vmlal_u8(vmovl_u8(vget_low_u8(below)), cur_lo,
                                     weight3);
      uint16x8_t lower_hi = vmlal_u8(vmovl_u8(vget_high_u8(below)), cur_hi,
                                     weight3);

      /* Upper (even) output row: ordered-dither bias of 1, then a truncating
       * divide by 4.
       */
      upper_lo = vaddq_u16(upper_lo, bias1);
      upper_hi = vaddq_u16(upper_hi, bias1);
      uint8x16_t upper_px = vcombine_u8(vshrn_n_u16(upper_lo, 2),
                                        vshrn_n_u16(upper_hi, 2));
      /* Lower (odd) output row: rounding divide by 4. */
      uint8x16_t lower_px = vcombine_u8(vrshrn_n_u16(lower_lo, 2),
                                        vrshrn_n_u16(lower_hi, 2));

      /* Write 16 pixel component values to each output row. */
      vst1q_u8(out_upper + col, upper_px);
      vst1q_u8(out_lower + col, lower_px);
    }
  }
}
468*dfc6aa5cSAndroid Build Coastguard Worker
469*dfc6aa5cSAndroid Build Coastguard Worker
/* The diagram below shows a row of samples produced by h2v1 downsampling.
 *
 *           s0        s1
 *      +---------+---------+
 *      |         |         |
 *      | p0   p1 | p2   p3 |
 *      |         |         |
 *      +---------+---------+
 *
 * Samples s0 and s1 were created by averaging the original pixel component
 * values centered at positions p0-p3 above.  To approximate those original
 * pixel component values, we duplicate the samples horizontally:
 *     p0(upsampled) = p1(upsampled) = s0
 *     p2(upsampled) = p3(upsampled) = s1
 */
485*dfc6aa5cSAndroid Build Coastguard Worker
/* Plain (replicating) upsampling for h2v1-downsampled data.
 *
 * For each of the max_v_samp_factor rows, every input sample is written
 * twice in succession to the matching row of *output_data_ptr, doubling the
 * row's width.
 */
void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
                              JSAMPARRAY input_data,
                              JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  int row;

  for (row = 0; row < max_v_samp_factor; row++) {
    JSAMPROW src = input_data[row];
    JSAMPROW dst = output_data[row];
    unsigned in_col;

    /* in_col counts input samples; each pass consumes 16 of them and emits
     * 32 output values.
     */
    for (in_col = 0; 2 * in_col < output_width; in_col += 16) {
      /* Place the same 16 samples in both lanes of the pair.  The
       * interleaving store then emits s0 s0 s1 s1 ..., i.e. each sample
       * duplicated horizontally.
       */
      uint8x16x2_t doubled;
      doubled.val[0] = vld1q_u8(src + in_col);
      doubled.val[1] = doubled.val[0];
      /* A full 32-byte store is issued even when output_width is not a
       * multiple of 32.  This relies on how the sample buffers are allocated
       * (see "Creation of 2-D sample arrays" in jmemmgr.c).
       */
      vst2q_u8(dst + 2 * in_col, doubled);
    }
  }
}
514*dfc6aa5cSAndroid Build Coastguard Worker
515*dfc6aa5cSAndroid Build Coastguard Worker
/* The diagram below shows an array of samples produced by h2v2 downsampling.
 *
 *           s0        s1
 *      +---------+---------+
 *      | p0   p1 | p2   p3 |
 * sA   |         |         |
 *      | p4   p5 | p6   p7 |
 *      +---------+---------+
 *      | p8   p9 | p10  p11|
 * sB   |         |         |
 *      | p12  p13| p14  p15|
 *      +---------+---------+
 *
 * Samples s0A-s1B were created by averaging the original pixel component
 * values centered at positions p0-p15 above.  To approximate those original
 * pixel component values, we duplicate the samples both horizontally and
 * vertically:
 *     p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
 *     p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
 *     p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
 *     p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
 */
538*dfc6aa5cSAndroid Build Coastguard Worker
/* Plain (replicating) upsampling for h2v2-downsampled data.
 *
 * Every input row produces two identical output rows in *output_data_ptr, and
 * within each of them every input sample is written twice in succession.
 * Input rows are consumed until max_v_samp_factor output rows have been
 * written.
 */
void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
                              JSAMPARRAY input_data,
                              JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  int src_row = 0;
  int dst_row = 0;

  while (dst_row < max_v_samp_factor) {
    JSAMPROW src = input_data[src_row];
    /* The two output rows that receive copies of this input row. */
    JSAMPROW dst_upper = output_data[dst_row];
    JSAMPROW dst_lower = output_data[dst_row + 1];
    unsigned in_col;

    /* in_col counts input samples; each pass consumes 16 of them and emits
     * 32 output values per output row.
     */
    for (in_col = 0; 2 * in_col < output_width; in_col += 16) {
      /* Place the same 16 samples in both lanes of the pair.  The
       * interleaving store then emits s0 s0 s1 s1 ..., i.e. each sample
       * duplicated horizontally.
       */
      uint8x16x2_t doubled;
      doubled.val[0] = vld1q_u8(src + in_col);
      doubled.val[1] = doubled.val[0];
      /* Vertical duplication: the same vector pair goes to both rows.  Full
       * 32-byte stores are issued even when output_width is not a multiple of
       * 32.  This relies on how the sample buffers are allocated (see
       * "Creation of 2-D sample arrays" in jmemmgr.c).
       */
      vst2q_u8(dst_upper + 2 * in_col, doubled);
      vst2q_u8(dst_lower + 2 * in_col, doubled);
    }

    dst_row += 2;
    src_row++;
  }
}
570