xref: /aosp_15_r20/external/libjpeg-turbo/simd/arm/jdsample-neon.c (revision dfc6aa5c1cfd4bc4e2018dc74aa96e29ee49c6da)
1*dfc6aa5cSAndroid Build Coastguard Worker /*
2*dfc6aa5cSAndroid Build Coastguard Worker  * jdsample-neon.c - upsampling (Arm Neon)
3*dfc6aa5cSAndroid Build Coastguard Worker  *
4*dfc6aa5cSAndroid Build Coastguard Worker  * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
5*dfc6aa5cSAndroid Build Coastguard Worker  * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
6*dfc6aa5cSAndroid Build Coastguard Worker  *
7*dfc6aa5cSAndroid Build Coastguard Worker  * This software is provided 'as-is', without any express or implied
8*dfc6aa5cSAndroid Build Coastguard Worker  * warranty.  In no event will the authors be held liable for any damages
9*dfc6aa5cSAndroid Build Coastguard Worker  * arising from the use of this software.
10*dfc6aa5cSAndroid Build Coastguard Worker  *
11*dfc6aa5cSAndroid Build Coastguard Worker  * Permission is granted to anyone to use this software for any purpose,
12*dfc6aa5cSAndroid Build Coastguard Worker  * including commercial applications, and to alter it and redistribute it
13*dfc6aa5cSAndroid Build Coastguard Worker  * freely, subject to the following restrictions:
14*dfc6aa5cSAndroid Build Coastguard Worker  *
15*dfc6aa5cSAndroid Build Coastguard Worker  * 1. The origin of this software must not be misrepresented; you must not
16*dfc6aa5cSAndroid Build Coastguard Worker  *    claim that you wrote the original software. If you use this software
17*dfc6aa5cSAndroid Build Coastguard Worker  *    in a product, an acknowledgment in the product documentation would be
18*dfc6aa5cSAndroid Build Coastguard Worker  *    appreciated but is not required.
19*dfc6aa5cSAndroid Build Coastguard Worker  * 2. Altered source versions must be plainly marked as such, and must not be
20*dfc6aa5cSAndroid Build Coastguard Worker  *    misrepresented as being the original software.
21*dfc6aa5cSAndroid Build Coastguard Worker  * 3. This notice may not be removed or altered from any source distribution.
22*dfc6aa5cSAndroid Build Coastguard Worker  */
23*dfc6aa5cSAndroid Build Coastguard Worker 
24*dfc6aa5cSAndroid Build Coastguard Worker #define JPEG_INTERNALS
25*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jinclude.h"
26*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jpeglib.h"
27*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jsimd.h"
28*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jdct.h"
29*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jsimddct.h"
30*dfc6aa5cSAndroid Build Coastguard Worker #include "../jsimd.h"
31*dfc6aa5cSAndroid Build Coastguard Worker 
32*dfc6aa5cSAndroid Build Coastguard Worker #include <arm_neon.h>
33*dfc6aa5cSAndroid Build Coastguard Worker 
34*dfc6aa5cSAndroid Build Coastguard Worker 
35*dfc6aa5cSAndroid Build Coastguard Worker /* The diagram below shows a row of samples produced by h2v1 downsampling.
36*dfc6aa5cSAndroid Build Coastguard Worker  *
37*dfc6aa5cSAndroid Build Coastguard Worker  *                s0        s1        s2
38*dfc6aa5cSAndroid Build Coastguard Worker  *            +---------+---------+---------+
39*dfc6aa5cSAndroid Build Coastguard Worker  *            |         |         |         |
40*dfc6aa5cSAndroid Build Coastguard Worker  *            | p0   p1 | p2   p3 | p4   p5 |
41*dfc6aa5cSAndroid Build Coastguard Worker  *            |         |         |         |
42*dfc6aa5cSAndroid Build Coastguard Worker  *            +---------+---------+---------+
43*dfc6aa5cSAndroid Build Coastguard Worker  *
44*dfc6aa5cSAndroid Build Coastguard Worker  * Samples s0-s2 were created by averaging the original pixel component values
45*dfc6aa5cSAndroid Build Coastguard Worker  * centered at positions p0-p5 above.  To approximate those original pixel
46*dfc6aa5cSAndroid Build Coastguard Worker  * component values, we proportionally blend the adjacent samples in each row.
47*dfc6aa5cSAndroid Build Coastguard Worker  *
48*dfc6aa5cSAndroid Build Coastguard Worker  * An upsampled pixel component value is computed by blending the sample
49*dfc6aa5cSAndroid Build Coastguard Worker  * containing the pixel center with the nearest neighboring sample, in the
50*dfc6aa5cSAndroid Build Coastguard Worker  * ratio 3:1.  For example:
51*dfc6aa5cSAndroid Build Coastguard Worker  *     p1(upsampled) = 3/4 * s0 + 1/4 * s1
52*dfc6aa5cSAndroid Build Coastguard Worker  *     p2(upsampled) = 3/4 * s1 + 1/4 * s0
53*dfc6aa5cSAndroid Build Coastguard Worker  * When computing the first and last pixel component values in the row, there
54*dfc6aa5cSAndroid Build Coastguard Worker  * is no adjacent sample to blend, so:
55*dfc6aa5cSAndroid Build Coastguard Worker  *     p0(upsampled) = s0
56*dfc6aa5cSAndroid Build Coastguard Worker  *     p5(upsampled) = s2
57*dfc6aa5cSAndroid Build Coastguard Worker  */
58*dfc6aa5cSAndroid Build Coastguard Worker 
/*
 * h2v1 "fancy" upsampling (Arm Neon): doubles the horizontal resolution of
 * each row by blending every sample with its nearest horizontal neighbor in
 * the ratio 3:1.
 *
 * max_v_samp_factor  number of rows to process; rows 0 through
 *                    (max_v_samp_factor - 1) of input_data are read and the
 *                    same rows of *output_data_ptr are written
 * downsampled_width  number of samples in each input row; each output row
 *                    receives (2 * downsampled_width) samples
 * input_data         array of input rows
 * output_data_ptr    pointer to the array of output rows
 *
 * For a pair of adjacent samples s[k] and s[k+1], two outputs are produced:
 *     right half of s[k]:   (3 * s[k]   + s[k+1] + 2) >> 2
 *     left half of s[k+1]:  (3 * s[k+1] + s[k]   + 1) >> 2
 * The first and last outputs of a row are plain copies of the first and last
 * input samples.
 *
 * NOTE: loads and stores always cover 16 input / 32 output samples, with no
 * scalar tail, so they can touch memory past downsampled_width.  This
 * presumably relies on the row buffers being padded by the allocator --
 * confirm against the memory manager before changing the access pattern.
 */
void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY out_rows = *output_data_ptr;
  /* Blend weight for the containing sample, and the bias added to the
   * left-half sums before their truncating shift.
   */
  const uint8x8_t weight3 = vdup_n_u8(3);
  const uint16x8_t bias1 = vdupq_n_u16(1);
  int row;

  for (row = 0; row < max_v_samp_factor; row++) {
    JSAMPROW src = input_data[row];
    JSAMPROW dst = out_rows[row];
    /* The first vector store lands at offset 1 because output 0 is written
     * separately; later stores land at (2 * col - 1).
     */
    unsigned store_at = 1;
    unsigned col = 16;
    uint8x16x2_t pair;

    /* Leftmost output has no left neighbor: copy the first sample. */
    dst[0] = (JSAMPLE)GETJSAMPLE(src[0]);

    /* Prologue of the software pipeline: load samples k and k+1 and form the
     * widened (16-bit) weighted sums for the first 16 sample pairs.
     * '_l' / '_h' are the low and high 8 lanes of each 16-lane vector.
     */
    uint8x16_t lhs = vld1q_u8(src);
    uint8x16_t rhs = vld1q_u8(src + 1);
    /* 3 * s[k] + s[k+1]: right half of sample k. */
    uint16x8_t sum_p1_l =
      vmlal_u8(vmovl_u8(vget_low_u8(rhs)), vget_low_u8(lhs), weight3);
    uint16x8_t sum_p1_h =
      vmlal_u8(vmovl_u8(vget_high_u8(rhs)), vget_high_u8(lhs), weight3);
    /* 3 * s[k+1] + s[k] + 1: left half of sample k+1, bias included. */
    uint16x8_t sum_p2_l =
      vaddq_u16(vmlal_u8(vmovl_u8(vget_low_u8(lhs)), vget_low_u8(rhs),
                         weight3), bias1);
    uint16x8_t sum_p2_h =
      vaddq_u16(vmlal_u8(vmovl_u8(vget_high_u8(lhs)), vget_high_u8(rhs),
                         weight3), bias1);

    /* Steady state: each pass issues the loads for the next 16 pairs, then
     * narrows and stores the sums computed on the previous pass.
     */
    while (col < downsampled_width) {
      /* Begin the next batch. */
      lhs = vld1q_u8(src + col - 1);
      rhs = vld1q_u8(src + col);

      /* Finish the previous batch: divide by 4 and narrow to 8 bits.  The
       * right-half sums use a rounding shift; the left-half sums already
       * carry their bias and use a truncating shift.
       */
      pair.val[0] = vcombine_u8(vrshrn_n_u16(sum_p1_l, 2),
                                vrshrn_n_u16(sum_p1_h, 2));
      pair.val[1] = vcombine_u8(vshrn_n_u16(sum_p2_l, 2),
                                vshrn_n_u16(sum_p2_h, 2));

      /* Weighted sums for the batch just loaded. */
      sum_p1_l =
        vmlal_u8(vmovl_u8(vget_low_u8(rhs)), vget_low_u8(lhs), weight3);
      sum_p1_h =
        vmlal_u8(vmovl_u8(vget_high_u8(rhs)), vget_high_u8(lhs), weight3);
      sum_p2_l =
        vaddq_u16(vmlal_u8(vmovl_u8(vget_low_u8(lhs)), vget_low_u8(rhs),
                           weight3), bias1);
      sum_p2_h =
        vaddq_u16(vmlal_u8(vmovl_u8(vget_high_u8(lhs)), vget_high_u8(rhs),
                           weight3), bias1);

      /* Interleaved store of the previous batch: val[0] and val[1] alternate
       * in memory starting at dst + store_at.
       */
      vst2q_u8(dst + store_at, pair);
      store_at = 2 * col - 1;
      col += 16;
    }

    /* Epilogue: narrow and store the batch still in flight. */
    pair.val[0] = vcombine_u8(vrshrn_n_u16(sum_p1_l, 2),
                              vrshrn_n_u16(sum_p1_h, 2));
    pair.val[1] = vcombine_u8(vshrn_n_u16(sum_p2_l, 2),
                              vshrn_n_u16(sum_p2_h, 2));
    vst2q_u8(dst + store_at, pair);

    /* Rightmost output has no right neighbor: copy the last sample (this
     * overwrites whatever the vector store placed at that position).
     */
    dst[2 * downsampled_width - 1] =
      GETJSAMPLE(src[downsampled_width - 1]);
  }
}
158*dfc6aa5cSAndroid Build Coastguard Worker 
159*dfc6aa5cSAndroid Build Coastguard Worker 
160*dfc6aa5cSAndroid Build Coastguard Worker /* The diagram below shows an array of samples produced by h2v2 downsampling.
161*dfc6aa5cSAndroid Build Coastguard Worker  *
162*dfc6aa5cSAndroid Build Coastguard Worker  *                s0        s1        s2
163*dfc6aa5cSAndroid Build Coastguard Worker  *            +---------+---------+---------+
164*dfc6aa5cSAndroid Build Coastguard Worker  *            | p0   p1 | p2   p3 | p4   p5 |
165*dfc6aa5cSAndroid Build Coastguard Worker  *       sA   |         |         |         |
166*dfc6aa5cSAndroid Build Coastguard Worker  *            | p6   p7 | p8   p9 | p10  p11|
167*dfc6aa5cSAndroid Build Coastguard Worker  *            +---------+---------+---------+
168*dfc6aa5cSAndroid Build Coastguard Worker  *            | p12  p13| p14  p15| p16  p17|
169*dfc6aa5cSAndroid Build Coastguard Worker  *       sB   |         |         |         |
170*dfc6aa5cSAndroid Build Coastguard Worker  *            | p18  p19| p20  p21| p22  p23|
171*dfc6aa5cSAndroid Build Coastguard Worker  *            +---------+---------+---------+
172*dfc6aa5cSAndroid Build Coastguard Worker  *            | p24  p25| p26  p27| p28  p29|
173*dfc6aa5cSAndroid Build Coastguard Worker  *       sC   |         |         |         |
174*dfc6aa5cSAndroid Build Coastguard Worker  *            | p30  p31| p32  p33| p34  p35|
175*dfc6aa5cSAndroid Build Coastguard Worker  *            +---------+---------+---------+
176*dfc6aa5cSAndroid Build Coastguard Worker  *
177*dfc6aa5cSAndroid Build Coastguard Worker  * Samples s0A-s2C were created by averaging the original pixel component
178*dfc6aa5cSAndroid Build Coastguard Worker  * values centered at positions p0-p35 above.  To approximate one of those
179*dfc6aa5cSAndroid Build Coastguard Worker  * original pixel component values, we proportionally blend the sample
180*dfc6aa5cSAndroid Build Coastguard Worker  * containing the pixel center with the nearest neighboring samples in each
181*dfc6aa5cSAndroid Build Coastguard Worker  * row, column, and diagonal.
182*dfc6aa5cSAndroid Build Coastguard Worker  *
183*dfc6aa5cSAndroid Build Coastguard Worker  * An upsampled pixel component value is computed by first blending the sample
184*dfc6aa5cSAndroid Build Coastguard Worker  * containing the pixel center with the nearest neighboring samples in the
185*dfc6aa5cSAndroid Build Coastguard Worker  * same column, in the ratio 3:1, and then blending each column sum with the
186*dfc6aa5cSAndroid Build Coastguard Worker  * nearest neighboring column sum, in the ratio 3:1.  For example:
187*dfc6aa5cSAndroid Build Coastguard Worker  *     p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
188*dfc6aa5cSAndroid Build Coastguard Worker  *                      1/4 * (3/4 * s0B + 1/4 * s0A)
189*dfc6aa5cSAndroid Build Coastguard Worker  *                    = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
190*dfc6aa5cSAndroid Build Coastguard Worker  * When computing the first and last pixel component values in the row, there
191*dfc6aa5cSAndroid Build Coastguard Worker  * is no horizontally adjacent sample to blend, so:
192*dfc6aa5cSAndroid Build Coastguard Worker  *     p12(upsampled) = 3/4 * s0B + 1/4 * s0A
193*dfc6aa5cSAndroid Build Coastguard Worker  *     p23(upsampled) = 3/4 * s2B + 1/4 * s2C
194*dfc6aa5cSAndroid Build Coastguard Worker  * When computing the first and last pixel component values in the column,
195*dfc6aa5cSAndroid Build Coastguard Worker  * there is no vertically adjacent sample to blend, so:
196*dfc6aa5cSAndroid Build Coastguard Worker  *     p2(upsampled) = 3/4 * s1A + 1/4 * s0A
197*dfc6aa5cSAndroid Build Coastguard Worker  *     p33(upsampled) = 3/4 * s1C + 1/4 * s2C
198*dfc6aa5cSAndroid Build Coastguard Worker  * When computing the corner pixel component values, there is no adjacent
199*dfc6aa5cSAndroid Build Coastguard Worker  * sample to blend, so:
200*dfc6aa5cSAndroid Build Coastguard Worker  *     p0(upsampled) = s0A
201*dfc6aa5cSAndroid Build Coastguard Worker  *     p35(upsampled) = s2C
202*dfc6aa5cSAndroid Build Coastguard Worker  */
203*dfc6aa5cSAndroid Build Coastguard Worker 
jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,JDIMENSION downsampled_width,JSAMPARRAY input_data,JSAMPARRAY * output_data_ptr)204*dfc6aa5cSAndroid Build Coastguard Worker void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
205*dfc6aa5cSAndroid Build Coastguard Worker                                     JDIMENSION downsampled_width,
206*dfc6aa5cSAndroid Build Coastguard Worker                                     JSAMPARRAY input_data,
207*dfc6aa5cSAndroid Build Coastguard Worker                                     JSAMPARRAY *output_data_ptr)
208*dfc6aa5cSAndroid Build Coastguard Worker {
209*dfc6aa5cSAndroid Build Coastguard Worker   JSAMPARRAY output_data = *output_data_ptr;
210*dfc6aa5cSAndroid Build Coastguard Worker   JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
211*dfc6aa5cSAndroid Build Coastguard Worker   int inrow, outrow;
212*dfc6aa5cSAndroid Build Coastguard Worker   unsigned colctr;
213*dfc6aa5cSAndroid Build Coastguard Worker   /* Set up constants. */
214*dfc6aa5cSAndroid Build Coastguard Worker   const uint16x8_t seven_u16 = vdupq_n_u16(7);
215*dfc6aa5cSAndroid Build Coastguard Worker   const uint8x8_t three_u8 = vdup_n_u8(3);
216*dfc6aa5cSAndroid Build Coastguard Worker   const uint16x8_t three_u16 = vdupq_n_u16(3);
217*dfc6aa5cSAndroid Build Coastguard Worker 
218*dfc6aa5cSAndroid Build Coastguard Worker   inrow = outrow = 0;
219*dfc6aa5cSAndroid Build Coastguard Worker   while (outrow < max_v_samp_factor) {
220*dfc6aa5cSAndroid Build Coastguard Worker     inptr0 = input_data[inrow - 1];
221*dfc6aa5cSAndroid Build Coastguard Worker     inptr1 = input_data[inrow];
222*dfc6aa5cSAndroid Build Coastguard Worker     inptr2 = input_data[inrow + 1];
223*dfc6aa5cSAndroid Build Coastguard Worker     /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
224*dfc6aa5cSAndroid Build Coastguard Worker      * respectively.
225*dfc6aa5cSAndroid Build Coastguard Worker      */
226*dfc6aa5cSAndroid Build Coastguard Worker     outptr0 = output_data[outrow++];
227*dfc6aa5cSAndroid Build Coastguard Worker     outptr1 = output_data[outrow++];
228*dfc6aa5cSAndroid Build Coastguard Worker 
229*dfc6aa5cSAndroid Build Coastguard Worker     /* First pixel component value in this row of the original image */
230*dfc6aa5cSAndroid Build Coastguard Worker     int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
231*dfc6aa5cSAndroid Build Coastguard Worker     *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
232*dfc6aa5cSAndroid Build Coastguard Worker     int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
233*dfc6aa5cSAndroid Build Coastguard Worker     *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
234*dfc6aa5cSAndroid Build Coastguard Worker 
235*dfc6aa5cSAndroid Build Coastguard Worker     /* Step 1: Blend samples vertically in columns s0 and s1.
236*dfc6aa5cSAndroid Build Coastguard Worker      * Leave the divide by 4 until the end, when it can be done for both
237*dfc6aa5cSAndroid Build Coastguard Worker      * dimensions at once, right-shifting by 4.
238*dfc6aa5cSAndroid Build Coastguard Worker      */
239*dfc6aa5cSAndroid Build Coastguard Worker 
240*dfc6aa5cSAndroid Build Coastguard Worker     /* Load and compute s0colsum0 and s0colsum1. */
241*dfc6aa5cSAndroid Build Coastguard Worker     uint8x16_t s0A = vld1q_u8(inptr0);
242*dfc6aa5cSAndroid Build Coastguard Worker     uint8x16_t s0B = vld1q_u8(inptr1);
243*dfc6aa5cSAndroid Build Coastguard Worker     uint8x16_t s0C = vld1q_u8(inptr2);
244*dfc6aa5cSAndroid Build Coastguard Worker     /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
245*dfc6aa5cSAndroid Build Coastguard Worker      * denote low half and high half respectively.
246*dfc6aa5cSAndroid Build Coastguard Worker      */
247*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
248*dfc6aa5cSAndroid Build Coastguard Worker                                       vget_low_u8(s0B), three_u8);
249*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
250*dfc6aa5cSAndroid Build Coastguard Worker                                       vget_high_u8(s0B), three_u8);
251*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
252*dfc6aa5cSAndroid Build Coastguard Worker                                       vget_low_u8(s0B), three_u8);
253*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
254*dfc6aa5cSAndroid Build Coastguard Worker                                       vget_high_u8(s0B), three_u8);
255*dfc6aa5cSAndroid Build Coastguard Worker     /* Load and compute s1colsum0 and s1colsum1. */
256*dfc6aa5cSAndroid Build Coastguard Worker     uint8x16_t s1A = vld1q_u8(inptr0 + 1);
257*dfc6aa5cSAndroid Build Coastguard Worker     uint8x16_t s1B = vld1q_u8(inptr1 + 1);
258*dfc6aa5cSAndroid Build Coastguard Worker     uint8x16_t s1C = vld1q_u8(inptr2 + 1);
259*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
260*dfc6aa5cSAndroid Build Coastguard Worker                                       vget_low_u8(s1B), three_u8);
261*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
262*dfc6aa5cSAndroid Build Coastguard Worker                                       vget_high_u8(s1B), three_u8);
263*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
264*dfc6aa5cSAndroid Build Coastguard Worker                                       vget_low_u8(s1B), three_u8);
265*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
266*dfc6aa5cSAndroid Build Coastguard Worker                                       vget_high_u8(s1B), three_u8);
267*dfc6aa5cSAndroid Build Coastguard Worker 
268*dfc6aa5cSAndroid Build Coastguard Worker     /* Step 2: Blend the already-blended columns. */
269*dfc6aa5cSAndroid Build Coastguard Worker 
270*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
271*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
272*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
273*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
274*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
275*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
276*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
277*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
278*dfc6aa5cSAndroid Build Coastguard Worker     /* Add ordered dithering bias to odd pixel values. */
279*dfc6aa5cSAndroid Build Coastguard Worker     output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
280*dfc6aa5cSAndroid Build Coastguard Worker     output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
281*dfc6aa5cSAndroid Build Coastguard Worker     output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
282*dfc6aa5cSAndroid Build Coastguard Worker     output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
283*dfc6aa5cSAndroid Build Coastguard Worker     /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
284*dfc6aa5cSAndroid Build Coastguard Worker     uint8x16x2_t output_pixels0 = { {
285*dfc6aa5cSAndroid Build Coastguard Worker       vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
286*dfc6aa5cSAndroid Build Coastguard Worker       vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
287*dfc6aa5cSAndroid Build Coastguard Worker     } };
288*dfc6aa5cSAndroid Build Coastguard Worker     uint8x16x2_t output_pixels1 = { {
289*dfc6aa5cSAndroid Build Coastguard Worker       vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
290*dfc6aa5cSAndroid Build Coastguard Worker       vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
291*dfc6aa5cSAndroid Build Coastguard Worker     } };
292*dfc6aa5cSAndroid Build Coastguard Worker 
293*dfc6aa5cSAndroid Build Coastguard Worker     /* Store pixel component values to memory.
294*dfc6aa5cSAndroid Build Coastguard Worker      * The minimum size of the output buffer for each row is 64 bytes => no
295*dfc6aa5cSAndroid Build Coastguard Worker      * need to worry about buffer overflow here.  See "Creation of 2-D sample
296*dfc6aa5cSAndroid Build Coastguard Worker      * arrays" in jmemmgr.c for more details.
297*dfc6aa5cSAndroid Build Coastguard Worker      */
298*dfc6aa5cSAndroid Build Coastguard Worker     vst2q_u8(outptr0 + 1, output_pixels0);
299*dfc6aa5cSAndroid Build Coastguard Worker     vst2q_u8(outptr1 + 1, output_pixels1);
300*dfc6aa5cSAndroid Build Coastguard Worker 
301*dfc6aa5cSAndroid Build Coastguard Worker     /* The first pixel of the image shifted our loads and stores by one byte.
302*dfc6aa5cSAndroid Build Coastguard Worker      * We have to re-align on a 32-byte boundary at some point before the end
303*dfc6aa5cSAndroid Build Coastguard Worker      * of the row (we do it now on the 32/33 pixel boundary) to stay within the
304*dfc6aa5cSAndroid Build Coastguard Worker      * bounds of the sample buffers without having to resort to a slow scalar
305*dfc6aa5cSAndroid Build Coastguard Worker      * tail case for the last (downsampled_width % 16) samples.  See "Creation
306*dfc6aa5cSAndroid Build Coastguard Worker      * of 2-D sample arrays" in jmemmgr.c for more details.
307*dfc6aa5cSAndroid Build Coastguard Worker      */
308*dfc6aa5cSAndroid Build Coastguard Worker     for (colctr = 16; colctr < downsampled_width; colctr += 16) {
309*dfc6aa5cSAndroid Build Coastguard Worker       /* Step 1: Blend samples vertically in columns s0 and s1. */
310*dfc6aa5cSAndroid Build Coastguard Worker 
311*dfc6aa5cSAndroid Build Coastguard Worker       /* Load and compute s0colsum0 and s0colsum1. */
312*dfc6aa5cSAndroid Build Coastguard Worker       s0A = vld1q_u8(inptr0 + colctr - 1);
313*dfc6aa5cSAndroid Build Coastguard Worker       s0B = vld1q_u8(inptr1 + colctr - 1);
314*dfc6aa5cSAndroid Build Coastguard Worker       s0C = vld1q_u8(inptr2 + colctr - 1);
315*dfc6aa5cSAndroid Build Coastguard Worker       s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
316*dfc6aa5cSAndroid Build Coastguard Worker                              three_u8);
317*dfc6aa5cSAndroid Build Coastguard Worker       s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
318*dfc6aa5cSAndroid Build Coastguard Worker                              three_u8);
319*dfc6aa5cSAndroid Build Coastguard Worker       s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
320*dfc6aa5cSAndroid Build Coastguard Worker                              three_u8);
321*dfc6aa5cSAndroid Build Coastguard Worker       s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
322*dfc6aa5cSAndroid Build Coastguard Worker                              three_u8);
323*dfc6aa5cSAndroid Build Coastguard Worker       /* Load and compute s1colsum0 and s1colsum1. */
324*dfc6aa5cSAndroid Build Coastguard Worker       s1A = vld1q_u8(inptr0 + colctr);
325*dfc6aa5cSAndroid Build Coastguard Worker       s1B = vld1q_u8(inptr1 + colctr);
326*dfc6aa5cSAndroid Build Coastguard Worker       s1C = vld1q_u8(inptr2 + colctr);
327*dfc6aa5cSAndroid Build Coastguard Worker       s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
328*dfc6aa5cSAndroid Build Coastguard Worker                              three_u8);
329*dfc6aa5cSAndroid Build Coastguard Worker       s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
330*dfc6aa5cSAndroid Build Coastguard Worker                              three_u8);
331*dfc6aa5cSAndroid Build Coastguard Worker       s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
332*dfc6aa5cSAndroid Build Coastguard Worker                              three_u8);
333*dfc6aa5cSAndroid Build Coastguard Worker       s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
334*dfc6aa5cSAndroid Build Coastguard Worker                              three_u8);
335*dfc6aa5cSAndroid Build Coastguard Worker 
336*dfc6aa5cSAndroid Build Coastguard Worker       /* Step 2: Blend the already-blended columns. */
337*dfc6aa5cSAndroid Build Coastguard Worker 
338*dfc6aa5cSAndroid Build Coastguard Worker       output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
339*dfc6aa5cSAndroid Build Coastguard Worker       output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
340*dfc6aa5cSAndroid Build Coastguard Worker       output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
341*dfc6aa5cSAndroid Build Coastguard Worker       output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
342*dfc6aa5cSAndroid Build Coastguard Worker       output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
343*dfc6aa5cSAndroid Build Coastguard Worker       output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
344*dfc6aa5cSAndroid Build Coastguard Worker       output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
345*dfc6aa5cSAndroid Build Coastguard Worker       output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
346*dfc6aa5cSAndroid Build Coastguard Worker       /* Add ordered dithering bias to odd pixel values. */
347*dfc6aa5cSAndroid Build Coastguard Worker       output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
348*dfc6aa5cSAndroid Build Coastguard Worker       output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
349*dfc6aa5cSAndroid Build Coastguard Worker       output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
350*dfc6aa5cSAndroid Build Coastguard Worker       output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
351*dfc6aa5cSAndroid Build Coastguard Worker       /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
352*dfc6aa5cSAndroid Build Coastguard Worker       output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
353*dfc6aa5cSAndroid Build Coastguard Worker                                           vshrn_n_u16(output0_p1_h, 4));
354*dfc6aa5cSAndroid Build Coastguard Worker       output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
355*dfc6aa5cSAndroid Build Coastguard Worker                                           vrshrn_n_u16(output0_p2_h, 4));
356*dfc6aa5cSAndroid Build Coastguard Worker       output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
357*dfc6aa5cSAndroid Build Coastguard Worker                                           vshrn_n_u16(output1_p1_h, 4));
358*dfc6aa5cSAndroid Build Coastguard Worker       output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
359*dfc6aa5cSAndroid Build Coastguard Worker                                           vrshrn_n_u16(output1_p2_h, 4));
360*dfc6aa5cSAndroid Build Coastguard Worker       /* Store pixel component values to memory. */
361*dfc6aa5cSAndroid Build Coastguard Worker       vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
362*dfc6aa5cSAndroid Build Coastguard Worker       vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
363*dfc6aa5cSAndroid Build Coastguard Worker     }
364*dfc6aa5cSAndroid Build Coastguard Worker 
365*dfc6aa5cSAndroid Build Coastguard Worker     /* Last pixel component value in this row of the original image */
366*dfc6aa5cSAndroid Build Coastguard Worker     int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
367*dfc6aa5cSAndroid Build Coastguard Worker                     GETJSAMPLE(inptr0[downsampled_width - 1]);
368*dfc6aa5cSAndroid Build Coastguard Worker     outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
369*dfc6aa5cSAndroid Build Coastguard Worker     int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
370*dfc6aa5cSAndroid Build Coastguard Worker                     GETJSAMPLE(inptr2[downsampled_width - 1]);
371*dfc6aa5cSAndroid Build Coastguard Worker     outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
372*dfc6aa5cSAndroid Build Coastguard Worker     inrow++;
373*dfc6aa5cSAndroid Build Coastguard Worker   }
374*dfc6aa5cSAndroid Build Coastguard Worker }
375*dfc6aa5cSAndroid Build Coastguard Worker 
376*dfc6aa5cSAndroid Build Coastguard Worker 
377*dfc6aa5cSAndroid Build Coastguard Worker /* The diagram below shows a column of samples produced by h1v2 downsampling
378*dfc6aa5cSAndroid Build Coastguard Worker  * (or by losslessly rotating or transposing an h2v1-downsampled image.)
379*dfc6aa5cSAndroid Build Coastguard Worker  *
380*dfc6aa5cSAndroid Build Coastguard Worker  *            +---------+
381*dfc6aa5cSAndroid Build Coastguard Worker  *            |   p0    |
382*dfc6aa5cSAndroid Build Coastguard Worker  *     sA     |         |
383*dfc6aa5cSAndroid Build Coastguard Worker  *            |   p1    |
384*dfc6aa5cSAndroid Build Coastguard Worker  *            +---------+
385*dfc6aa5cSAndroid Build Coastguard Worker  *            |   p2    |
386*dfc6aa5cSAndroid Build Coastguard Worker  *     sB     |         |
387*dfc6aa5cSAndroid Build Coastguard Worker  *            |   p3    |
388*dfc6aa5cSAndroid Build Coastguard Worker  *            +---------+
389*dfc6aa5cSAndroid Build Coastguard Worker  *            |   p4    |
390*dfc6aa5cSAndroid Build Coastguard Worker  *     sC     |         |
391*dfc6aa5cSAndroid Build Coastguard Worker  *            |   p5    |
392*dfc6aa5cSAndroid Build Coastguard Worker  *            +---------+
393*dfc6aa5cSAndroid Build Coastguard Worker  *
394*dfc6aa5cSAndroid Build Coastguard Worker  * Samples sA-sC were created by averaging the original pixel component values
395*dfc6aa5cSAndroid Build Coastguard Worker  * centered at positions p0-p5 above.  To approximate those original pixel
396*dfc6aa5cSAndroid Build Coastguard Worker  * component values, we proportionally blend the adjacent samples in each
397*dfc6aa5cSAndroid Build Coastguard Worker  * column.
398*dfc6aa5cSAndroid Build Coastguard Worker  *
399*dfc6aa5cSAndroid Build Coastguard Worker  * An upsampled pixel component value is computed by blending the sample
400*dfc6aa5cSAndroid Build Coastguard Worker  * containing the pixel center with the nearest neighboring sample, in the
401*dfc6aa5cSAndroid Build Coastguard Worker  * ratio 3:1.  For example:
402*dfc6aa5cSAndroid Build Coastguard Worker  *     p1(upsampled) = 3/4 * sA + 1/4 * sB
403*dfc6aa5cSAndroid Build Coastguard Worker  *     p2(upsampled) = 3/4 * sB + 1/4 * sA
404*dfc6aa5cSAndroid Build Coastguard Worker  * When computing the first and last pixel component values in the column,
405*dfc6aa5cSAndroid Build Coastguard Worker  * there is no adjacent sample to blend, so:
406*dfc6aa5cSAndroid Build Coastguard Worker  *     p0(upsampled) = sA
407*dfc6aa5cSAndroid Build Coastguard Worker  *     p5(upsampled) = sC
408*dfc6aa5cSAndroid Build Coastguard Worker  */
409*dfc6aa5cSAndroid Build Coastguard Worker 
/* Vertical "fancy" upsampling for h1v2-downsampled data (Arm Neon).
 *
 * Each input row yields two output rows.  The upper output row blends the
 * current input row with the row above it, and the lower output row blends
 * the current input row with the row below it, both in the ratio 3:1.
 *
 * max_v_samp_factor:  number of output rows to produce (consumed two at a
 *                     time)
 * downsampled_width:  number of samples to process in each row
 * input_data:         input rows; rows [-1] and [rows_consumed] are read as
 *                     vertical context, so they must be addressable
 * output_data_ptr:    pointer to the array of output rows
 */
void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY out_rows = *output_data_ptr;
  /* Weight applied to the sample that contains the output pixel center. */
  const uint8x8_t weight3 = vdup_n_u8(3);
  /* Ordered dithering bias for the upper (even) output rows. */
  const uint16x8_t bias1 = vdupq_n_u16(1);
  int src_row, dst_row;

  for (src_row = 0, dst_row = 0; dst_row < max_v_samp_factor;
       src_row++, dst_row += 2) {
    JSAMPROW above = input_data[src_row - 1];
    JSAMPROW center = input_data[src_row];
    JSAMPROW below = input_data[src_row + 1];
    JSAMPROW upper_out = out_rows[dst_row];
    JSAMPROW lower_out = out_rows[dst_row + 1];
    unsigned col;

    /* Rows are processed 16 samples at a time with no scalar tail.  This
     * relies on the row buffers being padded to a multiple of 32 bytes (see
     * "Creation of 2-D sample arrays" in jmemmgr.c), so the final vector
     * load/store may run past downsampled_width but stays inside the buffer.
     */
    for (col = 0; col < downsampled_width; col += 16) {
      const uint8x16_t sA = vld1q_u8(above + col);
      const uint8x16_t sB = vld1q_u8(center + col);
      const uint8x16_t sC = vld1q_u8(below + col);
      const uint8x8_t sB_lo = vget_low_u8(sB);
      const uint8x8_t sB_hi = vget_high_u8(sB);

      /* Widen to 16 bits and form neighbor + 3 * center for each half. */
      uint16x8_t upper_lo =
        vmlal_u8(vmovl_u8(vget_low_u8(sA)), sB_lo, weight3);
      uint16x8_t upper_hi =
        vmlal_u8(vmovl_u8(vget_high_u8(sA)), sB_hi, weight3);
      const uint16x8_t lower_lo =
        vmlal_u8(vmovl_u8(vget_low_u8(sC)), sB_lo, weight3);
      const uint16x8_t lower_hi =
        vmlal_u8(vmovl_u8(vget_high_u8(sC)), sB_hi, weight3);

      /* Upper row: add a bias of 1, then use a truncating shift. */
      upper_lo = vaddq_u16(upper_lo, bias1);
      upper_hi = vaddq_u16(upper_hi, bias1);

      /* Divide by 4 and narrow back to 8 bits.  The lower row uses the
       * rounding form of the narrowing shift instead of an explicit bias.
       */
      vst1q_u8(upper_out + col, vcombine_u8(vshrn_n_u16(upper_lo, 2),
                                            vshrn_n_u16(upper_hi, 2)));
      vst1q_u8(lower_out + col, vcombine_u8(vrshrn_n_u16(lower_lo, 2),
                                            vrshrn_n_u16(lower_hi, 2)));
    }
  }
}
468*dfc6aa5cSAndroid Build Coastguard Worker 
469*dfc6aa5cSAndroid Build Coastguard Worker 
470*dfc6aa5cSAndroid Build Coastguard Worker /* The diagram below shows a row of samples produced by h2v1 downsampling.
471*dfc6aa5cSAndroid Build Coastguard Worker  *
472*dfc6aa5cSAndroid Build Coastguard Worker  *                s0        s1
473*dfc6aa5cSAndroid Build Coastguard Worker  *            +---------+---------+
474*dfc6aa5cSAndroid Build Coastguard Worker  *            |         |         |
475*dfc6aa5cSAndroid Build Coastguard Worker  *            | p0   p1 | p2   p3 |
476*dfc6aa5cSAndroid Build Coastguard Worker  *            |         |         |
477*dfc6aa5cSAndroid Build Coastguard Worker  *            +---------+---------+
478*dfc6aa5cSAndroid Build Coastguard Worker  *
479*dfc6aa5cSAndroid Build Coastguard Worker  * Samples s0 and s1 were created by averaging the original pixel component
480*dfc6aa5cSAndroid Build Coastguard Worker  * values centered at positions p0-p3 above.  To approximate those original
481*dfc6aa5cSAndroid Build Coastguard Worker  * pixel component values, we duplicate the samples horizontally:
482*dfc6aa5cSAndroid Build Coastguard Worker  *     p0(upsampled) = p1(upsampled) = s0
483*dfc6aa5cSAndroid Build Coastguard Worker  *     p2(upsampled) = p3(upsampled) = s1
484*dfc6aa5cSAndroid Build Coastguard Worker  */
485*dfc6aa5cSAndroid Build Coastguard Worker 
/* Plain (replicating) horizontal 2x upsampling for h2v1-downsampled data
 * (Arm Neon).
 *
 * Every input sample is written twice in succession to the matching output
 * row, so each output row is twice as wide as the input that feeds it.
 *
 * max_v_samp_factor:  number of rows to process
 * output_width:       width of each output row, in samples
 * input_data:         input rows
 * output_data_ptr:    pointer to the array of output rows
 */
void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
                              JSAMPARRAY input_data,
                              JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY out_rows = *output_data_ptr;
  int row;

  for (row = 0; row < max_v_samp_factor; row++) {
    JSAMPROW src = input_data[row];
    JSAMPROW dst = out_rows[row];
    unsigned col;

    /* 16 input samples become 32 output samples per iteration.  There is no
     * scalar tail: per "Creation of 2-D sample arrays" in jmemmgr.c, the way
     * sample buffers are allocated makes it safe to overrun output_width
     * when it is not a multiple of 32.
     */
    for (col = 0; 2 * col < output_width; col += 16) {
      uint8x16x2_t doubled;

      doubled.val[0] = vld1q_u8(src + col);
      doubled.val[1] = doubled.val[0];
      /* The 2-way interleaving store emits val[0][i], val[1][i] pairs, which
       * places each sample next to its own copy.
       */
      vst2q_u8(dst + 2 * col, doubled);
    }
  }
}
514*dfc6aa5cSAndroid Build Coastguard Worker 
515*dfc6aa5cSAndroid Build Coastguard Worker 
516*dfc6aa5cSAndroid Build Coastguard Worker /* The diagram below shows an array of samples produced by h2v2 downsampling.
517*dfc6aa5cSAndroid Build Coastguard Worker  *
518*dfc6aa5cSAndroid Build Coastguard Worker  *                s0        s1
519*dfc6aa5cSAndroid Build Coastguard Worker  *            +---------+---------+
520*dfc6aa5cSAndroid Build Coastguard Worker  *            | p0   p1 | p2   p3 |
521*dfc6aa5cSAndroid Build Coastguard Worker  *       sA   |         |         |
522*dfc6aa5cSAndroid Build Coastguard Worker  *            | p4   p5 | p6   p7 |
523*dfc6aa5cSAndroid Build Coastguard Worker  *            +---------+---------+
524*dfc6aa5cSAndroid Build Coastguard Worker  *            | p8   p9 | p10  p11|
525*dfc6aa5cSAndroid Build Coastguard Worker  *       sB   |         |         |
526*dfc6aa5cSAndroid Build Coastguard Worker  *            | p12  p13| p14  p15|
527*dfc6aa5cSAndroid Build Coastguard Worker  *            +---------+---------+
528*dfc6aa5cSAndroid Build Coastguard Worker  *
529*dfc6aa5cSAndroid Build Coastguard Worker  * Samples s0A-s1B were created by averaging the original pixel component
530*dfc6aa5cSAndroid Build Coastguard Worker  * values centered at positions p0-p15 above.  To approximate those original
531*dfc6aa5cSAndroid Build Coastguard Worker  * pixel component values, we duplicate the samples both horizontally and
532*dfc6aa5cSAndroid Build Coastguard Worker  * vertically:
533*dfc6aa5cSAndroid Build Coastguard Worker  *     p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
534*dfc6aa5cSAndroid Build Coastguard Worker  *     p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
535*dfc6aa5cSAndroid Build Coastguard Worker  *     p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
536*dfc6aa5cSAndroid Build Coastguard Worker  *     p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
537*dfc6aa5cSAndroid Build Coastguard Worker  */
538*dfc6aa5cSAndroid Build Coastguard Worker 
/* Plain (replicating) 2x horizontal and 2x vertical upsampling for
 * h2v2-downsampled data (Arm Neon).
 *
 * Every input sample is written twice in succession to each of two
 * consecutive output rows, so one input row fills two output rows.
 *
 * max_v_samp_factor:  number of output rows to produce (consumed two at a
 *                     time)
 * output_width:       width of each output row, in samples
 * input_data:         input rows
 * output_data_ptr:    pointer to the array of output rows
 */
void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
                              JSAMPARRAY input_data,
                              JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY out_rows = *output_data_ptr;
  int src_row, dst_row;

  for (src_row = 0, dst_row = 0; dst_row < max_v_samp_factor;
       src_row++, dst_row += 2) {
    JSAMPROW src = input_data[src_row];
    JSAMPROW upper_out = out_rows[dst_row];
    JSAMPROW lower_out = out_rows[dst_row + 1];
    unsigned col;

    /* 16 input samples become 32 output samples in each of the two rows per
     * iteration.  There is no scalar tail: per "Creation of 2-D sample
     * arrays" in jmemmgr.c, the way sample buffers are allocated makes it
     * safe to overrun output_width when it is not a multiple of 32.
     */
    for (col = 0; 2 * col < output_width; col += 16) {
      uint8x16x2_t doubled;

      doubled.val[0] = vld1q_u8(src + col);
      doubled.val[1] = doubled.val[0];
      /* The 2-way interleaving store places each sample next to its own
       * copy; the same vector pair is written to both output rows.
       */
      vst2q_u8(upper_out + 2 * col, doubled);
      vst2q_u8(lower_out + 2 * col, doubled);
    }
  }
}
570